commit - release 1

This commit is contained in:
2026-03-18 11:47:48 -07:00
commit 6f43807900
109 changed files with 14909 additions and 0 deletions
+15
View File
@@ -0,0 +1,15 @@
{
"permissions": {
"allow": [
"Bash(python -c \":*)",
"Bash(xargs ls -la)",
"Bash(wc -l /c/Users/ywkang/kernbench/src/kernbench/policy/address/phyaddr.py /c/Users/ywkang/kernbench/src/kernbench/runtime_api/*.py)",
"Bash(python scripts/gen_placement_diagrams.py)",
"Bash(python -c:*)",
"Bash(python -m kernbench.cli.main probe --topology topology.yaml)",
"Bash(xargs grep -l \"class.*ComponentBase\\\\|class.*DefaultComponent\")",
"Bash(python -m pytest tests/test_probe.py -v)",
"Bash(python -m pytest tests/test_probe.py tests/test_component_registry.py -v)"
]
}
}
+36
View File
@@ -0,0 +1,36 @@
{
"permissions": {
"allow": [
"Bash(python -m ruff check --select I --fix src/kernbench/runtime_api/context.py src/kernbench/runtime_api/bench_runner.py src/kernbench/cli/main.py)",
"Bash(python -m pytest tests/ -q)",
"Bash(python -m pytest tests/ -v)",
"Bash(python -m pytest tests/test_topology_load.py tests/test_cli.py -v)",
"Bash(python -c \":*)",
"Bash(python -m pytest tests/ -q --tb=no)",
"Bash(python -m pytest tests/ -v --tb=short)",
"Bash(python -m pytest tests/test_component_registry.py -v --tb=line)",
"Bash(python -m pytest tests/ --ignore=tests/test_component_registry.py -q --tb=no)",
"Bash(python -m pytest tests/ -q --tb=short)",
"Bash(python -m pytest --tb=short -q)",
"Bash(python -m pytest tests/test_phase_a_components.py -v)",
"Bash(python -m pytest --tb=short)",
"Bash(python -m pytest tests/ -x -q)",
"Bash(python -m pytest tests/test_probe.py::test_h2d_latency_monotonic -x -q)",
"Bash(python -m pytest tests/ -x --tb=short)",
"Bash(python -m pytest tests/ --tb=line)",
"Bash(python -m pytest tests/ --tb=short)",
"Bash(python -m kernbench probe --topology topology.yaml)",
"Bash(python -m pytest tests/ --tb=short -q)",
"Bash(python -m pytest tests/test_probe.py tests/test_component_registry.py tests/test_topology_compile.py -v)",
"Bash(python -m pytest tests/test_tensor.py tests/test_engine.py tests/test_probe.py tests/test_component_registry.py -v)",
"Bash(grep -l \"class.*CPU\" \"/c/Users/ywkang/kernbench/src/kernbench/components/impls\"/*.py)",
"Bash(grep -n \"^class \" \"/c/Users/ywkang/kernbench/src/kernbench/components/impls\"/*.py)",
"Bash(python -m pytest tests/test_engine.py tests/test_probe.py tests/test_component_registry.py -v)",
"Bash(grep -E \"\\\\.\\(py|md\\)$\")",
"Bash(python -m pytest tests/test_pe_components.py -v)",
"Bash(python -m pytest tests/test_triton_emu.py -v)",
"Bash(python -m pytest tests/test_pe_components.py tests/test_triton_emu.py -v)",
"Bash(python -m pytest tests/test_pe_components.py::test_mcpu_multi_pe_kernel_launch tests/test_pe_components.py::test_qkv_gemm_bench_multi_pe_completes -v)"
]
}
}
+12
View File
@@ -0,0 +1,12 @@
root = true
[*]
charset = utf-8
end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true
indent_style = space
indent_size = 4
[*.md]
trim_trailing_whitespace = false
+28
View File
@@ -0,0 +1,28 @@
# OS / Editor
.DS_Store
.vscode/.history/
*.swp
# Python
__pycache__/
*.py[cod]
*.pyd
.pytest_cache/
.mypy_cache/
.ruff_cache/
# Virtualenv
.venv/
# Packaging
dist/
build/
*.egg-info/
# Env
.env
.env.*
!.env.example
# Logs
*.log
+3
View File
@@ -0,0 +1,3 @@
[MESSAGES CONTROL]
# Keep every disabled check in ONE comma-separated `disable` entry:
# repeating the same key inside a section is rejected by strict INI parsing.
disable=missing-function-docstring,missing-module-docstring
+16
View File
@@ -0,0 +1,16 @@
{
"recommendations": [
"ms-python.python",
"ms-python.vscode-pylance",
"charliermarsh.ruff",
"ms-python.debugpy",
"eamodio.gitlens",
"usernamehw.errorlens",
"wayou.vscode-todo-highlight",
"christian-kohler.path-intellisense",
"editorconfig.editorconfig",
"ms-azuretools.vscode-docker",
"humao.rest-client",
"shd101wyy.markdown-preview-enhanced"
]
}
+55
View File
@@ -0,0 +1,55 @@
{
"version": "0.2.0",
"configurations": [
{
"name": "Python: Current File",
"type": "python",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": true
},
{
"name": "Run kernbench CLI",
"type": "python",
"request": "launch",
"module": "kernbench.cli.main",
"console": "integratedTerminal",
"justMyCode": true,
"args": [
"run",
"--topology", "topology.yaml",
"--bench", "qkv_gemm"
],
"env": {
"PYTHONPATH": "${workspaceFolder}/src"
}
},
{
"name": "Run KernBench Probe",
"type": "python",
"request": "launch",
"module": "kernbench.cli.main",
"console": "integratedTerminal",
"justMyCode": true,
"args": [
"probe",
"--topology", "topology.yaml"
],
"env": {
"PYTHONPATH": "${workspaceFolder}/src"
}
},
{
"name": "Pytest: Run All (terminal)",
"type": "python",
"request": "launch",
"module": "pytest",
"args": [
"-q"
],
"console": "integratedTerminal",
"justMyCode": true
}
]
}
+39
View File
@@ -0,0 +1,39 @@
{
"editor.formatOnSave": true,
"editor.formatOnSaveMode": "file",
"editor.tabSize": 4,
"editor.insertSpaces": true,
"editor.rulers": [
100
],
"editor.minimap.enabled": false,
"files.eol": "\n",
"files.trimTrailingWhitespace": true,
"files.insertFinalNewline": true,
"search.useIgnoreFiles": true,
"git.autofetch": true,
"python.venvFolders": [".venv"],
"python.testing.pytestEnabled": true,
"python.testing.unittestEnabled": false,
"python.testing.autoTestDiscoverOnSaveEnabled": true,
"python.testing.pytestArgs": [
"tests", "-q", "-vv", "-s", "--tb=short", "--no-header"
],
"python.analysis.typeCheckingMode": "basic",
"python.analysis.autoImportCompletions": true,
"python.terminal.activateEnvironment": true,
"ruff.lint.enable": true,
"ruff.format.enable": true,
"editor.defaultFormatter": "charliermarsh.ruff",
"[python]": {
"editor.defaultFormatter": "charliermarsh.ruff"
},
"editor.codeActionsOnSave": {
"source.fixAll": "explicit",
"source.organizeImports": "explicit"
},
"python.analysis.extraPaths": [
"${workspaceFolder}/src"
]
}
+127
View File
@@ -0,0 +1,127 @@
{
"version": "2.0.0",
"tasks": [
{
"label": "venv: create",
"type": "shell",
"command": "python -m venv .venv",
"problemMatcher": []
},
{
"label": "deps: install",
"type": "shell",
"command": "${command:python.interpreterPath}",
"args": [
"-m",
"pip",
"install",
"-U",
"pip",
"&&",
"${command:python.interpreterPath}",
"-m",
"pip",
"install",
"-e",
".[dev]"
],
"problemMatcher": [],
"dependsOn": "venv: create"
},
{
"label": "Run KernBench CLI",
"type": "shell",
"command": "${command:python.interpreterPath}",
"args": [
"-m",
"kernbench.cli.main",
"run",
"--topology",
"topology.yaml",
"--bench",
"qkv_gemm"
],
"options": { "cwd": "${workspaceFolder}" },
"problemMatcher": [],
"presentation": {
"reveal": "silent",
"panel": "shared",
"clear": true
}
},
{
"label": "Run KernBench Probe",
"type": "shell",
"command": "${command:python.interpreterPath}",
"args": [
"-m",
"kernbench.cli.main",
"probe",
"--topology",
"topology.yaml"
],
"options": { "cwd": "${workspaceFolder}" },
"problemMatcher": [],
"presentation": {
"reveal": "silent",
"panel": "shared",
"clear": true
}
},
{
"label": "Pytest: Run All (terminal)",
"type": "shell",
"command": "${command:python.interpreterPath}",
"args": [
"-m",
"pytest",
"-q",
"-s"
],
"problemMatcher": [],
"group": "test"
},
{
"label": "lint",
"type": "shell",
"command": "${command:python.interpreterPath}",
"args": [
"-m",
"ruff",
"check",
"."
],
"problemMatcher": []
},
{
"label": "format",
"type": "shell",
"command": "${command:python.interpreterPath}",
"args": [
"-m",
"ruff",
"format",
"."
],
"problemMatcher": []
},
{
"label": "which python",
"type": "shell",
"command": "${command:python.interpreterPath}",
"args": [
"-c",
"import sys;print(sys.executable)"
],
"problemMatcher": []
}
]
}
+196
View File
@@ -0,0 +1,196 @@
# Claude Code Instructions (Repo)
This repository uses Claude Code with strict architectural and verification rules.
SPEC.md and ADRs are the source of truth.
---
## Terminology
- runtime API:
Host-facing public API used by benchmarks and user code (e.g., tensor deployment, kernel launch).
- simulation engine (sim_engine):
Discrete-event engine responsible for request injection, scheduling, and completion tracking.
- components:
Device-side nodes modeling hardware behavior (IO_CPU, M_CPU, PE_CPU, routers, engines, etc.).
## Authority & Scope
- SPEC.md defines the architectural contract.
- ADRs (docs/adr/ADR-*.md) define non-trivial architectural decisions.
- If a change conflicts with SPEC.md or an ADR:
- STOP.
- Explain the conflict.
- Propose options (keep spec, update ADR, or narrow scope).
- Do NOT silently change architecture.
- The repository structure reflects architectural intent; Claude Code MUST respect existing module boundaries and file locations.
---
## Design Questions
- Design / architecture questions are ALWAYS allowed.
- Design questions MUST NOT modify:
- production code
- test code
- SPEC.md
- ADRs
- If a design question implies a change, default to Phase 1.
---
## Change & Test Protocol (Mandatory)
All non-trivial changes MUST follow a two-phase process.
Design discussion is always allowed; code changes are not.
---
### Phase 1 — Proposal + Verification
(No Production Code Changes)
#### Purpose
- Decide *what* to change and *how it will be validated*
- Establish verification coverage BEFORE touching production code
#### Phase 1 MUST include
1) **Design Proposal**
- Explain the design change.
- Explain why the change is needed.
- Explain consistency with SPEC.md and relevant ADRs.
1) **Verification Plan**
- SPEC requirement(s) / ADR(s) affected (e.g., R1/R2/R5, ADR-0002).
- Tests that validate the change:
- existing tests to run, and/or
- new tests to add.
- Concrete input cases used by the tests:
- topology (SIP / CUBE / PE layout)
- request parameters (src, dst, size_bytes).
- Expected observable assertions, such as:
- hop trace contains key waypoints,
- latency invariants (e.g., > 0, monotonic increase),
- deterministic route selection.
- **expected changes (or no changes) in generated diagrams**, if applicable.
If the Verification Plan is missing or vague, STOP.
#### Allowed in Phase 1
- Creating or modifying **test code only**
- Running tests and reporting results
#### Forbidden in Phase 1
- Any production code changes
- Any SPEC.md or ADR modifications
- Any production diff output
#### Phase 1 Output
- Proposal + Verification Plan
- Tests added/modified (if any)
- Test execution results (PASS / FAIL)
- Clear recommendation:
- "No Phase 2 needed" OR
- "Await approval for Phase 2"
---
### Phase 2 — Apply + Verify + Rollback
#### Trigger
Phase 2 is triggered ONLY by the exact user approval phrase:
**"ok"**
#### Phase 2 Rules
- Output **minimal unified diffs only**
- Modify ONLY production files declared in Phase 1
- Do NOT include explanations, comments, or unchanged code
- Automatically apply the diff to the working tree
#### Mandatory Verification
- Run the tests defined in the Phase 1 Verification Plan
#### Success Path
If ALL tests PASS:
- Keep the applied changes
- Ensure generated diagrams (if affected) are consistent
- Report success concisely
#### Failure Path (Mandatory)
If ANY test FAILS:
- Immediately rollback ALL Phase 2 changes
- Do NOT keep partial changes
- Report:
- failing test names
- error messages / assertions
- brief hypothesis of the root cause
- Return to Phase 1 state
Tests must NEVER be weakened, removed, or altered to force Phase 2 to pass.
---
## What Counts as "Non-Trivial"
(Protocol Required)
Any of the following:
- routing policy or ordering changes
- topology builder changes (nodes, links, parameters)
- address decoding / PhysAddr behavior
- latency composition rules
- changes affecting determinism or connectivity
- changes touching two or more production files
---
## Allowed Exceptions
(Protocol Still Required)
- comments or docstrings
- formatting-only changes
- type annotation changes with no runtime behavior change
In exceptions, Phase 1 MUST explicitly state:
**"No behavior change; tests unchanged."**
---
## CLI Semantics
- `kernbench run --device <id>` runs the benchmark on a single device.
- Omitting `--device` runs the benchmark on all devices discovered in the topology (logically parallel).
- Device enumeration is handled by the CLI only; benchmarks MUST remain single-device.
## Derived Artifacts (Clarification)
- Generated diagrams under `docs/diagrams/` are **derived artifacts**, not production code.
- Creating or updating files in `docs/diagrams/`:
- does NOT count as a production code change,
- does NOT require Phase 2 approval,
- MUST be consistent with SPEC.md and ADRs.
## Enforcement Defaults
- If unsure whether a change is non-trivial → treat it as non-trivial.
- If unsure whether Phase 2 is allowed → STOP and ask.
- SPEC.md and ADRs are the final authority.
- runtime API MUST NOT hardcode topology/routing or internal hop sequences.
- sim_engine MUST remain independent of runtime API semantics (no tensor/kernel policy logic).
+13
View File
@@ -0,0 +1,13 @@
# Python Project (VS Code Template)
## Quick start
1. Create venv + install dev deps (editable):
- VS Code: Run Task → `deps: install`
2. Run tests:
- VS Code: Run Task → `Pytest: Run All (terminal)`
3. Lint / format:
- `lint`, `format` tasks
## Structure
- `src/` app code
- `tests/` pytest
+327
View File
@@ -0,0 +1,327 @@
# KernBench System-Level Simulator — SPEC
This document defines the architectural contract for the KernBench
system-level discrete-event simulator for our AI Accelerator SIP-based systems.
All implementations, tests, and changes MUST conform to this SPEC.
---
## 0. Goal
Build a **system-level, discrete-event simulator** to evaluate the performance of
**LLM kernels running on our AI Accelerator SIP-based systems**, under varying
**SIP architectures, topologies, and interconnect configurations**.
The simulator models **data-movement and control paths across the full hardware
hierarchy** and computes **end-to-end execution latency** for kernel executions
dispatched to Processing Elements (PEs).
Primary objectives:
- compare LLM kernel execution latency under different system configurations
- model PE↔HBM, PE↔PE, CUBE↔CUBE, and SIP↔SIP communication and control paths
- guarantee deterministic, verifiable behavior with strong debuggability
- support visual inspection of the modeled system at multiple abstraction levels
---
## 0.1 Golden Invariants (Must NOT be violated)
- End-to-end latency is computed **strictly by explicit traversal** over modeled
components and links.
- Every routed request MUST incur **latency > 0**.
- Routing decisions MUST be **deterministic** given
(topology + routing policy + request).
- All valid request flows MUST have explicit connectivity in the model.
- No hidden shortcuts, implicit bypasses, or magic paths are allowed.
- Architectural decisions documented in ADRs override local optimizations.
---
## 0.2 Architectural References (ADRs)
Major architectural decisions are documented in ADRs and referenced by number.
- ADR-0001: PhysAddr layout & address decoding contract
- ADR-0002: Routing distance, ordering, and bypass rules
- ADR-0003: Target system hierarchy & modeling scope (Tray / SIP / CUBE / PE / IO chiplet)
- ADR-0004: Memory semantics & local-HBM bandwidth guarantee contract
- ADR-0005: Diagram views (SIP / CUBE / PE) and distance-aware layout rules
- ADR-0006: Topology compilation, distance extraction, and automatic diagram generation
- ADR-0007: runtime_api vs sim_engine responsibility boundaries
- ADR-0008: Tensor deployment and allocation (Host allocator, PA-first)
- ADR-0009: Kernel execution fan-out and completion semantics
- ADR-0010: CLI device selection and multi-device execution semantics
- ADR-0011: Memory addressing simplification (PA-first)
- ADR-0012: Host ↔ IO_CPU message schema (PA-first, PE-tagged shards)
- ADR-0013: Verification strategy and Phase 1 test plan
SPEC MUST remain consistent with accepted ADRs.
---
## 1. Core Requirements
### R1. Correct Routing and Control Path
- A request MUST traverse the correct sequence of components based on:
- source location,
- destination address or placement tags,
- routing policy and available topology connectivity.
- Local vs remote traffic MUST be distinguishable:
- same SIP vs different SIP,
- same CUBE vs different CUBE,
- (optional) same PE-group vs cross PE-group.
- Routing behavior MUST be reproducible and deterministic.
---
### R2. Latency is Computed by Traversal
End-to-end latency is the sum of:
- per-node fixed latency (processing / router delay),
- per-link latency (fixed and/or size-aware serialization: bytes / BW),
- per-service latency (e.g., memory controller service time).
The simulator MUST:
- support both fixed and size-aware latency,
- emit hop-by-hop traces with timestamps and component identifiers.
---
### R3. Topology is Configurable and Variable
Topology MUST NOT be hardcoded.
The simulator MUST accept multiple topologies (YAML / JSON / dict), varying:
- SIP count,
- CUBE count per SIP,
- PE count per CUBE,
- on-chip fabric structure (e.g., mesh / NoC / XBAR),
- IO chiplets and interconnects,
- link bandwidth, latency, and capacity parameters.
Given a topology:
- all required request flows MUST have valid connectivity,
- missing links are a topology construction error, not a routing error.
---
### R4. DI-First Component Design (Swappable Implementations)
All components MUST be replaceable behind stable interfaces, including:
- routers and fabrics (NoC, bridges, switches),
- XBAR-like selectors,
- DMA engines and queues,
- memory controllers and services (HBM, TCM, queues),
- management and control processors (modeled components).
The simulator MUST:
- use dependency injection (DI) to bind node specifications to implementation classes,
- allow component swapping without changing test logic,
- avoid leaking routing or policy logic into unrelated components.
---
### R5. Multi-Domain Communication Modeling
The simulator MUST model communication across hierarchical domains, including:
- PE ↔ local HBM
- PE ↔ remote HBM in the same CUBE
- PE ↔ remote HBM in other CUBEs within the same SIP
- PE ↔ remote HBM in other SIPs
- PE ↔ PE messaging (e.g., IPCQ)
- PE ↔ IO chiplets
- CUBE ↔ CUBE (e.g., via UCIe)
- SIP ↔ SIP (e.g., via PCIe or UAL)
Policy-based bypass is allowed ONLY if:
- the bypass path is explicitly represented in the model,
- the bypass incurs non-zero latency,
- the bypass is visible in traces and diagrams.
---
### R6. Verification-Driven Development
Development MUST follow a verification-driven workflow:
- behavior is validated by tests with meaningful input cases,
- tests encode SPEC-defined invariants, not incidental implementation details,
- changes without clear verification coverage are not allowed.
---
### R7. Runtime API
The simulator MUST provide a host-facing runtime API that:
- exposes tensor deployment and kernel execution operations,
- submits requests only to endpoint components (e.g., IO_CPU),
- owns host-side tensor handles and allocation metadata as PA shard maps,
- remains topology-agnostic and does not perform routing or fan-out.
Tensor deployment in Phase 0 produces **device physical-address (PA) shard mappings**.
Each shard explicitly identifies its target `(sip, cube, pe)` and PA range.
No separate host-visible allocation RPC (e.g., AllocateTensorMeta) exists.
---
### R8. Simulation Engine
The simulator MUST include a discrete-event simulation engine that:
- injects requests into the system graph,
- schedules events deterministically,
- tracks completion via correlation identifiers,
- decomposes runtime API operations into explicit graph requests
(e.g., MemoryWrite, MemoryRead, KernelLaunch).
---
### R9. CLI Execution Semantics
The CLI MUST support executing benchmarks:
- on a specified device.
Benchmarks are executed once per invocation within a single simulation instance.
If multiple devices are present in the topology, a benchmark MAY interact with
multiple devices internally, but the CLI does not launch multiple independent
benchmark instances by default.
---
### R10. Memory Addressing (Phase 0)
In Phase 0, the simulator uses a **PA-first memory model**:
- All memory operations use device physical addresses (PA) only.
- Virtual addressing, MMU/IOMMU, and address translation latency are out of scope.
- Tensor placement is represented as a list of PA shards, each explicitly tagged
with `(sip, cube, pe)`.
All memory access latency MUST be modeled explicitly via graph traversal.
No implicit translation or hidden latency is allowed.
---
## 2. Model Concepts
### 2.1 Graph Execution Model
- Nodes represent modeled components (PE blocks, XBAR, NoC, bridges,
HBM controllers, IO components, etc.).
- Directed edges represent interconnect links with latency and bandwidth attributes.
- Execution model:
- a node receives a request,
- incurs node or service latency,
- emits the request to the next hop via a link,
- repeats until the destination service completes.
---
### 2.2 Routing
Routing MAY be implemented as:
- policy-based routing (code-driven),
- routing tables (config-driven),
- topology-driven routing (e.g., mesh XY),
- or a hybrid approach.
Routing MUST:
- consume decoded address domains or explicit placement tags,
- operate only on explicit topology connectivity,
- remain deterministic.
Kernel execution requests reference tensors via PA shard mappings.
Each shard explicitly identifies its target PE, allowing IO_CPU to
deterministically fan-out execution without relying on PA decoding.
---
## 3. Inputs and Identity
### 3.1 Node Identity Scheme
Nodes MUST have stable, parsable identifiers sufficient for domain inference
and trace-based debugging.
Example patterns:
- `tray.host_cpu`
- `sip{S}.io{I}.pcie_ep`
- `sip{S}.cube{C}.fabric`
- `sip{S}.cube{C}.pe{P}`
- `sip{S}.cube{C}.hbm_ctrl`
---
### 3.2 Link Specifications
A link MAY include:
- fixed latency (ns),
- bandwidth (GB/s) for serialization latency,
- optional capacity for contention modeling.
Topology builders MUST ensure:
- required links exist,
- link parameters are consistent with topology intent.
---
## 4. Output, Debuggability, and Diagrams
The simulator MUST provide:
- per-request hop-by-hop traces with timestamps,
- clear error messages for missing connectivity
(e.g., "no link for A → B"),
- reproducible, inspectable representations of the modeled system.
Diagrams are **derived artifacts** of the simulator model:
- They MUST be generatable from the **compiled topology** and **distance metadata**
used by execution and routing.
- Generation MAY be performed lazily or cached by the implementation,
as long as outputs remain consistent with the compiled topology.
Diagram abstraction levels and distance-aware layout rules are defined in ADR-0005.
Automatic diagram generation and output conventions are defined in ADR-0006.
By default, generated diagrams are written under:
- `docs/diagrams/`
---
## 5. Non-Goals (for now)
The following are explicitly out of scope:
- cycle-accurate microarchitecture modeling,
- detailed cache coherence protocols,
- full PCIe / CXL protocol correctness.
These MAY be layered later via additional components and policies.
---
## 6. Decision Boundaries
- SPEC.md defines architectural intent and invariants.
- Code implements SPEC and MUST NOT introduce hidden invariants.
- Tests validate SPEC-defined behavior and MUST NOT encode fixed topology assumptions.
- ADRs record non-trivial architectural decisions and MUST be referenced when relevant.
View File
+2
View File
@@ -0,0 +1,2 @@
def run(ctx):
    """Print the banner line for the IPCQ all-reduce kernel bench.

    Args:
        ctx: Accepted for the bench entry point; not used by this function.
    """
    banner = "IPCQ all reduce kernel bench"
    print(banner)
+37
View File
@@ -0,0 +1,37 @@
from __future__ import annotations
import importlib
from collections.abc import Callable
from typing import Any
from kernbench.runtime_api.context import RuntimeContext
BenchFn = Callable[[RuntimeContext], Any]
def resolve_bench(bench_id: str) -> BenchFn:
    """
    Resolve a bench id into a callable bench function.

    Expected layout (repo root):
      benches/<bench_id>.py
        def run(ctx: RuntimeContext) -> Any

    Args:
        bench_id: Bench name; surrounding whitespace is ignored.

    Returns:
        The ``run`` callable defined by ``benches.<bench_id>``.

    Raises:
        ValueError: If the id is empty, the bench module does not exist, the bench
            module exists but one of its own imports is missing, or the module has
            no callable ``run`` attribute.
    """
    bench_id = bench_id.strip()
    if not bench_id:
        raise ValueError("Bench id is empty.")

    module_path = f"benches.{bench_id}"
    try:
        mod = importlib.import_module(module_path)
    except ModuleNotFoundError as e:
        # ModuleNotFoundError is also raised when the bench module exists but imports
        # something that is not installed. Only report "Unknown bench" when the missing
        # module is the bench itself or one of its parent packages.
        missing = e.name
        bench_is_missing = (
            missing is None
            or missing == module_path
            or module_path.startswith(f"{missing}.")
        )
        if bench_is_missing:
            raise ValueError(
                f"Unknown bench '{bench_id}'. Expected module {module_path}.py"
            ) from e
        raise ValueError(
            f"Bench '{bench_id}' ({module_path}) could not be imported: "
            f"missing dependency '{missing}'."
        ) from e

    run_fn = getattr(mod, "run", None)
    if run_fn is None:
        raise ValueError(f"Bench module {module_path} must define a 'run(ctx)' function.")
    if not callable(run_fn):
        raise ValueError(f"'run' in {module_path} is not callable.")
    return run_fn
+39
View File
@@ -0,0 +1,39 @@
"""QKV GEMM benchmark: Q*K^T projection on a single PE.
Demonstrates the full host-to-PE kernel launch pipeline:
Host → PCIE_EP → IO_CPU → M_CPU → NOC → PE_CPU → PE_SCHEDULER → engines
Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait()
- Tensor a is loaded into TCM via DMA
- Tensor b stays in HBM; PE_SCHEDULER streams it per-tile (32x64x32)
"""
from kernbench.policy.placement.dp import DPPolicy
# GEMM dimensions: (M, K) x (K, N) → (M, N)
M, K, N = 128, 256, 128
DTYPE = "f16"
def _gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"):
"""QKV GEMM kernel: out = a @ b.
a is loaded into TCM (DMA_READ).
b is referenced in HBM (tl.ref, no DMA — scheduler streams per-tile).
"""
a = tl.load(a_ptr, shape=(M, K), dtype=DTYPE)
b = tl.ref(b_ptr, shape=(K, N), dtype=DTYPE)
handle = tl.composite(op="gemm", a=a, b=b, out_ptr=out_ptr)
tl.wait(handle)
def run(ctx):
    """Allocate the GEMM operands through ``ctx`` and launch the ``qkv_gemm`` kernel.

    ``a`` and ``b`` are created with ``ctx.zeros`` and ``out`` with ``ctx.empty``;
    each receives its own ``DPPolicy`` describing cube/PE placement.
    """
    # a: replicated at both cube and PE level.
    a_policy = DPPolicy(cube="replicate", pe="replicate")
    a = ctx.zeros((M, K), dtype=DTYPE, dp=a_policy, name="a")

    # b: replicated across cubes, column_wise across PEs.
    b_policy = DPPolicy(cube="replicate", pe="column_wise")
    b = ctx.zeros((K, N), dtype=DTYPE, dp=b_policy, name="b")

    # out: same placement arguments as b.
    out_policy = DPPolicy(cube="replicate", pe="column_wise")
    out = ctx.empty((M, N), dtype=DTYPE, dp=out_policy, name="out")

    # Launch the GEMM kernel with the tensors and the problem dimensions.
    ctx.launch("qkv_gemm", _gemm_kernel, a, b, out, M, K, N)
+39
View File
@@ -0,0 +1,39 @@
"""QKV GEMM benchmark: Q*K^T projection on all PEs in a cube (multi-PE).
Column-parallel GEMM: a is replicated (cube-level), b/out are column-sharded.
M_CPU fans out KernelLaunchMsg to all 8 PE_CPUs (ADR-0009 D3).
Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait()
- Tensor a is loaded into TCM via DMA
- Tensor b stays in HBM; PE_SCHEDULER streams it per-tile (32x64x32)
"""
from kernbench.policy.placement.dp import DPPolicy
# GEMM dimensions: (M, K) x (K, N) -> (M, N)
M, K, N = 128, 256, 128
DTYPE = "f16"
def _gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"):
"""QKV GEMM kernel: out = a @ b.
a is loaded into TCM (DMA_READ).
b is referenced in HBM (tl.ref, no DMA -- scheduler streams per-tile).
"""
a = tl.load(a_ptr, shape=(M, K), dtype=DTYPE)
b = tl.ref(b_ptr, shape=(K, N), dtype=DTYPE)
handle = tl.composite(op="gemm", a=a, b=b, out_ptr=out_ptr)
tl.wait(handle)
def run(ctx):
    """Allocate the GEMM operands through ``ctx`` and launch ``qkv_gemm_multi``.

    ``a`` and ``b`` come from ``ctx.zeros`` and ``out`` from ``ctx.empty``; each is
    given its own ``DPPolicy`` describing cube/PE placement.
    """
    # a: replicated at both cube and PE level.
    replicated = DPPolicy(cube="replicate", pe="replicate")
    a = ctx.zeros((M, K), dtype=DTYPE, dp=replicated, name="a")

    # b: replicated across cubes, column_wise across PEs (N-axis split).
    b_columns = DPPolicy(cube="replicate", pe="column_wise")
    b = ctx.zeros((K, N), dtype=DTYPE, dp=b_columns, name="b")

    # out: same placement arguments as b.
    out_columns = DPPolicy(cube="replicate", pe="column_wise")
    out = ctx.empty((M, N), dtype=DTYPE, dp=out_columns, name="out")

    # Launch the GEMM kernel with the tensors and the problem dimensions.
    ctx.launch("qkv_gemm_multi", _gemm_kernel, a, b, out, M, K, N)
+108
View File
@@ -0,0 +1,108 @@
# ADR-0001: PhysAddr Layout & Address Decoding Contract
## Status
Accepted
## Date
2026-02-27
## Context
KernBench Graph Latency Simulator must route requests deterministically and compute end-to-end latency strictly by graph traversal.
To model local vs remote traffic (same/different SIP, same/different CUBE, optional PE-group), requests need a stable, parsable address/location scheme that:
- can be decoded into routing domains (SIP/CUBE/HBM/PE-resource, etc.)
- remains topology-agnostic (no hardcoded counts)
- supports swappable policy and DI-first components without leaking topology assumptions into node implementations
## Decision
We define a **PhysAddr value object** and an **address decoding contract** that converts an integer address into routing domains.
### D1. PhysAddr is an immutable value object
- PhysAddr is immutable and comparable as a pure value.
- Any allocator returns a **fully specified PhysAddr** (not partial metadata).
- No global state may be required to interpret a PhysAddr.
### D2. PhysAddr fields (logical contract)
PhysAddr must be able to represent at least:
- `rack_id` (optional but reserved for scale-out)
- `sip_id` (device / SIP domain)
- `sip_seg` (SIP-level segment/window selection, e.g., cube window)
- `local_offset` (offset within the chosen segment/window)
Decoded/derived fields may include (optional):
- `cube_id`
- `kind` (e.g., HBM vs PE-resource vs raw)
- `unit_type` / `pe_id` (if PE-level addressing is modeled)
**Important:** The exact bit allocation may evolve, but the *semantic fields above* must remain decodable without hidden assumptions.
### D3. Decoding is deterministic and policy-compatible
- Decoding must deterministically map an integer address to:
- destination SIP domain (`sip_id`)
- destination sub-domain (`cube_id` if applicable)
- destination target kind (HBM/PE-resource/other)
- Decoding must not depend on runtime topology sizes; it may depend on **explicit topology parameters** provided through configuration (e.g., segment size, slice size), and those parameters must live in the topology/config layer (not in random components).
### D4. Topology-derived constants live in the topology layer
Constants such as segment sizes (e.g., HBM slice size / window size) are derived from topology configuration (YAML/JSON/dict) and are provided to the decoder via DI/config.
They must not be hardcoded in node implementations.
### D5. Routing consumes decoded domains, not raw bits
Routing policy uses decoded domains:
- `src` location (sip/cube/pe or node_id)
- `dst` domains derived from PhysAddr decoding
- `size_bytes` for size-aware link latency
Routing must not inspect raw bit-fields directly except inside the decoding module.
## Alternatives Considered
1) **Use raw integers everywhere, decode ad-hoc in routing**
- Rejected: leads to duplicated logic, inconsistent routing, and hidden assumptions embedded in multiple components.
1) **Hardcode topology sizes (SIP/CUBE/PE counts) into decoding**
- Rejected: violates SPEC (R3) and breaks swappability and configuration-driven topologies.
1) **Put decoding inside memory controllers or routers**
- Rejected: leaks policy into components and undermines DI-first, swappable implementations (SPEC R4).
## Consequences
### Positive
- Deterministic routing domains enable clear test invariants for local vs remote paths (SPEC R1, R5).
- Keeps topology variability (SPEC R3) while preserving consistent semantics.
- DI-first: decoder can be swapped or extended without changing components or tests (SPEC R4).
### Tradeoffs / Costs
- Requires explicit configuration for any topology-derived sizes.
- Introduces a single “blessed” decoding module that must remain stable and well-tested.
## Implementation Notes (Non-normative)
- Recommended module boundary:
- `src/kernbench/policy/address/phyaddr.py`
- Tests should cover:
- deterministic decoding
- local vs remote classification from decoded fields
- invariants: “allocator returns full PhysAddr”, “decoding requires no global state”
## Links
- SPEC.md: R1 (routing), R3 (configurable topology), R4 (DI-first), R5 (multi-domain comm)
+103
View File
@@ -0,0 +1,103 @@
# ADR-0002: Routing Distance, Ordering & Bypass Rules
## Status
Accepted
## Date
2026-02-27
## Context
The KernBench Graph Latency Simulator must compare kernel execution time
across different architectures and topologies by computing end-to-end
latency from graph traversal.
To support meaningful comparison:
- routing must be deterministic
- latency must reflect actual interconnect structure
- local vs remote traffic must be distinguishable
- “bypass” optimizations must not undermine debuggability or correctness
The simulator also aims to avoid software-managed metadata and hidden
shortcuts that obscure control paths.
## Decision
### D1. Distance is accumulated latency, not hop count
- Routing “distance” is defined as the **sum of per-node and per-link latency**.
- Hop count alone must not be used for ordering or path selection.
- Size-aware serialization latency (bytes / BW) contributes to distance.
### D2. Routing order is derived from graph traversal
- The chosen route is the path with minimum accumulated latency
given the constructed graph and routing policy.
- Deterministic ordering must be guaranteed for identical inputs
(topology + policy + request).
### D3. Bypass is explicit and graph-represented
- Any bypass (e.g., local cube HBM access via XBAR instead of NOC) must be:
- explicitly represented as a graph path, and
- subject to latency accumulation like any other path.
- Example: PE_DMA has dual egress — one to XBAR (HBM path) and one to NOC (non-HBM path).
Both are explicit graph edges; neither is a “bypass” — they are distinct data paths
serving different memory domains.
- Implicit or “magic” bypass paths are disallowed.
### D4. No zero-latency end-to-end paths
- Every routed request must incur **end-to-end** latency > 0.
- Individual fabric segments (e.g., NOC hops) MAY have distance_mm = 0
when the fabric is distributed and distance is not meaningful at that granularity.
This is allowed because other components on the same path (e.g., PE_DMA, SRAM,
UCIe endpoints) contribute non-zero latency, ensuring the end-to-end invariant holds.
- Fully zero-latency end-to-end paths are disallowed, except for explicit
test-only stubs clearly marked as such.
### D5. Policy vs topology responsibility split
- Topology builder:
- defines nodes and links and their latency/BW parameters
- Routing policy:
- selects among available graph paths based on decoded domains
- Routing policy must not assume missing links; missing connectivity
is a topology construction error.
### D6. No software-managed routing metadata
- Routing decisions must not rely on per-request software-managed metadata
that tracks distance, hop count, or ordering outside the graph model.
- All distance/order computation is derived from traversal itself.
## Alternatives Considered
1) **Hop-count based routing**
- Rejected: ignores heterogeneous latency/BW and misrepresents
architectural differences.
2) **Implicit local shortcuts**
- Rejected: breaks debuggability and violates traversal-based latency.
3) **Software-managed distance metadata**
- Rejected: increases control overhead and obscures routing semantics.
## Consequences
### Positive
- Clear, debuggable hop-by-hop traces (SPEC R2, R4).
- Architecture comparisons reflect real interconnect structure.
- Routing behavior is reproducible and deterministic.
### Tradeoffs / Costs
- Graph construction must be correct and complete.
- Bypass modeling requires explicit graph representation,
which slightly increases topology description complexity.
## Implementation Notes (Non-normative)
- Recommended responsibilities:
- Graph builder: ensure all required paths exist.
- Router: select next hop based on decoded domains and policy.
- Tests should assert:
- non-zero end-to-end latency
- deterministic routing for identical inputs
- bypass paths appear explicitly in emitted traces
## Links
- SPEC.md: R1 (routing), R2 (latency), R3 (topology), R5 (multi-domain comm)
- ADR-0001: PhysAddr layout & decoding contract
@@ -0,0 +1,64 @@
# ADR-0003: Target System Hierarchy & Modeling Scope
## Status
Accepted
## Context
We need a system-level simulator to evaluate LLM kernel performance on our AI Accelerator platform.
The platform is organized as a compute tray containing multiple identical SIPs connected via PCIe or UAL
through switching fabrics, with a host CPU issuing commands/kernels.
## Decision
We model the system hierarchy explicitly:
### D1. Tray-level
- A compute tray contains:
- Host CPU (issues requests / coordinates runtime & data placement)
- Multiple identical SIPs (accelerators)
- Interconnect fabric between SIPs (PCIe and/or UAL via switches)
### D2. SIP-level
- A SIP is a multi-die package composed of:
- Multiple CUBEs (HBM die + compute PEs + UCIe)
- One or more IO chiplets (host/SIP interfaces)
- IO chiplets:
- provide interfaces: PCIe-EP, IO_CPU, optionally UAL-EP
- can be multiple per SIP
- placement constrained to SIP shoreline (top/bottom/left/right); each shoreline may host 1–2 IO chiplets
### D3. CUBE-level
- A CUBE contains:
- HBM + memory controller (HBM_CTRL)
- XBAR (top/bottom): HBM pseudo-channel crossbar, PE's dedicated path to HBM
- Bridge (left/right): connects XBAR.top ↔ XBAR.bottom for cross-half HBM access
- NOC: distributed on-die fabric spanning the entire cube (distance modeled as 0);
carries non-HBM traffic including inter-cube (UCIe), command (M_CPU↔PE_CPU), and shared SRAM access
- Shared SRAM: cube-level shared memory accessible by all PEs via NOC
- management/control CPU (M_CPU) coordinating PE command distribution and completion aggregation
- multiple PEs
- up to 4 UCIe endpoints (N/E/W/S) for CUBE↔CUBE and CUBE↔IO connectivity
### D4. PE-level
- A PE can execute one kernel instance
- PE contains internal control + accelerators (modeled at PE view granularity):
- PE_CPU, command handler, PE_TCM, DMA/GEMM/MATH engines, internal queues
## Consequences
- The simulator supports abstraction by “views”:
- SIP view hides PE internals
- CUBE view treats each PE as a single block
- PE view expands PE internals
- Topology remains parameterized; sizes/counts/links come from configuration.
## Links
- SPEC R3/R5
- ADR-0005 (diagram views)
@@ -0,0 +1,64 @@
# ADR-0004: Memory Semantics & Local-HBM Bandwidth Guarantee
## Status
Accepted
## Context
Accurately modeling PE↔HBM behavior is essential for kernel latency estimation.
Each PE has a notion of “local HBM” that must guarantee full HBM bandwidth, independent of intervening on-die fabric bandwidth.
## Decision
### D1. Local HBM definition
- Each PE is assigned a logically defined “local HBM” region.
- Local HBM corresponds to the pseudo-channel subset directly attached to that PE's DMA path
via the XBAR (top or bottom, depending on PE corner placement).
- The path is: PE_DMA → XBAR.top/bottom → HBM_CTRL.
- The mapping (HBM pseudo-channels → PE local regions) is derived from topology configuration.
### D2. Local HBM bandwidth guarantee contract
- Accesses from a PE to its local HBM MUST guarantee full HBM read/write bandwidth
independent of intervening fabric bandwidth limits.
- This guarantee is modeled by:
- a dedicated logical path and/or service model that enforces HBM BW at the PE-local-HBM interaction point,
- while still incurring non-zero latency along explicitly modeled components.
### D3. Cross-half HBM semantics
- A PE connected to XBAR.bottom that accesses HBM pseudo-channels on the XBAR.top half
(or vice versa) traverses a bridge:
- PE_DMA → XBAR.bottom → bridge → XBAR.top → HBM_CTRL
- Bridge bandwidth may limit cross-half HBM access relative to local-half access.
### D4. Non-local HBM semantics (inter-cube / inter-SIP)
- Accesses from a PE to HBM in a different cube or SIP MAY be limited by:
- NOC bandwidth within the cube,
- inter-cube UCIe links,
- inter-SIP fabric (PCIe/UAL).
- These paths MUST be explicit and traceable.
### D5. Shared SRAM semantics
- Each CUBE contains a shared SRAM accessible by all PEs in that CUBE.
- Access path: PE_DMA → NOC → shared SRAM.
- Shared SRAM bandwidth is limited by the NOC↔SRAM link bandwidth.
- Shared SRAM is not part of the HBM address space; it is a separate memory domain.
## Verification Notes
Tests should cover:
- local-HBM case: BW matches HBM BW regardless of fabric BW parameter
- cross-half HBM case: latency includes bridge traversal
- non-local cases (inter-cube/inter-SIP): BW/latency respond to fabric/link parameters
- shared SRAM case: access via NOC with correct BW
## Links
- SPEC R2/R5
- ADR-0002 (distance/order & explicit bypass)
@@ -0,0 +1,186 @@
# ADR-0005: Diagram Views & Distance-Aware Layout Rules
## Status
Accepted
## Context
We require verifiable and inspectable system modeling for a large-scale,
parameterized AI Accelerator system.
Humans must be able to:
- visually inspect the modeled topology,
- reason about communication structure and relative distance,
- do so at multiple abstraction levels without being overwhelmed by detail.
The simulator models distance (accumulated latency) as a first-class concept.
Diagrams must reflect this distance by default.
---
## Global Defaults
- All diagrams MUST be **distance-aware by default**.
- All diagrams MUST render **representative views** of the architecture.
- Instance indices (e.g., sip0, cube2, pe3) MUST NOT be required for diagram generation.
- Instance indices MAY be used ONLY:
- to define a distance anchor in asymmetric or debugging scenarios, or
- when explicitly requested.
---
## Representative Rendering Rule
- All CUBEs share the same internal structure.
- All PEs share the same internal structure.
Therefore:
- SIP-level diagrams render representative CUBEs and IO chiplets.
- CUBE-level diagrams render representative PEs as opaque blocks.
- PE-level diagrams render a representative PE with fully expanded internals.
Diagrams MUST NOT depend on specific SIP, CUBE, or PE indices
unless explicitly requested.
---
## Diagram Views
### View A — SIP-Level Diagram
**Purpose**
Explain system-scale structure and connectivity.
**Visible elements**
- SIP boundaries (optional)
- CUBEs (opaque blocks)
- IO chiplets (opaque blocks)
- Optional UCIe stubs only if needed to clarify connectivity
**Hidden elements**
- PE internals
- CUBE internal fabric
- IO chiplet internals
**Visible links**
- Host ↔ IO chiplets (PCIe)
- SIP ↔ SIP (PCIe / UAL via switches)
- IO ↔ CUBE (on-package links)
---
### View B — CUBE-Level Diagram
**Purpose**
Explain cube-internal structure and data/control flow.
**Visible elements**
- XBAR (top/bottom): HBM pseudo-channel crossbar
- Bridge (left/right): cross-half HBM connectors between XBAR.top and XBAR.bottom
- NOC: distributed on-die fabric for non-HBM traffic
- HBM subsystem (HBM_CTRL)
- Shared SRAM: cube-level shared memory
- Management CPU (M_CPU)
- PEs as opaque blocks (PE[0..N-1])
- UCIe endpoints (N/E/W/S) as ports
**Hidden elements**
- PE internals
**Visible links**
- PE → XBAR (HBM data path, top or bottom by corner placement)
- PE → NOC (non-HBM data path)
- XBAR ↔ bridge ↔ XBAR (cross-half HBM access)
- XBAR → HBM_CTRL
- NOC ↔ UCIe endpoints
- NOC ↔ shared SRAM
- M_CPU ↔ NOC (command path)
- NOC → PE_CPU (command delivery, collapsed into PE block)
---
### View C — PE-Level Diagram
**Purpose**
Explain internal PE behavior and execution structure.
**Visible elements**
- PE_CPU
- Command handler / scheduler
- PE_TCM (local SRAM)
- HW accelerators (DMA, GEMM, MATH, etc.)
- Local HBM interface
- Optional IPCQ / messaging endpoints
**Visible links**
- Control paths (CPU → scheduler → engines)
- Data paths (engines ↔ TCM, DMA ↔ local HBM)
- External fabric ports as abstract ports only
---
## Distance-Aware Layout (Default)
### Distance definition
- Distance is defined as **accumulated latency**, consistent with ADR-0002.
- Distance is computed from a single anchor node.
### Default anchor selection
- SIP view: IO chiplet (or Host CPU if present)
- CUBE view: a representative PE
- PE view: PE_CPU or Command Handler
Anchors are **implicit defaults** and MUST NOT be required to be specified.
### Layout rules
- Diagrams MUST be laid out in layers based on distance buckets.
- Layout direction MUST be consistent within a view type
(preferred: left-to-right).
- Nodes with equal distance MUST have stable ordering
(by role or identifier, deterministically).
Cycles MAY be rendered using dashed or curved edges for readability,
without affecting distance semantics.
---
## Generation Contract (for Tools / Claude Code)
When generating diagrams:
- Assume distance-aware layout by default.
- Assume representative rendering by default.
- Do NOT ask for SIP/CUBE/PE indices unless required.
- Do NOT expand hidden abstraction levels.
- Prefer architectural clarity over micro-hop fidelity.
---
## Consequences
- Diagrams are stable across topology scaling.
- Changes in distance or routing policy are reflected visually.
- Diagrams serve as verifiable artifacts derived from the simulator model,
not as hand-maintained documentation.
---
## Links
- SPEC Section 4 (Output, Debuggability, and Diagrams)
- ADR-0002 (Routing distance semantics)
- ADR-0006 (Topology compilation & automatic diagram generation)
@@ -0,0 +1,130 @@
# ADR-0006: Topology Compilation, Distance Extraction, and Automatic Diagram Generation
## Status
Accepted
## Context
The simulator compiles topology configuration (e.g., topology.yaml) into an explicit model graph,
and computes routing and accumulated latency (distance).
Diagrams should be generated from these authoritative artifacts to ensure consistency and avoid
hand-maintained topology drawings.
Additionally, for usability, diagrams should be emitted automatically into a stable location
so that developers can preview them immediately in the repository.
---
## Decision
### D1. Topology compilation is the single source of truth
- topology.yaml (or equivalent config) is compiled into:
- an explicit system graph,
- node/link attributes,
- routing policies.
This compiled graph is the authoritative representation of the system.
### D2. Distance extraction during compilation
- During or immediately after topology compilation, the simulator MUST compute distance metadata
(accumulated latency) consistent with ADR-0002.
- Distance metadata MUST be sufficient to support distance-aware diagram layout as defined in ADR-0005.
- Distributed fabric segments (e.g., NOC) MAY have distance_mm = 0 per ADR-0002 D4;
layout placement for such nodes uses explicit position metadata rather than distance buckets.
### D3. Diagram generation is a derived artifact
- Diagrams MUST be generated from:
- the compiled topology graph,
- extracted distance metadata,
- view/layout rules defined in ADR-0005.
- Diagram generation MUST NOT require additional hand-written topology descriptions.
### D4. Automatic diagram emission to the repository
- As part of topology compilation, the implementation MUST produce the following diagrams by default:
- SIP-level diagram (representative, distance-aware)
- CUBE-level diagram (representative, distance-aware)
- PE-level diagram (representative, distance-aware)
- The default output directory is:
- `docs/diagrams/`
- The generator MUST overwrite/update only when the compiled topology (or diagram rules) changes.
### D5. View-specific projection and layout
For each view (SIP / CUBE / PE):
- The generator MUST project the compiled graph into a reduced view graph:
- hide/collapse nodes according to ADR-0005,
- preserve connectivity semantics relevant to that view,
- compute distance buckets and assign layout layers deterministically.
- CUBE-level projection MUST include:
- XBAR (top/bottom), bridge (left/right), NOC, HBM_CTRL, shared SRAM, M_CPU, UCIe ports,
and PEs as opaque blocks.
- Distinct edge kinds for HBM path (PE→XBAR) vs non-HBM path (PE→NOC).
- Default anchors are implicit (ADR-0005) and MUST NOT require instance indices.
### D6. Output formats and determinism
- The generator MUST output at least one of:
- Mermaid (Markdown-native)
- Graphviz DOT (rank-based control)
- SVG (mm-accurate layout, no external dependencies)
- SVG is preferred when mm-accurate position metadata is available from the compiled topology.
- Output MUST be deterministic:
- same topology + same rules → identical diagram text
- File naming MUST be deterministic and stable (see "Output Conventions").
### D7. Performance and caching
- Diagram generation MAY be lazy and/or cached, as long as the outputs in `docs/diagrams/`
remain consistent with the compiled topology.
- The implementation SHOULD use a cache key based on:
- topology content hash,
- routing policy version,
- diagram rules version,
- view type (SIP/CUBE/PE).
---
## Output Conventions
### Directory
- `docs/diagrams/` is the canonical output directory for generated diagrams.
### File names (recommended, deterministic)
- `system_view.svg` / `system_view.mmd` / `system_view.dot`
- `sip_view.svg` / `sip_view.mmd` / `sip_view.dot`
- `cube_view.svg` / `cube_view.mmd` / `cube_view.dot`
- `pe_view.svg` / `pe_view.mmd` / `pe_view.dot`
Optionally, for multi-topology workflows:
- `sip_view__{topology_id}.svg`
- `cube_view__{topology_id}.svg`
- `pe_view__{topology_id}.svg`
### Repository policy
- Generated diagram files MAY be committed to the repository to enable diff-based review.
- If committed, they MUST be reproducible from topology compilation.
---
## Consequences
- Diagrams are always consistent with simulator behavior.
- Architectural changes automatically propagate to visualizations.
- Diagram diffs become meaningful indicators of architectural change.
---
## Links
- SPEC Section 4 (Output, Debuggability, and Diagrams)
- ADR-0002 (Distance semantics)
- ADR-0005 (Diagram views and layout rules)
@@ -0,0 +1,89 @@
# ADR-0007: Runtime API and Simulation Engine Boundaries
## Status
Accepted
## Context
The simulator consists of multiple layers with distinct responsibilities:
- a host-facing API layer used by benchmarks and user code,
- a discrete-event simulation engine that executes requests,
- device components that model hardware behavior.
Without strict boundaries, orchestration logic can leak into components,
or simulation internals can become entangled with user-facing APIs.
This ADR defines clear responsibility boundaries between:
- runtime API,
- simulation engine (sim_engine),
- hardware components.
---
## Decision
### D1. Runtime API is host-facing orchestration only
The runtime API represents host/driver-level behavior and MUST:
- expose high-level operations (tensor deployment, kernel launch),
- submit requests only to endpoint components (e.g., IO_CPU),
- await completion via futures/handles,
- own and persist host-side metadata (tensor allocation maps, kernel bindings).
The runtime API MUST NOT:
- hardcode hop-by-hop routing or fan-out,
- directly invoke internal components (M_CPU, PE_CPU, engines),
- embed topology- or routing-specific assumptions.
---
### D2. Simulation engine executes and schedules requests
The simulation engine (sim_engine) MUST:
- inject requests into the compiled topology graph,
- schedule and execute events using a discrete-event model,
- manage correlation ids and completion tracking,
- decompose operations into low-level requests when required
(e.g., MemoryWrite events).
The simulation engine MUST NOT:
- define tensor semantics,
- define kernel execution policies,
- expose internal graph details to the runtime API.
---
### D3. Components own fan-out and aggregation
Device-side components MUST:
- fan-out requests to downstream domains
(IO_CPU → M_CPU → PE_CPU → schedulers/engines),
- aggregate completion and failure signals,
- propagate results deterministically upstream.
Neither the runtime API nor the simulation engine may orchestrate
component-level fan-out explicitly.
---
## Consequences
- Runtime APIs remain stable as topology and routing evolve.
- Simulation internals can change without affecting user-facing code.
- Component implementations remain swappable via DI.
---
## Links
- SPEC R4, R7, R8
- ADR-0008 (Tensor deployment)
- ADR-0009 (Kernel execution)
@@ -0,0 +1,100 @@
# ADR-0008: Tensor Deployment and Allocation (Host Allocator, PA-first)
## Status
Accepted
## Context
Benchmarks require PyTorch-like tensor semantics:
- tensor creation (empty, fill),
- deployment to accelerator devices (tensor.to()).
In the realistic system, host software manages allocation/mapping and installs
mappings for DMA/MMU. For Phase 0 we simplify (ADR-0011):
- device memory operations use PA only,
- VA/MMU/IOMMU is not modeled.
To keep the host↔device interface minimal, we avoid a separate
AllocateTensorMeta message. Instead, host allocation produces a PA shard map
that is used directly by MemoryWrite/Read and KernelLaunch.
---
## Decision
### D1. Tensor is a host-owned handle with PA shard mapping
A Tensor object is a host-owned handle that encapsulates:
- shape and dtype,
- initialization intent,
- device placement and allocation metadata as a PA shard map.
After deployment, the Tensor handle MUST contain:
- a list of shards, each with (sip,cube,pe,pa,nbytes,offset_bytes).
This PA shard mapping is the single source of truth for kernel argument binding.
---
### D2. Deployment uses a host allocator (Phase 0)
In Phase 0, tensor deployment produces PA shard mappings via a host allocator:
- placement (split/replicate/hybrid) is decided by a DP policy,
- allocation assigns PA ranges at the PE level and returns shard mappings,
- the Tensor handle stores the resulting shard list deterministically.
No separate host-visible device allocation RPC is required in Phase 0.
---
### D3. Data initialization and transfer uses MemoryWrite/Read only
Any data initialization or transfer implied by a tensor (e.g., fill, copy)
MUST be represented using Host ↔ IO_CPU messages only:
- MemoryWrite
- MemoryRead
Rules:
- MemoryWrite/Read MUST reference PA + (sip,cube,pe) tags (ADR-0012).
- Allocation metadata MUST NOT be embedded as a separate allocation message.
- Bulk tensor data MUST NOT be embedded in Phase 0 messages.
The simulation engine schedules MemoryWrite/Read through the graph so that
latency is computed by explicit traversal.
---
### D4. Extension path (non-breaking)
Future ADRs MAY introduce optional VA/MMU/IOMMU modeling by adding:
- virtual addressing in tensor handles,
- mapping install steps,
- translation latency/page granularity.
The Phase 0 PA shard map remains a valid fast-path configuration.
---
## Consequences
- Host↔IO_CPU contract remains minimal (MemoryRead/Write + KernelLaunch).
- KernelLaunch can pass per-PE data placement explicitly via shard tags.
- Early implementation stays simple and testable.
---
## Links
- ADR-0011 (PA-first)
- ADR-0012 (Host↔IO_CPU schema)
- ADR-0007 (runtime_api vs sim_engine boundaries)
- ADR-0009 (Kernel execution)
@@ -0,0 +1,74 @@
# ADR-0009: Kernel Execution Messaging and Completion Semantics
## Status
Accepted
## Context
Kernel execution is initiated by the host and proceeds through
device control components:
Host → IO_CPU → M_CPU → PE_CPU → schedulers → engines
Completion propagates in reverse order.
To keep benchmarks simple and topology-agnostic,
kernel execution must be endpoint-driven with deterministic aggregation.
---
## Decision
### D1. Kernel launch is an endpoint request
A kernel launch is initiated by submitting a single KernelLaunch request
to the IO_CPU endpoint.
The runtime API MUST:
- construct the kernel launch request,
- submit it to IO_CPU,
- await a single completion result.
The runtime API MUST NOT orchestrate internal fan-out.
---
### D2. Tensor arguments are passed by metadata
KernelLaunch requests MUST reference tensor arguments via:
- host-owned tensor handles, or
- resolved device address maps derived from those handles.
Bulk tensor data MUST NOT be embedded in kernel launch messages.
---
### D3. Fan-out and aggregation are component responsibilities
- IO_CPU fans out work to M_CPUs.
- M_CPU fans out work to PE_CPUs.
- PE_CPU manages kernel execution and engine dispatch.
Completion semantics:
- M_CPU completes when all targeted PEs complete or a failure policy triggers.
- IO_CPU completes when all targeted CUBEs complete or a failure policy triggers.
---
### D4. Completion and failure propagation
- All messages MUST carry correlation identifiers.
- Completion and failure MUST propagate deterministically to the host.
- The simulation engine provides futures/handles to observe completion.
---
## Links
- SPEC R1, R2, R7, R8
- ADR-0007 (Runtime API boundaries)
- ADR-0008 (Tensor deployment)
+62
View File
@@ -0,0 +1,62 @@
# ADR-0010: CLI Device Selection and Multi-Device Execution Semantics
## Status
Accepted
## Context
Benchmarks represent device-agnostic workloads that operate on a single device.
Users may want to run a benchmark:
- on a specific device, or
- across all devices in the system.
Device enumeration must not leak into benchmarks or runtime APIs.
---
## Decision
### D1. Benchmarks are single-device by design
- A benchmark MUST define behavior for a single device only.
- A benchmark MUST accept a device identifier as input.
- Benchmarks MUST NOT enumerate or loop over multiple devices.
---
### D2. CLI controls device selection
The `kernbench run` command supports an optional `--device` argument:
- If `--device <id>` is specified:
- the benchmark executes once for the specified device.
- If `--device` is omitted:
- the benchmark executes once using all the SIPs discovered in the topology.
---
### D3. Multi-device execution is logically parallel
When running on multiple devices:
- benchmark executions are submitted to a single simulation engine instance,
- executions are logically parallel in simulation time,
- inter-device contention is naturally modeled.
---
### D4. Runtime API and simulation engine remain device-scoped
- Runtime API calls operate on one device per invocation.
- The simulation engine schedules all requests deterministically.
- Neither layer enumerates devices.
---
## Links
- SPEC R7, R8
- ADR-0007 (Runtime API boundaries)
@@ -0,0 +1,65 @@
# ADR-0011: Memory Addressing Simplification (PA-first)
## Status
Accepted
## Context
A realistic system uses host-side virtual addressing and an MMU/IOMMU-style
translation path for DMA: host allocates physical memory at PE level, maps it
into a virtual address space, installs mappings, and DMA requests use virtual
addresses that are translated to physical addresses.
For early development, we want a minimal, deterministic model that enables:
- correct routing and latency accounting through the graph,
- stable tensor deployment and kernel execution semantics,
- future extension toward VA/MMU without rewriting workflows.
---
## Decision
### D1. Phase 0 model is PA-only
The simulator uses a PA-first model:
- All device memory accesses (MemoryRead/MemoryWrite) operate on device physical
addresses (PA) plus size.
- Tensor handles store PA-based shard mappings after deployment.
- KernelLaunch passes tensor arguments as PA-based mappings (or references to them).
- MMU/IOMMU concepts (virtual address spaces, page tables, translation latency)
are NOT modeled in Phase 0.
### D2. Allocation produces PA mappings
Device allocation selects PE-local memory regions and returns PA mappings
sufficient to execute kernels and issue DMA requests.
### D3. Extension path (non-breaking)
A future ADR MAY introduce an optional VA/MMU layer by:
- introducing virtual addresses in tensor handles,
- adding a mapping-install step,
- modeling translation latency and page granularity.
The Phase 0 PA model remains a valid fast-path configuration.
---
## Consequences
- Early implementation stays simple and testable.
- All latency remains explicit via graph traversal, not hidden translation.
- Future VA/MMU modeling can be added without breaking existing benchmarks.
---
## Links
- ADR-0007 (runtime_api vs sim_engine boundaries)
- ADR-0008 (tensor deployment)
- ADR-0009 (kernel execution)
- SPEC R2 (latency by traversal)
+232
View File
@@ -0,0 +1,232 @@
# ADR-0012: Host ↔ IO_CPU Message Schema (PA-first, PE-tagged)
## Status
Accepted
## Context
Phase 0 uses a PA-first memory model (ADR-0011):
- memory operations use device physical addresses (PA) only,
- VA/MMU/IOMMU is not modeled.
The host-facing runtime API interacts with the device via the IO_CPU endpoint.
We define stable, minimal message schemas for Host ↔ IO_CPU so that:
- benchmarks remain stable,
- IO_CPU-internal fan-out/aggregation can evolve independently,
- completion and failure propagation is deterministic.
We also require PE-tagging (approach A): each shard explicitly carries (sip,cube,pe)
so IO_CPU can deterministically route/fan-out without relying on PA decoding.
---
## Decision
### D1. Contract scope
This schema is the stable contract ONLY for Host ↔ IO_CPU.
Messages beyond IO_CPU (to M_CPU, PE_CPU, schedulers, engines) are component-internal
and are NOT part of this host contract in Phase 0.
---
### D2. Required message set
The runtime API MUST use only these message types for Host ↔ IO_CPU:
- MemoryWrite
- MemoryRead
- KernelLaunch
All operations required by benchmarks (tensor init/copy, kernel run) MUST be expressible
with these messages.
---
### D3. Common envelope (mandatory for all requests)
All Host ↔ IO_CPU requests MUST include:
- `msg_type: str`
- `correlation_id: str`
- generated by the host
- used to match responses deterministically
- `request_id: str`
- unique within a correlation_id
- `target_device: str`
- device identifier (e.g., "sip:0")
- `timestamp_tag: str | None` (optional)
- debug tag only; MUST NOT affect determinism
All Host ↔ IO_CPU responses MUST include:
- `correlation_id: str`
- `request_id: str`
- `completion: Completion`
---
### D4. Completion schema (mandatory)
`Completion` MUST have:
- `ok: bool`
- `error_code: str | None`
- `error_message: str | None`
Rules:
- If `ok == true` then `error_code` and `error_message` MUST be null.
- If `ok == false` then `error_code` MUST be non-null.
- Completion semantics MUST be deterministic.
---
### D5. MemoryWrite schema (PA-first, PE-tagged)
`MemoryWrite` represents a host-initiated write/initialize operation to device memory.
Mandatory fields:
- common envelope fields (D3)
- destination placement tags (approach A):
- `dst_sip: int`
- `dst_cube: int`
- `dst_pe: int`
- `dst_pa: int`
- destination physical address in the destination PE's address space
- `nbytes: int`
- `src_kind: "pattern" | "host_buffer_ref"`
- Phase 0 MUST support "pattern"
- `pattern: Pattern | None`
- required if `src_kind == "pattern"`
`Pattern` (Phase 0 mandatory support):
- `pattern_kind: "zero" | "fill_u8" | "fill_u16" | "fill_u32" | "fill_fp16" | "fill_fp32"`
- `value: number | None`
- required for fill_*; ignored for zero
Optional fields:
- `dst_mem_kind: "HBM" | "TCM" | "AUTO"` (default "AUTO")
- `debug_label: str | None`
Notes:
- This message MUST NOT embed bulk tensor data in Phase 0.
- All latency MUST come from explicit graph traversal and modeled components.
---
### D6. MemoryRead schema (PA-first, PE-tagged)
`MemoryRead` represents a host-initiated read from device memory.
Mandatory fields:
- common envelope fields (D3)
- source placement tags (approach A):
- `src_sip: int`
- `src_cube: int`
- `src_pe: int`
- `src_pa: int`
- `nbytes: int`
Optional fields:
- `dst_kind: "host_sink" | "discard"` (default "host_sink")
- `debug_label: str | None`
Response payload:
- actual bytes are NOT required in Phase 0 (latency/traces focus)
- implementations MAY return lightweight stats or hashes later via a new ADR
---
### D7. KernelLaunch schema (PA-first, PE-tagged shards)
`KernelLaunch` represents launching a kernel on a target device via IO_CPU.
Mandatory fields:
- common envelope fields (D3)
- `kernel_ref: KernelRef`
- `args: list[KernelArg]`
`KernelRef` MUST have:
- `name: str`
- `kind: "deployed" | "builtin"`
- `deploy_pa: int | None` — PA where kernel binary was deployed (required for "deployed")
- `deploy_sip: int` — SIP where binary resides
- `deploy_cube: int` — cube where binary resides
- `deploy_pe: int` — PE where binary resides
- `nbytes_code: int` — kernel binary size (for BW modeling)
Kernel binaries MUST be pre-deployed to device memory via MemoryWrite.
KernelLaunch MUST NOT embed kernel source code or IR in the launch message.
`KernelArg` supports tensor args by PA mapping and scalars by value.
Tensor arg (mandatory):
- `arg_kind: "tensor"`
- `tensor_pa_map: TensorPAMap`
`TensorPAMap` MUST have:
- `shards: list[TensorShard]`
`TensorShard` MUST have (approach A, mandatory):
- `sip: int`
- `cube: int`
- `pe: int`
- `pa: int`
- `nbytes: int`
- `offset_bytes: int`
Scalar arg (mandatory):
- `arg_kind: "scalar"`
- `dtype: "i32" | "i64" | "fp16" | "fp32" | "bool"`
- `value: number | bool`
Optional KernelLaunch fields:
- `grid: dict | None`
- `meta: dict | None`
- `failure_policy: "fail_fast" | "collect_all"` (default "fail_fast")
- `debug_label: str | None`
Notes:
- KernelLaunch MUST NOT embed bulk tensor data.
- KernelLaunch MUST be submitted only to the IO_CPU endpoint.
- IO_CPU MUST fan-out work internally using the shard (sip,cube,pe) tags.
---
## Verification Notes
Tests SHOULD validate:
- schema validation rejects missing mandatory fields,
- deterministic correlation/response matching,
- MemoryWrite/Read/KernelLaunch produce explicit hop traces,
- all routed requests incur latency > 0.
---
## Links
- ADR-0011 (PA-first memory addressing)
- ADR-0007 (runtime_api vs sim_engine boundaries)
- ADR-0009 (kernel execution fan-out/aggregation)
- SPEC R2, R7, R8
+139
View File
@@ -0,0 +1,139 @@
# ADR-0013: Verification Strategy and Phase 1 Test Plan
## Status
Accepted
## Context
KernBench is a system-level simulator whose correctness is defined by:
- adherence to SPEC-defined invariants,
- determinism and debuggability,
- explicit modeling of routing and latency.
Given the evolving implementation, we need a stable verification strategy
that prevents architectural drift while allowing incremental development.
This ADR defines the Phase 1 verification plan and what constitutes
"correct behavior" for early implementations.
---
## Decision
### D1. Verification is contract-based
Verification MUST be derived from:
- SPEC requirements,
- accepted ADRs.
Tests MUST validate architectural contracts, not incidental implementation details.
---
### D2. Phase 1 verification scope
Phase 1 verification focuses on:
- message contract validity (ADR-0012),
- routing and fan-out semantics at the IO_CPU boundary (ADR-0009),
- PA-first memory addressing and shard tagging (ADR-0011),
- core latency and trace invariants (SPEC 0.1, R2).
Microarchitectural accuracy, bandwidth contention, and cycle-level behavior
are explicitly out of scope in Phase 1.
---
### D3. Required Phase 1 verification cases
The following verification cases MUST be supported by the implementation:
#### V1. Message schema validation
- KernelLaunch requests missing `(sip, cube, pe)` in any tensor shard MUST be rejected.
- MemoryWrite/MemoryRead requests missing destination/source placement tags MUST be rejected.
- Completion results MUST follow the `ok / error_code / error_message` contract.
#### V2. IO_CPU fan-out and aggregation
Given:
- a topology with one SIP, one CUBE, and two PEs,
- a KernelLaunch request containing two tensor shards targeting different PEs,
The system MUST:
- submit a single KernelLaunch to IO_CPU,
- fan-out work internally to both PEs,
- aggregate completion and return a single deterministic completion to the host.
#### V3. Latency and trace invariants
For any valid request:
- the hop-by-hop trace MUST be non-empty,
- total latency MUST be greater than zero,
- repeated runs with identical inputs MUST produce identical traces.
#### V4. Topology independence and cross-domain coverage
Verification cases MUST pass for multiple topology shapes, including:
- minimal: (1 SIP, 1 CUBE, 1 PE)
- multi-PE: (1 SIP, 1 CUBE, N PEs)
- multi-CUBE within a SIP: (1 SIP, M CUBEs, ≥1 PE per CUBE)
- multi-SIP tray: (K SIPs, ≥1 CUBE per SIP, ≥1 PE per CUBE)
For multi-CUBE and multi-SIP topologies, Phase 1 verification focuses on:
- explicit connectivity (required links exist),
- deterministic routing and control-path traversal,
- non-empty traces and latency > 0 for representative cross-domain requests
(inter-CUBE and inter-SIP paths).
Tests MUST NOT hardcode topology sizes, node ids, or link counts.
Instead, tests MUST derive expectations from the compiled topology metadata.
---
### D4. Phase 1 artifacts
Phase 1 MAY include:
- verification-only test code,
- topology fixtures,
- trace inspection utilities.
Phase 1 MUST NOT require:
- production code changes solely to satisfy tests,
- weakening or removing tests to allow progress.
---
### D5. Phase 2 enforcement
Phase 2 (Apply) MUST:
- run the Phase 1 verification cases,
- rollback all changes if any verification fails,
- preserve tests as authoritative contracts.
---
## Consequences
- Architectural correctness is enforced early.
- Tests serve as executable documentation of system behavior.
- Implementation remains flexible without losing rigor.
---
## Links
- SPEC 0.1, R2, R6
- ADR-0011 (PA-first memory addressing)
- ADR-0012 (Host ↔ IO_CPU message schema)
- ADR-0009 (Kernel execution semantics)
@@ -0,0 +1,364 @@
# ADR-0014: PE Internal Execution Model (PE_CPU, PE_SCHEDULER, and Composite Commands)
## Status
Proposed
## Context
ADR-0003 (system hierarchy) and ADR-0009 (kernel execution semantics) reference PE internals but do not define:
- the dispatch model inside a PE,
- the responsibilities of PE_SCHEDULER,
- the PE_TCM-centric dataflow contract used by accelerator engines.
We need a deterministic and debuggable PE-internal execution contract that supports:
- simple single-engine commands
- composite commands that build a tiled pipeline across DMA and accelerator engines
The simulator must produce deterministic traces and allow modeling of PE-internal pipelining without introducing nondeterministic engine scheduling.
## Decision
### D1. PE internal component roles
Each PE contains the following logical components.
**PE_CPU**
- Executes kernel instruction stream or kernel control logic.
- Generates PE commands.
- Submits commands to PE_SCHEDULER.
- PE_CPU does NOT enqueue work directly into engine queues.
**PE_SCHEDULER**
- The sole dispatcher inside a PE.
- Receives commands from PE_CPU.
- Expands composite commands into sub-commands.
- Tracks dependencies and command state.
- Dispatches work to engine queues.
- Manages tile scheduling for composite commands.
**PE_DMA**
- Handles memory transfers between PE_TCM and external memory domains.
- PE_DMA has **dual egress** at the CUBE level:
- **→ XBAR**: dedicated path to HBM (local and cross-half via bridge)
- **→ NOC**: path to non-HBM destinations (shared SRAM, inter-cube UCIe, etc.)
- Supported directions include:
- HBM → PE_TCM (via XBAR)
- PE_TCM → HBM (via XBAR)
- shared SRAM → PE_TCM (via NOC)
- PE_TCM → shared SRAM (via NOC)
- PE_TCM → other memory domains (via NOC, if supported by topology)
**PE_GEMM**
- Matrix multiplication engine.
- Reads activations from PE_TCM.
- May stream weights directly from HBM.
**PE_MATH**
- Element-wise computation engine.
- Reads and writes PE_TCM.
**PE_TCM**
- Local SRAM used as the staging memory for accelerator operations.
---
### D2. Command lifecycle and queues
PE_SCHEDULER maintains three logical structures.
**SubmissionQueue**
- Written by PE_CPU.
- Contains incoming PE commands waiting to be processed.
**InflightTable**
- Owned and mutated only by PE_SCHEDULER.
- Tracks:
- expanded sub-commands
- dependency state
- engine assignment
- completion status
**CompletionQueue**
- Written by PE_SCHEDULER.
- Contains final completion records for commands.
**Single-writer rule**
- Only PE_SCHEDULER is allowed to mutate command completion state.
- Engine components must report completion via explicit completion events/messages.
**Command completion**
A command becomes DONE when:
- all sub-commands complete, and
- PE_SCHEDULER publishes a completion record to CompletionQueue.
---
### D3. Dispatch modes
PE commands are divided into two categories.
#### D3.1 Simple command
A simple command expands to exactly one engine sub-command.
Examples include:
- DMA transfer
- GEMM compute
- MATH compute
Execution flow:
```
PE_CPU → SubmissionQueue → PE_SCHEDULER → engine queue → engine execution → completion event → PE_SCHEDULER → CompletionQueue
```
#### D3.2 Composite command (tiled pipeline)
Composite commands implement tiled pipelined execution across engines.
Each tile executes the following pipeline:
```
Input DMA (READ)
→ Compute (GEMM or MATH)
→ Output DMA (WRITE)
```
**Tiling rule**
If the DMA payload exceeds hardware tile size, PE_SCHEDULER splits the transfer into tiles.
Each tile is assigned a monotonically increasing `tile_id`.
**Tile dependency rules**
For tile `t`:
- Compute must wait for input DMA: `DMA_READ(t) → COMPUTE(t)`
- Output DMA must wait for compute: `COMPUTE(t) → DMA_WRITE(t)`
- All dependencies are enforced by PE_SCHEDULER.
**Overlap policy (Phase 0 default)**
Operations for different tiles may overlap when engine resources permit.
Allowed overlaps:
```
DMA_READ(t+1) ∥ COMPUTE(t)
DMA_WRITE(t-1) ∥ COMPUTE(t)
DMA_READ(t+1) ∥ DMA_WRITE(t)
```
Disallowed overlaps:
```
GEMM(t) ∥ GEMM(t)
MATH(t) ∥ MATH(t)
GEMM(t) ∥ MATH(t)
```
---
### D4. Engine execution model (Phase 0 default)
Each engine behaves as a deterministic service resource.
**DMA engine**
PE_DMA contains two independent channels.
```
DMA_READ capacity = 1
DMA_WRITE capacity = 1
```
Rules:
- DMA_READ and DMA_WRITE may execute concurrently.
- Multiple READs cannot overlap.
- Multiple WRITEs cannot overlap.
Example allowed:
```
DMA_READ(t+1) ∥ DMA_WRITE(t)
```
Example not allowed:
```
DMA_READ(t) ∥ DMA_READ(t+1)
DMA_WRITE(t) ∥ DMA_WRITE(t+1)
```
**Compute engine**
Compute operations share a single compute resource.
```
PE_ACCEL capacity = 1
```
Both GEMM and MATH require this shared compute slot.
Consequences:
- GEMM ∥ GEMM not allowed
- MATH ∥ MATH not allowed
- GEMM ∥ MATH not allowed
Only one compute operation can run in a PE at a time.
**Compute opcode restriction**
Composite commands contain one compute opcode only.
Examples:
```
COMPOSITE_GEMM
COMPOSITE_MATH
```
Mixed compute pipelines such as `GEMM → MATH` are not supported in Phase 0.
**Engine completion signaling**
Every engine emits a completion event when a sub-command finishes.
Completion events are delivered to PE_SCHEDULER.
---
### D5. Dataflow model
Compute operations use a TCM-centric dataflow model.
**Input path (HBM)**
```
HBM → XBAR → PE_DMA (DMA_READ) → PE_TCM
```
**Input path (shared SRAM)**
```
Shared SRAM → NOC → PE_DMA (DMA_READ) → PE_TCM
```
**Compute stage**
Compute engines read input tensors from PE_TCM.
```
PE_TCM → GEMM / MATH
```
Weights for GEMM may optionally stream directly from HBM (via XBAR).
**Output path (HBM)**
Compute results are written to PE_TCM, then DMA writes to HBM.
```
PE_TCM → PE_DMA (DMA_WRITE) → XBAR → HBM
```
**Output path (shared SRAM)**
```
PE_TCM → PE_DMA (DMA_WRITE) → NOC → Shared SRAM
```
#### D5.1 PE_TCM partitioning and ownership boundary
The PE_TCM address space is partitioned into two logical regions.
**SchedulerReservedTCM**
- A staging region owned exclusively by PE_SCHEDULER.
- This region is used for composite command tile buffers.
- PE_SCHEDULER:
- partitions this region into tile buffers
- assigns buffers for DMA_READ, COMPUTE, and DMA_WRITE stages
- guarantees input/output buffer separation
- manages tile buffer lifetime
**AllocatableTCM**
- General-purpose region managed by PEMemAllocator.
- Used by host or DP-visible allocations.
**Visibility rule (hard isolation)**
- PEMemAllocator must not see or allocate memory inside SchedulerReservedTCM.
- SchedulerReservedTCM is excluded from allocator-managed ranges by construction.
- This prevents DP or host allocations from interfering with scheduler staging buffers.
**Tile buffer rules**
Within SchedulerReservedTCM:
- input buffers and output buffers must not overlap
- PE_SCHEDULER assigns tile buffers for DMA and compute stages
- tile buffers remain valid until the corresponding DMA_WRITE completes
- Buffer reuse is allowed only after the tile lifetime finishes.
---
### D6. Observability and trace contract
The simulator must emit deterministic trace events.
Required events include:
- `command_submitted`
- `sub_command_dispatched`
- `engine_start`
- `engine_complete`
- `tile_ready`
- `command_complete`
Trace ordering must be deterministic for identical inputs.
---
### D7. Topology representation
PE internal components are declared in `cube.pe_template`.
The template is instantiated once per PE.
PE instances are derived from `cube.pe_layout`.
External connectivity such as:
- PE_DMA → XBAR (HBM data path)
- PE_DMA → NOC (non-HBM data path: shared SRAM, inter-cube UCIe)
- NOC → PE_CPU (command path from M_CPU)
is modeled at the CUBE level (see ADR-0003 D3).
---
## Links
- SPEC R3, R4
- ADR-0003 D4 (PE-level system hierarchy)
- ADR-0005 View C (PE-level diagram)
- ADR-0008 D2 (PA-level allocation at PE scope; PEMemAllocator is the per-PE allocator instance)
- ADR-0009 D3 (kernel execution fan-out and PE_CPU dispatch)
@@ -0,0 +1,178 @@
# ADR-0015: Component Port/Wire Model and Fabric Routing
## Status
Proposed
## Context
ADR-0007 D2 assigns path-walking and low-level request decomposition to the simulation engine.
In practice, the engine iterates the topology path and calls `run()` on each component
sequentially — conflating routing policy with component behavior and preventing realistic
hardware modeling (queues, contention, fan-out).
ADR-0007 D3 already states that components own fan-out and aggregation, but the current
implementation does not enforce this for fabric traversal.
This ADR defines:
- how components communicate via typed port queues,
- how propagation delay is modeled (wire processes),
- the fabric path for Memory R/W through M_CPU.DMA,
- the reduced role of the simulation engine,
- M_CPU.DMA as an internal subcomponent of M_CPU.
---
## Decision
### D1. Component port model
Each component has typed input/output ports modeled as SimPy Stores:
```
in_ports: dict[str, simpy.Store] # keyed by source node_id
out_ports: dict[str, simpy.Store] # keyed by destination node_id
```
Ports are created at engine initialization based on graph edges.
Each directed edge (src → dst) results in:
- `src.out_ports[dst]` — the sending end
- `dst.in_ports[src]` — the receiving end
---
### D2. Wire process (propagation delay)
For each directed edge (src, dst) in the topology graph, a SimPy wire process
models propagation delay:
```python
def wire_process(env, out_port, in_port, delay_ns):
while True:
cmd = yield out_port.get()
yield env.timeout(delay_ns)
yield in_port.put(cmd)
```
Wire processes are started at engine initialization.
BW constraints are enforced by the sending component's out_port capacity or token model,
not by the wire process itself.
---
### D3. Engine role (reduced)
The simulation engine MUST:
- wire components at initialization (create port Stores, start wire processes),
- identify the entry component for each request type (PCIE_EP),
- put the request into the entry component's in_port,
- wait for a completion event.
The simulation engine MUST NOT:
- walk the topology path during request execution,
- call component `run()` methods directly,
- track per-hop latency or decompose fan-out.
This supersedes ADR-0007 D2's "decompose operations into low-level requests" clause.
ADR-0007 D2 must be amended accordingly.
---
### D4. Unified fabric path for Memory R/W and Kernel Launch
Both Memory R/W and Kernel Launch use the same fabric path to reach the target cube's M_CPU.
The difference is what M_CPU does upon receiving the request.
**Forward path (IO_CPU → target M_CPU):**
```
IO_CPU
→ [transit cubes: ucie_out → wire → ucie_in → noc → ucie_out] (zero or more)
→ target cube: ucie_in → noc → M_CPU
```
**At M_CPU (diverges by operation type):**
```
Memory R/W: M_CPU → M_CPU.DMA → noc → hbm_ctrl
Kernel Launch: M_CPU → PE[0..n] (parallel fan-out)
```
**Completion path (reverse, same fabric):**
```
Memory R/W: hbm_ctrl → noc → M_CPU.DMA → M_CPU
Kernel Launch: PE[0..n] all complete → M_CPU (aggregation)
M_CPU → [transit cubes: ucie → noc → ucie] → IO_CPU → runtime_api
```
---
### D5. M_CPU.DMA is an internal subcomponent of M_CPU
M_CPU.DMA is NOT a separate topology node.
It is an internal subcomponent owned by the M_CPU component implementation.
M_CPU.DMA:
- owns the DMA READ and DMA WRITE queues (capacity=1 each, per ADR-0014 D4),
- issues memory requests over the NOC to hbm_ctrl,
- receives completion from hbm_ctrl via the NOC,
- reports completion to M_CPU,
- is created and managed inside M_CPU's `__init__` and `run()`.
M_CPU.DMA does not appear as a node in the compiled topology graph.
---
### D6. Transit cube forwarding
A cube that is not the target of a memory or kernel request acts as a transit node.
Transit cubes forward requests without consuming them:
```
ucie_in (from upstream) → noc → ucie_out (to downstream)
```
The transit forwarding decision (target vs. transit) is made entirely within the ucie_in component.
The noc and ucie_out components in a transit cube forward the packet without modification.
---
### D7. _formula_latency is preserved as a lower-bound cross-check
The path-based formula latency function (`_formula_latency`) is preserved in the engine
as a lower bound for correctness verification.
Invariant:
- Phase 0: `_formula_latency == component model total_ns`
- Phase 1+: `_formula_latency <= component model total_ns` (contention adds queueing)
This function is independent of the port/wire model and requires only the topology graph.
It is used for shard comparison in `_route_kernel` and as a regression guard.
---
## Consequences
- Components model realistic hardware behavior (queues, contention, fan-out).
- Propagation delay is modeled accurately per edge.
- Engine is decoupled from routing policy.
- Component implementations remain swappable via DI (ADR-0007 D3).
- ADR-0007 D2 must be amended to remove path-walking from engine responsibilities.
- ADR-0009 D3 should be updated to reference the unified fabric path (D4 above).
---
## Links
- ADR-0007 D2 (to be amended: engine path-walking clause)
- ADR-0009 D3 (kernel execution fan-out; fabric path to be referenced)
- ADR-0014 D4 (DMA engine capacity=1)
- ADR-0012 D1 (host ↔ IO_CPU message schema; M_CPU.DMA is component-internal)
+363
View File
@@ -0,0 +1,363 @@
# 실무 DI 패턴: kernbench 구현으로 배우는 Dependency Injection
---
## 슬라이드 1 — 오늘 이야기할 것
**질문:** 코드를 어떻게 설계해야 테스트하기 쉽고, 갈아끼우기 쉬울까?
**답:** Dependency Injection (DI)
오늘은 이론이 아니라 **실제로 돌아가는 시뮬레이터 코드**를 보면서 배웁니다.
```
kernbench
└── AI 가속기 하드웨어를 Python으로 시뮬레이션하는 프레임워크
- 수십 개의 하드웨어 컴포넌트 (NOC, HBM, PE, CPU...)
- 각 컴포넌트는 런타임에 교체 가능
- 테스트에서 Mock 컴포넌트로 즉시 대체 가능
```
---
## 슬라이드 2 — DI가 없으면 어떤 일이 생기나
```python
# ❌ DI 없는 코드
class IoCpuComponent:
def run(self, env, nbytes):
router = PathRouter() # 직접 생성 — 교체 불가
hbm = HbmCtrlComponent() # 직접 생성 — 교체 불가
yield env.timeout(10.0)
```
**문제:**
- 테스트할 때 실제 `PathRouter`와 `HbmCtrl`이 항상 따라온다
- 컴포넌트를 Mock으로 바꾸려면 **소스 코드를 수정**해야 한다
- 다른 topology(다른 라우팅 전략)를 쓰고 싶으면 **또 수정**
> 클래스가 자기 의존성을 스스로 만들면, 그 클래스는 의존성과 결합된다
---
## 슬라이드 3 — DI의 핵심 원칙
**의존성은 밖에서 만들어서 안으로 넣어준다**
```
┌────────────────────────────┐
│ 조립자 (Assembler) │ ← 누가 무엇을 쓸지 결정
│ GraphEngine.__init__ │
└────────────┬───────────────┘
│ ctx 주입
┌────────────────────────────┐
│ 컴포넌트 (Component) │ ← 어떻게 동작하는지만 알면 됨
│ IoCpuComponent │
│ self.ctx.router.find_path(...) ← 그냥 사용
└────────────────────────────┘
```
**세 가지 역할 분리:**
1. **Interface** — 무엇을 할 수 있는가 (`ComponentBase`)
2. **Implementation** — 어떻게 하는가 (`IoCpuComponent`, `HbmCtrlComponent`, ...)
3. **Assembler** — 무엇을 연결할 것인가 (`GraphEngine`)
---
## 슬라이드 4 — 패턴 1: Constructor Injection
> 생성자로 의존성을 받는다
```python
# kernbench/components/base.py
class ComponentBase(ABC):
def __init__(self, node: Node, ctx: ComponentContext | None = None):
self.node = node
self.ctx = ctx # 외부에서 주입받은 의존성
self.in_ports: dict[str, simpy.Store] = {}
self.out_ports: dict[str, simpy.Store] = {}
```
```python
# 사용 측 — ctx를 직접 만들지 않는다
class IoCpuComponent(ComponentBase):
def _dispatch(self, env, txn):
path = self.ctx.router.find_node_path(...) # ctx는 이미 들어와 있음
yield self.out_ports[next_hop].put(...)
```
**언제 쓰나:**
- 컴포넌트가 살아있는 동안 의존성이 바뀌지 않을 때
- 의존성 없이는 컴포넌트가 동작하지 않을 때 (필수 의존성)
---
## 슬라이드 5 — Context Object 패턴
> 의존성이 많아지면 묶어서 하나로
```python
# kernbench/components/context.py
@dataclass
class ComponentContext:
router: PathRouter # 라우팅 정책
resolver: AddressResolver # 주소 해석
positions: dict[str, ...] # 물리적 위치 정보
ns_per_mm: float # 전파 지연 상수
edge_map: dict[...] # 엣지 정보
spec: dict # 토폴로지 스펙
```
**왜 Context로 묶나?**
- 생성자 인자가 6개면 → 컴포넌트 추가할 때마다 시그니처 변경
- Context 하나면 → 새 필드 추가해도 기존 컴포넌트 무영향
- 컴포넌트는 **필요한 것만 꺼내 쓴다**
```python
class TwoDMeshNocComponent(ComponentBase):
def _route(self, env, txn):
src_pos = self.ctx.positions.get(prev_hop) # 위치만 사용
ns_per_mm = self.ctx.ns_per_mm # 상수만 사용
# router, resolver 등은 건드리지 않음
```
---
## 슬라이드 6 — 패턴 2: Registry + Factory
> 문자열 키 → 클래스 매핑으로 런타임 교체
```python
# kernbench/components/base.py
class ComponentRegistry:
_registry: dict[str, type[ComponentBase]] = {}
@classmethod
def register(cls, impl: str, component_cls: type[ComponentBase]):
cls._registry[impl] = component_cls
@classmethod
def create(cls, node, overrides=None, ctx=None) -> ComponentBase:
if overrides and node.impl in overrides:
return overrides[node.impl](node, ctx) # 1순위: 호출자 override
if node.impl in cls._registry:
return cls._registry[node.impl](node, ctx) # 2순위: 등록된 구현
return DefaultComponent(node, ctx) # 3순위: 기본값 fallback
```
**Resolution 우선순위:**
```
overrides[impl] ← 테스트/실험용 주입
↓ (없으면)
_registry[impl] ← 프로덕션 구현
↓ (없으면)
DefaultComponent ← 안전한 fallback
```
---
## 슬라이드 7 — Registry 등록 방식
```python
# kernbench/components/impls/__init__.py
from kernbench.components.base import ComponentRegistry
from kernbench.components.impls.noc import TwoDMeshNocComponent
from kernbench.components.impls.io_cpu import IoCpuComponent
# ...
ComponentRegistry.register("noc_2d_mesh_v1", TwoDMeshNocComponent)
ComponentRegistry.register("io_cpu_v1", IoCpuComponent)
ComponentRegistry.register("hbm_ctrl_v1", HbmCtrlComponent)
# ...
```
**topology.yaml (설정 파일)**
```yaml
nodes:
- id: sip0.cube0.noc
impl: noc_2d_mesh_v1 # ← 이 문자열이 Registry 키
```
**흐름:**
```
YAML → impl 문자열 → Registry.create() → 실제 컴포넌트 인스턴스
```
impl 문자열만 바꾸면 동작이 바뀐다. 코드 수정 없음.
---
## 슬라이드 8 — 패턴 3: Override Injection (테스트용)
> 호출자가 특정 impl만 갈아끼운다
```python
# tests/test_component_registry.py
class SpyXbar(ComponentBase):
calls = 0
def run(self, env, nbytes):
SpyXbar.calls += 1
yield env.timeout(0)
# 테스트에서 xbar_v1만 SpyXbar로 교체
engine = GraphEngine(
graph,
component_overrides={"xbar_v1": SpyXbar} # ← 이것만 추가
)
result = engine.run(msg)
assert SpyXbar.calls > 0 # Xbar가 실제로 호출됐는지 검증
```
**핵심:** 테스트 코드가 프로덕션 코드를 **수정하지 않는다**
---
## 슬라이드 9 — 조립자: GraphEngine
> 컴포넌트를 생성하고 연결하는 유일한 곳
```python
# kernbench/sim_engine/engine.py
class GraphEngine:
def __init__(self, graph, component_overrides=None):
# 1. 공유 의존성 생성
ctx = ComponentContext(
router=PathRouter(graph),
resolver=AddressResolver(graph),
positions={nid: n.pos_mm for nid, n in graph.nodes.items()},
ns_per_mm=...,
)
# 2. 컴포넌트 생성 (DI: ctx 주입)
self._components = {
node_id: ComponentRegistry.create(node, overrides, ctx)
for node_id, node in graph.nodes.items()
}
# 3. 포트 연결 (배선)
for e in graph.edges:
store = simpy.Store(self._env)
self._components[e.src].out_ports[e.dst] = store
self._components[e.dst].in_ports[e.src] = store
```
**생성 → 주입 → 연결** — 이 세 단계가 한 곳에서만 일어난다
---
## 슬라이드 10 — 전체 구조 한눈에 보기
```
topology.yaml
│ impl: "noc_2d_mesh_v1"
GraphEngine.__init__() ← 조립자
├── ComponentContext 생성 ← 공유 의존성 묶음
│ ├── PathRouter
│ ├── AddressResolver
│ └── positions, ns_per_mm, ...
├── ComponentRegistry.create(node, overrides, ctx)
│ ├── overrides["noc_2d_mesh_v1"]? → SpyNoc (테스트)
│ ├── registry["noc_2d_mesh_v1"]? → TwoDMeshNocComponent (프로덕션)
│ └── fallback → DefaultComponent
└── 포트 배선: out_ports / in_ports 연결
Component (TwoDMeshNocComponent)
└── self.ctx.positions, self.ctx.ns_per_mm 사용
(라우터, 리졸버는 건드리지 않음 — 필요한 것만)
```
---
## 슬라이드 11 — 무엇을 얻었나
| 상황 | DI 없이 | DI 적용 시 |
|------|---------|---------|
| NOC 알고리즘 교체 | 소스 코드 수정 | YAML에서 impl 문자열 변경 |
| Xbar 동작 검증 | 실제 HW 전부 구동 | `overrides={"xbar_v1": SpyXbar}` |
| 새 컴포넌트 추가 | 기존 코드 수정 | `register("new_v1", NewComp)` |
| 컨텍스트 필드 추가 | 모든 생성자 수정 | `ComponentContext`에 필드 추가 |
| 테스트 격리 | 불가능 | 필요한 것만 override |
---
## 슬라이드 12 — 실무 적용 체크리스트
**설계할 때 물어볼 것:**
1. **이 클래스가 직접 `new`(생성)하는 것은 무엇인가?**
→ 생성하는 것 = 교체할 수 없는 것. 생성자로 받을 수 없는지 검토.
2. **의존성이 3개 이상이면?**
→ Context Object로 묶어라.
3. **테스트에서 이 클래스를 단독으로 실행할 수 있는가?**
→ 없다면 DI가 필요하다는 신호.
4. **설정(YAML/config)으로 동작을 바꾸고 싶은가?**
→ Registry + 문자열 키 패턴.
5. **누가 조립하는가?**
→ 조립자는 하나여야 한다. 컴포넌트 안에 조립 로직이 있으면 안 된다.
---
## 슬라이드 13 — 안티패턴: 이것은 하지 말자
```python
# ❌ 서비스 로케이터 (컴포넌트 안에서 registry 호출)
class BadComponent(ComponentBase):
def run(self, env, nbytes):
router = ComponentRegistry.get("router") # 컴포넌트가 직접 찾는다
...
# ❌ 전역 싱글톤 직접 참조
class BadComponent(ComponentBase):
def run(self, env, nbytes):
router = GlobalRouter.instance() # 교체 불가
...
# ❌ 생성자 안에서 의존성 생성
class BadComponent(ComponentBase):
def __init__(self, node):
self.router = PathRouter(node.graph) # 테스트에서 격리 불가
```
**공통 문제:** 컴포넌트가 자기 의존성을 스스로 해결한다 → 결합도 증가
---
## 슬라이드 14 — 요약
> **DI = 의존성의 생성과 사용을 분리하는 것**
```
생성 → Registry / Assembler (GraphEngine)
사용 → Component (IoCpuComponent, TwoDMeshNocComponent, ...)
```
**kernbench에서 배운 패턴 3가지:**
1. **Constructor Injection** — 필수 의존성은 생성자로
2. **Context Object** — 의존성 묶음을 하나의 dataclass로
3. **Registry + Override** — 문자열 키로 구현체 선택, 테스트에서 교체
**결과:** 141개 테스트, YAML 한 줄로 컴포넌트 교체, 프로덕션 코드 수정 없이 Mock 주입
---
*참고 코드: kernbench/src/kernbench/components/*
+26
View File
@@ -0,0 +1,26 @@
# Generated Diagrams
This directory contains diagrams generated from topology compilation.
## What these files are
- Derived artifacts generated from:
- compiled topology graph
- distance (accumulated latency) metadata
- view/layout rules (ADR-0005)
These files are meant for quick visual inspection and review.
## Default outputs
- SIP view: `sip_view.mmd` (and/or `sip_view.dot`)
- CUBE view: `cube_view.mmd` (and/or `cube_view.dot`)
- PE view: `pe_view.mmd` (and/or `pe_view.dot`)
## How to preview
- In VS Code:
- open `.mmd` or `.md` containing Mermaid blocks and use Markdown Preview
- for `.dot`, use a Graphviz preview extension or `dot -Tpng`
## Notes
- Diagrams are representative and distance-aware by default.
- Instance indices are not required unless debugging asymmetry.
- Outputs should be deterministic for the same topology and rules.
+156
View File
@@ -0,0 +1,156 @@
<svg xmlns="http://www.w3.org/2000/svg" width="556" height="472" viewBox="0 0 556 472">
<title>cube</title>
<rect width="556" height="472" fill="#f8fafc"/>
<text x="278" y="18" text-anchor="middle" font-family="monospace" font-size="14" font-weight="bold" fill="#1e293b">CUBE VIEW</text>
<rect x="40.0" y="40.0" width="476.0" height="392.0" rx="6" fill="none" stroke="#475569" stroke-width="2" stroke-dasharray="8,4"/>
<rect x="152.0" y="166.0" width="252.0" height="140.0" rx="4" fill="#d1fae5" stroke="#10b981" stroke-width="1.5" stroke-dasharray="6,3" opacity="0.5"/>
<text x="278.0" y="278.0" text-anchor="middle" font-family="monospace" font-size="11" fill="#047857" opacity="0.7">HBM</text>
<polyline points="82.0,82.0 82.0,95.0 82.0,95.0 82.0,138.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
<text x="82.0" y="92.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
<polyline points="82.0,82.0 82.0,144.0 334.0,144.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
<polyline points="334.0,236.0 334.0,144.0 82.0,144.0 82.0,82.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
<polyline points="166.0,82.0 166.0,95.0 166.0,95.0 166.0,138.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
<text x="166.0" y="92.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
<polyline points="166.0,82.0 166.0,154.0 334.0,154.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
<polyline points="334.0,236.0 334.0,144.0 166.0,144.0 166.0,82.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
<polyline points="390.0,82.0 390.0,95.0 390.0,95.0 390.0,138.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
<text x="390.0" y="92.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
<polyline points="390.0,82.0 390.0,164.0 334.0,164.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
<polyline points="334.0,236.0 334.0,144.0 390.0,144.0 390.0,82.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
<polyline points="474.0,82.0 474.0,95.0 474.0,95.0 474.0,138.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
<text x="474.0" y="92.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
<polyline points="474.0,82.0 474.0,174.0 334.0,174.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
<polyline points="334.0,236.0 334.0,144.0 474.0,144.0 474.0,82.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
<polyline points="82.0,390.0 82.0,347.0 82.0,347.0 82.0,334.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
<text x="82.0" y="344.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
<polyline points="82.0,390.0 82.0,338.0 334.0,338.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
<polyline points="334.0,236.0 334.0,298.0 82.0,298.0 82.0,390.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
<polyline points="166.0,390.0 166.0,347.0 166.0,347.0 166.0,334.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
<text x="166.0" y="344.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
<polyline points="166.0,390.0 166.0,348.0 334.0,348.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
<polyline points="334.0,236.0 334.0,298.0 166.0,298.0 166.0,390.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
<polyline points="390.0,390.0 390.0,347.0 390.0,347.0 390.0,334.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
<text x="390.0" y="344.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
<polyline points="390.0,390.0 390.0,358.0 334.0,358.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
<polyline points="334.0,236.0 334.0,298.0 390.0,298.0 390.0,390.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
<polyline points="474.0,390.0 474.0,347.0 474.0,347.0 474.0,334.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
<text x="474.0" y="344.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
<polyline points="474.0,390.0 474.0,368.0 334.0,368.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
<polyline points="334.0,236.0 334.0,298.0 474.0,298.0 474.0,390.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
<polyline points="82.0,138.0 222.0,138.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
<text x="152.0" y="183.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
<polyline points="166.0,138.0 222.0,138.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
<text x="194.0" y="183.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
<polyline points="390.0,138.0 222.0,138.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
<text x="306.0" y="183.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
<polyline points="474.0,138.0 222.0,138.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
<text x="348.0" y="183.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
<polyline points="82.0,334.0 222.0,334.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
<text x="152.0" y="281.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
<polyline points="166.0,334.0 222.0,334.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
<text x="194.0" y="281.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
<polyline points="390.0,334.0 222.0,334.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
<text x="306.0" y="281.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
<polyline points="474.0,334.0 222.0,334.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
<text x="348.0" y="281.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
<line x1="82.0" y1="138.0" x2="166.0" y2="138.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
<text x="124.0" y="134.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
<line x1="166.0" y1="138.0" x2="82.0" y2="138.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
<text x="124.0" y="134.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
<line x1="166.0" y1="138.0" x2="390.0" y2="138.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
<text x="278.0" y="134.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">10.0mm 128GB/s</text>
<line x1="390.0" y1="138.0" x2="166.0" y2="138.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
<text x="278.0" y="134.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">10.0mm 128GB/s</text>
<line x1="390.0" y1="138.0" x2="474.0" y2="138.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
<text x="432.0" y="134.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
<line x1="474.0" y1="138.0" x2="390.0" y2="138.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
<text x="432.0" y="134.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
<line x1="82.0" y1="334.0" x2="166.0" y2="334.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
<text x="124.0" y="330.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
<line x1="166.0" y1="334.0" x2="82.0" y2="334.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
<text x="124.0" y="330.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
<line x1="166.0" y1="334.0" x2="390.0" y2="334.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
<text x="278.0" y="330.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">10.0mm 128GB/s</text>
<line x1="390.0" y1="334.0" x2="166.0" y2="334.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
<text x="278.0" y="330.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">10.0mm 128GB/s</text>
<line x1="390.0" y1="334.0" x2="474.0" y2="334.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
<text x="432.0" y="330.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
<line x1="474.0" y1="334.0" x2="390.0" y2="334.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
<text x="432.0" y="330.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
<polyline points="82.0,138.0 110.0,138.0 110.0,292.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
<text x="96.0" y="211.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
<polyline points="110.0,292.0 82.0,292.0 82.0,138.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
<text x="96.0" y="211.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
<polyline points="82.0,334.0 110.0,334.0 110.0,292.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
<text x="96.0" y="309.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
<polyline points="110.0,292.0 82.0,292.0 82.0,334.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
<text x="96.0" y="309.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
<polyline points="474.0,138.0 446.0,138.0 446.0,292.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
<text x="460.0" y="211.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
<polyline points="446.0,292.0 474.0,292.0 474.0,138.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
<text x="460.0" y="211.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
<polyline points="474.0,334.0 446.0,334.0 446.0,292.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
<text x="460.0" y="309.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
<polyline points="446.0,292.0 474.0,292.0 474.0,334.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
<text x="460.0" y="309.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
<polyline points="334.0,236.0 334.0,131.4 278.0,131.4 278.0,56.8" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.6"/>
<polyline points="334.0,236.0 334.0,310.6 278.0,310.6 278.0,415.2" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.6"/>
<polyline points="334.0,236.0 334.0,221.0 488.0,221.0 488.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.6"/>
<polyline points="334.0,236.0 334.0,221.0 68.0,221.0 68.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.6"/>
<polyline points="446.0,194.0 446.0,200.0 334.0,200.0 334.0,236.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
<polyline points="334.0,236.0 334.0,200.0 446.0,200.0 446.0,194.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
<polyline points="334.0,236.0 110.0,236.0 110.0,194.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.8"/>
<polyline points="110.0,194.0 334.0,194.0 334.0,236.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.8"/>
<rect x="250.0" y="40.0" width="56.0" height="33.6" rx="4" fill="#3b82f6" stroke="#475569" stroke-width="1"/>
<text x="278.0" y="60.8" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">UCIe-N</text>
<rect x="250.0" y="398.4" width="56.0" height="33.6" rx="4" fill="#3b82f6" stroke="#475569" stroke-width="1"/>
<text x="278.0" y="419.2" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">UCIe-S</text>
<rect x="460.0" y="219.2" width="56.0" height="33.6" rx="4" fill="#3b82f6" stroke="#475569" stroke-width="1"/>
<text x="488.0" y="240.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">UCIe-E</text>
<rect x="40.0" y="219.2" width="56.0" height="33.6" rx="4" fill="#3b82f6" stroke="#475569" stroke-width="1"/>
<text x="68.0" y="240.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">UCIe-W</text>
<rect x="306.0" y="219.2" width="56.0" height="33.6" rx="4" fill="#a78bfa" stroke="#475569" stroke-width="1"/>
<text x="334.0" y="240.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">NOC</text>
<rect x="418.0" y="177.2" width="56.0" height="33.6" rx="4" fill="#f59e0b" stroke="#475569" stroke-width="1"/>
<text x="446.0" y="198.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">M CPU</text>
<rect x="194.0" y="219.2" width="56.0" height="33.6" rx="4" fill="#10b981" stroke="#475569" stroke-width="1"/>
<text x="222.0" y="240.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#ffffff">HBM CTRL</text>
<rect x="82.0" y="177.2" width="56.0" height="33.6" rx="4" fill="#f59e0b" stroke="#475569" stroke-width="1"/>
<text x="110.0" y="198.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">SRAM</text>
<rect x="82.0" y="275.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
<text x="110.0" y="296.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">Bridge LEFT</text>
<rect x="418.0" y="275.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
<text x="446.0" y="296.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">Bridge RIGHT</text>
<rect x="56.8" y="68.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
<text x="82.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE0</text>
<rect x="54.0" y="121.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
<text x="82.0" y="142.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE0</text>
<rect x="140.8" y="68.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
<text x="166.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE1</text>
<rect x="138.0" y="121.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
<text x="166.0" y="142.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE1</text>
<rect x="364.8" y="68.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
<text x="390.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE2</text>
<rect x="362.0" y="121.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
<text x="390.0" y="142.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE2</text>
<rect x="448.8" y="68.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
<text x="474.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE3</text>
<rect x="446.0" y="121.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
<text x="474.0" y="142.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE3</text>
<rect x="56.8" y="376.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
<text x="82.0" y="394.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE4</text>
<rect x="54.0" y="317.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
<text x="82.0" y="338.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE4</text>
<rect x="140.8" y="376.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
<text x="166.0" y="394.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE5</text>
<rect x="138.0" y="317.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
<text x="166.0" y="338.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE5</text>
<rect x="364.8" y="376.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
<text x="390.0" y="394.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE6</text>
<rect x="362.0" y="317.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
<text x="390.0" y="338.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE6</text>
<rect x="448.8" y="376.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
<text x="474.0" y="394.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE7</text>
<rect x="446.0" y="317.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
<text x="474.0" y="338.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE7</text>
</svg>

After

Width:  |  Height:  |  Size: 18 KiB

+31
View File
@@ -0,0 +1,31 @@
<svg xmlns="http://www.w3.org/2000/svg" width="500" height="360" viewBox="0 0 500 360">
<title>pe</title>
<rect width="500" height="360" fill="#f8fafc"/>
<text x="250" y="18" text-anchor="middle" font-family="monospace" font-size="14" font-weight="bold" fill="#1e293b">PE VIEW</text>
<line x1="92.5" y1="180.0" x2="180.0" y2="180.0" stroke="#94a3b8" stroke-width="1.5" opacity="0.8"/>
<text x="136.2" y="176.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">0.5mm</text>
<polyline points="180.0,180.0 180.0,92.5 285.0,92.5" fill="none" stroke="#94a3b8" stroke-width="1.5" opacity="0.8"/>
<text x="232.5" y="132.2" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">0.5mm</text>
<line x1="180.0" y1="180.0" x2="285.0" y2="180.0" stroke="#94a3b8" stroke-width="1.5" opacity="0.8"/>
<text x="232.5" y="176.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">0.5mm</text>
<polyline points="180.0,180.0 180.0,267.5 285.0,267.5" fill="none" stroke="#94a3b8" stroke-width="1.5" opacity="0.8"/>
<text x="232.5" y="219.8" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">0.5mm</text>
<polyline points="285.0,92.5 390.0,92.5 390.0,180.0" fill="none" stroke="#94a3b8" stroke-width="1.5" opacity="0.8"/>
<text x="337.5" y="132.2" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">0.5mm 512GB/s</text>
<line x1="285.0" y1="180.0" x2="390.0" y2="180.0" stroke="#94a3b8" stroke-width="1.5" opacity="0.8"/>
<text x="337.5" y="176.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">0.5mm 512GB/s</text>
<polyline points="285.0,267.5 390.0,267.5 390.0,180.0" fill="none" stroke="#94a3b8" stroke-width="1.5" opacity="0.8"/>
<text x="337.5" y="219.8" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">0.5mm 512GB/s</text>
<rect x="48.8" y="155.5" width="87.5" height="49.0" rx="4" fill="#ef4444" stroke="#475569" stroke-width="1"/>
<text x="92.5" y="184.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">PE CPU</text>
<rect x="136.2" y="155.5" width="87.5" height="49.0" rx="4" fill="#f59e0b" stroke="#475569" stroke-width="1"/>
<text x="180.0" y="184.0" text-anchor="middle" font-family="monospace" font-size="9" fill="#1e293b">PE SCHEDULER</text>
<rect x="241.2" y="68.0" width="87.5" height="49.0" rx="4" fill="#3b82f6" stroke="#475569" stroke-width="1"/>
<text x="285.0" y="96.5" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">PE DMA</text>
<rect x="241.2" y="155.5" width="87.5" height="49.0" rx="4" fill="#8b5cf6" stroke="#475569" stroke-width="1"/>
<text x="285.0" y="184.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">PE GEMM</text>
<rect x="241.2" y="243.0" width="87.5" height="49.0" rx="4" fill="#ec4899" stroke="#475569" stroke-width="1"/>
<text x="285.0" y="271.5" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">PE MATH</text>
<rect x="346.2" y="155.5" width="87.5" height="49.0" rx="4" fill="#10b981" stroke="#475569" stroke-width="1"/>
<text x="390.0" y="184.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">PE TCM</text>
</svg>

After

Width:  |  Height:  |  Size: 3.2 KiB

+72
View File
@@ -0,0 +1,72 @@
<svg xmlns="http://www.w3.org/2000/svg" width="820" height="500" viewBox="0 0 820 500" font-family="monospace">
<rect width="820" height="500" fill="#f8fafc" rx="6"/>
<text x="410" y="32" text-anchor="middle" font-size="16" font-weight="bold" fill="#1e293b">Placement: column_wise</text>
<text x="410.0" y="54.0" text-anchor="middle" font-size="12" fill="#475569" font-weight="normal">Tensor (1024×512) fp16 → K axis split into 8 parts</text>
<text x="320.0" y="82.0" text-anchor="middle" font-size="11" fill="#475569" font-weight="normal">← K=512 →</text>
<text x="68.0" y="250.0" text-anchor="middle" font-size="11" fill="#475569" transform="rotate(-90 68.0 250.0)">↑ M=1024 ↓</text>
<rect x="80.0" y="90.0" width="60.0" height="320.0" fill="#3b82f6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="110.0" y="246.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE0</text>
<text x="110.0" y="262.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(1024×64)</text>
<rect x="140.0" y="90.0" width="60.0" height="320.0" fill="#10b981" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="170.0" y="246.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE1</text>
<text x="170.0" y="262.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(1024×64)</text>
<rect x="200.0" y="90.0" width="60.0" height="320.0" fill="#f59e0b" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="230.0" y="246.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE2</text>
<text x="230.0" y="262.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">(1024×64)</text>
<rect x="260.0" y="90.0" width="60.0" height="320.0" fill="#ef4444" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="290.0" y="246.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE3</text>
<text x="290.0" y="262.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(1024×64)</text>
<rect x="320.0" y="90.0" width="60.0" height="320.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="350.0" y="246.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE4</text>
<text x="350.0" y="262.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(1024×64)</text>
<rect x="380.0" y="90.0" width="60.0" height="320.0" fill="#ec4899" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="410.0" y="246.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE5</text>
<text x="410.0" y="262.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(1024×64)</text>
<rect x="440.0" y="90.0" width="60.0" height="320.0" fill="#06b6d4" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="470.0" y="246.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE6</text>
<text x="470.0" y="262.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">(1024×64)</text>
<rect x="500.0" y="90.0" width="60.0" height="320.0" fill="#f97316" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="530.0" y="246.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE7</text>
<text x="530.0" y="262.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(1024×64)</text>
<rect x="80.0" y="90.0" width="480.0" height="320.0" fill="none" stroke="#1e293b" stroke-width="2" fill-opacity="1.0" rx="2"/>
<text x="110.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=0 B</text>
<text x="110.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
<text x="170.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=128 KB</text>
<text x="170.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
<text x="230.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=256 KB</text>
<text x="230.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
<text x="290.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=384 KB</text>
<text x="290.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
<text x="350.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=512 KB</text>
<text x="350.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
<text x="410.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=640 KB</text>
<text x="410.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
<text x="470.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=768 KB</text>
<text x="470.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
<text x="530.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=896 KB</text>
<text x="530.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
<text x="670.0" y="100.0" text-anchor="middle" font-size="12" fill="#1e293b" font-weight="bold">PE Legend</text>
<rect x="620.0" y="106.0" width="16.0" height="16.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="642.0" y="118.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE0</text>
<rect x="620.0" y="128.0" width="16.0" height="16.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="642.0" y="140.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE1</text>
<rect x="620.0" y="150.0" width="16.0" height="16.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="642.0" y="162.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE2</text>
<rect x="620.0" y="172.0" width="16.0" height="16.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="642.0" y="184.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE3</text>
<rect x="620.0" y="194.0" width="16.0" height="16.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="642.0" y="206.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE4</text>
<rect x="620.0" y="216.0" width="16.0" height="16.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="642.0" y="228.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE5</text>
<rect x="620.0" y="238.0" width="16.0" height="16.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="642.0" y="250.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE6</text>
<rect x="620.0" y="260.0" width="16.0" height="16.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="642.0" y="272.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE7</text>
<rect x="620.0" y="320.0" width="167.0" height="120.0" fill="#e2e8f0" stroke="#94a3b8" stroke-width="1" fill-opacity="1.0" rx="2"/>
<text x="630.0" y="338.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Strategy: column_wise</text>
<text x="630.0" y="356.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Split axis: K</text>
<text x="630.0" y="374.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Shards: 8</text>
<text x="630.0" y="392.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Each: (1024, 64)</text>
<text x="630.0" y="410.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Each: 128 KB</text>
<text x="630.0" y="428.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Total: 1 MB</text>
</svg>

After

Width:  |  Height:  |  Size: 8.1 KiB

+47
View File
@@ -0,0 +1,47 @@
<svg xmlns="http://www.w3.org/2000/svg" width="820" height="500" viewBox="0 0 820 500" font-family="monospace">
<rect width="820" height="500" fill="#f8fafc" rx="6"/>
<text x="410" y="32" text-anchor="middle" font-size="16" font-weight="bold" fill="#1e293b">Placement: replicate</text>
<text x="410.0" y="54.0" text-anchor="middle" font-size="12" fill="#475569" font-weight="normal">Tensor (1024×512) fp16 → full copy to each PE</text>
<rect x="60.0" y="90.0" width="163.0" height="162.0" fill="#3b82f6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="141.5" y="157.0" text-anchor="middle" font-size="14" fill="#fff" font-weight="bold">PE0</text>
<text x="141.5" y="177.0" text-anchor="middle" font-size="11" fill="#fff" font-weight="normal">(1024×512)</text>
<text x="141.5" y="193.0" text-anchor="middle" font-size="10" fill="#fff" font-weight="normal">1 MB</text>
<text x="141.5" y="207.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">offset=0</text>
<rect x="239.0" y="90.0" width="163.0" height="162.0" fill="#10b981" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="320.5" y="157.0" text-anchor="middle" font-size="14" fill="#fff" font-weight="bold">PE1</text>
<text x="320.5" y="177.0" text-anchor="middle" font-size="11" fill="#fff" font-weight="normal">(1024×512)</text>
<text x="320.5" y="193.0" text-anchor="middle" font-size="10" fill="#fff" font-weight="normal">1 MB</text>
<text x="320.5" y="207.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">offset=0</text>
<rect x="418.0" y="90.0" width="163.0" height="162.0" fill="#f59e0b" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="499.5" y="157.0" text-anchor="middle" font-size="14" fill="#000" font-weight="bold">PE2</text>
<text x="499.5" y="177.0" text-anchor="middle" font-size="11" fill="#000" font-weight="normal">(1024×512)</text>
<text x="499.5" y="193.0" text-anchor="middle" font-size="10" fill="#000" font-weight="normal">1 MB</text>
<text x="499.5" y="207.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">offset=0</text>
<rect x="597.0" y="90.0" width="163.0" height="162.0" fill="#ef4444" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="678.5" y="157.0" text-anchor="middle" font-size="14" fill="#fff" font-weight="bold">PE3</text>
<text x="678.5" y="177.0" text-anchor="middle" font-size="11" fill="#fff" font-weight="normal">(1024×512)</text>
<text x="678.5" y="193.0" text-anchor="middle" font-size="10" fill="#fff" font-weight="normal">1 MB</text>
<text x="678.5" y="207.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">offset=0</text>
<rect x="60.0" y="268.0" width="163.0" height="162.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="141.5" y="335.0" text-anchor="middle" font-size="14" fill="#fff" font-weight="bold">PE4</text>
<text x="141.5" y="355.0" text-anchor="middle" font-size="11" fill="#fff" font-weight="normal">(1024×512)</text>
<text x="141.5" y="371.0" text-anchor="middle" font-size="10" fill="#fff" font-weight="normal">1 MB</text>
<text x="141.5" y="385.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">offset=0</text>
<rect x="239.0" y="268.0" width="163.0" height="162.0" fill="#ec4899" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="320.5" y="335.0" text-anchor="middle" font-size="14" fill="#fff" font-weight="bold">PE5</text>
<text x="320.5" y="355.0" text-anchor="middle" font-size="11" fill="#fff" font-weight="normal">(1024×512)</text>
<text x="320.5" y="371.0" text-anchor="middle" font-size="10" fill="#fff" font-weight="normal">1 MB</text>
<text x="320.5" y="385.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">offset=0</text>
<rect x="418.0" y="268.0" width="163.0" height="162.0" fill="#06b6d4" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="499.5" y="335.0" text-anchor="middle" font-size="14" fill="#000" font-weight="bold">PE6</text>
<text x="499.5" y="355.0" text-anchor="middle" font-size="11" fill="#000" font-weight="normal">(1024×512)</text>
<text x="499.5" y="371.0" text-anchor="middle" font-size="10" fill="#000" font-weight="normal">1 MB</text>
<text x="499.5" y="385.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">offset=0</text>
<rect x="597.0" y="268.0" width="163.0" height="162.0" fill="#f97316" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="678.5" y="335.0" text-anchor="middle" font-size="14" fill="#fff" font-weight="bold">PE7</text>
<text x="678.5" y="355.0" text-anchor="middle" font-size="11" fill="#fff" font-weight="normal">(1024×512)</text>
<text x="678.5" y="371.0" text-anchor="middle" font-size="10" fill="#fff" font-weight="normal">1 MB</text>
<text x="678.5" y="385.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">offset=0</text>
<rect x="60.0" y="450.0" width="496.0" height="30.0" fill="#e2e8f0" stroke="#94a3b8" stroke-width="1" fill-opacity="1.0" rx="2"/>
<text x="70.0" y="468.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Strategy: replicate | Shards: 8 | Each: 1 MB | Total mem: 8 MB</text>
</svg>

After

Width:  |  Height:  |  Size: 5.2 KiB

+72
View File
@@ -0,0 +1,72 @@
<svg xmlns="http://www.w3.org/2000/svg" width="820" height="560" viewBox="0 0 820 560" font-family="monospace">
<rect width="820" height="560" fill="#f8fafc" rx="6"/>
<text x="410" y="32" text-anchor="middle" font-size="16" font-weight="bold" fill="#1e293b">Placement: row_wise</text>
<text x="410.0" y="54.0" text-anchor="middle" font-size="12" fill="#475569" font-weight="normal">Tensor (1024×512) fp16 → M axis split into 8 parts</text>
<text x="240.0" y="82.0" text-anchor="middle" font-size="11" fill="#475569" font-weight="normal">← K=512 →</text>
<text x="68.0" y="290.0" text-anchor="middle" font-size="11" fill="#475569" transform="rotate(-90 68.0 290.0)">↑ M=1024 ↓</text>
<rect x="80.0" y="90.0" width="320.0" height="50.0" fill="#3b82f6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="240.0" y="111.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE0</text>
<text x="240.0" y="127.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(128×512)</text>
<rect x="80.0" y="140.0" width="320.0" height="50.0" fill="#10b981" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="240.0" y="161.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE1</text>
<text x="240.0" y="177.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(128×512)</text>
<rect x="80.0" y="190.0" width="320.0" height="50.0" fill="#f59e0b" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="240.0" y="211.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE2</text>
<text x="240.0" y="227.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">(128×512)</text>
<rect x="80.0" y="240.0" width="320.0" height="50.0" fill="#ef4444" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="240.0" y="261.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE3</text>
<text x="240.0" y="277.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(128×512)</text>
<rect x="80.0" y="290.0" width="320.0" height="50.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="240.0" y="311.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE4</text>
<text x="240.0" y="327.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(128×512)</text>
<rect x="80.0" y="340.0" width="320.0" height="50.0" fill="#ec4899" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="240.0" y="361.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE5</text>
<text x="240.0" y="377.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(128×512)</text>
<rect x="80.0" y="390.0" width="320.0" height="50.0" fill="#06b6d4" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="240.0" y="411.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE6</text>
<text x="240.0" y="427.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">(128×512)</text>
<rect x="80.0" y="440.0" width="320.0" height="50.0" fill="#f97316" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="240.0" y="461.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE7</text>
<text x="240.0" y="477.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(128×512)</text>
<rect x="80.0" y="90.0" width="320.0" height="400.0" fill="none" stroke="#1e293b" stroke-width="2" fill-opacity="1.0" rx="2"/>
<text x="410.0" y="111.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=0 B</text>
<text x="410.0" y="125.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
<text x="410.0" y="161.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=128 KB</text>
<text x="410.0" y="175.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
<text x="410.0" y="211.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=256 KB</text>
<text x="410.0" y="225.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
<text x="410.0" y="261.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=384 KB</text>
<text x="410.0" y="275.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
<text x="410.0" y="311.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=512 KB</text>
<text x="410.0" y="325.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
<text x="410.0" y="361.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=640 KB</text>
<text x="410.0" y="375.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
<text x="410.0" y="411.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=768 KB</text>
<text x="410.0" y="425.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
<text x="410.0" y="461.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=896 KB</text>
<text x="410.0" y="475.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
<text x="630.0" y="100.0" text-anchor="middle" font-size="12" fill="#1e293b" font-weight="bold">PE Legend</text>
<rect x="580.0" y="106.0" width="16.0" height="16.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="602.0" y="118.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE0</text>
<rect x="580.0" y="128.0" width="16.0" height="16.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="602.0" y="140.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE1</text>
<rect x="580.0" y="150.0" width="16.0" height="16.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="602.0" y="162.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE2</text>
<rect x="580.0" y="172.0" width="16.0" height="16.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="602.0" y="184.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE3</text>
<rect x="580.0" y="194.0" width="16.0" height="16.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="602.0" y="206.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE4</text>
<rect x="580.0" y="216.0" width="16.0" height="16.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="602.0" y="228.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE5</text>
<rect x="580.0" y="238.0" width="16.0" height="16.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="602.0" y="250.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE6</text>
<rect x="580.0" y="260.0" width="16.0" height="16.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="602.0" y="272.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE7</text>
<rect x="580.0" y="320.0" width="146.0" height="120.0" fill="#e2e8f0" stroke="#94a3b8" stroke-width="1" fill-opacity="1.0" rx="2"/>
<text x="590.0" y="338.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Strategy: row_wise</text>
<text x="590.0" y="356.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Split axis: M</text>
<text x="590.0" y="374.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Shards: 8</text>
<text x="590.0" y="392.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Each: (128, 512)</text>
<text x="590.0" y="410.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Each: 128 KB</text>
<text x="590.0" y="428.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Total: 1 MB</text>
</svg>

After

Width:  |  Height:  |  Size: 8.1 KiB

@@ -0,0 +1,116 @@
<svg xmlns="http://www.w3.org/2000/svg" width="820" height="620" viewBox="0 0 820 620" font-family="monospace">
<rect width="820" height="620" fill="#f8fafc" rx="6"/>
<text x="410" y="32" text-anchor="middle" font-size="16" font-weight="bold" fill="#1e293b">Placement: tiled_column_major</text>
<text x="410.0" y="54.0" text-anchor="middle" font-size="11" fill="#475569" font-weight="normal">Tensor (1024×512) fp16, tile=(256×128) → 4×4=16 tiles, column-major (K first)</text>
<text x="280.0" y="82.0" text-anchor="middle" font-size="11" fill="#475569" font-weight="normal">← K=512 →</text>
<text x="68.0" y="290.0" text-anchor="middle" font-size="11" fill="#475569" transform="rotate(-90 68.0 290.0)">↑ M=1024 ↓</text>
<rect x="80.0" y="90.0" width="100.0" height="100.0" fill="#3b82f6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="130.0" y="136.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE0</text>
<text x="130.0" y="152.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t0</text>
<rect x="180.0" y="90.0" width="100.0" height="100.0" fill="#10b981" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="230.0" y="136.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE1</text>
<text x="230.0" y="152.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t1</text>
<rect x="280.0" y="90.0" width="100.0" height="100.0" fill="#f59e0b" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="330.0" y="136.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE2</text>
<text x="330.0" y="152.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t2</text>
<rect x="380.0" y="90.0" width="100.0" height="100.0" fill="#ef4444" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="430.0" y="136.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE3</text>
<text x="430.0" y="152.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t3</text>
<rect x="80.0" y="190.0" width="100.0" height="100.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="130.0" y="236.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE4</text>
<text x="130.0" y="252.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t4</text>
<rect x="180.0" y="190.0" width="100.0" height="100.0" fill="#ec4899" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="230.0" y="236.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE5</text>
<text x="230.0" y="252.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t5</text>
<rect x="280.0" y="190.0" width="100.0" height="100.0" fill="#06b6d4" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="330.0" y="236.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE6</text>
<text x="330.0" y="252.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t6</text>
<rect x="380.0" y="190.0" width="100.0" height="100.0" fill="#f97316" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="430.0" y="236.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE7</text>
<text x="430.0" y="252.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t7</text>
<rect x="80.0" y="290.0" width="100.0" height="100.0" fill="#3b82f6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="130.0" y="336.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE0</text>
<text x="130.0" y="352.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t8</text>
<rect x="180.0" y="290.0" width="100.0" height="100.0" fill="#10b981" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="230.0" y="336.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE1</text>
<text x="230.0" y="352.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t9</text>
<rect x="280.0" y="290.0" width="100.0" height="100.0" fill="#f59e0b" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="330.0" y="336.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE2</text>
<text x="330.0" y="352.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t10</text>
<rect x="380.0" y="290.0" width="100.0" height="100.0" fill="#ef4444" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="430.0" y="336.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE3</text>
<text x="430.0" y="352.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t11</text>
<rect x="80.0" y="390.0" width="100.0" height="100.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="130.0" y="436.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE4</text>
<text x="130.0" y="452.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t12</text>
<rect x="180.0" y="390.0" width="100.0" height="100.0" fill="#ec4899" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="230.0" y="436.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE5</text>
<text x="230.0" y="452.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t13</text>
<rect x="280.0" y="390.0" width="100.0" height="100.0" fill="#06b6d4" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="330.0" y="436.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE6</text>
<text x="330.0" y="452.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t14</text>
<rect x="380.0" y="390.0" width="100.0" height="100.0" fill="#f97316" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="430.0" y="436.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE7</text>
<text x="430.0" y="452.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t15</text>
<rect x="80.0" y="90.0" width="400.0" height="400.0" fill="none" stroke="#1e293b" stroke-width="2" fill-opacity="1.0" rx="2"/>
<text x="130.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=0..127</text>
<text x="230.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=128..255</text>
<text x="330.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=256..383</text>
<text x="430.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=384..511</text>
<text x="64.0" y="140.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=0..255</text>
<text x="64.0" y="240.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=256..511</text>
<text x="64.0" y="340.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=512..767</text>
<text x="64.0" y="440.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=768..1023</text>
<text x="590.0" y="90.0" text-anchor="middle" font-size="12" fill="#1e293b" font-weight="bold">PE Legend</text>
<rect x="540.0" y="96.0" width="16.0" height="16.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="562.0" y="108.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE0</text>
<rect x="540.0" y="118.0" width="16.0" height="16.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="562.0" y="130.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE1</text>
<rect x="540.0" y="140.0" width="16.0" height="16.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="562.0" y="152.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE2</text>
<rect x="540.0" y="162.0" width="16.0" height="16.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="562.0" y="174.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE3</text>
<rect x="540.0" y="184.0" width="16.0" height="16.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="562.0" y="196.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE4</text>
<rect x="540.0" y="206.0" width="16.0" height="16.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="562.0" y="218.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE5</text>
<rect x="540.0" y="228.0" width="16.0" height="16.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="562.0" y="240.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE6</text>
<rect x="540.0" y="250.0" width="16.0" height="16.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="562.0" y="262.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE7</text>
<text x="540.0" y="310.0" text-anchor="start" font-size="12" fill="#1e293b" font-weight="bold">Tile Assignment Order</text>
<rect x="540.0" y="318.0" width="12.0" height="12.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="328.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 0 → PE0 (0,0) off=0 B</text>
<rect x="540.0" y="334.0" width="12.0" height="12.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="344.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 1 → PE1 (0,1) off=256 B</text>
<rect x="540.0" y="350.0" width="12.0" height="12.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="360.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 2 → PE2 (0,2) off=512 B</text>
<rect x="540.0" y="366.0" width="12.0" height="12.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="376.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 3 → PE3 (0,3) off=768 B</text>
<rect x="540.0" y="382.0" width="12.0" height="12.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="392.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 4 → PE4 (1,0) off=256 KB</text>
<rect x="540.0" y="398.0" width="12.0" height="12.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="408.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 5 → PE5 (1,1) off=256 KB</text>
<rect x="540.0" y="414.0" width="12.0" height="12.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="424.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 6 → PE6 (1,2) off=256 KB</text>
<rect x="540.0" y="430.0" width="12.0" height="12.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="440.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 7 → PE7 (1,3) off=256 KB</text>
<rect x="540.0" y="446.0" width="12.0" height="12.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="456.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 8 → PE0 (2,0) off=512 KB</text>
<rect x="540.0" y="462.0" width="12.0" height="12.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="472.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 9 → PE1 (2,1) off=512 KB</text>
<rect x="540.0" y="478.0" width="12.0" height="12.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="488.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t10 → PE2 (2,2) off=512 KB</text>
<rect x="540.0" y="494.0" width="12.0" height="12.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="504.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t11 → PE3 (2,3) off=512 KB</text>
<rect x="540.0" y="510.0" width="12.0" height="12.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="520.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t12 → PE4 (3,0) off=768 KB</text>
<rect x="540.0" y="526.0" width="12.0" height="12.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="536.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t13 → PE5 (3,1) off=768 KB</text>
<rect x="540.0" y="542.0" width="12.0" height="12.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="552.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t14 → PE6 (3,2) off=768 KB</text>
<rect x="540.0" y="558.0" width="12.0" height="12.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="568.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t15 → PE7 (3,3) off=768 KB</text>
<rect x="80.0" y="580.0" width="608.0" height="30.0" fill="#e2e8f0" stroke="#94a3b8" stroke-width="1" fill-opacity="1.0" rx="2"/>
<text x="90.0" y="598.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Strategy: tiled_column_major | Tile: (256×128)=64 KB | Tiles: 16 | Total: 1 MB</text>
</svg>

After

Width:  |  Height:  |  Size: 14 KiB

+116
View File
@@ -0,0 +1,116 @@
<svg xmlns="http://www.w3.org/2000/svg" width="820" height="620" viewBox="0 0 820 620" font-family="monospace">
<rect width="820" height="620" fill="#f8fafc" rx="6"/>
<text x="410" y="32" text-anchor="middle" font-size="16" font-weight="bold" fill="#1e293b">Placement: tiled_row_major</text>
<text x="410.0" y="54.0" text-anchor="middle" font-size="11" fill="#475569" font-weight="normal">Tensor (1024×512) fp16, tile=(256×128) → 4×4=16 tiles, row-major (M first)</text>
<text x="280.0" y="82.0" text-anchor="middle" font-size="11" fill="#475569" font-weight="normal">← K=512 →</text>
<text x="68.0" y="290.0" text-anchor="middle" font-size="11" fill="#475569" transform="rotate(-90 68.0 290.0)">↑ M=1024 ↓</text>
<rect x="80.0" y="90.0" width="100.0" height="100.0" fill="#3b82f6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="130.0" y="136.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE0</text>
<text x="130.0" y="152.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t0</text>
<rect x="80.0" y="190.0" width="100.0" height="100.0" fill="#10b981" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="130.0" y="236.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE1</text>
<text x="130.0" y="252.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t1</text>
<rect x="80.0" y="290.0" width="100.0" height="100.0" fill="#f59e0b" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="130.0" y="336.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE2</text>
<text x="130.0" y="352.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t2</text>
<rect x="80.0" y="390.0" width="100.0" height="100.0" fill="#ef4444" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="130.0" y="436.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE3</text>
<text x="130.0" y="452.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t3</text>
<rect x="180.0" y="90.0" width="100.0" height="100.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="230.0" y="136.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE4</text>
<text x="230.0" y="152.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t4</text>
<rect x="180.0" y="190.0" width="100.0" height="100.0" fill="#ec4899" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="230.0" y="236.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE5</text>
<text x="230.0" y="252.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t5</text>
<rect x="180.0" y="290.0" width="100.0" height="100.0" fill="#06b6d4" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="230.0" y="336.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE6</text>
<text x="230.0" y="352.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t6</text>
<rect x="180.0" y="390.0" width="100.0" height="100.0" fill="#f97316" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="230.0" y="436.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE7</text>
<text x="230.0" y="452.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t7</text>
<rect x="280.0" y="90.0" width="100.0" height="100.0" fill="#3b82f6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="330.0" y="136.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE0</text>
<text x="330.0" y="152.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t8</text>
<rect x="280.0" y="190.0" width="100.0" height="100.0" fill="#10b981" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="330.0" y="236.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE1</text>
<text x="330.0" y="252.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t9</text>
<rect x="280.0" y="290.0" width="100.0" height="100.0" fill="#f59e0b" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="330.0" y="336.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE2</text>
<text x="330.0" y="352.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t10</text>
<rect x="280.0" y="390.0" width="100.0" height="100.0" fill="#ef4444" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="330.0" y="436.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE3</text>
<text x="330.0" y="452.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t11</text>
<rect x="380.0" y="90.0" width="100.0" height="100.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="430.0" y="136.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE4</text>
<text x="430.0" y="152.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t12</text>
<rect x="380.0" y="190.0" width="100.0" height="100.0" fill="#ec4899" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="430.0" y="236.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE5</text>
<text x="430.0" y="252.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t13</text>
<rect x="380.0" y="290.0" width="100.0" height="100.0" fill="#06b6d4" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="430.0" y="336.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE6</text>
<text x="430.0" y="352.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t14</text>
<rect x="380.0" y="390.0" width="100.0" height="100.0" fill="#f97316" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
<text x="430.0" y="436.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE7</text>
<text x="430.0" y="452.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t15</text>
<rect x="80.0" y="90.0" width="400.0" height="400.0" fill="none" stroke="#1e293b" stroke-width="2" fill-opacity="1.0" rx="2"/>
<text x="130.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=0..127</text>
<text x="230.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=128..255</text>
<text x="330.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=256..383</text>
<text x="430.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=384..511</text>
<text x="64.0" y="140.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=0..255</text>
<text x="64.0" y="240.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=256..511</text>
<text x="64.0" y="340.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=512..767</text>
<text x="64.0" y="440.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=768..1023</text>
<text x="590.0" y="90.0" text-anchor="middle" font-size="12" fill="#1e293b" font-weight="bold">PE Legend</text>
<rect x="540.0" y="96.0" width="16.0" height="16.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="562.0" y="108.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE0</text>
<rect x="540.0" y="118.0" width="16.0" height="16.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="562.0" y="130.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE1</text>
<rect x="540.0" y="140.0" width="16.0" height="16.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="562.0" y="152.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE2</text>
<rect x="540.0" y="162.0" width="16.0" height="16.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="562.0" y="174.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE3</text>
<rect x="540.0" y="184.0" width="16.0" height="16.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="562.0" y="196.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE4</text>
<rect x="540.0" y="206.0" width="16.0" height="16.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="562.0" y="218.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE5</text>
<rect x="540.0" y="228.0" width="16.0" height="16.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="562.0" y="240.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE6</text>
<rect x="540.0" y="250.0" width="16.0" height="16.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="562.0" y="262.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE7</text>
<text x="540.0" y="310.0" text-anchor="start" font-size="12" fill="#1e293b" font-weight="bold">Tile Assignment Order</text>
<rect x="540.0" y="318.0" width="12.0" height="12.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="328.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 0 → PE0 (0,0) off=0 B</text>
<rect x="540.0" y="334.0" width="12.0" height="12.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="344.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 1 → PE1 (1,0) off=256 KB</text>
<rect x="540.0" y="350.0" width="12.0" height="12.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="360.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 2 → PE2 (2,0) off=512 KB</text>
<rect x="540.0" y="366.0" width="12.0" height="12.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="376.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 3 → PE3 (3,0) off=768 KB</text>
<rect x="540.0" y="382.0" width="12.0" height="12.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="392.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 4 → PE4 (0,1) off=256 B</text>
<rect x="540.0" y="398.0" width="12.0" height="12.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="408.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 5 → PE5 (1,1) off=256 KB</text>
<rect x="540.0" y="414.0" width="12.0" height="12.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="424.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 6 → PE6 (2,1) off=512 KB</text>
<rect x="540.0" y="430.0" width="12.0" height="12.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="440.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 7 → PE7 (3,1) off=768 KB</text>
<rect x="540.0" y="446.0" width="12.0" height="12.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="456.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 8 → PE0 (0,2) off=512 B</text>
<rect x="540.0" y="462.0" width="12.0" height="12.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="472.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 9 → PE1 (1,2) off=256 KB</text>
<rect x="540.0" y="478.0" width="12.0" height="12.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="488.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t10 → PE2 (2,2) off=512 KB</text>
<rect x="540.0" y="494.0" width="12.0" height="12.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="504.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t11 → PE3 (3,2) off=768 KB</text>
<rect x="540.0" y="510.0" width="12.0" height="12.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="520.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t12 → PE4 (0,3) off=768 B</text>
<rect x="540.0" y="526.0" width="12.0" height="12.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="536.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t13 → PE5 (1,3) off=256 KB</text>
<rect x="540.0" y="542.0" width="12.0" height="12.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="552.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t14 → PE6 (2,3) off=512 KB</text>
<rect x="540.0" y="558.0" width="12.0" height="12.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
<text x="558.0" y="568.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t15 → PE7 (3,3) off=768 KB</text>
<rect x="80.0" y="580.0" width="587.0" height="30.0" fill="#e2e8f0" stroke="#94a3b8" stroke-width="1" fill-opacity="1.0" rx="2"/>
<text x="90.0" y="598.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Strategy: tiled_row_major | Tile: (256×128)=64 KB | Tiles: 16 | Total: 1 MB</text>
</svg>

After

Width:  |  Height:  |  Size: 14 KiB

+95
View File
@@ -0,0 +1,95 @@
<svg xmlns="http://www.w3.org/2000/svg" width="648" height="648" viewBox="0 0 648 648">
<title>sip</title>
<rect width="648" height="648" fill="#f8fafc"/>
<text x="324" y="18" text-anchor="middle" font-family="monospace" font-size="14" font-weight="bold" fill="#1e293b">SIP VIEW</text>
<line x1="108.0" y1="144.0" x2="252.0" y2="144.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
<text x="180.0" y="140.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
<line x1="108.0" y1="144.0" x2="108.0" y2="264.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
<text x="108.0" y="200.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
<line x1="252.0" y1="144.0" x2="396.0" y2="144.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
<text x="324.0" y="140.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
<line x1="252.0" y1="144.0" x2="252.0" y2="264.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
<text x="252.0" y="200.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
<line x1="396.0" y1="144.0" x2="540.0" y2="144.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
<text x="468.0" y="140.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
<line x1="396.0" y1="144.0" x2="396.0" y2="264.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
<text x="396.0" y="200.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
<line x1="540.0" y1="144.0" x2="540.0" y2="264.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
<text x="540.0" y="200.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
<line x1="108.0" y1="264.0" x2="252.0" y2="264.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
<text x="180.0" y="260.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
<line x1="108.0" y1="264.0" x2="108.0" y2="384.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
<text x="108.0" y="320.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
<line x1="252.0" y1="264.0" x2="396.0" y2="264.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
<text x="324.0" y="260.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
<line x1="252.0" y1="264.0" x2="252.0" y2="384.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
<text x="252.0" y="320.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
<line x1="396.0" y1="264.0" x2="540.0" y2="264.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
<text x="468.0" y="260.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
<line x1="396.0" y1="264.0" x2="396.0" y2="384.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
<text x="396.0" y="320.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
<line x1="540.0" y1="264.0" x2="540.0" y2="384.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
<text x="540.0" y="320.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
<line x1="108.0" y1="384.0" x2="252.0" y2="384.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
<text x="180.0" y="380.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
<line x1="108.0" y1="384.0" x2="108.0" y2="504.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
<text x="108.0" y="440.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
<line x1="252.0" y1="384.0" x2="396.0" y2="384.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
<text x="324.0" y="380.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
<line x1="252.0" y1="384.0" x2="252.0" y2="504.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
<text x="252.0" y="440.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
<line x1="396.0" y1="384.0" x2="540.0" y2="384.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
<text x="468.0" y="380.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
<line x1="396.0" y1="384.0" x2="396.0" y2="504.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
<text x="396.0" y="440.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
<line x1="540.0" y1="384.0" x2="540.0" y2="504.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
<text x="540.0" y="440.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
<line x1="108.0" y1="504.0" x2="252.0" y2="504.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
<text x="180.0" y="500.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
<line x1="252.0" y1="504.0" x2="396.0" y2="504.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
<text x="324.0" y="500.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
<line x1="396.0" y1="504.0" x2="540.0" y2="504.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
<text x="468.0" y="500.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
<polyline points="324.0,56.0 108.0,56.0 108.0,144.0" fill="none" stroke="#0ea5e9" stroke-width="1" opacity="0.8"/>
<text x="216.0" y="96.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.5mm 512GB/s</text>
<polyline points="324.0,56.0 252.0,56.0 252.0,144.0" fill="none" stroke="#0ea5e9" stroke-width="1" opacity="0.8"/>
<text x="288.0" y="96.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.5mm 512GB/s</text>
<polyline points="324.0,56.0 396.0,56.0 396.0,144.0" fill="none" stroke="#0ea5e9" stroke-width="1" opacity="0.8"/>
<text x="360.0" y="96.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.5mm 512GB/s</text>
<polyline points="324.0,56.0 540.0,56.0 540.0,144.0" fill="none" stroke="#0ea5e9" stroke-width="1" opacity="0.8"/>
<text x="432.0" y="96.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.5mm 512GB/s</text>
<rect x="84.0" y="128.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
<text x="108.0" y="148.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (0,0)</text>
<rect x="228.0" y="128.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
<text x="252.0" y="148.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (1,0)</text>
<rect x="372.0" y="128.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
<text x="396.0" y="148.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (2,0)</text>
<rect x="516.0" y="128.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
<text x="540.0" y="148.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (3,0)</text>
<rect x="84.0" y="248.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
<text x="108.0" y="268.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (0,1)</text>
<rect x="228.0" y="248.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
<text x="252.0" y="268.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (1,1)</text>
<rect x="372.0" y="248.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
<text x="396.0" y="268.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (2,1)</text>
<rect x="516.0" y="248.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
<text x="540.0" y="268.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (3,1)</text>
<rect x="84.0" y="368.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
<text x="108.0" y="388.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (0,2)</text>
<rect x="228.0" y="368.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
<text x="252.0" y="388.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (1,2)</text>
<rect x="372.0" y="368.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
<text x="396.0" y="388.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (2,2)</text>
<rect x="516.0" y="368.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
<text x="540.0" y="388.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (3,2)</text>
<rect x="84.0" y="488.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
<text x="108.0" y="508.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (0,3)</text>
<rect x="228.0" y="488.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
<text x="252.0" y="508.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (1,3)</text>
<rect x="372.0" y="488.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
<text x="396.0" y="508.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (2,3)</text>
<rect x="516.0" y="488.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
<text x="540.0" y="508.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (3,3)</text>
<rect x="308.0" y="50.0" width="32.0" height="12.0" rx="4" fill="#0ea5e9" stroke="#475569" stroke-width="1"/>
<text x="324.0" y="60.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#ffffff">IO io0</text>
</svg>

After

Width:  |  Height:  |  Size: 10 KiB

+19
View File
@@ -0,0 +1,19 @@
<svg xmlns="http://www.w3.org/2000/svg" width="768" height="396" viewBox="0 0 768 396">
<title>system</title>
<rect width="768" height="396" fill="#f8fafc"/>
<text x="384" y="18" text-anchor="middle" font-family="monospace" font-size="14" font-weight="bold" fill="#1e293b">SYSTEM VIEW</text>
<polyline points="384.0,60.0 182.0,60.0 182.0,120.0" fill="none" stroke="#6366f1" stroke-width="1" opacity="0.8"/>
<text x="283.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">20.0mm 256GB/s</text>
<polyline points="384.0,60.0 586.0,60.0 586.0,120.0" fill="none" stroke="#6366f1" stroke-width="1" opacity="0.8"/>
<text x="485.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">20.0mm 256GB/s</text>
<rect x="352.0" y="54.0" width="64.0" height="12.0" rx="4" fill="#6366f1" stroke="#475569" stroke-width="1"/>
<text x="384.0" y="64.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#ffffff">Fabric Switch</text>
<rect x="62.0" y="138.0" width="240.0" height="200.0" rx="4" fill="#e0e7ff" stroke="#475569" stroke-width="1"/>
<text x="182.0" y="242.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">SIP 0</text>
<rect x="166.0" y="114.0" width="32.0" height="12.0" rx="4" fill="#0ea5e9" stroke="#475569" stroke-width="1"/>
<text x="182.0" y="124.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#ffffff">IO io0</text>
<rect x="466.0" y="138.0" width="240.0" height="200.0" rx="4" fill="#e0e7ff" stroke="#475569" stroke-width="1"/>
<text x="586.0" y="242.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">SIP 1</text>
<rect x="570.0" y="114.0" width="32.0" height="12.0" rx="4" fill="#0ea5e9" stroke="#475569" stroke-width="1"/>
<text x="586.0" y="124.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#ffffff">IO io0</text>
</svg>

After

Width:  |  Height:  |  Size: 1.9 KiB

+381
View File
@@ -0,0 +1,381 @@
# Latency Model
## Overview
kernbench uses a discrete-event simulation (SimPy) to compute end-to-end latency.
Every request flows through a graph of **components** connected by **wires**.
The total latency reported is the **simulated time elapsed on the SimPy clock**
(`env.now` delta), not a static formula—so contention and queueing are captured automatically.
```
total_ns (actual) = wire_prop + component_overhead + drain + queueing
├── deterministic ──────────────────┘ │
└── contention-dependent ────────────────────┘
```
## Three Deterministic Cost Components
### 1. Wire Propagation
```
wire_ns = distance_mm × ns_per_mm (global: 0.01 = 10 ps/mm)
```
Every edge in the topology graph has a `distance_mm`. A SimPy wire process
delays each message by `wire_ns` before delivering it to the next component.
For on-chip silicon this is ~10 ps/mm; the same constant applies everywhere
since all links are on-die or interposer. Wire propagation is typically <1 ns
and negligible compared to other costs.
### 2. Component Overhead (`overhead_ns`)
```
component_ns = node.attrs["overhead_ns"]
```
Each component on the path adds a fixed processing delay via `yield env.timeout(overhead_ns)`.
This models arbitration, protocol processing, pipeline stages, etc.
| Component | overhead_ns | Meaning |
|-----------|-------------|---------|
| pcie_ep | 5.0 | PCIe protocol processing |
| io_cpu | 10.0 | Command decode / dispatch |
| m_cpu | 5.0 | DMA scheduling |
| fabric switch | 5.0 | Packet arbitration |
| xbar | 2.0 | Crossbar arbitration |
| xbar bridge | 1.0 | Bridge traversal between xbar halves |
| ucie | 1.0 | UCIe protocol overhead per port |
| noc (2D mesh) | 0.0 | Hop delay modeled internally via manhattan distance |
| hbm_ctrl | 0.0 | Access time captured in drain_ns |
| pe_cpu | 2.0 | Command dispatch |
| pe_scheduler | 1.0 | PE-internal scheduling |
| pe_gemm/math | 0.0 | Placeholder; will use flops-based model |
### 3. Drain (Serialization Delay)
```
drain_ns = nbytes / bottleneck_bw_gbs
```
**Wormhole (cut-through) model**: data flows through intermediate nodes as a
pipeline. Serialization cost is paid **once** at the terminal node, not at
every hop. The bottleneck is the minimum `bw_gbs` across all edges in the path.
Example: 4096 bytes through a path with bottleneck 128 GB/s → `4096 / 128 = 32.0 ns`
(1 GB/s equals 1 byte/ns, so dividing bytes by GB/s yields nanoseconds directly).
### Formula (Theoretical Lower Bound)
```
formula_ns = Σ(wire_prop) + Σ(overhead_ns) + drain_ns
```
This is the latency with **zero contention**—no other request competing for
any resource. The engine provides `_formula_latency()` for verification.
With no contention: `actual == formula`. With contention: `actual > formula`.
### Diagram: PE DMA Read (pe0 → local slice0, 4096 bytes)
```mermaid
sequenceDiagram
participant D as pe_dma
participant X as xbar.pe0
participant H as hbm_ctrl.slice0
D->>X: txn (4096B)
Note over X: overhead 2.0 ns
X->>H: txn (wire 0.025 ns)
Note over H: acquire Resource
Note over H: overhead 0 ns
Note over H: drain 4096/256 = 16.0 ns
Note over H: release Resource
H-->>D: done.succeed()
Note over D,H: total_ns = 18.09 ns<br/>formula = wire(0.025) + ovhd(2.0) + drain(16.0) = 18.025 ns<br/>actual ≈ formula (no contention)
```
### Diagram: Two Requests — No Contention vs HOL Blocking
#### Case 1: Different slices (parallel, no contention)
```mermaid
sequenceDiagram
participant A as Request A
participant S0 as hbm_ctrl.slice0<br/>Resource(cap=1)
participant S1 as hbm_ctrl.slice1<br/>Resource(cap=1)
Note over A,S1: t=2 ns — both requests arrive at their own slice
A->>S0: A (4KB)
A->>S1: B (4KB)
Note over S0: acquire (immediate)
Note over S1: acquire (immediate)
Note over S0: drain 16.0 ns
Note over S1: drain 16.0 ns
Note over S0: t=18 release
Note over S1: t=18 release
Note over A,S1: A actual = 18 ns, B actual = 18 ns<br/>No waiting — separate Resources
```
#### Case 2: Same slice (HOL blocking)
```mermaid
sequenceDiagram
participant A as Request A (4KB)
participant Q as hbm_ctrl.slice0<br/>Resource(cap=1)
participant B as Request B (64B)
Note over A,B: t=0 — A arrives first
A->>Q: acquire (immediate)
Note over Q: drain A = 16.0 ns
Note over B,Q: t=5 — B arrives, yield req → BLOCKED
B--xQ: waiting...
Note over Q: t=16 — A drain done, release
Q->>B: B acquires resource
Note over Q: drain B = 0.25 ns
Note over Q: t=16.25 — B done, release
Note over A,B: A actual = 16.0 ns (== formula)<br/>B actual = 11.25 ns (formula 0.25 + queueing 11.0)<br/>HOL blocking: short request waits behind long drain
```
---
## How SimPy Tracks Latency
### Measurement
```python
start_ns = env.now
yield txn_done # wait for the transaction to complete
total_ns = env.now - start_ns # ← this is what probe reports
```
`env.now` is SimPy's simulation clock. It only advances when a process `yield`s
a timeout or waits on a resource/store. The delta between start and done captures
**everything**: wire delays, component overheads, drain, and any queueing.
### Component Pipeline
Each component is a SimPy process:
```
_fan_in (per in_port) → _inbox (Store) → _worker → out_ports
```
1. **`_fan_in`**: relays messages from each `in_port` into a shared `_inbox` Store.
2. **`_worker`**: pulls from `_inbox`, spawns `_forward_txn` per message.
3. **`_forward_txn`**: calls `run()` (overhead), then puts to `out_ports[next_hop]`.
The worker uses `env.process()` (pipeline model), so multiple messages can be
in-flight through the same component concurrently. Contention happens when
they compete for shared resources (e.g., `simpy.Resource` in hbm_ctrl).
### Wire Process
```python
while True:
msg = yield out_port.get() # wait for sender
yield env.timeout(prop_ns) # propagation delay
yield in_port.put(msg) # deliver to receiver
```
Each directed edge has its own wire process. Messages are delayed by exactly
`distance_mm × ns_per_mm`.
---
## Contention and Queueing
Queueing delay is **not a separate formula term**—it emerges from SimPy's
event scheduling when multiple requests compete for the same resource.
### Where Contention Occurs
| Resource | SimPy Type | Capacity | Effect |
|----------|-----------|----------|--------|
| hbm_ctrl | `simpy.Resource` | 1 | Serializes HBM access |
| m_cpu DMA read engine | `simpy.Resource` | 1 | Serializes DMA reads |
| m_cpu DMA write engine | `simpy.Resource` | 1 | Serializes DMA writes |
| pe_dma channels | `simpy.Resource` | configurable | Serializes PE DMA ops |
| component inbox | `simpy.Store` | unbounded | No backpressure (FIFO) |
### How Queueing Works
```python
# hbm_ctrl._worker
with self._resource.request() as req:
yield req # ← BLOCKS if resource is occupied
yield from self.run(env, txn.nbytes)
yield env.timeout(drain_ns)
```
If request A holds the resource and request B arrives:
- B's `yield req` blocks until A releases the resource
- By the time B resumes, the shared simulation clock (`env.now`) has advanced by A's remaining service time
- This "extra" time shows up in B's `total_ns` automatically
```
No contention: actual_ns == formula_ns
Contention: actual_ns > formula_ns
queueing_delay = actual_ns - formula_ns
```
### Head-of-Line (HOL) Blocking at hbm_ctrl
The `simpy.Resource` is held for the **entire** `with` block—both overhead and
drain. The resource is NOT released between overhead and drain:
```python
with self._resource.request() as req:
yield req # acquire (or wait)
yield from self.run(env, txn.nbytes) # overhead_ns ─┐
yield env.timeout(drain_ns) # drain_ns │ resource held
# ← resource released here ───────────────────────────────┘
```
This means a short request arriving during a long request's drain must wait
for the full remaining drain time—classic head-of-line blocking:
```
Request A: 4 KB, drain = 16.0 ns (arrives at t=0)
Request B: 64 B, drain = 0.25 ns (arrives at t=5)
Timeline:
t=0.00 A acquires resource
t=0.00 A: overhead (0 ns)
t=0.00 A: drain starts (16.0 ns)
t=5.00 B arrives → yield req → BLOCKED (A holds resource)
t=16.00 A: drain done → resource released
t=16.00 B acquires resource
t=16.00 B: overhead (0 ns)
t=16.25 B: drain done → resource released
B actual = 11.25 ns (waited 11.0 + own 0.25)
B formula = 0.25 ns
B queueing = 11.0 ns ← HOL blocking penalty
```
**Why this is physically realistic**: An HBM channel processes one burst at a
time. While data is being serialized onto the channel (drain), no other request
can use that channel. The FIFO ordering (`simpy.Resource` default) reflects
the simplest controller scheduling policy.
**Alternative: priority scheduling**: If needed, `simpy.PriorityResource` can
prioritize shorter requests (Shortest Job First), but this is not currently
used since FIFO matches typical HBM controller behavior.
---
## Worked Example: Two Concurrent PE DMA Reads
Setup: PE0 and PE1 in cube0 both read 4096 bytes from their local HBM slices
(slice0 and slice1), submitted to the **same engine** at the same time.
### Paths
```
DMA A: pe0.pe_dma → xbar.pe0 → hbm_ctrl.slice0
DMA B: pe1.pe_dma → xbar.pe1 → hbm_ctrl.slice1
```
### No Contention (different HBM slices)
Since slice0 and slice1 are **separate** hbm_ctrl instances, each with its own
`simpy.Resource(capacity=1)`, there is no resource competition.
```
DMA A timeline:
t=0.00 pe_dma dequeues txn
t=0.00 xbar.pe0: overhead_ns=2.0 → t=2.00
t=2.025 wire prop (2.5mm × 0.01) → t=2.025
t=2.025 hbm_ctrl.slice0: yield req → immediate (no contention)
t=2.025 hbm_ctrl.slice0: overhead_ns=0 → t=2.025
t=18.025 drain_ns = 4096/256 = 16.0 → t=18.025
t=18.025 done
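DMA B timeline: (identical, on its own slice)
t=0.00 → ... → t=18.025 done
```
Both complete at ~18.03 ns in this simplified timeline (the probe reports 18.09 ns for the
same case, presumably because it also counts wire segments omitted here). `actual == formula` for both.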
### With Contention (same HBM slice)
Now suppose both PE0 and PE1 read from **slice0**:
```
DMA A: pe0.pe_dma → xbar.pe0 → hbm_ctrl.slice0
DMA B: pe1.pe_dma → xbar.pe1 → xbar.pe0 → hbm_ctrl.slice0
(chain traversal to reach slice0)
```
```
DMA A timeline:
t=0.00 xbar.pe0(2.0) → wire → hbm_ctrl.slice0
t=2.025 yield req → immediate (first to arrive)
t=18.025 drain 16.0 → release resource → done
actual_A = 18.025 ns (== formula)
DMA B timeline:
t=0.00 xbar.pe1(2.0) → xbar.pe0(2.0) → wire → hbm_ctrl.slice0
t=4.035 yield req → BLOCKED (A holds resource until t=18.025)
t=18.025 acquire resource
t=50.025 drain 32.0 → release → done
B's drain uses the bottleneck BW of B's own path (xbar_x_bw = 128 GB/s),
not the 256 GB/s of A's local path:
B's bottleneck: xbar_x_bw = 128 GB/s → drain = 4096/128 = 32.0 ns
formula_B = wire(0.035) + overhead(4.0) + drain(32.0) = 36.035 ns
queueing = 18.025 - 4.035 = 13.99 ns (time waiting for A to release hbm_ctrl)
actual_B = formula_B + queueing = 36.035 + 13.99 = 50.025 ns
```
The key insight: **queueing delay is not in the formula**. It only appears in
the actual SimPy simulation when resources are contested. The probe reports
`actual_ns`, which includes all queueing. To see pure queueing overhead,
compare `actual_ns` vs `formula_ns` (available in PE DMA traces).
---
## Probe Output Explained
```
=== PE DMA Latency ===
Case Target Actual Ovhd Drain Wire Ovhd% Drain% Eff.BW BN.BW Util%
pe-local-hbm c0.pe0->c0.slice0 18.09 2.0 16.0 0.08 11.1% 88.5% 226.49 256.0 88.5%
pe-cross-half-hbm c0.pe0->c0.slice4 37.14 5.0 32.0 0.14 13.5% 86.1% 110.27 128.0 86.1%
```
| Column | Meaning |
|--------|---------|
| **Actual** | SimPy measured `env.now` delta (includes contention if any) |
| **Ovhd** | Sum of `overhead_ns` for all components on the forward path |
| **Drain** | `nbytes / bottleneck_bw` — serialization at terminal |
| **Wire** | Sum of `distance_mm × ns_per_mm` for all edges |
| **Ovhd%** | `Ovhd / Actual × 100` — fraction of time spent in component processing |
| **Drain%** | `Drain / Actual × 100` — fraction of time spent in data transfer |
| **Eff.BW** | `nbytes / Actual` — achieved bandwidth |
| **BN.BW** | Bottleneck bandwidth (min `bw_gbs` on path) |
| **Util%** | `Eff.BW / BN.BW × 100` — how close to theoretical max BW |
### Why Util% < 100%
`Util% = Drain% = drain_ns / actual_ns`. The gap from 100% is the overhead
fraction. For small transfers (4KB), overhead is significant relative to drain.
For large transfers, drain dominates and utilization approaches 100%.
```
4 KB: Ovhd=2.0, Drain=16.0 → Util=88.5% (overhead is 11% of time)
64 KB: Ovhd=2.0, Drain=256.0 → Util=99.2% (overhead is <1% of time)
```
### H2D Path: Why Ovhd% is ~40%
H2D traverses many components (pcie_ep → io_cpu → ucie → noc → m_cpu → noc →
xbar → hbm_ctrl + response path). Total forward overhead is ~23 ns vs drain
of 32 ns for 4KB, so overhead is comparable to data transfer time—resulting
in ~55% utilization. This is expected for small command-path transfers.
+30
View File
@@ -0,0 +1,30 @@
# Build backend: plain setuptools.
[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "kernbench"
version = "0.1.0"
requires-python = ">=3.10"
# NOTE(review): pytest is listed as a runtime dependency here and again (pinned >=7)
# in the "dev" extra below — confirm whether it is really needed at runtime.
dependencies = ["pytest", "simpy", "pyyaml"]
# Console entry point: `kernbench` runs kernbench.cli.main:main.
[project.scripts]
kernbench = "kernbench.cli.main:main"
[project.optional-dependencies]
dev = [
"pytest>=7",
"ruff>=0.4.0",
]
[tool.ruff]
line-length = 100
target-version = "py310"
# Lint only; ruff does not rewrite files unless --fix is passed explicitly.
fix = false
[tool.ruff.lint]
# E: pycodestyle, F: pyflakes, I: isort, B: bugbear, UP: pyupgrade.
select = ["E", "F", "I", "B", "UP"]
# F401 = unused import.
ignore = ["F401"]
[tool.pytest.ini_options]
addopts = ["--disable-warnings"]
+393
View File
@@ -0,0 +1,393 @@
#!/usr/bin/env python3
"""Generate SVG diagrams illustrating each placement strategy.
Example tensor: (M=1024, K=512) fp16 (itemsize=2), 8 PEs.
Tiled variants use tile_m=256, tile_k=128.
Output: docs/diagrams/placement_*.svg
"""
from __future__ import annotations
import math
from pathlib import Path
# ── Diagram parameters ──────────────────────────────────────────────
# Example tensor shape (rows M, columns K) used by every diagram.
M, K = 1024, 512
# Bytes per element (fp16).
ITEMSIZE = 2
# Number of PEs the tensor is distributed over; also the length of the
# two colour tables below, which are indexed by PE id.
NUM_PE = 8
# Tile shape used by the tiled placement variants.
TILE_M, TILE_K = 256, 128
# Fill colour of each PE's cells, indexed by PE id.
PE_COLORS = [
"#3b82f6", # PE0 blue
"#10b981", # PE1 emerald
"#f59e0b", # PE2 amber
"#ef4444", # PE3 red
"#8b5cf6", # PE4 violet
"#ec4899", # PE5 pink
"#06b6d4", # PE6 cyan
"#f97316", # PE7 orange
]
# Label colour drawn on top of PE_COLORS[i]; black is used on the amber
# (PE2) and cyan (PE6) fills, white everywhere else.
PE_TEXT_COLORS = [
"#fff", "#fff", "#000", "#fff",
"#fff", "#fff", "#000", "#fff",
]
# Output directory: <repo>/docs/diagrams, resolved relative to this script.
OUT_DIR = Path(__file__).parent.parent / "docs" / "diagrams"
# ── SVG helpers ─────────────────────────────────────────────────────
def _svg_header(w: int, h: int, title: str) -> str:
return (
f'<svg xmlns="http://www.w3.org/2000/svg" width="{w}" height="{h}"'
f' viewBox="0 0 {w} {h}" font-family="monospace">\n'
f'<rect width="{w}" height="{h}" fill="#f8fafc" rx="6"/>\n'
f'<text x="{w // 2}" y="32" text-anchor="middle" font-size="16"'
f' font-weight="bold" fill="#1e293b">{title}</text>\n'
)
def _svg_footer() -> str:
return "</svg>\n"
def _rect(x: float, y: float, w: float, h: float, fill: str,
stroke: str = "#334155", sw: float = 1.0, opacity: float = 1.0) -> str:
return (
f'<rect x="{x:.1f}" y="{y:.1f}" width="{w:.1f}" height="{h:.1f}"'
f' fill="{fill}" stroke="{stroke}" stroke-width="{sw}"'
f' fill-opacity="{opacity}" rx="2"/>\n'
)
def _text(x: float, y: float, txt: str, size: int = 11,
anchor: str = "middle", fill: str = "#1e293b",
weight: str = "normal") -> str:
return (
f'<text x="{x:.1f}" y="{y:.1f}" text-anchor="{anchor}"'
f' font-size="{size}" fill="{fill}" font-weight="{weight}">{txt}</text>\n'
)
def _line(x1: float, y1: float, x2: float, y2: float,
stroke: str = "#94a3b8", sw: float = 1) -> str:
return (
f'<line x1="{x1:.1f}" y1="{y1:.1f}" x2="{x2:.1f}" y2="{y2:.1f}"'
f' stroke="{stroke}" stroke-width="{sw}"/>\n'
)
def _format_bytes(n: int) -> str:
if n >= (1 << 20):
return f"{n >> 20} MB"
if n >= (1 << 10):
return f"{n >> 10} KB"
return f"{n} B"
def _legend(x: float, y0: float, num_pe: int = NUM_PE) -> str:
    """Return the "PE Legend" heading plus one colour swatch and label per PE.

    Entries are stacked vertically, 22 px apart, starting 18 px below ``y0``.
    """
    parts = [_text(x + 50, y0, "PE Legend", size=12, weight="bold")]
    for pe_idx in range(num_pe):
        row_y = y0 + 18 + pe_idx * 22
        parts.append(_rect(x, row_y - 12, 16, 16, PE_COLORS[pe_idx]))
        parts.append(_text(x + 22, row_y, f"PE{pe_idx}", size=11, anchor="start"))
    return "".join(parts)
def _axes(gx: float, gy: float, gw: float, gh: float,
          m_label: str = "M=1024", k_label: str = "K=512") -> str:
    """Return the two axis captions for a grid at (gx, gy) of size gw × gh.

    The K caption is centered above the grid; the M caption is rotated
    -90° and centered along the grid's left edge.
    """
    # Horizontal (K) caption, 8 px above the grid.
    k_caption = _text(gx + gw / 2, gy - 8, str(k_label), size=11, fill="#475569")
    # Vertical (M) caption: rotated about its own anchor, 12 px left of the grid.
    mx = gx - 12
    my = gy + gh / 2
    m_caption = (
        f'<text x="{mx:.1f}" y="{my:.1f}" text-anchor="middle"'
        f' font-size="11" fill="#475569"'
        f' transform="rotate(-90 {mx:.1f} {my:.1f})">↑ {m_label} ↓</text>\n'
    )
    return k_caption + m_caption
def _info_box(x: float, y: float, lines: list[str]) -> str:
    """Return a grey box at (x, y) containing ``lines``, one per 18 px row.

    The box width is estimated from the longest line at 7 px per character
    plus 20 px padding. An empty ``lines`` list yields an empty,
    padding-only box instead of raising ``ValueError`` from ``max()``.
    """
    longest = max((len(line) for line in lines), default=0)
    bw = longest * 7 + 20
    bh = len(lines) * 18 + 12
    s = _rect(x, y, bw, bh, "#e2e8f0", stroke="#94a3b8", sw=1)
    for i, line in enumerate(lines):
        s += _text(x + 10, y + 18 + i * 18, line, size=10, anchor="start", fill="#334155")
    return s
# ── Grid drawing ────────────────────────────────────────────────────
def _draw_grid(
    gx: float, gy: float, gw: float, gh: float,
    cells: list[dict],  # [{row, col, rspan, cspan, pe, label?, offset?}]
    rows: int, cols: int,
    cell_labels: bool = True,
) -> str:
    """Return SVG for a ``rows`` × ``cols`` grid of PE-coloured shard cells.

    Each cell dict supplies ``row``, ``col`` and ``pe``; ``rspan``/``cspan``
    default to 1. When ``cell_labels`` is true and the cell has a ``label``
    key, that label is drawn under the PE name. A bold border is drawn
    around the whole grid last.
    """
    cell_w = gw / cols
    cell_h = gh / rows
    parts: list[str] = []
    for cell in cells:
        left = gx + cell["col"] * cell_w
        top = gy + cell["row"] * cell_h
        width = cell.get("cspan", 1) * cell_w
        height = cell.get("rspan", 1) * cell_h
        pe = cell["pe"]
        parts.append(_rect(left, top, width, height, PE_COLORS[pe], stroke="#334155", sw=1.5))
        # PE name centered in the cell, optional label just beneath it.
        mid_x = left + width / 2
        mid_y = top + height / 2
        parts.append(_text(mid_x, mid_y - 4, f"PE{pe}", size=12,
                           fill=PE_TEXT_COLORS[pe], weight="bold"))
        if cell_labels and "label" in cell:
            parts.append(_text(mid_x, mid_y + 12, cell["label"], size=9,
                               fill=PE_TEXT_COLORS[pe]))
    # Outer border on top of the cells.
    parts.append(_rect(gx, gy, gw, gh, "none", stroke="#1e293b", sw=2))
    return "".join(parts)
# ── Strategy-specific generators ────────────────────────────────────
def gen_column_wise() -> str:
    """Render the column_wise diagram: K is split into NUM_PE equal column shards."""
    W, H = 820, 500
    gx, gy, gw, gh = 80, 90, 480, 320
    chunk_k = K // NUM_PE  # 64
    chunk_bytes = M * chunk_k * ITEMSIZE

    parts = [
        _svg_header(W, H, "Placement: column_wise"),
        _text(W // 2, 54, f"Tensor ({M}×{K}) fp16 → K axis split into {NUM_PE} parts",
              size=12, fill="#475569"),
        _axes(gx, gy, gw, gh),
    ]

    # One full-height column per PE.
    cells = [
        {"row": 0, "col": pe, "rspan": 1, "cspan": 1, "pe": pe, "label": f"({M}×{chunk_k})"}
        for pe in range(NUM_PE)
    ]
    parts.append(_draw_grid(gx, gy, gw, gh, cells, rows=1, cols=NUM_PE))

    # Byte offset and shard size under each column.
    cw = gw / NUM_PE
    for pe in range(NUM_PE):
        cx = gx + pe * cw + cw / 2
        off = pe * chunk_bytes
        parts.append(_text(cx, gy + gh + 16, f"off={_format_bytes(off)}", size=9, fill="#475569"))
        parts.append(_text(cx, gy + gh + 30, f"{_format_bytes(chunk_bytes)}", size=9, fill="#64748b"))

    parts.append(_legend(620, 100))
    parts.append(_info_box(620, 320, [
        "Strategy: column_wise",
        "Split axis: K",
        f"Shards: {NUM_PE}",
        f"Each: ({M}, {chunk_k})",
        f"Each: {_format_bytes(chunk_bytes)}",
        f"Total: {_format_bytes(M * K * ITEMSIZE)}",
    ]))
    parts.append(_svg_footer())
    return "".join(parts)
def gen_row_wise() -> str:
    """Render the row_wise diagram: M is split into NUM_PE equal row shards."""
    W, H = 820, 560
    gx, gy, gw, gh = 80, 90, 320, 400
    chunk_m = M // NUM_PE  # 128
    chunk_bytes = chunk_m * K * ITEMSIZE

    parts = [
        _svg_header(W, H, "Placement: row_wise"),
        _text(W // 2, 54, f"Tensor ({M}×{K}) fp16 → M axis split into {NUM_PE} parts",
              size=12, fill="#475569"),
        _axes(gx, gy, gw, gh),
    ]

    # One full-width row per PE.
    cells = [
        {"row": pe, "col": 0, "rspan": 1, "cspan": 1, "pe": pe, "label": f"({chunk_m}×{K})"}
        for pe in range(NUM_PE)
    ]
    parts.append(_draw_grid(gx, gy, gw, gh, cells, rows=NUM_PE, cols=1))

    # Byte offset and shard size to the right of each row.
    ch = gh / NUM_PE
    for pe in range(NUM_PE):
        cy = gy + pe * ch + ch / 2
        off = pe * chunk_bytes
        parts.append(_text(gx + gw + 10, cy - 4, f"off={_format_bytes(off)}",
                           size=9, anchor="start", fill="#475569"))
        parts.append(_text(gx + gw + 10, cy + 10, f"{_format_bytes(chunk_bytes)}",
                           size=9, anchor="start", fill="#64748b"))

    parts.append(_legend(580, 100))
    parts.append(_info_box(580, 320, [
        "Strategy: row_wise",
        "Split axis: M",
        f"Shards: {NUM_PE}",
        f"Each: ({chunk_m}, {K})",
        f"Each: {_format_bytes(chunk_bytes)}",
        f"Total: {_format_bytes(M * K * ITEMSIZE)}",
    ]))
    parts.append(_svg_footer())
    return "".join(parts)
def gen_replicate() -> str:
    """Render the replicate diagram: every PE holds a full copy at offset 0."""
    W, H = 820, 500
    full_bytes = M * K * ITEMSIZE

    parts = [
        _svg_header(W, H, "Placement: replicate"),
        _text(W // 2, 54, f"Tensor ({M}×{K}) fp16 → full copy to each PE",
              size=12, fill="#475569"),
    ]

    # Lay the copies out as 2 rows × 4 columns of equally sized boxes.
    cols, rows = 4, 2
    margin_x, margin_y = 60, 90
    gap = 16
    bw = (700 - (cols - 1) * gap) / cols
    bh = (340 - (rows - 1) * gap) / rows
    for pe in range(NUM_PE):
        r, c = divmod(pe, cols)
        bx = margin_x + c * (bw + gap)
        by = margin_y + r * (bh + gap)
        mid_x = bx + bw / 2
        mid_y = by + bh / 2
        label_fill = PE_TEXT_COLORS[pe]
        parts.append(_rect(bx, by, bw, bh, PE_COLORS[pe], stroke="#334155", sw=1.5))
        parts.append(_text(mid_x, mid_y - 14, f"PE{pe}", size=14, fill=label_fill, weight="bold"))
        parts.append(_text(mid_x, mid_y + 6, f"({M}×{K})", size=11, fill=label_fill))
        parts.append(_text(mid_x, mid_y + 22, f"{_format_bytes(full_bytes)}", size=10, fill=label_fill))
        parts.append(_text(mid_x, mid_y + 36, "offset=0", size=9, fill=label_fill))

    parts.append(_info_box(60, 450, [
        f"Strategy: replicate | Shards: {NUM_PE} | Each: {_format_bytes(full_bytes)}"
        f" | Total mem: {_format_bytes(full_bytes * NUM_PE)}",
    ]))
    parts.append(_svg_footer())
    return "".join(parts)
def gen_tiled(column_major: bool) -> str:
    """Render a 2D tiled placement diagram.

    ``column_major=True`` produces ``tiled_column_major`` (tiles numbered
    with the K index varying fastest within each M row); ``False`` produces
    ``tiled_row_major`` (the M index varies fastest within each K column).
    Tiles are assigned to PEs round-robin in numbering order.
    """
    name = "tiled_column_major" if column_major else "tiled_row_major"
    order = "column-major (K first)" if column_major else "row-major (M first)"
    tiles_m = M // TILE_M  # 4
    tiles_k = K // TILE_K  # 4
    total_tiles = tiles_m * tiles_k  # 16
    tile_bytes = TILE_M * TILE_K * ITEMSIZE
    W, H = 820, 620
    gx, gy, gw, gh = 80, 90, 400, 400

    parts = [
        _svg_header(W, H, f"Placement: {name}"),
        _text(W // 2, 54,
              f"Tensor ({M}×{K}) fp16, tile=({TILE_M}×{TILE_K}) → "
              f"{tiles_m}×{tiles_k}={total_tiles} tiles, {order}",
              size=11, fill="#475569"),
        _axes(gx, gy, gw, gh),
    ]

    # Tile visiting order as (mi, ki) pairs; only the loop nesting differs
    # between the two variants.
    if column_major:
        visit_order = [(mi, ki) for mi in range(tiles_m) for ki in range(tiles_k)]
    else:
        visit_order = [(mi, ki) for ki in range(tiles_k) for mi in range(tiles_m)]

    # Tile → PE mapping; offset is the byte offset of the tile's first
    # element in the row-major tensor.
    row_bytes = K * ITEMSIZE
    cells = [
        {
            "row": mi, "col": ki, "rspan": 1, "cspan": 1,
            "pe": idx % NUM_PE,
            "label": f"t{idx}",
            "offset": (mi * TILE_M * row_bytes) + (ki * TILE_K * ITEMSIZE),
            "idx": idx,
        }
        for idx, (mi, ki) in enumerate(visit_order)
    ]
    parts.append(_draw_grid(gx, gy, gw, gh, cells, rows=tiles_m, cols=tiles_k))

    # K index ranges below each tile column.
    cw = gw / tiles_k
    for ki in range(tiles_k):
        cx = gx + ki * cw + cw / 2
        parts.append(_text(cx, gy + gh + 16, f"k={ki * TILE_K}..{(ki + 1) * TILE_K - 1}",
                           size=9, fill="#475569"))
    # M index ranges left of each tile row.
    ch = gh / tiles_m
    for mi in range(tiles_m):
        cy = gy + mi * ch + ch / 2
        parts.append(_text(gx - 16, cy, f"m={mi * TILE_M}..{(mi + 1) * TILE_M - 1}",
                           size=9, anchor="end", fill="#475569"))

    parts.append(_legend(540, 90))

    # Assignment table, one 16 px row per tile in numbering order; rows that
    # would run past the bottom margin are dropped.
    table_y = 310
    parts.append(_text(540, table_y, "Tile Assignment Order", size=12, weight="bold"))
    for i, cell in enumerate(sorted(cells, key=lambda c: c["idx"])):
        ty = table_y + 18 + i * 16
        if ty > H - 20:
            break
        pe = cell["pe"]
        parts.append(_rect(540, ty - 10, 12, 12, PE_COLORS[pe]))
        parts.append(_text(558, ty,
                           f"t{cell['idx']:>2d} → PE{pe} ({cell['row']},{cell['col']})"
                           f" off={_format_bytes(cell['offset'])}",
                           size=9, anchor="start", fill="#334155"))

    parts.append(_info_box(80, H - 60, [
        f"Strategy: {name} | Tile: ({TILE_M}×{TILE_K})={_format_bytes(tile_bytes)}"
        f" | Tiles: {total_tiles} | Total: {_format_bytes(M * K * ITEMSIZE)}",
    ]))
    parts.append(_svg_footer())
    return "".join(parts)
# ── Main ────────────────────────────────────────────────────────────
def main() -> None:
    """Generate every placement diagram and write it under ``OUT_DIR``."""
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    # Render everything first so a failing generator leaves no partial output.
    rendered = [
        ("placement_column_wise.svg", gen_column_wise()),
        ("placement_row_wise.svg", gen_row_wise()),
        ("placement_replicate.svg", gen_replicate()),
        ("placement_tiled_column_major.svg", gen_tiled(column_major=True)),
        ("placement_tiled_row_major.svg", gen_tiled(column_major=False)),
    ]
    for filename, svg_text in rendered:
        target = OUT_DIR / filename
        target.write_text(svg_text, encoding="utf-8")
        print(f" wrote {target}")
    print(f"\nGenerated {len(rendered)} placement diagrams.")
if __name__ == "__main__":
main()
View File
+64
View File
@@ -0,0 +1,64 @@
import argparse
import sys
from benches.loader import resolve_bench
from kernbench.cli.probe import cmd_probe
from kernbench.cli.report import format_report
from kernbench.common.types import SimEngine
from kernbench.runtime_api.bench_runner import run_bench
from kernbench.runtime_api.types import DeviceSelector, resolve_device
from kernbench.sim_engine.engine import GraphEngine
from kernbench.topology.builder import resolve_topology
def build_parser() -> argparse.ArgumentParser:
    """Build the ``kernbench`` CLI parser with its ``run`` and ``probe`` subcommands.

    Each subparser stores its handler in ``_handler`` so ``main`` can dispatch.
    """
    parser = argparse.ArgumentParser(prog="kernbench")
    subcommands = parser.add_subparsers(dest="cmd", required=True)

    # kernbench run --topology ... --bench ... [--device ...]
    run_parser = subcommands.add_parser("run", help="Run a benchmark")
    run_parser.add_argument("--topology", required=True)
    run_parser.add_argument("--bench", required=True)
    run_parser.add_argument(
        "--device", default=None, help="Target device: 'all' or 'sip:<N>' (default: all)"
    )
    run_parser.set_defaults(_handler=cmd_run)

    # kernbench probe --topology ... [--case ...]
    probe_parser = subcommands.add_parser(
        "probe", help="Probe latency and BW for predefined traffic patterns"
    )
    probe_parser.add_argument("--topology", required=True)
    probe_parser.add_argument("--case", default="all", help="Case name or 'all' (default: all)")
    probe_parser.set_defaults(_handler=cmd_probe)
    return parser
def engine_factory(topology: object, device: DeviceSelector) -> SimEngine:
    """Create a ``GraphEngine`` for ``topology``.

    If ``topology`` has a ``topology_obj`` attribute, that object is used;
    otherwise ``topology`` itself is. ``device`` is accepted but not used here.
    """
    graph = getattr(topology, "topology_obj", topology)
    engine = GraphEngine(graph)
    return engine
def cmd_run(args) -> int:
    """Handle ``kernbench run``: run the benchmark and print report plus summary.

    Returns 0 when ``result.completion.ok`` is truthy, otherwise 1.
    """
    print("> Running benchmark with:", args)
    topology = resolve_topology(args.topology)
    bench_fn = resolve_bench(args.bench)
    selector = resolve_device(args.device)
    result = run_bench(
        topology=topology,
        bench_fn=bench_fn,
        device=selector,
        engine_factory=engine_factory,
    )
    # The spec (if any) is passed to the report formatter.
    graph = getattr(topology, "topology_obj", topology)
    spec = getattr(graph, "spec", None)
    if result.traces:
        report = format_report(result.traces, title=args.bench, spec=spec)
        print(report)
    print(result.summary_text())
    if result.completion.ok:
        return 0
    return 1
def main(argv=None) -> int:
    """CLI entry point: parse ``argv`` and dispatch to the selected handler."""
    namespace = build_parser().parse_args(argv)
    exit_code = namespace._handler(namespace)
    return int(exit_code)
if __name__ == "__main__":
sys.exit(main())
+248
View File
@@ -0,0 +1,248 @@
"""kernbench probe: latency and BW verification utility.
Runs predefined traffic patterns through the simulation engine and reports
latency, effective bandwidth, bottleneck bandwidth, and utilization for each
case. Validates monotonicity invariants across hop counts and access types.
"""
from __future__ import annotations
from pathlib import Path
from kernbench.policy.address.phyaddr import PhysAddr
from kernbench.policy.routing.router import AddressResolver, PathRouter
from kernbench.runtime_api.kernel import MemoryWriteMsg, PeDmaMsg
from kernbench.sim_engine.engine import GraphEngine
from kernbench.topology.builder import load_topology
from kernbench.topology.types import TopologyGraph
# -- Helpers ----------------------------------------------------------
def _hbm_pa(sip: int, cube: int, pe_id: int, spec: dict) -> int:
    """Return the encoded physical address at offset 0x1000 in a PE's HBM slice.

    The slice size is the per-cube HBM capacity (GiB, from the spec's
    ``cube.memory_map``) divided evenly across the cube's slices.
    """
    memory_map = spec["cube"]["memory_map"]
    cube_bytes = memory_map["hbm_total_gb_per_cube"] * (1 << 30)
    slice_bytes = cube_bytes // memory_map["hbm_slices_per_cube"]
    address = PhysAddr.pe_hbm_addr(
        rack_id=0,
        sip_id=sip,
        cube_id=cube,
        pe_id=pe_id,
        pe_local_hbm_offset=0x1000,
        slice_size_bytes=slice_bytes,
    )
    return address.encode()
def _build_edge_map(graph: TopologyGraph) -> dict[tuple[str, str], object]:
return {(e.src, e.dst): e for e in graph.edges}
def _formula_breakdown(
path: list[str], nbytes: int, edge_map: dict, graph: TopologyGraph,
) -> tuple[float, float, float, float]:
"""Return (wire_ns, overhead_ns, drain_ns, formula_ns) for a path."""
ns_per_mm = graph.spec.get("system", {}).get("ns_per_mm", 0.01)
wire_ns = 0.0
for i in range(len(path) - 1):
e = edge_map.get((path[i], path[i + 1]))
if e:
wire_ns += e.distance_mm * ns_per_mm
overhead_ns = 0.0
for nid in path:
node = graph.nodes.get(nid)
if node:
overhead_ns += float(node.attrs.get("overhead_ns", 0.0))
bws = [e.bw_gbs for i in range(len(path) - 1)
if (e := edge_map.get((path[i], path[i + 1]))) and e.bw_gbs]
drain_ns = nbytes / min(bws) if bws else 0.0
return wire_ns, overhead_ns, drain_ns, wire_ns + overhead_ns + drain_ns
def _bottleneck_bw(path: list[str], edge_map: dict) -> float | None:
"""Per-request bottleneck: single request uses one connection."""
bws: list[float] = []
for i in range(len(path) - 1):
e = edge_map.get((path[i], path[i + 1]))
if e and e.bw_gbs:
bws.append(e.bw_gbs)
return min(bws) if bws else None
def _fmt_bw(bw: float | None) -> str:
return f"{bw:.1f}" if bw is not None else "-"
def _fmt_util(eff: float, bn: float | None) -> str:
if bn is None or bn <= 0:
return "-"
return f"{eff / bn * 100:.1f}%"
def _short_name(node_id: str) -> str:
"""Shorten node id: keep last 2 segments to avoid ambiguity (xbar.pe0 vs pe0)."""
parts = node_id.split(".")
return ".".join(parts[-2:]) if len(parts) >= 2 else node_id
def _short_path(path: list[str]) -> str:
    """Render a node path as shortened names joined by `` -> ``."""
    short_names = [_short_name(node_id) for node_id in path]
    return " -> ".join(short_names)
# -- Probe runner -----------------------------------------------------
def run_probe(topology_path: str, case_filter: str | None = None) -> int:
    """Run the predefined probe cases against a topology and print the results.

    Two groups of 4096-byte transfers are simulated, each case on a fresh
    ``GraphEngine``: H2D writes to cubes at increasing hop counts, and
    direct PE DMA accesses to HBM. For every case the measured latency is
    printed next to the overhead / drain / wire breakdown of its forward
    path, effective and bottleneck bandwidth, and the route taken.

    Args:
        topology_path: Path to the topology file (``~`` is expanded).
        case_filter: A single case name, or ``None`` / ``"all"`` for all cases.

    Returns:
        0 on completion; 2 when ``case_filter`` names no known case
        (previously this printed empty tables and returned 0). The
        monotonicity check is reported as PASS/FAIL in the output only and
        does not affect the return value.
    """
    import sys  # local import: only needed for the error path below

    # (name, cube, hops)
    h2d_cases = [
        ("h2d-1hop", 0, 1),
        ("h2d-2hop", 4, 2),
        ("h2d-3hop", 8, 3),
        ("h2d-4hop", 12, 4),
    ]
    # (name, sip, src_cube, src_pe, dst_cube, dst_pe)
    pe_cases = [
        ("pe-local-hbm", 0, 0, 0, 0, 0),  # pe0 → slice0 (local, 256 GB/s)
        ("pe-same-half-hbm", 0, 0, 0, 0, 1),  # pe0 → slice1 (xbar chain, 128 GB/s)
        ("pe-cross-half-hbm", 0, 0, 0, 0, 4),  # pe0 → slice4 (xbar chain, 128 GB/s)
        ("pe-cross-cube-hbm", 0, 0, 0, 1, 0),  # cube0.pe0 → cube1.slice0 (NOC, 128 GB/s)
    ]

    show_all = case_filter is None or case_filter == "all"
    if not show_all:
        # Reject typos up front instead of silently printing empty tables.
        known_cases = [c[0] for c in h2d_cases] + [c[0] for c in pe_cases]
        if case_filter not in known_cases:
            print(
                f"error: unknown probe case {case_filter!r}; choose from: all, "
                + ", ".join(known_cases),
                file=sys.stderr,
            )
            return 2

    path = Path(topology_path).expanduser().resolve()
    graph = load_topology(path)
    edge_map = _build_edge_map(graph)
    spec = graph.spec
    resolver = AddressResolver(graph)
    router = PathRouter(graph)
    nbytes = 4096

    # === H2D Write ===
    h2d_results: list[tuple[str, int, float, float, float | None]] = []
    h2d_paths: list[tuple[str, list[str], list[str], list[str]]] = []
    print()
    print("=== H2D Write Latency (IO->HBM, varying hop count) ===")
    print(f" {'Case':<14} {'Target':<16} {'Hops':>4} {'Actual':>8}"
          f" {'Ovhd':>6} {'Drain':>6} {'Wire':>5} {'Ovhd%':>6} {'Drain%':>7}"
          f" {'Eff.BW':>8} {'BN.BW':>8} {'Util%':>6}")
    print(" " + "-" * 115)
    for name, cube, hops in h2d_cases:
        if not show_all and case_filter != name:
            continue
        # Fresh engine per case so cases cannot contend with each other.
        engine = GraphEngine(graph)
        pa = _hbm_pa(sip=0, cube=cube, pe_id=0, spec=spec)
        msg = MemoryWriteMsg(
            correlation_id="probe", request_id=name,
            dst_sip=0, dst_cube=cube, dst_pe=0,
            dst_pa=pa, nbytes=nbytes, pattern="zero",
        )
        h = engine.submit(msg)
        engine.wait(h)
        _, trace = engine.get_completion(h)
        total_ns = trace["total_ns"]
        eff_bw = nbytes / total_ns if total_ns > 0 else 0.0
        pa_obj = PhysAddr.decode(pa)
        dst_node = resolver.resolve(pa_obj)
        pcie_ep = resolver.find_pcie_ep(0)
        io_cpu = resolver.find_io_cpu(0)
        m_cpu = resolver.find_m_cpu(0, cube)
        leg1 = router.find_node_path(pcie_ep, io_cpu)
        leg2 = router.find_node_path(io_cpu, m_cpu)
        leg3 = router.find_mcpu_dma_path(m_cpu, dst_node)
        # Join the legs, dropping the node shared by consecutive legs.
        # Forward path only (response path is implicit in actual_ns).
        full_path = leg1 + leg2[1:] + leg3[1:]
        bn_bw = _bottleneck_bw(full_path, edge_map)
        wire, ovhd, drain, _formula = _formula_breakdown(full_path, nbytes, edge_map, graph)
        ovhd_pct = ovhd / total_ns * 100 if total_ns > 0 else 0
        drain_pct = drain / total_ns * 100 if total_ns > 0 else 0
        h2d_results.append((name, hops, total_ns, eff_bw, bn_bw))
        h2d_paths.append((name, leg1, leg2, leg3))
        print(f" {name:<14} cube{cube}.pe0{'':<8} {hops:>4} {total_ns:>8.2f}"
              f" {ovhd:>6.1f} {drain:>6.1f} {wire:>5.2f} {ovhd_pct:>5.1f}% {drain_pct:>5.1f}%"
              f" {eff_bw:>8.2f} {_fmt_bw(bn_bw):>8} {_fmt_util(eff_bw, bn_bw):>6}")
    if len(h2d_results) >= 2:
        # Latency must strictly increase with hop count.
        lats = [r[2] for r in h2d_results]
        mono = all(lats[i] < lats[i + 1] for i in range(len(lats) - 1))
        sym = "[v]" if mono else "[x]"
        print(f" {sym} Monotonic increase: {'PASS' if mono else 'FAIL'}")
    if h2d_paths:
        print()
        print(" Route Details:")
        print(f" {'Case':<14} {'Leg':>4} Path")
        print(" " + "-" * 80)
        for name, leg1, leg2, leg3 in h2d_paths:
            print(f" {name:<14} {'L1':>4} {_short_path(leg1)}")
            print(f" {'':<14} {'L2':>4} {_short_path(leg2)}")
            print(f" {'':<14} {'L3':>4} {_short_path(leg3)}")

    # === PE DMA → HBM (direct PE-level injection) ===
    pe_results: list[tuple[str, float, float, float | None]] = []
    pe_paths: list[tuple[str, list[str]]] = []
    print()
    print("=== PE DMA Latency (pe_dma -> xbar -> HBM, direct injection) ===")
    print(f" {'Case':<22} {'Target':<28} {'Actual':>8}"
          f" {'Ovhd':>6} {'Drain':>6} {'Wire':>5} {'Ovhd%':>6} {'Drain%':>7}"
          f" {'Eff.BW':>8} {'BN.BW':>8} {'Util%':>6}")
    print(" " + "-" * 120)
    for name, sip, src_cube, src_pe, dst_cube, dst_pe in pe_cases:
        if not show_all and case_filter != name:
            continue
        engine = GraphEngine(graph)
        dst_pa = _hbm_pa(sip=sip, cube=dst_cube, pe_id=dst_pe, spec=spec)
        msg = PeDmaMsg(
            correlation_id="probe", request_id=name,
            src_sip=sip, src_cube=src_cube, src_pe=src_pe,
            dst_pa=dst_pa, nbytes=nbytes,
        )
        h = engine.submit(msg)
        engine.wait(h)
        _, trace = engine.get_completion(h)
        total_ns = trace["total_ns"]
        eff_bw = nbytes / total_ns if total_ns > 0 else 0.0
        pe_ref = f"sip{sip}.cube{src_cube}.pe{src_pe}"
        pa_obj = PhysAddr.decode(dst_pa)
        dst_node = resolver.resolve(pa_obj)
        dma_path = router.find_path(pe_ref, dst_node)
        bn_bw = _bottleneck_bw(dma_path, edge_map)
        wire, ovhd, drain, _formula = _formula_breakdown(dma_path, nbytes, edge_map, graph)
        ovhd_pct = ovhd / total_ns * 100 if total_ns > 0 else 0
        drain_pct = drain / total_ns * 100 if total_ns > 0 else 0
        target_str = f"c{src_cube}.pe{src_pe}->c{dst_cube}.slice{dst_pe}"
        pe_results.append((name, total_ns, eff_bw, bn_bw))
        pe_paths.append((name, dma_path))
        print(f" {name:<22} {target_str:<28} {total_ns:>8.2f}"
              f" {ovhd:>6.1f} {drain:>6.1f} {wire:>5.2f} {ovhd_pct:>5.1f}% {drain_pct:>5.1f}%"
              f" {eff_bw:>8.2f} {_fmt_bw(bn_bw):>8} {_fmt_util(eff_bw, bn_bw):>6}")
    if len(pe_results) >= 2:
        # Contrast the local case's bottleneck with the first non-local case.
        local = [r for r in pe_results if "local" in r[0]]
        chain = [r for r in pe_results if "local" not in r[0]]
        if local and chain:
            print(f" * Local BN: {_fmt_bw(local[0][3])} GB/s, "
                  f"Chain/NOC BN: {_fmt_bw(chain[0][3])} GB/s")
    if pe_paths:
        print()
        print(" Route Details:")
        print(f" {'Case':<22} Path")
        print(" " + "-" * 80)
        for name, dma_path in pe_paths:
            print(f" {name:<22} {_short_path(dma_path)}")
    print()
    return 0
def cmd_probe(args) -> int:
    """CLI handler for the probe sub-command.

    Reads ``args.topology`` and the optional ``args.case`` attribute
    (``"all"`` when the attribute is absent), forwards both to ``run_probe``
    and returns its result as the exit code.
    """
    topology = args.topology
    case_filter = getattr(args, "case", "all")
    return run_probe(topology, case_filter)
+175
View File
@@ -0,0 +1,175 @@
"""Performance report formatter for bench results."""
from __future__ import annotations
# Element width in bits for each dtype spelling (several aliases per dtype).
# NOTE(review): no function visible in this part of the file reads this
# table — confirm it is still used before extending it.
_DTYPE_BITS: dict[str, int] = {
    "f16": 16, "fp16": 16, "float16": 16, "bf16": 16,
    "f32": 32, "fp32": 32, "float32": 32,
    "i8": 8, "int8": 8, "i16": 16, "int16": 16, "i32": 32, "int32": 32,
}
def format_report(
    traces: list[dict],
    title: str = "Benchmark",
    spec: dict | None = None,
) -> str:
    """Render collected trace dicts as a plain-text performance report.

    Up to three tables are produced:
      * a command summary with one row per trace whose ``phase`` is not
        ``"kernel"``;
      * a kernel summary with one row per ``phase == "kernel"`` trace;
      * a per-PE summary, emitted only when the deploy traces cover more
        than one distinct ``pe`` value.

    Args:
        traces: trace dicts; every key is read with ``.get`` so missing keys
            fall back to defaults.
        title: text embedded in the title line.
        spec: topology spec dict used for peak TFLOPS / HBM bandwidth and
            the PE count.

    Returns:
        The report lines joined with newlines.
    """
    peak_tflops_f16, peak_hbm_bw_gbs = _extract_peaks(spec)
    num_pes = _count_pes(spec)

    deploy_entries = [t for t in traces if t.get("phase") != "kernel"]
    kernel_entries = [t for t in traces if t.get("phase") == "kernel"]

    # The command-table header doubles as the width of the title rule.
    cmd_hdr = (f"{'Cmd':<10} {'Name':<12} {'SIP':>4} {'Cube':>5} {'PE':>4} {'Bytes':>10} "
               f"{'Lat(ns)':>10} {'Xfer(ns)':>10} {'Proc(ns)':>10} "
               f"{'BW(GB/s)':>10} {'MinBW':>10} {'Util%':>7}")
    title_line = f"-- {title} Performance Report "
    lines: list[str] = [title_line + "-" * max(0, len(cmd_hdr) - len(title_line))]

    # ── Command summary ──
    if deploy_entries:
        lines.extend(["", cmd_hdr, "-" * len(cmd_hdr)])
        for entry in deploy_entries:
            lat = entry.get("total_ns", 0.0)
            nb = entry.get("nbytes", 0)
            sip = entry.get("sip", "-")
            pe = entry.get("pe", "-")
            cube = entry.get("cube", "-")
            cmd = entry.get("phase", "deploy")
            xfer_ns = entry.get("xfer_ns", 0.0)
            has_xfer = xfer_ns > 0
            # Processing time is whatever latency is left after the transfer.
            proc_ns = lat - xfer_ns if has_xfer else 0.0
            has_lat = lat > 0
            bw = nb / lat if has_lat else 0.0
            min_bw = nb / xfer_ns if has_xfer else 0.0
            util = (xfer_ns / lat * 100) if has_lat and has_xfer else 0.0
            lines.append(
                f"{cmd:<10} {entry.get('name', '?'):<12} {str(sip):>4} {str(cube):>5} {str(pe):>4} {nb:>10} "
                f"{lat:>10.1f} {xfer_ns:>10.1f} {proc_ns:>10.1f} "
                f"{bw:>10.1f} {min_bw:>10.1f} {util:>6.1f}%"
            )

    # ── Kernel summary ──
    if kernel_entries:
        k_hdr = (f"{'Phase':<10} {'Name':<12} {'PE':>4} {'E2E(ns)':>10} "
                 f"{'PE(ns)':>10} {'DMA(ns)':>10} {'Comp(ns)':>10} "
                 f"{'Bound':<8} {'TFLOPS':>8} {'Peak':>8} {'Util%':>7}")
        lines.extend(["", k_hdr, "-" * len(k_hdr)])
        for entry in kernel_entries:
            e2e_ns = entry.get("total_ns", 0.0)
            pe_ns = entry.get("pe_exec_ns", e2e_ns)
            dma_ns = entry.get("dma_ns", 0.0)
            compute_ns = entry.get("compute_ns", 0.0)
            target_pe = entry.get("target_pe", "-")
            scalars = entry.get("scalars", [])
            pe_str = "all" if target_pe == "all" else str(target_pe)
            n_active = num_pes if target_pe == "all" else 1
            # Bound indicator: whichever of measured DMA / compute time dominates.
            bound = "-"
            if dma_ns > 0 or compute_ns > 0:
                bound = "memory" if dma_ns >= compute_ns else "compute"
            achieved = _calc_tflops(scalars, pe_ns)
            peak_total = peak_tflops_f16 * n_active
            util = (achieved / peak_total * 100) if peak_total > 0 else 0.0
            lines.append(
                f"{'kernel':<10} {entry.get('name', '?'):<12} {pe_str:>4} {e2e_ns:>10.1f} "
                f"{pe_ns:>10.1f} {dma_ns:>10.1f} {compute_ns:>10.1f} "
                f"{bound:<8} {achieved:>8.3f} {peak_total:>8.1f} {util:>6.1f}%"
            )

    # ── Per-PE summary ──
    pe_deploy = _per_pe_deploy(deploy_entries)
    if len(pe_deploy) > 1:
        pe_title = (f"-- Per-PE Summary (peak: {peak_tflops_f16:.1f} TFLOPS/PE, "
                    f"{peak_hbm_bw_gbs:.0f} GB/s HBM BW) ")
        pe_hdr = (f"{'PE':>4} {'Deploy(ns)':>10} {'BW(GB/s)':>10} {'BW Util':>8} "
                  f"{'Kernel(ns)':>10} {'TFLOPS':>8} {'Util':>7}")
        pe_width = max(len(pe_title), len(pe_hdr))
        lines.extend([
            "",
            pe_title + "-" * max(0, pe_width - len(pe_title)),
            pe_hdr,
            "-" * pe_width,
        ])
        # Kernel time is summed over all kernel traces; the FLOP count comes
        # from the first kernel trace's scalars only.
        k_ns = sum(e.get("pe_exec_ns", e.get("total_ns", 0.0)) for e in kernel_entries)
        k_scalars = kernel_entries[0].get("scalars", []) if kernel_entries else []
        n_active = len(pe_deploy)
        total_achieved = _calc_tflops(k_scalars, k_ns)
        per_pe_tflops = total_achieved / n_active if n_active > 0 else 0.0
        pe_util = (per_pe_tflops / peak_tflops_f16 * 100) if peak_tflops_f16 > 0 else 0.0
        for pe_id in sorted(pe_deploy):
            d_ns, d_bytes = pe_deploy[pe_id]
            d_bw = d_bytes / d_ns if d_ns > 0 else 0.0
            d_util = (d_bw / peak_hbm_bw_gbs * 100) if peak_hbm_bw_gbs > 0 else 0.0
            lines.append(
                f"{pe_id:>4} {d_ns:>10.1f} {d_bw:>10.1f} {d_util:>7.1f}% "
                f"{k_ns:>10.1f} {per_pe_tflops:>8.3f} {pe_util:>6.1f}%"
            )

    # NOTE(review): source indentation was lost; this trailing blank line is
    # assumed to be unconditional (function level) — confirm.
    lines.append("")
    return "\n".join(lines)
def _extract_peaks(spec: dict | None) -> tuple[float, float]:
"""Extract peak TFLOPS (f16) and HBM BW (GB/s) from spec."""
if spec is None:
return 0.0, 0.0
cube = spec.get("cube", {})
pe_template = cube.get("pe_template", {})
comps = pe_template.get("components", {})
gemm_attrs = comps.get("pe_gemm", {}).get("attrs", {})
peak_tflops = float(gemm_attrs.get("peak_tflops_f16", 0.0))
cube_links = cube.get("links", {})
hbm_bw = float(cube_links.get("xbar_to_hbm_bw_gbs", 0.0))
return peak_tflops, hbm_bw
def _count_pes(spec: dict | None) -> int:
if spec is None:
return 8
cube = spec.get("cube", {})
layout = cube.get("pe_layout", {})
per_corner = layout.get("pe_per_corner", 2)
corners = len(layout.get("corners", ["NW", "NE", "SW", "SE"]))
return per_corner * corners
def _calc_tflops(scalars: list, latency_ns: float) -> float:
"""Calculate achieved TFLOPS from scalar args [M, K, N] and latency."""
if len(scalars) < 3 or latency_ns <= 0:
return 0.0
m, k, n = scalars[0], scalars[1], scalars[2]
flops = 2.0 * m * k * n
return flops / (latency_ns * 1e-9) / 1e12
def _per_pe_deploy(deploy_entries: list[dict]) -> dict[int, tuple[float, int]]:
"""Aggregate deploy latency and bytes per PE."""
result: dict[int, tuple[float, int]] = {}
for e in deploy_entries:
pe = e.get("pe", 0)
lat = e.get("total_ns", 0.0)
nb = e.get("nbytes", 0)
if pe in result:
old_ns, old_bytes = result[pe]
result[pe] = (old_ns + lat, old_bytes + nb)
else:
result[pe] = (lat, nb)
return result
View File
+150
View File
@@ -0,0 +1,150 @@
"""PE-internal command types and handles (ADR-0014).
Generated by triton_emu (TLContext) and consumed by PE component
implementations (PE_CPU, PE_SCHEDULER, PE_DMA, PE_GEMM, PE_MATH).
Command lifecycle:
Triton kernel → TLContext → [PeCommand list] → PE_CPU → PE_SCHEDULER → engines
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, Literal
if TYPE_CHECKING:
import simpy
# ── Handles ───────────────────────────────────────────────────────
@dataclass(frozen=True)
class TensorHandle:
    """Opaque reference to a tensor residing in PE_TCM.

    Returned by tl.load, tl.dot, tl.exp, etc. Carries the metadata needed
    for command generation. ``data`` is reserved for a future validate mode
    (numpy array) and is deliberately excluded from equality and hashing:
    an array payload is unhashable and has no single truth value under
    ``==``, which would otherwise break ``hash(handle)`` and
    ``handle == other`` on this frozen dataclass.
    """

    id: str
    pa: int  # physical address in HBM/TCM
    shape: tuple[int, ...]
    dtype: str
    nbytes: int  # total byte size
    # Reserved for validate mode; not part of the handle's identity.
    data: object = field(default=None, compare=False)
@dataclass(frozen=True)
class CompletionHandle:
    """Token identifying one non-blocking composite command.

    Produced by tl.composite and later passed to tl.wait.
    """

    id: str  # identifier of the composite this handle refers to
# ── PE Commands ───────────────────────────────────────────────────
@dataclass(frozen=True)
class DmaReadCmd:
    """DMA READ command: copy from HBM into PE_TCM."""

    handle: TensorHandle  # tensor handle associated with this transfer
    src_pa: int  # source physical address
    nbytes: int  # number of bytes to transfer
@dataclass(frozen=True)
class DmaWriteCmd:
    """DMA WRITE command: copy from PE_TCM out to HBM."""

    handle: TensorHandle  # tensor handle associated with this transfer
    dst_pa: int  # destination physical address
    nbytes: int  # number of bytes to transfer
@dataclass(frozen=True)
class GemmCmd:
    """Matrix-multiply command for the GEMM engine.

    Computes ``out = a @ b`` with every operand resident in TCM.
    """

    a: TensorHandle  # left operand
    b: TensorHandle  # right operand
    out: TensorHandle  # result tensor
    # GEMM dimensions.
    m: int
    k: int
    n: int
@dataclass(frozen=True)
class MathCmd:
    """Element-wise / reduction command for the MATH engine (TCM data).

    Documented ``op`` values:
        unary:     "exp", "log", "sqrt", "abs", "sigmoid", "cos", "sin"
        binary:    "add", "sub", "mul", "div", "where"
        reduction: "sum", "max", "min"
    """

    op: str
    inputs: tuple[TensorHandle, ...]  # operand handles
    out: TensorHandle  # result tensor
    axis: int | None = None  # for reductions
@dataclass(frozen=True)
class CompositeCmd:
    """Tiled DMA_READ + COMPUTE + DMA_WRITE pipeline issued as one command.

    Non-blocking: it is handed to PE_SCHEDULER, which manages tile
    splitting and pipeline overlaps (ADR-0014 D3.2).
    """

    completion: CompletionHandle  # handle used to wait for this composite
    op: Literal["gemm", "math"]
    a: TensorHandle
    b: TensorHandle | None
    out_pa: int  # physical address of the output
    out_nbytes: int  # output size in bytes
    math_op: str | None = None  # for op="math": which math operation
@dataclass(frozen=True)
class WaitCmd:
    """Block until one composite, or every pending composite, has finished."""

    handle: CompletionHandle | None = None  # None = wait all
@dataclass(frozen=True)
class PeCpuOverheadCmd:
    """Scalar execution overhead charged to PE_CPU, expressed in cycles."""

    cycles: int
# Union type for all PE commands: any one of the command dataclasses defined
# above. PeInternalTxn.command is annotated with this alias.
PeCommand = (
    DmaReadCmd | DmaWriteCmd | GemmCmd | MathCmd
    | CompositeCmd | WaitCmd | PeCpuOverheadCmd
)
@dataclass
class PeInternalTxn:
    """Message passed along PE_CPU → PE_SCHEDULER → engines inside one PE.

    Wraps exactly one PeCommand together with its completion event. During
    the replay phase PE_CPU builds one PeInternalTxn per command and hands
    it to PE_SCHEDULER, which routes it to the matching engine (PE_DMA,
    PE_GEMM, PE_MATH). The engine signals ``done`` when the command has
    completed.
    """

    command: PeCommand
    # Succeeded by the engine once this command has completed.
    done: simpy.Event
    # e.g. "sip0.cube0.pe0" — needed by PE_DMA for path resolution.
    pe_prefix: str = ""
    # Per-transaction result dict; each instance gets its own.
    result_data: dict[str, Any] = field(default_factory=dict)
+29
View File
@@ -0,0 +1,29 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Any, NewType, Protocol, TypeAlias
# Distinct static type for engine request handles (a plain str at runtime).
RequestHandle = NewType("RequestHandle", str)
# Trace is an alias of Any, so trace payloads are not type-checked.
Trace: TypeAlias = Any
@dataclass(frozen=True)
class Completion:
    """Immutable outcome record for a request.

    ``ok`` is the success flag; the two optional error fields default to
    ``None``.
    """

    ok: bool
    error_code: str | None = None
    error_message: str | None = None
class SimEngine(Protocol):
    """Structural contract for a backend simulation/runner engine.

    An engine must:
      - accept requests created by RuntimeContext (``submit``),
      - let the caller block on a handle (``wait``),
      - report completion, plus an optional trace, for a handle
        (``get_completion``).
    """

    def submit(self, request: Any) -> RequestHandle:
        """Accept ``request`` and return the handle that identifies it."""
        ...

    def wait(self, handle: RequestHandle) -> None:
        """Wait on the request identified by ``handle``."""
        ...

    def get_completion(self, handle: RequestHandle) -> tuple[Completion, Trace | None]:
        """Return the completion record and optional trace for ``handle``."""
        ...
+4
View File
@@ -0,0 +1,4 @@
from kernbench.components.base import ComponentBase, ComponentRegistry
from kernbench.components.context import ComponentContext
# Public names re-exported by the components package.
__all__ = ["ComponentBase", "ComponentRegistry", "ComponentContext"]
+167
View File
@@ -0,0 +1,167 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from collections.abc import Generator
from typing import TYPE_CHECKING, Any
import simpy
if TYPE_CHECKING:
from kernbench.components.context import ComponentContext
from kernbench.topology.types import Node
class ComponentBase(ABC):
    """Abstract base for every SimPy component (ADR-0007 D3, ADR-0015).

    One component instance stands for one node of the compiled topology
    graph and models that node's processing overhead as a SimPy generator,
    leaving room for queueing and contention in concrete implementations.

    Port model (ADR-0015 D1):
        in_ports[src_node_id]  — SimPy Store for incoming messages from src
        out_ports[dst_node_id] — SimPy Store for outgoing messages to dst
    GraphEngine wires the ports at initialization; wire processes model
    propagation delay between connected ports (ADR-0015 D2).

    Context (ADR-0015 D4):
        ctx — ComponentContext with router and resolver.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        self.node = node
        self.ctx = ctx
        self.in_ports: dict[str, simpy.Store] = {}
        self.out_ports: dict[str, simpy.Store] = {}

    def start(self, env: simpy.Environment) -> None:
        """Spawn this component's processes; called once after port wiring.

        With at least one in-port, a shared inbox is created, one fan-in
        relay is started per in-port and a single worker is started. With
        no in-ports nothing is started (and no inbox is created).

        Components that need custom fan-out / aggregation logic (e.g.
        MCpuComponent, IoCpuComponent for kernel launch) override the
        worker or this method.
        """
        if self.in_ports:
            self._inbox: simpy.Store = simpy.Store(env)
            for in_port in self.in_ports.values():
                env.process(self._fan_in(in_port))
            env.process(self._worker(env))

    def _fan_in(self, port: simpy.Store) -> Generator:
        """Forever move messages from ``port`` into the shared inbox."""
        while True:
            incoming = yield port.get()
            yield self._inbox.put(incoming)

    def _worker(self, env: simpy.Environment) -> Generator:
        """Take messages off the inbox and start one ``_forward_txn`` process
        per message, so messages are handled as a pipeline."""
        while True:
            txn: Any = yield self._inbox.get()
            env.process(self._forward_txn(env, txn))

    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """Charge ``run()`` latency, then pass ``txn`` on or finish it.

        ``txn`` is duck-typed (no Transaction import, to avoid circular
        dependencies): it must offer ``nbytes``, ``next_hop``, ``advance()``
        and ``done``; ``drain_ns`` is optional.
        """
        yield from self.run(env, txn.nbytes)
        hop = txn.next_hop
        if hop:
            yield self.out_ports[hop].put(txn.advance())
            return
        # Terminal hop: apply the optional drain delay, then signal completion.
        drain_ns = getattr(txn, "drain_ns", 0.0)
        if drain_ns > 0:
            yield env.timeout(drain_ns)
        txn.done.succeed()

    @abstractmethod
    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """SimPy generator yielding the event(s) for this node's processing.

        Subclasses yield ``env.timeout(overhead_ns)`` or compute the latency
        dynamically. Invoked by ``_forward_txn`` and by subclass-specific
        handlers.
        """
        ...
class PeEngineBase(ComponentBase):
    """Shared base for the PE-internal engines (PE_DMA, PE_GEMM, PE_MATH).

    What it adds on top of ComponentBase:
      - ``_pe_prefix``: ``node.id`` with its last dotted segment removed
        (e.g. "sip0.cube0.pe0").
      - a dual-message ``_worker`` that sends PeInternalTxn messages to
        ``handle_command()`` and everything else to the inherited
        ``_forward_txn()``.
      - ``init_resources(env)``: a hook run by ``start()`` before the
        worker is spawned.

    Subclass contract:
      1. Override ``handle_command(env, pe_txn)`` — process a PeInternalTxn.
      2. Override ``run(env, nbytes)`` — yield component latency.
      3. Optionally override ``init_resources(env)`` for DMA channels, etc.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        prefix, *_ = node.id.rsplit(".", 1)
        self._pe_prefix: str = prefix

    def start(self, env: simpy.Environment) -> None:
        """Initialize engine resources, then start the base processes."""
        self.init_resources(env)
        super().start(env)

    def init_resources(self, env: simpy.Environment) -> None:
        """Hook for subclass resource setup; runs before the worker spawns."""

    def _worker(self, env: simpy.Environment) -> Generator:
        """Route each inbox message to the handler matching its type."""
        from kernbench.common.pe_commands import PeInternalTxn

        while True:
            msg: Any = yield self._inbox.get()
            handler = self.handle_command if isinstance(msg, PeInternalTxn) else self._forward_txn
            env.process(handler(env, msg))

    @abstractmethod
    def handle_command(self, env: simpy.Environment, pe_txn: Any) -> Generator:
        """Process one PE-internal command (PeInternalTxn).

        Implementations must do the engine-specific work (acquire
        resources, compute, etc.) and call ``pe_txn.done.succeed()`` when
        finished.
        """
        ...
class ComponentRegistry:
    """Dependency-injection registry from ``node.impl`` strings to
    ComponentBase subclasses.

    ``create(node, overrides, ctx)`` looks the impl up in this order:
      1. ``overrides[node.impl]`` — caller-injected override
      2. ``_registry[node.impl]`` — globally registered impl
      3. otherwise ValueError — there is no fallback component
    """

    # Process-wide table shared by every user of the class.
    _registry: dict[str, type[ComponentBase]] = {}

    @classmethod
    def register(cls, impl: str, component_cls: type[ComponentBase]) -> None:
        """Bind ``impl`` to ``component_cls``, replacing any earlier binding."""
        cls._registry[impl] = component_cls

    @classmethod
    def create(
        cls,
        node: Node,
        overrides: dict[str, type[ComponentBase]] | None = None,
        ctx: ComponentContext | None = None,
    ) -> ComponentBase:
        """Instantiate the component class registered for ``node.impl``.

        Raises:
            ValueError: neither ``overrides`` nor the registry knows the impl.
        """
        # Overrides are consulted first, then the global table.
        for table in (overrides, cls._registry):
            if table and node.impl in table:
                return table[node.impl](node, ctx)
        raise ValueError(
            f"No component registered for impl '{node.impl}' (node: {node.id}). "
            f"Register it in kernbench.components.impls.__init__."
        )
+52
View File
@@ -0,0 +1,52 @@
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any
import simpy
from kernbench.policy.routing.router import AddressResolver, PathRouter
@dataclass
class ComponentContext:
    """Topology services handed to every component implementation.

    Needed by components that route or resolve addresses (IoCpuComponent,
    MCpuComponent, …); TransitComponent ignores it. Supplied through
    ``ComponentRegistry.create(node, overrides, ctx=ctx)``.
    """

    router: PathRouter
    resolver: AddressResolver
    positions: dict[str, tuple[float, float] | None]  # node_id → pos_mm
    ns_per_mm: float  # wire propagation constant (from topology spec)
    edge_map: dict[tuple[str, str], Any] = field(default_factory=dict)
    spec: dict = field(default_factory=dict)  # topology spec (cube layout, PE count, etc.)

    def get_shared_resource(
        self, env: simpy.Environment, key: str, capacity: int = 1,
    ) -> simpy.Resource:
        """Return the SimPy Resource stored under ``key``, creating it on
        first use.

        Lets engines of the same PE share one resource (e.g. accel_slot
        shared by PE_GEMM and PE_MATH). Scope the key per PE, e.g.
        "sip0.cube0.pe0.accel_slot". ``env`` and ``capacity`` are only used
        when the resource is created; later calls with the same key return
        the existing object unchanged.
        """
        # The cache attribute is created lazily on the first call.
        try:
            pool = self._shared_resources
        except AttributeError:
            pool = self._shared_resources = {}
        if key not in pool:
            pool[key] = simpy.Resource(env, capacity=capacity)
        return pool[key]

    def compute_drain_ns(self, path: list[str], nbytes: int) -> float:
        """Wormhole drain time: ``nbytes`` divided by the bottleneck bandwidth.

        Only hops with an ``edge_map`` entry carrying a truthy ``bw_gbs``
        are considered; when no hop qualifies the result is ``0.0``.
        """
        unbounded = float("inf")
        bottleneck = unbounded
        for hop in zip(path, path[1:]):
            edge = self.edge_map.get(hop)
            if edge and getattr(edge, "bw_gbs", None):
                bottleneck = min(bottleneck, edge.bw_gbs)
        if bottleneck == unbounded:
            return 0.0
        return nbytes / bottleneck
@@ -0,0 +1,54 @@
"""Concrete component implementations.
This package's __init__ registers every built-in implementation with
ComponentRegistry (see the register() calls in this module).
Import this package to activate all built-in implementations.
"""
from kernbench.components.base import ComponentRegistry
from kernbench.components.impls.forwarding import TransitComponent
from kernbench.components.impls.hbm_ctrl import HbmCtrlComponent
from kernbench.components.impls.io_cpu import IoCpuComponent
from kernbench.components.impls.m_cpu import MCpuComponent
from kernbench.components.impls.noc import TwoDMeshNocComponent
from kernbench.components.impls.pcie_ep import PcieEpComponent
from kernbench.components.impls.pe_cpu import PeCpuComponent
from kernbench.components.impls.pe_dma import PeDmaComponent
from kernbench.components.impls.pe_gemm import PeGemmComponent
from kernbench.components.impls.pe_math import PeMathComponent
from kernbench.components.impls.pe_scheduler import PeSchedulerComponent
from kernbench.components.impls.pe_tcm import PeTcmComponent
from kernbench.components.impls.sram import SramComponent
# impl string (matched against node.impl) → component class, registered in
# this order. Several transit-style impl strings share TransitComponent.
for _impl, _component_cls in (
    ("forwarding_v1", TransitComponent),
    ("switch_v1", TransitComponent),
    ("noc_v1", TransitComponent),
    ("noc_2d_mesh_v1", TwoDMeshNocComponent),
    ("ucie_v1", TransitComponent),
    ("xbar_v1", TransitComponent),
    ("pcie_ep_v1", PcieEpComponent),
    ("io_cpu_v1", IoCpuComponent),
    ("m_cpu_v1", MCpuComponent),
    ("hbm_ctrl_v1", HbmCtrlComponent),
    ("sram_v1", SramComponent),
    ("pe_cpu_v1", PeCpuComponent),
    ("pe_scheduler_v1", PeSchedulerComponent),
    ("pe_dma_v1", PeDmaComponent),
    ("pe_gemm_v1", PeGemmComponent),
    ("pe_math_v1", PeMathComponent),
    ("pe_tcm_v1", PeTcmComponent),
):
    ComponentRegistry.register(_impl, _component_cls)
# Keep the loop temporaries out of the package namespace.
del _impl, _component_cls
# Component classes re-exported by this package. TwoDMeshNocComponent and
# the PE/CPU/memory components are listed; ComponentRegistry is not.
__all__ = [
    "HbmCtrlComponent",
    "IoCpuComponent",
    "MCpuComponent",
    "PcieEpComponent",
    "PeCpuComponent",
    "PeDmaComponent",
    "PeGemmComponent",
    "PeMathComponent",
    "PeSchedulerComponent",
    "PeTcmComponent",
    "TransitComponent",
    "TwoDMeshNocComponent",
    "SramComponent",
]
@@ -0,0 +1,27 @@
from __future__ import annotations
from collections.abc import Generator
from typing import TYPE_CHECKING
import simpy
from kernbench.components.base import ComponentBase
if TYPE_CHECKING:
from kernbench.components.context import ComponentContext
from kernbench.topology.types import Node
class TransitComponent(ComponentBase):
    """Pass-through component used for NOC, UCIe and XBAR nodes.

    Charges the node's ``overhead_ns`` attribute as a processing delay; the
    inherited ``_forward_txn()`` then moves the Transaction to its next hop.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Yield one timeout of ``node.attrs["overhead_ns"]`` (default 0.0);
        ``nbytes`` does not affect the delay."""
        delay_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(delay_ns)
+101
View File
@@ -0,0 +1,101 @@
from __future__ import annotations
from collections.abc import Generator
from typing import TYPE_CHECKING, Any
import simpy
from kernbench.components.base import ComponentBase
from kernbench.sim_engine.transaction import Transaction
if TYPE_CHECKING:
from kernbench.components.context import ComponentContext
from kernbench.topology.types import Node
class HbmCtrlComponent(ComponentBase):
    """Terminal HBM controller that models HBM access latency.

    Two SimPy resources are kept, one for reads and one for writes, so a
    read and a write can be in flight together while requests of the same
    kind queue on their channel. Both channels take their capacity from the
    node attribute ``capacity`` (default 1).

    When a request finishes, a ResponseMsg is sent back along the reversed
    path so response latency is modeled through the fabric; PeDmaMsg probes
    are completed directly instead.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        # Channels are created in start(), once an environment exists.
        self._read: simpy.Resource | None = None
        self._write: simpy.Resource | None = None

    def start(self, env: simpy.Environment) -> None:
        """Create the read/write channels, then start the base processes."""
        slots = int(self.node.attrs.get("capacity", 1))
        self._read = simpy.Resource(env, capacity=slots)
        self._write = simpy.Resource(env, capacity=slots)
        super().start(env)

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Yield one timeout of ``node.attrs["overhead_ns"]`` (default 0.0)."""
        yield env.timeout(float(self.node.attrs.get("overhead_ns", 0.0)))

    def _select_channel(self, txn: Any) -> simpy.Resource:
        """Pick the write channel for MemoryWriteMsg and for PeDmaMsg with
        ``is_write`` set; every other request uses the read channel."""
        from kernbench.runtime_api.kernel import MemoryWriteMsg, PeDmaMsg

        assert self._read is not None and self._write is not None
        req = txn.request
        is_write = isinstance(req, MemoryWriteMsg) or (
            isinstance(req, PeDmaMsg) and req.is_write
        )
        return self._write if is_write else self._read

    def _worker(self, env: simpy.Environment) -> Generator:
        """Start one ``_handle_txn`` process per inbox message so requests
        on different channels can proceed concurrently."""
        while True:
            txn: Any = yield self._inbox.get()
            env.process(self._handle_txn(env, txn))

    def _handle_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """Acquire the channel, charge overhead and drain, then respond."""
        # NOTE(review): source indentation was lost. The drain delay is
        # assumed to run while the channel is held and the response to be
        # sent after release — confirm against the original file.
        with self._select_channel(txn).request() as grant:
            yield grant
            yield from self.run(env, txn.nbytes)
            drain_ns = getattr(txn, "drain_ns", 0.0)
            if drain_ns > 0:
                yield env.timeout(drain_ns)
        yield from self._send_response(env, txn)

    def _send_response(self, env: simpy.Environment, txn: Any) -> Generator:
        """Complete ``txn``: directly, or via a ResponseMsg on the reverse path.

        PeDmaMsg is a direct probe with no IO_CPU/M_CPU aggregation in the
        path, so ``txn.done`` is succeeded right away. The same happens
        when the path has fewer than two nodes or no context is set.
        Otherwise a response Transaction is put on the port toward the
        previous hop.
        """
        from kernbench.runtime_api.kernel import PeDmaMsg

        if isinstance(txn.request, PeDmaMsg):
            txn.done.succeed()
            return

        reverse_path = list(reversed(txn.path))
        if len(reverse_path) < 2 or not self.ctx:
            txn.done.succeed()
            return

        from kernbench.runtime_api.kernel import ResponseMsg

        # Cube id comes from id segment 1 ("cube<N>") and the PE/slice id
        # from segment 3 ("slice<N>").
        id_parts = self.node.id.split(".")
        cube_id = int(id_parts[1].replace("cube", ""))
        pe_id = int(id_parts[3].replace("slice", ""))
        response = ResponseMsg(
            correlation_id=txn.request.correlation_id,
            request_id=txn.request.request_id,
            src_cube=cube_id, src_pe=pe_id, success=True,
        )
        resp_txn = Transaction(
            request=response, path=reverse_path, step=0,
            nbytes=0, done=env.event(), is_response=True,
        )
        yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
+145
View File
@@ -0,0 +1,145 @@
from __future__ import annotations
from collections.abc import Generator
from typing import TYPE_CHECKING, Any
import simpy
from kernbench.components.base import ComponentBase
from kernbench.sim_engine.transaction import Transaction
if TYPE_CHECKING:
from kernbench.components.context import ComponentContext
from kernbench.topology.types import Node
class IoCpuComponent(ComponentBase):
    """IO_CPU component: multi-cube fan-out with response aggregation.

    Forward path:
        1. Applies ``overhead_ns`` processing overhead.
        2. Resolves the target cube(s) for the request.
        3. Fans out one sub-Transaction to each routable target cube's M_CPU.

    Response path:
        Each response Transaction increments a per-request counter. Once the
        counter reaches the number of sub-Transactions that were actually
        dispatched, the parent ``txn.done`` is succeeded.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        # Pending fan-out tracking: request_id → (expected, received, parent_txn_done)
        self._pending: dict[str, tuple[int, int, simpy.Event]] = {}

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Yield one timeout of ``node.attrs["overhead_ns"]`` (default 0.0)."""
        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(overhead_ns)

    def _worker(self, env: simpy.Environment) -> Generator:
        """Count responses inline; for forward requests charge the overhead
        in this loop and then start the fan-out as its own process."""
        while True:
            txn: Any = yield self._inbox.get()
            if getattr(txn, "is_response", False):
                self._collect_response(txn)
            else:
                yield from self.run(env, txn.nbytes)
                env.process(self._dispatch_to_m_cpus(env, txn))

    def _collect_response(self, resp_txn: Any) -> None:
        """Count one cube response; complete the parent when all arrived.

        Responses whose request_id has no pending entry are ignored.
        """
        key = resp_txn.request.request_id
        if key not in self._pending:
            return
        expected, received, parent_done = self._pending[key]
        received += 1
        if received >= expected:
            parent_done.succeed()
            del self._pending[key]
        else:
            self._pending[key] = (expected, received, parent_done)

    def _dispatch_to_m_cpus(self, env: simpy.Environment, txn: Any) -> Generator:
        """Fan out sub-Transactions to the target cubes' M_CPUs.

        Routes are resolved before anything is sent so the aggregation
        entry counts only the sub-Transactions that will really be
        dispatched. Counting unroutable targets would leave the parent
        ``txn.done`` waiting for responses that can never arrive. If
        nothing can be dispatched the parent is completed immediately.
        """
        request = txn.request

        # Best effort: a request whose targets cannot be resolved is
        # completed rather than failing the simulation.
        try:
            cube_targets = self._resolve_cube_targets(request)
        except Exception:
            txn.done.succeed()
            return

        routes = self._route_to_m_cpus(cube_targets)
        if not routes:
            txn.done.succeed()
            return

        # Register the aggregation entry before the first put, so a response
        # arriving while later puts are still pending is counted.
        self._pending[request.request_id] = (len(routes), 0, txn.done)

        for path in routes:
            sub_txn = Transaction(
                request=request, path=path, step=0,
                nbytes=txn.nbytes, done=env.event(),
                result_data=txn.result_data,
            )
            yield self.out_ports[path[1]].put(sub_txn.advance())

    def _route_to_m_cpus(self, cube_targets: list[tuple[int, int]]) -> list[list[str]]:
        """Return a node path to each target cube's M_CPU, skipping targets
        that cannot be resolved or routed, or whose path has under two nodes."""
        routes: list[list[str]] = []
        for sip, cube in cube_targets:
            try:
                m_cpu_id = self.ctx.resolver.find_m_cpu(sip, cube)
                path = self.ctx.router.find_node_path(self.node.id, m_cpu_id)
            except Exception:
                continue
            if len(path) >= 2:
                routes.append(path)
        return routes

    def _resolve_cube_targets(self, request: Any) -> list[tuple[int, int]]:
        """Return the list of (sip, cube) pairs to fan out to.

        Memory read/write: the explicit ``target_cubes`` list if given,
        otherwise the single cube decoded from the physical address.
        Kernel launch: the explicit list if given, otherwise the distinct
        cubes of the tensor-argument shards that live on this IO_CPU's SIP.
        Any other request type yields an empty list.
        """
        from kernbench.runtime_api.kernel import KernelLaunchMsg, MemoryReadMsg, MemoryWriteMsg

        target_cubes = getattr(request, "target_cubes", "all")

        if isinstance(request, MemoryWriteMsg):
            sip = request.dst_sip
            if target_cubes == "all":
                cube = self._cube_from_pa(request.dst_pa, fallback=request.dst_cube)
                return [(sip, cube)]
            return [(sip, c) for c in target_cubes]

        if isinstance(request, MemoryReadMsg):
            sip = request.src_sip
            if target_cubes == "all":
                cube = self._cube_from_pa(request.src_pa, fallback=request.src_cube)
                return [(sip, cube)]
            return [(sip, c) for c in target_cubes]

        if isinstance(request, KernelLaunchMsg):
            my_sip = self._my_sip()
            if target_cubes != "all":
                return [(my_sip, c) for c in target_cubes]
            # "all": derive from tensor shards, filtered to this SIP.
            # dict.fromkeys drops duplicates while keeping first-seen order.
            shard_cubes = (
                (shard.sip, shard.cube)
                for arg in request.args
                if arg.arg_kind == "tensor"
                for shard in arg.shards
                if shard.sip == my_sip
            )
            return list(dict.fromkeys(shard_cubes))

        return []

    def _cube_from_pa(self, pa_val: int, fallback: int) -> int:
        """Decode the cube id from a physical address; return ``fallback``
        when decoding fails."""
        from kernbench.policy.address.phyaddr import PhysAddr

        try:
            return PhysAddr.decode(pa_val).cube_id
        except Exception:
            return fallback

    def _my_sip(self) -> int:
        """Extract this IO_CPU's SIP ID from its node ID (e.g. 'sip0.io0.io_cpu' → 0)."""
        return int(self.node.id.split(".")[0].replace("sip", ""))
+269
View File
@@ -0,0 +1,269 @@
from __future__ import annotations
from collections.abc import Generator
from typing import TYPE_CHECKING, Any
import simpy
from kernbench.components.base import ComponentBase
from kernbench.sim_engine.transaction import Transaction
if TYPE_CHECKING:
from kernbench.components.context import ComponentContext
from kernbench.topology.types import Node
class MCpuComponent(ComponentBase):
"""M_CPU component: multi-PE DMA fan-out with response aggregation.
Forward path (ADR-0015 D5):
When a forward Transaction arrives at m_cpu (terminal hop), M_CPU fans out
DMA sub-Transactions to target PEs' HBM slices. target_pe on the request
controls fan-out: int → single PE, "all" → all PEs in the cube.
Response path:
ResponseMsg from each hbm_ctrl arrives back at m_cpu. Once all PE responses
are collected, m_cpu sends an aggregate ResponseMsg on the reverse command
path back to io_cpu.
Transit:
When m_cpu is NOT the terminal hop (transit or response relay), the
Transaction is forwarded normally to the next hop.
"""
def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
    """Set up empty fan-out bookkeeping; the DMA engines stay unset until
    ``start()`` creates them."""
    super().__init__(node, ctx)
    # DMA engine resources (ADR-0015 D5, ADR-0014 D4): capacity=1 each
    self._dma_read: simpy.Resource | None = None
    self._dma_write: simpy.Resource | None = None
    # Parent txn kept while its fan-out is in flight: request_id → parent_txn
    self._parent_txns: dict[str, Any] = {}
    # Pending fan-out tracking: request_id → (expected, received, all_done_event)
    self._pending: dict[str, tuple[int, int, simpy.Event]] = {}
def start(self, env: simpy.Environment) -> None:
    """Create the write and read DMA engines (capacity 1 each), then start
    the base-class processes."""
    self._dma_write, self._dma_read = (
        simpy.Resource(env, capacity=1) for _ in range(2)
    )
    super().start(env)
def run(self, env: simpy.Environment, nbytes: int) -> Generator:
    """Yield one timeout of ``node.attrs["overhead_ns"]`` (default 0.0);
    ``nbytes`` does not affect the delay."""
    delay_ns = float(self.node.attrs.get("overhead_ns", 0.0))
    yield env.timeout(delay_ns)
def _worker(self, env: simpy.Environment) -> Generator:
    """Main loop: count responses, relay transit traffic, fan out at the
    terminal hop.

    Per inbox message:
      - response Transactions go to ``_collect_response``;
      - otherwise ``run()`` overhead is charged, then the Transaction is
        forwarded if it has a next hop, fanned out if this is its last hop
        and both context and request are present (kernel launch or DMA),
        or completed directly.
    """
    from kernbench.runtime_api.kernel import KernelLaunchMsg

    while True:
        txn: Any = yield self._inbox.get()
        if getattr(txn, "is_response", False):
            self._collect_response(txn)
            continue

        yield from self.run(env, txn.nbytes)
        hop = txn.next_hop
        if hop:
            yield self.out_ports[hop].put(txn.advance())
            continue

        if self.ctx is None or txn.request is None:
            txn.done.succeed()
            continue

        # Terminal hop: fan-out runs as its own process so the worker can
        # keep draining the inbox (including the responses it waits for).
        if isinstance(txn.request, KernelLaunchMsg):
            fanout = self._kernel_launch_fanout
        else:
            fanout = self._dma_fanout
        env.process(fanout(env, txn))
def _collect_response(self, resp_txn: Any) -> None:
"""Receive a PE response and increment the aggregation counter."""
key = resp_txn.request.request_id
if key not in self._pending:
return
expected, received, all_done = self._pending[key]
received += 1
if received >= expected:
all_done.succeed()
del self._pending[key]
else:
self._pending[key] = (expected, received, all_done)
def _dma_fanout(self, env: simpy.Environment, txn: Any) -> Generator:
    """Fan out DMA sub-Transactions to the target PE(s), wait for their
    responses, then send an aggregate response on the reverse command path.

    Each DMA transfer acquires the DMA resource (capacity=1 per ADR-0014
    D4), so multi-PE fan-out is serialized through the DMA engine.

    Routes are resolved before anything is sent, and only the
    sub-Transactions actually dispatched are awaited. Counting every
    destination, including ones skipped as unroutable, would block this
    process forever on a response that can never arrive. The aggregate
    response reports ``success=False`` when a destination had to be skipped.
    """
    from kernbench.runtime_api.kernel import MemoryWriteMsg, ResponseMsg

    request = txn.request
    target_pe = getattr(request, "target_pe", "all")
    # NOTE(review): _resolve_dma_destinations is defined outside the visible
    # part of this class; it is assumed to return a sequence of node ids.
    dst_nodes = self._resolve_dma_destinations(request, target_pe)
    if not dst_nodes:
        txn.done.succeed()
        return

    # Resolve (path, drain_ns) for every reachable destination up front.
    routes: list[tuple[list[str], float]] = []
    for dst_node in dst_nodes:
        try:
            dma_path = self.ctx.router.find_mcpu_dma_path(self.node.id, dst_node)
        except Exception:
            continue
        if len(dma_path) < 2:
            continue
        routes.append((dma_path, self.ctx.compute_drain_ns(dma_path, txn.nbytes)))

    max_drain_ns = max([0.0, *(drain_ns for _, drain_ns in routes)])

    if routes:
        # Setup aggregation before the first put so early responses count.
        all_done = env.event()
        self._pending[request.request_id] = (len(routes), 0, all_done)
        self._parent_txns[request.request_id] = txn

        # Select DMA resource based on operation type
        dma_res = self._dma_write if isinstance(request, MemoryWriteMsg) else self._dma_read

        # Fan out DMA sub-txns (serialized through DMA resource)
        for dma_path, drain_ns in routes:
            sub_txn = Transaction(
                request=request, path=dma_path, step=0,
                nbytes=txn.nbytes, done=env.event(),
                drain_ns=drain_ns,
            )
            with dma_res.request() as req:
                yield req
                yield self.out_ports[dma_path[1]].put(sub_txn.advance())

        # Wait for all PE responses
        yield all_done
        del self._parent_txns[request.request_id]

    txn.result_data["xfer_ns"] = max_drain_ns

    # Send aggregate response on reverse command path
    reverse_path = list(reversed(txn.path))
    if len(reverse_path) < 2:
        txn.done.succeed()
        return

    cube_id = int(self.node.id.split(".")[1].replace("cube", ""))
    resp_msg = ResponseMsg(
        correlation_id=request.correlation_id,
        request_id=request.request_id,
        src_cube=cube_id, src_pe=-1,
        # Success only when every resolved destination was reached.
        success=len(routes) == len(dst_nodes),
    )
    resp_txn = Transaction(
        request=resp_msg, path=reverse_path, step=0,
        nbytes=0, done=env.event(), is_response=True,
    )
    yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
def _kernel_launch_fanout(self, env: simpy.Environment, txn: Any) -> Generator:
    """Fan a kernel launch out to the target PE_CPU(s) and reply upstream.

    One sub-Transaction is sent per target PE along the route returned by
    ``router.find_node_path``. Targets whose lookup raises, or whose route
    has fewer than two nodes, are skipped. Completion is observed through
    each sub-Transaction's ``done`` event. Afterwards the per-PE
    ``pe_exec_ns`` / ``dma_ns`` / ``compute_ns`` values are reduced with
    ``max`` onto ``txn.result_data`` and an aggregate ``ResponseMsg`` is
    sent along the reversed command path.

    ``txn.done`` is triggered directly when there is nothing to dispatch or
    when the reversed path has fewer than two nodes.
    """
    launch = txn.request
    target_pe = getattr(launch, "target_pe", "all")
    cube_prefix = self.node.id.rsplit(".", 1)[0]  # e.g. "sip0.cube0"

    pe_ids = self._resolve_pe_ids(target_pe)
    if not pe_ids:
        txn.done.succeed()
        return

    # (sub-transaction, completion event) for every launch actually sent.
    in_flight: list[tuple[Transaction, simpy.Event]] = []
    for pe_id in pe_ids:
        pe_cpu_id = f"{cube_prefix}.pe{pe_id}.pe_cpu"
        try:
            route = self.ctx.router.find_node_path(self.node.id, pe_cpu_id)
        except Exception:
            continue
        if len(route) < 2:
            continue
        finished = env.event()
        child = Transaction(
            request=launch, path=route, step=0,
            nbytes=0, done=finished,
        )
        yield self.out_ports[route[1]].put(child.advance())
        in_flight.append((child, finished))

    if not in_flight:
        txn.done.succeed()
        return

    # Wait for every PE_CPU, in dispatch order.
    for _, finished in in_flight:
        yield finished

    # Reduce PE-internal metrics with max across PEs.
    for metric in ("pe_exec_ns", "dma_ns", "compute_ns"):
        txn.result_data[metric] = max(
            child.result_data.get(metric, 0.0) for child, _ in in_flight
        )

    # Aggregate response travels back along the reversed command path.
    reverse_path = list(reversed(txn.path))
    if len(reverse_path) < 2:
        txn.done.succeed()
        return

    from kernbench.runtime_api.kernel import ResponseMsg

    id_parts = self.node.id.split(".")
    cube_id = int(id_parts[1].replace("cube", ""))
    reply = Transaction(
        request=ResponseMsg(
            correlation_id=launch.correlation_id,
            request_id=launch.request_id,
            src_cube=cube_id, src_pe=-1, success=True,
        ),
        path=reverse_path, step=0,
        nbytes=0, done=env.event(), is_response=True,
    )
    yield self.out_ports[reverse_path[1]].put(reply.advance())
def _resolve_dma_destinations(self, request: Any, target_pe: int | str) -> list[str]:
"""Return list of HBM destination node_ids for DMA fan-out.
Uses PA-based resolution to determine the actual target cube and slice,
enabling cross-cube DMA routing when the PA points to a remote cube.
"""
cube_prefix = self.node.id.rsplit(".", 1)[0] # e.g. "sip0.cube0"
if isinstance(target_pe, int):
return [f"{cube_prefix}.hbm_ctrl.slice{target_pe}"]
# PA-based resolution: extract actual target from physical address
pa_val = getattr(request, "dst_pa", None) or getattr(request, "src_pa", None)
if pa_val is not None:
from kernbench.policy.address.phyaddr import PhysAddr
try:
pa = PhysAddr.decode(pa_val)
return [self.ctx.resolver.resolve(pa)]
except Exception:
pass
# "all" without PA (KernelLaunch): all slices in local cube
n_slices = 8
if self.ctx and self.ctx.spec:
mm = self.ctx.spec.get("cube", {}).get("memory_map", {})
n_slices = mm.get("hbm_slices_per_cube", 8)
return [f"{cube_prefix}.hbm_ctrl.slice{i}" for i in range(n_slices)]
def _resolve_pe_ids(self, target_pe: int | str) -> list[int]:
"""Return list of PE IDs to fan out to (used by kernel launch fan-out)."""
if isinstance(target_pe, int):
return [target_pe]
# "all": all PEs in local cube
n_slices = 8
if self.ctx and self.ctx.spec:
mm = self.ctx.spec.get("cube", {}).get("memory_map", {})
n_slices = mm.get("hbm_slices_per_cube", 8)
return list(range(n_slices))
+187
View File
@@ -0,0 +1,187 @@
from __future__ import annotations
from collections.abc import Generator
from typing import TYPE_CHECKING, Any
import simpy
from kernbench.components.base import ComponentBase
if TYPE_CHECKING:
from kernbench.components.context import ComponentContext
from kernbench.topology.types import Node
class TwoDMeshNocComponent(ComponentBase):
    """2D mesh NOC represented by a single routing node.

    Latency:
      - A traversal is split into axis-aligned segments between the
        positions of the previous and next hop (X first, then Y); each
        segment costs ``dist_mm * ctx.ns_per_mm``.
      - ``overhead_ns`` (from ``node.attrs``) is added once, on the first
        segment, or paid alone when no segments can be derived.

    Contention:
      - Every directed segment is a ``simpy.Resource(capacity=1)``, created
        lazily and shared by all transactions that use the same key.
      - The next segment is requested before the current segment's delay
        elapses, so an idle downstream segment is already held when the
        transaction reaches it.

    Concurrency:
      - ``_worker`` starts one SimPy process per transaction, so
        transactions only wait on segment resources, never on the node.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        self._env: simpy.Environment | None = None
        # Directed segment key -> capacity-1 resource (filled on demand).
        self._links: dict[tuple, simpy.Resource] = {}
        # Sorted, de-duplicated coordinates of this cube's positioned nodes.
        self._x_grid: list[float] = []
        self._y_grid: list[float] = []

    def start(self, env: simpy.Environment) -> None:
        """Remember the environment, build the grid, then start the base."""
        self._env = env
        self._build_grid()
        super().start(env)

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Zero-delay placeholder; routing latency is applied in ``_route``."""
        yield env.timeout(0)

    # ── Grid construction ────────────────────────────────────────────

    def _build_grid(self) -> None:
        """Collect rounded X/Y coordinates of nodes under this cube's prefix."""
        if not self.ctx:
            return
        prefix = self.node.id.rsplit(".", 1)[0] + "."
        placed = [
            pos
            for node_id, pos in self.ctx.positions.items()
            if node_id.startswith(prefix) and pos is not None
        ]
        self._x_grid = sorted({round(pos[0], 2) for pos in placed})
        self._y_grid = sorted({round(pos[1], 2) for pos in placed})

    def _get_link(self, key: tuple) -> simpy.Resource:
        """Return the resource for ``key``, creating it on first use."""
        link = self._links.get(key)
        if link is None:
            assert self._env is not None
            link = simpy.Resource(self._env, capacity=1)
            self._links[key] = link
        return link

    # ── Worker ───────────────────────────────────────────────────────

    def _worker(self, env: simpy.Environment) -> Generator:
        """Spawn an independent routing process for each inbox item."""
        while True:
            incoming: Any = yield self._inbox.get()
            env.process(self._route(env, incoming))

    def _route(self, env: simpy.Environment, txn: Any) -> Generator:
        """Apply traversal latency, then forward ``txn`` or complete it."""
        prev_hop = txn.path[txn.step - 1] if txn.step > 0 else None
        next_hop = txn.next_hop
        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))

        segments: list[tuple[tuple, float]] = []
        if prev_hop and next_hop and self.ctx:
            src_pos = self.ctx.positions.get(prev_hop)
            dst_pos = self.ctx.positions.get(next_hop)
            if src_pos and dst_pos:
                segments = self._xy_links(src_pos, dst_pos)

        if not segments:
            # No geometry available: only the fixed overhead is charged.
            yield env.timeout(overhead_ns)
        else:
            yield from self._traverse(env, segments, overhead_ns)

        if not next_hop:
            drain = getattr(txn, "drain_ns", 0.0)
            if drain > 0:
                yield env.timeout(drain)
            txn.done.succeed()
            return

        yield self.out_ports[next_hop].put(txn.advance())

    # ── XY routing and pipelined link traversal ──────────────────────

    def _traverse(
        self,
        env: simpy.Environment,
        links: list[tuple[tuple, float]],
        overhead_ns: float,
    ) -> Generator:
        """Walk ``links`` in order, requesting each next link ahead of time."""
        ns_per_mm = self.ctx.ns_per_mm  # type: ignore[union-attr]

        # The first link must be held before any time is spent.
        held_res = self._get_link(links[0][0])
        held_req = held_res.request()
        yield held_req

        last_idx = len(links) - 1
        for idx, (_, dist_mm) in enumerate(links):
            has_next = idx < last_idx
            if has_next:
                # Queue for the downstream link before this hop's delay.
                upcoming_res = self._get_link(links[idx + 1][0])
                upcoming_req = upcoming_res.request()

            yield env.timeout(dist_mm * ns_per_mm + (overhead_ns if idx == 0 else 0.0))
            held_res.release(held_req)

            if has_next:
                yield upcoming_req  # granted already when the link was idle
                held_res, held_req = upcoming_res, upcoming_req

    def _axis_links(
        self,
        tag: str,
        band: float,
        start: float,
        end: float,
        grid: list[float],
        forward: str,
        backward: str,
    ) -> list[tuple[tuple, float]]:
        """Build (key, length) pairs for one axis-aligned leg of a route."""
        leg: list[tuple[tuple, float]] = []
        for pa, pb in self._segments(start, end, grid):
            length = abs(pb - pa)
            if length > 1e-9:
                lo, hi = (pa, pb) if pa < pb else (pb, pa)
                heading = forward if pb > pa else backward
                key = (tag, round(band, 2), round(lo, 2), round(hi, 2), heading)
                leg.append((key, length))
        return leg

    def _xy_links(
        self,
        src: tuple[float, float],
        dst: tuple[float, float],
    ) -> list[tuple[tuple, float]]:
        """XY routing: the horizontal leg first, then the vertical leg.

        Returns ``(link_key, dist_mm)`` pairs. A key is
        ``(axis, band, lo, hi, direction)`` with coordinates rounded to two
        decimals, so transactions using the same directed segment map to the
        same resource.
        """
        x0, y0 = src
        x1, y1 = dst
        route: list[tuple[tuple, float]] = []

        # Horizontal leg travels along the source's (snapped) y band.
        if abs(x0 - x1) > 1e-9:
            y_band = self._snap(y0, self._y_grid)
            route += self._axis_links("H", y_band, x0, x1, self._x_grid, "E", "W")

        # Vertical leg travels along the destination's (snapped) x band.
        if abs(y0 - y1) > 1e-9:
            x_band = self._snap(x1, self._x_grid)
            route += self._axis_links("V", x_band, y0, y1, self._y_grid, "S", "N")

        return route

    @staticmethod
    def _snap(val: float, grid: list[float]) -> float:
        """Return the grid value closest to ``val`` (``val`` if grid is empty)."""
        return min(grid, key=lambda g: abs(g - val)) if grid else val

    @staticmethod
    def _segments(a: float, b: float, grid: list[float]) -> list[tuple[float, float]]:
        """Split the span from ``a`` to ``b`` at interior grid points.

        Pairs are ordered and oriented in the direction of travel
        (``a`` towards ``b``). An effectively empty span gives ``[]``.
        """
        if abs(a - b) < 1e-9:
            return []
        lo, hi = (a, b) if a < b else (b, a)
        interior = [g for g in grid if lo + 1e-9 < g < hi - 1e-9]
        waypoints = [lo, *interior, hi]
        pairs = list(zip(waypoints, waypoints[1:]))
        if a > b:
            pairs = [(far, near) for near, far in reversed(pairs)]
        return pairs
+27
View File
@@ -0,0 +1,27 @@
from __future__ import annotations
from collections.abc import Generator
from typing import TYPE_CHECKING
import simpy
from kernbench.components.base import ComponentBase
if TYPE_CHECKING:
from kernbench.components.context import ComponentContext
from kernbench.topology.types import Node
class PcieEpComponent(ComponentBase):
    """PCIe endpoint: fixed protocol-processing delay before forwarding.

    ``run`` charges ``overhead_ns`` (from ``node.attrs``); everything else,
    including forwarding, is inherited from ``ComponentBase``.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Yield one timeout of ``overhead_ns`` (default 0.0)."""
        yield env.timeout(float(self.node.attrs.get("overhead_ns", 0.0)))
+154
View File
@@ -0,0 +1,154 @@
from __future__ import annotations
from collections.abc import Generator
from typing import TYPE_CHECKING, Any
import simpy
from kernbench.components.base import ComponentBase
if TYPE_CHECKING:
from kernbench.components.context import ComponentContext
from kernbench.topology.types import Node
class PeCpuComponent(ComponentBase):
    """PE_CPU: kernel execution controller (Stage 2).

    Two-phase kernel execution (ADR-0014 D1):
      Phase 1 (compile): look up the kernel in the registry and run it with
                         a TLContext to produce a command list.
      Phase 2 (replay):  iterate the commands, dispatch them to
                         PE_SCHEDULER as PeInternalTxn, and wait where the
                         command requires it.

    Transactions that do not carry a KernelLaunchMsg are forwarded through
    the inherited ``_forward_txn``.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        self._pe_prefix = node.id.rsplit(".", 1)[0]  # e.g. "sip0.cube0.pe0"
        # Indices are parsed from the node id; anything unparsable maps to 0.
        try:
            self._pe_idx = int(self._pe_prefix.rsplit("pe", 1)[1])
        except (IndexError, ValueError):
            self._pe_idx = 0
        # Extract sip/cube index for multi-SIP/cube shard matching
        parts = node.id.split(".")
        try:
            self._sip_idx = int(parts[0].replace("sip", ""))
        except (IndexError, ValueError):
            self._sip_idx = 0
        try:
            self._cube_idx = int(parts[1].replace("cube", ""))
        except (IndexError, ValueError):
            self._cube_idx = 0

    def _find_shard(self, shards: tuple) -> Any:
        """Return the shard for this PE.

        Prefers the shard whose ``(sip, cube, pe)`` equals this PE's
        indices; otherwise falls back to the shard at position ``pe_idx``,
        clamped to the last shard.

        Raises:
            IndexError: if ``shards`` is empty.
        """
        if not shards:
            raise IndexError(f"{self.node.id}: tensor argument has no shards")
        for s in shards:
            if s.sip == self._sip_idx and s.cube == self._cube_idx and s.pe == self._pe_idx:
                return s
        return shards[min(self._pe_idx, len(shards) - 1)]

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Yield one timeout of ``overhead_ns`` (from ``node.attrs``)."""
        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(overhead_ns)

    def _worker(self, env: simpy.Environment) -> Generator:
        """Inbox loop: execute kernel launches, forward everything else."""
        # Imported once, outside the loop, instead of once per message.
        from kernbench.runtime_api.kernel import KernelLaunchMsg

        while True:
            txn: Any = yield self._inbox.get()
            if hasattr(txn, "request") and isinstance(txn.request, KernelLaunchMsg):
                yield from self._execute_kernel(env, txn)
            else:
                yield from self._forward_txn(env, txn)

    def _execute_kernel(self, env: simpy.Environment, txn: Any) -> Generator:
        """Compile the kernel function and replay its command trace.

        On completion ``txn.result_data`` holds ``pe_exec_ns`` (replay
        duration) plus ``dma_ns`` and ``compute_ns`` summed over the
        composite commands, and ``txn.done`` is triggered.
        """
        from kernbench.common.pe_commands import (
            CompositeCmd,
            PeCpuOverheadCmd,
            PeInternalTxn,
            WaitCmd,
        )
        from kernbench.triton_emu.registry import get_kernel
        from kernbench.triton_emu.tl_context import TLContext, run_kernel

        request = txn.request

        # Phase 1: Compile — apply PE_CPU setup overhead, then run kernel
        yield from self.run(env, 0)

        kernel_fn = get_kernel(request.kernel_ref.name)
        tl = TLContext(pe_id=self._pe_idx, dispatch_cycles=0)

        # Unpack KernelLaunchMsg.args into positional kernel arguments:
        # tensor args contribute this PE's shard address, scalar args their
        # value. Args of any other kind are not passed on.
        kernel_args: list = []
        for arg in request.args:
            if arg.arg_kind == "tensor":
                shard = self._find_shard(arg.shards)
                kernel_args.append(shard.pa)
            elif arg.arg_kind == "scalar":
                kernel_args.append(arg.value)

        run_kernel(kernel_fn, tl, *kernel_args)
        commands = tl.commands

        # Phase 2: Replay — dispatch commands to PE_SCHEDULER
        pe_exec_start = env.now
        scheduler_id = f"{self._pe_prefix}.pe_scheduler"
        pending: dict[str, simpy.Event] = {}  # completion_id → done event
        composite_results: list[dict] = []  # result_data dicts of composites

        for cmd in commands:
            if isinstance(cmd, PeCpuOverheadCmd):
                # The cycles value is passed to the timeout unchanged.
                yield env.timeout(cmd.cycles)

            elif isinstance(cmd, WaitCmd):
                if cmd.handle is not None:
                    evt = pending.pop(cmd.handle.id, None)
                    if evt:
                        yield evt
                else:
                    # Wait all pending completions
                    for evt in pending.values():
                        yield evt
                    pending.clear()

            elif isinstance(cmd, CompositeCmd):
                # Non-blocking: dispatch to scheduler, track completion
                done_evt = env.event()
                pe_txn = PeInternalTxn(
                    command=cmd, done=done_evt,
                    pe_prefix=self._pe_prefix,
                )
                # Keep the dict itself; it is read after completion.
                composite_results.append(pe_txn.result_data)
                yield self.out_ports[scheduler_id].put(pe_txn)
                pending[cmd.completion.id] = done_evt

            else:
                # Blocking: dispatch and wait for completion
                done_evt = env.event()
                pe_txn = PeInternalTxn(
                    command=cmd, done=done_evt,
                    pe_prefix=self._pe_prefix,
                )
                yield self.out_ports[scheduler_id].put(pe_txn)
                yield done_evt

        # Wait for any remaining pending completions
        for evt in pending.values():
            yield evt

        # Record PE-internal execution time
        txn.result_data["pe_exec_ns"] = env.now - pe_exec_start

        # Aggregate dma_ns / compute_ns from CompositeCmd results
        txn.result_data["dma_ns"] = sum(
            (rd.get("dma_ns", 0.0) for rd in composite_results), 0.0
        )
        txn.result_data["compute_ns"] = sum(
            (rd.get("compute_ns", 0.0) for rd in composite_results), 0.0
        )

        # Signal original Transaction done
        txn.done.succeed()
+116
View File
@@ -0,0 +1,116 @@
from __future__ import annotations
from collections.abc import Generator
from typing import TYPE_CHECKING, Any
import simpy
from kernbench.components.base import PeEngineBase
from kernbench.sim_engine.transaction import Transaction
if TYPE_CHECKING:
from kernbench.common.pe_commands import PeInternalTxn
from kernbench.components.context import ComponentContext
from kernbench.topology.types import Node
class PeDmaComponent(PeEngineBase):
    """PE_DMA: dual-channel DMA engine with READ and WRITE resources.

    Each channel has capacity=1 (ADR-0014 D4):
      - a READ and a WRITE may be issued concurrently;
      - two READs, or two WRITEs, are issued one after the other.

    Handles two message types:
      - Transaction: external fabric messages, via ``_forward_txn``.
      - PeInternalTxn: PE-internal commands, via ``handle_command``
        (DmaReadCmd → read from ``src_pa``, DmaWriteCmd → write to
        ``dst_pa``).
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        self._dma_read: simpy.Resource | None = None
        self._dma_write: simpy.Resource | None = None

    def init_resources(self, env: simpy.Environment) -> None:
        """Create the capacity-1 read and write channel resources."""
        self._dma_read = simpy.Resource(env, capacity=1)
        self._dma_write = simpy.Resource(env, capacity=1)

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Zero-delay placeholder."""
        yield env.timeout(0)

    def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
        """Handle a PE-internal DMA command: resolve PA → route → transfer.

        The matching channel is held only while the sub-Transaction is
        handed to the first hop; completion of the transfer is awaited
        after the channel has been released. ``pe_txn.done`` is triggered
        when the sub-Transaction completes, or immediately when the command
        is not a DMA command or the resolved route has no hop beyond this
        engine.
        """
        from kernbench.common.pe_commands import DmaReadCmd, DmaWriteCmd
        from kernbench.policy.address.phyaddr import PhysAddr
        from kernbench.runtime_api.kernel import PeDmaMsg

        cmd = pe_txn.command
        assert self._dma_read is not None and self._dma_write is not None

        # Determine direction and target PA
        if isinstance(cmd, DmaReadCmd):
            dma_res = self._dma_read
            target_pa = cmd.src_pa
            is_write = False
        elif isinstance(cmd, DmaWriteCmd):
            dma_res = self._dma_write
            target_pa = cmd.dst_pa
            is_write = True
        else:
            pe_txn.done.succeed()
            return

        # Resolve PA → destination node and compute the route
        pa = PhysAddr.decode(target_pa)
        dst_node = self.ctx.resolver.resolve(pa)
        path = self.ctx.router.find_path(self._pe_prefix, dst_node)

        if len(path) < 2:
            # Nothing can be sent, so nothing would ever trigger the
            # sub-Transaction's done event; complete now instead of
            # waiting forever.
            pe_txn.done.succeed()
            return

        drain_ns = self.ctx.compute_drain_ns(path, cmd.nbytes)

        # Acquire DMA channel (command issue serialization)
        with dma_res.request() as req:
            yield req

            # Sub-Transaction carrying a PeDmaMsg towards the destination
            sub_done = env.event()
            sub_request = PeDmaMsg(
                correlation_id="pe_internal",
                request_id=f"dma_{id(pe_txn)}",
                src_sip=0, src_cube=0, src_pe=0,
                dst_pa=target_pa, nbytes=cmd.nbytes,
                is_write=is_write,
            )
            sub_txn = Transaction(
                request=sub_request, path=path, step=0,
                nbytes=cmd.nbytes, done=sub_done, drain_ns=drain_ns,
            )

            # path[1] is the first hop after the route's starting node
            yield self.out_ports[path[1]].put(sub_txn.advance())

        # DMA channel released after issue; wait for transfer completion
        yield sub_done
        pe_txn.done.succeed()

    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """Forward or terminate an external Transaction while holding a channel.

        With a next hop the Transaction is advanced and forwarded; at the
        end of its path ``drain_ns`` (if positive) is waited and
        ``txn.done`` is triggered.
        """
        dma_res = self._select_channel(txn)
        with dma_res.request() as req:
            yield req
            next_hop = txn.next_hop
            if next_hop:
                yield self.out_ports[next_hop].put(txn.advance())
            else:
                drain = getattr(txn, "drain_ns", 0.0)
                if drain > 0:
                    yield env.timeout(drain)
                txn.done.succeed()

    def _select_channel(self, txn: Any) -> simpy.Resource:
        """Return the write channel for MemoryWriteMsg, else the read channel.

        NOTE(review): a request exposing ``is_write=True`` that is not a
        MemoryWriteMsg is routed through the read channel here — confirm
        this is intended.
        """
        from kernbench.runtime_api.kernel import MemoryWriteMsg

        assert self._dma_read is not None and self._dma_write is not None
        if isinstance(txn.request, MemoryWriteMsg):
            return self._dma_write
        return self._dma_read
+90
View File
@@ -0,0 +1,90 @@
from __future__ import annotations
from collections.abc import Generator
from typing import TYPE_CHECKING, Any
import simpy
from kernbench.components.base import PeEngineBase
if TYPE_CHECKING:
from kernbench.common.pe_commands import PeInternalTxn
from kernbench.components.context import ComponentContext
from kernbench.topology.types import Node
# dtype alias → bit width (used to scale the f16 peak throughput)
_DTYPE_BITS: dict[str, int] = {
    alias: bits
    for bits, aliases in (
        (8, ("i8", "int8")),
        (16, ("f16", "fp16", "float16", "bf16", "i16", "int16")),
        (32, ("f32", "fp32", "float32", "i32", "int32")),
    )
    for alias in aliases
}


class PeGemmComponent(PeEngineBase):
    """PE_GEMM: matrix multiplication engine sharing accel_slot (ADR-0014 D4).

    When ``node.attrs["shared_resource"]`` is set, work is performed while
    holding the resource obtained from ``ctx.get_shared_resource`` under
    the key ``"<pe_prefix>.<shared_resource>"``.

    Compute latency model:
        FLOPs            = 2 * M * K * N
        effective_tflops = peak_tflops_f16 * (16 / dtype_bits)
        compute_ns       = FLOPs / (effective_tflops * 1e3)
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        self._accel: simpy.Resource | None = None
        self._peak_tflops_f16: float = float(node.attrs.get("peak_tflops_f16", 0.0))

    def init_resources(self, env: simpy.Environment) -> None:
        """Look up the shared accelerator resource, if one is configured."""
        shared_name = self.node.attrs.get("shared_resource")
        if not (shared_name and self.ctx):
            return
        self._accel = self.ctx.get_shared_resource(
            env, f"{self._pe_prefix}.{shared_name}"
        )

    def _compute_ns(self, m: int, k: int, n: int, dtype: str) -> float:
        """Return the GEMM latency in nanoseconds.

        Without a positive ``peak_tflops_f16`` the node's ``overhead_ns`` is
        returned instead. Unknown dtypes are treated as 16-bit.
        """
        if self._peak_tflops_f16 <= 0:
            return float(self.node.attrs.get("overhead_ns", 0.0))
        bits = _DTYPE_BITS.get(dtype, 16)
        effective_tflops = self._peak_tflops_f16 * (16.0 / bits)
        flops = 2.0 * m * k * n
        # 1 TFLOP/s equals 1e3 FLOPs per nanosecond.
        return flops / (effective_tflops * 1e3)

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Yield one timeout of ``overhead_ns`` (from ``node.attrs``)."""
        yield env.timeout(float(self.node.attrs.get("overhead_ns", 0.0)))

    def _busy(self, env: simpy.Environment, cmd: Any) -> Generator:
        """Spend the time for ``cmd``: modelled GEMM latency or plain overhead."""
        from kernbench.common.pe_commands import GemmCmd

        if isinstance(cmd, GemmCmd):
            yield env.timeout(self._compute_ns(cmd.m, cmd.k, cmd.n, cmd.a.dtype))
        else:
            yield from self.run(env, 0)

    def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
        """Execute a PE-internal command and trigger ``pe_txn.done``.

        The shared accelerator resource, when present, is held for the
        duration of the work.
        """
        cmd = pe_txn.command
        if self._accel:
            with self._accel.request() as slot:
                yield slot
                yield from self._busy(env, cmd)
        else:
            yield from self._busy(env, cmd)
        pe_txn.done.succeed()

    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """Forward a Transaction, holding the accelerator resource if present."""
        if not self._accel:
            yield from super()._forward_txn(env, txn)
            return
        with self._accel.request() as slot:
            yield slot
            yield from super()._forward_txn(env, txn)
+54
View File
@@ -0,0 +1,54 @@
from __future__ import annotations
from collections.abc import Generator
from typing import TYPE_CHECKING, Any
import simpy
from kernbench.components.base import PeEngineBase
if TYPE_CHECKING:
from kernbench.common.pe_commands import PeInternalTxn
from kernbench.components.context import ComponentContext
from kernbench.topology.types import Node
class PeMathComponent(PeEngineBase):
    """PE_MATH: element-wise computation engine sharing accel_slot (ADR-0014 D4).

    When ``node.attrs["shared_resource"]`` is set, work is performed while
    holding the resource obtained from ``ctx.get_shared_resource`` under
    the key ``"<pe_prefix>.<shared_resource>"``.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        self._accel: simpy.Resource | None = None

    def init_resources(self, env: simpy.Environment) -> None:
        """Look up the shared accelerator resource, if one is configured."""
        shared_name = self.node.attrs.get("shared_resource")
        if not (shared_name and self.ctx):
            return
        self._accel = self.ctx.get_shared_resource(
            env, f"{self._pe_prefix}.{shared_name}"
        )

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Yield one timeout of ``overhead_ns`` (from ``node.attrs``)."""
        yield env.timeout(float(self.node.attrs.get("overhead_ns", 0.0)))

    def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
        """Spend ``run()``'s delay (under the shared resource, if any), then finish."""
        if not self._accel:
            yield from self.run(env, 0)
        else:
            with self._accel.request() as slot:
                yield slot
                yield from self.run(env, 0)
        pe_txn.done.succeed()

    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """Forward a Transaction, holding the accelerator resource if present."""
        if not self._accel:
            yield from super()._forward_txn(env, txn)
            return
        with self._accel.request() as slot:
            yield slot
            yield from super()._forward_txn(env, txn)
@@ -0,0 +1,245 @@
from __future__ import annotations
from collections.abc import Generator
from typing import TYPE_CHECKING, Any
import simpy
from kernbench.components.base import ComponentBase
if TYPE_CHECKING:
from kernbench.common.pe_commands import PeInternalTxn
from kernbench.components.context import ComponentContext
from kernbench.topology.types import Node
class PeSchedulerComponent(ComponentBase):
    """PE_SCHEDULER: sole dispatcher inside a PE (ADR-0014 D1).

    Receives PeInternalTxn from PE_CPU and routes by command type:
      - DmaReadCmd / DmaWriteCmd → PE_DMA
      - GemmCmd → PE_GEMM
      - MathCmd → PE_MATH
      - CompositeCmd → expanded here into DMA/compute sub-commands
        (Stage 3: ADR-0014 D3.2)

    Composite GEMM is tiled (TILE_K x TILE_N tiles of operand b) as
        DMA_READ(b_tile_t) → COMPUTE(t) → DMA_WRITE(out_tile_t)
    with a tile's write left in flight while the next tile is processed.

    The scheduler's ``overhead_ns`` is applied before each dispatch.
    Messages that are not PeInternalTxn go through the inherited
    ``_forward_txn``.
    """

    # Scheduler tile dimensions (ADR-0014 D3.2)
    TILE_M = 32
    TILE_K = 64
    TILE_N = 32

    # Command type → engine node suffix. Filled lazily by
    # _ensure_dispatch_table; to support a new engine add one entry there
    # (e.g. ConvCmd: "pe_conv").
    _CMD_DISPATCH: dict[type, str] = {}

    @classmethod
    def _ensure_dispatch_table(cls) -> None:
        """Populate ``_CMD_DISPATCH`` on first use (imports are deferred)."""
        if cls._CMD_DISPATCH:
            return
        from kernbench.common.pe_commands import DmaReadCmd, DmaWriteCmd, GemmCmd, MathCmd

        cls._CMD_DISPATCH = {
            DmaReadCmd: "pe_dma",
            DmaWriteCmd: "pe_dma",
            GemmCmd: "pe_gemm",
            MathCmd: "pe_math",
        }

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        self._pe_prefix = node.id.rsplit(".", 1)[0]
        self._ensure_dispatch_table()

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Yield one timeout of ``overhead_ns`` (from ``node.attrs``)."""
        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(overhead_ns)

    def _worker(self, env: simpy.Environment) -> Generator:
        """Inbox loop: dispatch PeInternalTxn concurrently, forward the rest."""
        from kernbench.common.pe_commands import PeInternalTxn

        while True:
            msg: Any = yield self._inbox.get()
            if isinstance(msg, PeInternalTxn):
                # Own process per command so dispatches do not block the inbox.
                env.process(self._dispatch(env, msg))
            else:
                yield from self._forward_txn(env, msg)

    def _dispatch(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
        """Route a PeInternalTxn to the correct engine via the dispatch table.

        The lookup uses the exact command type. Commands that are neither
        in the table nor a CompositeCmd are completed immediately.
        """
        from kernbench.common.pe_commands import CompositeCmd

        # Scheduler overhead
        yield from self.run(env, 0)

        cmd = pe_txn.command

        # Check dispatch table first
        engine_suffix = self._CMD_DISPATCH.get(type(cmd))
        if engine_suffix is not None:
            yield self.out_ports[f"{self._pe_prefix}.{engine_suffix}"].put(pe_txn)
            return

        # CompositeCmd: expanded pipeline (not a simple forward)
        if isinstance(cmd, CompositeCmd):
            yield from self._dispatch_composite(env, pe_txn)
            return

        # Unknown command — signal done immediately
        pe_txn.done.succeed()

    def _dispatch_composite(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
        """Expand a CompositeCmd (ADR-0014 D3.2).

        ``op == "gemm"`` with a ``b`` operand uses the tiled GEMM pipeline;
        every other composite uses the compute-then-write path.
        """
        from kernbench.common.pe_commands import CompositeCmd

        cmd = pe_txn.command
        assert isinstance(cmd, CompositeCmd)

        if cmd.op == "gemm" and cmd.b is not None:
            yield from self._pipeline_gemm(env, pe_txn, cmd)
        else:
            yield from self._pipeline_math(env, pe_txn, cmd)

    def _pipeline_gemm(self, env: simpy.Environment, pe_txn: PeInternalTxn, cmd: Any) -> Generator:
        """Tiled GEMM: per tile, read a b tile, compute, write the result.

        Per tile the order of operations is: issue DMA_READ, wait for the
        previous tile's compute and this read, issue COMPUTE, wait for the
        previous tile's write and this compute, issue DMA_WRITE (not
        awaited until the next tile, or the end of the loop).

        NOTE(review): as written, a tile's read is issued only after the
        previous tile's compute has completed, so the work kept in flight
        across tiles is the previous tile's write — confirm against the
        overlap described in ADR-0014 D3.2.

        On completion ``pe_txn.result_data`` holds ``dma_ns`` (time spent
        waiting on reads plus the final write) and ``compute_ns`` (time
        between issuing and completing each compute), and ``pe_txn.done``
        is triggered.
        """
        from kernbench.common.pe_commands import (
            DmaReadCmd,
            DmaWriteCmd,
            GemmCmd,
            PeInternalTxn as PeTxn,
            TensorHandle,
        )

        pp = self._pe_prefix
        a = cmd.a
        b = cmd.b
        M, K_a = a.shape[-2], a.shape[-1]
        K_b, N = b.shape[-2], b.shape[-1]
        dtype = a.dtype
        # Element size is derived from b's byte count; 2 when b is empty.
        dtype_bytes = b.nbytes // (K_b * N) if (K_b * N) > 0 else 2

        # Tile counts (at least one tile along each tiled dimension)
        n_tiles_k = max(1, (K_a + self.TILE_K - 1) // self.TILE_K)
        n_tiles_n = max(1, (N + self.TILE_N - 1) // self.TILE_N)
        n_tiles = n_tiles_k * n_tiles_n

        prev_compute_done = None
        prev_write_done = None
        total_dma_ns = 0.0
        total_compute_ns = 0.0

        for tile_idx in range(n_tiles):
            tk = tile_idx // n_tiles_n
            tn = tile_idx % n_tiles_n
            k_start = tk * self.TILE_K
            n_start = tn * self.TILE_N
            tile_k = min(self.TILE_K, K_a - k_start)
            tile_n = min(self.TILE_N, N - n_start)
            tile_nbytes = tile_k * tile_n * dtype_bytes

            # --- Stage 1: DMA_READ b_tile ---
            read_done = env.event()
            b_tile_pa = b.pa + (k_start * N + n_start) * dtype_bytes
            b_tile_handle = TensorHandle(
                id=f"b_tile_{tile_idx}", pa=b_tile_pa,
                shape=(tile_k, tile_n), dtype=dtype, nbytes=tile_nbytes,
            )
            read_cmd = DmaReadCmd(handle=b_tile_handle, src_pa=b_tile_pa, nbytes=tile_nbytes)
            read_txn = PeTxn(command=read_cmd, done=read_done, pe_prefix=pp)

            t0 = env.now
            yield self.out_ports[f"{pp}.pe_dma"].put(read_txn)

            # Wait for previous compute before starting this tile's compute
            if prev_compute_done is not None:
                yield prev_compute_done

            # Wait for this tile's DMA_READ
            yield read_done
            total_dma_ns += env.now - t0

            # --- Stage 2: COMPUTE (GEMM) ---
            compute_done = env.event()
            out_handle = TensorHandle(
                id=f"out_tile_{tile_idx}", pa=0,
                shape=(M, tile_n), dtype=dtype,
                nbytes=M * tile_n * dtype_bytes,
            )
            compute_cmd = GemmCmd(a=a, b=b_tile_handle, out=out_handle,
                                  m=M, k=tile_k, n=tile_n)
            compute_txn = PeTxn(command=compute_cmd, done=compute_done, pe_prefix=pp)

            t0 = env.now
            yield self.out_ports[f"{pp}.pe_gemm"].put(compute_txn)

            # Wait for previous write (DMA_WRITE serialization)
            if prev_write_done is not None:
                yield prev_write_done

            # Wait for compute of THIS tile
            yield compute_done
            total_compute_ns += env.now - t0
            prev_compute_done = compute_done

            # --- Stage 3: DMA_WRITE out_tile (issued, not awaited here) ---
            write_done = env.event()
            out_tile_pa = cmd.out_pa + n_start * dtype_bytes
            write_nbytes = M * tile_n * dtype_bytes
            write_cmd = DmaWriteCmd(handle=out_handle, dst_pa=out_tile_pa, nbytes=write_nbytes)
            write_txn = PeTxn(command=write_cmd, done=write_done, pe_prefix=pp)

            yield self.out_ports[f"{pp}.pe_dma"].put(write_txn)
            prev_write_done = write_done

        # Wait for final write
        if prev_write_done is not None:
            t0 = env.now
            yield prev_write_done
            total_dma_ns += env.now - t0

        pe_txn.result_data["dma_ns"] = total_dma_ns
        pe_txn.result_data["compute_ns"] = total_compute_ns
        pe_txn.done.succeed()

    def _pipeline_math(self, env: simpy.Environment, pe_txn: PeInternalTxn, cmd: Any) -> Generator:
        """Non-GEMM composite: one MATH compute, then one DMA_WRITE.

        The two steps run back to back. On completion
        ``pe_txn.result_data`` holds ``compute_ns`` and ``dma_ns`` (time
        from issuing each step to its completion), matching the keys
        written by ``_pipeline_gemm``, and ``pe_txn.done`` is triggered.
        """
        from kernbench.common.pe_commands import (
            DmaWriteCmd,
            MathCmd,
            PeInternalTxn as PeTxn,
        )

        pp = self._pe_prefix

        # Step 1: Compute (MATH), in place on operand a
        compute_done = env.event()
        compute_cmd = MathCmd(
            op=cmd.math_op or "identity",
            inputs=(cmd.a,), out=cmd.a,
        )
        compute_txn = PeTxn(command=compute_cmd, done=compute_done, pe_prefix=pp)
        t0 = env.now
        yield self.out_ports[f"{pp}.pe_math"].put(compute_txn)
        yield compute_done
        compute_ns = env.now - t0

        # Step 2: DMA_WRITE the result to cmd.out_pa
        write_done = env.event()
        write_cmd = DmaWriteCmd(handle=cmd.a, dst_pa=cmd.out_pa, nbytes=cmd.out_nbytes)
        write_txn = PeTxn(command=write_cmd, done=write_done, pe_prefix=pp)
        t0 = env.now
        yield self.out_ports[f"{pp}.pe_dma"].put(write_txn)
        yield write_done
        dma_ns = env.now - t0

        pe_txn.result_data["dma_ns"] = dma_ns
        pe_txn.result_data["compute_ns"] = compute_ns
        pe_txn.done.succeed()
+25
View File
@@ -0,0 +1,25 @@
from __future__ import annotations
from collections.abc import Generator
from typing import TYPE_CHECKING
from kernbench.components.base import ComponentBase
if TYPE_CHECKING:
from kernbench.components.context import ComponentContext
from kernbench.topology.types import Node
class PeTcmComponent(ComponentBase):
    """PE_TCM: tightly-coupled memory / local SRAM staging buffer.

    Terminal storage component for PE-internal dataflow (ADR-0014 D5).

    ``run`` waits for the node's ``overhead_ns`` attribute and nothing else.
    NOTE(review): ``drain_ns`` is not applied anywhere in this class; it is
    presumably handled by the ``ComponentBase`` worker -- confirm.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)

    def run(self, env, nbytes: int) -> Generator:
        """Model one access as a fixed delay; ``nbytes`` is not used here."""
        # A missing attribute means no overhead at all.
        delay_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(delay_ns)
+59
View File
@@ -0,0 +1,59 @@
from __future__ import annotations
from collections.abc import Generator
from typing import TYPE_CHECKING, Any
import simpy
from kernbench.components.base import ComponentBase
from kernbench.sim_engine.transaction import Transaction
if TYPE_CHECKING:
from kernbench.components.context import ComponentContext
from kernbench.topology.types import Node
class SramComponent(ComponentBase):
    """Cube SRAM: terminal component that models SRAM access latency.

    Each incoming transaction costs ``overhead_ns`` (from ``node.attrs``) plus
    the transaction's own ``drain_ns`` when it has one. Afterwards a
    ``ResponseMsg`` is sent back along the reversed request path.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Apply the fixed access overhead; ``nbytes`` is not used here."""
        delay_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(delay_ns)

    def _worker(self, env: simpy.Environment) -> Generator:
        """Terminal worker loop: take a transaction, delay, then respond."""
        while True:
            txn: Any = yield self._inbox.get()
            yield from self.run(env, txn.nbytes)
            # drain_ns is optional on the transaction; absent means no drain.
            drain_ns = getattr(txn, "drain_ns", 0.0)
            if drain_ns > 0:
                yield env.timeout(drain_ns)
            yield from self._send_response(env, txn)

    def _send_response(self, env: simpy.Environment, txn: Any) -> Generator:
        """Send a ResponseMsg on the reverse path, or complete ``txn`` locally.

        Without a context, or when the path has fewer than two hops, there is
        nowhere to send a response, so ``txn.done`` is triggered directly.
        """
        return_path = list(txn.path)
        return_path.reverse()

        if len(return_path) < 2 or not self.ctx:
            txn.done.succeed()
            return

        from kernbench.runtime_api.kernel import ResponseMsg

        # The cube index is taken from the second component of the node id,
        # which is expected to look like "cube<N>".
        id_parts = self.node.id.split(".")
        cube_index = int(id_parts[1].replace("cube", ""))

        response = ResponseMsg(
            correlation_id=txn.request.correlation_id,
            request_id=txn.request.request_id,
            src_cube=cube_index, src_pe=-1, success=True,
        )
        response_txn = Transaction(
            request=response, path=return_path, step=0,
            nbytes=0, done=env.event(), is_response=True,
        )
        # return_path[0] is this node; hand the advanced txn to the next hop.
        yield self.out_ports[return_path[1]].put(response_txn.advance())
View File
+85
View File
@@ -0,0 +1,85 @@
from __future__ import annotations
from dataclasses import dataclass
from kernbench.policy.address.phyaddr import PhysAddr
class AllocationError(Exception):
    """Raised when a memory allocation request cannot be satisfied."""
@dataclass(frozen=True)
class AddressConfig:
    """Immutable sizing parameters used to carve up device memory.

    All ``*_bytes`` fields are byte counts; the two properties derive the
    per-slice HBM size and the TCM space left after the scheduler reservation.
    """

    sip_count: int
    cubes_per_sip: int
    pes_per_cube: int
    hbm_bytes_per_cube: int
    hbm_slices_per_cube: int
    tcm_bytes_per_pe: int
    tcm_scheduler_reserved_bytes: int
    sram_bytes_per_cube: int

    @property
    def hbm_slice_bytes(self) -> int:
        """Size of one HBM slice: cube HBM divided evenly, rounded down."""
        total, slices = self.hbm_bytes_per_cube, self.hbm_slices_per_cube
        return total // slices

    @property
    def tcm_allocatable_bytes(self) -> int:
        """TCM bytes per PE that remain after the scheduler's reservation."""
        reserved = self.tcm_scheduler_reserved_bytes
        return self.tcm_bytes_per_pe - reserved
class PEMemAllocator:
    """Bump allocator for one PE's HBM slice and TCM region.

    Each region has a cursor that only moves forward; there is no free().
    Addresses are produced by ``PhysAddr.pe_hbm_addr`` / ``PhysAddr.pe_tcm_addr``
    for the (rack, sip, cube, pe) given at construction.

    Capacity comes from ``cfg``: ``hbm_slice_bytes`` for HBM and
    ``tcm_allocatable_bytes`` for TCM.
    """

    def __init__(
        self, rack_id: int, sip_id: int, cube_id: int, pe_id: int, cfg: AddressConfig,
    ) -> None:
        self._rack_id = rack_id
        self._sip_id = sip_id
        self._cube_id = cube_id
        self._pe_id = pe_id
        self._cfg = cfg
        self._hbm_cursor = 0
        self._tcm_cursor = 0

    def alloc_hbm(self, nbytes: int) -> PhysAddr:
        """Reserve ``nbytes`` of this PE's HBM slice and return its address.

        Raises:
            AllocationError: if ``nbytes`` is negative or does not fit in the
                space remaining in the slice.
        """
        # A negative size would pass the overflow check below and move the
        # cursor backwards, handing out already-allocated memory again.
        if nbytes < 0:
            raise AllocationError(f"HBM allocation size must be non-negative: {nbytes}")
        if self._hbm_cursor + nbytes > self._cfg.hbm_slice_bytes:
            raise AllocationError(
                f"HBM overflow: need {nbytes}, "
                f"available {self._cfg.hbm_slice_bytes - self._hbm_cursor}"
            )
        pa = PhysAddr.pe_hbm_addr(
            rack_id=self._rack_id, sip_id=self._sip_id, cube_id=self._cube_id,
            pe_id=self._pe_id, pe_local_hbm_offset=self._hbm_cursor,
            slice_size_bytes=self._cfg.hbm_slice_bytes,
        )
        # Advance only after the address was built successfully.
        self._hbm_cursor += nbytes
        return pa

    def alloc_tcm(self, nbytes: int) -> PhysAddr:
        """Reserve ``nbytes`` of this PE's allocatable TCM and return its address.

        Raises:
            AllocationError: if ``nbytes`` is negative or does not fit in the
                remaining allocatable TCM.
        """
        # Same cursor-rewind hazard as in alloc_hbm.
        if nbytes < 0:
            raise AllocationError(f"TCM allocation size must be non-negative: {nbytes}")
        if self._tcm_cursor + nbytes > self._cfg.tcm_allocatable_bytes:
            raise AllocationError(
                f"TCM overflow: need {nbytes}, "
                f"available {self._cfg.tcm_allocatable_bytes - self._tcm_cursor}"
            )
        pa = PhysAddr.pe_tcm_addr(
            rack_id=self._rack_id, sip_id=self._sip_id, cube_id=self._cube_id,
            pe_id=self._pe_id, tcm_offset=self._tcm_cursor,
        )
        self._tcm_cursor += nbytes
        return pa

    @property
    def hbm_used(self) -> int:
        """Bytes of the HBM slice handed out so far."""
        return self._hbm_cursor

    @property
    def hbm_total(self) -> int:
        """Capacity of this PE's HBM slice in bytes."""
        return self._cfg.hbm_slice_bytes

    @property
    def tcm_used(self) -> int:
        """Bytes of TCM handed out so far."""
        return self._tcm_cursor

    @property
    def tcm_total(self) -> int:
        """Allocatable TCM capacity in bytes."""
        return self._cfg.tcm_allocatable_bytes
+184
View File
@@ -0,0 +1,184 @@
from __future__ import annotations
from dataclasses import dataclass
from enum import IntEnum
from typing import Literal
# Largest value representable in the 51-bit physical address space.
MAX_51 = (1 << 51) - 1
class PhysAddrError(Exception):
    """Raised when a physical address or one of its fields is out of range."""
def _chk_range(name: str, v: int, bits: int) -> None:
if not (0 <= v < (1 << bits)):
raise PhysAddrError(f"{name} out of range for {bits} bits: {v}")
def _chk_max(name: str, v: int, maxv: int) -> None:
if not (0 <= v <= maxv):
raise PhysAddrError(f"{name} out of range (0..{maxv}): {v}")
class UnitType(IntEnum):
    """Unit selector stored in bits [36:34] of a PE-resource local offset."""

    # Per-PE resource; PhysAddr.pe_tcm_addr builds addresses of this type.
    PE = 0
    # Cube management CPU.
    MCPU = 1
    # Cube-level SRAM; PhysAddr.cube_sram_addr builds addresses of this type.
    SRAM = 2
@dataclass(frozen=True)
class PhysAddr:
    """Immutable 51-bit physical address value object.

    Encoded layout:
        [50:47] rack_id       (4 bits)
        [46:43] sip_id        (4 bits)
        [42:38] sip_seg       (5 bits, carries the cube_id)
        [37:0]  local_offset  (38 bits => each segment is 256GB)

    Bit 37 of ``local_offset`` selects the window:
        1 -> HBM window (128GB reserved); bits [36:0] are ``hbm_offset``
        0 -> PE resource window:
             [36:34] unit_type, [33:30] pe_id, [29] ext, [28:0] sub_offset

    ``encode`` packs only rack_id / sip_id / sip_seg / local_offset. The other
    fields are a decoded view that ``decode`` and the factory methods fill in.
    """

    rack_id: int
    sip_id: int
    sip_seg: int
    local_offset: int

    kind: Literal["hbm", "pe_resource", "raw"] = "raw"
    cube_id: int = 0
    unit_type: UnitType = UnitType.PE
    pe_id: int = 0
    ext: int = 0
    sub_offset: int = 0
    hbm_offset: int = 0

    HBM_WINDOW_BYTES = 1 << 37  # 128GB

    def encode(self) -> int:
        """Pack the four positional fields into a single 51-bit integer.

        Raises:
            PhysAddrError: if any field does not fit its bit width.
        """
        field_widths = (
            ("rack_id", self.rack_id, 4),
            ("sip_id", self.sip_id, 4),
            ("sip_seg", self.sip_seg, 5),
            ("local_offset", self.local_offset, 38),
        )
        for label, value, width in field_widths:
            _chk_range(label, value, width)

        packed = self.local_offset
        packed |= self.sip_seg << 38
        packed |= self.sip_id << 43
        packed |= self.rack_id << 47
        if not (0 <= packed <= MAX_51):
            raise PhysAddrError("address exceeds 51-bit space")
        return packed

    @staticmethod
    def decode(addr: int) -> PhysAddr:
        """Split a 51-bit integer into a fully populated PhysAddr.

        Raises:
            PhysAddrError: if ``addr`` is outside the 51-bit range, or if a
                PE-resource address carries an unknown unit type.
        """
        if not (0 <= addr <= MAX_51):
            raise PhysAddrError("addr must be a 51-bit value")

        rack_bits = (addr >> 47) & 0xF
        sip_bits = (addr >> 43) & 0xF
        seg_bits = (addr >> 38) & 0x1F
        local = addr & ((1 << 38) - 1)

        # The segment number doubles as the cube id.
        shared = dict(
            rack_id=rack_bits,
            sip_id=sip_bits,
            sip_seg=seg_bits,
            local_offset=local,
            cube_id=seg_bits,
        )

        window_select = (local >> 37) & 0x1
        if window_select == 1:
            return PhysAddr(
                kind="hbm",
                hbm_offset=int(local & ((1 << 37) - 1)),
                **shared,
            )

        # PE resource window.
        unit_code = int((local >> 34) & 0x7)
        try:
            unit = UnitType(unit_code)
        except ValueError:
            raise PhysAddrError(f"unknown unit_type: {unit_code}") from None

        return PhysAddr(
            kind="pe_resource",
            unit_type=unit,
            pe_id=int((local >> 30) & 0xF),
            ext=int((local >> 29) & 0x1),
            sub_offset=int(local & ((1 << 29) - 1)),
            hbm_offset=0,
            **shared,
        )

    @staticmethod
    def hbm_addr(*, rack_id: int, sip_id: int, cube_id: int, hbm_offset: int) -> PhysAddr:
        """Build an HBM-window address from a cube-relative HBM offset.

        Raises:
            PhysAddrError: if ``cube_id`` exceeds 31 or ``hbm_offset`` does
                not fit in 37 bits.
        """
        _chk_max("cube_id", cube_id, 31)
        _chk_range("hbm_offset", hbm_offset, 37)

        offset = int(hbm_offset)
        return PhysAddr(
            rack_id=rack_id,
            sip_id=sip_id,
            sip_seg=cube_id,
            # Bit 37 set marks the HBM window.
            local_offset=(1 << 37) | offset,
            kind="hbm",
            cube_id=cube_id,
            hbm_offset=offset,
        )

    @staticmethod
    def pe_hbm_addr(
        *,
        rack_id: int,
        sip_id: int,
        cube_id: int,
        pe_id: int,
        pe_local_hbm_offset: int,
        slice_size_bytes: int,
    ) -> PhysAddr:
        """Build an HBM address inside the slice that belongs to ``pe_id``.

        The cube-relative offset is ``pe_id * slice_size_bytes`` plus the
        PE-local offset.

        Raises:
            PhysAddrError: if an id is out of range, the local offset falls
                outside the slice, or the result leaves the 128GB window.
        """
        _chk_max("cube_id", cube_id, 31)
        _chk_range("pe_id", pe_id, 4)

        inside_slice = 0 <= pe_local_hbm_offset < slice_size_bytes
        if not inside_slice:
            raise PhysAddrError("pe_local_hbm_offset out of PE local slice range")

        slice_base = int(pe_id) * int(slice_size_bytes)
        window_offset = slice_base + int(pe_local_hbm_offset)
        if not (0 <= window_offset < PhysAddr.HBM_WINDOW_BYTES):
            raise PhysAddrError("HBM offset exceeds reserved 128GB window")

        return PhysAddr.hbm_addr(
            rack_id=rack_id, sip_id=sip_id, cube_id=cube_id, hbm_offset=window_offset
        )

    @staticmethod
    def hbm_pe_id(hbm_offset: int, slice_size_bytes: int) -> int:
        """Return the index of the slice that contains ``hbm_offset``."""
        slice_index = hbm_offset // slice_size_bytes
        return slice_index

    @staticmethod
    def cube_sram_addr(
        *, rack_id: int, sip_id: int, cube_id: int, sram_offset: int,
    ) -> PhysAddr:
        """Build a PE-resource-window address for the cube SRAM.

        Raises:
            PhysAddrError: if ``cube_id`` exceeds 31 or ``sram_offset`` does
                not fit in 29 bits.
        """
        _chk_max("cube_id", cube_id, 31)
        _chk_range("sram_offset", sram_offset, 29)

        packed_local = (UnitType.SRAM << 34) | sram_offset
        return PhysAddr(
            rack_id=rack_id, sip_id=sip_id, sip_seg=cube_id,
            local_offset=packed_local,
            kind="pe_resource", cube_id=cube_id,
            unit_type=UnitType.SRAM, sub_offset=sram_offset,
        )

    @staticmethod
    def pe_tcm_addr(
        *, rack_id: int, sip_id: int, cube_id: int, pe_id: int, tcm_offset: int,
    ) -> PhysAddr:
        """Build a PE-resource-window address for one PE's TCM.

        Raises:
            PhysAddrError: if ``cube_id`` exceeds 31, ``pe_id`` does not fit
                in 4 bits, or ``tcm_offset`` does not fit in 29 bits.
        """
        _chk_max("cube_id", cube_id, 31)
        _chk_range("pe_id", pe_id, 4)
        _chk_range("tcm_offset", tcm_offset, 29)

        packed_local = (UnitType.PE << 34) | (pe_id << 30) | tcm_offset
        return PhysAddr(
            rack_id=rack_id, sip_id=sip_id, sip_seg=cube_id,
            local_offset=packed_local,
            kind="pe_resource", cube_id=cube_id,
            unit_type=UnitType.PE, pe_id=pe_id, sub_offset=tcm_offset,
        )
+174
View File
@@ -0,0 +1,174 @@
from __future__ import annotations
from dataclasses import dataclass
from math import ceil
from typing import Literal
@dataclass(frozen=True)
class DPPolicy:
    """Two-level data-parallel policy: cube-level + pe-level."""

    # How the tensor is distributed across cubes: full copy per cube, or split
    # along the first ("shard_m") or second ("shard_k") dimension.
    cube: Literal["replicate", "shard_m", "shard_k"] = "replicate"
    # How each cube's portion is distributed across that cube's PEs.
    pe: Literal["replicate", "column_wise", "row_wise"] = "replicate"
def resolve_dp_policy(
    policy: DPPolicy,
    *,
    shape: tuple[int, int],
    itemsize: int,
    num_pe: int,
    num_cubes: int = 1,
) -> list[ShardSpec]:
    """Expand a DPPolicy into concrete shards, cube level first, then PE level.

    With at most one cube, only the PE-level policy is applied to the whole
    shape. Otherwise each cube gets a sub-shape chosen by ``policy.cube`` and
    the PE-level policy is applied inside it. Returned ``pe_index`` values are
    flat: ``cube_id * num_pe + pe_id``; offsets are the cube's byte offset
    plus the PE-level offset.

    Raises:
        ValueError: for an unknown PE-level policy, or an unknown cube-level
            policy when more than one cube is requested.
    """
    pe_level = {
        "replicate": replicate,
        "column_wise": column_wise,
        "row_wise": row_wise,
    }.get(policy.pe)
    if pe_level is None:
        raise ValueError(f"Unknown pe-level policy: {policy.pe}")

    if num_cubes <= 1:
        return pe_level(shape=shape, itemsize=itemsize, num_pe=num_pe)

    rows, cols = shape

    # The per-cube shape and the byte distance between consecutive cubes do
    # not depend on the cube index, so they are worked out once.
    if policy.cube == "replicate":
        per_cube_shape = (rows, cols)
        cube_stride_bytes = 0
    elif policy.cube == "shard_m":
        rows_per_cube = rows // num_cubes
        per_cube_shape = (rows_per_cube, cols)
        cube_stride_bytes = rows_per_cube * cols * itemsize
    elif policy.cube == "shard_k":
        cols_per_cube = cols // num_cubes
        per_cube_shape = (rows, cols_per_cube)
        cube_stride_bytes = rows * cols_per_cube * itemsize
    else:
        raise ValueError(f"Unknown cube-level policy: {policy.cube}")

    resolved: list[ShardSpec] = []
    for cube_id in range(num_cubes):
        cube_base = cube_id * cube_stride_bytes
        within_cube = pe_level(shape=per_cube_shape, itemsize=itemsize, num_pe=num_pe)
        resolved.extend(
            ShardSpec(
                pe_index=cube_id * num_pe + local.pe_index,
                offset_bytes=cube_base + local.offset_bytes,
                nbytes=local.nbytes,
            )
            for local in within_cube
        )
    return resolved
@dataclass(frozen=True)
class ShardSpec:
    """One shard of a tensor: which PE holds it and which bytes it covers."""

    # Index of the owning PE (flat cube_id * num_pe + pe_id when produced by
    # resolve_dp_policy with several cubes).
    pe_index: int
    # Byte offset of the shard as computed by the placement function.
    offset_bytes: int
    # Shard size in bytes.
    nbytes: int
def column_wise(
    *, shape: tuple[int, int], itemsize: int, num_pe: int,
) -> list[ShardSpec]:
    """Split the K axis into ``num_pe`` equal parts; each PE gets (M, K // P).

    Shards are laid out back to back: shard ``i`` starts at ``i * chunk_bytes``.
    Any remainder of ``K % num_pe`` columns is not assigned to a shard.
    """
    rows, cols = shape
    per_pe_bytes = rows * (cols // num_pe) * itemsize
    return [
        ShardSpec(
            pe_index=pe,
            offset_bytes=pe * per_pe_bytes,
            nbytes=per_pe_bytes,
        )
        for pe in range(num_pe)
    ]
def row_wise(
    *, shape: tuple[int, int], itemsize: int, num_pe: int,
) -> list[ShardSpec]:
    """Split the M axis into ``num_pe`` equal parts; each PE gets (M // P, K).

    Shards are laid out back to back: shard ``i`` starts at ``i * chunk_bytes``.
    Any remainder of ``M % num_pe`` rows is not assigned to a shard.
    """
    rows, cols = shape
    per_pe_bytes = (rows // num_pe) * cols * itemsize
    return [
        ShardSpec(
            pe_index=pe,
            offset_bytes=pe * per_pe_bytes,
            nbytes=per_pe_bytes,
        )
        for pe in range(num_pe)
    ]
def replicate(
    *, shape: tuple[int, int], itemsize: int, num_pe: int,
) -> list[ShardSpec]:
    """Give every PE a full (M, K) copy: offset 0, size M * K * itemsize."""
    rows, cols = shape
    copy_bytes = rows * cols * itemsize

    copies = []
    for pe in range(num_pe):
        copies.append(ShardSpec(pe_index=pe, offset_bytes=0, nbytes=copy_bytes))
    return copies
def tiled_column_major(
    *, shape: tuple[int, int], itemsize: int, num_pe: int,
    tile_m: int, tile_k: int,
) -> list[ShardSpec]:
    """2D tiling, column-major order (K axis first), round-robin across PEs.

    Tiles are emitted with the K index varying fastest inside each M band and
    are assigned to PEs in emission order modulo ``num_pe``. ``offset_bytes``
    is the byte offset of the tile's first element in a row-major (M, K)
    buffer.

    Tiles on the bottom or right edge are clipped to the tensor, so ``nbytes``
    is the real tile size and all shards together cover exactly
    ``M * K * itemsize`` bytes. For shapes that are an exact multiple of the
    tile size every shard is ``tile_m * tile_k * itemsize`` bytes.
    """
    M, K = shape
    row_bytes = K * itemsize
    shards = []
    idx = 0
    for row0 in range(0, M, tile_m):
        # The last band may hold fewer than tile_m rows.
        rows = min(tile_m, M - row0)
        for col0 in range(0, K, tile_k):
            # Likewise the last column block may be narrower than tile_k.
            cols = min(tile_k, K - col0)
            shards.append(ShardSpec(
                pe_index=idx % num_pe,
                offset_bytes=(row0 * row_bytes) + (col0 * itemsize),
                nbytes=rows * cols * itemsize,
            ))
            idx += 1
    return shards
def tiled_row_major(
    *, shape: tuple[int, int], itemsize: int, num_pe: int,
    tile_m: int, tile_k: int,
) -> list[ShardSpec]:
    """2D tiling, row-major order (M axis first), round-robin across PEs.

    Tiles are emitted with the M index varying fastest inside each K block and
    are assigned to PEs in emission order modulo ``num_pe``. ``offset_bytes``
    is the byte offset of the tile's first element in a row-major (M, K)
    buffer.

    Tiles on the bottom or right edge are clipped to the tensor, so ``nbytes``
    is the real tile size and all shards together cover exactly
    ``M * K * itemsize`` bytes. For shapes that are an exact multiple of the
    tile size every shard is ``tile_m * tile_k * itemsize`` bytes.
    """
    M, K = shape
    row_bytes = K * itemsize
    shards = []
    idx = 0
    for col0 in range(0, K, tile_k):
        # The last column block may be narrower than tile_k.
        cols = min(tile_k, K - col0)
        for row0 in range(0, M, tile_m):
            # Likewise the last band may hold fewer than tile_m rows.
            rows = min(tile_m, M - row0)
            shards.append(ShardSpec(
                pe_index=idx % num_pe,
                offset_bytes=(row0 * row_bytes) + (col0 * itemsize),
                nbytes=rows * cols * itemsize,
            ))
            idx += 1
    return shards
+184
View File
@@ -0,0 +1,184 @@
from __future__ import annotations
import heapq
from collections import defaultdict
from kernbench.policy.address.phyaddr import PhysAddr, UnitType
from kernbench.topology.types import TopologyGraph
class RoutingError(Exception):
    """Raised when an address or node cannot be resolved, or no path exists."""
class AddressResolver:
    """Resolve a PhysAddr to the destination node_id in the compiled graph.

    Also provides named node lookups (find_m_cpu, find_pcie_ep, …) so that
    component implementations never construct node_id strings directly.
    Centralising the naming convention here means a single change propagates
    everywhere (ADR-0015 D4).
    """

    def __init__(self, graph: TopologyGraph) -> None:
        self._node_ids = set(graph.nodes)
        memory_map = graph.spec["cube"]["memory_map"]
        # Slice size = cube HBM (GB -> bytes) divided evenly over the slices.
        cube_hbm_bytes = memory_map["hbm_total_gb_per_cube"] * (1 << 30)
        self._slice_size_bytes = cube_hbm_bytes // memory_map["hbm_slices_per_cube"]

    def _require(self, node_id: str, message: str) -> str:
        """Return ``node_id`` when it is in the graph, else raise RoutingError(message)."""
        if node_id in self._node_ids:
            return node_id
        raise RoutingError(message)

    # ── Physical-address resolution ──────────────────────────────────

    def resolve(self, addr: PhysAddr) -> str:
        """Map ``addr`` to the id of the node that serves it.

        HBM addresses go to the ``hbm_ctrl.slice<N>`` node that owns the
        offset; PE-resource addresses go to the PE's ``pe_tcm``, the cube
        ``sram`` or the cube ``m_cpu`` depending on the unit type.

        Raises:
            RoutingError: for an unsupported kind or unit type, or when the
                computed node id is not present in the topology.
        """
        cube_prefix = f"sip{addr.sip_id}.cube{addr.cube_id}"

        if addr.kind == "hbm":
            slice_index = PhysAddr.hbm_pe_id(addr.hbm_offset, self._slice_size_bytes)
            target = f"{cube_prefix}.hbm_ctrl.slice{slice_index}"
        elif addr.kind == "pe_resource":
            if addr.unit_type == UnitType.PE:
                target = f"{cube_prefix}.pe{addr.pe_id}.pe_tcm"
            elif addr.unit_type == UnitType.SRAM:
                target = f"{cube_prefix}.sram"
            elif addr.unit_type == UnitType.MCPU:
                target = f"{cube_prefix}.m_cpu"
            else:
                raise RoutingError(f"unsupported unit_type: {addr.unit_type}")
        else:
            raise RoutingError(f"unsupported address kind: {addr.kind}")

        return self._require(target, f"node {target} not found in topology")

    # ── Named node lookups ───────────────────────────────────────────

    def find_m_cpu(self, sip: int, cube: int) -> str:
        """Return the M_CPU node id of the given cube, or raise RoutingError."""
        target = f"sip{sip}.cube{cube}.m_cpu"
        return self._require(target, f"M_CPU not found: {target}")

    def find_pcie_ep(self, sip: int, io_id: str = "io0") -> str:
        """Return the PCIE_EP node id of the given IO block, or raise RoutingError."""
        target = f"sip{sip}.{io_id}.pcie_ep"
        return self._require(target, f"PCIE_EP not found: {target}")

    def find_io_cpu(self, sip: int, io_id: str = "io0") -> str:
        """Return the IO_CPU node id of the given IO block, or raise RoutingError."""
        target = f"sip{sip}.{io_id}.io_cpu"
        return self._require(target, f"IO_CPU not found: {target}")

    def find_all_pcie_eps(self) -> list[str]:
        """Return all PCIE_EP node ids across all SIPs, sorted."""
        endpoints = [nid for nid in self._node_ids if nid.endswith(".pcie_ep")]
        endpoints.sort()
        return endpoints
class PathRouter:
    """Find data-path from a source PE (or arbitrary node) to a destination node.

    Three adjacency maps are built from the graph edges:
      _adj          — excludes command edges (used by PE DMA routing, find_path)
      _adj_all      — includes all edges (used by component-to-component routing,
                      find_node_path; required because M_CPU↔NOC links are "command")
      _adj_mcpu_dma — excludes the edge kinds in _MCPU_DMA_EXCLUDE (used by
                      cross-cube M_CPU DMA routing, find_mcpu_dma_path)

    Each edge is weighted by ``routing_weight_mm`` when set, else ``distance_mm``.
    """

    # Edge kinds excluded from M_CPU DMA adjacency: prevents routing through
    # PE-internal pipeline nodes when computing DMA paths.
    _MCPU_DMA_EXCLUDE = {"pe_internal", "pe_to_xbar"}

    def __init__(self, graph: TopologyGraph) -> None:
        self._adj: dict[str, list[tuple[str, float]]] = defaultdict(list)
        self._adj_all: dict[str, list[tuple[str, float]]] = defaultdict(list)
        self._adj_mcpu_dma: dict[str, list[tuple[str, float]]] = defaultdict(list)
        for e in graph.edges:
            w = e.routing_weight_mm if e.routing_weight_mm is not None else e.distance_mm
            self._adj_all[e.src].append((e.dst, w))
            if e.kind != "command":
                self._adj[e.src].append((e.dst, w))
            if e.kind not in self._MCPU_DMA_EXCLUDE:
                self._adj_mcpu_dma[e.src].append((e.dst, w))

    def find_path(self, src_pe: str, dst_node: str) -> list[str]:
        """PE DMA routing: prepends .pe_dma, excludes command edges."""
        start = f"{src_pe}.pe_dma"
        return self._run_dijkstra(self._adj, start, dst_node)

    def find_path_with_distance(self, src_pe: str, dst_node: str) -> tuple[list[str], float]:
        """Like find_path, but also return the total weight of the path."""
        start = f"{src_pe}.pe_dma"
        return self._run_dijkstra_with_dist(self._adj, start, dst_node)

    def find_mcpu_dma_path(self, m_cpu_id: str, dst_hbm_slice_id: str) -> list[str]:
        """M_CPU DMA path: never routes through PE-internal nodes (ADR-0015 D5).

        Same-cube: deterministic [m_cpu, noc, xbar.pe_i, hbm_ctrl.slice_i].
        Cross-cube: Dijkstra via _adj_mcpu_dma (pe_internal/pe_to_xbar excluded)
                    → routes through NOC → UCIe → target cube NOC → xbar → HBM.
        """
        # The cube is identified by the first two dotted components of the id.
        m_cube = ".".join(m_cpu_id.split(".")[:2])
        d_cube = ".".join(dst_hbm_slice_id.split(".")[:2])

        if m_cube == d_cube:
            # The slice index is whatever follows the last "slice" in the id.
            slice_idx = int(dst_hbm_slice_id.rsplit("slice", 1)[1])
            return [
                m_cpu_id,
                f"{m_cube}.noc",
                f"{m_cube}.xbar.pe{slice_idx}",
                dst_hbm_slice_id,
            ]

        return self._run_dijkstra(self._adj_mcpu_dma, m_cpu_id, dst_hbm_slice_id)

    def find_node_path(self, src: str, dst: str) -> list[str]:
        """General routing between arbitrary nodes, including command edges.

        Used by components (IoCpuComponent, MCpuComponent) that route through
        M_CPU↔NOC command-kind links.
        """
        return self._run_dijkstra(self._adj_all, src, dst)

    def _run_dijkstra(
        self,
        adj: dict[str, list[tuple[str, float]]],
        start: str,
        goal: str,
    ) -> list[str]:
        """Return only the node list of the shortest path from start to goal."""
        path, _ = self._run_dijkstra_with_dist(adj, start, goal)
        return path

    def _run_dijkstra_with_dist(
        self,
        adj: dict[str, list[tuple[str, float]]],
        start: str,
        goal: str,
    ) -> tuple[list[str], float]:
        """Shortest path from start to goal over ``adj`` using Dijkstra.

        Returns:
            ``(path, distance)`` where path includes both endpoints.

        Raises:
            RoutingError: if goal cannot be reached from start.
        """
        if start == goal:
            return [start], 0.0

        best: dict[str, float] = {start: 0.0}
        prev: dict[str, str] = {}
        heap: list[tuple[float, str]] = [(0.0, start)]

        while heap:
            d, node = heapq.heappop(heap)
            if node == goal:
                # Walk the predecessor chain back to the start.
                path: list[str] = []
                cur = goal
                while cur != start:
                    path.append(cur)
                    cur = prev[cur]
                path.append(start)
                path.reverse()
                return path, d
            # Stale heap entry: a shorter route to this node was already found.
            if d > best.get(node, float("inf")):
                continue
            # Use .get() rather than adj[node]: indexing a defaultdict would
            # insert an empty list for every node without outgoing edges, and
            # a plain dict would raise KeyError for such nodes.
            for neighbor, edge_dist in adj.get(node, ()):
                new_d = d + edge_dist
                if new_d < best.get(neighbor, float("inf")):
                    best[neighbor] = new_d
                    prev[neighbor] = node
                    heapq.heappush(heap, (new_d, neighbor))

        raise RoutingError(f"no path from {start} to {goal}")

    # ── backward-compat shims (used by existing tests) ───────────────

    def _dijkstra(self, start: str, goal: str) -> list[str]:
        return self._run_dijkstra(self._adj, start, goal)

    def _dijkstra_with_dist(self, start: str, goal: str) -> tuple[list[str], float]:
        return self._run_dijkstra_with_dist(self._adj, start, goal)
+96
View File
@@ -0,0 +1,96 @@
from __future__ import annotations
from collections.abc import Callable
from enum import Enum
from typing import Any
from kernbench.common.types import Completion, SimEngine, Trace
from .context import RuntimeContext
from .types import BenchResult, DeviceSelector
class CompletionPolicy(str, Enum):
    """How run_bench derives the overall result from the submitted requests."""

    # Report the completion of the last handle that was submitted.
    LAST_SUBMITTED = "last_submitted"
    LAST_COMPLETED = "last_completed"  # requires trace/timestamps or engine support; stub for now
    # Report the first failing completion, or a synthetic ok if none failed.
    ALL_OK_FAIL_FAST = "all_ok_fail_fast"
# A bench body: receives the RuntimeContext and submits requests through it.
BenchFn = Callable[[RuntimeContext], Any]
# Builds the simulation engine from (topology, device selector).
EngineFactory = Callable[[object, DeviceSelector], SimEngine]
def run_bench(
    *,
    topology: object,
    bench_fn: BenchFn,
    device: DeviceSelector,
    engine_factory: EngineFactory,
    correlation_id: str = "bench0",
    completion_policy: CompletionPolicy = CompletionPolicy.LAST_SUBMITTED,
) -> BenchResult:
    """
    Minimal bench runner.

    Builds an engine, runs ``bench_fn`` against a fresh RuntimeContext, waits
    for every submitted request, then summarises the outcome.

    - topology: compiled topology object (opaque to runtime here)
    - bench_fn: callable that receives RuntimeContext and submits requests
    - device: DeviceSelector ("all" or "sip:<N>")
    - engine_factory: builds sim_engine for given topology & device
    - completion_policy: how to determine overall completion/result

    Returns a BenchResult. When the bench submitted nothing, the completion is
    a failure with error code ``NO_REQUESTS``.
    """
    engine = engine_factory(topology, device)

    # The spec may live on the topology itself or on a wrapped topology_obj.
    spec_owner = getattr(topology, "topology_obj", topology)
    ctx = RuntimeContext(
        engine=engine, target_device=device,
        correlation_id=correlation_id, spec=getattr(spec_owner, "spec", None),
    )

    bench_fn(ctx)
    ctx.wait_all()

    # An empty trace list is reported as None.
    gathered = ctx._traces or None
    submitted = ctx.handles()

    def _result(completion, trace):
        return BenchResult(
            completion=completion, correlation_id=correlation_id,
            trace=trace, traces=gathered,
        )

    if not submitted:
        return _result(
            Completion(
                ok=False, error_code="NO_REQUESTS", error_message="Bench submitted no requests"
            ),
            None,
        )

    if completion_policy == CompletionPolicy.ALL_OK_FAIL_FAST:
        # Scan in submission order; remember the latest non-None trace seen.
        latest_trace: Trace | None = None
        for handle in submitted:
            outcome, trace = engine.get_completion(handle)
            if trace is not None:
                latest_trace = trace
            if not outcome.ok:
                return _result(outcome, latest_trace)
        return _result(Completion(ok=True), latest_trace)

    # LAST_SUBMITTED, and LAST_COMPLETED as a placeholder (needs engine
    # support for timing), both report the last submitted handle.
    final_completion, final_trace = engine.get_completion(submitted[-1])
    return _result(final_completion, final_trace)
+282
View File
@@ -0,0 +1,282 @@
# kernbench/runtime_api/context.py
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any
from kernbench.common.types import Completion, RequestHandle, SimEngine
from .types import DeviceSelector
@dataclass
class RuntimeContext:
    """Per-bench facade over the simulation engine.

    Tracks every submitted request handle, caches which ones were already
    waited on, and offers a small PyTorch-like tensor API (``zeros``,
    ``empty``, ``launch``) that allocates device addresses and submits the
    corresponding messages.
    """

    engine: SimEngine
    target_device: DeviceSelector
    correlation_id: str
    # Topology spec dict; only required by the tensor API.
    spec: dict | None = None

    # Handles in submission order.
    _handles: list[RequestHandle] = field(default_factory=list, init=False)
    # Handles that wait() has already processed.
    _completed: set[RequestHandle] = field(default_factory=set, init=False)
    # Flat PE index -> PEMemAllocator, filled lazily by _ensure_allocators().
    _allocators: dict[int, Any] = field(default_factory=dict, init=False)
    # Counter behind auto-generated tensor names.
    _tensor_counter: int = field(default=0, init=False)
    # Trace entries recorded by wait() calls that passed _meta.
    _traces: list[dict] = field(default_factory=list, init=False)

    def submit(self, request: Any) -> RequestHandle:
        """Submit ``request`` to the engine and remember the returned handle.

        Raises:
            AttributeError: if the engine has no ``submit`` attribute.
        """
        submit_fn = getattr(self.engine, "submit", None)
        if submit_fn is None:
            raise AttributeError("Engine does not implement submit(request) -> RequestHandle.")
        handle: RequestHandle = submit_fn(request)  # type: ignore[call-arg]
        self._handles.append(handle)
        return handle

    def is_completed(self, handle: RequestHandle) -> bool:
        """Return True once ``wait`` has been called for ``handle``."""
        return handle in self._completed

    def wait(self, handle: RequestHandle, *, _meta: dict | None = None) -> Completion:
        """Wait for ``handle`` and return its completion.

        A handle that was already waited on is answered straight from the
        engine without waiting again and without recording a trace. When
        ``_meta`` is given and the engine returned a trace, a trace entry
        merged with ``_meta`` is appended to ``_traces``.
        """
        if handle in self._completed:
            completion, trace = self.engine.get_completion(handle)
            return completion

        # wait() is optional on the engine; without it the completion is
        # fetched directly.
        wait_fn = getattr(self.engine, "wait", None)
        if wait_fn is not None:
            wait_fn(handle)  # type: ignore[misc]

        completion, trace = self.engine.get_completion(handle)
        self._completed.add(handle)
        if _meta is not None and trace is not None:
            # Non-dict traces are wrapped so that _meta can be merged in.
            entry = dict(trace) if isinstance(trace, dict) else {"raw": trace}
            entry.update(_meta)
            self._traces.append(entry)
        return completion

    def wait_all(self) -> None:
        """Wait for every submitted handle that has not been waited on yet."""
        for h in self._handles:
            if h not in self._completed:
                self.wait(h)

    def handles(self) -> list[RequestHandle]:
        """Return a copy of the submitted handles in submission order."""
        return list(self._handles)

    # ── PyTorch-like tensor API ──────────────────────────────────────

    def _ensure_allocators(self) -> dict:
        """Lazily create PEMemAllocator instances from spec.

        Builds one allocator per (sip, cube, pe) on first use, keyed by the
        flat index ``sip * cubes_per_sip * pes_per_cube + cube * pes_per_cube
        + pe``. Also caches the topology counts on the instance as
        ``_pes_per_cube``, ``_num_cubes`` and ``_num_sips``.

        Raises:
            RuntimeError: if ``spec`` was not provided.
        """
        if self._allocators:
            return self._allocators

        if self.spec is None:
            raise RuntimeError(
                "RuntimeContext.spec is required for tensor operations. "
                "Pass spec=graph.spec when creating RuntimeContext."
            )

        from kernbench.policy.address.allocator import AddressConfig, PEMemAllocator

        # Every lookup below falls back to a default when the key is absent.
        system = self.spec.get("system", {})
        cube = self.spec.get("cube", {})
        mm = cube.get("memory_map", {})
        pe_template = cube.get("pe_template", {})
        pe_comps = pe_template.get("components", {})
        tcm_cfg = pe_comps.get("pe_tcm", {}).get("attrs", {})

        sip_count = system.get("sips", {}).get("count", 1)
        cubes_per_sip = system.get("sips", {}).get("cubes_per_sip", 16)
        # PEs per cube = PEs per corner times the number of corners.
        pes_per_cube = (
            cube.get("pe_layout", {}).get("pe_per_corner", 2)
            * len(cube.get("pe_layout", {}).get("corners", ["NW", "NE", "SW", "SE"]))
        )
        hbm_gb = mm.get("hbm_total_gb_per_cube", 48)
        hbm_slices = mm.get("hbm_slices_per_cube", 8)
        tcm_mb = tcm_cfg.get("size_mb", 16)

        cfg = AddressConfig(
            sip_count=sip_count,
            cubes_per_sip=cubes_per_sip,
            pes_per_cube=pes_per_cube,
            hbm_bytes_per_cube=hbm_gb * (1 << 30),
            hbm_slices_per_cube=hbm_slices,
            tcm_bytes_per_pe=tcm_mb * (1 << 20),
            # The scheduler reservation (4MB) and the SRAM size (32MB) are
            # fixed here and not read from spec.
            tcm_scheduler_reserved_bytes=4 * (1 << 20),
            sram_bytes_per_cube=32 * (1 << 20),
        )

        # Create allocators for all SIPs × cubes × PEs
        # Flat index: sip_id * cubes_per_sip * pes_per_cube + cube_id * pes_per_cube + pe_id
        self._pes_per_cube = pes_per_cube
        self._num_cubes = cubes_per_sip
        self._num_sips = sip_count
        cubes_x_pes = cubes_per_sip * pes_per_cube
        for sip_id in range(sip_count):
            for cube_id in range(cubes_per_sip):
                for pe_id in range(pes_per_cube):
                    flat_idx = sip_id * cubes_x_pes + cube_id * pes_per_cube + pe_id
                    self._allocators[flat_idx] = PEMemAllocator(
                        rack_id=0, sip_id=sip_id, cube_id=cube_id, pe_id=pe_id, cfg=cfg,
                    )
        return self._allocators

    def _next_tensor_name(self) -> str:
        """Return the next auto-generated tensor name: "t1", "t2", ..."""
        self._tensor_counter += 1
        return f"t{self._tensor_counter}"

    def zeros(
        self,
        shape: tuple[int, ...],
        dtype: str = "f16",
        *,
        placement: list | None = None,
        dp: Any = None,
        name: str | None = None,
    ):
        """Create a tensor and deploy to HBM with zero-fill (like torch.zeros)."""
        return self._create_tensor(shape, dtype, placement, name, pattern="zero", dp=dp)

    def empty(
        self,
        shape: tuple[int, ...],
        dtype: str = "f16",
        *,
        placement: list | None = None,
        dp: Any = None,
        name: str | None = None,
    ):
        """Allocate a tensor in HBM without initialization (like torch.empty)."""
        return self._create_tensor(shape, dtype, placement, name, pattern=None, dp=dp)

    def _create_tensor(
        self,
        shape: tuple[int, ...],
        dtype: str,
        placement: list | None,
        name: str | None,
        pattern: str | None,
        dp: Any = None,
    ):
        """Shared implementation behind ``zeros`` and ``empty``.

        Resolves the placement, allocates addresses for every shard and, when
        ``pattern`` is not None, submits one MemoryWriteMsg per shard and
        waits for each before submitting the next. Returns the Tensor.

        Placement priority: a ``dp`` that is a DPPolicy wins; otherwise the
        explicit ``placement``; otherwise one shard covering the whole tensor
        on PE index 0.
        """
        from kernbench.policy.placement.dp import DPPolicy, ShardSpec, resolve_dp_policy
        from kernbench.runtime_api.kernel import MemoryWriteMsg
        from kernbench.runtime_api.tensor import Tensor, deploy_tensor, dtype_itemsize

        tensor_name = name or self._next_tensor_name()
        t = Tensor(shape=shape, dtype=dtype, name=tensor_name)

        dp_policy: DPPolicy | None = None

        # Resolve placement: dp= takes priority over placement=
        if dp is not None and isinstance(dp, DPPolicy):
            dp_policy = dp
            # Called here for its side effect of setting the cached counts
            # (_num_sips, _num_cubes, _pes_per_cube) used just below.
            allocators = self._ensure_allocators()
            itemsize = dtype_itemsize(dtype)
            # Only the first two dimensions take part in DP resolution.
            shape_2d = (shape[0], shape[1])  # type: tuple[int, int]
            total_cubes = self._num_sips * self._num_cubes
            placement = resolve_dp_policy(
                dp, shape=shape_2d, itemsize=itemsize,
                num_pe=self._pes_per_cube, num_cubes=total_cubes,
            )
        elif placement is None:
            placement = [ShardSpec(pe_index=0, offset_bytes=0, nbytes=t.nbytes)]

        # Infer target_pe from placement: multi-PE → "all", single PE → pe_index
        pe_indices = {s.pe_index for s in placement}
        target_pe: int | str = "all" if len(pe_indices) > 1 else next(iter(pe_indices))

        t.to(placement=placement, target_pe=target_pe, dp_policy=dp_policy)

        # Allocate PAs via PEMemAllocator
        allocators = self._ensure_allocators()
        handle = deploy_tensor(
            name=tensor_name,
            shape=shape,
            dtype=dtype,
            placement=placement,
            allocators=allocators,
        )
        t._handle = handle

        # Submit MemoryWriteMsg per shard (deploy data to device)
        if pattern is not None:
            for shard in handle.shards:
                h = self.submit(MemoryWriteMsg(
                    correlation_id=self.correlation_id,
                    request_id=f"deploy_{tensor_name}_pe{shard.pe}",
                    dst_sip=shard.sip, dst_cube=shard.cube, dst_pe=shard.pe,
                    dst_pa=shard.pa, nbytes=shard.nbytes, pattern=pattern,
                    target_cubes=(shard.cube,), target_pe=shard.pe,
                ))
                # Each write is awaited before the next shard is submitted.
                self.wait(h, _meta={
                    "phase": "memory_write", "name": tensor_name,
                    "sip": shard.sip, "cube": shard.cube, "pe": shard.pe,
                    "nbytes": shard.nbytes,
                })

        return t

    def launch(
        self,
        kernel_name: str,
        kernel_fn: Any,
        *args: Any,
        **kwargs: Any,
    ) -> RequestHandle:
        """Register and launch a kernel (like a fused torch op).

        Positional args: Tensor objects become TensorArg, int/float become ScalarArg.
        Keyword args: become ScalarArg (name is discarded, order preserved).
        Arguments of any other type are skipped.

        The call waits for the launch to complete before returning its handle.
        """
        from kernbench.runtime_api.kernel import (
            KernelLaunchMsg,
            KernelRef,
            ScalarArg,
        )
        from kernbench.runtime_api.tensor import Tensor
        from kernbench.triton_emu.registry import register_kernel

        # Register kernel (idempotent)
        # NOTE(review): ValueError is presumably what register_kernel raises
        # for an already-registered name -- confirm it has no other meaning.
        try:
            register_kernel(kernel_name, kernel_fn)
        except ValueError:
            pass

        # Build kernel args from positional + keyword args
        kernel_args: list = []
        target_pe: int | str = 0
        for a in args:
            if isinstance(a, Tensor):
                kernel_args.append(a.to_tensor_arg())
                # Infer target_pe from tensor DP metadata
                if a._dp_metadata is not None:
                    dp_target = a._dp_metadata.target_pe
                    # "all" is sticky: once any tensor asks for all PEs, a
                    # later single-PE tensor does not narrow it again.
                    if dp_target == "all":
                        target_pe = "all"
                    elif isinstance(dp_target, int) and target_pe != "all":
                        target_pe = dp_target
            elif isinstance(a, (int, float)):
                dtype_str = "f32" if isinstance(a, float) else "i32"
                kernel_args.append(ScalarArg(dtype=dtype_str, value=a))
        for v in kwargs.values():
            if isinstance(v, (int, float)):
                dtype_str = "f32" if isinstance(v, float) else "i32"
                kernel_args.append(ScalarArg(dtype=dtype_str, value=v))

        # Determine target cubes from all tensor shards
        cube_set: set[int] = set()
        for a in args:
            if isinstance(a, Tensor) and a._handle is not None:
                for s in a._handle.shards:
                    cube_set.add(s.cube)
        # Without any deployed tensor argument the launch targets cube 0.
        target_cubes = tuple(sorted(cube_set)) if cube_set else (0,)

        # Collect scalar values for GEMM FLOP calculation
        scalar_vals = [a.value for a in kernel_args if hasattr(a, "value")]

        h = self.submit(KernelLaunchMsg(
            correlation_id=self.correlation_id,
            request_id=kernel_name,
            kernel_ref=KernelRef(name=kernel_name, kind="builtin"),
            args=tuple(kernel_args),
            target_cubes=target_cubes,
            target_pe=target_pe,
        ))
        self.wait(h, _meta={
            "phase": "kernel", "name": kernel_name,
            "target_pe": target_pe, "scalars": scalar_vals,
        })
        return h
+123
View File
@@ -0,0 +1,123 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Literal, TypeAlias
@dataclass(frozen=True)
class MemoryWriteMsg:
    """Request to write ``nbytes`` at physical address ``dst_pa`` on a device."""

    correlation_id: str
    request_id: str
    # Destination coordinates and physical address.
    dst_sip: int
    dst_cube: int
    dst_pe: int
    dst_pa: int
    nbytes: int
    # Where the data comes from; "pattern" pairs with the ``pattern`` field.
    src_kind: Literal["pattern", "host_buffer_ref"] = "pattern"
    pattern: str | None = None
    # Routing targets; "all" is the default for both.
    target_cubes: tuple[int, ...] | Literal["all"] = "all"
    target_pe: int | Literal["all"] = "all"
    # Fixed discriminator for this message type.
    msg_type: Literal["memory_write"] = "memory_write"
@dataclass(frozen=True)
class MemoryReadMsg:
    """Request to read ``nbytes`` from physical address ``src_pa`` on a device."""

    correlation_id: str
    request_id: str
    # Source coordinates and physical address.
    src_sip: int
    src_cube: int
    src_pe: int
    src_pa: int
    nbytes: int
    # Routing targets; "all" is the default for both.
    target_cubes: tuple[int, ...] | Literal["all"] = "all"
    target_pe: int | Literal["all"] = "all"
    # Fixed discriminator for this message type.
    msg_type: Literal["memory_read"] = "memory_read"
@dataclass(frozen=True)
class KernelRef:
    """Reference to a kernel binary or builtin timing model.

    Kernel binaries must be pre-deployed to device memory via MemoryWriteMsg.
    KernelLaunchMsg references the deployed location by PA — source code or IR
    MUST NOT be embedded in launch messages.

    - "deployed": kernel binary pre-deployed to HBM/SRAM at deploy_pa.
    - "builtin": simulator built-in timing model, identified by name.
    """

    name: str
    kind: Literal["deployed", "builtin"]
    # Location and size of the deployed binary; these keep their defaults for
    # "builtin" kernels, which are identified by name only.
    deploy_pa: int | None = None
    deploy_sip: int = 0
    deploy_cube: int = 0
    deploy_pe: int = 0
    nbytes_code: int = 0
@dataclass(frozen=True)
class TensorArgShard:
sip: int
cube: int
pe: int
pa: int
nbytes: int
offset_bytes: int
@dataclass(frozen=True)
class TensorArg:
shards: tuple[TensorArgShard, ...]
arg_kind: Literal["tensor"] = "tensor"
@dataclass(frozen=True)
class ScalarArg:
dtype: str
value: float | int
arg_kind: Literal["scalar"] = "scalar"
# Any argument a KernelLaunchMsg can carry. Both members expose an
# ``arg_kind`` literal ("tensor" / "scalar") that consumers can branch on.
KernelArg: TypeAlias = TensorArg | ScalarArg
@dataclass(frozen=True)
class KernelLaunchMsg:
correlation_id: str
request_id: str
kernel_ref: KernelRef
args: tuple[KernelArg, ...]
target_cubes: tuple[int, ...] | Literal["all"] = "all"
target_pe: int | Literal["all"] = "all"
msg_type: Literal["kernel_launch"] = "kernel_launch"
@dataclass(frozen=True)
class ResponseMsg:
"""Device→Host response carrying PE execution result."""
correlation_id: str
request_id: str
src_cube: int
src_pe: int
success: bool
msg_type: Literal["response"] = "response"
@dataclass(frozen=True)
class PeDmaMsg:
"""Direct PE DMA request: host injects a transfer at PE_DMA level.
Used by the probe utility to measure PE→HBM latency without requiring
the full PE_CPU → scheduler → DMA pipeline.
"""
correlation_id: str
request_id: str
src_sip: int
src_cube: int
src_pe: int
dst_pa: int
nbytes: int
is_write: bool = False
msg_type: Literal["pe_dma"] = "pe_dma"
+166
View File
@@ -0,0 +1,166 @@
from __future__ import annotations
import math
from dataclasses import dataclass
from typing import Literal
from kernbench.policy.address.allocator import PEMemAllocator
from kernbench.policy.placement.dp import DPPolicy, ShardSpec
from kernbench.runtime_api.kernel import TensorArg, TensorArgShard
@dataclass(frozen=True)
class TensorShard:
    """One deployed piece of a tensor: where it lives and which bytes it holds."""

    # Location of the shard.
    sip: int
    cube: int
    pe: int
    pa: int

    # Shard size and its byte offset.
    nbytes: int
    offset_bytes: int
@dataclass(frozen=True)
class TensorHandle:
    """Immutable record of a deployed tensor and the shards that back it."""

    name: str
    shape: tuple[int, ...]
    dtype: str
    itemsize: int  # bytes per element
    shards: tuple[TensorShard, ...]

    @property
    def nbytes(self) -> int:
        """Total tensor size in bytes: element count times ``itemsize``."""
        element_count = math.prod(self.shape)
        return element_count * self.itemsize
_DTYPE_ITEMSIZE = {
"fp16": 2, "float16": 2, "f16": 2,
"fp32": 4, "float32": 4, "f32": 4,
"bf16": 2,
"int8": 1, "i8": 1,
"int16": 2, "i16": 2,
"int32": 4, "i32": 4,
}
def dtype_itemsize(dtype: str) -> int:
if dtype not in _DTYPE_ITEMSIZE:
raise ValueError(f"unsupported dtype: {dtype}")
return _DTYPE_ITEMSIZE[dtype]
def deploy_tensor(
    *,
    name: str,
    shape: tuple[int, ...],
    dtype: str,
    placement: list[ShardSpec],
    allocators: dict[int, PEMemAllocator],
    mem_kind: Literal["hbm", "tcm"] = "hbm",
) -> TensorHandle:
    """Allocate memory for every shard in ``placement`` and return a handle.

    Args:
        name: Tensor name stored on the returned handle.
        shape: Tensor shape stored on the returned handle.
        dtype: Element type name; must be known to ``dtype_itemsize``.
        placement: One ShardSpec per shard; each supplies ``pe_index``,
            ``nbytes`` and ``offset_bytes``.
        allocators: Allocator lookup keyed by ``ShardSpec.pe_index``.
        mem_kind: ``"hbm"`` allocates via ``alloc_hbm``, ``"tcm"`` via
            ``alloc_tcm``.

    Returns:
        A TensorHandle whose shards follow the order of ``placement``.

    Raises:
        ValueError: if ``mem_kind`` or ``dtype`` is not supported.
        KeyError: if a shard's ``pe_index`` has no allocator.
    """
    # Reject unknown memory kinds up front. Previously any value other than
    # "hbm" (including typos) silently fell through to a TCM allocation.
    if mem_kind not in ("hbm", "tcm"):
        raise ValueError(
            f"unsupported mem_kind: {mem_kind!r} (expected 'hbm' or 'tcm')"
        )
    # Resolve the element size before allocating so a bad dtype fails early.
    isize = dtype_itemsize(dtype)

    shards: list[TensorShard] = []
    for spec in placement:
        alloc = allocators[spec.pe_index]
        if mem_kind == "hbm":
            pa = alloc.alloc_hbm(spec.nbytes)
        else:
            pa = alloc.alloc_tcm(spec.nbytes)
        shards.append(TensorShard(
            # The allocator carries the location ids of the PE it serves.
            sip=alloc._sip_id,
            cube=alloc._cube_id,
            pe=alloc._pe_id,
            pa=pa.encode(),
            nbytes=spec.nbytes,
            offset_bytes=spec.offset_bytes,
        ))

    return TensorHandle(
        name=name,
        shape=shape,
        dtype=dtype,
        itemsize=isize,
        shards=tuple(shards),
    )
# ── PyTorch-like Tensor API ──────────────────────────────────────────
@dataclass(frozen=True)
class DPMetadata:
    """Data-parallel placement metadata (stored as Tensor._dp_metadata)."""

    # Shard layout for the tensor.
    placement: list[ShardSpec]

    # Policy that produced the placement, when one was supplied.
    dp_policy: DPPolicy | None = None

    # Target location.
    sip: int = 0
    cube: int = 0
    target_pe: int | str = 0  # int → single PE, "all" → all PEs
class Tensor:
    """PyTorch-like tensor for benchmark code.

    Usage::

        a = ctx.zeros((M, K), dtype="f16")
        a = ctx.zeros((M, K), dtype="f16", placement=dp.replicate(num_pe=8))
        ctx.launch("kernel_name", kernel_fn, a, b, out, M=M, K=K)
    """

    def __init__(
        self,
        shape: tuple[int, ...],
        dtype: str = "f16",
        name: str = "",
    ) -> None:
        """Record shape/dtype/name; the tensor starts undeployed and unplaced."""
        self.shape = shape
        self.dtype = dtype
        self.name = name
        # Filled in by to() and by deployment respectively.
        self._dp_metadata: DPMetadata | None = None
        self._handle: TensorHandle | None = None

    @property
    def itemsize(self) -> int:
        """Bytes per element, looked up from ``dtype``."""
        return dtype_itemsize(self.dtype)

    @property
    def nbytes(self) -> int:
        """Total size in bytes: element count times ``itemsize``."""
        element_count = math.prod(self.shape)
        return element_count * self.itemsize

    @property
    def pa(self) -> int:
        """Primary PA (first shard). Used as kernel pointer argument."""
        handle = self._handle
        if handle is not None and handle.shards:
            return handle.shards[0].pa
        raise RuntimeError(f"Tensor '{self.name}' is not deployed yet")

    def to(
        self,
        placement: list[ShardSpec] | None = None,
        *,
        dp_policy: DPPolicy | None = None,
        sip: int = 0,
        cube: int = 0,
        target_pe: int | str = 0,
    ) -> Tensor:
        """Set DP placement metadata (like torch.Tensor.to()) and return self.

        When ``placement`` is omitted, a single shard covering the whole
        tensor on PE index 0 is used.
        """
        if placement is None:
            whole = ShardSpec(pe_index=0, offset_bytes=0, nbytes=self.nbytes)
            placement = [whole]
        self._dp_metadata = DPMetadata(
            placement=placement,
            dp_policy=dp_policy,
            sip=sip,
            cube=cube,
            target_pe=target_pe,
        )
        return self

    def to_tensor_arg(self) -> TensorArg:
        """Convert deployed shards to a KernelLaunchMsg TensorArg.

        Raises:
            RuntimeError: if the tensor has not been deployed.
        """
        handle = self._handle
        if handle is None:
            raise RuntimeError(f"Tensor '{self.name}' is not deployed yet")
        arg_shards = []
        for shard in handle.shards:
            arg_shards.append(TensorArgShard(
                sip=shard.sip,
                cube=shard.cube,
                pe=shard.pe,
                pa=shard.pa,
                nbytes=shard.nbytes,
                offset_bytes=shard.offset_bytes,
            ))
        return TensorArg(shards=tuple(arg_shards))
+71
View File
@@ -0,0 +1,71 @@
from __future__ import annotations
import re
from dataclasses import dataclass
from kernbench.common.types import Completion, Trace
@dataclass(frozen=True)
class BenchResult:
    """Outcome of one benchmark run plus optional trace data."""

    completion: Completion
    correlation_id: str
    trace: Trace | None = None
    traces: list[dict] | None = None

    def summary_text(self) -> str:
        """Return a one-line ``[OK]`` / ``[FAIL:<code>]`` summary.

        For failures the error code falls back to ``ERROR`` and the message
        to an empty string; trailing whitespace is stripped.
        """
        outcome = self.completion
        if not outcome.ok:
            code = outcome.error_code or "ERROR"
            msg = outcome.error_message or ""
            return f"[FAIL:{code}] correlation_id={self.correlation_id} {msg}".rstrip()
        return f"[OK] correlation_id={self.correlation_id}"
@dataclass(frozen=True)
class DeviceSelector:
    """
    Device selector.

    Supported:
      - "all"     : all SIPs in the tray topology
      - "sip:<N>" : a single SIP index
    """

    raw: str  # "all" or "sip:<N>"

    @property
    def is_all(self) -> bool:
        """True when the selector is exactly the string "all"."""
        return self.raw == "all"

    @property
    def sip_index(self) -> int:
        """Return N for a "sip:<N>" selector.

        Raises:
            ValueError: if the selector is "all" or is not of the form "sip:<N>".
        """
        if self.is_all:
            raise ValueError("DeviceSelector is 'all'; no single sip_index.")
        parsed = re.fullmatch(r"sip:(\d+)", self.raw)
        if parsed is not None:
            return int(parsed.group(1))
        raise ValueError(
            f"Invalid device '{self.raw}'. Expected 'all' or 'sip:<N>' (e.g., sip:0)."
        )
def resolve_device(raw: str | None) -> DeviceSelector:
    """
    Resolve the CLI --device string into a DeviceSelector.

    Semantics:
      - omitted / empty / whitespace-only -> "all"
      - otherwise accept "all" or "sip:<N>" (case-insensitive, surrounding
        whitespace ignored)

    Raises:
        ValueError: for any other value.
    """
    token = raw.strip().lower() if raw is not None else ""
    if token in ("", "all"):
        return DeviceSelector(raw="all")
    if re.fullmatch(r"sip:(\d+)", token) is None:
        raise ValueError(f"Invalid device '{token}'. Expected 'all' or 'sip:<N>' (e.g., sip:0).")
    return DeviceSelector(raw=token)
+31
View File
@@ -0,0 +1,31 @@
# kernbench/engine/dummy.py
from __future__ import annotations
from dataclasses import dataclass
from typing import Any
from kernbench.common.types import Completion, RequestHandle, SimEngine, Trace
@dataclass
class DummyEngine(SimEngine):
    """Stand-in engine: every submitted request completes successfully at once."""

    topology: object
    device_raw: str
    _n: int = 0  # number of requests submitted so far
    _store: dict[str, tuple[Completion, Trace | None]] = None  # type: ignore

    def __post_init__(self) -> None:
        # Always start from a fresh per-instance result store.
        self._store = {}

    def submit(self, request: Any) -> RequestHandle:
        """Record an OK completion for ``request`` and return its handle."""
        self._n += 1
        handle = RequestHandle(f"h{self._n}")
        # Request processing / simulation / scheduling would be performed here.
        trace = {"request": request, "device": self.device_raw}
        self._store[str(handle)] = (Completion(ok=True), trace)
        return handle

    def get_completion(self, handle: RequestHandle) -> tuple[Completion, Trace | None]:
        """Return the (completion, trace) pair stored for ``handle``."""
        key = str(handle)
        return self._store[key]

    def wait(self, handle: RequestHandle) -> None:
        """No-op: results are stored at submit time."""
        return None
+298
View File
@@ -0,0 +1,298 @@
from __future__ import annotations
from typing import Any
import simpy
from kernbench.common.types import Completion, RequestHandle, Trace
import kernbench.components.impls # noqa: F401 — registers built-in implementations
from kernbench.components.base import ComponentBase, ComponentRegistry
from kernbench.components.context import ComponentContext
from kernbench.policy.address.phyaddr import PhysAddr
from kernbench.policy.routing.router import AddressResolver, PathRouter
from kernbench.runtime_api.kernel import KernelLaunchMsg, MemoryReadMsg, MemoryWriteMsg, PeDmaMsg
from kernbench.sim_engine.transaction import Transaction
from kernbench.topology.types import Edge, TopologyGraph
class GraphEngine:
    """Discrete-event simulation engine built on simpy.

    Phase B: for each request the engine injects a Transaction into the
    PCIE_EP host queue. Components handle their own routing:

        Path 1: PCIE_EP → IO_CPU   (engine-computed path, pre-loaded in Transaction)
        Path 2: IO_CPU → M_CPU     (IO_CPU dispatches, fire-and-forget callback)
        Path 3: M_CPU.DMA → HBM    (M_CPU dispatches, fire-and-forget callback)

    Component implementations are DI-injectable via ``component_overrides``
    (ADR-0007 D3).
    """

    def __init__(
        self,
        graph: TopologyGraph,
        *,
        component_overrides: dict[str, type[ComponentBase]] | None = None,
    ) -> None:
        """Build components, wire their ports and start them.

        Args:
            graph: Compiled topology (nodes, edges, raw spec).
            component_overrides: Optional implementation overrides handed to
                ``ComponentRegistry.create``.
        """
        self._env = simpy.Environment()
        self._resolver = AddressResolver(graph)
        self._router = PathRouter(graph)
        self._nodes = graph.nodes
        # (src, dst) → Edge lookup; a later duplicate pair replaces an earlier one.
        self._edge_map: dict[tuple[str, str], Edge] = {
            (edge.src, edge.dst): edge for edge in graph.edges
        }
        self._ns_per_mm: float = graph.spec.get("system", {}).get("ns_per_mm", 0.01)
        self._results: dict[str, tuple[Completion, Trace]] = {}
        self._events: dict[str, simpy.Event] = {}
        self._counter = 0

        overrides = component_overrides or {}
        context = ComponentContext(
            router=self._router,
            resolver=self._resolver,
            positions={node_id: node.pos_mm for node_id, node in graph.nodes.items()},
            ns_per_mm=self._ns_per_mm,
            edge_map=self._edge_map,
            spec=graph.spec,
        )
        self._components: dict[str, ComponentBase] = {}
        for node_id, node in graph.nodes.items():
            self._components[node_id] = ComponentRegistry.create(node, overrides, context)

        # Edges whose two endpoints both have a component; others are ignored.
        wired = []
        for edge in graph.edges:
            src_comp = self._components.get(edge.src)
            dst_comp = self._components.get(edge.dst)
            if src_comp is None or dst_comp is None:
                continue
            wired.append((edge, src_comp, dst_comp))

        # Wire ports: one Store per directed edge (ADR-0015 D1).
        # NOTE(review): the same Store is registered as the source's out-port
        # and the destination's in-port, so the wire process below reads from
        # and writes back to the very queue the destination component reads.
        # Whether that is safe depends on how components consume in_ports
        # (not visible here) — confirm, especially for zero-delay edges.
        for edge, src_comp, dst_comp in wired:
            channel: simpy.Store = simpy.Store(self._env)
            src_comp.out_ports[edge.dst] = channel
            dst_comp.in_ports[edge.src] = channel

        # Wire processes: propagation delay per edge (ADR-0015 D2).
        # Cut-through (wormhole) model: wires apply propagation only.
        # Serialization (drain) is computed per-path and applied once at the terminal.
        for edge, src_comp, dst_comp in wired:
            delay_ns = edge.distance_mm * self._ns_per_mm
            self._env.process(
                self._wire(
                    src_comp.out_ports[edge.dst],
                    dst_comp.in_ports[edge.src],
                    delay_ns,
                )
            )

        # Attach host queues to PCIE_EP in_ports before start() (ADR-0015 D3).
        self._host_queues: dict[str, simpy.Store] = {}
        for pcie_ep_id in self._resolver.find_all_pcie_eps():
            ep_queue: simpy.Store = simpy.Store(self._env)
            self._components[pcie_ep_id].in_ports["host"] = ep_queue
            self._host_queues[pcie_ep_id] = ep_queue

        # Attach host queues to PE_DMA nodes for direct PE DMA injection.
        self._pe_dma_queues: dict[str, simpy.Store] = {}
        for node_id, node in graph.nodes.items():
            if node.kind != "pe_dma":
                continue
            dma_queue = simpy.Store(self._env)
            self._components[node_id].in_ports["host"] = dma_queue
            self._pe_dma_queues[node_id] = dma_queue

        # Start components after all ports are wired (ADR-0015 D3).
        for component in self._components.values():
            component.start(self._env)

    def submit(self, request: Any) -> RequestHandle:
        """Schedule ``request`` for simulation and return a handle to it."""
        self._counter += 1
        handle = RequestHandle(f"h{self._counter}")
        key = str(handle)
        completed = self._env.event()
        self._events[key] = completed
        self._env.process(self._process(key, request, completed))
        return handle

    def wait(self, handle: RequestHandle) -> None:
        """Run the simulation until the request behind ``handle`` completes."""
        pending = self._events[str(handle)]
        if pending.triggered:
            return
        self._env.run(until=pending)

    def get_completion(self, handle: RequestHandle) -> tuple[Completion, Trace | None]:
        """Return the stored (completion, trace) pair for ``handle``."""
        key = str(handle)
        return self._results[key]

    # ── internal ────────────────────────────────────────────────────

    def _wire(
        self,
        out_port: simpy.Store,
        in_port: simpy.Store,
        prop_ns: float,
    ):
        """SimPy process: relay messages with propagation delay only.

        Cut-through (wormhole) model: serialization (drain) is computed per-path
        and applied once at the terminal component, not at every wire hop.
        """
        has_delay = prop_ns > 0
        while True:
            message = yield out_port.get()
            if has_delay:
                yield self._env.timeout(prop_ns)
            yield in_port.put(message)

    def _process(self, key: str, request: Any, done: simpy.Event):
        """SimPy process: inject ``request``, wait for completion, store the result."""
        if isinstance(request, PeDmaMsg):
            yield from self._process_pe_dma(key, request, done)
            return

        entries = self._entry_points(request)
        if not entries:
            # Nothing to inject: complete immediately with an empty trace.
            self._results[key] = (Completion(ok=True), {"total_ns": 0.0, "nbytes": 0})
            done.succeed()
            return

        started_at = self._env.now
        total_nbytes = 0
        root_txn: Transaction | None = None

        if len(entries) == 1:
            # Single-SIP: direct inject (common path, no extra events).
            pcie_ep_id, io_cpu_id, nbytes = entries[0]
            total_nbytes = nbytes
            route = self._router.find_node_path(pcie_ep_id, io_cpu_id)
            leg_done = self._env.event()
            root_txn = Transaction(
                request=request, path=route, step=0, nbytes=nbytes, done=leg_done,
            )
            yield self._host_queues[pcie_ep_id].put(root_txn)
            yield leg_done
        else:
            # Multi-SIP: inject per SIP, aggregate completions (ADR-0007).
            pending_dones: list[simpy.Event] = []
            injected: list[Transaction] = []
            for pcie_ep_id, io_cpu_id, nbytes in entries:
                total_nbytes = max(total_nbytes, nbytes)
                route = self._router.find_node_path(pcie_ep_id, io_cpu_id)
                leg_done = self._env.event()
                sub_txn = Transaction(
                    request=request, path=route, step=0,
                    nbytes=nbytes, done=leg_done,
                )
                yield self._host_queues[pcie_ep_id].put(sub_txn)
                pending_dones.append(leg_done)
                injected.append(sub_txn)
            for leg_done in pending_dones:
                yield leg_done

            # Aggregate pe_exec_ns from multi-SIP (max). root_txn is still
            # unset on this branch, so the first sub-transaction carries it.
            pe_times = [
                value
                for value in (txn.result_data.get("pe_exec_ns") for txn in injected)
                if value is not None
            ]
            if pe_times:
                root_txn = injected[0]
                root_txn.result_data["pe_exec_ns"] = max(pe_times)

        elapsed_ns = self._env.now - started_at
        trace: dict[str, Any] = {"total_ns": elapsed_ns, "nbytes": total_nbytes}
        if root_txn is not None and root_txn.result_data:
            trace.update(root_txn.result_data)
        self._results[key] = (Completion(ok=True), trace)
        done.succeed()

    def _process_pe_dma(self, key: str, request: PeDmaMsg, done: simpy.Event):
        """Inject a Transaction directly at PE_DMA for PE→HBM latency measurement."""
        pe_prefix = f"sip{request.src_sip}.cube{request.src_cube}.pe{request.src_pe}"
        pe_dma_id = f"{pe_prefix}.pe_dma"

        target = self._resolver.resolve(PhysAddr.decode(request.dst_pa))
        route = self._router.find_path(pe_prefix, target)
        drain_ns = self._path_drain_ns(route, request.nbytes)

        started_at = self._env.now
        leg_done = self._env.event()
        txn = Transaction(
            request=request, path=route, step=0, nbytes=request.nbytes,
            done=leg_done, drain_ns=drain_ns,
        )
        yield self._pe_dma_queues[pe_dma_id].put(txn)
        yield leg_done

        elapsed_ns = self._env.now - started_at
        formula_ns = self._formula_latency(route, request.nbytes)
        self._results[key] = (
            Completion(ok=True),
            {"total_ns": elapsed_ns, "formula_ns": formula_ns, "nbytes": request.nbytes},
        )
        done.succeed()

    def _path_drain_ns(self, path: list[str], nbytes: int) -> float:
        """Wormhole drain time: nbytes / bottleneck_bw along path.

        Hops without a known edge or without a bandwidth are ignored; if no
        hop has a bandwidth the drain time is 0.0.
        """
        unbounded = float("inf")
        bottleneck = unbounded
        for hop in zip(path, path[1:]):
            edge = self._edge_map.get(hop)
            if edge and edge.bw_gbs and edge.bw_gbs < bottleneck:
                bottleneck = edge.bw_gbs
        if bottleneck == unbounded:
            return 0.0
        return nbytes / bottleneck

    def _formula_latency(self, path: list[str], nbytes: int) -> float:
        """Lower-bound formula latency (ADR-0015 D7).

        formula = Σ(wire propagation) + Σ(component overhead_ns) + drain_ns

        Phase 0: formula == actual (no contention).
        Phase 1+: formula <= actual (contention adds queueing).
        """
        latency = 0.0
        # Wire propagation delays.
        for hop in zip(path, path[1:]):
            edge = self._edge_map.get(hop)
            if edge:
                latency += edge.distance_mm * self._ns_per_mm
        # Component overhead_ns.
        for node_id in path:
            node = self._nodes.get(node_id)
            if node:
                latency += float(node.attrs.get("overhead_ns", 0.0))
        # Drain.
        latency += self._path_drain_ns(path, nbytes)
        return latency

    def _entry_points(self, request: Any) -> list[tuple[str, str, int]]:
        """Return list of (pcie_ep_id, io_cpu_id, nbytes) per target SIP.

        For Memory{Write,Read}: single SIP entry.
        For KernelLaunchMsg: one entry per distinct SIP in tensor shards, sized
        by the first shard seen for that SIP.

        Raises:
            ValueError: for any other request type.
        """
        def entry_for(sip: int, nbytes: int) -> tuple[str, str, int]:
            return (
                self._resolver.find_pcie_ep(sip),
                self._resolver.find_io_cpu(sip),
                nbytes,
            )

        if isinstance(request, MemoryWriteMsg):
            return [entry_for(request.dst_sip, request.nbytes)]
        if isinstance(request, MemoryReadMsg):
            return [entry_for(request.src_sip, request.nbytes)]
        if isinstance(request, KernelLaunchMsg):
            visited: set[int] = set()
            found: list[tuple[str, str, int]] = []
            tensor_args = (arg for arg in request.args if arg.arg_kind == "tensor")
            for arg in tensor_args:
                for shard in arg.shards:
                    if shard.sip in visited:
                        continue
                    visited.add(shard.sip)
                    found.append(entry_for(shard.sip, shard.nbytes))
            return found
        raise ValueError(f"unsupported request type: {type(request)}")
+49
View File
@@ -0,0 +1,49 @@
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any
import simpy
@dataclass
class Transaction:
    """In-flight request traversing the device fabric hop-by-hop (ADR-0015 D4).

    A Transaction carries a host request through one leg of the device fabric.
    Each component on the path reads from its in_port, processes (overhead_ns or
    other latency), and advances the Transaction to the next hop via out_port.
    Wire processes (ADR-0015 D2) model propagation delay between hops.

    Multi-leg flows (e.g. IO_CPU → M_CPU as leg 1, M_CPU.DMA → HBM as leg 2)
    use separate Transactions: the terminal component of leg 1 creates leg 2
    and waits for leg 2's done before succeeding leg 1's done.
    """

    request: Any              # original host request (MemoryReadMsg, KernelLaunchMsg, …)
    path: list[str]           # node_id sequence for this leg
    step: int                 # index of the component currently holding this Transaction
    nbytes: int               # payload size (bytes)
    done: simpy.Event         # succeeded when this leg completes
    drain_ns: float = 0.0     # wormhole drain time: nbytes / bottleneck_bw (applied once at terminal)
    is_response: bool = False # True when carrying ResponseMsg on reverse path
    result_data: dict[str, Any] = field(default_factory=dict)  # PE-level metrics (pe_exec_ns, etc.)

    @property
    def next_hop(self) -> str | None:
        """Node id of the next component, or None if this is the terminal hop."""
        following = self.step + 1
        if following < len(self.path):
            return self.path[following]
        return None

    def advance(self) -> Transaction:
        """Return a copy of this Transaction advanced one step along the path.

        The copy shares ``path``, ``done`` and ``result_data`` with the original.
        """
        carried = dict(
            request=self.request,
            path=self.path,
            nbytes=self.nbytes,
            done=self.done,
            drain_ns=self.drain_ns,
            is_response=self.is_response,
            result_data=self.result_data,
        )
        return Transaction(step=self.step + 1, **carried)
View File
+965
View File
@@ -0,0 +1,965 @@
# kernbench/topology/builder.py
"""
Topology compiler: parses topology.yaml and produces a fully-instantiated
TopologyGraph with nodes, edges, and representative view projections.
"""
from __future__ import annotations
from pathlib import Path
from typing import Any
import yaml
from .types import Edge, Node, TopologyGraph, TopologyHandle, ViewGraph
# PE component offsets from PE center (small, intra-PE distances ~0.5mm).
# Each value is an (x, y) offset that _instantiate_cube adds to the PE center
# position; component names missing from this table get (0.0, 0.0) there.
_PE_COMP_OFFSETS = {
    "pe_cpu": (-0.3, 0.0),
    "pe_scheduler": (-0.15, 0.0),
    "pe_dma": (0.0, -0.15),
    "pe_gemm": (0.0, 0.0),
    "pe_math": (0.0, 0.15),
    "pe_tcm": (0.3, 0.0),
}
# ── Public API ───────────────────────────────────────────────────────
def resolve_topology(path_str: str) -> TopologyHandle:
    """Validate ``path_str`` and build the compiled topology graph.

    The path is user-expanded and resolved before being checked.

    Raises:
        FileNotFoundError: if the resolved path does not exist.
        ValueError: if the resolved path exists but is not a regular file.
    """
    resolved = Path(path_str).expanduser().resolve()
    if not resolved.exists():
        raise FileNotFoundError(f"Topology file not found: {resolved}")
    if not resolved.is_file():
        raise ValueError(f"Topology path is not a file: {resolved}")
    return TopologyHandle(path=resolved, topology_obj=load_topology(resolved))
def load_topology(path: Path) -> TopologyGraph:
    """Read the spec at ``path``, validate it, and compile it into a graph."""
    raw_spec = _read_spec(path)
    _validate_spec(raw_spec)
    compiled = _compile_graph(raw_spec)
    return compiled
def _read_spec(path: Path) -> dict[str, Any]:
    """Read a YAML topology spec file and return its root mapping.

    Raises:
        ValueError: if the YAML cannot be parsed (line/column are included
            when the parser reports them), if the document is empty, or if
            its root is not a mapping.
    """
    try:
        with path.open("r", encoding="utf-8") as handle:
            loaded = yaml.safe_load(handle)
    except yaml.YAMLError as exc:
        where = getattr(exc, "problem_mark", None)
        # Marks are zero-based; report one-based positions.
        location = (
            f" (line {where.line + 1}, column {where.column + 1})"
            if where is not None
            else ""
        )
        raise ValueError(f"Failed to parse YAML topology: {path}" + location) from exc

    if loaded is None:
        raise ValueError(f"Topology YAML is empty: {path}")
    if isinstance(loaded, dict):
        return loaded
    raise ValueError(
        f"Topology YAML root must be a mapping/dict: {path} (got {type(loaded).__name__})"
    )
def _validate_spec(spec: dict) -> None:
# TODO: schema validation
return
# ── Graph Compiler ───────────────────────────────────────────────────
def _compile_graph(spec: dict) -> TopologyGraph:
    """Build fully-instantiated flat graph + representative view projections.

    Nodes and edges are added in this order: system-level nodes, then per SIP
    its IO chiplets, its cubes (row-major), the inter-cube mesh, the IO→cube
    links and finally the switch→IO links.
    """
    node_table: dict[str, Node] = {}
    edge_list: list[Edge] = []

    system_spec = spec["system"]
    sip_spec = spec["sip"]
    cube_spec = spec["cube"]

    mesh_dims = sip_spec["cube_mesh"]
    mesh_w, mesh_h = mesh_dims["w"], mesh_dims["h"]
    cube_dims = cube_spec["geometry"]["cube_mm"]
    cube_w, cube_h = cube_dims["w"], cube_dims["h"]
    seam = sip_spec["links"]["inter_cube_mesh"]["distance_mm_across_seam"]

    # Distance between the origins of neighbouring cubes.
    pitch_x = cube_w + seam
    pitch_y = cube_h + seam

    # System-level
    _instantiate_system(node_table, system_spec)

    # Per-SIP
    sip_count = system_spec["sips"]["count"]
    for sip_id in range(sip_count):
        sip_prefix = f"sip{sip_id}"

        # IO chiplets
        _instantiate_io_chiplets(
            node_table, edge_list, sip_prefix, sip_spec,
            cube_w, cube_h, mesh_w, mesh_h, seam,
        )

        # Cubes + PEs, numbered row-major across the mesh
        for row in range(mesh_h):
            for col in range(mesh_w):
                cube_index = row * mesh_w + col
                cube_origin = (col * pitch_x, row * pitch_y)
                _instantiate_cube(
                    node_table, edge_list,
                    f"{sip_prefix}.cube{cube_index}", cube_spec, cube_origin,
                )

        # Inter-cube UCIe mesh
        _add_inter_cube_edges(edge_list, sip_prefix, mesh_w, mesh_h, sip_spec)
        # IO → cube UCIe
        _add_io_to_cube_edges(edge_list, sip_prefix, sip_spec, mesh_w)
        # Switch → IO pcie_ep
        _add_system_to_io_edges(edge_list, sip_prefix, sip_spec, system_spec)

    # Build views
    return TopologyGraph(
        spec=spec,
        nodes=node_table,
        edges=edge_list,
        system_view=_build_system_view(spec),
        sip_view=_build_sip_view(spec),
        cube_view=_build_cube_view(spec),
        pe_view=_build_pe_view(spec),
    )
# ── Layout helpers ───────────────────────────────────────────────────
def _cube_local_positions(cube_w: float, cube_h: float) -> dict[str, tuple[float, float]]:
"""Cube-internal component positions relative to cube origin (0,0) at top-left."""
cx, cy = cube_w / 2, cube_h / 2
# UCIe node half-sizes (default 2.0×1.2mm) — inset so edges touch boundary
uh = 0.6 # half height
uw = 1.0 # half width
return {
"ucie-N": (cx, uh),
"ucie-S": (cx, cube_h - uh),
"ucie-W": (uw, cy),
"ucie-E": (cube_w - uw, cy),
"m_cpu": (cube_w - 2.5, cy - 1.5),
"xbar.top": (cx, 3.5), # Y reference for top-half xbar.pe nodes
"hbm_ctrl": (cx - 2.0, cy),
"xbar.bottom": (cx, cube_h - 3.5), # Y reference for bottom-half xbar.pe nodes
"bridge.left": (2.5, cy + 2.0),
"bridge.right": (cube_w - 2.5, cy + 2.0),
"noc": (cx + 2.0, cy),
"sram": (2.5, cy - 1.5),
}
def _corner_pe_positions(cube_w: float, cube_h: float) -> dict[str, list[tuple[float, float]]]:
"""PE center positions per corner, relative to cube origin."""
return {
"NW": [(1.5, 1.5), (4.5, 1.5)],
"NE": [(cube_w - 4.5, 1.5), (cube_w - 1.5, 1.5)],
"SW": [(1.5, cube_h - 1.5), (4.5, cube_h - 1.5)],
"SE": [(cube_w - 4.5, cube_h - 1.5), (cube_w - 1.5, cube_h - 1.5)],
}
# ── Instantiation: system ───────────────────────────────────────────
def _instantiate_system(nodes: dict[str, Node], system: dict) -> None:
    """Add system-level nodes (fabric switch).

    Registers a single node with id ``fabric.switch0``, no position, and the
    kind/impl/attrs taken from ``system["components"]["switch"]``.
    """
    switch_spec = system["components"]["switch"]
    switch_id = "fabric.switch0"
    switch_node = Node(
        id=switch_id,
        kind=switch_spec["kind"],
        impl=switch_spec["impl"],
        attrs=switch_spec.get("attrs", {}),
        pos_mm=None,
        label="Switch",
    )
    nodes[switch_id] = switch_node
# ── Instantiation: IO chiplets ──────────────────────────────────────
def _instantiate_io_chiplets(
    nodes: dict[str, Node],
    edges: list[Edge],
    sp: str,
    sip_spec: dict,
    cube_w: float,
    cube_h: float,
    mesh_w: int,
    mesh_h: int,
    seam: float,
) -> None:
    """Add IO chiplet nodes and internal pcie_ep → io_cpu edges.

    Every instance is centered horizontally on the cube mesh. Instances with
    side "N" sit above the mesh (negative Y); all others sit below it.
    """
    io_spec = sip_spec["iochiplet"]
    comp = io_spec["components"]
    links = io_spec["links"]

    # Overall mesh extent, seams included.
    span_w = mesh_w * cube_w + (mesh_w - 1) * seam
    span_h = mesh_h * cube_h + (mesh_h - 1) * seam
    center_x = span_w / 2

    for inst in io_spec["instances"]:
        prefix = f"{sp}.{inst['id']}"
        on_north = inst["place"]["side"] == "N"
        pcie_y = -5.0 if on_north else span_h + 5.0
        cpu_y = -3.0 if on_north else span_h + 3.0

        # pcie_ep first, then io_cpu
        for comp_name, y_pos, label in (
            ("pcie_ep", pcie_y, "PCIe EP"),
            ("io_cpu", cpu_y, "IO CPU"),
        ):
            comp_spec = comp[comp_name]
            node_id = f"{prefix}.{comp_name}"
            nodes[node_id] = Node(
                id=node_id, kind=comp_spec["kind"], impl=comp_spec["impl"],
                attrs=comp_spec["attrs"], pos_mm=(center_x, y_pos), label=label,
            )

        # Internal edge
        edges.append(Edge(
            src=f"{prefix}.pcie_ep",
            dst=f"{prefix}.io_cpu",
            distance_mm=links["pcie_ep_to_io_cpu_mm"],
            bw_gbs=links["pcie_ep_to_io_cpu_bw_gbs"],
            kind="io_internal",
        ))
# ── Instantiation: cube + PEs ───────────────────────────────────────
def _instantiate_cube(
    nodes: dict[str, Node],
    edges: list[Edge],
    cp: str,
    cube: dict,
    origin: tuple[float, float],
) -> None:
    """Add all cube-internal nodes and edges, including PE instances.

    Args:
        nodes: Node table to extend in place, keyed by node id.
        edges: Edge list to extend in place.
        cp: Id prefix used for every node of this cube.
        cube: The cube section of the topology spec.
        origin: (x, y) of the cube origin; all local positions are offset by it.

    Node ids created here all start with ``cp``: ``.ucie-<port>``, ``.noc``,
    ``.m_cpu``, ``.sram``, ``.hbm_ctrl.slice<i>``, ``.bridge.<id>``,
    ``.xbar.pe<i>`` and ``.pe<i>.<component>``.
    """
    cube_w = cube["geometry"]["cube_mm"]["w"]
    cube_h = cube["geometry"]["cube_mm"]["h"]
    ox, oy = origin
    local_pos = _cube_local_positions(cube_w, cube_h)
    clinks = cube["links"]
    n_slices = cube["memory_map"]["hbm_slices_per_cube"]

    # ── UCIe ports ──
    ucie_ns = cube["ucie"]["overhead_ns"]
    for port in cube["ucie"]["ports"]:
        pid = f"{cp}.ucie-{port}"
        lx, ly = local_pos[f"ucie-{port}"]
        nodes[pid] = Node(
            id=pid, kind="ucie_port", impl="ucie_v1",
            attrs={"overhead_ns": ucie_ns}, pos_mm=(ox + lx, oy + ly),
            label=f"UCIe-{port}",
        )

    # ── Named components: noc, m_cpu, sram ──
    for name in ("noc", "m_cpu", "sram"):
        c = cube["components"][name]
        nid = f"{cp}.{name}"
        lx, ly = local_pos[name]
        nodes[nid] = Node(
            id=nid, kind=c["kind"], impl=c["impl"],
            attrs=c["attrs"], pos_mm=(ox + lx, oy + ly),
            label=name.upper().replace("_", " "),
        )

    # ── HBM controller slices (one per PE) ──
    # All slices share the same position and the same attrs object.
    hbm_spec = cube["components"]["hbm_ctrl"]
    hbm_lx, hbm_ly = local_pos["hbm_ctrl"]
    for sl in range(n_slices):
        sid = f"{cp}.hbm_ctrl.slice{sl}"
        nodes[sid] = Node(
            id=sid, kind=hbm_spec["kind"], impl=hbm_spec["impl"],
            attrs=hbm_spec["attrs"], pos_mm=(ox + hbm_lx, oy + hbm_ly),
            label=f"HBM SLICE{sl}",
        )

    # ── Bridges ──
    for br in cube["components"]["xbar"]["bridges"]:
        bname = br["id"]
        nid = f"{cp}.bridge.{bname}"
        lx, ly = local_pos[f"bridge.{bname}"]
        nodes[nid] = Node(
            id=nid, kind=br["kind"], impl=br["impl"],
            attrs=br["attrs"], pos_mm=(ox + lx, oy + ly),
            label=f"Bridge {bname.upper()}",
        )

    # ── PE instances + per-PE xbar entry nodes ──
    corners = cube["pe_layout"]["corners"]
    pe_per_corner = cube["pe_layout"]["pe_per_corner"]
    corner_pos = _corner_pe_positions(cube_w, cube_h)
    pe_tmpl = cube["pe_template"]
    pe_links = pe_tmpl["links"]
    xbar_pe_spec = cube["components"]["xbar"]["pe"]
    xbar_top_y = local_pos["xbar.top"][1]
    xbar_bot_y = local_pos["xbar.bottom"][1]

    # PEs are numbered consecutively in corner order, then within each corner.
    pe_idx = 0
    for corner in corners:
        is_top = corner in ("NW", "NE")
        xbar_y = xbar_top_y if is_top else xbar_bot_y
        mm_key = "pe_to_xbar_row_n_mm" if is_top else "pe_to_xbar_row_s_mm"

        for ci in range(pe_per_corner):
            pp = f"{cp}.pe{pe_idx}"
            pe_cx, pe_cy = corner_pos[corner][ci]

            # Per-PE xbar entry node (shares the PE's X, sits on the xbar row Y)
            xbar_nid = f"{cp}.xbar.pe{pe_idx}"
            nodes[xbar_nid] = Node(
                id=xbar_nid, kind=xbar_pe_spec["kind"], impl=xbar_pe_spec["impl"],
                attrs=xbar_pe_spec["attrs"], pos_mm=(ox + pe_cx, oy + xbar_y),
                label=f"XBAR PE{pe_idx}",
            )

            # PE template components
            for comp_name, comp_spec in pe_tmpl["components"].items():
                cid = f"{pp}.{comp_name}"
                dx, dy = _PE_COMP_OFFSETS.get(comp_name, (0.0, 0.0))
                nodes[cid] = Node(
                    id=cid, kind=comp_spec["kind"], impl=comp_spec["impl"],
                    attrs=comp_spec["attrs"],
                    pos_mm=(ox + pe_cx + dx, oy + pe_cy + dy),
                    label=comp_name.upper().replace("_", " "),
                )

            # PE-internal edges
            _add_pe_internal_edges(edges, pp, pe_links)

            # PE_DMA → xbar.pe_i (HBM data path)
            edges.append(Edge(
                src=f"{pp}.pe_dma", dst=xbar_nid,
                distance_mm=clinks[mm_key],
                bw_gbs=clinks["pe_to_xbar_bw_gbs"],
                kind="pe_to_xbar",
            ))

            # PE_DMA → noc (non-HBM data path: SRAM, inter-cube, etc.)
            edges.append(Edge(
                src=f"{pp}.pe_dma", dst=f"{cp}.noc",
                distance_mm=clinks["pe_dma_to_noc_mm"],
                bw_gbs=clinks["pe_dma_to_noc_bw_gbs"],
                kind="pe_to_noc",
            ))

            # noc → PE_CPU (command delivery)
            edges.append(Edge(
                src=f"{cp}.noc", dst=f"{pp}.pe_cpu",
                distance_mm=clinks["noc_to_pe_cpu_mm"],
                kind="command",
            ))

            pe_idx += 1

    # ── Cube fabric edges ──

    # xbar.pe_i ↔ hbm_ctrl.slice_i (local Y-path, bidirectional for response)
    # NOTE(review): the loops below index xbar.pe<i> by n_slices, while the
    # xbar.pe nodes above are created per PE (corners × pe_per_corner). This
    # presumably relies on hbm_slices_per_cube equalling the PE count — confirm.
    for i in range(n_slices):
        edges.append(Edge(
            src=f"{cp}.xbar.pe{i}", dst=f"{cp}.hbm_ctrl.slice{i}",
            distance_mm=clinks["xbar_to_hbm_mm"],
            bw_gbs=clinks["xbar_to_hbm_bw_gbs"],
            kind="xbar_to_hbm",
        ))
        edges.append(Edge(
            src=f"{cp}.hbm_ctrl.slice{i}", dst=f"{cp}.xbar.pe{i}",
            distance_mm=clinks["xbar_to_hbm_mm"],
            bw_gbs=clinks["xbar_to_hbm_bw_gbs"],
            kind="hbm_to_xbar",
        ))

    # xbar chain: pe0↔pe1↔pe2↔pe3 (top), pe4↔pe5↔pe6↔pe7 (bottom)
    # (the indices in these comments are the n_slices == 8 case)
    half = n_slices // 2
    for half_start in (0, half):
        for i in range(half_start, half_start + half - 1):
            # A link is intra-corner unless i is the last PE of its corner.
            intra = ((i - half_start) % pe_per_corner) != (pe_per_corner - 1)
            x_dist = clinks["xbar_chain_intra_corner_mm"] if intra else clinks["xbar_chain_inter_corner_mm"]
            for a, b in [(i, i + 1), (i + 1, i)]:
                edges.append(Edge(
                    src=f"{cp}.xbar.pe{a}", dst=f"{cp}.xbar.pe{b}",
                    distance_mm=x_dist,
                    bw_gbs=clinks["xbar_x_bw_gbs"],
                    kind="xbar_chain",
                ))

    # bridge connections: pe0↔bridge.left↔pe4, pe3↔bridge.right↔pe7
    for bname, pe_top, pe_bot in [("left", 0, half), ("right", half - 1, n_slices - 1)]:
        br_node = f"{cp}.bridge.{bname}"
        for pe_i, br_mm_key in [(pe_top, "xbar_row_n_to_bridge_mm"),
                                (pe_bot, "xbar_row_s_to_bridge_mm")]:
            xbar_node = f"{cp}.xbar.pe{pe_i}"
            edges.append(Edge(
                src=xbar_node, dst=br_node,
                distance_mm=clinks[br_mm_key],
                bw_gbs=clinks["xbar_to_bridge_bw_gbs"],
                kind="xbar_to_bridge",
            ))
            edges.append(Edge(
                src=br_node, dst=xbar_node,
                distance_mm=clinks[br_mm_key],
                bw_gbs=clinks["xbar_to_bridge_bw_gbs"],
                kind="bridge_to_xbar",
            ))

    # ucie ↔ noc (UCIe-NOC boundary; per_connection_bw_gbs = 128 GB/s, n_connections = 4)
    # All ucie→noc edges are appended first, then all noc→ucie edges.
    _noc_ucie = clinks["noc_to_ucie"]
    for port in cube["ucie"]["ports"]:
        edges.append(Edge(
            src=f"{cp}.ucie-{port}", dst=f"{cp}.noc",
            distance_mm=0.0,
            bw_gbs=_noc_ucie["per_connection_bw_gbs"],
            n_connections=_noc_ucie["n_connections"],
            kind="ucie_to_noc",
        ))
    for port in cube["ucie"]["ports"]:
        edges.append(Edge(
            src=f"{cp}.noc", dst=f"{cp}.ucie-{port}",
            distance_mm=0.0,
            bw_gbs=_noc_ucie["per_connection_bw_gbs"],
            n_connections=_noc_ucie["n_connections"],
            kind="noc_to_ucie",
        ))

    # noc ↔ xbar.pe{i}: wire delay is 0 (NOC traversal latency computed by TwoDMeshNocComponent);
    # routing_weight_mm=50.0 steers PE DMA Dijkstra away from this path (prefer direct pe_dma→xbar)
    # The "noc_to_xbar" link section is optional: when absent, bw_gbs is None.
    _noc_xbar = clinks.get("noc_to_xbar", {})
    _noc_xbar_bw = _noc_xbar.get("per_connection_bw_gbs")
    for i in range(n_slices):
        edges.append(Edge(
            src=f"{cp}.noc", dst=f"{cp}.xbar.pe{i}",
            distance_mm=0.0,
            bw_gbs=_noc_xbar_bw,
            routing_weight_mm=50.0,
            kind="noc_to_xbar",
        ))
        edges.append(Edge(
            src=f"{cp}.xbar.pe{i}", dst=f"{cp}.noc",
            distance_mm=0.0,
            bw_gbs=_noc_xbar_bw,
            routing_weight_mm=50.0,
            kind="xbar_to_noc",
        ))

    # m_cpu ↔ noc (command dispatch, both directions)
    edges.append(Edge(
        src=f"{cp}.m_cpu", dst=f"{cp}.noc",
        distance_mm=clinks["m_cpu_to_noc_mm"],
        kind="command",
    ))
    edges.append(Edge(
        src=f"{cp}.noc", dst=f"{cp}.m_cpu",
        distance_mm=clinks["m_cpu_to_noc_mm"],
        kind="command",
    ))

    # noc ↔ sram (shared SRAM access; per_connection_bw_gbs = 128 GB/s, n_connections = 4)
    _noc_sram = clinks["noc_to_sram"]
    edges.append(Edge(
        src=f"{cp}.noc", dst=f"{cp}.sram",
        distance_mm=clinks["noc_to_sram_mm"],
        bw_gbs=_noc_sram["per_connection_bw_gbs"],
        n_connections=_noc_sram["n_connections"],
        kind="noc_to_sram",
    ))
    # NOTE(review): this reverse edge (sram → noc) reuses kind="noc_to_sram",
    # whereas most other bidirectional pairs above use a distinct kind per
    # direction. Possibly a copy-paste leftover — confirm before relying on
    # the kind to tell the two directions apart.
    edges.append(Edge(
        src=f"{cp}.sram", dst=f"{cp}.noc",
        distance_mm=clinks["noc_to_sram_mm"],
        bw_gbs=_noc_sram["per_connection_bw_gbs"],
        n_connections=_noc_sram["n_connections"],
        kind="noc_to_sram",
    ))
def _add_pe_internal_edges(edges: list[Edge], pp: str, pe_links: dict) -> None:
    """Append the edges that live inside one PE instance to ``edges``.

    Args:
        edges: Edge list to extend in place.
        pp: Node-id prefix of the PE; every node id is ``f"{pp}.<component>"``.
        pe_links: PE link table holding the ``*_mm`` distances and
            ``*_bw_gbs`` bandwidths read below.

    Edges are appended in a fixed order: pe_cpu -> scheduler, then
    scheduler -> each engine, then each engine -> pe_tcm.
    """
    scheduler_id = f"{pp}.pe_scheduler"
    tcm_id = f"{pp}.pe_tcm"

    # Control path: PE CPU hands work to the scheduler.
    edges.append(Edge(
        src=f"{pp}.pe_cpu", dst=scheduler_id,
        distance_mm=pe_links["pe_cpu_to_scheduler_mm"],
        kind="pe_internal",
    ))

    # Scheduler fan-out to the three engines (distance only, no bandwidth).
    scheduler_fanout = (
        ("pe_dma", "scheduler_to_dma_mm"),
        ("pe_gemm", "scheduler_to_gemm_mm"),
        ("pe_math", "scheduler_to_math_mm"),
    )
    edges.extend(
        Edge(
            src=scheduler_id, dst=f"{pp}.{engine}",
            distance_mm=pe_links[dist_key],
            kind="pe_internal",
        )
        for engine, dist_key in scheduler_fanout
    )

    # Engine -> TCM data paths carry both a distance and a bandwidth.
    engine_to_tcm = (
        ("pe_dma", "dma_to_tcm_mm", "dma_to_tcm_bw_gbs"),
        ("pe_gemm", "gemm_to_tcm_mm", "gemm_to_tcm_bw_gbs"),
        ("pe_math", "math_to_tcm_mm", "math_to_tcm_bw_gbs"),
    )
    edges.extend(
        Edge(
            src=f"{pp}.{engine}", dst=tcm_id,
            distance_mm=pe_links[dist_key],
            bw_gbs=pe_links[bw_key],
            kind="pe_internal",
        )
        for engine, dist_key, bw_key in engine_to_tcm
    )
# ── Inter-cube / IO / system edges ──────────────────────────────────
def _add_inter_cube_edges(
    edges: list[Edge], sp: str, mesh_w: int, mesh_h: int, sip_spec: dict,
) -> None:
    """Append UCIe mesh edges between neighbouring cubes of one SIP.

    Cubes are numbered row-major (``cube_id = row * mesh_w + col``). For every
    cube, its east neighbour (E <-> W ports) is linked first, then its south
    neighbour (S <-> N ports); each link is emitted in both directions.

    Args:
        edges: Edge list to extend in place.
        sp: Node-id prefix of the SIP.
        mesh_w: Number of cube columns.
        mesh_h: Number of cube rows.
        sip_spec: SIP spec; ``links.inter_cube_mesh`` supplies the per-PHY
            bandwidth and the seam distance.
    """
    mesh_link = sip_spec["links"]["inter_cube_mesh"]
    phy_bw = mesh_link["bw_gbs_per_ucie_phy"]
    seam_mm = mesh_link["distance_mm_across_seam"]

    def connect(near_port: str, far_port: str) -> None:
        # One physical link becomes two directed edges.
        for a, b in ((near_port, far_port), (far_port, near_port)):
            edges.append(Edge(
                src=a, dst=b,
                distance_mm=seam_mm, bw_gbs=phy_bw, kind="ucie_mesh",
            ))

    for row in range(mesh_h):
        for col in range(mesh_w):
            here = row * mesh_w + col
            if col + 1 < mesh_w:
                east = row * mesh_w + (col + 1)
                connect(f"{sp}.cube{here}.ucie-E", f"{sp}.cube{east}.ucie-W")
            if row + 1 < mesh_h:
                south = (row + 1) * mesh_w + col
                connect(f"{sp}.cube{here}.ucie-S", f"{sp}.cube{south}.ucie-N")
def _add_io_to_cube_edges(
    edges: list[Edge], sp: str, sip_spec: dict, mesh_w: int,
) -> None:
    """Append io_cpu <-> cube UCIe edges for every IO chiplet instance.

    Each cube port listed on an IO chiplet yields two directed edges:
    ``io_to_cube`` (io_cpu -> ucie port) followed by ``cube_to_io`` (the
    reverse). Both use the same distance, which is the chiplet-internal
    io_cpu-to-UCIe distance plus the port's own ``distance_mm``.

    Args:
        edges: Edge list to extend in place.
        sp: Node-id prefix of the SIP.
        sip_spec: SIP spec; reads ``iochiplet.links`` and ``iochiplet.instances``.
        mesh_w: Number of cube columns, used to turn a port's ``(col, row)``
            into a row-major cube id.
    """
    chiplet_links = sip_spec["iochiplet"]["links"]
    base_mm = chiplet_links["io_cpu_to_ucie_mm"]
    link_bw = chiplet_links["io_cpu_to_ucie_bw_gbs"]

    for chiplet in sip_spec["iochiplet"]["instances"]:
        io_cpu = f"{sp}.{chiplet['id']}.io_cpu"
        for port in chiplet["cube_ports"]:
            col, row = port["cube"]["xy"]
            side = port["cube_side"]
            ucie = f"{sp}.cube{row * mesh_w + col}.ucie-{side}"
            total_mm = base_mm + port["distance_mm"]
            for src, dst, kind in (
                (io_cpu, ucie, "io_to_cube"),
                (ucie, io_cpu, "cube_to_io"),
            ):
                edges.append(Edge(
                    src=src, dst=dst,
                    distance_mm=total_mm,
                    bw_gbs=link_bw,
                    kind=kind,
                ))
def _add_system_to_io_edges(
    edges: list[Edge], sp: str, sip_spec: dict, system: dict,
) -> None:
    """Append one ``pcie`` edge from the fabric switch to each IO chiplet endpoint.

    Only the switch -> pcie_ep direction is emitted here.

    Args:
        edges: Edge list to extend in place.
        sp: Node-id prefix of the SIP owning the IO chiplets.
        sip_spec: SIP spec; reads ``iochiplet.instances``.
        system: System spec; ``links.io_ep_to_switch`` supplies distance and
            per-endpoint bandwidth.
    """
    switch_id = "fabric.switch0"
    pcie_link = system["links"]["io_ep_to_switch"]
    edges.extend(
        Edge(
            src=switch_id, dst=f"{sp}.{chiplet['id']}.pcie_ep",
            distance_mm=pcie_link["distance_mm"],
            bw_gbs=pcie_link["bw_gbs_per_ep"],
            kind="pcie",
        )
        for chiplet in sip_spec["iochiplet"]["instances"]
    )
# ── View builders ────────────────────────────────────────────────────
def _build_system_view(spec: dict) -> ViewGraph:
    """Build the system-level view: fabric switch, SIP blocks and IO chiplets.

    SIPs are laid out left to right as fixed-size blocks; each SIP's IO
    chiplets sit on its north or south edge and get one ``pcie`` edge from the
    switch.

    Args:
        spec: Full topology spec; reads ``system`` and ``sip.iochiplet.instances``.

    Returns:
        A ``ViewGraph`` named ``"system"``.
    """
    system = spec["system"]
    n_sips = system["sips"]["count"]

    # Drawing geometry in mm (fixed for this view, not taken from the spec).
    sip_w, sip_h = 71.0, 59.0
    gap = 30.0
    canvas_w = n_sips * sip_w + (n_sips - 1) * gap
    canvas_h = sip_h + 20.0

    nodes: dict[str, Node] = {}
    links: list[Edge] = []

    switch_spec = system["components"]["switch"]
    switch_id = "fabric.switch0"
    nodes[switch_id] = Node(
        id=switch_id, kind=switch_spec["kind"], impl=switch_spec["impl"],
        attrs=switch_spec.get("attrs", {}),
        pos_mm=(canvas_w / 2, 5.0), label="Fabric Switch",
    )

    for index in range(n_sips):
        left = index * (sip_w + gap)
        top = 20.0
        sip_id = f"sip{index}"
        nodes[sip_id] = Node(
            id=sip_id, kind="sip", impl="",
            attrs={"w_mm": sip_w, "h_mm": sip_h},
            pos_mm=(left + sip_w / 2, top + sip_h / 2),
            label=f"SIP {index}",
        )
        for chiplet in spec["sip"]["iochiplet"]["instances"]:
            chiplet_name = chiplet["id"]
            chiplet_node = f"{sip_id}.{chiplet_name}"
            # North chiplets sit on the SIP's top edge, all others on the bottom edge.
            if chiplet["place"]["side"] == "N":
                chiplet_y = top
            else:
                chiplet_y = top + sip_h
            nodes[chiplet_node] = Node(
                id=chiplet_node, kind="iochiplet", impl="",
                attrs={}, pos_mm=(left + sip_w / 2, chiplet_y),
                label=f"IO {chiplet_name}",
            )
            links.append(Edge(
                src=switch_id, dst=chiplet_node,
                distance_mm=system["links"]["io_ep_to_switch"]["distance_mm"],
                bw_gbs=system["links"]["io_ep_to_switch"]["bw_gbs_per_ep"],
                kind="pcie",
            ))

    return ViewGraph(
        name="system", nodes=nodes, edges=links,
        width_mm=canvas_w, height_mm=canvas_h,
    )
def _build_sip_view(spec: dict) -> ViewGraph:
    """Build the SIP-level view: the cube mesh plus the IO chiplets.

    Cubes appear as opaque blocks in row-major order. Each cube is linked to
    its east and south neighbour with a single ``ucie_mesh`` edge (one
    direction only in this view), and every IO chiplet is linked to the cubes
    named in its ``cube_ports``.

    Args:
        spec: Full topology spec; reads ``sip`` and ``cube.geometry``.

    Returns:
        A ``ViewGraph`` named ``"sip"``.
    """
    sip_spec = spec["sip"]
    cube_spec = spec["cube"]
    mesh_w = sip_spec["cube_mesh"]["w"]
    mesh_h = sip_spec["cube_mesh"]["h"]
    cube_w = cube_spec["geometry"]["cube_mm"]["w"]
    cube_h = cube_spec["geometry"]["cube_mm"]["h"]
    mesh_link = sip_spec["links"]["inter_cube_mesh"]
    seam = mesh_link["distance_mm_across_seam"]

    pitch_x = cube_w + seam
    pitch_y = cube_h + seam
    mesh_total_w = mesh_w * cube_w + (mesh_w - 1) * seam
    mesh_total_h = mesh_h * cube_h + (mesh_h - 1) * seam
    io_margin = 6.0  # mm band above and below the mesh for the IO chiplets
    canvas_w = mesh_total_w
    canvas_h = mesh_total_h + 2 * io_margin

    nodes: dict[str, Node] = {}
    links: list[Edge] = []

    # One pass over the mesh: the cube block, then its east and south links.
    # Nodes and edges go to separate containers, so each keeps row-major order.
    for row in range(mesh_h):
        for col in range(mesh_w):
            cube_index = row * mesh_w + col
            cube_node = f"cube{cube_index}"
            nodes[cube_node] = Node(
                id=cube_node, kind="cube", impl="",
                attrs={"w_mm": cube_w, "h_mm": cube_h, "col": col, "row": row},
                pos_mm=(
                    col * pitch_x + cube_w / 2,
                    io_margin + row * pitch_y + cube_h / 2,
                ),
                label=f"CUBE ({col},{row})",
            )
            neighbours = []
            if col + 1 < mesh_w:
                neighbours.append(row * mesh_w + (col + 1))
            if row + 1 < mesh_h:
                neighbours.append((row + 1) * mesh_w + col)
            for other in neighbours:
                links.append(Edge(
                    src=cube_node, dst=f"cube{other}",
                    distance_mm=mesh_link["distance_mm_across_seam"],
                    bw_gbs=mesh_link["bw_gbs_per_ucie_phy"],
                    kind="ucie_mesh",
                ))

    # IO chiplets, centred horizontally, 2 mm inside the top or bottom canvas edge.
    io_links = sip_spec["iochiplet"]["links"]
    for chiplet in sip_spec["iochiplet"]["instances"]:
        chiplet_id = chiplet["id"]
        on_north = chiplet["place"]["side"] == "N"
        chiplet_y = 2.0 if on_north else canvas_h - 2.0
        nodes[chiplet_id] = Node(
            id=chiplet_id, kind="iochiplet", impl="",
            attrs={}, pos_mm=(mesh_total_w / 2, chiplet_y),
            label=f"IO {chiplet_id}",
        )
        for port in chiplet["cube_ports"]:
            col, row = port["cube"]["xy"]
            links.append(Edge(
                src=chiplet_id, dst=f"cube{row * mesh_w + col}",
                distance_mm=io_links["io_cpu_to_ucie_mm"] + port["distance_mm"],
                bw_gbs=io_links["io_cpu_to_ucie_bw_gbs"],
                kind="io_to_cube",
            ))

    return ViewGraph(
        name="sip", nodes=nodes, edges=links,
        width_mm=canvas_w, height_mm=canvas_h,
    )
def _build_cube_view(spec: dict) -> ViewGraph:
    """Cube-level view: representative single cube, PEs as opaque blocks.

    Node ids in this view are local (no SIP/cube prefix). Nodes are inserted in
    the order UCIe ports, named components, bridges, then per-PE pairs of
    ``pe{i}`` / ``xbar.pe{i}``. Edge order matters to the renderer, which draws
    edges in list order and numbers fan-out edges as it meets them.

    Args:
        spec: Full topology spec; only ``spec["cube"]`` is read.

    Returns:
        A ``ViewGraph`` named ``"cube"`` sized to the cube geometry.
    """
    cube = spec["cube"]
    cube_w = cube["geometry"]["cube_mm"]["w"]
    cube_h = cube["geometry"]["cube_mm"]["h"]
    # Cube-local (x, y) positions keyed by local node id.
    local_pos = _cube_local_positions(cube_w, cube_h)
    clinks = cube["links"]
    n_slices = cube["memory_map"]["hbm_slices_per_cube"]
    nodes: dict[str, Node] = {}
    view_edges: list[Edge] = []
    # UCIe ports
    for port in cube["ucie"]["ports"]:
        pid = f"ucie-{port}"
        lx, ly = local_pos[pid]
        nodes[pid] = Node(
            id=pid, kind="ucie_port", impl="ucie_v1",
            attrs={}, pos_mm=(lx, ly), label=f"UCIe-{port}",
        )
    # Named components (hbm_ctrl as single representative node in view)
    for name in ("noc", "m_cpu", "hbm_ctrl", "sram"):
        c = cube["components"][name]
        lx, ly = local_pos[name]
        nodes[name] = Node(
            id=name, kind=c["kind"], impl=c["impl"],
            attrs=c["attrs"], pos_mm=(lx, ly),
            label=name.upper().replace("_", " "),
        )
    # Bridges
    for br in cube["components"]["xbar"]["bridges"]:
        bname = br["id"]
        bid = f"bridge.{bname}"
        lx, ly = local_pos[bid]
        nodes[bid] = Node(
            id=bid, kind=br["kind"], impl=br["impl"],
            attrs=br["attrs"], pos_mm=(lx, ly),
            label=f"Bridge {bname.upper()}",
        )
    # PEs as opaque blocks + per-PE xbar entry nodes
    corners = cube["pe_layout"]["corners"]
    pe_per_corner = cube["pe_layout"]["pe_per_corner"]
    corner_pos = _corner_pe_positions(cube_w, cube_h)
    xbar_pe_spec = cube["components"]["xbar"]["pe"]
    xbar_top_y = local_pos["xbar.top"][1]
    xbar_bot_y = local_pos["xbar.bottom"][1]
    # PEs are numbered consecutively across corners, in the order `corners` lists them.
    pe_idx = 0
    for corner in corners:
        # NW/NE PEs attach to the top xbar row; every other corner uses the bottom row.
        is_top = corner in ("NW", "NE")
        xbar_y = xbar_top_y if is_top else xbar_bot_y
        mm_key = "pe_to_xbar_row_n_mm" if is_top else "pe_to_xbar_row_s_mm"
        for ci in range(pe_per_corner):
            pid = f"pe{pe_idx}"
            xbar_id = f"xbar.pe{pe_idx}"
            px, py = corner_pos[corner][ci]
            nodes[pid] = Node(
                id=pid, kind="pe", impl="",
                attrs={"corner": corner}, pos_mm=(px, py),
                label=f"PE{pe_idx}",
            )
            # The xbar entry node shares the PE's x and sits on its xbar row's y.
            nodes[xbar_id] = Node(
                id=xbar_id, kind=xbar_pe_spec["kind"], impl=xbar_pe_spec["impl"],
                attrs=xbar_pe_spec["attrs"], pos_mm=(px, xbar_y),
                label=f"XBAR PE{pe_idx}",
            )
            # PE → xbar.pe_i (HBM data path)
            view_edges.append(Edge(
                src=pid, dst=xbar_id,
                distance_mm=clinks[mm_key],
                bw_gbs=clinks["pe_to_xbar_bw_gbs"],
                kind="pe_to_xbar",
            ))
            # PE → noc (non-HBM data path)
            view_edges.append(Edge(
                src=pid, dst="noc",
                distance_mm=clinks["pe_dma_to_noc_mm"],
                bw_gbs=clinks["pe_dma_to_noc_bw_gbs"],
                kind="pe_to_noc",
            ))
            # noc → PE (command delivery)
            view_edges.append(Edge(
                src="noc", dst=pid,
                distance_mm=clinks["noc_to_pe_cpu_mm"],
                kind="command",
            ))
            pe_idx += 1
    # Cube fabric edges
    # xbar.pe_i → hbm_ctrl (single representative node in view)
    # NOTE(review): iterates n_slices, not the PE count — presumably equal; confirm against the spec.
    for i in range(n_slices):
        view_edges.append(Edge(
            src=f"xbar.pe{i}", dst="hbm_ctrl",
            distance_mm=clinks["xbar_to_hbm_mm"],
            bw_gbs=clinks["xbar_to_hbm_bw_gbs"],
            kind="xbar_to_hbm",
        ))
    # xbar chain: two independent chains, one over indices [0, half) and one over [half, 2*half).
    half = n_slices // 2
    for half_start in (0, half):
        for i in range(half_start, half_start + half - 1):
            # Link i→i+1 is intra-corner unless i is the last PE of its corner group.
            intra = ((i - half_start) % pe_per_corner) != (pe_per_corner - 1)
            x_dist = clinks["xbar_chain_intra_corner_mm"] if intra else clinks["xbar_chain_inter_corner_mm"]
            # Emit both directions of each chain link.
            for a, b in [(i, i + 1), (i + 1, i)]:
                view_edges.append(Edge(
                    src=f"xbar.pe{a}", dst=f"xbar.pe{b}",
                    distance_mm=x_dist,
                    bw_gbs=clinks["xbar_x_bw_gbs"],
                    kind="xbar_chain",
                ))
    # bridge connections: each bridge joins one end of the first chain to the
    # matching end of the second chain (left: first entries, right: last entries).
    for bname, pe_top, pe_bot in [("left", 0, half), ("right", half - 1, n_slices - 1)]:
        br_id = f"bridge.{bname}"
        for pe_i, br_mm_key in [(pe_top, "xbar_row_n_to_bridge_mm"),
                                (pe_bot, "xbar_row_s_to_bridge_mm")]:
            xbar_id = f"xbar.pe{pe_i}"
            view_edges.append(Edge(
                src=xbar_id, dst=br_id,
                distance_mm=clinks[br_mm_key],
                bw_gbs=clinks["xbar_to_bridge_bw_gbs"],
                kind="xbar_to_bridge",
            ))
            view_edges.append(Edge(
                src=br_id, dst=xbar_id,
                distance_mm=clinks[br_mm_key],
                bw_gbs=clinks["xbar_to_bridge_bw_gbs"],
                kind="bridge_to_xbar",
            ))
    # noc → ucie ports (only this direction is added in the view)
    _noc_ucie_v = clinks["noc_to_ucie"]
    for port in cube["ucie"]["ports"]:
        view_edges.append(Edge(
            src="noc", dst=f"ucie-{port}",
            distance_mm=0.0,
            bw_gbs=_noc_ucie_v["per_connection_bw_gbs"],
            n_connections=_noc_ucie_v["n_connections"],
            kind="noc_to_ucie",
        ))
    # m_cpu ↔ noc (command dispatch, both directions)
    view_edges.append(Edge(
        src="m_cpu", dst="noc",
        distance_mm=clinks["m_cpu_to_noc_mm"],
        kind="command",
    ))
    view_edges.append(Edge(
        src="noc", dst="m_cpu",
        distance_mm=clinks["m_cpu_to_noc_mm"],
        kind="command",
    ))
    # noc ↔ sram (shared SRAM access, bidirectional; both directions use kind "noc_to_sram")
    _noc_sram_v = clinks["noc_to_sram"]
    view_edges.append(Edge(
        src="noc", dst="sram",
        distance_mm=clinks["noc_to_sram_mm"],
        bw_gbs=_noc_sram_v["per_connection_bw_gbs"],
        n_connections=_noc_sram_v["n_connections"],
        kind="noc_to_sram",
    ))
    view_edges.append(Edge(
        src="sram", dst="noc",
        distance_mm=clinks["noc_to_sram_mm"],
        bw_gbs=_noc_sram_v["per_connection_bw_gbs"],
        n_connections=_noc_sram_v["n_connections"],
        kind="noc_to_sram",
    ))
    return ViewGraph(
        name="cube", nodes=nodes, edges=view_edges,
        width_mm=cube_w, height_mm=cube_h,
    )
def _build_pe_view(spec: dict) -> ViewGraph:
    """Build the PE-level view: one representative PE and its template components.

    Every component in ``cube.pe_template.components`` becomes a node at a
    fixed canvas position; the edges are pe_cpu -> scheduler, scheduler -> each
    engine, and each engine -> pe_tcm.

    Args:
        spec: Full topology spec; reads ``cube.pe_template``.

    Returns:
        A ``ViewGraph`` named ``"pe"`` on a 12 x 8 mm canvas.

    Raises:
        KeyError: If the template holds a component with no entry in the
            fixed position table.
    """
    template = spec["cube"]["pe_template"]
    pe_links = template["links"]
    canvas_w, canvas_h = 12.0, 8.0

    # Fixed layout (mm): cpu and scheduler on the left, engines stacked in the
    # middle column, TCM on the right.
    layout = {
        "pe_cpu": (1.5, 4.0),
        "pe_scheduler": (4.0, 4.0),
        "pe_dma": (7.0, 1.5),
        "pe_gemm": (7.0, 4.0),
        "pe_math": (7.0, 6.5),
        "pe_tcm": (10.0, 4.0),
    }

    nodes: dict[str, Node] = {}
    for name, comp in template["components"].items():
        x_mm, y_mm = layout[name]
        nodes[name] = Node(
            id=name, kind=comp["kind"], impl=comp["impl"],
            attrs=comp["attrs"], pos_mm=(x_mm, y_mm),
            label=name.upper().replace("_", " "),
        )

    links: list[Edge] = [
        Edge(
            src="pe_cpu", dst="pe_scheduler",
            distance_mm=pe_links["pe_cpu_to_scheduler_mm"],
            kind="pe_internal",
        ),
    ]
    links.extend(
        Edge(
            src="pe_scheduler", dst=engine,
            distance_mm=pe_links[dist_key],
            kind="pe_internal",
        )
        for engine, dist_key in (
            ("pe_dma", "scheduler_to_dma_mm"),
            ("pe_gemm", "scheduler_to_gemm_mm"),
            ("pe_math", "scheduler_to_math_mm"),
        )
    )
    links.extend(
        Edge(
            src=engine, dst="pe_tcm",
            distance_mm=pe_links[dist_key],
            bw_gbs=pe_links[bw_key],
            kind="pe_internal",
        )
        for engine, dist_key, bw_key in (
            ("pe_dma", "dma_to_tcm_mm", "dma_to_tcm_bw_gbs"),
            ("pe_gemm", "gemm_to_tcm_mm", "gemm_to_tcm_bw_gbs"),
            ("pe_math", "math_to_tcm_mm", "math_to_tcm_bw_gbs"),
        )
    )

    return ViewGraph(
        name="pe", nodes=nodes, edges=links,
        width_mm=canvas_w, height_mm=canvas_h,
    )
View File
+56
View File
@@ -0,0 +1,56 @@
from __future__ import annotations
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
@dataclass
class Node:
    """One component (or opaque block) in the flat topology graph or in a view."""

    id: str  # unique node id; also the key in node dicts and the value of Edge.src / Edge.dst
    kind: str  # component category; the visualizer picks fill color and box size from it
    impl: str  # implementation name; view builders pass "" for opaque blocks
    attrs: dict[str, Any]  # free-form attributes attached to the node
    pos_mm: tuple[float, float] | None  # (x_mm, y_mm); None for abstract nodes
    label: str = ""  # display text; the visualizer falls back to `id` when empty
@dataclass
class Edge:
    """A directed link between two nodes; bidirectional links are stored as two Edges."""

    src: str  # node id
    dst: str  # node id
    distance_mm: float  # physical wire delay distance (ns = distance_mm * ns_per_mm)
    routing_weight_mm: float | None = None  # Dijkstra cost; None → use distance_mm
    bw_gbs: float | None = None  # link bandwidth in GB/s; None when the builder sets none (e.g. command edges)
    n_connections: int | None = None  # multi-connection links; single request uses 1 connection
    kind: str = "link"  # edge category; the visualizer keys stroke color and routing style off it
@dataclass
class ViewGraph:
    """A self-contained graph for one diagram level, with its canvas size."""

    name: str  # "system" | "sip" | "cube" | "pe"
    nodes: dict[str, Node]  # keyed by node id; the visualizer draws nodes in insertion order
    edges: list[Edge]  # drawn in list order, before the nodes
    width_mm: float  # canvas width in mm
    height_mm: float  # canvas height in mm
@dataclass
class TopologyGraph:
    """Compiled topology: the flat instantiated graph plus per-level view projections."""

    spec: dict[str, Any]  # the topology spec this graph was built from
    # Full instantiated flat graph (used by sim_engine)
    nodes: dict[str, Node] = field(default_factory=dict)
    edges: list[Edge] = field(default_factory=list)
    # Representative view projections (used by visualizer)
    # A view left as None is skipped by emit_diagrams.
    system_view: ViewGraph | None = None
    sip_view: ViewGraph | None = None
    cube_view: ViewGraph | None = None
    pe_view: ViewGraph | None = None
@dataclass(frozen=True)
class TopologyHandle:
    """Immutable pairing of a topology file path with its compiled graph, if any."""

    path: Path  # location of the topology file
    topology_obj: TopologyGraph | None  # None until _compile_graph is implemented
+367
View File
@@ -0,0 +1,367 @@
# kernbench/topology/visualizer.py
"""
SVG diagram generator for TopologyGraph views.
Produces mm-accurate, deterministic SVG files for each view level
(system, SIP, cube, PE) per ADR-0005 and ADR-0006.
"""
from __future__ import annotations
from pathlib import Path
from .types import Edge, Node, TopologyGraph, ViewGraph
# ── Color palette by component kind ─────────────────────────────────
# Fill color per Node.kind. Kinds missing from this table are drawn with the
# fallback fill chosen in _draw_node.
_KIND_COLORS: dict[str, str] = {
    "switch": "#6366f1",  # indigo
    "sip": "#e0e7ff",  # light indigo
    "iochiplet": "#0ea5e9",  # sky blue
    "pcie_ep": "#0ea5e9",
    "io_cpu": "#0ea5e9",
    "ucie_port": "#3b82f6",  # blue
    "noc": "#a78bfa",  # purple
    "m_cpu": "#f59e0b",  # amber
    "xbar": "#f97316",  # orange
    "hbm_ctrl": "#10b981",  # emerald
    "pe": "#94a3b8",  # slate
    "pe_cpu": "#ef4444",  # red
    "pe_scheduler": "#f59e0b",  # amber
    "pe_dma": "#3b82f6",  # blue
    "pe_gemm": "#8b5cf6",  # violet
    "pe_math": "#ec4899",  # pink
    "pe_tcm": "#10b981",  # emerald
    "sram": "#f59e0b",  # amber
    "cube": "#cbd5e1",  # slate-300
}
# Stroke color per Edge.kind. Kinds missing from this table (for example
# "xbar_chain" and "cube_to_io") are drawn with the fallback stroke chosen in
# _draw_edge.
_EDGE_COLORS: dict[str, str] = {
    "pcie": "#6366f1",
    "io_internal": "#0ea5e9",
    "io_to_cube": "#0ea5e9",
    "ucie_mesh": "#3b82f6",
    "pe_to_xbar": "#f97316",
    "xbar_to_hbm": "#10b981",
    "xbar_to_bridge": "#a78bfa",
    "bridge_to_xbar": "#a78bfa",
    "noc_to_ucie": "#a78bfa",
    "pe_to_noc": "#a78bfa",
    "noc_to_sram": "#f59e0b",
    "command": "#f59e0b",
    "pe_internal": "#94a3b8",
}
# ── Node sizing ──────────────────────────────────────────────────────
# Box size used for any node kind without an entry in _KIND_SIZE.
_DEFAULT_NODE_W = 2.0  # mm
_DEFAULT_NODE_H = 1.2  # mm
# Per-kind (width_mm, height_mm) overrides; _compute_node_sizes converts them to px.
_KIND_SIZE: dict[str, tuple[float, float]] = {
    "sip": (60.0, 50.0),
    "cube": (6.0, 4.0),
    "iochiplet": (4.0, 1.5),
    "switch": (5.0, 1.5),
}
# ── Public API ───────────────────────────────────────────────────────
def emit_diagrams(graph: TopologyGraph, out_dir: Path) -> list[Path]:
    """Write one SVG file per populated view of ``graph`` into ``out_dir``.

    The directory is created if needed. Views that are ``None`` are skipped;
    the rest are written as ``<view attribute name>.svg`` in the order system,
    SIP, cube, PE.

    Args:
        graph: Compiled topology whose view projections are rendered.
        out_dir: Destination directory.

    Returns:
        Paths of the files written, in write order.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    # Snapshot all four views up front, before any file is written.
    candidates = (
        ("system_view", graph.system_view),
        ("sip_view", graph.sip_view),
        ("cube_view", graph.cube_view),
        ("pe_view", graph.pe_view),
    )

    written: list[Path] = []
    for stem, view in candidates:
        if view is not None:
            target = out_dir / f"{stem}.svg"
            target.write_text(_render_view_svg(view), encoding="utf-8")
            written.append(target)
    return written
# ── SVG rendering ────────────────────────────────────────────────────
def _render_view_svg(view: ViewGraph) -> str:
    """Render a ViewGraph to a complete SVG document string.

    Layers are emitted back to front: background, title, cube-only decorations
    (die outline and HBM block), edges, then nodes so that nodes cover edge
    ends.

    Args:
        view: The view to render. Edges whose ``src`` or ``dst`` is not a node
            of the view are skipped.

    Returns:
        The SVG markup, one element per line.
    """
    scale = _pick_scale(view)
    pad = 40  # px padding
    node_sizes = _compute_node_sizes(view, scale)
    # Canvas size in px
    w_px = int(view.width_mm * scale + 2 * pad)
    h_px = int(view.height_mm * scale + 2 * pad)
    parts: list[str] = []
    parts.append(_svg_header(w_px, h_px, view.name))
    # Background
    parts.append(f' <rect width="{w_px}" height="{h_px}" fill="#f8fafc"/>')
    # Title. The view name is free-form text, so escape it before placing it in
    # element content; a raw "&" or "<" would make the document malformed.
    title_text = _escape(view.name.upper())
    parts.append(
        f' <text x="{w_px // 2}" y="18" text-anchor="middle" '
        f'font-family="monospace" font-size="14" font-weight="bold" fill="#1e293b">'
        f'{title_text} VIEW</text>'
    )
    # Special: draw cube boundary + HBM block background in cube view
    if view.name == "cube":
        _draw_cube_boundary(parts, view, scale, pad)
        _draw_hbm_block(parts, view, scale, pad)
    # Edges (draw before nodes so nodes are on top)
    # Track fan-out edges to assign per-edge offsets
    fanout_counter: dict[str, int] = {}
    for edge in view.edges:
        if edge.src in view.nodes and edge.dst in view.nodes:
            _draw_edge(parts, edge, view, node_sizes, scale, pad, fanout_counter)
    # Nodes
    for node in view.nodes.values():
        _draw_node(parts, node, node_sizes, scale, pad)
    parts.append("</svg>")
    return "\n".join(parts)
def _pick_scale(view: ViewGraph) -> float:
"""Pixels per mm, chosen per view type."""
return {
"system": 4.0,
"sip": 8.0,
"cube": 28.0,
"pe": 35.0,
}.get(view.name, 10.0)
def _compute_node_sizes(
    view: ViewGraph, scale: float,
) -> dict[str, tuple[float, float]]:
    """Return the drawn ``(w_px, h_px)`` box size for every node of ``view``.

    Size in mm is chosen as follows, then multiplied by ``scale``:
      * PE view: every node is 2.5 x 1.4.
      * Cube view: nodes of kind ``"pe"`` are 1.8 x 1.0.
      * Otherwise: the per-kind table, falling back to the default box.
    """
    is_pe_view = view.name == "pe"
    is_cube_view = view.name == "cube"

    def box_mm(kind: str) -> tuple[float, float]:
        if is_pe_view:
            return 2.5, 1.4
        if is_cube_view and kind == "pe":
            return 1.8, 1.0
        return _KIND_SIZE.get(kind, (_DEFAULT_NODE_W, _DEFAULT_NODE_H))

    result: dict[str, tuple[float, float]] = {}
    for node_id, node in view.nodes.items():
        w_mm, h_mm = box_mm(node.kind)
        result[node_id] = (w_mm * scale, h_mm * scale)
    return result
def _svg_header(w: int, h: int, title: str) -> str:
return (
f'<svg xmlns="http://www.w3.org/2000/svg" '
f'width="{w}" height="{h}" viewBox="0 0 {w} {h}">\n'
f' <title>{title}</title>'
)
def _draw_cube_boundary(
parts: list[str], view: ViewGraph, scale: float, pad: int,
) -> None:
"""Draw the cube die outline as a dashed rectangle."""
bx = pad
by = pad
bw = view.width_mm * scale
bh = view.height_mm * scale
parts.append(
f' <rect x="{bx:.1f}" y="{by:.1f}" '
f'width="{bw:.1f}" height="{bh:.1f}" '
f'rx="6" fill="none" stroke="#475569" stroke-width="2" '
f'stroke-dasharray="8,4"/>'
)
def _draw_hbm_block(
parts: list[str], view: ViewGraph, scale: float, pad: int,
) -> None:
"""Draw HBM area as a filled rectangle in cube view."""
# HBM area: centered at (8.5, 7.0), size 9x5 -> x=[4.0,13.0], y=[4.5,9.5]
hbm_x = 4.0 * scale + pad
hbm_y = 4.5 * scale + pad
hbm_w = 9.0 * scale
hbm_h = 5.0 * scale
parts.append(
f' <rect x="{hbm_x:.1f}" y="{hbm_y:.1f}" '
f'width="{hbm_w:.1f}" height="{hbm_h:.1f}" '
f'rx="4" fill="#d1fae5" stroke="#10b981" stroke-width="1.5" '
f'stroke-dasharray="6,3" opacity="0.5"/>'
)
cx = 8.5 * scale + pad
cy = 8.5 * scale + pad
parts.append(
f' <text x="{cx:.1f}" y="{cy:.1f}" text-anchor="middle" '
f'font-family="monospace" font-size="11" fill="#047857" opacity="0.7">'
f'HBM</text>'
)
def _draw_node(
    parts: list[str],
    node: Node,
    sizes: dict[str, tuple[float, float]],
    scale: float,
    pad: int,
) -> None:
    """Append one node (rounded rectangle plus centred label) to ``parts``.

    Nodes without a position are skipped. The box is centred on the node's
    position; the fill comes from the kind palette and the label color is
    chosen for contrast against it. The label text is XML-escaped.
    """
    position = node.pos_mm
    if position is None:
        return

    center_x = position[0] * scale + pad
    center_y = position[1] * scale + pad
    box_w, box_h = sizes.get(node.id, (40, 24))
    left = center_x - box_w / 2
    top = center_y - box_h / 2

    fill = _KIND_COLORS.get(node.kind, "#e2e8f0")
    if _is_dark(fill):
        text_color = "#ffffff"
    else:
        text_color = "#1e293b"

    parts.append(
        f' <rect x="{left:.1f}" y="{top:.1f}" width="{box_w:.1f}" height="{box_h:.1f}" '
        f'rx="4" fill="{fill}" stroke="#475569" stroke-width="1"/>'
    )

    caption = node.label or node.id
    font_size = _label_font_size(box_w, caption)
    # The +4 px nudges the text baseline so the label sits visually centred.
    parts.append(
        f' <text x="{center_x:.1f}" y="{center_y + 4:.1f}" text-anchor="middle" '
        f'font-family="monospace" font-size="{font_size}" fill="{text_color}">'
        f'{_escape(caption)}</text>'
    )
# ── Fan-out edge kinds that need offset routing ─────────────────────
# In the cube view, _draw_edge routes these kinds through a staggered
# horizontal channel, with one offset slot per (kind, destination) group.
_FANOUT_KINDS = {"pe_to_xbar", "pe_to_noc", "command", "noc_to_ucie"}
def _draw_edge(
    parts: list[str],
    edge: Edge,
    view: ViewGraph,
    sizes: dict[str, tuple[float, float]],
    scale: float,
    pad: int,
    fanout_counter: dict[str, int],
) -> None:
    """Append the SVG for one edge (path plus optional distance label) to ``parts``.

    Routing rules:
      * Fan-out kinds in the cube view use a 3-segment path through a
        horizontal channel whose y is staggered per (kind, destination) group;
        ``fanout_counter`` is updated with the slot taken.
      * Other edges whose endpoints differ by more than 1 px on both axes use
        an L-bend: vertical-first in the PE view when the source is in the
        left half of the canvas, horizontal-first otherwise.
      * Everything else is a straight line.

    A label with the distance (and the bandwidth, when set) is added only for
    edges with ``distance_mm > 0``. Edges with an unpositioned endpoint are
    skipped. ``sizes`` is accepted but not used.
    """
    src_node = view.nodes[edge.src]
    dst_node = view.nodes[edge.dst]
    if src_node.pos_mm is None or dst_node.pos_mm is None:
        return

    x1 = src_node.pos_mm[0] * scale + pad
    y1 = src_node.pos_mm[1] * scale + pad
    x2 = dst_node.pos_mm[0] * scale + pad
    y2 = dst_node.pos_mm[1] * scale + pad

    kind = edge.kind
    color = _EDGE_COLORS.get(kind, "#94a3b8")
    width = "1.5" if kind == "pe_internal" else "1"
    opacity = "0.6" if kind in ("command", "noc_to_ucie") else "0.8"
    stroke = f'stroke="{color}" stroke-width="{width}" opacity="{opacity}"'

    def add_polyline(*points: tuple[float, float]) -> None:
        coords = " ".join(f"{px:.1f},{py:.1f}" for px, py in points)
        parts.append(f' <polyline points="{coords}" fill="none" {stroke}/>')

    def add_label(lx: float, ly: float) -> None:
        # Zero-length (logical) links carry no label.
        if edge.distance_mm > 0:
            text = f"{edge.distance_mm:.1f}mm"
            if edge.bw_gbs:
                text += f" {edge.bw_gbs:.0f}GB/s"
            parts.append(
                f' <text x="{lx:.1f}" y="{ly:.1f}" text-anchor="middle" '
                f'font-family="monospace" font-size="7" fill="#64748b">'
                f'{text}</text>'
            )

    if kind in _FANOUT_KINDS and view.name == "cube":
        # Take the next slot in this (kind, destination) group and offset the
        # horizontal channel by 10 px per slot around the midpoint.
        group_key = f"{kind}:{edge.dst}"
        slot = fanout_counter.get(group_key, 0)
        fanout_counter[group_key] = slot + 1
        channel_y = (y1 + y2) / 2 + (slot - 1.5) * 10
        add_polyline((x1, y1), (x1, channel_y), (x2, channel_y), (x2, y2))
        add_label((x1 + x2) / 2, channel_y - 3)
        return

    if abs(x2 - x1) > 1 and abs(y2 - y1) > 1:
        # L-bend; only the PE view ever bends vertical-first.
        vertical_first = (
            view.name == "pe" and src_node.pos_mm[0] < view.width_mm / 2
        )
        elbow = (x1, y2) if vertical_first else (x2, y1)
        add_polyline((x1, y1), elbow, (x2, y2))
    else:
        parts.append(
            f' <line x1="{x1:.1f}" y1="{y1:.1f}" x2="{x2:.1f}" y2="{y2:.1f}" '
            f'{stroke}/>'
        )

    add_label((x1 + x2) / 2, (y1 + y2) / 2 - 4)
# ── Helpers ──────────────────────────────────────────────────────────
def _is_dark(hex_color: str) -> bool:
"""Check if a hex color is dark (for white text)."""
h = hex_color.lstrip("#")
r, g, b = int(h[0:2], 16), int(h[2:4], 16), int(h[4:6], 16)
return (r * 0.299 + g * 0.587 + b * 0.114) < 140
def _label_font_size(box_width: float, label: str) -> int:
"""Choose font size to fit label in box."""
char_w = len(label) * 7
if char_w > box_width * 0.9:
return max(7, int(box_width * 0.9 / len(label) * 1.4))
return 10
def _escape(text: str) -> str:
"""Escape XML special characters."""
return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
+11
View File
@@ -0,0 +1,11 @@
"""Triton emulator: fake tl module for kernel performance simulation.
Provides TLContext (the fake `tl` parameter) that kernels use to express
memory access patterns and compute operations. Kernel functions are plain
Python — no yield, no async — and generate a PeCommand trace that PE_CPU
replays through SimPy.
Usage:
from kernbench.triton_emu.registry import register_kernel, get_kernel
from kernbench.triton_emu.tl_context import TLContext
"""
+30
View File
@@ -0,0 +1,30 @@
"""Kernel registry: maps kernel names to Python callable generators.
Benchmarks register kernel functions here; PE_CPU looks them up by
KernelRef.name at execution time.
"""
from __future__ import annotations
from collections.abc import Callable
from typing import Any
# Process-wide table of registered kernels, keyed by kernel name.
_kernels: dict[str, Callable[..., None]] = {}
def register_kernel(name: str, fn: Callable[..., None]) -> None:
    """Add ``fn`` to the registry under ``name``.

    Raises:
        ValueError: If ``name`` is already taken; an existing entry is never
            overwritten.
    """
    name_is_free = name not in _kernels
    if not name_is_free:
        raise ValueError(f"kernel '{name}' already registered")
    _kernels[name] = fn
def get_kernel(name: str) -> Callable[..., None]:
    """Return the kernel function registered under ``name``.

    Raises:
        KeyError: If no kernel has been registered under ``name``.
    """
    try:
        return _kernels[name]
    except KeyError:
        # Re-raise with a descriptive message instead of the bare key.
        raise KeyError(f"kernel '{name}' not registered") from None
def clear_registry() -> None:
    """Clear all registered kernels (for testing).

    Empties the shared registry dict in place, so existing references to it
    see the cleared state too.
    """
    _kernels.clear()
+356
View File
@@ -0,0 +1,356 @@
"""TLContext: fake Triton Language module for kernel performance simulation.
Passed as the `tl` parameter to kernel functions. Each API call records a
PeCommand in the internal trace. After the kernel returns, PE_CPU replays
the command list through SimPy.
Kernel code looks like standard Python — no yield, no async:
def my_kernel(a_ptr, b_ptr, out_ptr, tl):
pid = tl.program_id(0)
a = tl.load(a_ptr, shape=(32, 64), dtype="f16")
b = tl.load(b_ptr + pid * stride, shape=(64, 32), dtype="f16")
tl.composite(op="gemm", a=a, b=b, out_ptr=out_ptr)
"""
from __future__ import annotations
import math
from typing import Literal
from kernbench.common.pe_commands import (
CompletionHandle,
CompositeCmd,
DmaReadCmd,
DmaWriteCmd,
GemmCmd,
MathCmd,
PeCommand,
PeCpuOverheadCmd,
TensorHandle,
WaitCmd,
)
# Element size in bytes per dtype name. TLContext._dtype_bytes falls back to
# 2 bytes for any dtype name not listed here.
_DTYPE_BYTES: dict[str, int] = {
    "f16": 2, "f32": 4, "f64": 8,
    "bf16": 2,
    "i8": 1, "i16": 2, "i32": 4, "i64": 8,
    "u8": 1, "u16": 2, "u32": 4, "u64": 8,
}
class TLContext:
"""Fake Triton Language context.
Args:
pe_id: program instance index (returned by program_id).
num_programs: total number of program instances.
dispatch_cycles: PE_CPU overhead per tl API call (auto-inserted).
"""
def __init__(
self,
pe_id: int = 0,
num_programs: int = 1,
dispatch_cycles: int = 1,
) -> None:
self._pe_id = pe_id
self._num_programs = num_programs
self._dispatch_cycles = dispatch_cycles
self._commands: list[PeCommand] = []
self._handle_counter = 0
self._completion_counter = 0
@property
def commands(self) -> list[PeCommand]:
    """Return the recorded command trace.

    This is the internal list itself, not a copy, so later API calls keep
    appending to the object returned here.
    """
    return self._commands
# ── helpers ────────────────────────────────────────────────────
def _next_handle_id(self) -> str:
self._handle_counter += 1
return f"t{self._handle_counter}"
def _next_completion_id(self) -> str:
self._completion_counter += 1
return f"c{self._completion_counter}"
def _dtype_bytes(self, dtype: str) -> int:
    """Return the element size in bytes for ``dtype``.

    Dtype names missing from the size table are treated as 2 bytes wide
    rather than rejected.
    """
    try:
        return _DTYPE_BYTES[dtype]
    except KeyError:
        return 2
def _nbytes(self, shape: tuple[int, ...], dtype: str) -> int:
return math.prod(shape) * self._dtype_bytes(dtype)
def _emit_dispatch_overhead(self) -> None:
if self._dispatch_cycles > 0:
self._commands.append(PeCpuOverheadCmd(cycles=self._dispatch_cycles))
def _make_handle(
    self, pa: int, shape: tuple[int, ...], dtype: str,
) -> TensorHandle:
    """Create a TensorHandle with a fresh id and a computed byte size.

    The id is taken before the size is computed, so the handle counter
    advances even if the size computation fails.
    """
    handle_id = self._next_handle_id()
    size = self._nbytes(shape, dtype)
    return TensorHandle(
        id=handle_id,
        pa=pa, shape=shape, dtype=dtype,
        nbytes=size,
    )
# ── Reference (no DMA, metadata only) ────────────────────────
def ref(
    self, ptr: int, shape: tuple[int, ...], dtype: str = "f16",
) -> TensorHandle:
    """Create a TensorHandle referencing HBM data without issuing DMA.

    Used when the scheduler will stream data per-tile (e.g., tensor b
    in a composite GEMM). No command is generated, and no dispatch
    overhead is recorded either.

    Args:
        ptr: Address stored as the handle's ``pa``.
        shape: Tensor shape.
        dtype: Element type name.

    Returns:
        A new handle with a fresh id.
    """
    return self._make_handle(pa=ptr, shape=shape, dtype=dtype)
# ── Data Movement (blocking, DMA engine) ──────────────────────
def load(
    self, ptr: int, shape: tuple[int, ...], dtype: str = "f16",
) -> TensorHandle:
    """Record a DMA read of a tensor from HBM into TCM.

    Records the dispatch overhead first, then a ``DmaReadCmd`` sized from
    ``shape`` and ``dtype``.

    Args:
        ptr: Source address.
        shape: Tensor shape.
        dtype: Element type name.

    Returns:
        The handle describing the loaded tensor.
    """
    self._emit_dispatch_overhead()
    loaded = self._make_handle(pa=ptr, shape=shape, dtype=dtype)
    read_cmd = DmaReadCmd(handle=loaded, src_pa=ptr, nbytes=loaded.nbytes)
    self._commands.append(read_cmd)
    return loaded
def store(self, ptr: int, handle: TensorHandle) -> None:
    """Record a DMA write of ``handle``'s tensor from TCM to HBM at ``ptr``.

    Records the dispatch overhead first, then a ``DmaWriteCmd`` covering the
    handle's full byte size.
    """
    self._emit_dispatch_overhead()
    write_cmd = DmaWriteCmd(handle=handle, dst_pa=ptr, nbytes=handle.nbytes)
    self._commands.append(write_cmd)
# ── GEMM Engine (blocking) ────────────────────────────────────
def dot(self, a: TensorHandle, b: TensorHandle) -> TensorHandle:
    """Record a matrix multiply ``out = a @ b`` on the GEMM engine.

    The last two dimensions are the matrix dimensions: a is (..., M, K),
    b is (..., K, N), and the result is (..., M, N) using a's leading
    dimensions and a's dtype.

    Raises:
        ValueError: If either operand has fewer than two dimensions, or the
            inner (K) dimensions differ.
    """
    if len(a.shape) < 2 or len(b.shape) < 2:
        raise ValueError("dot requires 2D tensors")

    *leading, m, k = a.shape
    k2, n = b.shape[-2], b.shape[-1]
    if k2 != k:
        raise ValueError(f"dot shape mismatch: a.K={k} != b.K={k2}")

    # The output handle is created before the overhead command is recorded.
    product = self._make_handle(pa=0, shape=(*leading, m, n), dtype=a.dtype)
    self._emit_dispatch_overhead()
    self._commands.append(GemmCmd(a=a, b=b, out=product, m=m, k=k, n=n))
    return product
# ── MATH Engine: unary (blocking) ─────────────────────────────
def _unary_math(self, op: str, x: TensorHandle) -> TensorHandle:
    """Record a one-input MATH engine command and return its output handle.

    The output has the same shape and dtype as ``x``.
    """
    result = self._make_handle(pa=0, shape=x.shape, dtype=x.dtype)
    self._emit_dispatch_overhead()
    math_cmd = MathCmd(op=op, inputs=(x,), out=result)
    self._commands.append(math_cmd)
    return result
def exp(self, x: TensorHandle) -> TensorHandle:
    """Record a unary MATH command with op "exp"; the output matches x's shape and dtype."""
    return self._unary_math("exp", x)
def log(self, x: TensorHandle) -> TensorHandle:
    """Record a unary MATH command with op "log"; the output matches x's shape and dtype."""
    return self._unary_math("log", x)
def sqrt(self, x: TensorHandle) -> TensorHandle:
    """Record a unary MATH command with op "sqrt"; the output matches x's shape and dtype."""
    return self._unary_math("sqrt", x)
def abs(self, x: TensorHandle) -> TensorHandle:
    """Record a unary MATH command with op "abs"; the output matches x's shape and dtype."""
    return self._unary_math("abs", x)
def sigmoid(self, x: TensorHandle) -> TensorHandle:
    """Record a unary MATH command with op "sigmoid"; the output matches x's shape and dtype."""
    return self._unary_math("sigmoid", x)
def cos(self, x: TensorHandle) -> TensorHandle:
    """Record a unary MATH command with op "cos"; the output matches x's shape and dtype."""
    return self._unary_math("cos", x)
def sin(self, x: TensorHandle) -> TensorHandle:
    """Record a unary MATH command with op "sin"; the output matches x's shape and dtype."""
    return self._unary_math("sin", x)
# ── MATH Engine: reduction (blocking) ─────────────────────────
def _reduction(
    self, op: str, x: TensorHandle, axis: int,
) -> TensorHandle:
    """Record a reduction over ``axis`` on the MATH engine.

    The reduced axis is kept with length 1, so the output has the same
    number of dimensions as ``x``.

    Raises:
        IndexError: If ``axis`` is out of range for ``x.shape``.
    """
    reduced_shape = [*x.shape]
    reduced_shape[axis] = 1
    result = self._make_handle(pa=0, shape=tuple(reduced_shape), dtype=x.dtype)
    self._emit_dispatch_overhead()
    math_cmd = MathCmd(op=op, inputs=(x,), out=result, axis=axis)
    self._commands.append(math_cmd)
    return result
def sum(self, x: TensorHandle, axis: int) -> TensorHandle:
    """Record a "sum" reduction over ``axis``; the reduced axis is kept with length 1."""
    return self._reduction("sum", x, axis)
def max(self, x: TensorHandle, axis: int) -> TensorHandle:
    """Record a "max" reduction over ``axis``; the reduced axis is kept with length 1."""
    return self._reduction("max", x, axis)
def min(self, x: TensorHandle, axis: int) -> TensorHandle:
    """Record a "min" reduction over ``axis``; the reduced axis is kept with length 1."""
    return self._reduction("min", x, axis)
# ── MATH Engine: binary (blocking) ────────────────────────────
def _binary_math(
    self, op: str, a: TensorHandle, b: TensorHandle,
) -> TensorHandle:
    """Queue a two-operand MATH command and return its output handle.

    The output handle takes its shape and dtype from ``a``; ``b`` is not
    consulted for shape here.
    """
    result = self._make_handle(pa=0, shape=a.shape, dtype=a.dtype)
    self._emit_dispatch_overhead()
    command = MathCmd(op=op, inputs=(a, b), out=result)
    self._commands.append(command)
    return result
def where(
    self, cond: TensorHandle, a: TensorHandle, b: TensorHandle,
) -> TensorHandle:
    """Queue a ``"where"`` MATH command over ``(cond, a, b)``.

    The output handle takes its shape and dtype from ``a``.
    """
    result = self._make_handle(pa=0, shape=a.shape, dtype=a.dtype)
    self._emit_dispatch_overhead()
    command = MathCmd(op="where", inputs=(cond, a, b), out=result)
    self._commands.append(command)
    return result
# ── Index / Scalar (PE_CPU, no engine) ────────────────────────
def program_id(self, axis: int = 0) -> int:
    """Return this program instance's index.

    ``axis`` is accepted but not used; the stored PE id is returned for any axis.
    """
    pe_index = self._pe_id
    return pe_index
def num_programs(self, axis: int = 0) -> int:
    """Return the total number of program instances.

    ``axis`` is accepted but not used; the same count is returned for any axis.
    """
    total = self._num_programs
    return total
def arange(self, start: int, end: int, dtype: str = "i32") -> TensorHandle:
    """Create index range tensor in TCM.

    Only a handle of shape ``(end - start,)`` is created; no command is
    appended to the command list.

    Raises:
        ValueError: if ``end < start``, which would otherwise produce a
            handle with a negative dimension.
    """
    n = end - start
    if n < 0:
        raise ValueError(
            f"arange requires end >= start, got start={start}, end={end}"
        )
    return self._make_handle(pa=0, shape=(n,), dtype=dtype)
def zeros(self, shape: tuple[int, ...], dtype: str = "f16") -> TensorHandle:
    """Create zero-filled tensor in TCM.

    Only a handle is created; no command is appended to the command list.
    """
    handle = self._make_handle(pa=0, shape=shape, dtype=dtype)
    return handle
def full(
    self, shape: tuple[int, ...], value: float | int, dtype: str = "f16",
) -> TensorHandle:
    """Create constant-filled tensor in TCM.

    Only a handle is created; ``value`` is not passed on to the handle and
    no command is appended to the command list.
    """
    handle = self._make_handle(pa=0, shape=shape, dtype=dtype)
    return handle
# ── Metadata (no compute, no DMA) ─────────────────────────────
def trans(self, x: TensorHandle) -> TensorHandle:
    """Transpose the last two dimensions — metadata only, no command generated.

    The returned handle reuses ``x``'s id, pa, dtype, nbytes and data; only
    the shape differs.

    Raises:
        ValueError: if ``x`` has fewer than two dimensions.
    """
    dims = x.shape
    if len(dims) < 2:
        raise ValueError("trans requires at least 2D tensor")
    swapped = (*dims[:-2], dims[-1], dims[-2])
    return TensorHandle(
        id=x.id,
        pa=x.pa,
        shape=swapped,
        dtype=x.dtype,
        nbytes=x.nbytes,
        data=x.data,
    )
# ── Composite + Control ───────────────────────────────────────
def composite(
    self,
    op: Literal["gemm", "math"],
    a: TensorHandle,
    b: TensorHandle | None = None,
    out_ptr: int = 0,
    math_op: str | None = None,
) -> CompletionHandle:
    """Submit a composite command (non-blocking, tiled pipeline).

    For ``op == "gemm"`` the output size is ``M * N * dtype_bytes(a.dtype)``
    with ``M = a.shape[-2]`` and ``N = b.shape[-1]``; for any other op the
    output size equals ``a.nbytes``.

    Returns:
        CompletionHandle for use with wait().

    Raises:
        ValueError: if ``op == "gemm"`` and ``b`` is missing, or if the inner
            dimensions of ``a`` and ``b`` disagree.
    """
    if op == "gemm":
        # A GEMM without a second operand used to fall through to the
        # math sizing branch silently; reject it up front instead.
        if b is None:
            raise ValueError("composite gemm requires operand b")
        m, k = a.shape[-2], a.shape[-1]
        # Same inner-dimension check as dot(); skipped for 1-D b so that
        # inputs accepted before this check are not newly rejected.
        if len(b.shape) >= 2 and b.shape[-2] != k:
            raise ValueError(
                f"composite gemm shape mismatch: a.K={k} != b.K={b.shape[-2]}"
            )
        n = b.shape[-1]
        out_nbytes = m * n * self._dtype_bytes(a.dtype)
    else:
        out_nbytes = a.nbytes
    completion = CompletionHandle(id=self._next_completion_id())
    self._emit_dispatch_overhead()
    self._commands.append(CompositeCmd(
        completion=completion, op=op,
        a=a, b=b, out_pa=out_ptr, out_nbytes=out_nbytes,
        math_op=math_op,
    ))
    return completion
def wait(self, handle: CompletionHandle | None = None) -> None:
    """Wait for a specific composite or, when ``handle`` is None, all pending composites.

    Appends a ``WaitCmd`` to the command list; no dispatch overhead is emitted.
    """
    command = WaitCmd(handle=handle)
    self._commands.append(command)
def cycles(self, n: int) -> None:
    """Declare PE_CPU scalar execution overhead of ``n`` cycles.

    Appends a ``PeCpuOverheadCmd`` to the command list.
    """
    command = PeCpuOverheadCmd(cycles=n)
    self._commands.append(command)
# ── TensorHandle arithmetic operators ─────────────────────────────
# Enables: a + b, a * b, a - b, a / b in kernel code.
# Each operator creates a MathCmd through the currently active TLContext.
# The active context is stored in a thread-local and is set by run_kernel().
def _enable_tensor_ops() -> None:
    """Install arithmetic operators on TensorHandle.

    Invoked once at module load. Each operator looks up the active TLContext
    from a thread-local slot and delegates to its ``_binary_math``. The slot
    accessors are exposed as ``TLContext._set_active`` / ``TLContext._get_active``.
    """
    import threading

    state = threading.local()

    def set_active_context(ctx: TLContext | None) -> None:
        state.ctx = ctx

    def get_active_context() -> TLContext:
        active = getattr(state, "ctx", None)
        if active is None:
            raise RuntimeError("TensorHandle ops require an active TLContext")
        return active

    def _binop(op: str):
        def method(self: TensorHandle, other: TensorHandle) -> TensorHandle:
            return get_active_context()._binary_math(op, self, other)
        return method

    # Operator dunder -> MathCmd op name
    operator_table = (
        ("__add__", "add"),
        ("__sub__", "sub"),
        ("__mul__", "mul"),
        ("__truediv__", "div"),
    )
    for dunder, op_name in operator_table:
        setattr(TensorHandle, dunder, _binop(op_name))

    # Expose context management on TLContext
    TLContext._set_active = staticmethod(set_active_context)  # type: ignore[attr-defined]
    TLContext._get_active = staticmethod(get_active_context)  # type: ignore[attr-defined]
_enable_tensor_ops()
def run_kernel(kernel_fn, tl_ctx: TLContext, *args, **kwargs) -> list[PeCommand]:
    """Run ``kernel_fn`` under ``tl_ctx`` and return the commands it recorded.

    ``tl_ctx`` is installed as the active context (needed by the TensorHandle
    operators) for the duration of the call, and is passed to the kernel as the
    ``tl`` keyword argument. The active context is cleared afterwards even if
    the kernel raises.
    """
    TLContext._set_active(tl_ctx)  # type: ignore[attr-defined]
    try:
        kernel_fn(*args, tl=tl_ctx, **kwargs)
    finally:
        # Always reset so a failing kernel cannot leave a stale context behind.
        TLContext._set_active(None)  # type: ignore[attr-defined]
    recorded = tl_ctx.commands
    return recorded
+22
View File
@@ -0,0 +1,22 @@
import kernbench.cli.main as cli_main
def test_cli_main_arg_parsing(monkeypatch):
    """`run` sub-command arguments reach the handler with the expected values."""
    def fake_cmd_run(args) -> int:
        assert args.cmd == "run"
        assert args.topology == "topology.yaml"
        assert args.bench == "qkv_gemm"
        # Identity check: `== None` can be fooled by a custom __eq__ (PEP 8).
        assert args.device is None
        return 0

    # monkey patch the handler to test arg parsing without running the actual bench
    monkeypatch.setattr(cli_main, "cmd_run", fake_cmd_run)
    rc = cli_main.main(["run", "--topology", "topology.yaml", "--bench", "qkv_gemm"])
    assert rc == 0
def test_cli_main():
    """Running the `run` sub-command through the real handler exits with code 0."""
    argv = ["run", "--topology", "topology.yaml", "--bench", "qkv_gemm"]
    exit_code = cli_main.main(argv)
    assert exit_code == 0
+187
View File
@@ -0,0 +1,187 @@
"""Tests for the SimPy component model and DI registry (ADR-0007 D3).
Phase 1 verification: all tests FAIL until Phase 2 implements production code.
Latency invariant after refactor:
total_ns = Σ(wire propagation) + Σ(component.run() overhead_ns) + nbytes / bottleneck_bw
This is identical to the current formula for Phase 0 (no contention).
"""
import pytest
import simpy
from pathlib import Path
from kernbench.components.base import ComponentBase, ComponentRegistry
from kernbench.components.impls.forwarding import TransitComponent
from kernbench.policy.address.phyaddr import PhysAddr
from kernbench.runtime_api.kernel import MemoryReadMsg
from kernbench.sim_engine.engine import GraphEngine
from kernbench.topology.builder import load_topology
from kernbench.topology.types import Node
TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
def _graph():
    """Load the topology graph from ``TOPOLOGY_PATH``."""
    graph = load_topology(TOPOLOGY_PATH)
    return graph
def _hbm_pa(pe_id: int = 0) -> int:
    """Return the encoded HBM address at offset 0x1000 of ``pe_id``'s slice.

    Always targets rack 0 / sip 0 / cube 0; the slice size is 48 GiB split 8 ways.
    """
    per_slice_bytes = (48 * (1 << 30)) // 8
    address = PhysAddr.pe_hbm_addr(
        rack_id=0,
        sip_id=0,
        cube_id=0,
        pe_id=pe_id,
        pe_local_hbm_offset=0x1000,
        slice_size_bytes=per_slice_bytes,
    )
    return address.encode()
def _node(impl: str, overhead_ns: float = 0.0) -> Node:
    """Build a throwaway xbar-kind Node with the given impl and overhead."""
    attrs = {"overhead_ns": overhead_ns}
    return Node(id="test", kind="xbar", impl=impl, attrs=attrs, pos_mm=None)
# ── 1. unknown impl → error ──────────────────────────────────────────
def test_registry_unknown_impl_raises_error():
    """An impl string with no registered component raises ValueError."""
    unknown = _node("totally_unknown_v99", overhead_ns=5.0)
    with pytest.raises(ValueError, match="No component registered"):
        ComponentRegistry.create(unknown)
# ── 2. TransitComponent yields exactly overhead_ns via simpy timeout ──
def test_transit_component_yields_overhead_ns():
    """Running a TransitComponent advances the clock by node.attrs['overhead_ns']."""
    component = TransitComponent(_node("xbar_v1", overhead_ns=3.0))
    env = simpy.Environment()

    def drive():
        yield from component.run(env, nbytes=4096)

    env.process(drive())
    env.run()
    assert env.now == pytest.approx(3.0)
def test_transit_component_zero_overhead_ns():
    """With overhead_ns=0 the component still finishes and the clock stays at 0."""
    component = TransitComponent(_node("noc_v1", overhead_ns=0.0))
    env = simpy.Environment()
    finished = []

    def drive():
        yield from component.run(env, nbytes=1024)
        finished.append(True)

    env.process(drive())
    env.run()
    assert finished == [True]
    assert env.now == pytest.approx(0.0)
# ── 3. DI override: custom component is invoked by engine ────────────
def test_engine_component_override_is_called():
    """A component supplied through component_overrides is run by the engine."""
    class SpyXbar(ComponentBase):
        calls = 0  # class-level counter shared by every instance

        def run(self, env, nbytes):
            SpyXbar.calls += 1
            yield env.timeout(0)

    SpyXbar.calls = 0
    engine = GraphEngine(_graph(), component_overrides={"xbar_v1": SpyXbar})
    request = MemoryReadMsg(
        correlation_id="c",
        request_id="r",
        src_sip=0,
        src_cube=0,
        src_pe=0,
        src_pa=_hbm_pa(pe_id=0),
        nbytes=4096,
    )
    handle = engine.submit(request)
    engine.wait(handle)
    # PE0→slice0 path passes through xbar.pe0 (impl=xbar_v1)
    assert SpyXbar.calls > 0
# ── 4. behavior unchanged: total_ns matches existing formula ─────────
def test_engine_component_model_same_latency_as_before():
    """Pin total_ns for a 4096 B read of PE0's local HBM slice (Phase B model).

    Cut-through (wormhole) wire model: wires contribute propagation only.
    Serialization (drain) is computed per path and applied once at the terminal.

    Forward path:
      Path 1: pcie_ep(5.0) + wire(1.0mm=0.01) + io_cpu(10.0)
      Path 2: wire(3.5mm=0.035) + ucie-N(1.0)
              + 2DMeshNOC(ucie-N→m_cpu: Manhattan 10.9mm=0.109) + m_cpu(5.0)
      Path 3 DMA (m_cpu→noc→xbar.pe0→hbm_ctrl.slice0):
              + 2DMeshNOC(m_cpu→xbar.pe0: Manhattan 15.0mm=0.15)
              + xbar.pe0(2.0) + wire(2.5mm=0.025) + hbm_ctrl(0.0)
              + drain_ns(4096/128 = 32.0, bottleneck = noc_to_xbar 128 GB/s)

    Response path (reverse, nbytes=0, drain=0):
      DMA response: hbm_ctrl→xbar.pe0→noc→m_cpu (propagation + xbar overhead_ns)
      Command response: m_cpu→noc→ucie-N→io_cpu (propagation + ucie overhead_ns)

    Total: ~58.648 ns
    """
    engine = GraphEngine(_graph())
    request = MemoryReadMsg(
        correlation_id="c",
        request_id="r",
        src_sip=0,
        src_cube=0,
        src_pe=0,
        src_pa=_hbm_pa(pe_id=0),
        nbytes=4096,
    )
    handle = engine.submit(request)
    engine.wait(handle)
    _, trace = engine.get_completion(handle)
    assert trace["total_ns"] == pytest.approx(58.648, rel=1e-4)
# ── 5. override is scoped: only targeted impl is replaced ────────────
def test_engine_override_is_scoped_to_impl():
    """Overriding xbar_v1 with a zero-latency impl removes exactly 4.0 ns.

    xbar.pe0 has overhead_ns=2.0 and is crossed on both the forward DMA path
    and the reverse response path, so zeroing it saves 2.0 ns × 2 = 4.0 ns.
    Every other component keeps its default behaviour.
    """
    class ZeroXbar(ComponentBase):
        def run(self, env, nbytes):
            yield env.timeout(0)

    graph = _graph()
    engine_default = GraphEngine(graph)
    engine_override = GraphEngine(graph, component_overrides={"xbar_v1": ZeroXbar})
    request = MemoryReadMsg(
        correlation_id="c",
        request_id="r",
        src_sip=0,
        src_cube=0,
        src_pe=0,
        src_pa=_hbm_pa(pe_id=0),
        nbytes=4096,
    )

    handle_default = engine_default.submit(request)
    engine_default.wait(handle_default)
    _, trace_default = engine_default.get_completion(handle_default)

    handle_override = engine_override.submit(request)
    engine_override.wait(handle_override)
    _, trace_override = engine_override.get_completion(handle_override)

    # ZeroXbar removes overhead_ns=2.0 from xbar.pe0 on forward + response = 4.0 ns faster
    assert trace_override["total_ns"] < trace_default["total_ns"]
    saved_ns = trace_default["total_ns"] - trace_override["total_ns"]
    assert saved_ns == pytest.approx(4.0, rel=1e-6)
+405
View File
@@ -0,0 +1,405 @@
import pytest
from pathlib import Path
from kernbench.common.types import Completion, RequestHandle
from kernbench.policy.address.phyaddr import PhysAddr
from kernbench.runtime_api.kernel import (
KernelLaunchMsg,
KernelRef,
MemoryReadMsg,
MemoryWriteMsg,
ScalarArg,
TensorArg,
TensorArgShard,
)
from kernbench.sim_engine.engine import GraphEngine
from kernbench.topology.builder import load_topology
TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
def _engine():
    """Build a fresh GraphEngine over the topology at ``TOPOLOGY_PATH``."""
    return GraphEngine(load_topology(TOPOLOGY_PATH))
def _hbm_pa(sip: int = 0, cube: int = 0, pe_id: int = 0) -> int:
    """Return an encoded HBM physical address inside one PE's HBM slice.

    The address sits at offset 0x1000 of the slice, on rack 0.
    """
    # 48 GB / 8 slices = 6 GB per slice
    per_slice_bytes = (48 * (1 << 30)) // 8
    address = PhysAddr.pe_hbm_addr(
        rack_id=0,
        sip_id=sip,
        cube_id=cube,
        pe_id=pe_id,
        pe_local_hbm_offset=0x1000,
        slice_size_bytes=per_slice_bytes,
    )
    return address.encode()
def _sram_pa(sip: int = 0, cube: int = 0) -> int:
    """Return an encoded SRAM physical address at offset 0x800 of the cube (rack 0)."""
    address = PhysAddr.cube_sram_addr(
        rack_id=0, sip_id=sip, cube_id=cube, sram_offset=0x800,
    )
    return address.encode()
# ── 1. submit returns handle ────────────────────────────────────────
def test_engine_submit_returns_handle():
    """submit() hands back a non-empty string handle."""
    request = MemoryWriteMsg(
        correlation_id="c0",
        request_id="r0",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(),
        nbytes=4096,
        pattern="zero",
    )
    handle = _engine().submit(request)
    assert isinstance(handle, str)
    assert len(handle) > 0
# ── 2. memory write completion ──────────────────────────────────────
def test_engine_memory_write_completion():
    """A MemoryWrite request finishes with ok=True."""
    engine = _engine()
    request = MemoryWriteMsg(
        correlation_id="c0",
        request_id="r1",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(),
        nbytes=4096,
        pattern="zero",
    )
    handle = engine.submit(request)
    engine.wait(handle)
    completion, _ = engine.get_completion(handle)
    assert completion.ok is True
# ── 3. memory read completion ───────────────────────────────────────
def test_engine_memory_read_completion():
    """A MemoryRead request finishes with ok=True."""
    engine = _engine()
    request = MemoryReadMsg(
        correlation_id="c0",
        request_id="r2",
        src_sip=0,
        src_cube=0,
        src_pe=0,
        src_pa=_hbm_pa(),
        nbytes=4096,
    )
    handle = engine.submit(request)
    engine.wait(handle)
    completion, _ = engine.get_completion(handle)
    assert completion.ok is True
# ── 4. latency positive ────────────────────────────────────────────
def test_engine_latency_positive():
    """The trace's total_ns is strictly positive (ADR-0002 D4)."""
    engine = _engine()
    request = MemoryWriteMsg(
        correlation_id="c0",
        request_id="r3",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(),
        nbytes=4096,
        pattern="zero",
    )
    handle = engine.submit(request)
    engine.wait(handle)
    _, trace = engine.get_completion(handle)
    assert trace["total_ns"] > 0
# ── 5. trace has total_ns and nbytes ───────────────────────────────
def test_engine_trace_has_total_ns_and_nbytes():
    """The trace exposes 'total_ns' and 'nbytes', and nbytes echoes the request size."""
    engine = _engine()
    request = MemoryWriteMsg(
        correlation_id="c0",
        request_id="r4",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(),
        nbytes=4096,
        pattern="zero",
    )
    handle = engine.submit(request)
    engine.wait(handle)
    _, trace = engine.get_completion(handle)
    assert "total_ns" in trace
    assert "nbytes" in trace
    assert trace["nbytes"] == 4096
# ── 6. latency includes node overhead_ns ────────────────────────────
def test_engine_latency_includes_node_overhead_ns():
    """Total latency is at least the summed overhead_ns of the components on the path."""
    engine = _engine()
    request = MemoryWriteMsg(
        correlation_id="c0",
        request_id="r7",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(),
        nbytes=4096,
        pattern="zero",
    )
    handle = engine.submit(request)
    engine.wait(handle)
    _, trace = engine.get_completion(handle)
    # pcie_ep (5.0) + io_cpu (10.0) + m_cpu (5.0) = at least 20 ns
    assert trace["total_ns"] >= 20.0
# ── 7. concurrent requests ─────────────────────────────────────────
def test_engine_concurrent_requests():
    """Two requests submitted back-to-back both complete and both carry a trace."""
    engine = _engine()
    first = MemoryWriteMsg(
        correlation_id="c0",
        request_id="r9a",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(),
        nbytes=4096,
        pattern="zero",
    )
    second = MemoryWriteMsg(
        correlation_id="c0",
        request_id="r9b",
        dst_sip=0,
        dst_cube=0,
        dst_pe=1,
        dst_pa=_hbm_pa(pe_id=1),
        nbytes=4096,
        pattern="zero",
    )
    # Submit both before waiting on either.
    handle_first = engine.submit(first)
    handle_second = engine.submit(second)
    engine.wait(handle_first)
    engine.wait(handle_second)

    completion_first, trace_first = engine.get_completion(handle_first)
    completion_second, trace_second = engine.get_completion(handle_second)
    assert completion_first.ok is True
    assert completion_second.ok is True
    assert trace_first["total_ns"] > 0
    assert trace_second["total_ns"] > 0
# ── 8. kernel launch ───────────────────────────────────────────────
def test_engine_kernel_launch_simplified():
    """KernelLaunch returns latency > 0.

    Registers a small load/store kernel under the name "gemm", launches it
    with one tensor argument split into two shards (PE0 and PE1), and checks
    that the launch completes. The kernel registry is cleared before the test
    and again afterwards, even when an assertion fails, so the registration
    cannot leak into other tests.
    """
    from kernbench.triton_emu.registry import clear_registry, register_kernel

    clear_registry()
    try:
        def gemm_kernel(a_ptr, tl):
            a = tl.load(a_ptr, shape=(4, 4), dtype="f16")
            tl.store(a_ptr, a)

        register_kernel("gemm", gemm_kernel)
        engine = _engine()
        shard0 = TensorArgShard(
            sip=0, cube=0, pe=0,
            pa=_hbm_pa(pe_id=0), nbytes=4096, offset_bytes=0,
        )
        shard1 = TensorArgShard(
            sip=0, cube=0, pe=1,
            pa=_hbm_pa(pe_id=1), nbytes=4096, offset_bytes=4096,
        )
        msg = KernelLaunchMsg(
            correlation_id="c0", request_id="r10",
            kernel_ref=KernelRef(name="gemm", kind="builtin"),
            args=(TensorArg(shards=(shard0, shard1)),),
        )
        h = engine.submit(msg)
        engine.wait(h)
        comp, trace = engine.get_completion(h)
        assert comp.ok is True
        assert trace["total_ns"] > 0
    finally:
        # Runs on failure too, keeping the global registry clean.
        clear_registry()
# ── 9. deterministic ───────────────────────────────────────────────
def test_engine_deterministic():
    """The same request on two independent engines yields identical total_ns."""
    request = MemoryWriteMsg(
        correlation_id="c0",
        request_id="r11",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(),
        nbytes=4096,
        pattern="zero",
    )

    first_engine = _engine()
    first_handle = first_engine.submit(request)
    first_engine.wait(first_handle)
    _, first_trace = first_engine.get_completion(first_handle)

    second_engine = _engine()
    second_handle = second_engine.submit(request)
    second_engine.wait(second_handle)
    _, second_trace = second_engine.get_completion(second_handle)

    assert first_trace["total_ns"] == second_trace["total_ns"]
# ── 10. DMA capacity=1 serializes concurrent writes ─────────────────
def test_dma_capacity_serializes_concurrent():
    """Concurrent DMA writes to one cube contend on the capacity=1 DMA engine.

    Two MemoryWrite requests aimed at the same cube's M_CPU at the same time
    are expected to be serialized, so the slower of the pair must take longer
    than one isolated write (ADR-0014 D4, ADR-0015 D5).
    """
    # Baseline: a single write on its own engine.
    engine_single = _engine()
    baseline_handle = engine_single.submit(MemoryWriteMsg(
        correlation_id="c0", request_id="single",
        dst_sip=0, dst_cube=0, dst_pe=0,
        dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096,
        pattern="zero", target_pe=0,
    ))
    engine_single.wait(baseline_handle)
    _, baseline_trace = engine_single.get_completion(baseline_handle)
    single_ns = baseline_trace["total_ns"]

    # Two concurrent writes to same cube (different PEs) → DMA contention
    engine_conc = _engine()
    write_a = MemoryWriteMsg(
        correlation_id="c0", request_id="conc-a",
        dst_sip=0, dst_cube=0, dst_pe=0,
        dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096,
        pattern="zero", target_pe=0,
    )
    write_b = MemoryWriteMsg(
        correlation_id="c0", request_id="conc-b",
        dst_sip=0, dst_cube=0, dst_pe=1,
        dst_pa=_hbm_pa(sip=0, cube=0, pe_id=1), nbytes=4096,
        pattern="zero", target_pe=1,
    )
    handle_a = engine_conc.submit(write_a)
    handle_b = engine_conc.submit(write_b)
    engine_conc.wait(handle_a)
    engine_conc.wait(handle_b)
    _, trace_a = engine_conc.get_completion(handle_a)
    _, trace_b = engine_conc.get_completion(handle_b)

    # At least one must be delayed by DMA contention
    max_ns = max(trace_a["total_ns"], trace_b["total_ns"])
    assert max_ns > single_ns, (
        f"concurrent max ({max_ns:.2f}ns) must > single ({single_ns:.2f}ns) "
        f"due to DMA capacity=1 contention"
    )
# ── 11. formula latency lower bound ──────────────────────────────
def test_formula_latency_lower_bound():
    """_formula_latency must be <= actual latency (ADR-0015 D7).

    The formula is evaluated on the routed path from sip0.cube0.pe0 to the
    node that PE1's HBM address resolves to, then compared against the
    simulated latency of a 4096 B MemoryRead of that same address.
    """
    # PhysAddr and load_topology come from the module-level imports; only the
    # routing helpers need a local import.
    from kernbench.policy.routing.router import AddressResolver, PathRouter

    graph = load_topology(TOPOLOGY_PATH)
    engine = GraphEngine(graph)
    resolver = AddressResolver(graph)
    router = PathRouter(graph)
    pa = _hbm_pa(sip=0, cube=0, pe_id=1)
    pa_obj = PhysAddr.decode(pa)
    dst_node = resolver.resolve(pa_obj)
    pe_ref = "sip0.cube0.pe0"
    path = router.find_path(pe_ref, dst_node)
    formula = engine._formula_latency(path, 4096)

    # Run actual simulation
    msg = MemoryReadMsg(
        correlation_id="c0", request_id="formula-lb",
        src_sip=0, src_cube=0, src_pe=0,
        src_pa=pa, nbytes=4096, target_pe=1,
    )
    h = engine.submit(msg)
    engine.wait(h)
    _, trace = engine.get_completion(h)
    actual = trace["total_ns"]
    assert formula <= actual, (
        f"formula ({formula:.2f}) must <= actual ({actual:.2f})"
    )
    assert formula > 0, "formula must be > 0"
def test_formula_latency_exact_no_contention():
    """With no contention, formula should approximate actual for PE DMA.

    PE DMA is single-request with no fan-out or aggregation,
    so formula ≈ actual (within small tolerance for SimPy scheduling).
    The tolerance used here is an absolute 0.01 ns.
    """
    # PhysAddr and load_topology come from the module-level imports; PeDmaMsg
    # and the routing helpers are not imported there, so they stay local.
    from kernbench.policy.routing.router import AddressResolver, PathRouter
    from kernbench.runtime_api.kernel import PeDmaMsg

    graph = load_topology(TOPOLOGY_PATH)
    engine = GraphEngine(graph)
    resolver = AddressResolver(graph)
    router = PathRouter(graph)
    pa = _hbm_pa(sip=0, cube=0, pe_id=0)
    pa_obj = PhysAddr.decode(pa)
    dst_node = resolver.resolve(pa_obj)
    pe_ref = "sip0.cube0.pe0"
    path = router.find_path(pe_ref, dst_node)
    formula = engine._formula_latency(path, 4096)

    msg = PeDmaMsg(
        correlation_id="c0", request_id="formula-exact",
        src_sip=0, src_cube=0, src_pe=0,
        dst_pa=pa, nbytes=4096,
    )
    h = engine.submit(msg)
    engine.wait(h)
    _, trace = engine.get_completion(h)
    actual = trace["total_ns"]
    # No contention: formula should equal actual
    assert abs(formula - actual) < 0.01, (
        f"formula ({formula:.4f}) ≈ actual ({actual:.4f}) expected with no contention"
    )
# ── 12. remote cube access succeeds with higher latency ────────────
def test_engine_remote_cube_latency_higher():
    """Reading a distant cube's HBM takes strictly longer than reading the local one.

    Each read runs on its own engine so the two cannot contend.
    cube15 (far corner of 4x4 mesh) requires multiple UCIe + NOC hops
    from IO chiplet compared to cube0 (directly connected).
    """
    engine_local = _engine()
    engine_remote = _engine()
    read_local = MemoryReadMsg(
        correlation_id="c0", request_id="r14a",
        src_sip=0, src_cube=0, src_pe=0,
        src_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096,
    )
    read_remote = MemoryReadMsg(
        correlation_id="c0", request_id="r14b",
        src_sip=0, src_cube=0, src_pe=0,
        src_pa=_hbm_pa(sip=0, cube=15, pe_id=0), nbytes=4096,
    )

    handle_local = engine_local.submit(read_local)
    engine_local.wait(handle_local)
    _, t_local = engine_local.get_completion(handle_local)

    handle_remote = engine_remote.submit(read_remote)
    engine_remote.wait(handle_remote)
    comp_remote, t_remote = engine_remote.get_completion(handle_remote)

    assert comp_remote.ok is True
    assert t_remote is not None and t_local is not None
    assert t_remote["total_ns"] > t_local["total_ns"], (
        f"remote cube {t_remote['total_ns']:.2f} must > local {t_local['total_ns']:.2f}"
    )
File diff suppressed because it is too large Load Diff
+269
View File
@@ -0,0 +1,269 @@
"""Phase A component infrastructure tests (ADR-0015).
Verifies:
- TransitComponent, IoCpuComponent apply overhead_ns via run()
- HbmCtrlComponent and SramComponent act as terminal nodes (succeed done)
- MCpuComponent forwards when not terminal; completes when terminal + no ctx
- ComponentRegistry resolves impl strings to correct concrete classes
- GraphEngine passes ComponentContext to every component
- ComponentContext.router and .resolver are correctly populated
"""
from __future__ import annotations
from pathlib import Path
from typing import Any
import pytest
import simpy
from kernbench.components.base import ComponentBase, ComponentRegistry
from kernbench.components.context import ComponentContext
from kernbench.components.impls import (
HbmCtrlComponent,
IoCpuComponent,
MCpuComponent,
PcieEpComponent,
SramComponent,
TransitComponent,
)
from kernbench.sim_engine.engine import GraphEngine
from kernbench.sim_engine.transaction import Transaction
from kernbench.topology.builder import load_topology
from kernbench.topology.types import Node
TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
def _node(impl: str, attrs: dict | None = None) -> Node:
    """Build a throwaway Node ("test.node") with the given impl and attrs.

    A falsy ``attrs`` (None or empty) is replaced by a fresh empty dict.
    """
    node_attrs = attrs or {}
    return Node(id="test.node", kind="test", impl=impl, attrs=node_attrs, pos_mm=None)
def _run_worker(comp: ComponentBase, env: simpy.Environment, txn: Transaction) -> None:
    """Feed ``txn`` to ``comp`` through a single in-port and run until ``txn.done``.

    The in-port is registered under the key "src" before the component starts.
    """
    inbox: simpy.Store = simpy.Store(env)
    comp.in_ports["src"] = inbox
    comp.start(env)
    injector = _inject(inbox, txn)
    env.process(injector)
    env.run(until=txn.done)
def _inject(store: simpy.Store, txn: Transaction):
    """SimPy process body: put ``txn`` into ``store`` and wait for the put to complete."""
    put_request = store.put(txn)
    yield put_request
# ── 1. run() latency: TransitComponent ───────────────────────────────
def test_transit_component_run_overhead_ns():
    """TransitComponent.run() advances the clock by exactly overhead_ns."""
    component = TransitComponent(_node("forwarding_v1", {"overhead_ns": 7.5}))
    env = simpy.Environment()

    def drive():
        yield from component.run(env, nbytes=1024)

    env.process(drive())
    env.run()
    assert env.now == pytest.approx(7.5)
def test_transit_component_run_zero_overhead_ns():
    """With overhead_ns=0, TransitComponent.run() finishes without advancing the clock."""
    component = TransitComponent(_node("noc_v1", {"overhead_ns": 0.0}))
    env = simpy.Environment()
    finished = []

    def drive():
        yield from component.run(env, nbytes=512)
        finished.append(True)

    env.process(drive())
    env.run()
    assert finished == [True]
    assert env.now == pytest.approx(0.0)
# ── 2. run() latency: IoCpuComponent ────────────────────────────────
def test_io_cpu_component_run_overhead_ns():
    """IoCpuComponent.run() advances the clock by exactly overhead_ns."""
    component = IoCpuComponent(_node("io_cpu_v1", {"overhead_ns": 10.0}))
    env = simpy.Environment()

    def drive():
        yield from component.run(env, nbytes=2048)

    env.process(drive())
    env.run()
    assert env.now == pytest.approx(10.0)
# ── 3. Terminal: HbmCtrlComponent succeeds done ──────────────────────
def test_hbm_ctrl_terminal_succeeds_done():
    """HbmCtrlComponent, as the last hop, triggers txn.done."""
    component = HbmCtrlComponent(_node("hbm_ctrl_v1", {"overhead_ns": 0.0, "capacity": 1}))
    env = simpy.Environment()
    completed = env.event()
    txn = Transaction(
        request=None, path=["test.node"], step=0, nbytes=256, done=completed,
    )
    _run_worker(component, env, txn)
    assert completed.triggered
def test_hbm_ctrl_resource_serializes_requests():
    """With capacity=1, two queued transactions are served one after the other."""
    component = HbmCtrlComponent(_node("hbm_ctrl_v1", {"overhead_ns": 5.0, "capacity": 1}))
    env = simpy.Environment()
    inbox: simpy.Store = simpy.Store(env)
    component.in_ports["src"] = inbox
    component.start(env)

    first_done = env.event()
    second_done = env.event()
    first_txn = Transaction(request=None, path=["test.node"], step=0, nbytes=0, done=first_done)
    second_txn = Transaction(request=None, path=["test.node"], step=0, nbytes=0, done=second_done)

    def feed():
        yield inbox.put(first_txn)
        yield inbox.put(second_txn)

    env.process(feed())
    env.run(until=second_done)

    # Both must be done; with serialization: t=5 + t=10
    assert first_done.triggered
    assert second_done.triggered
    assert env.now == pytest.approx(10.0)
# ── 4. Terminal: SramComponent succeeds done ─────────────────────────
def test_sram_terminal_succeeds_done():
    """SramComponent, as the last hop, triggers txn.done after its overhead_ns."""
    component = SramComponent(_node("sram_v1", {"overhead_ns": 2.0}))
    env = simpy.Environment()
    completed = env.event()
    txn = Transaction(
        request=None, path=["test.node"], step=0, nbytes=512, done=completed,
    )
    _run_worker(component, env, txn)
    assert completed.triggered
    assert env.now == pytest.approx(2.0)
# ── 5. MCpuComponent: forward when not terminal ──────────────────────
def test_m_cpu_forwards_when_not_terminal():
    """A non-terminal MCpuComponent passes the Transaction on to the next hop."""
    component = MCpuComponent(_node("m_cpu_v1", {"overhead_ns": 5.0}))
    env = simpy.Environment()

    # Wire in_port and out_port for a two-hop path [src, test.node, next]
    inbox: simpy.Store = simpy.Store(env)
    outbox: simpy.Store = simpy.Store(env)
    component.in_ports["src"] = inbox
    component.out_ports["next"] = outbox
    component.start(env)

    completed = env.event()
    txn = Transaction(
        request=None,
        path=["src", "test.node", "next"],
        step=1,  # currently at test.node; next_hop = "next"
        nbytes=128,
        done=completed,
    )
    received: list[Any] = []

    def receiver():
        # Stand-in for the next hop: record the transaction and complete it.
        item = yield outbox.get()
        received.append(item)
        item.done.succeed()

    env.process(receiver())

    def feed():
        yield inbox.put(txn)

    env.process(feed())
    env.run(until=completed)

    assert len(received) == 1
    assert received[0].step == 2  # advanced
    assert env.now == pytest.approx(5.0)
# ── 6. MCpuComponent: terminal with no ctx just completes ────────────
def test_m_cpu_terminal_no_ctx_completes():
    """An MCpuComponent built with ctx=None triggers txn.done when it is the last hop."""
    component = MCpuComponent(_node("m_cpu_v1", {"overhead_ns": 0.0}), ctx=None)
    env = simpy.Environment()
    completed = env.event()
    txn = Transaction(
        request=None, path=["test.node"], step=0, nbytes=64, done=completed,
    )
    _run_worker(component, env, txn)
    assert completed.triggered
# ── 7. ComponentRegistry resolves impl strings ───────────────────────
@pytest.mark.parametrize(
    "impl,expected_cls",
    [
        ("forwarding_v1", TransitComponent),
        ("noc_v1", TransitComponent),
        ("ucie_v1", TransitComponent),
        ("xbar_v1", TransitComponent),
        ("pcie_ep_v1", PcieEpComponent),
        ("io_cpu_v1", IoCpuComponent),
        ("m_cpu_v1", MCpuComponent),
        ("hbm_ctrl_v1", HbmCtrlComponent),
        ("sram_v1", SramComponent),
    ],
)
def test_registry_resolves_impl(impl, expected_cls):
    """ComponentRegistry.create() builds an instance of the class expected for each impl."""
    component = ComponentRegistry.create(_node(impl, {"overhead_ns": 0.0}))
    assert isinstance(component, expected_cls)
# ── 8. GraphEngine passes ComponentContext to components ─────────────
def test_engine_passes_ctx_to_components():
    """Every component built by GraphEngine carries a ComponentContext."""
    engine = GraphEngine(load_topology(TOPOLOGY_PATH))
    for node_id, component in engine._components.items():
        context = component.ctx
        assert context is not None, f"{node_id}: ctx is None"
        assert isinstance(context, ComponentContext), f"{node_id}: ctx wrong type"
def test_engine_ctx_router_and_resolver_populated():
    """The context's router is a PathRouter and its resolver is an AddressResolver."""
    from kernbench.policy.routing.router import AddressResolver, PathRouter

    engine = GraphEngine(load_topology(TOPOLOGY_PATH))
    # Spot-check one component
    sample = next(iter(engine._components.values()))
    context = sample.ctx
    assert isinstance(context.router, PathRouter)
    assert isinstance(context.resolver, AddressResolver)

Some files were not shown because too many files have changed in this diff Show More