commit - release 1

2026-03-18 11:47:48 -07:00
commit 6f43807900
109 changed files with 14909 additions and 0 deletions
@@ -0,0 +1,15 @@
 {
  "permissions": {
    "allow": [
      "Bash(python -c \":*)",
      "Bash(xargs ls -la)",
      "Bash(wc -l /c/Users/ywkang/kernbench/src/kernbench/policy/address/phyaddr.py /c/Users/ywkang/kernbench/src/kernbench/runtime_api/*.py)",
      "Bash(python scripts/gen_placement_diagrams.py)",
      "Bash(python -c:*)",
      "Bash(python -m kernbench.cli.main probe --topology topology.yaml)",
      "Bash(xargs grep -l \"class.*ComponentBase\\\\|class.*DefaultComponent\")",
      "Bash(python -m pytest tests/test_probe.py -v)",
      "Bash(python -m pytest tests/test_probe.py tests/test_component_registry.py -v)"
    ]
  }
 }
@@ -0,0 +1,36 @@
 {
  "permissions": {
    "allow": [
      "Bash(python -m ruff check --select I --fix src/kernbench/runtime_api/context.py src/kernbench/runtime_api/bench_runner.py src/kernbench/cli/main.py)",
      "Bash(python -m pytest tests/ -q)",
      "Bash(python -m pytest tests/ -v)",
      "Bash(python -m pytest tests/test_topology_load.py tests/test_cli.py -v)",
      "Bash(python -c \":*)",
      "Bash(python -m pytest tests/ -q --tb=no)",
      "Bash(python -m pytest tests/ -v --tb=short)",
      "Bash(python -m pytest tests/test_component_registry.py -v --tb=line)",
      "Bash(python -m pytest tests/ --ignore=tests/test_component_registry.py -q --tb=no)",
      "Bash(python -m pytest tests/ -q --tb=short)",
      "Bash(python -m pytest --tb=short -q)",
      "Bash(python -m pytest tests/test_phase_a_components.py -v)",
      "Bash(python -m pytest --tb=short)",
      "Bash(python -m pytest tests/ -x -q)",
      "Bash(python -m pytest tests/test_probe.py::test_h2d_latency_monotonic -x -q)",
      "Bash(python -m pytest tests/ -x --tb=short)",
      "Bash(python -m pytest tests/ --tb=line)",
      "Bash(python -m pytest tests/ --tb=short)",
      "Bash(python -m kernbench probe --topology topology.yaml)",
      "Bash(python -m pytest tests/ --tb=short -q)",
      "Bash(python -m pytest tests/test_probe.py tests/test_component_registry.py tests/test_topology_compile.py -v)",
      "Bash(python -m pytest tests/test_tensor.py tests/test_engine.py tests/test_probe.py tests/test_component_registry.py -v)",
      "Bash(grep -l \"class.*CPU\" \"/c/Users/ywkang/kernbench/src/kernbench/components/impls\"/*.py)",
      "Bash(grep -n \"^class \" \"/c/Users/ywkang/kernbench/src/kernbench/components/impls\"/*.py)",
      "Bash(python -m pytest tests/test_engine.py tests/test_probe.py tests/test_component_registry.py -v)",
      "Bash(grep -E \"\\\\.\\(py|md\\)$\")",
      "Bash(python -m pytest tests/test_pe_components.py -v)",
      "Bash(python -m pytest tests/test_triton_emu.py -v)",
      "Bash(python -m pytest tests/test_pe_components.py tests/test_triton_emu.py -v)",
      "Bash(python -m pytest tests/test_pe_components.py::test_mcpu_multi_pe_kernel_launch tests/test_pe_components.py::test_qkv_gemm_bench_multi_pe_completes -v)"
    ]
  }
 }
@@ -0,0 +1,12 @@
 root = true
 [*]
 charset = utf-8
 end_of_line = lf
 insert_final_newline = true
 trim_trailing_whitespace = true
 indent_style = space
 indent_size = 4
 [*.md]
 trim_trailing_whitespace = false
@@ -0,0 +1,28 @@
 # OS / Editor
 .DS_Store
 .vscode/.history/
 *.swp
 # Python
 __pycache__/
 *.py[cod]
 *.pyd
 .pytest_cache/
 .mypy_cache/
 .ruff_cache/
 # Virtualenv
 .venv/
 # Packaging
 dist/
 build/
 *.egg-info/
 # Env
 .env
 .env.*
 !.env.example
 # Logs
 *.log
@@ -0,0 +1,3 @@
 [MESSAGES CONTROL]
 disable=missing-function-docstring,missing-module-docstring
@@ -0,0 +1,16 @@
 {
  "recommendations": [
    "ms-python.python",
    "ms-python.vscode-pylance",
    "charliermarsh.ruff",
    "ms-python.debugpy",
    "eamodio.gitlens",
    "usernamehw.errorlens",
    "wayou.vscode-todo-highlight",
    "christian-kohler.path-intellisense",
    "editorconfig.editorconfig",
    "ms-azuretools.vscode-docker",
    "humao.rest-client",
    "shd101wyy.markdown-preview-enhanced"
  ]
 }
@@ -0,0 +1,55 @@
 {
  "version": "0.2.0",
  "configurations": [
    {
      "name": "Python: Current File",
      "type": "python",
      "request": "launch",
      "program": "${file}",
      "console": "integratedTerminal",
      "justMyCode": true
    },
    {
      "name": "Run kernbench CLI",
      "type": "python",
      "request": "launch",
      "module": "kernbench.cli.main",
      "console": "integratedTerminal",
      "justMyCode": true,
      "args": [
        "run",
        "--topology", "topology.yaml",
        "--bench", "qkv_gemm"
      ],
      "env": {
        "PYTHONPATH": "${workspaceFolder}/src"
      }
    },
    {
      "name": "Run KernBench Probe",
      "type": "python",
      "request": "launch",
      "module": "kernbench.cli.main",
      "console": "integratedTerminal",
      "justMyCode": true,
      "args": [
        "probe",
        "--topology", "topology.yaml"
      ],
      "env": {
        "PYTHONPATH": "${workspaceFolder}/src"
      }
    },
    {
      "name": "Pytest: Run All (terminal)",
      "type": "python",
      "request": "launch",
      "module": "pytest",
      "args": [
        "-q"
      ],
      "console": "integratedTerminal",
      "justMyCode": true
    }
  ]
 }
@@ -0,0 +1,39 @@
 {
  "editor.formatOnSave": true,
  "editor.formatOnSaveMode": "file",
  "editor.tabSize": 4,
  "editor.insertSpaces": true,
  "editor.rulers": [
    100
  ],
  "editor.minimap.enabled": false,
  "files.eol": "\n",
  "files.trimTrailingWhitespace": true,
  "files.insertFinalNewline": true,
  "search.useIgnoreFiles": true,
  "git.autofetch": true,
  "python.venvFolders": [".venv"],
  "python.testing.pytestEnabled": true,
  "python.testing.unittestEnabled": false,
  "python.testing.autoTestDiscoverOnSaveEnabled": true,
  "python.testing.pytestArgs": [
    "tests", "-q", "-vv", "-s", "--tb=short", "--no-header"
  ],
  "python.analysis.typeCheckingMode": "basic",
  "python.analysis.autoImportCompletions": true,
  "python.terminal.activateEnvironment": true,
  "ruff.lint.enable": true,
  "ruff.format.enable": true,
  "editor.defaultFormatter": "charliermarsh.ruff",
  "[python]": {
    "editor.defaultFormatter": "charliermarsh.ruff"
  },
  "editor.codeActionsOnSave": {
    "source.fixAll": "explicit",
    "source.organizeImports": "explicit"
  },
  "python.analysis.extraPaths": [
    "${workspaceFolder}/src"
  ]
 }
@@ -0,0 +1,127 @@
 {
  "version": "2.0.0",
  "tasks": [
    {
      "label": "venv: create",
      "type": "shell",
      "command": "python -m venv .venv",
      "problemMatcher": []
    },
    {
      "label": "deps: install",
      "type": "shell",
      "command": "${command:python.interpreterPath}",
      "args": [
        "-m",
        "pip",
        "install",
        "-U",
        "pip",
        "&&",
        "${command:python.interpreterPath}",
        "-m",
        "pip",
        "install",
        "-e",
        ".[dev]"
      ],
      "problemMatcher": [],
      "dependsOn": "venv: create"
    },
    {
      "label": "Run KernBench CLI",
      "type": "shell",
      "command": "${command:python.interpreterPath}",
      "args": [
        "-m",
        "kernbench.cli.main",
        "run",
        "--topology",
        "topology.yaml",
        "--bench",
        "qkv_gemm"
      ],
      "options": { "cwd": "${workspaceFolder}" },
      "problemMatcher": [],
      "presentation": {
        "reveal": "silent",
        "panel": "shared",
        "clear": true
      }
    },
    {
      "label": "Run KernBench Probe",
      "type": "shell",
      "command": "${command:python.interpreterPath}",
      "args": [
        "-m",
        "kernbench.cli.main",
        "probe",
        "--topology",
        "topology.yaml"
      ],
      "options": { "cwd": "${workspaceFolder}" },
      "problemMatcher": [],
      "presentation": {
        "reveal": "silent",
        "panel": "shared",
        "clear": true
      }
    },
    {
      "label": "Pytest: Run All (terminal)",
      "type": "shell",
      "command": "${command:python.interpreterPath}",
      "args": [
        "-m",
        "pytest",
        "-q",
        "-s"
      ],
      "problemMatcher": [],
      "group": "test"
    },
    {
      "label": "lint",
      "type": "shell",
      "command": "${command:python.interpreterPath}",
      "args": [
        "-m",
        "ruff",
        "check",
        "."
      ],
      "problemMatcher": []
    },
    {
      "label": "format",
      "type": "shell",
      "command": "${command:python.interpreterPath}",
      "args": [
        "-m",
        "ruff",
        "format",
        "."
      ],
      "problemMatcher": []
    },
    {
      "label": "which python",
      "type": "shell",
      "command": "${command:python.interpreterPath}",
      "args": [
        "-c",
        "import sys;print(sys.executable)"
      ],
      "problemMatcher": []
    }
  ]
 }
@@ -0,0 +1,196 @@
 # Claude Code Instructions (Repo)
 This repository uses Claude Code with strict architectural and verification rules.
 SPEC.md and ADRs are the source of truth.
 ---
 ## Terminology
 - runtime API:
  Host-facing public API used by benchmarks and user code (e.g., tensor deployment, kernel launch).
 - simulation engine (sim_engine):
  Discrete-event engine responsible for request injection, scheduling, and completion tracking.
 - components:
  Device-side nodes modeling hardware behavior (IO_CPU, M_CPU, PE_CPU, routers, engines, etc.).
 ## Authority & Scope
 - SPEC.md defines the architectural contract.
 - ADRs (docs/adr/ADR-*.md) define non-trivial architectural decisions.
 - If a change conflicts with SPEC.md or an ADR:
  - STOP.
  - Explain the conflict.
  - Propose options (keep spec, update ADR, or narrow scope).
 - Do NOT silently change architecture.
 - The repository structure reflects architectural intent; Claude Code MUST respect existing module boundaries and file locations.
 ---
 ## Design Questions
 - Design / architecture questions are ALWAYS allowed.
 - Design questions MUST NOT modify:
  - production code
  - test code
  - SPEC.md
  - ADRs
 - If a design question implies a change, default to Phase 1.
 ---
 ## Change & Test Protocol (Mandatory)
 All non-trivial changes MUST follow a two-phase process.
 Design discussion is always allowed; code changes are not.
 ---
 ### Phase 1 — Proposal + Verification
 (No Production Code Changes)
 #### Purpose
 - Decide *what* to change and *how it will be validated*
 - Establish verification coverage BEFORE touching production code
 #### Phase 1 MUST include
 1) **Design Proposal**
 - Explain the design change.
 - Explain why the change is needed.
 - Explain consistency with SPEC.md and relevant ADRs.
 2) **Verification Plan**
 - SPEC requirement(s) / ADR(s) affected (e.g., R1/R2/R5, ADR-0002).
 - Tests that validate the change:
  - existing tests to run, and/or
  - new tests to add.
 - Concrete input cases used by the tests:
  - topology (SIP / CUBE / PE layout)
  - request parameters (src, dst, size_bytes).
 - Expected observable assertions, such as:
  - hop trace contains key waypoints,
  - latency invariants (e.g., > 0, monotonic increase),
  - deterministic route selection.
  - **expected changes (or no changes) in generated diagrams**, if applicable.
 If the Verification Plan is missing or vague, STOP.
 #### Allowed in Phase 1
 - Creating or modifying **test code only**
 - Running tests and reporting results
 #### Forbidden in Phase 1
 - Any production code changes
 - Any SPEC.md or ADR modifications
 - Any production diff output
 #### Phase 1 Output
 - Proposal + Verification Plan
 - Tests added/modified (if any)
 - Test execution results (PASS / FAIL)
 - Clear recommendation:
  - "No Phase 2 needed" OR
  - "Await approval for Phase 2"
 ---
 ### Phase 2 — Apply + Verify + Rollback
 #### Trigger
 Phase 2 is triggered ONLY by the exact user approval phrase:
 **"ok"**
 #### Phase 2 Rules
 - Output **minimal unified diffs only**
 - Modify ONLY production files declared in Phase 1
 - Do NOT include explanations, comments, or unchanged code
 - Automatically apply the diff to the working tree
 #### Mandatory Verification
 - Run the tests defined in the Phase 1 Verification Plan
 #### Success Path
 If ALL tests PASS:
 - Keep the applied changes
 - Ensure generated diagrams (if affected) are consistent
 - Report success concisely
 #### Failure Path (Mandatory)
 If ANY test FAILS:
 - Immediately rollback ALL Phase 2 changes
 - Do NOT keep partial changes
 - Report:
  - failing test names
  - error messages / assertions
  - brief hypothesis of the root cause
 - Return to Phase 1 state
 Tests must NEVER be weakened, removed, or altered to force Phase 2 to pass.
 ---
 ## What Counts as "Non-Trivial"
 (Protocol Required)
 Any of the following:
 - routing policy or ordering changes
 - topology builder changes (nodes, links, parameters)
 - address decoding / PhysAddr behavior
 - latency composition rules
 - changes affecting determinism or connectivity
 - changes touching two or more production files
 ---
 ## Allowed Exceptions
 (Protocol Still Required)
 - comments or docstrings
 - formatting-only changes
 - type annotation changes with no runtime behavior change
 In exceptions, Phase 1 MUST explicitly state:
 **"No behavior change; tests unchanged."**
 ---
 ## CLI Semantics
 - `kernbench run --device <id>` runs the benchmark on a single device.
 - Omitting `--device` runs the benchmark on all devices discovered in the topology (logically parallel).
 - Device enumeration is handled by the CLI only; benchmarks MUST remain single-device.
 ## Derived Artifacts (Clarification)
 - Generated diagrams under `docs/diagrams/` are **derived artifacts**, not production code.
 - Creating or updating files in `docs/diagrams/`:
  - does NOT count as a production code change,
  - does NOT require Phase 2 approval,
  - MUST be consistent with SPEC.md and ADRs.
 ## Enforcement Defaults
 - If unsure whether a change is non-trivial → treat it as non-trivial.
 - If unsure whether Phase 2 is allowed → STOP and ask.
 - SPEC.md and ADRs are the final authority.
 - runtime API MUST NOT hardcode topology/routing or internal hop sequences.
 - sim_engine MUST remain independent of runtime API semantics (no tensor/kernel policy logic).
@@ -0,0 +1,13 @@
 # Python Project (VS Code Template)
 ## Quick start
 1. Create venv + install dev deps (editable):
   - VS Code: Run Task → `deps: install`
 2. Run tests:
   - VS Code: Run Task → `Pytest: Run All (terminal)`
 3. Lint / format:
   - `lint`, `format` tasks
 ## Structure
 - `src/` app code
 - `tests/` pytest
@@ -0,0 +1,327 @@
 # KernBench System-Level Simulator — SPEC
 This document defines the architectural contract for the KernBench
 system-level discrete-event simulator for our AI Accelerator SIP-based systems.
 All implementations, tests, and changes MUST conform to this SPEC.
 ---
 ## 0. Goal
 Build a **system-level, discrete-event simulator** to evaluate the performance of
 **LLM kernels running on our AI Accelerator SIP-based systems**, under varying
 **SIP architectures, topologies, and interconnect configurations**.
 The simulator models **data-movement and control paths across the full hardware
 hierarchy** and computes **end-to-end execution latency** for kernel executions
 dispatched to Processing Elements (PEs).
 Primary objectives:
 - compare LLM kernel execution latency under different system configurations
 - model PE↔HBM, PE↔PE, CUBE↔CUBE, and SIP↔SIP communication and control paths
 - guarantee deterministic, verifiable behavior with strong debuggability
 - support visual inspection of the modeled system at multiple abstraction levels
 ---
 ## 0.1 Golden Invariants (Must NOT be violated)
 - End-to-end latency is computed **strictly by explicit traversal** over modeled
  components and links.
 - Every routed request MUST incur **latency > 0**.
 - Routing decisions MUST be **deterministic** given
  (topology + routing policy + request).
 - All valid request flows MUST have explicit connectivity in the model.
 - No hidden shortcuts, implicit bypasses, or magic paths are allowed.
 - Architectural decisions documented in ADRs override local optimizations.
 ---
 ## 0.2 Architectural References (ADRs)
 Major architectural decisions are documented in ADRs and referenced by number.
 - ADR-0001: PhysAddr layout & address decoding contract
 - ADR-0002: Routing distance, ordering, and bypass rules
 - ADR-0003: Target system hierarchy & modeling scope (Tray / SIP / CUBE / PE / IO chiplet)
 - ADR-0004: Memory semantics & local-HBM bandwidth guarantee contract
 - ADR-0005: Diagram views (SIP / CUBE / PE) and distance-aware layout rules
 - ADR-0006: Topology compilation, distance extraction, and automatic diagram generation
 - ADR-0007: runtime_api vs sim_engine responsibility boundaries
 - ADR-0008: Tensor deployment and allocation (Host allocator, PA-first)
 - ADR-0009: Kernel execution fan-out and completion semantics
 - ADR-0010: CLI device selection and multi-device execution semantics
 - ADR-0011: Memory addressing simplification (PA-first)
 - ADR-0012: Host ↔ IO_CPU message schema (PA-first, PE-tagged shards)
 - ADR-0013: Verification strategy and Phase 1 test plan
 SPEC MUST remain consistent with accepted ADRs.
 ---
 ## 1. Core Requirements
 ### R1. Correct Routing and Control Path
 - A request MUST traverse the correct sequence of components based on:
  - source location,
  - destination address or placement tags,
  - routing policy and available topology connectivity.
 - Local vs remote traffic MUST be distinguishable:
  - same SIP vs different SIP,
  - same CUBE vs different CUBE,
  - (optional) same PE-group vs cross PE-group.
 - Routing behavior MUST be reproducible and deterministic.
 ---
 ### R2. Latency is Computed by Traversal
 End-to-end latency is the sum of:
 - per-node fixed latency (processing / router delay),
 - per-link latency (fixed and/or size-aware serialization: bytes / BW),
 - per-service latency (e.g., memory controller service time).
 The simulator MUST:
 - support both fixed and size-aware latency,
 - emit hop-by-hop traces with timestamps and component identifiers.
 ---
 ### R3. Topology is Configurable and Variable
 Topology MUST NOT be hardcoded.
 The simulator MUST accept multiple topologies (YAML / JSON / dict), varying:
 - SIP count,
 - CUBE count per SIP,
 - PE count per CUBE,
 - on-chip fabric structure (e.g., mesh / NoC / XBAR),
 - IO chiplets and interconnects,
 - link bandwidth, latency, and capacity parameters.
 Given a topology:
 - all required request flows MUST have valid connectivity,
 - missing links are a topology construction error, not a routing error.
 ---
 ### R4. DI-First Component Design (Swappable Implementations)
 All components MUST be replaceable behind stable interfaces, including:
 - routers and fabrics (NoC, bridges, switches),
 - XBAR-like selectors,
 - DMA engines and queues,
 - memory controllers and services (HBM, TCM, queues),
 - management and control processors (modeled components).
 The simulator MUST:
 - use dependency injection (DI) to bind node specifications to implementation classes,
 - allow component swapping without changing test logic,
 - avoid leaking routing or policy logic into unrelated components.
 ---
 ### R5. Multi-Domain Communication Modeling
 The simulator MUST model communication across hierarchical domains, including:
 - PE ↔ local HBM
 - PE ↔ remote HBM in the same CUBE
 - PE ↔ remote HBM in other CUBEs within the same SIP
 - PE ↔ remote HBM in other SIPs
 - PE ↔ PE messaging (e.g., IPCQ)
 - PE ↔ IO chiplets
 - CUBE ↔ CUBE (e.g., via UCIe)
 - SIP ↔ SIP (e.g., via PCIe or UAL)
 Policy-based bypass is allowed ONLY if:
 - the bypass path is explicitly represented in the model,
 - the bypass incurs non-zero latency,
 - the bypass is visible in traces and diagrams.
 ---
 ### R6. Verification-Driven Development
 Development MUST follow a verification-driven workflow:
 - behavior is validated by tests with meaningful input cases,
 - tests encode SPEC-defined invariants, not incidental implementation details,
 - changes without clear verification coverage are not allowed.
 ---
 ### R7. Runtime API
 The simulator MUST provide a host-facing runtime API that:
 - exposes tensor deployment and kernel execution operations,
 - submits requests only to endpoint components (e.g., IO_CPU),
 - owns host-side tensor handles and allocation metadata as PA shard maps,
 - remains topology-agnostic and does not perform routing or fan-out.
 Tensor deployment in Phase 0 produces **device physical-address (PA) shard mappings**.
 Each shard explicitly identifies its target `(sip, cube, pe)` and PA range.
 No separate host-visible allocation RPC (e.g., AllocateTensorMeta) exists.
 ---
 ### R8. Simulation Engine
 The simulator MUST include a discrete-event simulation engine that:
 - injects requests into the system graph,
 - schedules events deterministically,
 - tracks completion via correlation identifiers,
 - decomposes runtime API operations into explicit graph requests
  (e.g., MemoryWrite, MemoryRead, KernelLaunch).
 ---
 ### R9. CLI Execution Semantics
 The CLI MUST support executing benchmarks:
 - on a specified device.
 Benchmarks are executed once per invocation within a single simulation instance.
 If multiple devices are present in the topology, a benchmark MAY interact with
 multiple devices internally, but the CLI does not launch multiple independent
 benchmark instances by default.
 ---
 ### R10. Memory Addressing (Phase 0)
 In Phase 0, the simulator uses a **PA-first memory model**:
 - All memory operations use device physical addresses (PA) only.
 - Virtual addressing, MMU/IOMMU, and address translation latency are out of scope.
 - Tensor placement is represented as a list of PA shards, each explicitly tagged
  with `(sip, cube, pe)`.
 All memory access latency MUST be modeled explicitly via graph traversal.
 No implicit translation or hidden latency is allowed.
 ---
 ## 2. Model Concepts
 ### 2.1 Graph Execution Model
 - Nodes represent modeled components (PE blocks, XBAR, NoC, bridges,
  HBM controllers, IO components, etc.).
 - Directed edges represent interconnect links with latency and bandwidth attributes.
 - Execution model:
  - a node receives a request,
  - incurs node or service latency,
  - emits the request to the next hop via a link,
  - repeats until the destination service completes.
 ---
 ### 2.2 Routing
 Routing MAY be implemented as:
 - policy-based routing (code-driven),
 - routing tables (config-driven),
 - topology-driven routing (e.g., mesh XY),
 - or a hybrid approach.
 Routing MUST:
 - consume decoded address domains or explicit placement tags,
 - operate only on explicit topology connectivity,
 - remain deterministic.
 Kernel execution requests reference tensors via PA shard mappings.
 Each shard explicitly identifies its target PE, allowing IO_CPU to
 deterministically fan-out execution without relying on PA decoding.
 ---
 ## 3. Inputs and Identity
 ### 3.1 Node Identity Scheme
 Nodes MUST have stable, parsable identifiers sufficient for domain inference
 and trace-based debugging.
 Example patterns:
 - `tray.host_cpu`
 - `sip{S}.io{I}.pcie_ep`
 - `sip{S}.cube{C}.fabric`
 - `sip{S}.cube{C}.pe{P}`
 - `sip{S}.cube{C}.hbm_ctrl`
 ---
 ### 3.2 Link Specifications
 A link MAY include:
 - fixed latency (ns),
 - bandwidth (GB/s) for serialization latency,
 - optional capacity for contention modeling.
 Topology builders MUST ensure:
 - required links exist,
 - link parameters are consistent with topology intent.
 ---
 ## 4. Output, Debuggability, and Diagrams
 The simulator MUST provide:
 - per-request hop-by-hop traces with timestamps,
 - clear error messages for missing connectivity
  (e.g., "no link for A → B"),
 - reproducible, inspectable representations of the modeled system.
 Diagrams are **derived artifacts** of the simulator model:
 - They MUST be generatable from the **compiled topology** and **distance metadata**
  used by execution and routing.
 - Generation MAY be performed lazily or cached by the implementation,
  as long as outputs remain consistent with the compiled topology.
 Diagram abstraction levels and distance-aware layout rules are defined in ADR-0005.
 Automatic diagram generation and output conventions are defined in ADR-0006.
 By default, generated diagrams are written under:
 - `docs/diagrams/`
 ---
 ## 5. Non-Goals (for now)
 The following are explicitly out of scope:
 - cycle-accurate microarchitecture modeling,
 - detailed cache coherence protocols,
 - full PCIe / CXL protocol correctness.
 These MAY be layered later via additional components and policies.
 ---
 ## 6. Decision Boundaries
 - SPEC.md defines architectural intent and invariants.
 - Code implements SPEC and MUST NOT introduce hidden invariants.
 - Tests validate SPEC-defined behavior and MUST NOT encode fixed topology assumptions.
 - ADRs record non-trivial architectural decisions and MUST be referenced when relevant.
@@ -0,0 +1,2 @@
 def run(ctx):
    """Bench entry point; currently only prints a banner line.

    ``ctx`` is accepted so the function matches the ``run(ctx)`` bench
    signature, but it is not used here.
    """
    banner = "IPCQ all reduce kernel bench"
    print(banner)
@@ -0,0 +1,37 @@
 from __future__ import annotations
 import importlib
 from collections.abc import Callable
 from typing import Any
 from kernbench.runtime_api.context import RuntimeContext
 # Signature every bench entry point must satisfy: it receives the runtime
 # context and may return any value.
 BenchFn = Callable[[RuntimeContext], Any]
 def resolve_bench(bench_id: str) -> BenchFn:
    """
    Resolve a bench id into a callable bench function.

    Expected layout (repo root):
        benches/<bench_id>.py
            def run(ctx: RuntimeContext) -> Any

    Args:
        bench_id: Name of the bench module under ``benches``; surrounding
            whitespace is ignored.

    Returns:
        The module-level ``run`` callable of ``benches.<bench_id>``.

    Raises:
        ValueError: If the id is empty, the bench module does not exist, the
            bench module exists but one of its own imports is missing, or the
            module has no callable ``run`` attribute.
    """
    bench_id = bench_id.strip()
    if not bench_id:
        raise ValueError("Bench id is empty.")
    module_path = f"benches.{bench_id}"
    try:
        mod = importlib.import_module(module_path)
    except ModuleNotFoundError as e:
        # ModuleNotFoundError is also raised when the bench module exists but
        # imports something that is not installed. Only report "unknown bench"
        # when the missing module is the bench itself or one of its parent
        # packages; otherwise surface the real missing dependency.
        missing = e.name or ""
        bench_itself_missing = (
            not missing
            or missing == module_path
            or module_path.startswith(f"{missing}.")
        )
        if not bench_itself_missing:
            raise ValueError(
                f"Bench '{bench_id}' ({module_path}) failed to import: "
                f"missing module '{missing}'."
            ) from e
        raise ValueError(f"Unknown bench '{bench_id}'. Expected module {module_path}.py") from e
    run_fn = getattr(mod, "run", None)
    if run_fn is None:
        raise ValueError(f"Bench module {module_path} must define a 'run(ctx)' function.")
    if not callable(run_fn):
        raise ValueError(f"'run' in {module_path} is not callable.")
    return run_fn
@@ -0,0 +1,39 @@
 """QKV GEMM benchmark: Q*K^T projection on a single PE.
 Demonstrates the full host-to-PE kernel launch pipeline:
  Host → PCIE_EP → IO_CPU → M_CPU → NOC → PE_CPU → PE_SCHEDULER → engines
 Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait()
  - Tensor a is loaded into TCM via DMA
  - Tensor b stays in HBM; PE_SCHEDULER streams it per-tile (32x64x32)
 """
 from kernbench.policy.placement.dp import DPPolicy
 # GEMM dimensions: (M, K) x (K, N) → (M, N)
 # run() allocates a as (M, K), b as (K, N) and out as (M, N) from these values.
 M, K, N = 128, 256, 128
 # Dtype string passed as dtype= to every tensor allocation in run().
 DTYPE = "f16"
 def _gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"):
    """Issue the GEMM ``out = a @ b`` through the ``tl`` kernel API.

    The left operand is fetched with ``tl.load`` (into TCM via DMA_READ,
    per the bench description); the right operand is only referenced with
    ``tl.ref`` so it stays in HBM and is streamed per-tile by the scheduler.

    Args:
        a_ptr: Pointer to the (M, K) left operand.
        b_ptr: Pointer to the (K, N) right operand.
        out_ptr: Pointer the composite GEMM writes its result to.
        M, K, N: GEMM dimensions.
        tl: Kernel-language object providing load/ref/composite/wait.
        DTYPE: Element dtype string used for both operands.
    """
    lhs = tl.load(a_ptr, shape=(M, K), dtype=DTYPE)
    rhs = tl.ref(b_ptr, shape=(K, N), dtype=DTYPE)
    # Submit the composite op, then block until its handle completes.
    tl.wait(tl.composite(op="gemm", a=lhs, b=rhs, out_ptr=out_ptr))
 def run(ctx):
    """Allocate the GEMM operands through ``ctx`` and launch the kernel.

    Placement: ``a`` is replicated at cube and PE level; ``b`` and ``out``
    are replicated at cube level and column_wise at PE level.
    """
    # Left operand (M, K), zero-initialised.
    lhs_policy = DPPolicy(cube="replicate", pe="replicate")
    lhs = ctx.zeros((M, K), dtype=DTYPE, dp=lhs_policy, name="a")

    # Right operand (K, N), zero-initialised.
    rhs_policy = DPPolicy(cube="replicate", pe="column_wise")
    rhs = ctx.zeros((K, N), dtype=DTYPE, dp=rhs_policy, name="b")

    # Output (M, N), allocated with ctx.empty and the same placement as b.
    result_policy = DPPolicy(cube="replicate", pe="column_wise")
    result = ctx.empty((M, N), dtype=DTYPE, dp=result_policy, name="out")

    # Launch GEMM kernel
    ctx.launch("qkv_gemm", _gemm_kernel, lhs, rhs, result, M, K, N)
@@ -0,0 +1,39 @@
 """QKV GEMM benchmark: Q*K^T projection on all PEs in a cube (multi-PE).
 Column-parallel GEMM: a is replicated (cube-level), b/out are column-sharded.
 M_CPU fans out KernelLaunchMsg to all 8 PE_CPUs (ADR-0009 D3).
 Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait()
  - Tensor a is loaded into TCM via DMA
  - Tensor b stays in HBM; PE_SCHEDULER streams it per-tile (32x64x32)
 """
 from kernbench.policy.placement.dp import DPPolicy
 # GEMM dimensions: (M, K) x (K, N) -> (M, N)
 # run() allocates a as (M, K), b as (K, N) and out as (M, N) from these values.
 M, K, N = 128, 256, 128
 # Dtype string passed as dtype= to every tensor allocation in run().
 DTYPE = "f16"
 def _gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"):
    """Per-PE GEMM kernel body: ``out = a @ b`` expressed via ``tl``.

    ``a`` is brought in with ``tl.load`` (TCM via DMA_READ, per the bench
    description) while ``b`` is only referenced with ``tl.ref`` (kept in
    HBM and streamed per-tile by the scheduler).

    Args:
        a_ptr: Pointer to the (M, K) left operand.
        b_ptr: Pointer to the (K, N) right operand.
        out_ptr: Destination pointer handed to the composite GEMM.
        M, K, N: GEMM dimensions.
        tl: Kernel-language object providing load/ref/composite/wait.
        DTYPE: Element dtype string used for both operands.
    """
    a_shape = (M, K)
    b_shape = (K, N)
    a_tile = tl.load(a_ptr, shape=a_shape, dtype=DTYPE)
    b_view = tl.ref(b_ptr, shape=b_shape, dtype=DTYPE)
    pending = tl.composite(op="gemm", a=a_tile, b=b_view, out_ptr=out_ptr)
    # Block until the composite GEMM has completed.
    tl.wait(pending)
 def run(ctx):
    """Allocate operands for the multi-PE GEMM bench and launch the kernel.

    Placement: ``a`` is replicated at cube and PE level; ``b`` and ``out``
    are replicated at cube level and column_wise (N-axis split) at PE level.
    """
    # Replicated left operand (M, K), zero-initialised.
    a_dp = DPPolicy(cube="replicate", pe="replicate")
    a_tensor = ctx.zeros((M, K), dtype=DTYPE, dp=a_dp, name="a")

    # Column-sharded right operand (K, N), zero-initialised.
    b_dp = DPPolicy(cube="replicate", pe="column_wise")
    b_tensor = ctx.zeros((K, N), dtype=DTYPE, dp=b_dp, name="b")

    # Column-sharded output (M, N), allocated with ctx.empty.
    out_dp = DPPolicy(cube="replicate", pe="column_wise")
    out_tensor = ctx.empty((M, N), dtype=DTYPE, dp=out_dp, name="out")

    # Launch GEMM kernel on all PEs
    ctx.launch("qkv_gemm_multi", _gemm_kernel, a_tensor, b_tensor, out_tensor, M, K, N)
@@ -0,0 +1,108 @@
 # ADR-0001: PhysAddr Layout & Address Decoding Contract
 ## Status
 Accepted
 ## Date
 2026-02-27
 ## Context
 KernBench Graph Latency Simulator must route requests deterministically and compute end-to-end latency strictly by graph traversal.
 To model local vs remote traffic (same/different SIP, same/different CUBE, optional PE-group), requests need a stable, parsable address/location scheme that:
 - can be decoded into routing domains (SIP/CUBE/HBM/PE-resource, etc.)
 - remains topology-agnostic (no hardcoded counts)
 - supports swappable policy and DI-first components without leaking topology assumptions into node implementations
 ## Decision
 We define a **PhysAddr value object** and an **address decoding contract** that converts an integer address into routing domains.
 ### D1. PhysAddr is an immutable value object
 - PhysAddr is immutable and comparable as a pure value.
 - Any allocator returns a **fully specified PhysAddr** (not partial metadata).
 - No global state may be required to interpret a PhysAddr.
 ### D2. PhysAddr fields (logical contract)
 PhysAddr must be able to represent at least:
 - `rack_id` (optional but reserved for scale-out)
 - `sip_id`  (device / SIP domain)
 - `sip_seg` (SIP-level segment/window selection, e.g., cube window)
 - `local_offset` (offset within the chosen segment/window)
 Decoded/derived fields may include (optional):
 - `cube_id`
 - `kind` (e.g., HBM vs PE-resource vs raw)
 - `unit_type` / `pe_id` (if PE-level addressing is modeled)
 **Important:** The exact bit allocation may evolve, but the *semantic fields above* must remain decodable without hidden assumptions.
 ### D3. Decoding is deterministic and policy-compatible
 - Decoding must deterministically map an integer address to:
  - destination SIP domain (`sip_id`)
  - destination sub-domain (`cube_id` if applicable)
  - destination target kind (HBM/PE-resource/other)
 - Decoding must not depend on runtime topology sizes; it may depend on **explicit topology parameters** provided through configuration (e.g., segment size, slice size), and those parameters must live in the topology/config layer (not in random components).
 ### D4. Topology-derived constants live in the topology layer
 Constants such as segment sizes (e.g., HBM slice size / window size) are derived from topology configuration (YAML/JSON/dict) and are provided to the decoder via DI/config.
 They must not be hardcoded in node implementations.
 ### D5. Routing consumes decoded domains, not raw bits
 Routing policy uses decoded domains:
 - `src` location (sip/cube/pe or node_id)
 - `dst` domains derived from PhysAddr decoding
 - `size_bytes` for size-aware link latency
 Routing must not inspect raw bit-fields directly except inside the decoding module.
 ## Alternatives Considered
 1) **Use raw integers everywhere, decode ad-hoc in routing**
 - Rejected: leads to duplicated logic, inconsistent routing, and hidden assumptions embedded in multiple components.
 2) **Hardcode topology sizes (SIP/CUBE/PE counts) into decoding**
 - Rejected: violates SPEC (R3) and breaks swappability and configuration-driven topologies.
 3) **Put decoding inside memory controllers or routers**
 - Rejected: leaks policy into components and undermines DI-first, swappable implementations (SPEC R4).
 ## Consequences
 ### Positive
 - Deterministic routing domains enable clear test invariants for local vs remote paths (SPEC R1, R5).
 - Keeps topology variability (SPEC R3) while preserving consistent semantics.
 - DI-first: decoder can be swapped or extended without changing components or tests (SPEC R4).
 ### Tradeoffs / Costs
 - Requires explicit configuration for any topology-derived sizes.
 - Introduces a single “blessed” decoding module that must remain stable and well-tested.
 ## Implementation Notes (Non-normative)
 - Recommended module boundary:
  - `src/kernbench/policy/address/phyaddr.py`
 - Tests should cover:
  - deterministic decoding
  - local vs remote classification from decoded fields
  - invariants: “allocator returns full PhysAddr”, “decoding requires no global state”
 ## Links
 - SPEC.md: R1 (routing), R3 (configurable topology), R4 (DI-first), R5 (multi-domain comm)
@@ -0,0 +1,103 @@
 # ADR-0002: Routing Distance, Ordering & Bypass Rules
 ## Status
 Accepted
 ## Date
 2026-02-27
 ## Context
 The KernBench Graph Latency Simulator must compare kernel execution time
 across different architectures and topologies by computing end-to-end
 latency from graph traversal.
 To support meaningful comparison:
 - routing must be deterministic
 - latency must reflect actual interconnect structure
 - local vs remote traffic must be distinguishable
 - “bypass” optimizations must not undermine debuggability or correctness
 The simulator also aims to avoid software-managed metadata and hidden
 shortcuts that obscure control paths.
 ## Decision
 ### D1. Distance is accumulated latency, not hop count
 - Routing “distance” is defined as the **sum of per-node and per-link latency**.
 - Hop count alone must not be used for ordering or path selection.
 - Size-aware serialization latency (bytes / BW) contributes to distance.
 ### D2. Routing order is derived from graph traversal
 - The chosen route is the path with minimum accumulated latency
  given the constructed graph and routing policy.
 - Deterministic ordering must be guaranteed for identical inputs
  (topology + policy + request).
 ### D3. Bypass is explicit and graph-represented
 - Any bypass (e.g., local cube HBM access via XBAR instead of NOC) must be:
  - explicitly represented as a graph path, and
  - subject to latency accumulation like any other path.
 - Example: PE_DMA has dual egress — one to XBAR (HBM path) and one to NOC (non-HBM path).
  Both are explicit graph edges; neither is a “bypass” — they are distinct data paths
  serving different memory domains.
 - Implicit or “magic” bypass paths are disallowed.
 ### D4. No zero-latency end-to-end paths
 - Every routed request must incur **end-to-end** latency > 0.
 - Individual fabric segments (e.g., NOC hops) MAY have distance_mm = 0
  when the fabric is distributed and distance is not meaningful at that granularity.
  This is allowed because other components on the same path (e.g., PE_DMA, SRAM,
  UCIe endpoints) contribute non-zero latency, ensuring the end-to-end invariant holds.
 - Fully zero-latency end-to-end paths are disallowed, except for explicit
  test-only stubs clearly marked as such.
 ### D5. Policy vs topology responsibility split
 - Topology builder:
  - defines nodes and links and their latency/BW parameters
 - Routing policy:
  - selects among available graph paths based on decoded domains
 - Routing policy must not assume missing links; missing connectivity
  is a topology construction error.
 ### D6. No software-managed routing metadata
 - Routing decisions must not rely on per-request software-managed metadata
  that tracks distance, hop count, or ordering outside the graph model.
 - All distance/order computation is derived from traversal itself.
 ## Alternatives Considered
 1) **Hop-count based routing**
 - Rejected: ignores heterogeneous latency/BW and misrepresents
  architectural differences.
 2) **Implicit local shortcuts**
 - Rejected: breaks debuggability and violates traversal-based latency.
 3) **Software-managed distance metadata**
 - Rejected: increases control overhead and obscures routing semantics.
 ## Consequences
 ### Positive
 - Clear, debuggable hop-by-hop traces (SPEC R2, R4).
 - Architecture comparisons reflect real interconnect structure.
 - Routing behavior is reproducible and deterministic.
 ### Tradeoffs / Costs
 - Graph construction must be correct and complete.
 - Bypass modeling requires explicit graph representation,
  which slightly increases topology description complexity.
 ## Implementation Notes (Non-normative)
 - Recommended responsibilities:
  - Graph builder: ensure all required paths exist.
  - Router: select next hop based on decoded domains and policy.
 - Tests should assert:
  - non-zero end-to-end latency
  - deterministic routing for identical inputs
  - bypass paths appear explicitly in emitted traces
 ## Links
 - SPEC.md: R1 (routing), R2 (latency), R3 (topology), R5 (multi-domain comm)
 - ADR-0001: PhysAddr layout & decoding contract
@@ -0,0 +1,64 @@
 # ADR-0003: Target System Hierarchy & Modeling Scope
 ## Status
 Accepted
 ## Context
 We need a system-level simulator to evaluate LLM kernel performance on our AI Accelerator platform.
 The platform is organized as a compute tray containing multiple identical SIPs connected via PCIe or UAL
 through switching fabrics, with a host CPU issuing commands/kernels.
 ## Decision
 We model the system hierarchy explicitly:
 ### D1. Tray-level
 - A compute tray contains:
  - Host CPU (issues requests / coordinates runtime & data placement)
  - Multiple identical SIPs (accelerators)
  - Interconnect fabric between SIPs (PCIe and/or UAL via switches)
 ### D2. SIP-level
 - A SIP is a multi-die package composed of:
  - Multiple CUBEs (HBM die + compute PEs + UCIe)
  - One or more IO chiplets (host/SIP interfaces)
 - IO chiplets:
  - provide interfaces: PCIe-EP, IO_CPU, optionally UAL-EP
  - can be multiple per SIP
  - placement constrained to SIP shoreline (top/bottom/left/right); each shoreline may host 1–2 IO chiplets
 ### D3. CUBE-level
 - A CUBE contains:
  - HBM + memory controller (HBM_CTRL)
  - XBAR (top/bottom): HBM pseudo-channel crossbar, PE's dedicated path to HBM
  - Bridge (left/right): connects XBAR.top ↔ XBAR.bottom for cross-half HBM access
  - NOC: distributed on-die fabric spanning the entire cube (distance modeled as 0);
    carries non-HBM traffic including inter-cube (UCIe), command (M_CPU↔PE_CPU), and shared SRAM access
  - Shared SRAM: cube-level shared memory accessible by all PEs via NOC
  - management/control CPU (M_CPU) coordinating PE command distribution and completion aggregation
  - multiple PEs
  - up to 4 UCIe endpoints (N/E/W/S) for CUBE↔CUBE and CUBE↔IO connectivity
 ### D4. PE-level
 - A PE can execute one kernel instance
 - PE contains internal control + accelerators (modeled at PE view granularity):
  - PE_CPU, command handler, PE_TCM, DMA/GEMM/MATH engines, internal queues
 ## Consequences
 - The simulator supports abstraction by “views”:
  - SIP view hides PE internals
  - CUBE view treats each PE as a single block
  - PE view expands PE internals
 - Topology remains parameterized; sizes/counts/links come from configuration.
 ## Links
 - SPEC R3/R5
 - ADR-0005 (diagram views)
@@ -0,0 +1,64 @@
 # ADR-0004: Memory Semantics & Local-HBM Bandwidth Guarantee
 ## Status
 Accepted
 ## Context
 Accurately modeling PE↔HBM behavior is essential for kernel latency estimation.
 Each PE has a notion of “local HBM” that must guarantee full HBM bandwidth, independent of intervening on-die fabric bandwidth.
 ## Decision
 ### D1. Local HBM definition
 - Each PE is assigned a logically defined “local HBM” region.
 - Local HBM corresponds to the pseudo-channel subset directly attached to that PE’s DMA path
  via the XBAR (top or bottom, depending on PE corner placement).
 - The path is: PE_DMA → XBAR.top/bottom → HBM_CTRL.
 - The mapping (HBM pseudo-channels → PE local regions) is derived from topology configuration.
 ### D2. Local HBM bandwidth guarantee contract
 - Accesses from a PE to its local HBM MUST be guaranteed full HBM read/write bandwidth,
  independent of intervening fabric bandwidth limits.
 - This guarantee is modeled by:
  - a dedicated logical path and/or service model that enforces HBM BW at the PE-local-HBM interaction point,
  - while still incurring non-zero latency along explicitly modeled components.
 ### D3. Cross-half HBM semantics
 - A PE connected to XBAR.bottom that accesses HBM pseudo-channels on the XBAR.top half
  (or vice versa) traverses a bridge:
  - PE_DMA → XBAR.bottom → bridge → XBAR.top → HBM_CTRL
 - Bridge bandwidth may limit cross-half HBM access relative to local-half access.
 ### D4. Non-local HBM semantics (inter-cube / inter-SIP)
 - Accesses from a PE to HBM in a different cube or SIP MAY be limited by:
  - NOC bandwidth within the cube,
  - inter-cube UCIe links,
  - inter-SIP fabric (PCIe/UAL).
 - These paths MUST be explicit and traceable.
 ### D5. Shared SRAM semantics
 - Each CUBE contains a shared SRAM accessible by all PEs in that CUBE.
 - Access path: PE_DMA → NOC → shared SRAM.
 - Shared SRAM bandwidth is limited by the NOC↔SRAM link bandwidth.
 - Shared SRAM is not part of the HBM address space; it is a separate memory domain.
 ## Verification Notes
 Tests should cover:
 - local-HBM case: BW matches HBM BW regardless of fabric BW parameter
 - cross-half HBM case: latency includes bridge traversal
 - non-local cases (inter-cube/inter-SIP): BW/latency respond to fabric/link parameters
 - shared SRAM case: access via NOC with correct BW
 ## Links
 - SPEC R2/R5
 - ADR-0002 (distance/order & explicit bypass)
@@ -0,0 +1,186 @@
 # ADR-0005: Diagram Views & Distance-Aware Layout Rules
 ## Status
 Accepted
 ## Context
 We require verifiable and inspectable system modeling for a large-scale,
 parameterized AI Accelerator system.
 Humans must be able to:
 - visually inspect the modeled topology,
 - reason about communication structure and relative distance,
 - do so at multiple abstraction levels without being overwhelmed by detail.
 The simulator models distance (accumulated latency) as a first-class concept.
 Diagrams must reflect this distance by default.
 ---
 ## Global Defaults
 - All diagrams MUST be **distance-aware by default**.
 - All diagrams MUST render **representative views** of the architecture.
 - Instance indices (e.g., sip0, cube2, pe3) MUST NOT be required for diagram generation.
 - Instance indices MAY be used ONLY:
  - to define a distance anchor in asymmetric or debugging scenarios, or
  - when explicitly requested.
 ---
 ## Representative Rendering Rule
 - All CUBEs share the same internal structure.
 - All PEs share the same internal structure.
 Therefore:
 - SIP-level diagrams render representative CUBEs and IO chiplets.
 - CUBE-level diagrams render representative PEs as opaque blocks.
 - PE-level diagrams render a representative PE with fully expanded internals.
 Diagrams MUST NOT depend on specific SIP, CUBE, or PE indices
 unless explicitly requested.
 ---
 ## Diagram Views
 ### View A — SIP-Level Diagram
 **Purpose**
 Explain system-scale structure and connectivity.
 **Visible elements**
 - SIP boundaries (optional)
 - CUBEs (opaque blocks)
 - IO chiplets (opaque blocks)
 - Optional UCIe stubs only if needed to clarify connectivity
 **Hidden elements**
 - PE internals
 - CUBE internal fabric
 - IO chiplet internals
 **Visible links**
 - Host ↔ IO chiplets (PCIe)
 - SIP ↔ SIP (PCIe / UAL via switches)
 - IO ↔ CUBE (on-package links)
 ---
 ### View B — CUBE-Level Diagram
 **Purpose**
 Explain cube-internal structure and data/control flow.
 **Visible elements**
 - XBAR (top/bottom): HBM pseudo-channel crossbar
 - Bridge (left/right): cross-half HBM connectors between XBAR.top and XBAR.bottom
 - NOC: distributed on-die fabric for non-HBM traffic
 - HBM subsystem (HBM_CTRL)
 - Shared SRAM: cube-level shared memory
 - Management CPU (M_CPU)
 - PEs as opaque blocks (PE[0..N−1])
 - UCIe endpoints (N/E/W/S) as ports
 **Hidden elements**
 - PE internals
 **Visible links**
 - PE → XBAR (HBM data path, top or bottom by corner placement)
 - PE → NOC (non-HBM data path)
 - XBAR ↔ bridge ↔ XBAR (cross-half HBM access)
 - XBAR → HBM_CTRL
 - NOC ↔ UCIe endpoints
 - NOC ↔ shared SRAM
 - M_CPU ↔ NOC (command path)
 - NOC → PE_CPU (command delivery, collapsed into PE block)
 ---
 ### View C — PE-Level Diagram
 **Purpose**
 Explain internal PE behavior and execution structure.
 **Visible elements**
 - PE_CPU
 - Command handler / scheduler
 - PE_TCM (local SRAM)
 - HW accelerators (DMA, GEMM, MATH, etc.)
 - Local HBM interface
 - Optional IPCQ / messaging endpoints
 **Visible links**
 - Control paths (CPU → scheduler → engines)
 - Data paths (engines ↔ TCM, DMA ↔ local HBM)
 - External fabric ports as abstract ports only
 ---
 ## Distance-Aware Layout (Default)
 ### Distance definition
 - Distance is defined as **accumulated latency**, consistent with ADR-0002.
 - Distance is computed from a single anchor node.
 ### Default anchor selection
 - SIP view: IO chiplet (or Host CPU if present)
 - CUBE view: a representative PE
 - PE view: PE_CPU or Command Handler
 Anchors are **implicit defaults** and MUST NOT be required to be specified.
 ### Layout rules
 - Diagrams MUST be laid out in layers based on distance buckets.
 - Layout direction MUST be consistent within a view type
  (preferred: left-to-right).
 - Nodes with equal distance MUST have stable ordering
  (by role or identifier, deterministically).
 Cycles MAY be rendered using dashed or curved edges for readability,
 without affecting distance semantics.
 ---
 ## Generation Contract (for Tools / Claude Code)
 When generating diagrams:
 - Assume distance-aware layout by default.
 - Assume representative rendering by default.
 - Do NOT ask for SIP/CUBE/PE indices unless required.
 - Do NOT expand hidden abstraction levels.
 - Prefer architectural clarity over micro-hop fidelity.
 ---
 ## Consequences
 - Diagrams are stable across topology scaling.
 - Changes in distance or routing policy are reflected visually.
 - Diagrams serve as verifiable artifacts derived from the simulator model,
  not as hand-maintained documentation.
 ---
 ## Links
 - SPEC Section 4 (Output, Debuggability, and Diagrams)
 - ADR-0002 (Routing distance semantics)
 - ADR-0006 (Topology compilation & automatic diagram generation)
@@ -0,0 +1,130 @@
 # ADR-0006: Topology Compilation, Distance Extraction, and Automatic Diagram Generation
 ## Status
 Accepted
 ## Context
 The simulator compiles topology configuration (e.g., topology.yaml) into an explicit model graph,
 and computes routing and accumulated latency (distance).
 Diagrams should be generated from these authoritative artifacts to ensure consistency and avoid
 hand-maintained topology drawings.
 Additionally, for usability, diagrams should be emitted automatically into a stable location
 so that developers can preview them immediately in the repository.
 ---
 ## Decision
 ### D1. Topology compilation is the single source of truth
 - topology.yaml (or equivalent config) is compiled into:
  - an explicit system graph,
  - node/link attributes,
  - routing policies.
 This compiled graph is the authoritative representation of the system.
 ### D2. Distance extraction during compilation
 - During or immediately after topology compilation, the simulator MUST compute distance metadata
  (accumulated latency) consistent with ADR-0002.
 - Distance metadata MUST be sufficient to support distance-aware diagram layout as defined in ADR-0005.
 - Distributed fabric segments (e.g., NOC) MAY have distance_mm = 0 per ADR-0002 D4;
  layout placement for such nodes uses explicit position metadata rather than distance buckets.
 ### D3. Diagram generation is a derived artifact
 - Diagrams MUST be generated from:
  - the compiled topology graph,
  - extracted distance metadata,
  - view/layout rules defined in ADR-0005.
 - Diagram generation MUST NOT require additional hand-written topology descriptions.
 ### D4. Automatic diagram emission to the repository
 - As part of topology compilation, the implementation MUST produce the following diagrams by default:
  - SIP-level diagram (representative, distance-aware)
  - CUBE-level diagram (representative, distance-aware)
  - PE-level diagram (representative, distance-aware)
 - The default output directory is:
  - `docs/diagrams/`
 - The generator MUST overwrite/update the emitted diagram files only when the compiled topology (or the diagram rules) has changed.
 ### D5. View-specific projection and layout
 For each view (SIP / CUBE / PE):
 - The generator MUST project the compiled graph into a reduced view graph:
  - hide/collapse nodes according to ADR-0005,
  - preserve connectivity semantics relevant to that view,
  - compute distance buckets and assign layout layers deterministically.
 - CUBE-level projection MUST include:
  - XBAR (top/bottom), bridge (left/right), NOC, HBM_CTRL, shared SRAM, M_CPU, UCIe ports,
    and PEs as opaque blocks.
  - Distinct edge kinds for HBM path (PE→XBAR) vs non-HBM path (PE→NOC).
 - Default anchors are implicit (ADR-0005) and MUST NOT require instance indices.
 ### D6. Output formats and determinism
 - The generator MUST output at least one of:
  - Mermaid (Markdown-native)
  - Graphviz DOT (rank-based control)
  - SVG (mm-accurate layout, no external dependencies)
 - SVG is preferred when mm-accurate position metadata is available from the compiled topology.
 - Output MUST be deterministic:
  - same topology + same rules → identical diagram text
 - File naming MUST be deterministic and stable (see "Output Conventions").
 ### D7. Performance and caching
 - Diagram generation MAY be lazy and/or cached, as long as the outputs in `docs/diagrams/`
  remain consistent with the compiled topology.
 - The implementation SHOULD use a cache key based on:
  - topology content hash,
  - routing policy version,
  - diagram rules version,
  - view type (SIP/CUBE/PE).
 ---
 ## Output Conventions
 ### Directory
 - `docs/diagrams/` is the canonical output directory for generated diagrams.
 ### File names (recommended, deterministic)
 - `system_view.svg` / `system_view.mmd` / `system_view.dot`
 - `sip_view.svg` / `sip_view.mmd` / `sip_view.dot`
 - `cube_view.svg` / `cube_view.mmd` / `cube_view.dot`
 - `pe_view.svg` / `pe_view.mmd` / `pe_view.dot`
 Optionally, for multi-topology workflows:
 - `sip_view__{topology_id}.svg`
 - `cube_view__{topology_id}.svg`
 - `pe_view__{topology_id}.svg`
 ### Repository policy
 - Generated diagram files MAY be committed to the repository to enable diff-based review.
 - If committed, they MUST be reproducible from topology compilation.
 ---
 ## Consequences
 - Diagrams are always consistent with simulator behavior.
 - Architectural changes automatically propagate to visualizations.
 - Diagram diffs become meaningful indicators of architectural change.
 ---
 ## Links
 - SPEC Section 4 (Output, Debuggability, and Diagrams)
 - ADR-0002 (Distance semantics)
 - ADR-0005 (Diagram views and layout rules)
@@ -0,0 +1,89 @@
 # ADR-0007: Runtime API and Simulation Engine Boundaries
 ## Status
 Accepted
 ## Context
 The simulator consists of multiple layers with distinct responsibilities:
 - a host-facing API layer used by benchmarks and user code,
 - a discrete-event simulation engine that executes requests,
 - device components that model hardware behavior.
 Without strict boundaries, orchestration logic can leak into components,
 or simulation internals can become entangled with user-facing APIs.
 This ADR defines clear responsibility boundaries between:
 - runtime API,
 - simulation engine (sim_engine),
 - hardware components.
 ---
 ## Decision
 ### D1. Runtime API is host-facing orchestration only
 The runtime API represents host/driver-level behavior and MUST:
 - expose high-level operations (tensor deployment, kernel launch),
 - submit requests only to endpoint components (e.g., IO_CPU),
 - await completion via futures/handles,
 - own and persist host-side metadata (tensor allocation maps, kernel bindings).
 The runtime API MUST NOT:
 - hardcode hop-by-hop routing or fan-out,
 - directly invoke internal components (M_CPU, PE_CPU, engines),
 - embed topology- or routing-specific assumptions.
 ---
 ### D2. Simulation engine executes and schedules requests
 The simulation engine (sim_engine) MUST:
 - inject requests into the compiled topology graph,
 - schedule and execute events using a discrete-event model,
 - manage correlation ids and completion tracking,
 - decompose operations into low-level requests when required
  (e.g., MemoryWrite events).
 The simulation engine MUST NOT:
 - define tensor semantics,
 - define kernel execution policies,
 - expose internal graph details to the runtime API.
 ---
 ### D3. Components own fan-out and aggregation
 Device-side components MUST:
 - fan-out requests to downstream domains
  (IO_CPU → M_CPU → PE_CPU → schedulers/engines),
 - aggregate completion and failure signals,
 - propagate results deterministically upstream.
 Neither the runtime API nor the simulation engine may orchestrate
 component-level fan-out explicitly.
 ---
 ## Consequences
 - Runtime APIs remain stable as topology and routing evolve.
 - Simulation internals can change without affecting user-facing code.
 - Component implementations remain swappable via DI.
 ---
 ## Links
 - SPEC R4, R7, R8
 - ADR-0008 (Tensor deployment)
 - ADR-0009 (Kernel execution)
@@ -0,0 +1,100 @@
 # ADR-0008: Tensor Deployment and Allocation (Host Allocator, PA-first)
 ## Status
 Accepted
 ## Context
 Benchmarks require PyTorch-like tensor semantics:
 - tensor creation (empty, fill),
 - deployment to accelerator devices (tensor.to()).
 In the realistic system, host software manages allocation/mapping and installs
 mappings for DMA/MMU. For Phase 0 we simplify (ADR-0011):
 - device memory operations use PA only,
 - VA/MMU/IOMMU is not modeled.
 To keep the host↔device interface minimal, we avoid a separate
 AllocateTensorMeta message. Instead, host allocation produces a PA shard map
 that is used directly by MemoryWrite/Read and KernelLaunch.
 ---
 ## Decision
 ### D1. Tensor is a host-owned handle with PA shard mapping
 A Tensor object is a host-owned handle that encapsulates:
 - shape and dtype,
 - initialization intent,
 - device placement and allocation metadata as a PA shard map.
 After deployment, the Tensor handle MUST contain:
 - a list of shards, each with (sip,cube,pe,pa,nbytes,offset_bytes).
 This PA shard mapping is the single source of truth for kernel argument binding.
 ---
 ### D2. Deployment uses a host allocator (Phase 0)
 In Phase 0, tensor deployment produces PA shard mappings via a host allocator:
 - placement (split/replicate/hybrid) is decided by a DP policy,
 - allocation assigns PA ranges at the PE level and returns shard mappings,
 - the Tensor handle stores the resulting shard list deterministically.
 No separate host-visible device allocation RPC is required in Phase 0.
 ---
 ### D3. Data initialization and transfer uses MemoryWrite/Read only
 Any data initialization or transfer implied by a tensor (e.g., fill, copy)
 MUST be represented using Host ↔ IO_CPU messages only:
 - MemoryWrite
 - MemoryRead
 Rules:
 - MemoryWrite/Read MUST reference PA + (sip,cube,pe) tags (ADR-0012).
 - Allocation metadata MUST NOT be embedded as a separate allocation message.
 - Bulk tensor data MUST NOT be embedded in Phase 0 messages.
 The simulation engine schedules MemoryWrite/Read through the graph so that
 latency is computed by explicit traversal.
 ---
 ### D4. Extension path (non-breaking)
 Future ADRs MAY introduce optional VA/MMU/IOMMU modeling by adding:
 - virtual addressing in tensor handles,
 - mapping install steps,
 - translation latency/page granularity.
 The Phase 0 PA shard map remains a valid fast-path configuration.
 ---
 ## Consequences
 - Host↔IO_CPU contract remains minimal (MemoryRead/Write + KernelLaunch).
 - KernelLaunch can pass per-PE data placement explicitly via shard tags.
 - Early implementation stays simple and testable.
 ---
 ## Links
 - ADR-0011 (PA-first)
 - ADR-0012 (Host↔IO_CPU schema)
 - ADR-0007 (runtime_api vs sim_engine boundaries)
 - ADR-0009 (Kernel execution)
@@ -0,0 +1,74 @@
 # ADR-0009: Kernel Execution Messaging and Completion Semantics
 ## Status
 Accepted
 ## Context
 Kernel execution is initiated by the host and proceeds through
 device control components:
 Host → IO_CPU → M_CPU → PE_CPU → schedulers → engines
 Completion propagates in reverse order.
 To keep benchmarks simple and topology-agnostic,
 kernel execution must be endpoint-driven with deterministic aggregation.
 ---
 ## Decision
 ### D1. Kernel launch is an endpoint request
 A kernel launch is initiated by submitting a single KernelLaunch request
 to the IO_CPU endpoint.
 The runtime API MUST:
 - construct the kernel launch request,
 - submit it to IO_CPU,
 - await a single completion result.
 The runtime API MUST NOT orchestrate internal fan-out.
 ---
 ### D2. Tensor arguments are passed by metadata
 KernelLaunch requests MUST reference tensor arguments via:
 - host-owned tensor handles, or
 - resolved device address maps derived from those handles.
 Bulk tensor data MUST NOT be embedded in kernel launch messages.
 ---
 ### D3. Fan-out and aggregation are component responsibilities
 - IO_CPU fans out work to M_CPUs.
 - M_CPU fans out work to PE_CPUs.
 - PE_CPU manages kernel execution and engine dispatch.
 Completion semantics:
 - M_CPU completes when all targeted PEs complete or a failure policy triggers.
 - IO_CPU completes when all targeted CUBEs complete or a failure policy triggers.
 ---
 ### D4. Completion and failure propagation
 - All messages MUST carry correlation identifiers.
 - Completion and failure MUST propagate deterministically to the host.
 - The simulation engine provides futures/handles to observe completion.
 ---
 ## Links
 - SPEC R1, R2, R7, R8
 - ADR-0007 (Runtime API boundaries)
 - ADR-0008 (Tensor deployment)
@@ -0,0 +1,62 @@
 # ADR-0010: CLI Device Selection and Multi-Device Execution Semantics
 ## Status
 Accepted
 ## Context
 Benchmarks represent device-agnostic workloads that operate on a single device.
 Users may want to run a benchmark:
 - on a specific device, or
 - across all devices in the system.
 Device enumeration must not leak into benchmarks or runtime APIs.
 ---
 ## Decision
 ### D1. Benchmarks are single-device by design
 - A benchmark MUST define behavior for a single device only.
 - A benchmark MUST accept a device identifier as input.
 - Benchmarks MUST NOT enumerate or loop over multiple devices.
 ---
 ### D2. CLI controls device selection
 The `kernbench run` command supports an optional `--device` argument:
 - If `--device <id>` is specified:
  - the benchmark executes once for the specified device.
 - If `--device` is omitted:
  - the benchmark executes once per SIP discovered in the topology (one single-device execution for each SIP; see D1 and D3).
 ---
 ### D3. Multi-device execution is logically parallel
 When running on multiple devices:
 - benchmark executions are submitted to a single simulation engine instance,
 - executions are logically parallel in simulation time,
 - inter-device contention is naturally modeled.
 ---
 ### D4. Runtime API and simulation engine remain device-scoped
 - Runtime API calls operate on one device per invocation.
 - The simulation engine schedules all requests deterministically.
 - Neither layer enumerates devices.
 ---
 ## Links
 - SPEC R7, R8
 - ADR-0007 (Runtime API boundaries)
@@ -0,0 +1,65 @@
 # ADR-0011: Memory Addressing Simplification (PA-first)
 ## Status
 Accepted
 ## Context
 A realistic system uses host-side virtual addressing and an MMU/IOMMU-style
 translation path for DMA: host allocates physical memory at PE level, maps it
 into a virtual address space, installs mappings, and DMA requests use virtual
 addresses that are translated to physical addresses.
 For early development, we want a minimal, deterministic model that enables:
 - correct routing and latency accounting through the graph,
 - stable tensor deployment and kernel execution semantics,
 - future extension toward VA/MMU without rewriting workflows.
 ---
 ## Decision
 ### D1. Phase 0 model is PA-only
 The simulator uses a PA-first model:
 - All device memory accesses (MemoryRead/MemoryWrite) operate on device physical
  addresses (PA) plus size.
 - Tensor handles store PA-based shard mappings after deployment.
 - KernelLaunch passes tensor arguments as PA-based mappings (or references to them).
 - MMU/IOMMU concepts (virtual address spaces, page tables, translation latency)
  are NOT modeled in Phase 0.
 ### D2. Allocation produces PA mappings
 Device allocation selects PE-local memory regions and returns PA mappings
 sufficient to execute kernels and issue DMA requests.
 ### D3. Extension path (non-breaking)
 A future ADR MAY introduce an optional VA/MMU layer by:
 - introducing virtual addresses in tensor handles,
 - adding a mapping-install step,
 - modeling translation latency and page granularity.
 The Phase 0 PA model remains a valid fast-path configuration.
 ---
 ## Consequences
 - Early implementation stays simple and testable.
 - All latency remains explicit via graph traversal, not hidden translation.
 - Future VA/MMU modeling can be added without breaking existing benchmarks.
 ---
 ## Links
 - ADR-0007 (runtime_api vs sim_engine boundaries)
 - ADR-0008 (tensor deployment)
 - ADR-0009 (kernel execution)
 - SPEC R2 (latency by traversal)
@@ -0,0 +1,232 @@
 # ADR-0012: Host ↔ IO_CPU Message Schema (PA-first, PE-tagged)
 ## Status
 Accepted
 ## Context
 Phase 0 uses a PA-first memory model (ADR-0011):
 - memory operations use device physical addresses (PA) only,
 - VA/MMU/IOMMU is not modeled.
 The host-facing runtime API interacts with the device via the IO_CPU endpoint.
 We define stable, minimal message schemas for Host ↔ IO_CPU so that:
 - benchmarks remain stable,
 - IO_CPU-internal fan-out/aggregation can evolve independently,
 - completion and failure propagation is deterministic.
 We also require PE-tagging (approach A): each shard explicitly carries (sip,cube,pe)
 so IO_CPU can deterministically route/fan-out without relying on PA decoding.
 ---
 ## Decision
 ### D1. Contract scope
 This schema is the stable contract ONLY for Host ↔ IO_CPU.
 Messages beyond IO_CPU (to M_CPU, PE_CPU, schedulers, engines) are component-internal
 and are NOT part of this host contract in Phase 0.
 ---
 ### D2. Required message set
 The runtime API MUST use only these message types for Host ↔ IO_CPU:
 - MemoryWrite
 - MemoryRead
 - KernelLaunch
 All operations required by benchmarks (tensor init/copy, kernel run) MUST be expressible
 with these messages.
 ---
 ### D3. Common envelope (mandatory for all requests)
 All Host ↔ IO_CPU requests MUST include:
 - `msg_type: str`
 - `correlation_id: str`
  - generated by the host
  - used to match responses deterministically
 - `request_id: str`
  - unique within a correlation_id
 - `target_device: str`
  - device identifier (e.g., "sip:0")
 - `timestamp_tag: str | None` (optional)
  - debug tag only; MUST NOT affect determinism
 All Host ↔ IO_CPU responses MUST include:
 - `correlation_id: str`
 - `request_id: str`
 - `completion: Completion`
 ---
 ### D4. Completion schema (mandatory)
 `Completion` MUST have:
 - `ok: bool`
 - `error_code: str | None`
 - `error_message: str | None`
 Rules:
 - If `ok == true` then `error_code` and `error_message` MUST be null.
 - If `ok == false` then `error_code` MUST be non-null.
 - Completion semantics MUST be deterministic.
 ---
 ### D5. MemoryWrite schema (PA-first, PE-tagged)
 `MemoryWrite` represents a host-initiated write/initialize operation to device memory.
 Mandatory fields:
 - common envelope fields (D3)
 - destination placement tags (approach A):
  - `dst_sip: int`
  - `dst_cube: int`
  - `dst_pe: int`
 - `dst_pa: int`
  - destination physical address in the destination PE's address space
 - `nbytes: int`
 - `src_kind: "pattern" | "host_buffer_ref"`
  - Phase 0 MUST support "pattern"
 - `pattern: Pattern | None`
  - required if `src_kind == "pattern"`
 `Pattern` (Phase 0 mandatory support):
 - `pattern_kind: "zero" | "fill_u8" | "fill_u16" | "fill_u32" | "fill_fp16" | "fill_fp32"`
 - `value: number | None`
  - required for fill_*; ignored for zero
 Optional fields:
 - `dst_mem_kind: "HBM" | "TCM" | "AUTO"` (default "AUTO")
 - `debug_label: str | None`
 Notes:
 - This message MUST NOT embed bulk tensor data in Phase 0.
 - All latency MUST come from explicit graph traversal and modeled components.
 ---
 ### D6. MemoryRead schema (PA-first, PE-tagged)
 `MemoryRead` represents a host-initiated read from device memory.
 Mandatory fields:
 - common envelope fields (D3)
 - source placement tags (approach A):
  - `src_sip: int`
  - `src_cube: int`
  - `src_pe: int`
 - `src_pa: int`
 - `nbytes: int`
 Optional fields:
 - `dst_kind: "host_sink" | "discard"` (default "host_sink")
 - `debug_label: str | None`
 Response payload:
 - actual bytes are NOT required in Phase 0 (latency/traces focus)
 - implementations MAY return lightweight stats or hashes later via a new ADR
 ---
 ### D7. KernelLaunch schema (PA-first, PE-tagged shards)
 `KernelLaunch` represents launching a kernel on a target device via IO_CPU.
 Mandatory fields:
 - common envelope fields (D3)
 - `kernel_ref: KernelRef`
 - `args: list[KernelArg]`
 `KernelRef` MUST have:
 - `name: str`
 - `kind: "deployed" | "builtin"`
 - `deploy_pa: int | None` — PA where kernel binary was deployed (required for "deployed")
 - `deploy_sip: int` — SIP where binary resides
 - `deploy_cube: int` — cube where binary resides
 - `deploy_pe: int` — PE where binary resides
 - `nbytes_code: int` — kernel binary size (for BW modeling)
 Kernel binaries MUST be pre-deployed to device memory via MemoryWrite.
 KernelLaunch MUST NOT embed kernel source code or IR in the launch message.
 `KernelArg` supports tensor args by PA mapping and scalars by value.
 Tensor arg (mandatory):
 - `arg_kind: "tensor"`
 - `tensor_pa_map: TensorPAMap`
 `TensorPAMap` MUST have:
 - `shards: list[TensorShard]`
 `TensorShard` MUST have (approach A is mandatory):
 - `sip: int`
 - `cube: int`
 - `pe: int`
 - `pa: int`
 - `nbytes: int`
 - `offset_bytes: int`
 Scalar arg (mandatory):
 - `arg_kind: "scalar"`
 - `dtype: "i32" | "i64" | "fp16" | "fp32" | "bool"`
 - `value: number | bool`
 Optional KernelLaunch fields:
 - `grid: dict | None`
 - `meta: dict | None`
 - `failure_policy: "fail_fast" | "collect_all"` (default "fail_fast")
 - `debug_label: str | None`
 Notes:
 - KernelLaunch MUST NOT embed bulk tensor data.
 - KernelLaunch MUST be submitted only to the IO_CPU endpoint.
 - IO_CPU MUST fan-out work internally using the shard (sip,cube,pe) tags.
 ---
 ## Verification Notes
 Tests SHOULD validate:
 - schema validation rejects missing mandatory fields,
 - deterministic correlation/response matching,
 - MemoryWrite/Read/KernelLaunch produce explicit hop traces,
 - all routed requests incur latency > 0.
 ---
 ## Links
 - ADR-0011 (PA-first memory addressing)
 - ADR-0007 (runtime_api vs sim_engine boundaries)
 - ADR-0009 (kernel execution fan-out/aggregation)
 - SPEC R2, R7, R8
@@ -0,0 +1,139 @@
 # ADR-0013: Verification Strategy and Phase 1 Test Plan
 ## Status
 Accepted
 ## Context
 KernBench is a system-level simulator whose correctness is defined by:
 - adherence to SPEC-defined invariants,
 - determinism and debuggability,
 - explicit modeling of routing and latency.
 Given the evolving implementation, we need a stable verification strategy
 that prevents architectural drift while allowing incremental development.
 This ADR defines the Phase 1 verification plan and what constitutes
 "correct behavior" for early implementations.
 ---
 ## Decision
 ### D1. Verification is contract-based
 Verification MUST be derived from:
 - SPEC requirements,
 - accepted ADRs.
 Tests MUST validate architectural contracts, not incidental implementation details.
 ---
 ### D2. Phase 1 verification scope
 Phase 1 verification focuses on:
 - message contract validity (ADR-0012),
 - routing and fan-out semantics at the IO_CPU boundary (ADR-0009),
 - PA-first memory addressing and shard tagging (ADR-0011),
 - core latency and trace invariants (SPEC 0.1, R2).
 Microarchitectural accuracy, bandwidth contention, and cycle-level behavior
 are explicitly out of scope in Phase 1.
 ---
 ### D3. Required Phase 1 verification cases
 The following verification cases MUST be supported by the implementation:
 #### V1. Message schema validation
 - KernelLaunch requests missing `(sip, cube, pe)` in any tensor shard MUST be rejected.
 - MemoryWrite/MemoryRead requests missing destination/source placement tags MUST be rejected.
 - Completion results MUST follow the `ok / error_code / error_message` contract.
 #### V2. IO_CPU fan-out and aggregation
 Given:
 - a topology with one SIP, one CUBE, and two PEs,
 - a KernelLaunch request containing two tensor shards targeting different PEs,
 The system MUST:
 - submit a single KernelLaunch to IO_CPU,
 - fan-out work internally to both PEs,
 - aggregate completion and return a single deterministic completion to the host.
 #### V3. Latency and trace invariants
 For any valid request:
 - the hop-by-hop trace MUST be non-empty,
 - total latency MUST be greater than zero,
 - repeated runs with identical inputs MUST produce identical traces.
 #### V4. Topology independence and cross-domain coverage
 Verification cases MUST pass for multiple topology shapes, including:
 - minimal: (1 SIP, 1 CUBE, 1 PE)
 - multi-PE: (1 SIP, 1 CUBE, N PEs)
 - multi-CUBE within a SIP: (1 SIP, M CUBEs, ≥1 PE per CUBE)
 - multi-SIP tray: (K SIPs, ≥1 CUBE per SIP, ≥1 PE per CUBE)
 For multi-CUBE and multi-SIP topologies, Phase 1 verification focuses on:
 - explicit connectivity (required links exist),
 - deterministic routing and control-path traversal,
 - non-empty traces and latency > 0 for representative cross-domain requests
  (inter-CUBE and inter-SIP paths).
 Tests MUST NOT hardcode topology sizes, node ids, or link counts.
 Instead, tests MUST derive expectations from the compiled topology metadata.
 ---
 ### D4. Phase 1 artifacts
 Phase 1 MAY include:
 - verification-only test code,
 - topology fixtures,
 - trace inspection utilities.
 Phase 1 MUST NOT require:
 - production code changes solely to satisfy tests,
 - weakening or removing tests to allow progress.
 ---
 ### D5. Phase 2 enforcement
 Phase 2 (Apply) MUST:
 - run the Phase 1 verification cases,
 - rollback all changes if any verification fails,
 - preserve tests as authoritative contracts.
 ---
 ## Consequences
 - Architectural correctness is enforced early.
 - Tests serve as executable documentation of system behavior.
 - Implementation remains flexible without losing rigor.
 ---
 ## Links
 - SPEC 0.1, R2, R6
 - ADR-0011 (PA-first memory addressing)
 - ADR-0012 (Host ↔ IO_CPU message schema)
 - ADR-0009 (Kernel execution semantics)
@@ -0,0 +1,364 @@
 # ADR-0014: PE Internal Execution Model (PE_CPU, PE_SCHEDULER, and Composite Commands)
 ## Status
 Proposed
 ## Context
 ADR-0003 (system hierarchy) and ADR-0009 (kernel execution semantics) reference PE internals but do not define:
 - the dispatch model inside a PE,
 - the responsibilities of PE_SCHEDULER,
 - the PE_TCM-centric dataflow contract used by accelerator engines.
 We need a deterministic and debuggable PE-internal execution contract that supports:
 - simple single-engine commands
 - composite commands that build a tiled pipeline across DMA and accelerator engines
 The simulator must produce deterministic traces and allow modeling of PE-internal pipelining without introducing nondeterministic engine scheduling.
 ## Decision
 ### D1. PE internal component roles
 Each PE contains the following logical components.
 **PE_CPU**
 - Executes kernel instruction stream or kernel control logic.
 - Generates PE commands.
 - Submits commands to PE_SCHEDULER.
 - PE_CPU does NOT enqueue work directly into engine queues.
 **PE_SCHEDULER**
 - The sole dispatcher inside a PE.
 - Receives commands from PE_CPU.
 - Expands composite commands into sub-commands.
 - Tracks dependencies and command state.
 - Dispatches work to engine queues.
 - Manages tile scheduling for composite commands.
 **PE_DMA**
 - Handles memory transfers between PE_TCM and external memory domains.
 - PE_DMA has **dual egress** at the CUBE level:
  - **→ XBAR**: dedicated path to HBM (local and cross-half via bridge)
  - **→ NOC**: path to non-HBM destinations (shared SRAM, inter-cube UCIe, etc.)
 - Supported directions include:
  - HBM → PE_TCM (via XBAR)
  - PE_TCM → HBM (via XBAR)
  - PE_TCM → shared SRAM (via NOC)
  - PE_TCM → other memory domains (via NOC, if supported by topology)
 **PE_GEMM**
 - Matrix multiplication engine.
 - Reads activations from PE_TCM.
 - May stream weights directly from HBM.
 **PE_MATH**
 - Element-wise computation engine.
 - Reads and writes PE_TCM.
 **PE_TCM**
 - Local SRAM used as the staging memory for accelerator operations.
 ---
 ### D2. Command lifecycle and queues
 PE_SCHEDULER maintains three logical structures.
 **SubmissionQueue**
 - Written by PE_CPU.
 - Contains incoming PE commands waiting to be processed.
 **InflightTable**
 - Owned and mutated only by PE_SCHEDULER.
 - Tracks:
  - expanded sub-commands
  - dependency state
  - engine assignment
  - completion status
 **CompletionQueue**
 - Written by PE_SCHEDULER.
 - Contains final completion records for commands.
 **Single-writer rule**
 - Only PE_SCHEDULER is allowed to mutate command completion state.
 - Engine components must report completion via explicit completion events/messages.
 **Command completion**
 A command becomes DONE when both of the following hold:
 - all of its sub-commands have completed, and
 - PE_SCHEDULER has published a completion record to CompletionQueue.
 ---
 ### D3. Dispatch modes
 PE commands are divided into two categories.
 #### D3.1 Simple command
 A simple command expands to exactly one engine sub-command.
 Examples include:
 - DMA transfer
 - GEMM compute
 - MATH compute
 Execution flow:
 ```
 PE_CPU → SubmissionQueue → PE_SCHEDULER → engine queue → engine execution → completion event → PE_SCHEDULER → CompletionQueue
 ```
 #### D3.2 Composite command (tiled pipeline)
 Composite commands implement tiled pipelined execution across engines.
 Each tile executes the following pipeline:
 ```
 Input DMA (READ)
 → Compute (GEMM or MATH)
 → Output DMA (WRITE)
 ```
 **Tiling rule**
 If the DMA payload exceeds hardware tile size, PE_SCHEDULER splits the transfer into tiles.
 Each tile is assigned a monotonically increasing `tile_id`.
 **Tile dependency rules**
 For tile `t`:
 - Compute must wait for input DMA: `DMA_READ(t) → COMPUTE(t)`
 - Output DMA must wait for compute: `COMPUTE(t) → DMA_WRITE(t)`
 - All dependencies are enforced by PE_SCHEDULER.
 **Overlap policy (Phase 0 default)**
 Operations for different tiles may overlap when engine resources permit.
 Allowed overlaps:
 ```
 DMA_READ(t+1) ∥ COMPUTE(t)
 DMA_WRITE(t−1) ∥ COMPUTE(t)
 DMA_READ(t+1) ∥ DMA_WRITE(t)
 ```
 Disallowed overlaps:
 ```
 GEMM(t) ∥ GEMM(t′)
 MATH(t) ∥ MATH(t′)
 GEMM(t) ∥ MATH(t′)
 ```
 ---
 ### D4. Engine execution model (Phase 0 default)
 Each engine behaves as a deterministic service resource.
 **DMA engine**
 PE_DMA contains two independent channels.
 ```
 DMA_READ capacity  = 1
 DMA_WRITE capacity = 1
 ```
 Rules:
 - DMA_READ and DMA_WRITE may execute concurrently.
 - Multiple READs cannot overlap.
 - Multiple WRITEs cannot overlap.
 Example allowed:
 ```
 DMA_READ(t+1) ∥ DMA_WRITE(t)
 ```
 Example not allowed:
 ```
 DMA_READ(t) ∥ DMA_READ(t+1)
 DMA_WRITE(t) ∥ DMA_WRITE(t+1)
 ```
 **Compute engine**
 Compute operations share a single compute resource.
 ```
 PE_ACCEL capacity = 1
 ```
 Both GEMM and MATH require this shared compute slot.
 Consequences:
 - GEMM ∥ GEMM not allowed
 - MATH ∥ MATH not allowed
 - GEMM ∥ MATH not allowed
 Only one compute operation can run in a PE at a time.
 **Compute opcode restriction**
 Composite commands contain one compute opcode only.
 Examples:
 ```
 COMPOSITE_GEMM
 COMPOSITE_MATH
 ```
 Mixed compute pipelines such as `GEMM → MATH` are not supported in Phase 0.
 **Engine completion signaling**
 Every engine emits a completion event when a sub-command finishes.
 Completion events are delivered to PE_SCHEDULER.
 ---
 ### D5. Dataflow model
 Compute operations use a TCM-centric dataflow model.
 **Input path (HBM)**
 ```
 HBM → XBAR → PE_DMA (DMA_READ) → PE_TCM
 ```
 **Input path (shared SRAM)**
 ```
 Shared SRAM → NOC → PE_DMA (DMA_READ) → PE_TCM
 ```
 **Compute stage**
 Compute engines read input tensors from PE_TCM.
 ```
 PE_TCM → GEMM / MATH
 ```
 Weights for GEMM may optionally stream directly from HBM (via XBAR).
 **Output path (HBM)**
 Compute results are written to PE_TCM, then DMA writes to HBM.
 ```
 PE_TCM → PE_DMA (DMA_WRITE) → XBAR → HBM
 ```
 **Output path (shared SRAM)**
 ```
 PE_TCM → PE_DMA (DMA_WRITE) → NOC → Shared SRAM
 ```
 #### D5.1 PE_TCM partitioning and ownership boundary
 The PE_TCM address space is partitioned into two logical regions.
 **SchedulerReservedTCM**
 - A staging region owned exclusively by PE_SCHEDULER.
 - This region is used for composite command tile buffers.
 - PE_SCHEDULER:
  - partitions this region into tile buffers
  - assigns buffers for DMA_READ, COMPUTE, and DMA_WRITE stages
  - guarantees input/output buffer separation
  - manages tile buffer lifetime
 **AllocatableTCM**
 - General-purpose region managed by PEMemAllocator.
 - Used by host or DP-visible allocations.
 **Visibility rule (hard isolation)**
 - PEMemAllocator must not see or allocate memory inside SchedulerReservedTCM.
 - SchedulerReservedTCM is excluded from allocator-managed ranges by construction.
 - This prevents DP or host allocations from interfering with scheduler staging buffers.
 **Tile buffer rules**
 Within SchedulerReservedTCM:
 - input buffers and output buffers must not overlap
 - PE_SCHEDULER assigns tile buffers for DMA and compute stages
 - tile buffers remain valid until the corresponding DMA_WRITE completes
 - Buffer reuse is allowed only after the tile lifetime finishes.
 ---
 ### D6. Observability and trace contract
 The simulator must emit deterministic trace events.
 Required events include:
 - `command_submitted`
 - `sub_command_dispatched`
 - `engine_start`
 - `engine_complete`
 - `tile_ready`
 - `command_complete`
 Trace ordering must be deterministic for identical inputs.
 ---
 ### D7. Topology representation
 PE internal components are declared in `cube.pe_template`.
 The template is instantiated once per PE.
 PE instances are derived from `cube.pe_layout`.
 External connectivity such as:
 - PE_DMA → XBAR (HBM data path)
 - PE_DMA → NOC (non-HBM data path: shared SRAM, inter-cube UCIe)
 - NOC → PE_CPU (command path from M_CPU)
 is modeled at the CUBE level (see ADR-0003 D3).
 ---
 ## Links
 - SPEC R3, R4
 - ADR-0003 D4 (PE-level system hierarchy)
 - ADR-0005 View C (PE-level diagram)
 - ADR-0008 D2 (PA-level allocation at PE scope; PEMemAllocator is the per-PE allocator instance)
 - ADR-0009 D3 (kernel execution fan-out and PE_CPU dispatch)
@@ -0,0 +1,178 @@
 # ADR-0015: Component Port/Wire Model and Fabric Routing
 ## Status
 Proposed
 ## Context
 ADR-0007 D2 assigns path-walking and low-level request decomposition to the simulation engine.
 In practice, the engine iterates the topology path and calls `run()` on each component
 sequentially — conflating routing policy with component behavior and preventing realistic
 hardware modeling (queues, contention, fan-out).
 ADR-0007 D3 already states that components own fan-out and aggregation, but the current
 implementation does not enforce this for fabric traversal.
 This ADR defines:
 - how components communicate via typed port queues,
 - how propagation delay is modeled (wire processes),
 - the fabric path for Memory R/W through M_CPU.DMA,
 - the reduced role of the simulation engine,
 - M_CPU.DMA as an internal subcomponent of M_CPU.
 ---
 ## Decision
 ### D1. Component port model
 Each component has typed input/output ports modeled as SimPy Stores:
 ```
 in_ports:  dict[str, simpy.Store]   # keyed by source node_id
 out_ports: dict[str, simpy.Store]   # keyed by destination node_id
 ```
 Ports are created at engine initialization based on graph edges.
 Each directed edge (src → dst) results in:
 - `src.out_ports[dst]`  — the sending end
 - `dst.in_ports[src]`   — the receiving end
 ---
 ### D2. Wire process (propagation delay)
 For each directed edge (src, dst) in the topology graph, a SimPy wire process
 models propagation delay:
 ```python
 def wire_process(env, out_port, in_port, delay_ns):
    while True:
        cmd = yield out_port.get()
        yield env.timeout(delay_ns)
        yield in_port.put(cmd)
 ```
 Wire processes are started at engine initialization.
 BW constraints are enforced by the sending component's out_port capacity or token model,
 not by the wire process itself.
 ---
 ### D3. Engine role (reduced)
 The simulation engine MUST:
 - wire components at initialization (create port Stores, start wire processes),
 - identify the entry component for each request type (PCIE_EP),
 - put the request into the entry component's in_port,
 - wait for a completion event.
 The simulation engine MUST NOT:
 - walk the topology path during request execution,
 - call component `run()` methods directly,
 - track per-hop latency or decompose fan-out.
 This supersedes ADR-0007 D2's "decompose operations into low-level requests" clause.
 ADR-0007 D2 must be amended accordingly.
 ---
 ### D4. Unified fabric path for Memory R/W and Kernel Launch
 Both Memory R/W and Kernel Launch use the same fabric path to reach the target cube's M_CPU.
 The difference is what M_CPU does upon receiving the request.
 **Forward path (IO_CPU → target M_CPU):**
 ```
 IO_CPU
  → [transit cubes: ucie_out → wire → ucie_in → noc → ucie_out]  (zero or more)
  → target cube: ucie_in → noc → M_CPU
 ```
 **At M_CPU (diverges by operation type):**
 ```
 Memory R/W:     M_CPU → M_CPU.DMA → noc → hbm_ctrl
 Kernel Launch:  M_CPU → PE[0..n] (parallel fan-out)
 ```
 **Completion path (reverse, same fabric):**
 ```
 Memory R/W:     hbm_ctrl → noc → M_CPU.DMA → M_CPU
 Kernel Launch:  PE[0..n] all complete → M_CPU (aggregation)
 M_CPU → [transit cubes: ucie → noc → ucie] → IO_CPU → runtime_api
 ```
 ---
 ### D5. M_CPU.DMA is an internal subcomponent of M_CPU
 M_CPU.DMA is NOT a separate topology node.
 It is an internal subcomponent owned by the M_CPU component implementation.
 M_CPU.DMA:
 - owns the DMA READ and DMA WRITE queues (capacity=1 each, per ADR-0014 D4),
 - issues memory requests over the NOC to hbm_ctrl,
 - receives completion from hbm_ctrl via the NOC,
 - reports completion to M_CPU,
 - is created and managed inside M_CPU's `__init__` and `run()`.
 M_CPU.DMA does not appear as a node in the compiled topology graph.
 ---
 ### D6. Transit cube forwarding
 A cube that is not the target of a memory or kernel request acts as a transit node.
 Transit cubes forward requests without consuming them:
 ```
 ucie_in (from upstream) → noc → ucie_out (to downstream)
 ```
 The transit forwarding logic (forward vs. consume) is implemented entirely within the ucie_in component.
 The noc and ucie_out components in a transit cube only pass the packet along without modification.
 ---
 ### D7. _formula_latency is preserved as a lower-bound cross-check
 The path-based formula latency function (`_formula_latency`) is preserved in the engine
 as a lower bound for correctness verification.
 Invariant:
 - Phase 0: `_formula_latency == component model total_ns`
 - Phase 1+: `_formula_latency <= component model total_ns` (contention adds queueing)
 This function is independent of the port/wire model and requires only the topology graph.
 It is used for shard comparison in `_route_kernel` and as a regression guard.
 ---
 ## Consequences
 - Components model realistic hardware behavior (queues, contention, fan-out).
 - Propagation delay is modeled accurately per edge.
 - Engine is decoupled from routing policy.
 - Component implementations remain swappable via DI (ADR-0007 D3).
 - ADR-0007 D2 must be amended to remove path-walking from engine responsibilities.
 - ADR-0009 D3 should be updated to reference the unified fabric path (D4 above).
 ---
 ## Links
 - ADR-0007 D2 (to be amended: engine path-walking clause)
 - ADR-0009 D3 (kernel execution fan-out; fabric path to be referenced)
 - ADR-0014 D4 (DMA engine capacity=1)
 - ADR-0012 D1 (host ↔ IO_CPU message schema; M_CPU.DMA is component-internal)
@@ -0,0 +1,363 @@
 # 실무 DI 패턴: kernbench 구현으로 배우는 Dependency Injection
 ---
 ## 슬라이드 1 — 오늘 이야기할 것
 **질문:** 코드를 어떻게 설계해야 테스트하기 쉽고, 갈아끼우기 쉬울까?
 **답:** Dependency Injection (DI)
 오늘은 이론이 아니라 **실제로 돌아가는 시뮬레이터 코드**를 보면서 배웁니다.
 ```
 kernbench
 └── AI 가속기 하드웨어를 Python으로 시뮬레이션하는 프레임워크
    - 수십 개의 하드웨어 컴포넌트 (NOC, HBM, PE, CPU...)
    - 각 컴포넌트는 런타임에 교체 가능
    - 테스트에서 Mock 컴포넌트로 즉시 대체 가능
 ```
 ---
 ## 슬라이드 2 — DI가 없으면 어떤 일이 생기나
 ```python
 # ❌ DI 없는 코드
 class IoCpuComponent:
    def run(self, env, nbytes):
        router = PathRouter()        # 직접 생성 — 교체 불가
        hbm = HbmCtrlComponent()    # 직접 생성 — 교체 불가
        yield env.timeout(10.0)
 ```
 **문제:**
 - 테스트할 때 실제 `PathRouter`와 `HbmCtrl`이 항상 따라온다
 - 컴포넌트를 Mock으로 바꾸려면 **소스 코드를 수정**해야 한다
 - 다른 topology(다른 라우팅 전략)를 쓰고 싶으면 **또 수정**
 > 클래스가 자기 의존성을 스스로 만들면, 그 클래스는 의존성과 결합된다
 ---
 ## 슬라이드 3 — DI의 핵심 원칙
 **의존성은 밖에서 만들어서 안으로 넣어준다**
 ```
 ┌────────────────────────────┐
 │  조립자 (Assembler)         │  ← 누가 무엇을 쓸지 결정
 │  GraphEngine.__init__      │
 └────────────┬───────────────┘
             │ ctx 주입
             ▼
 ┌────────────────────────────┐
 │  컴포넌트 (Component)       │  ← 어떻게 동작하는지만 알면 됨
 │  IoCpuComponent            │
 │    self.ctx.router.find_path(...)  ← 그냥 사용
 └────────────────────────────┘
 ```
 **세 가지 역할 분리:**
 1. **Interface** — 무엇을 할 수 있는가 (`ComponentBase`)
 2. **Implementation** — 어떻게 하는가 (`IoCpuComponent`, `HbmCtrlComponent`, ...)
 3. **Assembler** — 무엇을 연결할 것인가 (`GraphEngine`)
 ---
 ## 슬라이드 4 — 패턴 1: Constructor Injection
 > 생성자로 의존성을 받는다
 ```python
 # kernbench/components/base.py
 class ComponentBase(ABC):
    def __init__(self, node: Node, ctx: ComponentContext | None = None):
        self.node = node
        self.ctx = ctx          # 외부에서 주입받은 의존성
        self.in_ports: dict[str, simpy.Store] = {}
        self.out_ports: dict[str, simpy.Store] = {}
 ```
 ```python
 # 사용 측 — ctx를 직접 만들지 않는다
 class IoCpuComponent(ComponentBase):
    def _dispatch(self, env, txn):
        path = self.ctx.router.find_node_path(...)   # ctx는 이미 들어와 있음
        yield self.out_ports[next_hop].put(...)
 ```
 **언제 쓰나:**
 - 컴포넌트가 살아있는 동안 의존성이 바뀌지 않을 때
 - 의존성 없이는 컴포넌트가 동작하지 않을 때 (필수 의존성)
 ---
 ## 슬라이드 5 — Context Object 패턴
 > 의존성이 많아지면 묶어서 하나로
 ```python
 # kernbench/components/context.py
@dataclass
 class ComponentContext:
    router: PathRouter              # 라우팅 정책
    resolver: AddressResolver       # 주소 해석
    positions: dict[str, ...]       # 물리적 위치 정보
    ns_per_mm: float                # 전파 지연 상수
    edge_map: dict[...]             # 엣지 정보
    spec: dict                      # 토폴로지 스펙
 ```
 **왜 Context로 묶나?**
 - 생성자 인자가 6개면 → 의존성을 추가할 때마다 모든 컴포넌트의 시그니처 변경
 - Context 하나면 → 새 필드 추가해도 기존 컴포넌트 무영향
 - 컴포넌트는 **필요한 것만 꺼내 쓴다**
 ```python
 class TwoDMeshNocComponent(ComponentBase):
    def _route(self, env, txn):
        src_pos = self.ctx.positions.get(prev_hop)   # 위치만 사용
        ns_per_mm = self.ctx.ns_per_mm               # 상수만 사용
        # router, resolver 등은 건드리지 않음
 ```
 ---
 ## 슬라이드 6 — 패턴 2: Registry + Factory
 > 문자열 키 → 클래스 매핑으로 런타임 교체
 ```python
 # kernbench/components/base.py
 class ComponentRegistry:
    _registry: dict[str, type[ComponentBase]] = {}
    @classmethod
    def register(cls, impl: str, component_cls: type[ComponentBase]):
        cls._registry[impl] = component_cls
    @classmethod
    def create(cls, node, overrides=None, ctx=None) -> ComponentBase:
        if overrides and node.impl in overrides:
            return overrides[node.impl](node, ctx)   # 1순위: 호출자 override
        if node.impl in cls._registry:
            return cls._registry[node.impl](node, ctx)  # 2순위: 등록된 구현
        return DefaultComponent(node, ctx)           # 3순위: 기본값 fallback
 ```
 **Resolution 우선순위:**
 ```
 overrides[impl]        ← 테스트/실험용 주입
  ↓ (없으면)
 _registry[impl]        ← 프로덕션 구현
  ↓ (없으면)
 DefaultComponent       ← 안전한 fallback
 ```
 ---
 ## 슬라이드 7 — Registry 등록 방식
 ```python
 # kernbench/components/impls/__init__.py
 from kernbench.components.base import ComponentRegistry
 from kernbench.components.impls.noc import TwoDMeshNocComponent
 from kernbench.components.impls.io_cpu import IoCpuComponent
 # ...
 ComponentRegistry.register("noc_2d_mesh_v1", TwoDMeshNocComponent)
 ComponentRegistry.register("io_cpu_v1",       IoCpuComponent)
 ComponentRegistry.register("hbm_ctrl_v1",     HbmCtrlComponent)
 # ...
 ```
 **topology.yaml (설정 파일)**
 ```yaml
 nodes:
  - id: sip0.cube0.noc
    impl: noc_2d_mesh_v1    # ← 이 문자열이 Registry 키
 ```
 **흐름:**
 ```
 YAML → impl 문자열 → Registry.create() → 실제 컴포넌트 인스턴스
 ```
 impl 문자열만 바꾸면 동작이 바뀐다. 코드 수정 없음.
 ---
 ## 슬라이드 8 — 패턴 3: Override Injection (테스트용)
 > 호출자가 특정 impl만 갈아끼운다
 ```python
 # tests/test_component_registry.py
 class SpyXbar(ComponentBase):
    calls = 0
    def run(self, env, nbytes):
        SpyXbar.calls += 1
        yield env.timeout(0)
 # 테스트에서 xbar_v1만 SpyXbar로 교체
 engine = GraphEngine(
    graph,
    component_overrides={"xbar_v1": SpyXbar}   # ← 이것만 추가
 )
 result = engine.run(msg)
 assert SpyXbar.calls > 0    # Xbar가 실제로 호출됐는지 검증
 ```
 **핵심:** 테스트 코드가 프로덕션 코드를 **수정하지 않는다**
 ---
 ## 슬라이드 9 — 조립자: GraphEngine
 > 컴포넌트를 생성하고 연결하는 유일한 곳
 ```python
 # kernbench/sim_engine/engine.py
 class GraphEngine:
    def __init__(self, graph, component_overrides=None):
        # 1. 공유 의존성 생성
        ctx = ComponentContext(
            router=PathRouter(graph),
            resolver=AddressResolver(graph),
            positions={nid: n.pos_mm for nid, n in graph.nodes.items()},
            ns_per_mm=...,
        )
        # 2. 컴포넌트 생성 (DI: ctx 주입)
        self._components = {
            node_id: ComponentRegistry.create(node, component_overrides, ctx)
            for node_id, node in graph.nodes.items()
        }
        # 3. 포트 연결 (배선)
        for e in graph.edges:
            store = simpy.Store(self._env)
            self._components[e.src].out_ports[e.dst] = store
            self._components[e.dst].in_ports[e.src] = store
 ```
 **생성 → 주입 → 연결** — 이 세 단계가 한 곳에서만 일어난다
 ---
 ## 슬라이드 10 — 전체 구조 한눈에 보기
 ```
 topology.yaml
    │ impl: "noc_2d_mesh_v1"
    ▼
 GraphEngine.__init__()                     ← 조립자
    │
    ├── ComponentContext 생성               ← 공유 의존성 묶음
    │     ├── PathRouter
    │     ├── AddressResolver
    │     └── positions, ns_per_mm, ...
    │
    ├── ComponentRegistry.create(node, overrides, ctx)
    │     ├── overrides["noc_2d_mesh_v1"]? → SpyNoc (테스트)
    │     ├── registry["noc_2d_mesh_v1"]?  → TwoDMeshNocComponent (프로덕션)
    │     └── fallback                     → DefaultComponent
    │
    └── 포트 배선: out_ports / in_ports 연결
 Component (TwoDMeshNocComponent)
    └── self.ctx.positions, self.ctx.ns_per_mm 사용
        (라우터, 리졸버는 건드리지 않음 — 필요한 것만)
 ```
 ---
 ## 슬라이드 11 — 무엇을 얻었나
 | 상황 | DI 없을 때 | DI 있을 때 |
 |------|---------|---------|
 | NOC 알고리즘 교체 | 소스 코드 수정 | YAML에서 impl 문자열 변경 |
 | Xbar 동작 검증 | 실제 HW 전부 구동 | `overrides={"xbar_v1": SpyXbar}` |
 | 새 컴포넌트 추가 | 기존 코드 수정 | `register("new_v1", NewComp)` |
 | 컨텍스트 필드 추가 | 모든 생성자 수정 | `ComponentContext`에 필드 추가 |
 | 테스트 격리 | 불가능 | 필요한 것만 override |
 ---
 ## 슬라이드 12 — 실무 적용 체크리스트
 **설계할 때 물어볼 것:**
 1. **이 클래스가 직접 `new`(생성)하는 것은 무엇인가?**
   → 생성하는 것 = 교체할 수 없는 것. 생성자로 받을 수 없는지 검토.
 2. **의존성이 3개 이상이면?**
   → Context Object로 묶어라.
 3. **테스트에서 이 클래스를 단독으로 실행할 수 있는가?**
   → 없다면 DI가 필요하다는 신호.
 4. **설정(YAML/config)으로 동작을 바꾸고 싶은가?**
   → Registry + 문자열 키 패턴.
 5. **누가 조립하는가?**
   → 조립자는 하나여야 한다. 컴포넌트 안에 조립 로직이 있으면 안 된다.
 ---
 ## 슬라이드 13 — 안티패턴: 이것은 하지 말자
 ```python
 # ❌ 서비스 로케이터 (컴포넌트 안에서 registry 호출)
 class BadComponent(ComponentBase):
    def run(self, env, nbytes):
        router = ComponentRegistry.get("router")  # 컴포넌트가 직접 찾는다
        ...
 # ❌ 전역 싱글톤 직접 참조
 class BadComponent(ComponentBase):
    def run(self, env, nbytes):
        router = GlobalRouter.instance()          # 교체 불가
        ...
 # ❌ 생성자 안에서 의존성 생성
 class BadComponent(ComponentBase):
    def __init__(self, node):
        self.router = PathRouter(node.graph)      # 테스트에서 격리 불가
 ```
 **공통 문제:** 컴포넌트가 자기 의존성을 스스로 해결한다 → 결합도 증가
 ---
 ## 슬라이드 14 — 요약
 > **DI = 의존성의 생성과 사용을 분리하는 것**
 ```
 생성  →  Registry / Assembler (GraphEngine)
 사용  →  Component (IoCpuComponent, TwoDMeshNocComponent, ...)
 ```
 **kernbench에서 배운 패턴 3가지:**
 1. **Constructor Injection** — 필수 의존성은 생성자로
 2. **Context Object** — 의존성 묶음을 하나의 dataclass로
 3. **Registry + Override** — 문자열 키로 구현체 선택, 테스트에서 교체
 **결과:** 141개 테스트, YAML 한 줄로 컴포넌트 교체, 프로덕션 코드 수정 없이 Mock 주입
 ---
 *참고 코드: kernbench/src/kernbench/components/*
@@ -0,0 +1,26 @@
 # Generated Diagrams
 This directory contains diagrams generated from topology compilation.
 ## What these files are
 - Derived artifacts generated from:
  - compiled topology graph
  - distance (accumulated latency) metadata
  - view/layout rules (ADR-0005)
 These files are meant for quick visual inspection and review.
 ## Default outputs
 - SIP view: `sip_view.mmd` (and/or `sip_view.dot`)
 - CUBE view: `cube_view.mmd` (and/or `cube_view.dot`)
 - PE view: `pe_view.mmd` (and/or `pe_view.dot`)
 ## How to preview
 - In VS Code:
  - open `.mmd` or `.md` containing Mermaid blocks and use Markdown Preview
  - for `.dot`, use a Graphviz preview extension or `dot -Tpng`
 ## Notes
 - Diagrams are representative and distance-aware by default.
 - Instance indices are not required unless debugging asymmetry.
 - Outputs should be deterministic for the same topology and rules.
@@ -0,0 +1,156 @@
 <svg xmlns="http://www.w3.org/2000/svg" width="556" height="472" viewBox="0 0 556 472">
  <title>cube</title>
  <rect width="556" height="472" fill="#f8fafc"/>
  <text x="278" y="18" text-anchor="middle" font-family="monospace" font-size="14" font-weight="bold" fill="#1e293b">CUBE VIEW</text>
  <rect x="40.0" y="40.0" width="476.0" height="392.0" rx="6" fill="none" stroke="#475569" stroke-width="2" stroke-dasharray="8,4"/>
  <rect x="152.0" y="166.0" width="252.0" height="140.0" rx="4" fill="#d1fae5" stroke="#10b981" stroke-width="1.5" stroke-dasharray="6,3" opacity="0.5"/>
  <text x="278.0" y="278.0" text-anchor="middle" font-family="monospace" font-size="11" fill="#047857" opacity="0.7">HBM</text>
  <polyline points="82.0,82.0 82.0,95.0 82.0,95.0 82.0,138.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
  <text x="82.0" y="92.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
  <polyline points="82.0,82.0 82.0,144.0 334.0,144.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
  <polyline points="334.0,236.0 334.0,144.0 82.0,144.0 82.0,82.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
  <polyline points="166.0,82.0 166.0,95.0 166.0,95.0 166.0,138.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
  <text x="166.0" y="92.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
  <polyline points="166.0,82.0 166.0,154.0 334.0,154.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
  <polyline points="334.0,236.0 334.0,144.0 166.0,144.0 166.0,82.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
  <polyline points="390.0,82.0 390.0,95.0 390.0,95.0 390.0,138.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
  <text x="390.0" y="92.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
  <polyline points="390.0,82.0 390.0,164.0 334.0,164.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
  <polyline points="334.0,236.0 334.0,144.0 390.0,144.0 390.0,82.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
  <polyline points="474.0,82.0 474.0,95.0 474.0,95.0 474.0,138.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
  <text x="474.0" y="92.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
  <polyline points="474.0,82.0 474.0,174.0 334.0,174.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
  <polyline points="334.0,236.0 334.0,144.0 474.0,144.0 474.0,82.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
  <polyline points="82.0,390.0 82.0,347.0 82.0,347.0 82.0,334.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
  <text x="82.0" y="344.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
  <polyline points="82.0,390.0 82.0,338.0 334.0,338.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
  <polyline points="334.0,236.0 334.0,298.0 82.0,298.0 82.0,390.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
  <polyline points="166.0,390.0 166.0,347.0 166.0,347.0 166.0,334.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
  <text x="166.0" y="344.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
  <polyline points="166.0,390.0 166.0,348.0 334.0,348.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
  <polyline points="334.0,236.0 334.0,298.0 166.0,298.0 166.0,390.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
  <polyline points="390.0,390.0 390.0,347.0 390.0,347.0 390.0,334.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
  <text x="390.0" y="344.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
  <polyline points="390.0,390.0 390.0,358.0 334.0,358.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
  <polyline points="334.0,236.0 334.0,298.0 390.0,298.0 390.0,390.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
  <polyline points="474.0,390.0 474.0,347.0 474.0,347.0 474.0,334.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
  <text x="474.0" y="344.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
  <polyline points="474.0,390.0 474.0,368.0 334.0,368.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
  <polyline points="334.0,236.0 334.0,298.0 474.0,298.0 474.0,390.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
  <polyline points="82.0,138.0 222.0,138.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
  <text x="152.0" y="183.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
  <polyline points="166.0,138.0 222.0,138.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
  <text x="194.0" y="183.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
  <polyline points="390.0,138.0 222.0,138.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
  <text x="306.0" y="183.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
  <polyline points="474.0,138.0 222.0,138.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
  <text x="348.0" y="183.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
  <polyline points="82.0,334.0 222.0,334.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
  <text x="152.0" y="281.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
  <polyline points="166.0,334.0 222.0,334.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
  <text x="194.0" y="281.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
  <polyline points="390.0,334.0 222.0,334.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
  <text x="306.0" y="281.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
  <polyline points="474.0,334.0 222.0,334.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
  <text x="348.0" y="281.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
  <line x1="82.0" y1="138.0" x2="166.0" y2="138.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
  <text x="124.0" y="134.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
  <line x1="166.0" y1="138.0" x2="82.0" y2="138.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
  <text x="124.0" y="134.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
  <line x1="166.0" y1="138.0" x2="390.0" y2="138.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
  <text x="278.0" y="134.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">10.0mm 128GB/s</text>
  <line x1="390.0" y1="138.0" x2="166.0" y2="138.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
  <text x="278.0" y="134.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">10.0mm 128GB/s</text>
  <line x1="390.0" y1="138.0" x2="474.0" y2="138.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
  <text x="432.0" y="134.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
  <line x1="474.0" y1="138.0" x2="390.0" y2="138.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
  <text x="432.0" y="134.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
  <line x1="82.0" y1="334.0" x2="166.0" y2="334.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
  <text x="124.0" y="330.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
  <line x1="166.0" y1="334.0" x2="82.0" y2="334.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
  <text x="124.0" y="330.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
  <line x1="166.0" y1="334.0" x2="390.0" y2="334.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
  <text x="278.0" y="330.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">10.0mm 128GB/s</text>
  <line x1="390.0" y1="334.0" x2="166.0" y2="334.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
  <text x="278.0" y="330.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">10.0mm 128GB/s</text>
  <line x1="390.0" y1="334.0" x2="474.0" y2="334.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
  <text x="432.0" y="330.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
  <line x1="474.0" y1="334.0" x2="390.0" y2="334.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
  <text x="432.0" y="330.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
  <polyline points="82.0,138.0 110.0,138.0 110.0,292.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
  <text x="96.0" y="211.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
  <polyline points="110.0,292.0 82.0,292.0 82.0,138.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
  <text x="96.0" y="211.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
  <polyline points="82.0,334.0 110.0,334.0 110.0,292.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
  <text x="96.0" y="309.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
  <polyline points="110.0,292.0 82.0,292.0 82.0,334.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
  <text x="96.0" y="309.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
  <polyline points="474.0,138.0 446.0,138.0 446.0,292.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
  <text x="460.0" y="211.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
  <polyline points="446.0,292.0 474.0,292.0 474.0,138.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
  <text x="460.0" y="211.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
  <polyline points="474.0,334.0 446.0,334.0 446.0,292.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
  <text x="460.0" y="309.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
  <polyline points="446.0,292.0 474.0,292.0 474.0,334.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
  <text x="460.0" y="309.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
  <polyline points="334.0,236.0 334.0,131.4 278.0,131.4 278.0,56.8" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.6"/>
  <polyline points="334.0,236.0 334.0,310.6 278.0,310.6 278.0,415.2" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.6"/>
  <polyline points="334.0,236.0 334.0,221.0 488.0,221.0 488.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.6"/>
  <polyline points="334.0,236.0 334.0,221.0 68.0,221.0 68.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.6"/>
  <polyline points="446.0,194.0 446.0,200.0 334.0,200.0 334.0,236.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
  <polyline points="334.0,236.0 334.0,200.0 446.0,200.0 446.0,194.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
  <polyline points="334.0,236.0 110.0,236.0 110.0,194.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.8"/>
  <polyline points="110.0,194.0 334.0,194.0 334.0,236.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.8"/>
  <rect x="250.0" y="40.0" width="56.0" height="33.6" rx="4" fill="#3b82f6" stroke="#475569" stroke-width="1"/>
  <text x="278.0" y="60.8" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">UCIe-N</text>
  <rect x="250.0" y="398.4" width="56.0" height="33.6" rx="4" fill="#3b82f6" stroke="#475569" stroke-width="1"/>
  <text x="278.0" y="419.2" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">UCIe-S</text>
  <rect x="460.0" y="219.2" width="56.0" height="33.6" rx="4" fill="#3b82f6" stroke="#475569" stroke-width="1"/>
  <text x="488.0" y="240.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">UCIe-E</text>
  <rect x="40.0" y="219.2" width="56.0" height="33.6" rx="4" fill="#3b82f6" stroke="#475569" stroke-width="1"/>
  <text x="68.0" y="240.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">UCIe-W</text>
  <rect x="306.0" y="219.2" width="56.0" height="33.6" rx="4" fill="#a78bfa" stroke="#475569" stroke-width="1"/>
  <text x="334.0" y="240.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">NOC</text>
  <rect x="418.0" y="177.2" width="56.0" height="33.6" rx="4" fill="#f59e0b" stroke="#475569" stroke-width="1"/>
  <text x="446.0" y="198.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">M CPU</text>
  <rect x="194.0" y="219.2" width="56.0" height="33.6" rx="4" fill="#10b981" stroke="#475569" stroke-width="1"/>
  <text x="222.0" y="240.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#ffffff">HBM CTRL</text>
  <rect x="82.0" y="177.2" width="56.0" height="33.6" rx="4" fill="#f59e0b" stroke="#475569" stroke-width="1"/>
  <text x="110.0" y="198.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">SRAM</text>
  <rect x="82.0" y="275.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
  <text x="110.0" y="296.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">Bridge LEFT</text>
  <rect x="418.0" y="275.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
  <text x="446.0" y="296.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">Bridge RIGHT</text>
  <rect x="56.8" y="68.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
  <text x="82.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE0</text>
  <rect x="54.0" y="121.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
  <text x="82.0" y="142.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE0</text>
  <rect x="140.8" y="68.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
  <text x="166.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE1</text>
  <rect x="138.0" y="121.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
  <text x="166.0" y="142.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE1</text>
  <rect x="364.8" y="68.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
  <text x="390.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE2</text>
  <rect x="362.0" y="121.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
  <text x="390.0" y="142.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE2</text>
  <rect x="448.8" y="68.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
  <text x="474.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE3</text>
  <rect x="446.0" y="121.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
  <text x="474.0" y="142.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE3</text>
  <rect x="56.8" y="376.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
  <text x="82.0" y="394.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE4</text>
  <rect x="54.0" y="317.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
  <text x="82.0" y="338.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE4</text>
  <rect x="140.8" y="376.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
  <text x="166.0" y="394.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE5</text>
  <rect x="138.0" y="317.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
  <text x="166.0" y="338.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE5</text>
  <rect x="364.8" y="376.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
  <text x="390.0" y="394.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE6</text>
  <rect x="362.0" y="317.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
  <text x="390.0" y="338.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE6</text>
  <rect x="448.8" y="376.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
  <text x="474.0" y="394.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE7</text>
  <rect x="446.0" y="317.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
  <text x="474.0" y="338.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE7</text>
 </svg>
@@ -0,0 +1,31 @@
 <svg xmlns="http://www.w3.org/2000/svg" width="500" height="360" viewBox="0 0 500 360">
  <title>pe</title>
  <rect width="500" height="360" fill="#f8fafc"/>
  <text x="250" y="18" text-anchor="middle" font-family="monospace" font-size="14" font-weight="bold" fill="#1e293b">PE VIEW</text>
  <line x1="92.5" y1="180.0" x2="180.0" y2="180.0" stroke="#94a3b8" stroke-width="1.5" opacity="0.8"/>
  <text x="136.2" y="176.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">0.5mm</text>
  <polyline points="180.0,180.0 180.0,92.5 285.0,92.5" fill="none" stroke="#94a3b8" stroke-width="1.5" opacity="0.8"/>
  <text x="232.5" y="132.2" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">0.5mm</text>
  <line x1="180.0" y1="180.0" x2="285.0" y2="180.0" stroke="#94a3b8" stroke-width="1.5" opacity="0.8"/>
  <text x="232.5" y="176.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">0.5mm</text>
  <polyline points="180.0,180.0 180.0,267.5 285.0,267.5" fill="none" stroke="#94a3b8" stroke-width="1.5" opacity="0.8"/>
  <text x="232.5" y="219.8" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">0.5mm</text>
  <polyline points="285.0,92.5 390.0,92.5 390.0,180.0" fill="none" stroke="#94a3b8" stroke-width="1.5" opacity="0.8"/>
  <text x="337.5" y="132.2" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">0.5mm 512GB/s</text>
  <line x1="285.0" y1="180.0" x2="390.0" y2="180.0" stroke="#94a3b8" stroke-width="1.5" opacity="0.8"/>
  <text x="337.5" y="176.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">0.5mm 512GB/s</text>
  <polyline points="285.0,267.5 390.0,267.5 390.0,180.0" fill="none" stroke="#94a3b8" stroke-width="1.5" opacity="0.8"/>
  <text x="337.5" y="219.8" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">0.5mm 512GB/s</text>
  <rect x="48.8" y="155.5" width="87.5" height="49.0" rx="4" fill="#ef4444" stroke="#475569" stroke-width="1"/>
  <text x="92.5" y="184.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">PE CPU</text>
  <rect x="136.2" y="155.5" width="87.5" height="49.0" rx="4" fill="#f59e0b" stroke="#475569" stroke-width="1"/>
  <text x="180.0" y="184.0" text-anchor="middle" font-family="monospace" font-size="9" fill="#1e293b">PE SCHEDULER</text>
  <rect x="241.2" y="68.0" width="87.5" height="49.0" rx="4" fill="#3b82f6" stroke="#475569" stroke-width="1"/>
  <text x="285.0" y="96.5" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">PE DMA</text>
  <rect x="241.2" y="155.5" width="87.5" height="49.0" rx="4" fill="#8b5cf6" stroke="#475569" stroke-width="1"/>
  <text x="285.0" y="184.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">PE GEMM</text>
  <rect x="241.2" y="243.0" width="87.5" height="49.0" rx="4" fill="#ec4899" stroke="#475569" stroke-width="1"/>
  <text x="285.0" y="271.5" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">PE MATH</text>
  <rect x="346.2" y="155.5" width="87.5" height="49.0" rx="4" fill="#10b981" stroke="#475569" stroke-width="1"/>
  <text x="390.0" y="184.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">PE TCM</text>
 </svg>
@@ -0,0 +1,72 @@
 <svg xmlns="http://www.w3.org/2000/svg" width="820" height="500" viewBox="0 0 820 500" font-family="monospace">
 <rect width="820" height="500" fill="#f8fafc" rx="6"/>
 <text x="410" y="32" text-anchor="middle" font-size="16" font-weight="bold" fill="#1e293b">Placement: column_wise</text>
 <text x="410.0" y="54.0" text-anchor="middle" font-size="12" fill="#475569" font-weight="normal">Tensor (1024×512) fp16  →  K axis split into 8 parts</text>
 <text x="320.0" y="82.0" text-anchor="middle" font-size="11" fill="#475569" font-weight="normal">← K=512 →</text>
 <text x="68.0" y="250.0" text-anchor="middle" font-size="11" fill="#475569" transform="rotate(-90 68.0 250.0)">↑ M=1024 ↓</text>
 <rect x="80.0" y="90.0" width="60.0" height="320.0" fill="#3b82f6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="110.0" y="246.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE0</text>
 <text x="110.0" y="262.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(1024×64)</text>
 <rect x="140.0" y="90.0" width="60.0" height="320.0" fill="#10b981" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="170.0" y="246.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE1</text>
 <text x="170.0" y="262.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(1024×64)</text>
 <rect x="200.0" y="90.0" width="60.0" height="320.0" fill="#f59e0b" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="230.0" y="246.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE2</text>
 <text x="230.0" y="262.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">(1024×64)</text>
 <rect x="260.0" y="90.0" width="60.0" height="320.0" fill="#ef4444" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="290.0" y="246.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE3</text>
 <text x="290.0" y="262.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(1024×64)</text>
 <rect x="320.0" y="90.0" width="60.0" height="320.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="350.0" y="246.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE4</text>
 <text x="350.0" y="262.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(1024×64)</text>
 <rect x="380.0" y="90.0" width="60.0" height="320.0" fill="#ec4899" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="410.0" y="246.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE5</text>
 <text x="410.0" y="262.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(1024×64)</text>
 <rect x="440.0" y="90.0" width="60.0" height="320.0" fill="#06b6d4" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="470.0" y="246.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE6</text>
 <text x="470.0" y="262.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">(1024×64)</text>
 <rect x="500.0" y="90.0" width="60.0" height="320.0" fill="#f97316" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="530.0" y="246.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE7</text>
 <text x="530.0" y="262.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(1024×64)</text>
 <rect x="80.0" y="90.0" width="480.0" height="320.0" fill="none" stroke="#1e293b" stroke-width="2" fill-opacity="1.0" rx="2"/>
 <text x="110.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=0 B</text>
 <text x="110.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
 <text x="170.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=128 KB</text>
 <text x="170.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
 <text x="230.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=256 KB</text>
 <text x="230.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
 <text x="290.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=384 KB</text>
 <text x="290.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
 <text x="350.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=512 KB</text>
 <text x="350.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
 <text x="410.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=640 KB</text>
 <text x="410.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
 <text x="470.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=768 KB</text>
 <text x="470.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
 <text x="530.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=896 KB</text>
 <text x="530.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
 <text x="670.0" y="100.0" text-anchor="middle" font-size="12" fill="#1e293b" font-weight="bold">PE Legend</text>
 <rect x="620.0" y="106.0" width="16.0" height="16.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="642.0" y="118.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE0</text>
 <rect x="620.0" y="128.0" width="16.0" height="16.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="642.0" y="140.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE1</text>
 <rect x="620.0" y="150.0" width="16.0" height="16.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="642.0" y="162.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE2</text>
 <rect x="620.0" y="172.0" width="16.0" height="16.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="642.0" y="184.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE3</text>
 <rect x="620.0" y="194.0" width="16.0" height="16.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="642.0" y="206.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE4</text>
 <rect x="620.0" y="216.0" width="16.0" height="16.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="642.0" y="228.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE5</text>
 <rect x="620.0" y="238.0" width="16.0" height="16.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="642.0" y="250.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE6</text>
 <rect x="620.0" y="260.0" width="16.0" height="16.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="642.0" y="272.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE7</text>
 <rect x="620.0" y="320.0" width="167.0" height="120.0" fill="#e2e8f0" stroke="#94a3b8" stroke-width="1" fill-opacity="1.0" rx="2"/>
 <text x="630.0" y="338.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Strategy: column_wise</text>
 <text x="630.0" y="356.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Split axis: K</text>
 <text x="630.0" y="374.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Shards: 8</text>
 <text x="630.0" y="392.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Each: (1024, 64)</text>
 <text x="630.0" y="410.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Each: 128 KB</text>
 <text x="630.0" y="428.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Total: 1 MB</text>
 </svg>
@@ -0,0 +1,47 @@
 <svg xmlns="http://www.w3.org/2000/svg" width="820" height="500" viewBox="0 0 820 500" font-family="monospace">
 <rect width="820" height="500" fill="#f8fafc" rx="6"/>
 <text x="410" y="32" text-anchor="middle" font-size="16" font-weight="bold" fill="#1e293b">Placement: replicate</text>
 <text x="410.0" y="54.0" text-anchor="middle" font-size="12" fill="#475569" font-weight="normal">Tensor (1024×512) fp16  →  full copy to each PE</text>
 <rect x="60.0" y="90.0" width="163.0" height="162.0" fill="#3b82f6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="141.5" y="157.0" text-anchor="middle" font-size="14" fill="#fff" font-weight="bold">PE0</text>
 <text x="141.5" y="177.0" text-anchor="middle" font-size="11" fill="#fff" font-weight="normal">(1024×512)</text>
 <text x="141.5" y="193.0" text-anchor="middle" font-size="10" fill="#fff" font-weight="normal">1 MB</text>
 <text x="141.5" y="207.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">offset=0</text>
 <rect x="239.0" y="90.0" width="163.0" height="162.0" fill="#10b981" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="320.5" y="157.0" text-anchor="middle" font-size="14" fill="#fff" font-weight="bold">PE1</text>
 <text x="320.5" y="177.0" text-anchor="middle" font-size="11" fill="#fff" font-weight="normal">(1024×512)</text>
 <text x="320.5" y="193.0" text-anchor="middle" font-size="10" fill="#fff" font-weight="normal">1 MB</text>
 <text x="320.5" y="207.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">offset=0</text>
 <rect x="418.0" y="90.0" width="163.0" height="162.0" fill="#f59e0b" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="499.5" y="157.0" text-anchor="middle" font-size="14" fill="#000" font-weight="bold">PE2</text>
 <text x="499.5" y="177.0" text-anchor="middle" font-size="11" fill="#000" font-weight="normal">(1024×512)</text>
 <text x="499.5" y="193.0" text-anchor="middle" font-size="10" fill="#000" font-weight="normal">1 MB</text>
 <text x="499.5" y="207.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">offset=0</text>
 <rect x="597.0" y="90.0" width="163.0" height="162.0" fill="#ef4444" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="678.5" y="157.0" text-anchor="middle" font-size="14" fill="#fff" font-weight="bold">PE3</text>
 <text x="678.5" y="177.0" text-anchor="middle" font-size="11" fill="#fff" font-weight="normal">(1024×512)</text>
 <text x="678.5" y="193.0" text-anchor="middle" font-size="10" fill="#fff" font-weight="normal">1 MB</text>
 <text x="678.5" y="207.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">offset=0</text>
 <rect x="60.0" y="268.0" width="163.0" height="162.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="141.5" y="335.0" text-anchor="middle" font-size="14" fill="#fff" font-weight="bold">PE4</text>
 <text x="141.5" y="355.0" text-anchor="middle" font-size="11" fill="#fff" font-weight="normal">(1024×512)</text>
 <text x="141.5" y="371.0" text-anchor="middle" font-size="10" fill="#fff" font-weight="normal">1 MB</text>
 <text x="141.5" y="385.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">offset=0</text>
 <rect x="239.0" y="268.0" width="163.0" height="162.0" fill="#ec4899" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="320.5" y="335.0" text-anchor="middle" font-size="14" fill="#fff" font-weight="bold">PE5</text>
 <text x="320.5" y="355.0" text-anchor="middle" font-size="11" fill="#fff" font-weight="normal">(1024×512)</text>
 <text x="320.5" y="371.0" text-anchor="middle" font-size="10" fill="#fff" font-weight="normal">1 MB</text>
 <text x="320.5" y="385.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">offset=0</text>
 <rect x="418.0" y="268.0" width="163.0" height="162.0" fill="#06b6d4" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="499.5" y="335.0" text-anchor="middle" font-size="14" fill="#000" font-weight="bold">PE6</text>
 <text x="499.5" y="355.0" text-anchor="middle" font-size="11" fill="#000" font-weight="normal">(1024×512)</text>
 <text x="499.5" y="371.0" text-anchor="middle" font-size="10" fill="#000" font-weight="normal">1 MB</text>
 <text x="499.5" y="385.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">offset=0</text>
 <rect x="597.0" y="268.0" width="163.0" height="162.0" fill="#f97316" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="678.5" y="335.0" text-anchor="middle" font-size="14" fill="#fff" font-weight="bold">PE7</text>
 <text x="678.5" y="355.0" text-anchor="middle" font-size="11" fill="#fff" font-weight="normal">(1024×512)</text>
 <text x="678.5" y="371.0" text-anchor="middle" font-size="10" fill="#fff" font-weight="normal">1 MB</text>
 <text x="678.5" y="385.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">offset=0</text>
 <rect x="60.0" y="450.0" width="496.0" height="30.0" fill="#e2e8f0" stroke="#94a3b8" stroke-width="1" fill-opacity="1.0" rx="2"/>
 <text x="70.0" y="468.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Strategy: replicate  |  Shards: 8  |  Each: 1 MB  |  Total mem: 8 MB</text>
 </svg>
@@ -0,0 +1,72 @@
 <svg xmlns="http://www.w3.org/2000/svg" width="820" height="560" viewBox="0 0 820 560" font-family="monospace">
 <rect width="820" height="560" fill="#f8fafc" rx="6"/>
 <text x="410" y="32" text-anchor="middle" font-size="16" font-weight="bold" fill="#1e293b">Placement: row_wise</text>
 <text x="410.0" y="54.0" text-anchor="middle" font-size="12" fill="#475569" font-weight="normal">Tensor (1024×512) fp16  →  M axis split into 8 parts</text>
 <text x="240.0" y="82.0" text-anchor="middle" font-size="11" fill="#475569" font-weight="normal">← K=512 →</text>
 <text x="68.0" y="290.0" text-anchor="middle" font-size="11" fill="#475569" transform="rotate(-90 68.0 290.0)">↑ M=1024 ↓</text>
 <rect x="80.0" y="90.0" width="320.0" height="50.0" fill="#3b82f6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="240.0" y="111.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE0</text>
 <text x="240.0" y="127.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(128×512)</text>
 <rect x="80.0" y="140.0" width="320.0" height="50.0" fill="#10b981" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="240.0" y="161.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE1</text>
 <text x="240.0" y="177.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(128×512)</text>
 <rect x="80.0" y="190.0" width="320.0" height="50.0" fill="#f59e0b" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="240.0" y="211.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE2</text>
 <text x="240.0" y="227.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">(128×512)</text>
 <rect x="80.0" y="240.0" width="320.0" height="50.0" fill="#ef4444" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="240.0" y="261.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE3</text>
 <text x="240.0" y="277.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(128×512)</text>
 <rect x="80.0" y="290.0" width="320.0" height="50.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="240.0" y="311.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE4</text>
 <text x="240.0" y="327.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(128×512)</text>
 <rect x="80.0" y="340.0" width="320.0" height="50.0" fill="#ec4899" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="240.0" y="361.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE5</text>
 <text x="240.0" y="377.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(128×512)</text>
 <rect x="80.0" y="390.0" width="320.0" height="50.0" fill="#06b6d4" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="240.0" y="411.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE6</text>
 <text x="240.0" y="427.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">(128×512)</text>
 <rect x="80.0" y="440.0" width="320.0" height="50.0" fill="#f97316" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="240.0" y="461.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE7</text>
 <text x="240.0" y="477.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(128×512)</text>
 <rect x="80.0" y="90.0" width="320.0" height="400.0" fill="none" stroke="#1e293b" stroke-width="2" fill-opacity="1.0" rx="2"/>
 <text x="410.0" y="111.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=0 B</text>
 <text x="410.0" y="125.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
 <text x="410.0" y="161.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=128 KB</text>
 <text x="410.0" y="175.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
 <text x="410.0" y="211.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=256 KB</text>
 <text x="410.0" y="225.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
 <text x="410.0" y="261.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=384 KB</text>
 <text x="410.0" y="275.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
 <text x="410.0" y="311.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=512 KB</text>
 <text x="410.0" y="325.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
 <text x="410.0" y="361.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=640 KB</text>
 <text x="410.0" y="375.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
 <text x="410.0" y="411.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=768 KB</text>
 <text x="410.0" y="425.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
 <text x="410.0" y="461.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=896 KB</text>
 <text x="410.0" y="475.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
 <text x="630.0" y="100.0" text-anchor="middle" font-size="12" fill="#1e293b" font-weight="bold">PE Legend</text>
 <rect x="580.0" y="106.0" width="16.0" height="16.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="602.0" y="118.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE0</text>
 <rect x="580.0" y="128.0" width="16.0" height="16.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="602.0" y="140.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE1</text>
 <rect x="580.0" y="150.0" width="16.0" height="16.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="602.0" y="162.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE2</text>
 <rect x="580.0" y="172.0" width="16.0" height="16.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="602.0" y="184.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE3</text>
 <rect x="580.0" y="194.0" width="16.0" height="16.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="602.0" y="206.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE4</text>
 <rect x="580.0" y="216.0" width="16.0" height="16.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="602.0" y="228.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE5</text>
 <rect x="580.0" y="238.0" width="16.0" height="16.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="602.0" y="250.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE6</text>
 <rect x="580.0" y="260.0" width="16.0" height="16.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="602.0" y="272.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE7</text>
 <rect x="580.0" y="320.0" width="146.0" height="120.0" fill="#e2e8f0" stroke="#94a3b8" stroke-width="1" fill-opacity="1.0" rx="2"/>
 <text x="590.0" y="338.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Strategy: row_wise</text>
 <text x="590.0" y="356.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Split axis: M</text>
 <text x="590.0" y="374.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Shards: 8</text>
 <text x="590.0" y="392.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Each: (128, 512)</text>
 <text x="590.0" y="410.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Each: 128 KB</text>
 <text x="590.0" y="428.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Total: 1 MB</text>
 </svg>
@@ -0,0 +1,116 @@
 <svg xmlns="http://www.w3.org/2000/svg" width="820" height="620" viewBox="0 0 820 620" font-family="monospace">
 <rect width="820" height="620" fill="#f8fafc" rx="6"/>
 <text x="410" y="32" text-anchor="middle" font-size="16" font-weight="bold" fill="#1e293b">Placement: tiled_column_major</text>
 <text x="410.0" y="54.0" text-anchor="middle" font-size="11" fill="#475569" font-weight="normal">Tensor (1024×512) fp16, tile=(256×128)  →  4×4=16 tiles, column-major (K first)</text>
 <text x="280.0" y="82.0" text-anchor="middle" font-size="11" fill="#475569" font-weight="normal">← K=512 →</text>
 <text x="68.0" y="290.0" text-anchor="middle" font-size="11" fill="#475569" transform="rotate(-90 68.0 290.0)">↑ M=1024 ↓</text>
 <rect x="80.0" y="90.0" width="100.0" height="100.0" fill="#3b82f6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="130.0" y="136.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE0</text>
 <text x="130.0" y="152.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t0</text>
 <rect x="180.0" y="90.0" width="100.0" height="100.0" fill="#10b981" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="230.0" y="136.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE1</text>
 <text x="230.0" y="152.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t1</text>
 <rect x="280.0" y="90.0" width="100.0" height="100.0" fill="#f59e0b" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="330.0" y="136.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE2</text>
 <text x="330.0" y="152.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t2</text>
 <rect x="380.0" y="90.0" width="100.0" height="100.0" fill="#ef4444" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="430.0" y="136.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE3</text>
 <text x="430.0" y="152.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t3</text>
 <rect x="80.0" y="190.0" width="100.0" height="100.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="130.0" y="236.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE4</text>
 <text x="130.0" y="252.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t4</text>
 <rect x="180.0" y="190.0" width="100.0" height="100.0" fill="#ec4899" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="230.0" y="236.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE5</text>
 <text x="230.0" y="252.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t5</text>
 <rect x="280.0" y="190.0" width="100.0" height="100.0" fill="#06b6d4" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="330.0" y="236.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE6</text>
 <text x="330.0" y="252.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t6</text>
 <rect x="380.0" y="190.0" width="100.0" height="100.0" fill="#f97316" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="430.0" y="236.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE7</text>
 <text x="430.0" y="252.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t7</text>
 <rect x="80.0" y="290.0" width="100.0" height="100.0" fill="#3b82f6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="130.0" y="336.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE0</text>
 <text x="130.0" y="352.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t8</text>
 <rect x="180.0" y="290.0" width="100.0" height="100.0" fill="#10b981" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="230.0" y="336.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE1</text>
 <text x="230.0" y="352.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t9</text>
 <rect x="280.0" y="290.0" width="100.0" height="100.0" fill="#f59e0b" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="330.0" y="336.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE2</text>
 <text x="330.0" y="352.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t10</text>
 <rect x="380.0" y="290.0" width="100.0" height="100.0" fill="#ef4444" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="430.0" y="336.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE3</text>
 <text x="430.0" y="352.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t11</text>
 <rect x="80.0" y="390.0" width="100.0" height="100.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="130.0" y="436.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE4</text>
 <text x="130.0" y="452.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t12</text>
 <rect x="180.0" y="390.0" width="100.0" height="100.0" fill="#ec4899" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="230.0" y="436.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE5</text>
 <text x="230.0" y="452.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t13</text>
 <rect x="280.0" y="390.0" width="100.0" height="100.0" fill="#06b6d4" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="330.0" y="436.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE6</text>
 <text x="330.0" y="452.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t14</text>
 <rect x="380.0" y="390.0" width="100.0" height="100.0" fill="#f97316" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="430.0" y="436.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE7</text>
 <text x="430.0" y="452.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t15</text>
 <rect x="80.0" y="90.0" width="400.0" height="400.0" fill="none" stroke="#1e293b" stroke-width="2" fill-opacity="1.0" rx="2"/>
 <text x="130.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=0..127</text>
 <text x="230.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=128..255</text>
 <text x="330.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=256..383</text>
 <text x="430.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=384..511</text>
 <text x="64.0" y="140.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=0..255</text>
 <text x="64.0" y="240.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=256..511</text>
 <text x="64.0" y="340.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=512..767</text>
 <text x="64.0" y="440.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=768..1023</text>
 <text x="590.0" y="90.0" text-anchor="middle" font-size="12" fill="#1e293b" font-weight="bold">PE Legend</text>
 <rect x="540.0" y="96.0" width="16.0" height="16.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="562.0" y="108.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE0</text>
 <rect x="540.0" y="118.0" width="16.0" height="16.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="562.0" y="130.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE1</text>
 <rect x="540.0" y="140.0" width="16.0" height="16.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="562.0" y="152.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE2</text>
 <rect x="540.0" y="162.0" width="16.0" height="16.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="562.0" y="174.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE3</text>
 <rect x="540.0" y="184.0" width="16.0" height="16.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="562.0" y="196.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE4</text>
 <rect x="540.0" y="206.0" width="16.0" height="16.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="562.0" y="218.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE5</text>
 <rect x="540.0" y="228.0" width="16.0" height="16.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="562.0" y="240.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE6</text>
 <rect x="540.0" y="250.0" width="16.0" height="16.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="562.0" y="262.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE7</text>
 <text x="540.0" y="310.0" text-anchor="start" font-size="12" fill="#1e293b" font-weight="bold">Tile Assignment Order</text>
 <rect x="540.0" y="318.0" width="12.0" height="12.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="328.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 0 → PE0  (0,0)  off=0 B</text>
 <rect x="540.0" y="334.0" width="12.0" height="12.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="344.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 1 → PE1  (0,1)  off=256 B</text>
 <rect x="540.0" y="350.0" width="12.0" height="12.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="360.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 2 → PE2  (0,2)  off=512 B</text>
 <rect x="540.0" y="366.0" width="12.0" height="12.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="376.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 3 → PE3  (0,3)  off=768 B</text>
 <rect x="540.0" y="382.0" width="12.0" height="12.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="392.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 4 → PE4  (1,0)  off=256 KB</text>
 <rect x="540.0" y="398.0" width="12.0" height="12.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="408.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 5 → PE5  (1,1)  off=256 KB</text>
 <rect x="540.0" y="414.0" width="12.0" height="12.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="424.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 6 → PE6  (1,2)  off=256 KB</text>
 <rect x="540.0" y="430.0" width="12.0" height="12.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="440.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 7 → PE7  (1,3)  off=256 KB</text>
 <rect x="540.0" y="446.0" width="12.0" height="12.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="456.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 8 → PE0  (2,0)  off=512 KB</text>
 <rect x="540.0" y="462.0" width="12.0" height="12.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="472.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 9 → PE1  (2,1)  off=512 KB</text>
 <rect x="540.0" y="478.0" width="12.0" height="12.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="488.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t10 → PE2  (2,2)  off=512 KB</text>
 <rect x="540.0" y="494.0" width="12.0" height="12.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="504.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t11 → PE3  (2,3)  off=512 KB</text>
 <rect x="540.0" y="510.0" width="12.0" height="12.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="520.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t12 → PE4  (3,0)  off=768 KB</text>
 <rect x="540.0" y="526.0" width="12.0" height="12.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="536.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t13 → PE5  (3,1)  off=768 KB</text>
 <rect x="540.0" y="542.0" width="12.0" height="12.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="552.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t14 → PE6  (3,2)  off=768 KB</text>
 <rect x="540.0" y="558.0" width="12.0" height="12.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="568.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t15 → PE7  (3,3)  off=768 KB</text>
 <rect x="80.0" y="580.0" width="608.0" height="30.0" fill="#e2e8f0" stroke="#94a3b8" stroke-width="1" fill-opacity="1.0" rx="2"/>
 <text x="90.0" y="598.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Strategy: tiled_column_major  |  Tile: (256×128)=64 KB  |  Tiles: 16  |  Total: 1 MB</text>
 </svg>
@@ -0,0 +1,116 @@
 <svg xmlns="http://www.w3.org/2000/svg" width="820" height="620" viewBox="0 0 820 620" font-family="monospace">
 <rect width="820" height="620" fill="#f8fafc" rx="6"/>
 <text x="410" y="32" text-anchor="middle" font-size="16" font-weight="bold" fill="#1e293b">Placement: tiled_row_major</text>
 <text x="410.0" y="54.0" text-anchor="middle" font-size="11" fill="#475569" font-weight="normal">Tensor (1024×512) fp16, tile=(256×128)  →  4×4=16 tiles, row-major (M first)</text>
 <text x="280.0" y="82.0" text-anchor="middle" font-size="11" fill="#475569" font-weight="normal">← K=512 →</text>
 <text x="68.0" y="290.0" text-anchor="middle" font-size="11" fill="#475569" transform="rotate(-90 68.0 290.0)">↑ M=1024 ↓</text>
 <rect x="80.0" y="90.0" width="100.0" height="100.0" fill="#3b82f6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="130.0" y="136.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE0</text>
 <text x="130.0" y="152.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t0</text>
 <rect x="80.0" y="190.0" width="100.0" height="100.0" fill="#10b981" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="130.0" y="236.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE1</text>
 <text x="130.0" y="252.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t1</text>
 <rect x="80.0" y="290.0" width="100.0" height="100.0" fill="#f59e0b" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="130.0" y="336.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE2</text>
 <text x="130.0" y="352.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t2</text>
 <rect x="80.0" y="390.0" width="100.0" height="100.0" fill="#ef4444" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="130.0" y="436.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE3</text>
 <text x="130.0" y="452.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t3</text>
 <rect x="180.0" y="90.0" width="100.0" height="100.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="230.0" y="136.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE4</text>
 <text x="230.0" y="152.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t4</text>
 <rect x="180.0" y="190.0" width="100.0" height="100.0" fill="#ec4899" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="230.0" y="236.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE5</text>
 <text x="230.0" y="252.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t5</text>
 <rect x="180.0" y="290.0" width="100.0" height="100.0" fill="#06b6d4" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="230.0" y="336.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE6</text>
 <text x="230.0" y="352.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t6</text>
 <rect x="180.0" y="390.0" width="100.0" height="100.0" fill="#f97316" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="230.0" y="436.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE7</text>
 <text x="230.0" y="452.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t7</text>
 <rect x="280.0" y="90.0" width="100.0" height="100.0" fill="#3b82f6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="330.0" y="136.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE0</text>
 <text x="330.0" y="152.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t8</text>
 <rect x="280.0" y="190.0" width="100.0" height="100.0" fill="#10b981" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="330.0" y="236.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE1</text>
 <text x="330.0" y="252.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t9</text>
 <rect x="280.0" y="290.0" width="100.0" height="100.0" fill="#f59e0b" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="330.0" y="336.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE2</text>
 <text x="330.0" y="352.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t10</text>
 <rect x="280.0" y="390.0" width="100.0" height="100.0" fill="#ef4444" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="330.0" y="436.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE3</text>
 <text x="330.0" y="452.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t11</text>
 <rect x="380.0" y="90.0" width="100.0" height="100.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="430.0" y="136.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE4</text>
 <text x="430.0" y="152.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t12</text>
 <rect x="380.0" y="190.0" width="100.0" height="100.0" fill="#ec4899" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="430.0" y="236.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE5</text>
 <text x="430.0" y="252.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t13</text>
 <rect x="380.0" y="290.0" width="100.0" height="100.0" fill="#06b6d4" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="430.0" y="336.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE6</text>
 <text x="430.0" y="352.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t14</text>
 <rect x="380.0" y="390.0" width="100.0" height="100.0" fill="#f97316" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
 <text x="430.0" y="436.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE7</text>
 <text x="430.0" y="452.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t15</text>
 <rect x="80.0" y="90.0" width="400.0" height="400.0" fill="none" stroke="#1e293b" stroke-width="2" fill-opacity="1.0" rx="2"/>
 <text x="130.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=0..127</text>
 <text x="230.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=128..255</text>
 <text x="330.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=256..383</text>
 <text x="430.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=384..511</text>
 <text x="64.0" y="140.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=0..255</text>
 <text x="64.0" y="240.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=256..511</text>
 <text x="64.0" y="340.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=512..767</text>
 <text x="64.0" y="440.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=768..1023</text>
 <text x="590.0" y="90.0" text-anchor="middle" font-size="12" fill="#1e293b" font-weight="bold">PE Legend</text>
 <rect x="540.0" y="96.0" width="16.0" height="16.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="562.0" y="108.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE0</text>
 <rect x="540.0" y="118.0" width="16.0" height="16.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="562.0" y="130.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE1</text>
 <rect x="540.0" y="140.0" width="16.0" height="16.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="562.0" y="152.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE2</text>
 <rect x="540.0" y="162.0" width="16.0" height="16.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="562.0" y="174.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE3</text>
 <rect x="540.0" y="184.0" width="16.0" height="16.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="562.0" y="196.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE4</text>
 <rect x="540.0" y="206.0" width="16.0" height="16.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="562.0" y="218.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE5</text>
 <rect x="540.0" y="228.0" width="16.0" height="16.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="562.0" y="240.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE6</text>
 <rect x="540.0" y="250.0" width="16.0" height="16.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="562.0" y="262.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE7</text>
 <text x="540.0" y="310.0" text-anchor="middle" font-size="12" fill="#1e293b" font-weight="bold">Tile Assignment Order</text>
 <rect x="540.0" y="318.0" width="12.0" height="12.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="328.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 0 → PE0  (0,0)  off=0 B</text>
 <rect x="540.0" y="334.0" width="12.0" height="12.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="344.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 1 → PE1  (1,0)  off=256 KB</text>
 <rect x="540.0" y="350.0" width="12.0" height="12.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="360.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 2 → PE2  (2,0)  off=512 KB</text>
 <rect x="540.0" y="366.0" width="12.0" height="12.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="376.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 3 → PE3  (3,0)  off=768 KB</text>
 <rect x="540.0" y="382.0" width="12.0" height="12.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="392.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 4 → PE4  (0,1)  off=256 B</text>
 <rect x="540.0" y="398.0" width="12.0" height="12.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="408.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 5 → PE5  (1,1)  off=256 KB</text>
 <rect x="540.0" y="414.0" width="12.0" height="12.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="424.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 6 → PE6  (2,1)  off=512 KB</text>
 <rect x="540.0" y="430.0" width="12.0" height="12.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="440.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 7 → PE7  (3,1)  off=768 KB</text>
 <rect x="540.0" y="446.0" width="12.0" height="12.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="456.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 8 → PE0  (0,2)  off=512 B</text>
 <rect x="540.0" y="462.0" width="12.0" height="12.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="472.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 9 → PE1  (1,2)  off=256 KB</text>
 <rect x="540.0" y="478.0" width="12.0" height="12.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="488.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t10 → PE2  (2,2)  off=512 KB</text>
 <rect x="540.0" y="494.0" width="12.0" height="12.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="504.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t11 → PE3  (3,2)  off=768 KB</text>
 <rect x="540.0" y="510.0" width="12.0" height="12.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="520.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t12 → PE4  (0,3)  off=768 B</text>
 <rect x="540.0" y="526.0" width="12.0" height="12.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="536.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t13 → PE5  (1,3)  off=256 KB</text>
 <rect x="540.0" y="542.0" width="12.0" height="12.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="552.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t14 → PE6  (2,3)  off=512 KB</text>
 <rect x="540.0" y="558.0" width="12.0" height="12.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
 <text x="558.0" y="568.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t15 → PE7  (3,3)  off=768 KB</text>
 <rect x="80.0" y="582.0" width="587.0" height="30.0" fill="#e2e8f0" stroke="#94a3b8" stroke-width="1" fill-opacity="1.0" rx="2"/>
 <text x="90.0" y="600.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Strategy: tiled_row_major  |  Tile: (256×128)=64 KB  |  Tiles: 16  |  Total: 1 MB</text>
 </svg>
@@ -0,0 +1,95 @@
 <svg xmlns="http://www.w3.org/2000/svg" width="648" height="648" viewBox="0 0 648 648">
  <title>sip</title>
  <rect width="648" height="648" fill="#f8fafc"/>
  <text x="324" y="18" text-anchor="middle" font-family="monospace" font-size="14" font-weight="bold" fill="#1e293b">SIP VIEW</text>
  <line x1="108.0" y1="144.0" x2="252.0" y2="144.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
  <text x="180.0" y="140.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
  <line x1="108.0" y1="144.0" x2="108.0" y2="264.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
  <text x="108.0" y="200.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
  <line x1="252.0" y1="144.0" x2="396.0" y2="144.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
  <text x="324.0" y="140.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
  <line x1="252.0" y1="144.0" x2="252.0" y2="264.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
  <text x="252.0" y="200.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
  <line x1="396.0" y1="144.0" x2="540.0" y2="144.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
  <text x="468.0" y="140.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
  <line x1="396.0" y1="144.0" x2="396.0" y2="264.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
  <text x="396.0" y="200.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
  <line x1="540.0" y1="144.0" x2="540.0" y2="264.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
  <text x="540.0" y="200.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
  <line x1="108.0" y1="264.0" x2="252.0" y2="264.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
  <text x="180.0" y="260.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
  <line x1="108.0" y1="264.0" x2="108.0" y2="384.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
  <text x="108.0" y="320.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
  <line x1="252.0" y1="264.0" x2="396.0" y2="264.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
  <text x="324.0" y="260.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
  <line x1="252.0" y1="264.0" x2="252.0" y2="384.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
  <text x="252.0" y="320.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
  <line x1="396.0" y1="264.0" x2="540.0" y2="264.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
  <text x="468.0" y="260.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
  <line x1="396.0" y1="264.0" x2="396.0" y2="384.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
  <text x="396.0" y="320.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
  <line x1="540.0" y1="264.0" x2="540.0" y2="384.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
  <text x="540.0" y="320.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
  <line x1="108.0" y1="384.0" x2="252.0" y2="384.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
  <text x="180.0" y="380.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
  <line x1="108.0" y1="384.0" x2="108.0" y2="504.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
  <text x="108.0" y="440.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
  <line x1="252.0" y1="384.0" x2="396.0" y2="384.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
  <text x="324.0" y="380.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
  <line x1="252.0" y1="384.0" x2="252.0" y2="504.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
  <text x="252.0" y="440.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
  <line x1="396.0" y1="384.0" x2="540.0" y2="384.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
  <text x="468.0" y="380.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
  <line x1="396.0" y1="384.0" x2="396.0" y2="504.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
  <text x="396.0" y="440.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
  <line x1="540.0" y1="384.0" x2="540.0" y2="504.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
  <text x="540.0" y="440.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
  <line x1="108.0" y1="504.0" x2="252.0" y2="504.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
  <text x="180.0" y="500.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
  <line x1="252.0" y1="504.0" x2="396.0" y2="504.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
  <text x="324.0" y="500.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
  <line x1="396.0" y1="504.0" x2="540.0" y2="504.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
  <text x="468.0" y="500.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
  <polyline points="324.0,56.0 108.0,56.0 108.0,144.0" fill="none" stroke="#0ea5e9" stroke-width="1" opacity="0.8"/>
  <text x="216.0" y="96.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.5mm 512GB/s</text>
  <polyline points="324.0,56.0 252.0,56.0 252.0,144.0" fill="none" stroke="#0ea5e9" stroke-width="1" opacity="0.8"/>
  <text x="288.0" y="96.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.5mm 512GB/s</text>
  <polyline points="324.0,56.0 396.0,56.0 396.0,144.0" fill="none" stroke="#0ea5e9" stroke-width="1" opacity="0.8"/>
  <text x="360.0" y="96.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.5mm 512GB/s</text>
  <polyline points="324.0,56.0 540.0,56.0 540.0,144.0" fill="none" stroke="#0ea5e9" stroke-width="1" opacity="0.8"/>
  <text x="432.0" y="96.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.5mm 512GB/s</text>
  <rect x="84.0" y="128.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
  <text x="108.0" y="148.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (0,0)</text>
  <rect x="228.0" y="128.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
  <text x="252.0" y="148.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (1,0)</text>
  <rect x="372.0" y="128.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
  <text x="396.0" y="148.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (2,0)</text>
  <rect x="516.0" y="128.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
  <text x="540.0" y="148.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (3,0)</text>
  <rect x="84.0" y="248.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
  <text x="108.0" y="268.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (0,1)</text>
  <rect x="228.0" y="248.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
  <text x="252.0" y="268.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (1,1)</text>
  <rect x="372.0" y="248.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
  <text x="396.0" y="268.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (2,1)</text>
  <rect x="516.0" y="248.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
  <text x="540.0" y="268.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (3,1)</text>
  <rect x="84.0" y="368.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
  <text x="108.0" y="388.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (0,2)</text>
  <rect x="228.0" y="368.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
  <text x="252.0" y="388.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (1,2)</text>
  <rect x="372.0" y="368.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
  <text x="396.0" y="388.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (2,2)</text>
  <rect x="516.0" y="368.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
  <text x="540.0" y="388.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (3,2)</text>
  <rect x="84.0" y="488.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
  <text x="108.0" y="508.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (0,3)</text>
  <rect x="228.0" y="488.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
  <text x="252.0" y="508.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (1,3)</text>
  <rect x="372.0" y="488.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
  <text x="396.0" y="508.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (2,3)</text>
  <rect x="516.0" y="488.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
  <text x="540.0" y="508.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (3,3)</text>
  <rect x="308.0" y="50.0" width="32.0" height="12.0" rx="4" fill="#0ea5e9" stroke="#475569" stroke-width="1"/>
  <text x="324.0" y="60.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#ffffff">IO io0</text>
 </svg>
@@ -0,0 +1,19 @@
 <svg xmlns="http://www.w3.org/2000/svg" width="768" height="396" viewBox="0 0 768 396">
  <title>system</title>
  <rect width="768" height="396" fill="#f8fafc"/>
  <text x="384" y="18" text-anchor="middle" font-family="monospace" font-size="14" font-weight="bold" fill="#1e293b">SYSTEM VIEW</text>
  <polyline points="384.0,60.0 182.0,60.0 182.0,120.0" fill="none" stroke="#6366f1" stroke-width="1" opacity="0.8"/>
  <text x="283.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">20.0mm 256GB/s</text>
  <polyline points="384.0,60.0 586.0,60.0 586.0,120.0" fill="none" stroke="#6366f1" stroke-width="1" opacity="0.8"/>
  <text x="485.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">20.0mm 256GB/s</text>
  <rect x="374.0" y="57.0" width="20.0" height="6.0" rx="4" fill="#6366f1" stroke="#475569" stroke-width="1"/>
  <text x="384.0" y="64.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#ffffff">Fabric Switch</text>
  <rect x="62.0" y="138.0" width="240.0" height="200.0" rx="4" fill="#e0e7ff" stroke="#475569" stroke-width="1"/>
  <text x="182.0" y="242.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">SIP 0</text>
  <rect x="174.0" y="117.0" width="16.0" height="6.0" rx="4" fill="#0ea5e9" stroke="#475569" stroke-width="1"/>
  <text x="182.0" y="124.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#ffffff">IO io0</text>
  <rect x="466.0" y="138.0" width="240.0" height="200.0" rx="4" fill="#e0e7ff" stroke="#475569" stroke-width="1"/>
  <text x="586.0" y="242.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">SIP 1</text>
  <rect x="578.0" y="117.0" width="16.0" height="6.0" rx="4" fill="#0ea5e9" stroke="#475569" stroke-width="1"/>
  <text x="586.0" y="124.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#ffffff">IO io0</text>
 </svg>
@@ -0,0 +1,381 @@
 # Latency Model
 ## Overview
 kernbench uses a discrete-event simulation (SimPy) to compute end-to-end latency.
 Every request flows through a graph of **components** connected by **wires**.
 The total latency reported is the **actual SimPy wall-clock** (`env.now` delta),
 not a static formula—so contention and queueing are captured automatically.
 ```
 total_ns (actual) = wire_prop + component_overhead + drain + queueing
                    ├── deterministic ──────────────────┘       │
                    └── contention-dependent ────────────────────┘
 ```
 ## Three Deterministic Cost Components
 ### 1. Wire Propagation
 ```
 wire_ns = distance_mm × ns_per_mm       (global: 0.01 = 10 ps/mm)
 ```
 Every edge in the topology graph has a `distance_mm`. A SimPy wire process
 delays each message by `wire_ns` before delivering it to the next component.
 For on-chip silicon this is ~10 ps/mm; the same constant applies everywhere
 since all links are on-die or interposer. Wire propagation is typically <1 ns
 and negligible compared to other costs.
 ### 2. Component Overhead (`overhead_ns`)
 ```
 component_ns = node.attrs["overhead_ns"]
 ```
 Each component on the path adds a fixed processing delay via `yield env.timeout(overhead_ns)`.
 This models arbitration, protocol processing, pipeline stages, etc.
 | Component | overhead_ns | Meaning |
 |-----------|-------------|---------|
 | pcie_ep | 5.0 | PCIe protocol processing |
 | io_cpu | 10.0 | Command decode / dispatch |
 | m_cpu | 5.0 | DMA scheduling |
 | fabric switch | 5.0 | Packet arbitration |
 | xbar | 2.0 | Crossbar arbitration |
 | xbar bridge | 1.0 | Bridge traversal between xbar halves |
 | ucie | 1.0 | UCIe protocol overhead per port |
 | noc (2D mesh) | 0.0 | Hop delay modeled internally via manhattan distance |
 | hbm_ctrl | 0.0 | Access time captured in drain_ns |
 | pe_cpu | 2.0 | Command dispatch |
 | pe_scheduler | 1.0 | PE-internal scheduling |
 | pe_gemm/math | 0.0 | Placeholder; will use flops-based model |
 ### 3. Drain (Serialization Delay)
 ```
 drain_ns = nbytes / bottleneck_bw_gbs
 ```
 **Wormhole (cut-through) model**: data flows through intermediate nodes as a
 pipeline. Serialization cost is paid **once** at the terminal node, not at
 every hop. The bottleneck is the minimum `bw_gbs` across all edges in the path.
 Example: 4096 bytes through a path with bottleneck 128 GB/s → `4096 / 128 = 32.0 ns`
 (1 GB/s equals 1 byte/ns, so dividing bytes by GB/s yields nanoseconds directly).
 ### Formula (Theoretical Lower Bound)
 ```
 formula_ns = Σ(wire_prop) + Σ(overhead_ns) + drain_ns
 ```
 This is the latency with **zero contention**—no other request competing for
 any resource. The engine provides `_formula_latency()` for verification.
 With no contention: `actual == formula`. With contention: `actual > formula`.
 ### Diagram: PE DMA Read (pe0 → local slice0, 4096 bytes)
 ```mermaid
 sequenceDiagram
    participant D as pe_dma
    participant X as xbar.pe0
    participant H as hbm_ctrl.slice0
    D->>X: txn (4096B)
    Note over X: overhead 2.0 ns
    X->>H: txn (wire 0.025 ns)
    Note over H: acquire Resource
    Note over H: overhead 0 ns
    Note over H: drain 4096/256 = 16.0 ns
    Note over H: release Resource
    H-->>D: done.succeed()
    Note over D,H: total_ns = 18.09 ns<br/>formula = wire(0.025) + ovhd(2.0) + drain(16.0) = 18.025 ns<br/>actual ≈ formula (no contention)
 ```
 ### Diagram: Two Requests — No Contention vs HOL Blocking
 #### Case 1: Different slices (parallel, no contention)
 ```mermaid
 sequenceDiagram
    participant A as Requests A and B
    participant S0 as hbm_ctrl.slice0<br/>Resource(cap=1)
    participant S1 as hbm_ctrl.slice1<br/>Resource(cap=1)
    Note over A,S1: t=2 ns — both requests arrive at their own slice
    A->>S0: A (4KB)
    A->>S1: B (4KB)
    Note over S0: acquire (immediate)
    Note over S1: acquire (immediate)
    Note over S0: drain 16.0 ns
    Note over S1: drain 16.0 ns
    Note over S0: t=18 release
    Note over S1: t=18 release
    Note over A,S1: A actual = 18 ns, B actual = 18 ns<br/>No waiting — separate Resources
 ```
 #### Case 2: Same slice (HOL blocking)
 ```mermaid
 sequenceDiagram
    participant A as Request A (4KB)
    participant Q as hbm_ctrl.slice0<br/>Resource(cap=1)
    participant B as Request B (64B)
    Note over A,B: t=0 — A arrives first
    A->>Q: acquire (immediate)
    Note over Q: drain A = 16.0 ns
    Note over B,Q: t=5 — B arrives, yield req → BLOCKED
    B--xQ: waiting...
    Note over Q: t=16 — A drain done, release
    Q->>B: B acquires resource
    Note over Q: drain B = 0.25 ns
    Note over Q: t=16.25 — B done, release
    Note over A,B: A actual = 16.0 ns (== formula)<br/>B actual = 11.25 ns (formula 0.25 + queueing 11.0)<br/>HOL blocking: short request waits behind long drain
 ```
 ---
 ## How SimPy Tracks Latency
 ### Measurement
 ```python
 start_ns = env.now
 yield txn_done          # wait for the transaction to complete
 total_ns = env.now - start_ns     # ← this is what probe reports
 ```
 `env.now` is SimPy's simulation clock. It only advances when a process `yield`s
 a timeout or waits on a resource/store. The delta between start and done captures
 **everything**: wire delays, component overheads, drain, and any queueing.
 ### Component Pipeline
 Each component is a SimPy process:
 ```
 _fan_in (per in_port)  →  _inbox (Store)  →  _worker  →  out_ports
 ```
 1. **`_fan_in`**: relays messages from each `in_port` into a shared `_inbox` Store.
 2. **`_worker`**: pulls from `_inbox`, spawns `_forward_txn` per message.
 3. **`_forward_txn`**: calls `run()` (overhead), then puts to `out_ports[next_hop]`.
 The worker uses `env.process()` (pipeline model), so multiple messages can be
 in-flight through the same component concurrently. Contention happens when
 they compete for shared resources (e.g., `simpy.Resource` in hbm_ctrl).
 ### Wire Process
 ```python
 while True:
    msg = yield out_port.get()      # wait for sender
    yield env.timeout(prop_ns)      # propagation delay
    yield in_port.put(msg)          # deliver to receiver
 ```
 Each directed edge has its own wire process. Messages are delayed by exactly
 `distance_mm × ns_per_mm`.
 ---
 ## Contention and Queueing
 Queueing delay is **not a separate formula term**—it emerges from SimPy's
 event scheduling when multiple requests compete for the same resource.
 ### Where Contention Occurs
 | Resource | SimPy Type | Capacity | Effect |
 |----------|-----------|----------|--------|
 | hbm_ctrl | `simpy.Resource` | 1 | Serializes HBM access |
 | m_cpu DMA read engine | `simpy.Resource` | 1 | Serializes DMA reads |
 | m_cpu DMA write engine | `simpy.Resource` | 1 | Serializes DMA writes |
 | pe_dma channels | `simpy.Resource` | configurable | Serializes PE DMA ops |
 | component inbox | `simpy.Store` | unbounded | No backpressure (FIFO) |
 ### How Queueing Works
 ```python
 # hbm_ctrl._worker
 with self._resource.request() as req:
    yield req                     # ← BLOCKS if resource is occupied
    yield from self.run(env, txn.nbytes)
    yield env.timeout(drain_ns)
 ```
 If request A holds the resource and request B arrives:
 - B's `yield req` blocks until A releases the resource
 - SimPy advances B's `env.now` by A's remaining service time
 - This "extra" time shows up in B's `total_ns` automatically
 ```
 No contention:  actual_ns == formula_ns
 Contention:     actual_ns  > formula_ns
                queueing_delay = actual_ns - formula_ns
 ```
 ### Head-of-Line (HOL) Blocking at hbm_ctrl
 The `simpy.Resource` is held for the **entire** `with` block—both overhead and
 drain. The resource is NOT released between overhead and drain:
 ```python
 with self._resource.request() as req:
    yield req                              # acquire (or wait)
    yield from self.run(env, txn.nbytes)   # overhead_ns  ─┐
    yield env.timeout(drain_ns)            # drain_ns      │ resource held
 # ← resource released here ───────────────────────────────┘
 ```
 This means a short request arriving during a long request's drain must wait
 for the full remaining drain time—classic head-of-line blocking:
 ```
 Request A: 4 KB,  drain = 16.0 ns   (arrives at t=0)
 Request B: 64 B,  drain = 0.25 ns   (arrives at t=5)
 Timeline:
  t=0.00   A acquires resource
  t=0.00   A: overhead (0 ns)
  t=0.00   A: drain starts (16.0 ns)
  t=5.00   B arrives → yield req → BLOCKED (A holds resource)
  t=16.00  A: drain done → resource released
  t=16.00  B acquires resource
  t=16.00  B: overhead (0 ns)
  t=16.25  B: drain done → resource released
  B actual  = 11.25 ns (waited 11.0 + own 0.25)
  B formula = 0.25 ns
  B queueing = 11.0 ns  ← HOL blocking penalty
 ```
 **Why this is physically realistic**: An HBM channel processes one burst at a
 time. While data is being serialized onto the channel (drain), no other request
 can use that channel. The FIFO ordering (`simpy.Resource` default) reflects
 the simplest controller scheduling policy.
 **Alternative: priority scheduling**: If needed, `simpy.PriorityResource` can
 prioritize shorter requests (Shortest Job First), but this is not currently
 used since FIFO matches typical HBM controller behavior.
 ---
 ## Worked Example: Two Concurrent PE DMA Reads
 Setup: PE0 and PE1 in cube0 both read 4096 bytes from their local HBM slices
 (slice0 and slice1), submitted to the **same engine** at the same time.
 ### Paths
 ```
 DMA A: pe0.pe_dma → xbar.pe0 → hbm_ctrl.slice0
 DMA B: pe1.pe_dma → xbar.pe1 → hbm_ctrl.slice1
 ```
 ### No Contention (different HBM slices)
 Since slice0 and slice1 are **separate** hbm_ctrl instances, each with its own
 `simpy.Resource(capacity=1)`, there is no resource competition.
 ```
 DMA A timeline:
  t=0.00   pe_dma dequeues txn
  t=0.00   xbar.pe0: overhead_ns=2.0 → t=2.00
  t=2.00   wire prop (2.5mm × 0.01) → t=2.025
  t=2.025  hbm_ctrl.slice0: yield req → immediate (no contention)
  t=2.025  hbm_ctrl.slice0: overhead_ns=0 → t=2.025
  t=2.025  drain_ns = 4096/256 = 16.0 → t=18.025
  t=18.025 done
 DMA B timeline: (identical, on its own slice)
  t=0.00   → ... → t=18.09  done
 ```
 Both complete at ~18.09 ns. `actual == formula` for both.
 ### With Contention (same HBM slice)
 Now suppose both PE0 and PE1 read from **slice0**:
 ```
 DMA A: pe0.pe_dma → xbar.pe0 → hbm_ctrl.slice0
 DMA B: pe1.pe_dma → xbar.pe1 → xbar.pe0 → hbm_ctrl.slice0
                                (chain traversal to reach slice0)
 ```
 ```
 DMA A timeline:
  t=0.00   xbar.pe0(2.0) → wire → hbm_ctrl.slice0
  t=2.025  yield req → immediate (first to arrive)
  t=18.025 drain 16.0 → release resource → done
  actual_A = 18.025 ns (== formula)
 DMA B timeline:
  t=0.00   xbar.pe1(2.0) → xbar.pe0(2.0) → wire → hbm_ctrl.slice0
  t=4.035  yield req → BLOCKED (A holds resource until t=18.025)
  t=18.025 acquire resource
  t=50.025 drain 32.0 → release → done
  B's drain uses the bottleneck BW of B's own path, which is lower than A's:
  A's bottleneck: 256 GB/s → drain = 4096/256 = 16.0 ns
  B's bottleneck: xbar_x_bw = 128 GB/s → drain = 4096/128 = 32.0 ns
  formula_B = wire(0.035) + overhead(4.0) + drain(32.0) = 36.035 ns
  queueing  = time waiting for A to release hbm_ctrl = 18.025 − 4.035 = 13.99 ns
  actual_B  = formula_B + queueing = 36.035 + 13.99 ≈ 50.0 ns
 ```
 The key insight: **queueing delay is not in the formula**. It only appears in
 the actual SimPy simulation when resources are contested. The probe reports
 `actual_ns`, which includes all queueing. To see pure queueing overhead,
 compare `actual_ns` vs `formula_ns` (available in PE DMA traces).
 ---
 ## Probe Output Explained
 ```
 === PE DMA Latency ===
 Case                Target              Actual  Ovhd  Drain  Wire  Ovhd% Drain%  Eff.BW   BN.BW   Util%
 pe-local-hbm        c0.pe0->c0.slice0    18.09   2.0  16.0  0.08  11.1% 88.5%   226.49   256.0   88.5%
 pe-cross-half-hbm   c0.pe0->c0.slice4    37.14   5.0  32.0  0.14  13.5% 86.1%   110.27   128.0   86.1%
 ```
 | Column | Meaning |
 |--------|---------|
 | **Actual** | SimPy measured `env.now` delta (includes contention if any) |
 | **Ovhd** | Sum of `overhead_ns` for all components on the forward path |
 | **Drain** | `nbytes / bottleneck_bw` — serialization at terminal |
 | **Wire** | Sum of `distance_mm × ns_per_mm` for all edges |
 | **Ovhd%** | `Ovhd / Actual × 100` — fraction of time spent in component processing |
 | **Drain%** | `Drain / Actual × 100` — fraction of time spent in data transfer |
 | **Eff.BW** | `nbytes / Actual` — achieved bandwidth |
 | **BN.BW** | Bottleneck bandwidth (min `bw_gbs` on path) |
 | **Util%** | `Eff.BW / BN.BW × 100` — how close to theoretical max BW |
 ### Why Util% < 100%
 `Util% = Drain% = drain_ns / actual_ns`. The gap from 100% is the overhead
 fraction. For small transfers (4KB), overhead is significant relative to drain.
 For large transfers, drain dominates and utilization approaches 100%.
 ```
  4 KB:  Ovhd=2.0, Drain=16.0  → Util=88.5%   (overhead is 11% of time)
 64 KB:  Ovhd=2.0, Drain=256.0 → Util=99.2%   (overhead is <1% of time)
 ```
 ### H2D Path: Why Ovhd% is ~40%
 H2D traverses many components (pcie_ep → io_cpu → ucie → noc → m_cpu → noc →
 xbar → hbm_ctrl + response path). Total forward overhead is ~23 ns vs drain
 of 32 ns for 4KB, so overhead is comparable to data transfer time—resulting
 in ~55% utilization. This is expected for small command-path transfers.
@@ -0,0 +1,30 @@
 [build-system]
 requires = ["setuptools>=68", "wheel"]
 build-backend = "setuptools.build_meta"
 [project]
 name = "kernbench"
 version = "0.1.0"
 requires-python = ">=3.10"
 dependencies = ["pytest", "simpy", "pyyaml"]
 [project.scripts]
 kernbench = "kernbench.cli.main:main"
 [project.optional-dependencies]
 dev = [
  "pytest>=7",
  "ruff>=0.4.0",
 ]
 [tool.ruff]
 line-length = 100
 target-version = "py310"
 fix = false
 [tool.ruff.lint]
 select = ["E", "F", "I", "B", "UP"]
 ignore = ["F401"]
 [tool.pytest.ini_options]
 addopts = ["--disable-warnings"]
@@ -0,0 +1,393 @@
 #!/usr/bin/env python3
 """Generate SVG diagrams illustrating each placement strategy.
 Example tensor: (M=1024, K=512) fp16 (itemsize=2), 8 PEs.
 Tiled variants use tile_m=256, tile_k=128.
 Output: docs/diagrams/placement_*.svg
 """
 from __future__ import annotations
 import math
 from pathlib import Path
 # ── Diagram parameters ──────────────────────────────────────────────
 # Example tensor shape (rows M, columns K) used by every diagram.
 M, K = 1024, 512
 # Bytes per element (fp16, per the module docstring).
 ITEMSIZE = 2
 # Number of PEs the tensor is distributed over; also the number of colours
 # defined in PE_COLORS / PE_TEXT_COLORS below.
 NUM_PE = 8
 # Tile shape (rows, columns) used by the tiled placement variants.
 TILE_M, TILE_K = 256, 128
 # Fill colour of each PE's cells, indexed by PE id.
 PE_COLORS = [
    "#3b82f6",  # PE0 blue
    "#10b981",  # PE1 emerald
    "#f59e0b",  # PE2 amber
    "#ef4444",  # PE3 red
    "#8b5cf6",  # PE4 violet
    "#ec4899",  # PE5 pink
    "#06b6d4",  # PE6 cyan
    "#f97316",  # PE7 orange
 ]
 # Label colour drawn on top of the matching PE_COLORS entry: black for
 # PE2 (amber) and PE6 (cyan), white for all others.
 PE_TEXT_COLORS = [
    "#fff", "#fff", "#000", "#fff",
    "#fff", "#fff", "#000", "#fff",
 ]
 # Output directory: <parent of this script's directory>/docs/diagrams.
 OUT_DIR = Path(__file__).parent.parent / "docs" / "diagrams"
 # ── SVG helpers ─────────────────────────────────────────────────────
 def _svg_header(w: int, h: int, title: str) -> str:
    """Return the opening of an SVG document: root tag, background and title.

    Args:
        w: Canvas width in pixels (also used for the viewBox).
        h: Canvas height in pixels (also used for the viewBox).
        title: Heading text, centred horizontally at y=32.

    Returns:
        The ``<svg ...>`` opening tag, a full-canvas background ``<rect>`` and
        the title ``<text>``, each newline-terminated.  The caller is
        responsible for appending the closing ``</svg>``.
    """
    # Local import keeps this helper self-contained.
    from xml.sax.saxutils import escape

    # Escape &, < and > so an arbitrary title cannot produce malformed XML.
    safe_title = escape(title)
    return (
        f'<svg xmlns="http://www.w3.org/2000/svg" width="{w}" height="{h}"'
        f' viewBox="0 0 {w} {h}" font-family="monospace">\n'
        f'<rect width="{w}" height="{h}" fill="#f8fafc" rx="6"/>\n'
        f'<text x="{w // 2}" y="32" text-anchor="middle" font-size="16"'
        f' font-weight="bold" fill="#1e293b">{safe_title}</text>\n'
    )
 def _svg_footer() -> str:
    """Return the newline-terminated closing tag of an SVG document."""
    closing_tag = "</svg>"
    return closing_tag + "\n"
 def _rect(x: float, y: float, w: float, h: float, fill: str,
           stroke: str = "#334155", sw: float = 1.0, opacity: float = 1.0) -> str:
    """Return one SVG ``<rect>`` element with 2px rounded corners.

    Geometry (``x``, ``y``, ``w``, ``h``) is written with one decimal place;
    ``sw`` is the stroke width and ``opacity`` the fill opacity, both emitted
    as given.  The element is newline-terminated.
    """
    attributes = [
        f'x="{x:.1f}"',
        f'y="{y:.1f}"',
        f'width="{w:.1f}"',
        f'height="{h:.1f}"',
        f'fill="{fill}"',
        f'stroke="{stroke}"',
        f'stroke-width="{sw}"',
        f'fill-opacity="{opacity}"',
        'rx="2"',
    ]
    return "<rect " + " ".join(attributes) + "/>\n"
 def _text(x: float, y: float, txt: str, size: int = 11,
           anchor: str = "middle", fill: str = "#1e293b",
           weight: str = "normal") -> str:
    """Return one SVG ``<text>`` element, newline-terminated.

    Args:
        x: Anchor x coordinate (written with one decimal place).
        y: Baseline y coordinate (written with one decimal place).
        txt: Text content; ``&``, ``<`` and ``>`` are XML-escaped.
        size: Font size.
        anchor: Value for ``text-anchor`` (e.g. "start", "middle", "end").
        fill: Text colour.
        weight: Value for ``font-weight``.
    """
    # Local import keeps this helper self-contained.
    from xml.sax.saxutils import escape

    # Escaping keeps the document well-formed for arbitrary label text;
    # strings without markup characters are emitted unchanged.
    content = escape(txt)
    return (
        f'<text x="{x:.1f}" y="{y:.1f}" text-anchor="{anchor}"'
        f' font-size="{size}" fill="{fill}" font-weight="{weight}">{content}</text>\n'
    )
 def _line(x1: float, y1: float, x2: float, y2: float,
           stroke: str = "#94a3b8", sw: float = 1) -> str:
    """Return one SVG ``<line>`` from (x1, y1) to (x2, y2), newline-terminated.

    Coordinates are written with one decimal place; ``stroke`` and ``sw``
    (stroke width) are emitted as given.
    """
    template = (
        '<line x1="{:.1f}" y1="{:.1f}" x2="{:.1f}" y2="{:.1f}"'
        ' stroke="{}" stroke-width="{}"/>\n'
    )
    return template.format(x1, y1, x2, y2, stroke, sw)
 def _format_bytes(n: int) -> str:
    """Format a byte count using binary units (B, KB = 2**10, MB = 2**20).

    Exact multiples of a unit print as integers ("128 KB").  Other values
    print with up to two decimals ("256.25 KB") instead of being truncated,
    so that distinct offsets such as 256 KB and 256 KB + 256 B do not
    collapse to the same label.

    Args:
        n: Size in bytes.

    Returns:
        Human-readable size string; values below 1024 are shown in bytes.
    """
    for unit_bytes, unit_name in ((1 << 20, "MB"), (1 << 10, "KB")):
        if n >= unit_bytes:
            whole, remainder = divmod(n, unit_bytes)
            if remainder == 0:
                return f"{whole} {unit_name}"
            # Two decimals, with trailing zeros (and a bare dot) removed.
            scaled = f"{n / unit_bytes:.2f}".rstrip("0").rstrip(".")
            return f"{scaled} {unit_name}"
    return f"{n} B"
 def _legend(x: float, y0: float, num_pe: int = NUM_PE) -> str:
    """Return the SVG for a vertical PE colour legend.

    A bold "PE Legend" heading is placed at ``y0``; below it, one 16×16
    colour swatch plus "PE<i>" label is emitted per PE at a 22px pitch.
    ``x`` is the left edge of the swatches.
    """
    pieces = [_text(x + 50, y0, "PE Legend", size=12, weight="bold")]
    for pe in range(num_pe):
        baseline = y0 + 18 + pe * 22
        pieces.append(_rect(x, baseline - 12, 16, 16, PE_COLORS[pe]))
        pieces.append(_text(x + 22, baseline, f"PE{pe}", size=11, anchor="start"))
    return "".join(pieces)
 def _axes(gx: float, gy: float, gw: float, gh: float,
           m_label: str = "M=1024", k_label: str = "K=512") -> str:
    """Return SVG axis captions for a grid at (gx, gy) of size gw × gh.

    The K caption is centred 8px above the grid; the M caption is centred
    12px left of the grid and rotated 90° counter-clockwise about its own
    anchor point.
    """
    k_caption = _text(gx + gw / 2, gy - 8, f"← {k_label} →", size=11, fill="#475569")

    # The vertical caption needs a transform attribute, which _text does not
    # offer, so the element is written out by hand.
    mx = gx - 12
    my = gy + gh / 2
    m_caption = (
        f'<text x="{mx:.1f}" y="{my:.1f}" text-anchor="middle"'
        f' font-size="11" fill="#475569"'
        f' transform="rotate(-90 {mx:.1f} {my:.1f})">↑ {m_label} ↓</text>\n'
    )
    return k_caption + m_caption
 def _info_box(x: float, y: float, lines: list[str]) -> str:
    """Return SVG for a grey info box containing one text row per line.

    The box width is estimated from the longest line at 7px per character
    plus 20px padding; the height is 18px per line plus 12px padding.

    Args:
        x: Left edge of the box.
        y: Top edge of the box.
        lines: Text rows, drawn top to bottom.  An empty list yields an
            empty, padding-only box instead of raising.
    """
    # default=0 keeps max() from raising ValueError when no lines are given.
    longest = max((len(line) for line in lines), default=0)
    bw = longest * 7 + 20
    bh = len(lines) * 18 + 12
    s = _rect(x, y, bw, bh, "#e2e8f0", stroke="#94a3b8", sw=1)
    for i, line in enumerate(lines):
        s += _text(x + 10, y + 18 + i * 18, line, size=10, anchor="start", fill="#334155")
    return s
 # ── Grid drawing ────────────────────────────────────────────────────
 def _draw_grid(
    gx: float, gy: float, gw: float, gh: float,
    cells: list[dict],  # [{row, col, rspan, cspan, pe, label?, offset?}]
    rows: int, cols: int,
    cell_labels: bool = True,
 ) -> str:
    """Return SVG for a rows × cols grid of PE-coloured shard cells.

    The grid occupies the rectangle at (gx, gy) of size gw × gh.  Each cell
    dict supplies ``row``, ``col`` and ``pe``; ``rspan``/``cspan`` default to
    1.  Every cell shows a bold "PE<n>" caption and, when ``cell_labels`` is
    true and the dict has a ``label`` key, that label beneath it.  A heavier
    outline around the whole grid is drawn last.
    """
    cell_w = gw / cols
    cell_h = gh / rows
    pieces = []
    for cell in cells:
        left = gx + cell["col"] * cell_w
        top = gy + cell["row"] * cell_h
        width = cell.get("cspan", 1) * cell_w
        height = cell.get("rspan", 1) * cell_h
        pe = cell["pe"]
        text_fill = PE_TEXT_COLORS[pe]
        pieces.append(_rect(left, top, width, height, PE_COLORS[pe], stroke="#334155", sw=1.5))
        # Captions are centred in the cell.
        mid_x = left + width / 2
        mid_y = top + height / 2
        pieces.append(_text(mid_x, mid_y - 4, f"PE{pe}", size=12, fill=text_fill, weight="bold"))
        if cell_labels and "label" in cell:
            pieces.append(_text(mid_x, mid_y + 12, cell["label"], size=9, fill=text_fill))
    # Outer border on top of the cells.
    pieces.append(_rect(gx, gy, gw, gh, "none", stroke="#1e293b", sw=2))
    return "".join(pieces)
 # ── Strategy-specific generators ────────────────────────────────────
 def gen_column_wise() -> str:
    """Return the SVG for the ``column_wise`` placement diagram.

    The K axis is cut into ``NUM_PE`` equal column shards, one per PE, drawn
    as a single-row grid.  Under each column the shard's byte offset
    (``i * chunk_bytes``) and size are printed; a legend and a summary box
    sit to the right.
    """
    W, H = 820, 500
    gx, gy, gw, gh = 80, 90, 480, 320
    shard_k = K // NUM_PE
    shard_bytes = M * shard_k * ITEMSIZE
    col_w = gw / NUM_PE

    shards = [
        {
            "row": 0, "col": pe, "rspan": 1, "cspan": 1,
            "pe": pe,
            "label": f"({M}×{shard_k})",
        }
        for pe in range(NUM_PE)
    ]

    parts = [
        _svg_header(W, H, "Placement: column_wise"),
        _text(W // 2, 54, f"Tensor ({M}×{K}) fp16  →  K axis split into {NUM_PE} parts",
              size=12, fill="#475569"),
        _axes(gx, gy, gw, gh),
        _draw_grid(gx, gy, gw, gh, shards, rows=1, cols=NUM_PE),
    ]

    # Offset and size captions below each column.
    for pe in range(NUM_PE):
        mid_x = gx + pe * col_w + col_w / 2
        offset = pe * shard_bytes
        parts.append(_text(mid_x, gy + gh + 16, f"off={_format_bytes(offset)}",
                           size=9, fill="#475569"))
        parts.append(_text(mid_x, gy + gh + 30, _format_bytes(shard_bytes),
                           size=9, fill="#64748b"))

    parts.append(_legend(620, 100))
    parts.append(_info_box(620, 320, [
        "Strategy: column_wise",
        "Split axis: K",
        f"Shards: {NUM_PE}",
        f"Each: ({M}, {shard_k})",
        f"Each: {_format_bytes(shard_bytes)}",
        f"Total: {_format_bytes(M * K * ITEMSIZE)}",
    ]))
    parts.append(_svg_footer())
    return "".join(parts)
 def gen_row_wise() -> str:
    """Return the SVG for the ``row_wise`` placement diagram.

    The M axis is cut into ``NUM_PE`` equal row shards, one per PE, drawn as
    a single-column grid.  To the right of each row the shard's byte offset
    (``i * chunk_bytes``) and size are printed; a legend and a summary box
    follow further right.
    """
    W, H = 820, 560
    gx, gy, gw, gh = 80, 90, 320, 400
    shard_m = M // NUM_PE
    shard_bytes = shard_m * K * ITEMSIZE
    row_h = gh / NUM_PE

    shards = [
        {
            "row": pe, "col": 0, "rspan": 1, "cspan": 1,
            "pe": pe,
            "label": f"({shard_m}×{K})",
        }
        for pe in range(NUM_PE)
    ]

    parts = [
        _svg_header(W, H, "Placement: row_wise"),
        _text(W // 2, 54, f"Tensor ({M}×{K}) fp16  →  M axis split into {NUM_PE} parts",
              size=12, fill="#475569"),
        _axes(gx, gy, gw, gh),
        _draw_grid(gx, gy, gw, gh, shards, rows=NUM_PE, cols=1),
    ]

    # Offset and size captions to the right of each row.
    caption_x = gx + gw + 10
    for pe in range(NUM_PE):
        mid_y = gy + pe * row_h + row_h / 2
        offset = pe * shard_bytes
        parts.append(_text(caption_x, mid_y - 4, f"off={_format_bytes(offset)}",
                           size=9, anchor="start", fill="#475569"))
        parts.append(_text(caption_x, mid_y + 10, _format_bytes(shard_bytes),
                           size=9, anchor="start", fill="#64748b"))

    parts.append(_legend(580, 100))
    parts.append(_info_box(580, 320, [
        "Strategy: row_wise",
        "Split axis: M",
        f"Shards: {NUM_PE}",
        f"Each: ({shard_m}, {K})",
        f"Each: {_format_bytes(shard_bytes)}",
        f"Total: {_format_bytes(M * K * ITEMSIZE)}",
    ]))
    parts.append(_svg_footer())
    return "".join(parts)
 def gen_replicate() -> str:
    """Return the SVG for the ``replicate`` placement diagram.

    Every PE holds a full copy of the tensor.  The copies are drawn as
    ``NUM_PE`` boxes in a 2-row × 4-column arrangement, each showing the PE
    name, tensor shape, size in bytes and "offset=0".  A one-line summary box
    is placed underneath.
    """
    W, H = 820, 500
    full_bytes = M * K * ITEMSIZE

    # Box layout: 4 columns × 2 rows inside a 700 × 340 area with 16px gaps.
    cols, rows = 4, 2
    margin_x, margin_y = 60, 90
    gap = 16
    bw = (700 - (cols - 1) * gap) / cols
    bh = (340 - (rows - 1) * gap) / rows

    parts = [
        _svg_header(W, H, "Placement: replicate"),
        _text(W // 2, 54, f"Tensor ({M}×{K}) fp16  →  full copy to each PE",
              size=12, fill="#475569"),
    ]
    for pe in range(NUM_PE):
        grid_row, grid_col = divmod(pe, cols)
        bx = margin_x + grid_col * (bw + gap)
        by = margin_y + grid_row * (bh + gap)
        mid_x = bx + bw / 2
        mid_y = by + bh / 2
        ink = PE_TEXT_COLORS[pe]
        parts.append(_rect(bx, by, bw, bh, PE_COLORS[pe], stroke="#334155", sw=1.5))
        parts.append(_text(mid_x, mid_y - 14, f"PE{pe}", size=14, fill=ink, weight="bold"))
        parts.append(_text(mid_x, mid_y + 6, f"({M}×{K})", size=11, fill=ink))
        parts.append(_text(mid_x, mid_y + 22, _format_bytes(full_bytes), size=10, fill=ink))
        parts.append(_text(mid_x, mid_y + 36, "offset=0", size=9, fill=ink))

    summary = (
        f"Strategy: replicate  |  Shards: {NUM_PE}  |  Each: {_format_bytes(full_bytes)}"
        f"  |  Total mem: {_format_bytes(full_bytes * NUM_PE)}"
    )
    parts.append(_info_box(60, 450, [summary]))
    parts.append(_svg_footer())
    return "".join(parts)
 def gen_tiled(column_major: bool) -> str:
    """Return the SVG for a 2D tiled placement diagram.

    The tensor is cut into ``TILE_M`` × ``TILE_K`` tiles.  Tiles are numbered
    in traversal order and assigned to PEs round-robin (``idx % NUM_PE``).

    Args:
        column_major: Selects the traversal and the diagram name.
            True  → "tiled_column_major": outer loop over M tiles, inner
            loop over K tiles.
            False → "tiled_row_major": outer loop over K tiles, inner loop
            over M tiles.
            NOTE(review): walking K within each M row is what is usually
            called row-major; the names here follow the strategy names used
            by this script — confirm against the placement implementation.

    Returns:
        The complete SVG document: tile grid, K-range labels below the grid,
        M-range labels to its left, PE legend, a per-tile assignment table
        and a summary box.
    """
    name = "tiled_column_major" if column_major else "tiled_row_major"
    order = "column-major (K first)" if column_major else "row-major (M first)"
    tiles_m = M // TILE_M
    tiles_k = K // TILE_K
    total_tiles = tiles_m * tiles_k
    tile_bytes = TILE_M * TILE_K * ITEMSIZE
    row_bytes = K * ITEMSIZE  # bytes in one full tensor row

    W, H = 820, 620
    s = _svg_header(W, H, f"Placement: {name}")
    s += _text(W // 2, 54,
               f"Tensor ({M}×{K}) fp16, tile=({TILE_M}×{TILE_K})  →  "
               f"{tiles_m}×{tiles_k}={total_tiles} tiles, {order}",
               size=11, fill="#475569")
    gx, gy, gw, gh = 80, 90, 400, 400
    s += _axes(gx, gy, gw, gh)

    # Tile visiting order as (m_tile, k_tile) pairs; only the loop nesting
    # differs between the two variants.
    if column_major:
        visit_order = [(mi, ki) for mi in range(tiles_m) for ki in range(tiles_k)]
    else:
        visit_order = [(mi, ki) for ki in range(tiles_k) for mi in range(tiles_m)]

    # Tile → PE mapping.  The offset is the byte offset of the tile's first
    # element in a tensor whose rows are row_bytes long.
    cells = [
        {
            "row": mi, "col": ki, "rspan": 1, "cspan": 1,
            "pe": idx % NUM_PE,
            "label": f"t{idx}",
            "offset": (mi * TILE_M * row_bytes) + (ki * TILE_K * ITEMSIZE),
            "idx": idx,
        }
        for idx, (mi, ki) in enumerate(visit_order)
    ]
    s += _draw_grid(gx, gy, gw, gh, cells, rows=tiles_m, cols=tiles_k)

    # K index ranges, centred below each grid column.
    cw = gw / tiles_k
    for ki in range(tiles_k):
        cx = gx + ki * cw + cw / 2
        s += _text(cx, gy + gh + 16, f"k={ki * TILE_K}..{(ki + 1) * TILE_K - 1}",
                   size=9, fill="#475569")
    # M index ranges, right-aligned to the left of each grid row.
    ch = gh / tiles_m
    for mi in range(tiles_m):
        cy = gy + mi * ch + ch / 2
        s += _text(gx - 16, cy, f"m={mi * TILE_M}..{(mi + 1) * TILE_M - 1}",
                   size=9, anchor="end", fill="#475569")
    s += _legend(540, 90)

    # Assignment table.  cells is already in ascending idx order, so it can
    # be listed directly.
    table_y = 310
    s += _text(540, table_y, "Tile Assignment Order", size=12, weight="bold")
    for i, c in enumerate(cells):
        ty = table_y + 18 + i * 16
        if ty > H - 20:
            # Stop before rows would run off the bottom of the canvas.
            break
        pe = c["pe"]
        s += _rect(540, ty - 10, 12, 12, PE_COLORS[pe])
        s += _text(558, ty,
                   f"t{c['idx']:>2d} → PE{pe}  ({c['row']},{c['col']})"
                   f"  off={_format_bytes(c['offset'])}",
                   size=9, anchor="start", fill="#334155")
    s += _info_box(80, H - 60, [
        f"Strategy: {name}  |  Tile: ({TILE_M}×{TILE_K})={_format_bytes(tile_bytes)}"
        f"  |  Tiles: {total_tiles}  |  Total: {_format_bytes(M * K * ITEMSIZE)}",
    ])
    s += _svg_footer()
    return s
 # ── Main ────────────────────────────────────────────────────────────
 def main() -> None:
    """Generate every placement diagram and write it into ``OUT_DIR``.

    The output directory is created if missing.  All SVGs are rendered
    before the first file is written; each written path is printed, followed
    by a final count.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    rendered = {
        "placement_column_wise.svg": gen_column_wise(),
        "placement_row_wise.svg": gen_row_wise(),
        "placement_replicate.svg": gen_replicate(),
        "placement_tiled_column_major.svg": gen_tiled(column_major=True),
        "placement_tiled_row_major.svg": gen_tiled(column_major=False),
    }
    for filename in rendered:
        target = OUT_DIR / filename
        target.write_text(rendered[filename], encoding="utf-8")
        print(f"  wrote {target}")
    print(f"\nGenerated {len(rendered)} placement diagrams.")
 if __name__ == "__main__":
    main()
@@ -0,0 +1,64 @@
 import argparse
 import sys
 from benches.loader import resolve_bench
 from kernbench.cli.probe import cmd_probe
 from kernbench.cli.report import format_report
 from kernbench.common.types import SimEngine
 from kernbench.runtime_api.bench_runner import run_bench
 from kernbench.runtime_api.types import DeviceSelector, resolve_device
 from kernbench.sim_engine.engine import GraphEngine
 from kernbench.topology.builder import resolve_topology
 def build_parser() -> argparse.ArgumentParser:
    """Build the ``kernbench`` command-line parser.

    Two mandatory sub-commands are registered, each storing its handler
    function in the ``_handler`` default:

    * ``run``   — requires ``--topology`` and ``--bench``; optional ``--device``.
    * ``probe`` — requires ``--topology``; optional ``--case`` (default "all").
    """
    parser = argparse.ArgumentParser(prog="kernbench")
    subcommands = parser.add_subparsers(dest="cmd", required=True)

    run_cmd = subcommands.add_parser("run", help="Run a benchmark")
    for required_flag in ("--topology", "--bench"):
        run_cmd.add_argument(required_flag, required=True)
    run_cmd.add_argument(
        "--device",
        default=None,
        help="Target device: 'all' or 'sip:<N>' (default: all)",
    )
    run_cmd.set_defaults(_handler=cmd_run)

    probe_cmd = subcommands.add_parser(
        "probe",
        help="Probe latency and BW for predefined traffic patterns",
    )
    probe_cmd.add_argument("--topology", required=True)
    probe_cmd.add_argument("--case", default="all", help="Case name or 'all' (default: all)")
    probe_cmd.set_defaults(_handler=cmd_probe)

    return parser
 def engine_factory(topology: object, device: DeviceSelector) -> SimEngine:
    """Create a ``GraphEngine`` for the given topology.

    If ``topology`` has a ``topology_obj`` attribute, that inner object is
    handed to the engine; otherwise ``topology`` itself is used.  ``device``
    is accepted but not referenced in this function.
    """
    unwrapped = getattr(topology, "topology_obj", topology)
    engine = GraphEngine(unwrapped)
    return engine
 def cmd_run(args) -> int:
    """Handle the ``run`` sub-command.

    Resolves the topology, benchmark and device from ``args``, runs the
    benchmark, prints a trace report (only when traces exist) followed by
    the result summary, and returns the process exit code: 0 when
    ``result.completion.ok`` is truthy, 1 otherwise.
    """
    print("> Running benchmark with:", args)
    topology = resolve_topology(args.topology)
    bench_fn = resolve_bench(args.bench)
    selector = resolve_device(args.device)
    result = run_bench(
        topology=topology,
        bench_fn=bench_fn,
        device=selector,
        engine_factory=engine_factory,
    )

    # The report takes the topology spec when one is available, else None.
    inner_topology = getattr(topology, "topology_obj", topology)
    spec = getattr(inner_topology, "spec", None)
    if result.traces:
        report = format_report(result.traces, title=args.bench, spec=spec)
        print(report)
    print(result.summary_text())

    if result.completion.ok:
        return 0
    return 1
 def main(argv=None) -> int:
    """Parse ``argv`` and dispatch to the selected sub-command handler.

    ``argv`` is passed straight to ``parse_args``.  The handler's return
    value is converted to ``int`` and returned as the exit code.
    """
    namespace = build_parser().parse_args(argv)
    exit_code = namespace._handler(namespace)
    return int(exit_code)
 if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,248 @@
 """kernbench probe: latency and BW verification utility.
 Runs predefined traffic patterns through the simulation engine and reports
 latency, effective bandwidth, bottleneck bandwidth, and utilization for each
 case.  Validates monotonicity invariants across hop counts and access types.
 """
 from __future__ import annotations
 from pathlib import Path
 from kernbench.policy.address.phyaddr import PhysAddr
 from kernbench.policy.routing.router import AddressResolver, PathRouter
 from kernbench.runtime_api.kernel import MemoryWriteMsg, PeDmaMsg
 from kernbench.sim_engine.engine import GraphEngine
 from kernbench.topology.builder import load_topology
 from kernbench.topology.types import TopologyGraph
 # -- Helpers ----------------------------------------------------------
 def _hbm_pa(sip: int, cube: int, pe_id: int, spec: dict) -> int:
    """Return the encoded physical address of a probe location in a PE's HBM slice.

    The slice size is the cube's total HBM capacity (GiB, from
    ``spec["cube"]["memory_map"]``) divided evenly across its slices.  The
    address always uses rack 0 and a fixed in-slice offset of ``0x1000``.
    """
    memory_map = spec["cube"]["memory_map"]
    cube_hbm_bytes = memory_map["hbm_total_gb_per_cube"] * (1 << 30)
    slice_size = cube_hbm_bytes // memory_map["hbm_slices_per_cube"]
    address = PhysAddr.pe_hbm_addr(
        rack_id=0,
        sip_id=sip,
        cube_id=cube,
        pe_id=pe_id,
        pe_local_hbm_offset=0x1000,
        slice_size_bytes=slice_size,
    )
    return address.encode()
 def _build_edge_map(graph: TopologyGraph) -> dict[tuple[str, str], object]:
    """Index the graph's edges by their ``(src, dst)`` node-id pair.

    When several edges share the same pair, the last one in ``graph.edges``
    wins.
    """
    by_endpoints: dict[tuple[str, str], object] = {}
    for edge in graph.edges:
        by_endpoints[(edge.src, edge.dst)] = edge
    return by_endpoints
 def _formula_breakdown(
    path: list[str], nbytes: int, edge_map: dict, graph: TopologyGraph,
 ) -> tuple[float, float, float, float]:
    """Return (wire_ns, overhead_ns, drain_ns, formula_ns) for a path.

    * ``wire_ns``     -- sum of ``distance_mm * ns_per_mm`` over the path's
      edges that exist in *edge_map* (``ns_per_mm`` comes from
      ``graph.spec["system"]``, default 0.01).
    * ``overhead_ns`` -- sum of each known node's ``overhead_ns`` attribute.
    * ``drain_ns``    -- ``nbytes`` divided by the smallest non-zero edge
      bandwidth on the path, or 0.0 when no edge reports a bandwidth.
    * ``formula_ns``  -- the sum of the three components above.
    """
    ns_per_mm = graph.spec.get("system", {}).get("ns_per_mm", 0.01)

    # One walk over consecutive node pairs collects both wire delay and the
    # candidate bandwidths for the bottleneck.
    wire_ns = 0.0
    link_bws = []
    for hop in zip(path, path[1:]):
        edge = edge_map.get(hop)
        if not edge:
            continue
        wire_ns += edge.distance_mm * ns_per_mm
        if edge.bw_gbs:
            link_bws.append(edge.bw_gbs)

    overhead_ns = 0.0
    for node in map(graph.nodes.get, path):
        if node:
            overhead_ns += float(node.attrs.get("overhead_ns", 0.0))

    if link_bws:
        drain_ns = nbytes / min(link_bws)
    else:
        drain_ns = 0.0
    formula_ns = wire_ns + overhead_ns + drain_ns
    return wire_ns, overhead_ns, drain_ns, formula_ns
 def _bottleneck_bw(path: list[str], edge_map: dict) -> float | None:
    """Return the smallest edge bandwidth along *path*, or ``None``.

    Only hops that exist in *edge_map* and carry a truthy ``bw_gbs`` are
    considered; ``None`` means no hop on the path reported a bandwidth.
    """
    candidates = [
        edge.bw_gbs
        for hop in zip(path, path[1:])
        if (edge := edge_map.get(hop)) and edge.bw_gbs
    ]
    if not candidates:
        return None
    return min(candidates)
 def _fmt_bw(bw: float | None) -> str:
    """Format a bandwidth with one decimal place; ``None`` renders as ``"-"``."""
    if bw is None:
        return "-"
    return f"{bw:.1f}"
 def _fmt_util(eff: float, bn: float | None) -> str:
    """Format ``eff / bn`` as a percentage with one decimal place.

    Returns ``"-"`` when the bottleneck bandwidth is missing or not positive.
    """
    no_bottleneck = bn is None or bn <= 0
    return "-" if no_bottleneck else f"{eff / bn * 100:.1f}%"
 def _short_name(node_id: str) -> str:
    """Abbreviate a dotted node id to its last two segments.

    Two segments are kept (rather than one) so that e.g. ``xbar.pe0`` stays
    distinguishable from ``pe0``.  Ids with fewer than two segments are
    returned unchanged.
    """
    # Splitting from the right at most twice leaves the final two segments
    # as the last two list items.
    return ".".join(node_id.rsplit(".", 2)[-2:])
 def _short_path(path: list[str]) -> str:
    """Render a node path as shortened node names joined by ``" -> "``."""
    return " -> ".join(map(_short_name, path))
 # -- Probe runner -----------------------------------------------------
 def run_probe(topology_path: str, case_filter: str | None = None) -> int:
    """Run the predefined probe traffic patterns and print a report to stdout.

    Two families of cases are exercised, each case on a fresh ``GraphEngine``
    with a 4096-byte transfer:

    * H2D writes (``MemoryWriteMsg``) to pe0 of cubes 0, 4, 8 and 12.
    * PE DMA transfers (``PeDmaMsg``) from cube0.pe0 to several HBM slices.

    For every case the measured latency (``trace["total_ns"]``) is printed
    next to the analytic breakdown of the forward path (overhead, drain,
    wire), the effective bandwidth (``nbytes / total_ns``), the bottleneck
    bandwidth of the path and the resulting utilisation.  Each table is
    followed by the routes that were used.

    Args:
        topology_path: Path to the topology file; ``~`` is expanded.
        case_filter: Name of a single case to run, or ``None`` / ``"all"``
            to run every case.  A name that matches no case yields empty
            tables.

    Returns:
        Always ``0``.  A failed monotonicity check is only reported in the
        printed output.
    """
    path = Path(topology_path).expanduser().resolve()
    graph = load_topology(path)
    edge_map = _build_edge_map(graph)
    spec = graph.spec
    resolver = AddressResolver(graph)
    router = PathRouter(graph)
    nbytes = 4096
    show_all = case_filter is None or case_filter == "all"
    # === H2D Write ===
    # (name, target cube, hop count shown in the table)
    h2d_cases = [
        ("h2d-1hop", 0, 1),
        ("h2d-2hop", 4, 2),
        ("h2d-3hop", 8, 3),
        ("h2d-4hop", 12, 4),
    ]
    h2d_results: list[tuple[str, int, float, float, float | None]] = []
    h2d_paths: list[tuple[str, list[str], list[str], list[str]]] = []
    print()
    print("=== H2D Write Latency (IO->HBM, varying hop count) ===")
    print(f"  {'Case':<14} {'Target':<16} {'Hops':>4}   {'Actual':>8}"
          f"  {'Ovhd':>6} {'Drain':>6} {'Wire':>5} {'Ovhd%':>6} {'Drain%':>7}"
          f"   {'Eff.BW':>8}   {'BN.BW':>8}   {'Util%':>6}")
    print("  " + "-" * 115)
    for name, cube, hops in h2d_cases:
        if not show_all and case_filter != name:
            continue
        # A fresh engine per case keeps the measurements independent.
        engine = GraphEngine(graph)
        pa = _hbm_pa(sip=0, cube=cube, pe_id=0, spec=spec)
        msg = MemoryWriteMsg(
            correlation_id="probe", request_id=name,
            dst_sip=0, dst_cube=cube, dst_pe=0,
            dst_pa=pa, nbytes=nbytes, pattern="zero",
        )
        h = engine.submit(msg)
        engine.wait(h)
        _, trace = engine.get_completion(h)
        total_ns = trace["total_ns"]
        # bytes per nanosecond is numerically GB/s
        eff_bw = nbytes / total_ns if total_ns > 0 else 0.0
        pa_obj = PhysAddr.decode(pa)
        dst_node = resolver.resolve(pa_obj)
        pcie_ep = resolver.find_pcie_ep(0)
        io_cpu = resolver.find_io_cpu(0)
        m_cpu = resolver.find_m_cpu(0, cube)
        leg1 = router.find_node_path(pcie_ep, io_cpu)
        leg2 = router.find_node_path(io_cpu, m_cpu)
        leg3 = router.find_mcpu_dma_path(m_cpu, dst_node)
        # Forward path only (response path is implicit in the measured latency).
        # Consecutive legs share their junction node, hence the [1:] slices.
        fwd_path = leg1 + leg2[1:] + leg3[1:]
        bn_bw = _bottleneck_bw(fwd_path, edge_map)
        wire, ovhd, drain, _ = _formula_breakdown(fwd_path, nbytes, edge_map, graph)
        ovhd_pct = ovhd / total_ns * 100 if total_ns > 0 else 0
        drain_pct = drain / total_ns * 100 if total_ns > 0 else 0
        h2d_results.append((name, hops, total_ns, eff_bw, bn_bw))
        h2d_paths.append((name, leg1, leg2, leg3))
        # Fixed-width fields matching the header so rows stay aligned for any cube id.
        target = f"cube{cube}.pe0"
        print(f"  {name:<14} {target:<16} {hops:>4}   {total_ns:>8.2f}"
              f"  {ovhd:>6.1f} {drain:>6.1f} {wire:>5.2f} {ovhd_pct:>5.1f}% {drain_pct:>6.1f}%"
              f"   {eff_bw:>8.2f}   {_fmt_bw(bn_bw):>8}   {_fmt_util(eff_bw, bn_bw):>6}")
    if len(h2d_results) >= 2:
        # Latency must strictly increase from each case to the next.
        lats = [r[2] for r in h2d_results]
        mono = all(lats[i] < lats[i + 1] for i in range(len(lats) - 1))
        sym = "[v]" if mono else "[x]"
        print(f"  {sym} Monotonic increase: {'PASS' if mono else 'FAIL'}")
    if h2d_paths:
        print()
        print("  Route Details:")
        print(f"  {'Case':<14} {'Leg':>4}  Path")
        print("  " + "-" * 80)
        for name, leg1, leg2, leg3 in h2d_paths:
            print(f"  {name:<14} {'L1':>4}  {_short_path(leg1)}")
            print(f"  {'':<14} {'L2':>4}  {_short_path(leg2)}")
            print(f"  {'':<14} {'L3':>4}  {_short_path(leg3)}")
    # === PE DMA → HBM (direct PE-level injection) ===
    # (name, sip, src_cube, src_pe, dst_cube, dst_pe)
    pe_cases = [
        ("pe-local-hbm", 0, 0, 0, 0, 0),        # pe0 → slice0 (local, 256 GB/s)
        ("pe-same-half-hbm", 0, 0, 0, 0, 1),     # pe0 → slice1 (xbar chain, 128 GB/s)
        ("pe-cross-half-hbm", 0, 0, 0, 0, 4),    # pe0 → slice4 (xbar chain, 128 GB/s)
        ("pe-cross-cube-hbm", 0, 0, 0, 1, 0),    # cube0.pe0 → cube1.slice0 (NOC, 128 GB/s)
    ]
    pe_results: list[tuple[str, float, float, float | None]] = []
    pe_paths: list[tuple[str, list[str]]] = []
    print()
    print("=== PE DMA Latency (pe_dma -> xbar -> HBM, direct injection) ===")
    print(f"  {'Case':<22} {'Target':<28} {'Actual':>8}"
          f"  {'Ovhd':>6} {'Drain':>6} {'Wire':>5} {'Ovhd%':>6} {'Drain%':>7}"
          f"   {'Eff.BW':>8}   {'BN.BW':>8}   {'Util%':>6}")
    print("  " + "-" * 120)
    for name, sip, src_cube, src_pe, dst_cube, dst_pe in pe_cases:
        if not show_all and case_filter != name:
            continue
        engine = GraphEngine(graph)
        dst_pa = _hbm_pa(sip=sip, cube=dst_cube, pe_id=dst_pe, spec=spec)
        msg = PeDmaMsg(
            correlation_id="probe", request_id=name,
            src_sip=sip, src_cube=src_cube, src_pe=src_pe,
            dst_pa=dst_pa, nbytes=nbytes,
        )
        h = engine.submit(msg)
        engine.wait(h)
        _, trace = engine.get_completion(h)
        total_ns = trace["total_ns"]
        eff_bw = nbytes / total_ns if total_ns > 0 else 0.0
        pe_ref = f"sip{sip}.cube{src_cube}.pe{src_pe}"
        pa_obj = PhysAddr.decode(dst_pa)
        dst_node = resolver.resolve(pa_obj)
        dma_path = router.find_path(pe_ref, dst_node)
        bn_bw = _bottleneck_bw(dma_path, edge_map)
        wire, ovhd, drain, _ = _formula_breakdown(dma_path, nbytes, edge_map, graph)
        ovhd_pct = ovhd / total_ns * 100 if total_ns > 0 else 0
        drain_pct = drain / total_ns * 100 if total_ns > 0 else 0
        target_str = f"c{src_cube}.pe{src_pe}->c{dst_cube}.slice{dst_pe}"
        pe_results.append((name, total_ns, eff_bw, bn_bw))
        pe_paths.append((name, dma_path))
        print(f"  {name:<22} {target_str:<28} {total_ns:>8.2f}"
              f"  {ovhd:>6.1f} {drain:>6.1f} {wire:>5.2f} {ovhd_pct:>5.1f}% {drain_pct:>6.1f}%"
              f"   {eff_bw:>8.2f}   {_fmt_bw(bn_bw):>8}   {_fmt_util(eff_bw, bn_bw):>6}")
    if len(pe_results) >= 2:
        # Compare the local case's bottleneck with the first non-local case's.
        local = [r for r in pe_results if "local" in r[0]]
        chain = [r for r in pe_results if "local" not in r[0]]
        if local and chain:
            print(f"  * Local BN: {_fmt_bw(local[0][3])} GB/s, "
                  f"Chain/NOC BN: {_fmt_bw(chain[0][3])} GB/s")
    if pe_paths:
        print()
        print("  Route Details:")
        print(f"  {'Case':<22}  Path")
        print("  " + "-" * 80)
        for name, dma_path in pe_paths:
            print(f"  {name:<22}  {_short_path(dma_path)}")
    print()
    return 0
 def cmd_probe(args) -> int:
    """Handle ``kernbench probe``: run the probe for ``args.topology``.

    The case selection is read from ``args.case`` and falls back to ``"all"``
    when the attribute is absent.
    """
    topology = args.topology
    selected_case = getattr(args, "case", "all")
    return run_probe(topology, selected_case)
@@ -0,0 +1,175 @@
 """Performance report formatter for bench results."""
 from __future__ import annotations
 # Bit width for each accepted dtype spelling; several aliases share a width.
 # NOTE(review): none of the functions visible in this module read this table --
 # confirm it is used elsewhere before relying on it.
 _DTYPE_BITS: dict[str, int] = {
    "f16": 16, "fp16": 16, "float16": 16, "bf16": 16,
    "f32": 32, "fp32": 32, "float32": 32,
    "i8": 8, "int8": 8, "i16": 16, "int16": 16, "i32": 32, "int32": 32,
 }
 def format_report(
    traces: list[dict],
    title: str = "Benchmark",
    spec: dict | None = None,
 ) -> str:
    """Render collected trace entries as a plain-text performance report.

    Entries whose ``phase`` is ``"kernel"`` are listed in the kernel table;
    all other entries are listed in the command table.  A per-PE summary is
    appended when the command entries cover more than one PE.

    Args:
        traces: Trace dictionaries; missing keys fall back to defaults.
        title: Name shown in the title line.
        spec: Topology spec used to look up peak TFLOPS, HBM bandwidth and
            the PE count.  With ``None`` the peaks are 0 and the
            peak-relative utilisation columns read 0.

    Returns:
        The report lines joined with newlines.
    """
    peak_tflops_f16, peak_hbm_bw_gbs = _extract_peaks(spec)
    num_pes = _count_pes(spec)
    kernel_rows = [t for t in traces if t.get("phase") == "kernel"]
    command_rows = [t for t in traces if t.get("phase") != "kernel"]
    out: list[str] = []

    # ── Title ──
    # The command-table header is the widest line; the title rule is padded to it.
    cmd_header = (f"{'Cmd':<10} {'Name':<12} {'SIP':>4} {'Cube':>5} {'PE':>4} {'Bytes':>10} "
                  f"{'Lat(ns)':>10} {'Xfer(ns)':>10} {'Proc(ns)':>10} "
                  f"{'BW(GB/s)':>10} {'MinBW':>10} {'Util%':>7}")
    banner = f"-- {title} Performance Report "
    out.append(banner + "-" * max(0, len(cmd_header) - len(banner)))

    # ── Command summary ──
    if command_rows:
        out.extend(["", cmd_header, "-" * len(cmd_header)])
        for row in command_rows:
            total_ns = row.get("total_ns", 0.0)
            nbytes = row.get("nbytes", 0)
            sip = row.get("sip", "-")
            pe = row.get("pe", "-")
            cube = row.get("cube", "-")
            phase = row.get("phase", "deploy")
            xfer_ns = row.get("xfer_ns", 0.0)
            has_xfer = xfer_ns > 0
            # Whatever is not transfer time is attributed to processing.
            proc_ns = total_ns - xfer_ns if has_xfer else 0.0
            bw = nbytes / total_ns if total_ns > 0 else 0.0
            min_bw = nbytes / xfer_ns if has_xfer else 0.0
            util = (xfer_ns / total_ns * 100) if total_ns > 0 and has_xfer else 0.0
            out.append(
                f"{phase:<10} {row.get('name', '?'):<12} {str(sip):>4} {str(cube):>5} {str(pe):>4} {nbytes:>10} "
                f"{total_ns:>10.1f} {xfer_ns:>10.1f} {proc_ns:>10.1f} "
                f"{bw:>10.1f} {min_bw:>10.1f} {util:>6.1f}%"
            )

    # ── Kernel summary ──
    if kernel_rows:
        kernel_header = (f"{'Phase':<10} {'Name':<12} {'PE':>4} {'E2E(ns)':>10} "
                         f"{'PE(ns)':>10} {'DMA(ns)':>10} {'Comp(ns)':>10} "
                         f"{'Bound':<8} {'TFLOPS':>8} {'Peak':>8} {'Util%':>7}")
        out.extend(["", kernel_header, "-" * len(kernel_header)])
        for row in kernel_rows:
            e2e_ns = row.get("total_ns", 0.0)
            pe_ns = row.get("pe_exec_ns", e2e_ns)
            dma_ns = row.get("dma_ns", 0.0)
            compute_ns = row.get("compute_ns", 0.0)
            target_pe = row.get("target_pe", "-")
            scalars = row.get("scalars", [])
            targets_all = target_pe == "all"
            pe_label = "all" if targets_all else str(target_pe)
            active_pes = num_pes if targets_all else 1
            # Bound indicator based on measured DMA vs compute time
            if not (dma_ns > 0 or compute_ns > 0):
                bound = "-"
            elif dma_ns >= compute_ns:
                bound = "memory"
            else:
                bound = "compute"
            achieved = _calc_tflops(scalars, pe_ns)
            peak_total = peak_tflops_f16 * active_pes
            util = (achieved / peak_total * 100) if peak_total > 0 else 0.0
            out.append(
                f"{'kernel':<10} {row.get('name', '?'):<12} {pe_label:>4} {e2e_ns:>10.1f} "
                f"{pe_ns:>10.1f} {dma_ns:>10.1f} {compute_ns:>10.1f} "
                f"{bound:<8} {achieved:>8.3f} {peak_total:>8.1f} {util:>6.1f}%"
            )

    # ── Per-PE summary ──
    pe_deploy = _per_pe_deploy(command_rows)
    if len(pe_deploy) > 1:
        pe_title = (f"-- Per-PE Summary (peak: {peak_tflops_f16:.1f} TFLOPS/PE, "
                    f"{peak_hbm_bw_gbs:.0f} GB/s HBM BW) ")
        pe_header = (f"{'PE':>4}  {'Deploy(ns)':>10}  {'BW(GB/s)':>10}  {'BW Util':>8}  "
                     f"{'Kernel(ns)':>10}  {'TFLOPS':>8}  {'Util':>7}")
        pe_width = max(len(pe_title), len(pe_header))
        out.append("")
        out.append(pe_title + "-" * max(0, pe_width - len(pe_title)))
        out.append(pe_header)
        out.append("-" * pe_width)
        # Kernel figures are shared by all PEs: total kernel time, and the
        # scalar arguments of the first kernel entry.
        kernel_ns = sum(e.get("pe_exec_ns", e.get("total_ns", 0.0)) for e in kernel_rows)
        kernel_scalars = kernel_rows[0].get("scalars", []) if kernel_rows else []
        active_pes = len(pe_deploy)
        total_achieved = _calc_tflops(kernel_scalars, kernel_ns)
        per_pe_tflops = total_achieved / active_pes if active_pes > 0 else 0.0
        pe_util = (per_pe_tflops / peak_tflops_f16 * 100) if peak_tflops_f16 > 0 else 0.0
        for pe_id in sorted(pe_deploy):
            deploy_ns, deploy_bytes = pe_deploy[pe_id]
            deploy_bw = deploy_bytes / deploy_ns if deploy_ns > 0 else 0.0
            deploy_util = (deploy_bw / peak_hbm_bw_gbs * 100) if peak_hbm_bw_gbs > 0 else 0.0
            out.append(
                f"{pe_id:>4}  {deploy_ns:>10.1f}  {deploy_bw:>10.1f}  {deploy_util:>7.1f}%  "
                f"{kernel_ns:>10.1f}  {per_pe_tflops:>8.3f}  {pe_util:>6.1f}%"
            )
        out.append("")
    return "\n".join(out)
 def _extract_peaks(spec: dict | None) -> tuple[float, float]:
    """Return ``(peak f16 TFLOPS, HBM bandwidth in GB/s)`` from a topology spec.

    The TFLOPS figure is read from
    ``cube.pe_template.components.pe_gemm.attrs.peak_tflops_f16`` and the
    bandwidth from ``cube.links.xbar_to_hbm_bw_gbs``.  Any missing level
    yields 0.0, as does a ``None`` spec.
    """
    if spec is None:
        return 0.0, 0.0
    cube = spec.get("cube", {})
    gemm_attrs = (
        cube.get("pe_template", {})
        .get("components", {})
        .get("pe_gemm", {})
        .get("attrs", {})
    )
    peak_tflops = float(gemm_attrs.get("peak_tflops_f16", 0.0))
    hbm_bw = float(cube.get("links", {}).get("xbar_to_hbm_bw_gbs", 0.0))
    return peak_tflops, hbm_bw
 def _count_pes(spec: dict | None) -> int:
    """Return the number of PEs per cube described by *spec*.

    Computed as ``pe_per_corner * len(corners)`` from ``cube.pe_layout``,
    defaulting to 2 PEs on each of 4 corners.  A ``None`` spec yields 8.
    """
    if spec is None:
        return 8
    layout = spec.get("cube", {}).get("pe_layout", {})
    pes_per_corner = layout.get("pe_per_corner", 2)
    corner_names = layout.get("corners", ["NW", "NE", "SW", "SE"])
    return pes_per_corner * len(corner_names)
 def _calc_tflops(scalars: list, latency_ns: float) -> float:
    """Return achieved TFLOPS for a GEMM described by scalar args ``[M, K, N]``.

    The operation count is ``2 * M * K * N``.  Returns 0.0 when fewer than
    three scalars are given or the latency is not positive.
    """
    if len(scalars) < 3 or latency_ns <= 0:
        return 0.0
    m, k, n = (scalars[i] for i in range(3))
    flops = 2.0 * m * k * n
    seconds = latency_ns * 1e-9
    return flops / seconds / 1e12
 def _per_pe_deploy(deploy_entries: list[dict]) -> dict[int, tuple[float, int]]:
    """Sum deploy latency and byte count per PE.

    Entries without a ``pe`` key are attributed to PE 0; missing
    ``total_ns`` / ``nbytes`` count as zero.  The result maps each PE id to
    ``(total latency in ns, total bytes)``.
    """
    totals: dict[int, tuple[float, int]] = {}
    for entry in deploy_entries:
        pe = entry.get("pe", 0)
        latency = entry.get("total_ns", 0.0)
        size = entry.get("nbytes", 0)
        previous = totals.get(pe)
        if previous is None:
            totals[pe] = (latency, size)
        else:
            totals[pe] = (previous[0] + latency, previous[1] + size)
    return totals
@@ -0,0 +1,150 @@
 """PE-internal command types and handles (ADR-0014).
 Generated by triton_emu (TLContext) and consumed by PE component
 implementations (PE_CPU, PE_SCHEDULER, PE_DMA, PE_GEMM, PE_MATH).
 Command lifecycle:
  Triton kernel → TLContext → [PeCommand list] → PE_CPU → PE_SCHEDULER → engines
 """
 from __future__ import annotations
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Literal
 if TYPE_CHECKING:
    import simpy
 # ── Handles ───────────────────────────────────────────────────────
@dataclass(frozen=True)
 class TensorHandle:
    """Opaque reference to a tensor residing in PE_TCM.
    Returned by tl.load, tl.dot, tl.exp, etc.
    Carries metadata for command generation; data field is reserved
    for future validate mode (numpy array).
    NOTE(review): the dataclass is frozen and, by default, compares and
    hashes on every field including ``data``.  An unhashable payload there
    (such as a numpy array) would make ``hash()`` of the handle fail --
    revisit when validate mode is implemented.
    """
    id: str                          # identifier of this handle
    pa: int                          # physical address in HBM/TCM
    shape: tuple[int, ...]           # tensor dimensions
    dtype: str                       # element type name (presumably "f16", "f32", ... -- confirm)
    nbytes: int                      # total byte size
    data: object = None              # reserved for validate mode
@dataclass(frozen=True)
 class CompletionHandle:
    """Opaque handle for a non-blocking composite command.
    Returned by tl.composite, consumed by tl.wait.
    Immutable; two handles are equal when their ``id`` values are equal.
    """
    id: str  # identifier of the pending composite command
 # ── PE Commands ───────────────────────────────────────────────────
@dataclass(frozen=True)
 class DmaReadCmd:
    """DMA READ: HBM → PE_TCM."""
    handle: TensorHandle  # tensor handle associated with the transfer (presumably the TCM destination -- confirm)
    src_pa: int           # physical address to read from
    nbytes: int           # number of bytes to transfer
@dataclass(frozen=True)
 class DmaWriteCmd:
    """DMA WRITE: PE_TCM → HBM."""
    handle: TensorHandle  # tensor handle associated with the transfer (presumably the TCM source -- confirm)
    dst_pa: int           # physical address to write to
    nbytes: int           # number of bytes to transfer
@dataclass(frozen=True)
 class GemmCmd:
    """GEMM engine command: matrix multiply on TCM data.
    out = a @ b, all operands in TCM.
    """
    a: TensorHandle    # left operand
    b: TensorHandle    # right operand
    out: TensorHandle  # result tensor
    # Problem dimensions; presumably a is (m, k) and b is (k, n) -- confirm
    # against the command generator.
    m: int
    k: int
    n: int
@dataclass(frozen=True)
 class MathCmd:
    """MATH engine command: unary/binary/reduction on TCM data.
    op: "exp", "log", "sqrt", "abs", "sigmoid", "cos", "sin",
        "add", "sub", "mul", "div", "where",
        "sum", "max", "min"
    NOTE: ``op`` is a plain ``str``; nothing in this class restricts it to
    the names listed above.
    """
    op: str                          # operation name (see class docstring)
    inputs: tuple[TensorHandle, ...] # operand handles; count depends on the operation
    out: TensorHandle                # result tensor
    axis: int | None = None          # for reductions
@dataclass(frozen=True)
 class CompositeCmd:
    """Composite command: tiled pipeline of DMA_READ + COMPUTE + DMA_WRITE.
    Non-blocking — submitted to PE_SCHEDULER which manages tile splitting
    and pipeline overlaps (ADR-0014 D3.2).
    """
    completion: CompletionHandle     # handle that a later WaitCmd can refer to
    op: Literal["gemm", "math"]      # which compute stage the pipeline runs
    a: TensorHandle                  # first operand
    b: TensorHandle | None           # second operand; None when there is none (presumably unary math -- confirm)
    out_pa: int                      # physical address the result is written to
    out_nbytes: int                  # size of the result in bytes
    math_op: str | None = None       # for op="math": which math operation
@dataclass(frozen=True)
 class WaitCmd:
    """Wait for a specific composite or all pending composites.
    With the default ``handle=None`` the command waits for all of them.
    """
    handle: CompletionHandle | None = None  # None = wait all
@dataclass(frozen=True)
 class PeCpuOverheadCmd:
    """PE_CPU scalar execution overhead (cycles)."""
    cycles: int  # overhead expressed in cycles; conversion to time is not defined here
 # Union type for all PE commands.
 # This is a runtime expression (not an annotation), so the ``X | Y`` form on
 # classes is evaluated at import time and needs Python 3.10 or newer.
 PeCommand = (
    DmaReadCmd | DmaWriteCmd | GemmCmd | MathCmd
    | CompositeCmd | WaitCmd | PeCpuOverheadCmd
 )
@dataclass
 class PeInternalTxn:
    """PE-internal message flowing PE_CPU → PE_SCHEDULER → engines.
    Carries a single PeCommand and a completion event.  PE_CPU creates one
    PeInternalTxn per command during the replay phase and sends it to
    PE_SCHEDULER, which routes it to the appropriate engine (PE_DMA,
    PE_GEMM, PE_MATH).  The engine signals ``done`` on completion.
    Unlike the command classes, this dataclass is mutable (not frozen).
    ``simpy`` is imported only under TYPE_CHECKING; the ``simpy.Event``
    annotation is never evaluated at runtime because of
    ``from __future__ import annotations``.
    """
    command: PeCommand
    done: simpy.Event  # succeeded when the engine completes this command
    pe_prefix: str = ""  # e.g. "sip0.cube0.pe0" — needed by PE_DMA for path resolution
    # Per-transaction result dictionary, created fresh for each instance.
    # Its keys are not defined here.
    result_data: dict[str, Any] = field(default_factory=dict)
@@ -0,0 +1,29 @@
 from __future__ import annotations
 from dataclasses import dataclass
 from typing import Any, NewType, Protocol, TypeAlias
 # Identifier for a submitted request: a distinct static type over ``str``.
 RequestHandle = NewType("RequestHandle", str)
 # Alias for ``Any``: the shape of a trace payload is not constrained here.
 Trace: TypeAlias = Any
@dataclass(frozen=True)
 class Completion:
    # Immutable outcome of a request (the dataclass is declared frozen).
    ok: bool                          # True when the request succeeded
    error_code: str | None = None     # optional error identifier; defaults to None
    error_message: str | None = None  # optional human-readable error text; defaults to None
 class SimEngine(Protocol):
    """Structural interface every simulation/runner backend must satisfy.

    An engine has to:
      - accept requests created by RuntimeContext (``submit``),
      - let the caller block on a request (``wait``),
      - report the completion status and an optional trace for a handle
        (``get_completion``).
    """

    def submit(self, request: Any) -> RequestHandle:
        """Accept *request* and return the handle that identifies it."""
        ...

    def wait(self, handle: RequestHandle) -> None:
        """Wait for the request identified by *handle*."""
        ...

    def get_completion(self, handle: RequestHandle) -> tuple[Completion, Trace | None]:
        """Return the completion record and optional trace for *handle*."""
        ...
@@ -0,0 +1,4 @@
 from kernbench.components.base import ComponentBase, ComponentRegistry
 from kernbench.components.context import ComponentContext
 # Public API of the components package: the names re-exported by the imports above.
 __all__ = ["ComponentBase", "ComponentRegistry", "ComponentContext"]
@@ -0,0 +1,167 @@
 from __future__ import annotations
 from abc import ABC, abstractmethod
 from collections.abc import Generator
 from typing import TYPE_CHECKING, Any
 import simpy
 if TYPE_CHECKING:
    from kernbench.components.context import ComponentContext
    from kernbench.topology.types import Node
 class ComponentBase(ABC):
    """Common base for every SimPy component implementation (ADR-0007 D3, ADR-0015).

    One component instance stands for one node of the compiled topology
    graph.  Its processing cost is expressed as a SimPy generator (``run``),
    which leaves room for queueing and contention models later on.

    Port model (ADR-0015 D1):
      in_ports[src_node_id]  — SimPy Store for incoming messages from src
      out_ports[dst_node_id] — SimPy Store for outgoing messages to dst
    GraphEngine wires the ports at initialization; wire processes model the
    propagation delay between connected ports (ADR-0015 D2).

    Context (ADR-0015 D4):
      ctx — ComponentContext with router and resolver.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        self.node = node
        self.ctx = ctx
        # Both port tables start empty and are filled in by the engine.
        self.in_ports: dict[str, simpy.Store] = {}
        self.out_ports: dict[str, simpy.Store] = {}

    def start(self, env: simpy.Environment) -> None:
        """Launch this component's processes; call once all ports are wired.

        A component without input ports starts nothing.  Otherwise one
        fan-in relay per input port feeds a shared inbox, and a single
        worker drains that inbox.  The worker applies ``run()`` latency and
        then either forwards the transaction or signals completion; the
        transaction is duck-typed to avoid a circular import.

        Components with custom fan-out / aggregation logic (e.g.
        MCpuComponent, IoCpuComponent for kernel launch) override this.
        """
        if self.in_ports:
            self._inbox: simpy.Store = simpy.Store(env)
            for source_port in self.in_ports.values():
                env.process(self._fan_in(source_port))
            env.process(self._worker(env))

    def _fan_in(self, port: simpy.Store) -> Generator:
        """Forever move messages from *port* into the shared inbox."""
        while True:
            message = yield port.get()
            yield self._inbox.put(message)

    def _worker(self, env: simpy.Environment) -> Generator:
        """Take messages off the inbox and handle each in its own process.

        Spawning a process per message lets several transactions be in
        flight at once (pipelining).
        """
        while True:
            incoming: Any = yield self._inbox.get()
            env.process(self._forward_txn(env, incoming))

    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """Charge ``run()`` latency, then pass *txn* on or complete it.

        With a next hop, the advanced transaction is put on that hop's
        output port.  At the terminal node the optional ``drain_ns`` is
        waited out (when positive) before ``txn.done`` is succeeded.
        """
        yield from self.run(env, txn.nbytes)
        target = txn.next_hop          # duck-typed: Transaction.next_hop
        if not target:
            drain_ns = getattr(txn, "drain_ns", 0.0)
            if drain_ns > 0:
                yield env.timeout(drain_ns)
            txn.done.succeed()
            return
        yield self.out_ports[target].put(txn.advance())

    @abstractmethod
    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """SimPy process body modelling this node's processing time.

        Implementations yield one or more events, typically
        ``env.timeout(overhead_ns)`` or a dynamically computed latency.
        Invoked from ``_forward_txn`` and from subclass-specific handlers.
        """
        ...
 class PeEngineBase(ComponentBase):
    """Shared base for the PE-internal engines (PE_DMA, PE_GEMM, PE_MATH).

    What it adds on top of ComponentBase:
      - ``_pe_prefix``: the node id with its last dotted segment removed
        (e.g. "sip0.cube0.pe0").
      - A ``_worker`` that understands two message kinds: PeInternalTxn goes
        to ``handle_command()``, anything else goes to the inherited
        ``_forward_txn()``.
      - ``init_resources(env)``: a hook run by ``start()`` before the worker
        is spawned.

    Subclass contract:
      1. Override ``handle_command(env, pe_txn)`` — process a PeInternalTxn.
      2. Override ``run(env, nbytes)`` — yield component latency.
      3. Optionally override ``init_resources(env)`` for DMA channels, etc.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        prefix, *_ = node.id.rsplit(".", 1)
        self._pe_prefix: str = prefix

    def start(self, env: simpy.Environment) -> None:
        """Initialise engine resources, then start the standard processes."""
        self.init_resources(env)
        super().start(env)

    def init_resources(self, env: simpy.Environment) -> None:
        """Hook for subclass resource setup; runs before the worker is spawned."""

    def _worker(self, env: simpy.Environment) -> Generator:
        """Dispatch each inbox message to the handler matching its kind."""
        # Imported inside the method, as in the original, rather than at module import time.
        from kernbench.common.pe_commands import PeInternalTxn

        while True:
            message: Any = yield self._inbox.get()
            if isinstance(message, PeInternalTxn):
                handler = self.handle_command
            else:
                handler = self._forward_txn
            env.process(handler(env, message))

    @abstractmethod
    def handle_command(self, env: simpy.Environment, pe_txn: Any) -> Generator:
        """Process one PE-internal command (a PeInternalTxn).

        Implementations must:
          - do the engine-specific work (acquire resources, compute, etc.);
          - call ``pe_txn.done.succeed()`` once the command has completed.
        """
        ...
 class ComponentRegistry:
    """Dependency-injection registry from ``node.impl`` strings to component classes.

    ``create(node, overrides, ctx)`` looks the implementation up in this order:
      1. overrides[node.impl]   — caller-injected override
      2. _registry[node.impl]   — globally registered impl
      3. Error                   — no fallback; every node must have an impl
    """

    # Single table shared by the whole process; filled through register().
    _registry: dict[str, type[ComponentBase]] = {}

    @classmethod
    def register(cls, impl: str, component_cls: type[ComponentBase]) -> None:
        """Bind *impl* to *component_cls*, replacing any earlier binding."""
        cls._registry[impl] = component_cls

    @classmethod
    def create(
        cls,
        node: Node,
        overrides: dict[str, type[ComponentBase]] | None = None,
        ctx: ComponentContext | None = None,
    ) -> ComponentBase:
        """Instantiate the component class selected by ``node.impl``.

        Raises:
            ValueError: if neither *overrides* nor the registry knows the impl.
        """
        impl = node.impl
        # Caller overrides take precedence over the global registry.
        for table in (overrides or {}, cls._registry):
            if impl in table:
                return table[impl](node, ctx)
        raise ValueError(
            f"No component registered for impl '{node.impl}' (node: {node.id}). "
            f"Register it in kernbench.components.impls.__init__."
        )
@@ -0,0 +1,52 @@
 from __future__ import annotations
 from dataclasses import dataclass, field
 from typing import Any
 import simpy
 from kernbench.policy.routing.router import AddressResolver, PathRouter
@dataclass
 class ComponentContext:
    """Topology services handed to every component implementation.

    Components that route or resolve addresses (IoCpuComponent,
    MCpuComponent, …) need it; TransitComponent ignores ctx.
    Supplied through ComponentRegistry.create(node, overrides, ctx=ctx).
    """

    router: PathRouter
    resolver: AddressResolver
    positions: dict[str, tuple[float, float] | None]  # node_id → pos_mm
    ns_per_mm: float  # wire propagation constant (from topology spec)
    edge_map: dict[tuple[str, str], Any] = field(default_factory=dict)
    spec: dict = field(default_factory=dict)  # topology spec (cube layout, PE count, etc.)

    def get_shared_resource(
        self, env: simpy.Environment, key: str, capacity: int = 1,
    ) -> simpy.Resource:
        """Return the SimPy Resource registered under *key*, creating it if needed.

        Lets engines inside one PE share a resource (e.g. an accel_slot used
        by both PE_GEMM and PE_MATH).  Keys should be scoped per PE, e.g.
        "sip0.cube0.pe0.accel_slot".  *capacity* only matters on the call
        that creates the resource; later calls return the existing one.
        """
        # The pool is attached lazily on first use rather than being a
        # declared dataclass field.
        try:
            pool = self._shared_resources
        except AttributeError:
            pool = self._shared_resources = {}
        if key not in pool:
            pool[key] = simpy.Resource(env, capacity=capacity)
        return pool[key]

    def compute_drain_ns(self, path: list[str], nbytes: int) -> float:
        """Wormhole drain time: ``nbytes`` over the slowest link on *path*.

        Hops missing from ``edge_map`` or without a truthy ``bw_gbs`` are
        ignored; when no hop reports a bandwidth the result is 0.0.
        """
        unlimited = float("inf")
        bottleneck = unlimited
        for hop in zip(path, path[1:]):
            edge = self.edge_map.get(hop)
            bandwidth = getattr(edge, "bw_gbs", None) if edge else None
            if bandwidth:
                bottleneck = min(bottleneck, bandwidth)
        return 0.0 if bottleneck == unlimited else nbytes / bottleneck
@@ -0,0 +1,54 @@
 """Concrete component implementations.
 Each module registers its component(s) with ComponentRegistry on import.
 Import this package to activate all built-in implementations.
 """
 from kernbench.components.base import ComponentRegistry
 from kernbench.components.impls.forwarding import TransitComponent
 from kernbench.components.impls.hbm_ctrl import HbmCtrlComponent
 from kernbench.components.impls.io_cpu import IoCpuComponent
 from kernbench.components.impls.m_cpu import MCpuComponent
 from kernbench.components.impls.noc import TwoDMeshNocComponent
 from kernbench.components.impls.pcie_ep import PcieEpComponent
 from kernbench.components.impls.pe_cpu import PeCpuComponent
 from kernbench.components.impls.pe_dma import PeDmaComponent
 from kernbench.components.impls.pe_gemm import PeGemmComponent
 from kernbench.components.impls.pe_math import PeMathComponent
 from kernbench.components.impls.pe_scheduler import PeSchedulerComponent
 from kernbench.components.impls.pe_tcm import PeTcmComponent
 from kernbench.components.impls.sram import SramComponent
 # Built-in registrations, executed when this package is imported.
 # Each key is an ``impl`` string mapped to the component class that models it.
 # Pass-through kinds (forwarding, switch, plain NOC, UCIe, XBAR) all share
 # TransitComponent; only "noc_2d_mesh_v1" uses the dedicated mesh NOC class.
 ComponentRegistry.register("forwarding_v1", TransitComponent)
 ComponentRegistry.register("switch_v1", TransitComponent)
 ComponentRegistry.register("noc_v1", TransitComponent)
 ComponentRegistry.register("noc_2d_mesh_v1", TwoDMeshNocComponent)
 ComponentRegistry.register("ucie_v1", TransitComponent)
 ComponentRegistry.register("xbar_v1", TransitComponent)
 # Endpoint and controller components.
 ComponentRegistry.register("pcie_ep_v1", PcieEpComponent)
 ComponentRegistry.register("io_cpu_v1", IoCpuComponent)
 ComponentRegistry.register("m_cpu_v1", MCpuComponent)
 ComponentRegistry.register("hbm_ctrl_v1", HbmCtrlComponent)
 ComponentRegistry.register("sram_v1", SramComponent)
 # PE-internal components.
 ComponentRegistry.register("pe_cpu_v1", PeCpuComponent)
 ComponentRegistry.register("pe_scheduler_v1", PeSchedulerComponent)
 ComponentRegistry.register("pe_dma_v1", PeDmaComponent)
 ComponentRegistry.register("pe_gemm_v1", PeGemmComponent)
 ComponentRegistry.register("pe_math_v1", PeMathComponent)
 ComponentRegistry.register("pe_tcm_v1", PeTcmComponent)
 # Public names of this package: the component classes imported above.
 # ComponentRegistry itself is imported here but not re-exported.
 __all__ = [
    "HbmCtrlComponent",
    "IoCpuComponent",
    "MCpuComponent",
    "PcieEpComponent",
    "PeCpuComponent",
    "PeDmaComponent",
    "PeGemmComponent",
    "PeMathComponent",
    "PeSchedulerComponent",
    "PeTcmComponent",
    "TransitComponent",
    "TwoDMeshNocComponent",
    "SramComponent",
 ]
@@ -0,0 +1,27 @@
 from __future__ import annotations
 from collections.abc import Generator
 from typing import TYPE_CHECKING
 import simpy
 from kernbench.components.base import ComponentBase
 if TYPE_CHECKING:
    from kernbench.components.context import ComponentContext
    from kernbench.topology.types import Node
 class TransitComponent(ComponentBase):
    """Pass-through component for NOC, UCIe and XBAR style transit nodes.
    The only behaviour defined here is run(), which waits for the node's
    overhead_ns attribute (0 when the attribute is absent). The worker loop is
    not overridden, so receiving and forwarding Transactions is whatever
    ComponentBase provides.
    """
    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Yield one timeout of overhead_ns; nbytes does not influence the delay."""
        attrs = self.node.attrs
        delay_ns = float(attrs.get("overhead_ns", 0.0))
        yield env.timeout(delay_ns)
@@ -0,0 +1,101 @@
 from __future__ import annotations
 from collections.abc import Generator
 from typing import TYPE_CHECKING, Any
 import simpy
 from kernbench.components.base import ComponentBase
 from kernbench.sim_engine.transaction import Transaction
 if TYPE_CHECKING:
    from kernbench.components.context import ComponentContext
    from kernbench.topology.types import Node
 class HbmCtrlComponent(ComponentBase):
    """HBM controller: terminal component that models HBM access latency.
    Dual-channel model: reads and writes each get their own simpy.Resource,
    sized by node.attrs["capacity"] (default 1). A read and a write can
    therefore be serviced concurrently, while accesses of the same kind queue
    once their channel is fully occupied.
    On completion the controller either sends a ResponseMsg Transaction back
    along the reversed request path, or succeeds txn.done directly (PeDmaMsg
    requests, or when no reverse hop / context is available).
    """
    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        # Channels are created in start(), once a simpy environment is available.
        self._read: simpy.Resource | None = None
        self._write: simpy.Resource | None = None
    def start(self, env: simpy.Environment) -> None:
        """Create the read and write channels, then run the base-class start()."""
        slots = int(self.node.attrs.get("capacity", 1))
        self._read = simpy.Resource(env, capacity=slots)
        self._write = simpy.Resource(env, capacity=slots)
        super().start(env)
    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Yield one timeout of overhead_ns (from node.attrs); nbytes is not used."""
        delay_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(delay_ns)
    def _select_channel(self, txn: Any) -> simpy.Resource:
        """Pick the write channel for write requests and the read channel otherwise.
        A request counts as a write when it is a MemoryWriteMsg, or a PeDmaMsg
        whose is_write flag is truthy.
        """
        from kernbench.runtime_api.kernel import MemoryWriteMsg, PeDmaMsg
        assert self._read is not None and self._write is not None
        msg = txn.request
        wants_write = isinstance(msg, MemoryWriteMsg) or (
            isinstance(msg, PeDmaMsg) and msg.is_write
        )
        return self._write if wants_write else self._read
    def _worker(self, env: simpy.Environment) -> Generator:
        """Spawn one process per incoming txn so that channels, not the inbox, serialize work."""
        while True:
            incoming: Any = yield self._inbox.get()
            env.process(self._handle_txn(env, incoming))
    def _handle_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """Hold the selected channel for overhead plus drain time, then respond."""
        with self._select_channel(txn).request() as grant:
            yield grant
            yield from self.run(env, txn.nbytes)
            drain_ns = getattr(txn, "drain_ns", 0.0)
            if drain_ns > 0:
                yield env.timeout(drain_ns)
        # The channel is released before the response is put on the fabric.
        yield from self._send_response(env, txn)
    def _send_response(self, env: simpy.Environment, txn: Any) -> Generator:
        """Complete the access: reply on the reverse path or succeed txn.done.
        PeDmaMsg is a direct probe with no IO_CPU/M_CPU aggregation in the path,
        so txn.done is succeeded directly instead of sending a response
        Transaction. The same happens when the reversed path has fewer than two
        hops or no context is attached.
        """
        from kernbench.runtime_api.kernel import PeDmaMsg
        if isinstance(txn.request, PeDmaMsg):
            txn.done.succeed()
            return
        back_path = list(reversed(txn.path))
        if len(back_path) < 2 or not self.ctx:
            txn.done.succeed()
            return
        from kernbench.runtime_api.kernel import ResponseMsg
        # Node id fields: index 1 carries "cube<N>", index 3 carries "slice<N>".
        id_fields = self.node.id.split(".")
        cube_id = int(id_fields[1].replace("cube", ""))
        pe_id = int(id_fields[3].replace("slice", ""))
        reply = ResponseMsg(
            correlation_id=txn.request.correlation_id,
            request_id=txn.request.request_id,
            src_cube=cube_id, src_pe=pe_id, success=True,
        )
        reply_txn = Transaction(
            request=reply, path=back_path, step=0,
            nbytes=0, done=env.event(), is_response=True,
        )
        yield self.out_ports[back_path[1]].put(reply_txn.advance())
@@ -0,0 +1,145 @@
 from __future__ import annotations
 from collections.abc import Generator
 from typing import TYPE_CHECKING, Any
 import simpy
 from kernbench.components.base import ComponentBase
 from kernbench.sim_engine.transaction import Transaction
 if TYPE_CHECKING:
    from kernbench.components.context import ComponentContext
    from kernbench.topology.types import Node
 class IoCpuComponent(ComponentBase):
    """IO_CPU component: multi-cube fan-out with response aggregation.
    Forward path:
      1. Applies the overhead_ns processing delay from node.attrs.
      2. Resolves the target (sip, cube) pairs for the request.
      3. Sends one sub-Transaction to the M_CPU of every routable target.
    Response path:
      Counts response Transactions per request_id. When as many responses as
      dispatched sub-Transactions have arrived, succeeds the parent txn.done.
    """
    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        # Pending fan-out tracking: request_id → (expected, received, parent_txn_done)
        self._pending: dict[str, tuple[int, int, simpy.Event]] = {}
    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Yield one timeout of overhead_ns (from node.attrs); nbytes is not used."""
        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(overhead_ns)
    def _worker(self, env: simpy.Environment) -> Generator:
        """Count responses inline; delay forward txns by the overhead, then fan out."""
        while True:
            txn: Any = yield self._inbox.get()
            if getattr(txn, "is_response", False):
                self._collect_response(txn)
            else:
                yield from self.run(env, txn.nbytes)
                # Fan-out runs in its own process so the inbox keeps draining.
                env.process(self._dispatch_to_m_cpus(env, txn))
    def _collect_response(self, resp_txn: Any) -> None:
        """Receive a cube response and increment the aggregation counter.
        Responses whose request_id is not pending are ignored. When the last
        expected response arrives, the parent done event is succeeded and the
        entry is removed.
        """
        key = resp_txn.request.request_id
        if key not in self._pending:
            return
        expected, received, parent_done = self._pending[key]
        received += 1
        if received >= expected:
            parent_done.succeed()
            del self._pending[key]
        else:
            self._pending[key] = (expected, received, parent_done)
    def _dispatch_to_m_cpus(self, env: simpy.Environment, txn: Any) -> Generator:
        """Fan out sub-Transactions to the target cubes' M_CPUs.
        The parent txn.done is succeeded immediately when the targets cannot be
        resolved or none of them is routable. Otherwise completion is signalled
        by _collect_response once every dispatched sub-Transaction has answered.
        """
        request = txn.request
        try:
            cube_targets = self._resolve_cube_targets(request)
        except Exception:
            # Best effort: an unresolvable request completes without fan-out.
            txn.done.succeed()
            return
        # Resolve every route BEFORE registering the aggregation, so the expected
        # response count covers only sub-Transactions that are actually sent.
        # Counting unroutable targets would leave the parent waiting forever.
        paths = self._routable_paths(cube_targets)
        if not paths:
            txn.done.succeed()
            return
        self._pending[request.request_id] = (len(paths), 0, txn.done)
        for path in paths:
            sub_txn = Transaction(
                request=request, path=path, step=0,
                nbytes=txn.nbytes, done=env.event(),
                result_data=txn.result_data,
            )
            yield self.out_ports[path[1]].put(sub_txn.advance())
    def _routable_paths(self, cube_targets: list[tuple[int, int]]) -> list[Any]:
        """Return the IO_CPU → M_CPU path for every target that can be routed.
        Targets whose M_CPU lookup or path search raises, or whose path has
        fewer than two hops, are skipped.
        """
        paths: list[Any] = []
        for sip, cube in cube_targets:
            try:
                m_cpu_id = self.ctx.resolver.find_m_cpu(sip, cube)
                path = self.ctx.router.find_node_path(self.node.id, m_cpu_id)
            except Exception:
                continue
            if len(path) >= 2:
                paths.append(path)
        return paths
    def _resolve_cube_targets(self, request: Any) -> list[tuple[int, int]]:
        """Return list of (sip, cube) pairs to fan out to.
        Memory read/write requests with target_cubes == "all" resolve to the
        single cube decoded from the physical address (falling back to the
        cube field on the request). KernelLaunchMsg with "all" resolves to the
        distinct cubes of the tensor shards located on this IO_CPU's SIP.
        Unknown request types yield no targets.
        """
        from kernbench.runtime_api.kernel import KernelLaunchMsg, MemoryReadMsg, MemoryWriteMsg
        target_cubes = getattr(request, "target_cubes", "all")
        if isinstance(request, MemoryWriteMsg):
            sip = request.dst_sip
            if target_cubes == "all":
                cube = self._cube_from_pa(request.dst_pa, fallback=request.dst_cube)
                return [(sip, cube)]
            return [(sip, c) for c in target_cubes]
        if isinstance(request, MemoryReadMsg):
            sip = request.src_sip
            if target_cubes == "all":
                cube = self._cube_from_pa(request.src_pa, fallback=request.src_cube)
                return [(sip, cube)]
            return [(sip, c) for c in target_cubes]
        if isinstance(request, KernelLaunchMsg):
            my_sip = self._my_sip()
            if target_cubes != "all":
                return [(my_sip, c) for c in target_cubes]
            # "all": derive from tensor shards, filtered to this SIP.
            # A dict keeps first-seen order while dropping duplicate cubes.
            ordered: dict[tuple[int, int], None] = {}
            for arg in request.args:
                if arg.arg_kind != "tensor":
                    continue
                for shard in arg.shards:
                    if shard.sip == my_sip:
                        ordered.setdefault((shard.sip, shard.cube), None)
            return list(ordered)
        return []
    def _cube_from_pa(self, pa_val: int, fallback: int) -> int:
        """Extract cube_id from a physical address; return fallback if decoding fails."""
        from kernbench.policy.address.phyaddr import PhysAddr
        try:
            return PhysAddr.decode(pa_val).cube_id
        except Exception:
            return fallback
    def _my_sip(self) -> int:
        """Extract this IO_CPU's SIP ID from its node ID (e.g. 'sip0.io0.io_cpu' → 0)."""
        return int(self.node.id.split(".")[0].replace("sip", ""))
@@ -0,0 +1,269 @@
 from __future__ import annotations
 from collections.abc import Generator
 from typing import TYPE_CHECKING, Any
 import simpy
 from kernbench.components.base import ComponentBase
 from kernbench.sim_engine.transaction import Transaction
 if TYPE_CHECKING:
    from kernbench.components.context import ComponentContext
    from kernbench.topology.types import Node
 class MCpuComponent(ComponentBase):
    """M_CPU component: multi-PE fan-out with response aggregation.
    Forward path (ADR-0015 D5):
      When a forward Transaction reaches m_cpu as its terminal hop, m_cpu fans
      out sub-Transactions: KernelLaunchMsg requests go to PE_CPUs, every other
      request is treated as DMA towards HBM slices. target_pe on the request
      controls the fan-out: int → single PE, "all" → see the resolver methods.
    Response path:
      Response Transactions are counted per request_id. Once every dispatched
      DMA sub-Transaction has answered, m_cpu sends one aggregate ResponseMsg
      on the reversed command path.
    Transit:
      When m_cpu is NOT the terminal hop, the Transaction is forwarded to the
      next hop after the overhead delay.
    """
    # Slice/PE count used when the spec carries no cube.memory_map entry.
    _DEFAULT_SLICES_PER_CUBE = 8
    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        # Pending fan-out tracking: request_id → (expected, received, all_done_event)
        self._pending: dict[str, tuple[int, int, simpy.Event]] = {}
        # Parent txn of each in-flight DMA fan-out: request_id → parent_txn
        self._parent_txns: dict[str, Any] = {}
        # DMA engine resources (ADR-0015 D5, ADR-0014 D4): capacity=1 each,
        # created in start() once a simpy environment is available.
        self._dma_write: simpy.Resource | None = None
        self._dma_read: simpy.Resource | None = None
    def start(self, env: simpy.Environment) -> None:
        """Create the write/read DMA resources, then run the base-class start()."""
        self._dma_write = simpy.Resource(env, capacity=1)
        self._dma_read = simpy.Resource(env, capacity=1)
        super().start(env)
    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Yield one timeout of overhead_ns (from node.attrs); nbytes is not used."""
        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(overhead_ns)
    def _worker(self, env: simpy.Environment) -> Generator:
        """Dispatch forward txns, collect response txns."""
        from kernbench.runtime_api.kernel import KernelLaunchMsg
        while True:
            txn: Any = yield self._inbox.get()
            if getattr(txn, "is_response", False):
                self._collect_response(txn)
                continue
            yield from self.run(env, txn.nbytes)
            next_hop = txn.next_hop
            if next_hop:
                # Transit: m_cpu is not the terminal hop.
                yield self.out_ports[next_hop].put(txn.advance())
            elif self.ctx is not None and txn.request is not None:
                # Terminal hop: fan out in a separate process so the inbox keeps draining.
                if isinstance(txn.request, KernelLaunchMsg):
                    env.process(self._kernel_launch_fanout(env, txn))
                else:
                    env.process(self._dma_fanout(env, txn))
            else:
                txn.done.succeed()
    def _collect_response(self, resp_txn: Any) -> None:
        """Receive a PE response and increment the aggregation counter.
        Responses whose request_id is not pending are ignored. When the last
        expected response arrives, the all-done event is succeeded and the
        entry is removed.
        """
        key = resp_txn.request.request_id
        if key not in self._pending:
            return
        expected, received, all_done = self._pending[key]
        received += 1
        if received >= expected:
            all_done.succeed()
            del self._pending[key]
        else:
            self._pending[key] = (expected, received, all_done)
    def _dma_fanout(self, env: simpy.Environment, txn: Any) -> Generator:
        """Fan out DMA sub-Transactions to target PE(s), wait for responses,
        then send the aggregate response on the reverse command path.
        Each sub-Transaction is sent while holding the read or write DMA
        resource (capacity=1 per ADR-0014 D4), so sends of the same kind are
        serialized. txn.result_data["xfer_ns"] is set to the largest drain
        time among the dispatched sub-Transactions. When no destination is
        routable the parent txn.done is succeeded immediately.
        """
        from kernbench.runtime_api.kernel import MemoryWriteMsg
        request = txn.request
        target_pe = getattr(request, "target_pe", "all")
        dst_nodes = self._resolve_dma_destinations(request, target_pe)
        # Resolve every route BEFORE registering the aggregation, so the expected
        # response count covers only sub-Transactions that are actually sent.
        # Counting unroutable destinations would block forever on all_done.
        dma_paths = self._routable_dma_paths(dst_nodes)
        if not dma_paths:
            txn.done.succeed()
            return
        # Setup aggregation
        all_done = env.event()
        self._pending[request.request_id] = (len(dma_paths), 0, all_done)
        self._parent_txns[request.request_id] = txn
        # Select DMA resource based on operation type
        dma_res = self._dma_write if isinstance(request, MemoryWriteMsg) else self._dma_read
        max_drain_ns = 0.0
        for dma_path in dma_paths:
            drain_ns = self.ctx.compute_drain_ns(dma_path, txn.nbytes)
            max_drain_ns = max(max_drain_ns, drain_ns)
            sub_txn = Transaction(
                request=request, path=dma_path, step=0,
                nbytes=txn.nbytes, done=env.event(),
                drain_ns=drain_ns,
            )
            with dma_res.request() as req:
                yield req
                yield self.out_ports[dma_path[1]].put(sub_txn.advance())
        # Wait for all PE responses
        yield all_done
        txn.result_data["xfer_ns"] = max_drain_ns
        self._parent_txns.pop(request.request_id, None)
        yield from self._send_aggregate_response(env, txn)
    def _routable_dma_paths(self, dst_nodes: list[str]) -> list[Any]:
        """Return the DMA path for every destination that can be routed.
        Destinations whose path search raises, or whose path has fewer than
        two hops, are skipped.
        """
        paths: list[Any] = []
        for dst_node in dst_nodes:
            try:
                dma_path = self.ctx.router.find_mcpu_dma_path(self.node.id, dst_node)
            except Exception:
                continue
            if len(dma_path) >= 2:
                paths.append(dma_path)
        return paths
    def _kernel_launch_fanout(self, env: simpy.Environment, txn: Any) -> Generator:
        """Fan out KernelLaunchMsg to target PE_CPU(s) via NOC (ADR-0009 D3).
        Routes through find_node_path (M_CPU → NOC → PE_CPU command edges).
        Waits for each sub-Transaction's done event directly — no ResponseMsg
        is needed for the PE direction. The per-PE metrics are then folded
        into txn.result_data (maximum across PEs) and the aggregate response
        is sent on the reverse path.
        """
        request = txn.request
        target_pe = getattr(request, "target_pe", "all")
        cube_prefix = self.node.id.rsplit(".", 1)[0]  # e.g. "sip0.cube0"
        pe_ids = self._resolve_pe_ids(target_pe)
        if not pe_ids:
            txn.done.succeed()
            return
        # Fan out to each PE_CPU and collect done events
        sub_dones: list[simpy.Event] = []
        sub_txns: list[Transaction] = []
        for pe_id in pe_ids:
            pe_cpu_id = f"{cube_prefix}.pe{pe_id}.pe_cpu"
            try:
                path = self.ctx.router.find_node_path(self.node.id, pe_cpu_id)
            except Exception:
                continue
            if len(path) < 2:
                continue
            sub_done = env.event()
            sub_txn = Transaction(
                request=request, path=path, step=0,
                nbytes=0, done=sub_done,
            )
            yield self.out_ports[path[1]].put(sub_txn.advance())
            sub_dones.append(sub_done)
            sub_txns.append(sub_txn)
        if not sub_dones:
            txn.done.succeed()
            return
        # Wait for all PE_CPUs to complete
        for sub_done in sub_dones:
            yield sub_done
        # Aggregate PE-internal metrics (max across PEs); sub_txns is non-empty here.
        for metric in ("pe_exec_ns", "dma_ns", "compute_ns"):
            txn.result_data[metric] = max(
                st.result_data.get(metric, 0.0) for st in sub_txns
            )
        yield from self._send_aggregate_response(env, txn)
    def _send_aggregate_response(self, env: simpy.Environment, txn: Any) -> Generator:
        """Send one aggregate ResponseMsg on the reversed path of txn.
        When the reversed path has fewer than two hops there is nowhere to
        send a response, so txn.done is succeeded directly instead.
        """
        reverse_path = list(reversed(txn.path))
        if len(reverse_path) < 2:
            txn.done.succeed()
            return
        from kernbench.runtime_api.kernel import ResponseMsg
        # Second field of the node id carries "cube<N>".
        cube_id = int(self.node.id.split(".")[1].replace("cube", ""))
        resp_msg = ResponseMsg(
            correlation_id=txn.request.correlation_id,
            request_id=txn.request.request_id,
            src_cube=cube_id, src_pe=-1, success=True,
        )
        resp_txn = Transaction(
            request=resp_msg, path=reverse_path, step=0,
            nbytes=0, done=env.event(), is_response=True,
        )
        yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
    def _slices_per_cube(self) -> int:
        """Return hbm_slices_per_cube from the spec's cube.memory_map, or the default."""
        if self.ctx and self.ctx.spec:
            memory_map = self.ctx.spec.get("cube", {}).get("memory_map", {})
            return memory_map.get("hbm_slices_per_cube", self._DEFAULT_SLICES_PER_CUBE)
        return self._DEFAULT_SLICES_PER_CUBE
    def _resolve_dma_destinations(self, request: Any, target_pe: int | str) -> list[str]:
        """Return list of HBM destination node_ids for DMA fan-out.
        An int target_pe selects that slice in the local cube. Otherwise the
        request's physical address (dst_pa, else src_pa) is decoded and
        resolved through ctx.resolver, which allows the destination to be in
        another cube. If no address is usable, every slice of the local cube
        is returned.
        """
        cube_prefix = self.node.id.rsplit(".", 1)[0]  # e.g. "sip0.cube0"
        if isinstance(target_pe, int):
            return [f"{cube_prefix}.hbm_ctrl.slice{target_pe}"]
        # NOTE(review): `or` treats a dst_pa of 0 as missing and falls through
        # to src_pa — confirm that 0 is never a valid destination address.
        pa_val = getattr(request, "dst_pa", None) or getattr(request, "src_pa", None)
        if pa_val is not None:
            from kernbench.policy.address.phyaddr import PhysAddr
            try:
                pa = PhysAddr.decode(pa_val)
                return [self.ctx.resolver.resolve(pa)]
            except Exception:
                # Best effort: fall back to all local slices below.
                pass
        # "all" without a usable PA: all slices in local cube
        return [f"{cube_prefix}.hbm_ctrl.slice{i}" for i in range(self._slices_per_cube())]
    def _resolve_pe_ids(self, target_pe: int | str) -> list[int]:
        """Return list of PE IDs to fan out to (used by kernel launch fan-out)."""
        if isinstance(target_pe, int):
            return [target_pe]
        # "all": one PE per HBM slice in the local cube
        return list(range(self._slices_per_cube()))
@@ -0,0 +1,187 @@
 from __future__ import annotations
 from collections.abc import Generator
 from typing import TYPE_CHECKING, Any
 import simpy
 from kernbench.components.base import ComponentBase
 if TYPE_CHECKING:
    from kernbench.components.context import ComponentContext
    from kernbench.topology.types import Node
 class TwoDMeshNocComponent(ComponentBase):
    """2D mesh NOC modeled as a single smart node.
    Latency model:
      - A traversal goes from the position of the previous hop to the position
        of the next hop, horizontally first and then vertically, split into
        segments at the grid coordinates in between.
      - Each segment costs dist_mm * ctx.ns_per_mm; overhead_ns (from
        node.attrs) is added once, on the first segment.
    Contention model:
      - Each directed segment is a simpy.Resource(capacity=1), created lazily.
      - Pipeline: the next segment's resource is requested before the current
        segment's timeout starts, so a free downstream segment is acquired
        immediately.
      - Two transactions that need the same directed segment contend for it.
    Concurrency:
      - _worker spawns an independent SimPy process per transaction, so only
        the segment resources serialize traffic, never the node itself.
    """
    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        self._env: simpy.Environment | None = None
        # Lazily created per-segment resources, keyed by the tuples built in _axis_links.
        self._links: dict[tuple, simpy.Resource] = {}
        # Sorted, 2-decimal-rounded coordinates of the positioned nodes in this cube.
        self._x_grid: list[float] = []
        self._y_grid: list[float] = []
    def start(self, env: simpy.Environment) -> None:
        """Remember the environment, build the coordinate grid, then start the base class."""
        self._env = env
        self._build_grid()
        super().start(env)
    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Zero-delay placeholder; the overridden worker does not call it."""
        yield env.timeout(0)
    # ── Grid construction ────────────────────────────────────────────
    def _build_grid(self) -> None:
        """Collect the distinct x and y coordinates of positioned nodes under this cube's prefix."""
        if not self.ctx:
            return
        prefix = self.node.id.rsplit(".", 1)[0] + "."
        in_cube = [
            pos
            for node_id, pos in self.ctx.positions.items()
            if node_id.startswith(prefix) and pos is not None
        ]
        self._x_grid = sorted({round(pos[0], 2) for pos in in_cube})
        self._y_grid = sorted({round(pos[1], 2) for pos in in_cube})
    def _get_link(self, key: tuple) -> simpy.Resource:
        """Return the resource for a segment key, creating it on first use."""
        link = self._links.get(key)
        if link is None:
            assert self._env is not None
            link = self._links[key] = simpy.Resource(self._env, capacity=1)
        return link
    # ── Worker ───────────────────────────────────────────────────────
    def _worker(self, env: simpy.Environment) -> Generator:
        """Start an independent routing process for every Transaction taken from the inbox."""
        while True:
            incoming: Any = yield self._inbox.get()
            env.process(self._route(env, incoming))
    def _route(self, env: simpy.Environment, txn: Any) -> Generator:
        """Apply the traversal latency, then forward txn or complete it.
        Without a usable pair of hop positions only overhead_ns is applied.
        A txn with no next hop waits for its drain_ns (if positive) and then
        has its done event succeeded.
        """
        prev_hop = txn.path[txn.step - 1] if txn.step > 0 else None
        next_hop = txn.next_hop
        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        segments: list[tuple[tuple, float]] = []
        if prev_hop and next_hop and self.ctx:
            positions = self.ctx.positions
            src_pos = positions.get(prev_hop)
            dst_pos = positions.get(next_hop)
            if src_pos and dst_pos:
                segments = self._xy_links(src_pos, dst_pos)
        if segments:
            yield from self._traverse(env, segments, overhead_ns)
        else:
            yield env.timeout(overhead_ns)
        if next_hop:
            yield self.out_ports[next_hop].put(txn.advance())
            return
        drain_ns = getattr(txn, "drain_ns", 0.0)
        if drain_ns > 0:
            yield env.timeout(drain_ns)
        txn.done.succeed()
    # ── XY routing and pipelined link traversal ──────────────────────
    def _traverse(
        self,
        env: simpy.Environment,
        links: list[tuple[tuple, float]],
        overhead_ns: float,
    ) -> Generator:
        """Walk the segments in order, requesting each next segment ahead of time.
        The statement order matters: the next segment is requested before the
        current timeout, and the current segment is released before waiting
        on the next request.
        """
        ns_per_mm = self.ctx.ns_per_mm  # type: ignore[union-attr]
        # Acquire the first segment before any time is spent.
        held_link = self._get_link(links[0][0])
        held_req = held_link.request()
        yield held_req
        last_idx = len(links) - 1
        for idx, (_, dist_mm) in enumerate(links):
            has_next = idx < last_idx
            if has_next:
                # Queue for the downstream segment while still on this one.
                upcoming_link = self._get_link(links[idx + 1][0])
                upcoming_req = upcoming_link.request()
            yield env.timeout(dist_mm * ns_per_mm + (overhead_ns if idx == 0 else 0.0))
            held_link.release(held_req)
            if has_next:
                yield upcoming_req  # usually already granted (pipeline)
                held_link, held_req = upcoming_link, upcoming_req
    def _xy_links(
        self,
        src: tuple[float, float],
        dst: tuple[float, float],
    ) -> list[tuple[tuple, float]]:
        """XY routing: horizontal segments first, then vertical ones.
        Returns a list of (link_key, dist_mm) pairs. The horizontal run is
        keyed on the grid row nearest to the source y, the vertical run on the
        grid column nearest to the destination x.
        """
        x0, y0 = src
        x1, y1 = dst
        links: list[tuple[tuple, float]] = []
        if abs(x0 - x1) > 1e-9:
            row = self._snap(y0, self._y_grid)
            links.extend(self._axis_links("H", row, x0, x1, self._x_grid, "E", "W"))
        if abs(y0 - y1) > 1e-9:
            column = self._snap(x1, self._x_grid)
            links.extend(self._axis_links("V", column, y0, y1, self._y_grid, "S", "N"))
        return links
    def _axis_links(
        self,
        axis: str,
        band: float,
        start: float,
        end: float,
        grid: list[float],
        increasing: str,
        decreasing: str,
    ) -> list[tuple[tuple, float]]:
        """Build (link_key, dist_mm) pairs for one straight run along a single axis.
        A key is (axis, band, low end, high end, direction), with coordinates
        rounded to two decimals. Segments no longer than 1e-9 are dropped.
        """
        run: list[tuple[tuple, float]] = []
        for seg_from, seg_to in self._segments(start, end, grid):
            dist = abs(seg_to - seg_from)
            if dist > 1e-9:
                low, high = (seg_from, seg_to) if seg_from < seg_to else (seg_to, seg_from)
                heading = increasing if seg_to > seg_from else decreasing
                run.append(((axis, round(band, 2), round(low, 2), round(high, 2), heading), dist))
        return run
    @staticmethod
    def _snap(val: float, grid: list[float]) -> float:
        """Return the grid coordinate closest to val, or val itself for an empty grid."""
        if not grid:
            return val
        return min(grid, key=lambda g: abs(g - val))
    @staticmethod
    def _segments(a: float, b: float, grid: list[float]) -> list[tuple[float, float]]:
        """Consecutive (from, to) pairs covering the range a → b using grid waypoints.
        Only grid values strictly inside the range (with a 1e-9 margin) become
        waypoints. Pairs are oriented in the travel direction from a to b.
        """
        if abs(a - b) < 1e-9:
            return []
        lo, hi = (a, b) if a < b else (b, a)
        waypoints = [lo, *(g for g in grid if lo + 1e-9 < g < hi - 1e-9), hi]
        pairs = list(zip(waypoints, waypoints[1:]))
        if a > b:
            pairs = [(far, near) for near, far in reversed(pairs)]
        return pairs
@@ -0,0 +1,27 @@
 from __future__ import annotations
 from collections.abc import Generator
 from typing import TYPE_CHECKING
 import simpy
 from kernbench.components.base import ComponentBase
 if TYPE_CHECKING:
    from kernbench.components.context import ComponentContext
    from kernbench.topology.types import Node
 class PcieEpComponent(ComponentBase):
    """PCIe endpoint: models protocol processing overhead.
    The only behaviour defined here is run(), which waits for the node's
    overhead_ns attribute (0 when the attribute is absent). The worker loop is
    not overridden, so receiving and forwarding Transactions is whatever
    ComponentBase provides.
    """
    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Yield one timeout of overhead_ns; nbytes does not influence the delay."""
        attrs = self.node.attrs
        delay_ns = float(attrs.get("overhead_ns", 0.0))
        yield env.timeout(delay_ns)
@@ -0,0 +1,154 @@
 from __future__ import annotations
 from collections.abc import Generator
 from typing import TYPE_CHECKING, Any
 import simpy
 from kernbench.components.base import ComponentBase
 if TYPE_CHECKING:
    from kernbench.components.context import ComponentContext
    from kernbench.topology.types import Node
 class PeCpuComponent(ComponentBase):
    """PE_CPU: kernel execution controller (Stage 2).
    Two-phase kernel execution (ADR-0014 D1):
      Phase 1 (compile): look up the kernel in the registry and run it with a
                         TLContext to generate a PeCommand list.
      Phase 2 (replay):  iterate the commands, dispatch them to PE_SCHEDULER as
                         PeInternalTxn, and wait on blocking commands.
    Transactions whose request is not a KernelLaunchMsg are forwarded normally.
    """
    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        self._pe_prefix = node.id.rsplit(".", 1)[0]  # e.g. "sip0.cube0.pe0"
        # PE index = text after the last "pe" in the prefix; 0 when unparsable.
        try:
            self._pe_idx = int(self._pe_prefix.rsplit("pe", 1)[1])
        except (IndexError, ValueError):
            self._pe_idx = 0
        # Extract sip/cube index for multi-SIP/cube shard matching (0 when unparsable).
        parts = node.id.split(".")
        try:
            self._sip_idx = int(parts[0].replace("sip", ""))
        except (IndexError, ValueError):
            self._sip_idx = 0
        try:
            self._cube_idx = int(parts[1].replace("cube", ""))
        except (IndexError, ValueError):
            self._cube_idx = 0
    def _find_shard(self, shards: tuple) -> Any:
        """Return the shard whose (sip, cube, pe) matches this PE.
        Falls back to the shard at position ``_pe_idx`` (clamped to the last
        shard) when no shard matches.
        Raises:
            IndexError: if ``shards`` is empty.
        """
        if not shards:
            # Same exception type the positional fallback raised, with a usable message.
            raise IndexError(f"no shards available for PE {self._pe_prefix}")
        for s in shards:
            if s.sip == self._sip_idx and s.cube == self._cube_idx and s.pe == self._pe_idx:
                return s
        return shards[min(self._pe_idx, len(shards) - 1)]
    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Hold the calling process for the node's ``overhead_ns``; ``nbytes`` is not used."""
        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(overhead_ns)
    def _worker(self, env: simpy.Environment) -> Generator:
        """Consume the inbox forever: execute kernel launches, forward everything else."""
        # Resolved once when the worker starts instead of on every message.
        from kernbench.runtime_api.kernel import KernelLaunchMsg
        while True:
            txn: Any = yield self._inbox.get()
            if isinstance(getattr(txn, "request", None), KernelLaunchMsg):
                yield from self._execute_kernel(env, txn)
            else:
                yield from self._forward_txn(env, txn)
    def _execute_kernel(self, env: simpy.Environment, txn: Any) -> Generator:
        """Compile the kernel function and replay its command trace.
        On completion writes ``pe_exec_ns``, ``dma_ns`` and ``compute_ns`` into
        ``txn.result_data`` and triggers ``txn.done``.
        """
        from kernbench.common.pe_commands import (
            CompositeCmd,
            PeCpuOverheadCmd,
            PeInternalTxn,
            WaitCmd,
        )
        from kernbench.triton_emu.registry import get_kernel
        from kernbench.triton_emu.tl_context import TLContext, run_kernel
        request = txn.request
        # Phase 1: Compile — apply PE_CPU setup overhead, then run kernel
        yield from self.run(env, 0)
        kernel_fn = get_kernel(request.kernel_ref.name)
        tl = TLContext(pe_id=self._pe_idx, dispatch_cycles=0)
        # Unpack KernelLaunchMsg.args into positional args for kernel function
        # TensorArg → PA (pointer), ScalarArg → value
        # NOTE(review): an arg with any other arg_kind is dropped silently, which
        # shifts the remaining positional arguments — confirm that is intended.
        kernel_args: list = []
        for arg in request.args:
            if arg.arg_kind == "tensor":
                shard = self._find_shard(arg.shards)
                kernel_args.append(shard.pa)
            elif arg.arg_kind == "scalar":
                kernel_args.append(arg.value)
        run_kernel(kernel_fn, tl, *kernel_args)
        commands = tl.commands
        # Phase 2: Replay — dispatch commands to PE_SCHEDULER
        pe_exec_start = env.now
        scheduler_id = f"{self._pe_prefix}.pe_scheduler"
        pending: dict[str, simpy.Event] = {}  # completion_id → done event
        composite_results: list[dict] = []  # collect result_data from CompositeCmd txns
        for cmd in commands:
            if isinstance(cmd, PeCpuOverheadCmd):
                # cmd.cycles is passed to the simulation clock as-is.
                yield env.timeout(cmd.cycles)
            elif isinstance(cmd, WaitCmd):
                if cmd.handle is not None:
                    # Unknown or already-awaited handles are ignored.
                    evt = pending.pop(cmd.handle.id, None)
                    if evt is not None:
                        yield evt
                else:
                    # Wait all pending completions
                    for evt in pending.values():
                        yield evt
                    pending.clear()
            elif isinstance(cmd, CompositeCmd):
                # Non-blocking: dispatch to scheduler, track completion
                done_evt = env.event()
                pe_txn = PeInternalTxn(
                    command=cmd, done=done_evt,
                    pe_prefix=self._pe_prefix,
                )
                # Keep the dict itself so values written later are visible at aggregation time.
                composite_results.append(pe_txn.result_data)
                yield self.out_ports[scheduler_id].put(pe_txn)
                pending[cmd.completion.id] = done_evt
            else:
                # Blocking: dispatch and wait for completion
                done_evt = env.event()
                pe_txn = PeInternalTxn(
                    command=cmd, done=done_evt,
                    pe_prefix=self._pe_prefix,
                )
                yield self.out_ports[scheduler_id].put(pe_txn)
                yield done_evt
        # Wait for any remaining pending completions
        for evt in pending.values():
            yield evt
        # Record PE-internal execution time
        txn.result_data["pe_exec_ns"] = env.now - pe_exec_start
        # Aggregate dma_ns / compute_ns from CompositeCmd results
        total_dma_ns = 0.0
        total_compute_ns = 0.0
        for rd in composite_results:
            total_dma_ns += rd.get("dma_ns", 0.0)
            total_compute_ns += rd.get("compute_ns", 0.0)
        txn.result_data["dma_ns"] = total_dma_ns
        txn.result_data["compute_ns"] = total_compute_ns
        # Signal original Transaction done
        txn.done.succeed()
@@ -0,0 +1,116 @@
 from __future__ import annotations
 from collections.abc import Generator
 from typing import TYPE_CHECKING, Any
 import simpy
 from kernbench.components.base import PeEngineBase
 from kernbench.sim_engine.transaction import Transaction
 if TYPE_CHECKING:
    from kernbench.common.pe_commands import PeInternalTxn
    from kernbench.components.context import ComponentContext
    from kernbench.topology.types import Node
 class PeDmaComponent(PeEngineBase):
    """PE_DMA: dual-channel DMA engine with READ and WRITE resources.
    Each channel has capacity=1 (ADR-0014 D4):
      - DMA_READ and DMA_WRITE may execute concurrently.
      - Multiple READs cannot overlap; multiple WRITEs cannot overlap.
    Handles two message types:
      - Transaction: external fabric messages (PeDmaMsg probes, M_CPU DMA)
      - PeInternalTxn: PE-internal commands from PE_SCHEDULER
        (DmaReadCmd → HBM read, DmaWriteCmd → HBM write)
    """
    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        # Created in init_resources(); None until then.
        self._dma_read: simpy.Resource | None = None
        self._dma_write: simpy.Resource | None = None
    def init_resources(self, env: simpy.Environment) -> None:
        """Create the two single-slot channel resources."""
        self._dma_read = simpy.Resource(env, capacity=1)
        self._dma_write = simpy.Resource(env, capacity=1)
    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Zero-delay step; this component adds no processing overhead of its own."""
        yield env.timeout(0)
    def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
        """Handle PE-internal DMA command: resolve PA → HBM path → transfer.
        The channel is held only while the sub-transaction is issued; the wait
        for transfer completion happens after the channel is released. Commands
        that are neither DmaReadCmd nor DmaWriteCmd complete immediately.
        """
        from kernbench.common.pe_commands import DmaReadCmd, DmaWriteCmd
        from kernbench.policy.address.phyaddr import PhysAddr
        from kernbench.runtime_api.kernel import PeDmaMsg
        cmd = pe_txn.command
        assert self._dma_read is not None and self._dma_write is not None
        # Determine direction and target PA
        if isinstance(cmd, DmaReadCmd):
            dma_res = self._dma_read
            target_pa = cmd.src_pa
            is_write = False
        elif isinstance(cmd, DmaWriteCmd):
            dma_res = self._dma_write
            target_pa = cmd.dst_pa
            is_write = True
        else:
            pe_txn.done.succeed()
            return
        # Resolve PA → HBM node and compute path
        pa = PhysAddr.decode(target_pa)
        dst_node = self.ctx.resolver.resolve(pa)
        path = self.ctx.router.find_path(self._pe_prefix, dst_node)
        drain_ns = self.ctx.compute_drain_ns(path, cmd.nbytes)
        # Acquire DMA channel (command issue serialization)
        with dma_res.request() as req:
            yield req
            # Create sub-Transaction with PeDmaMsg (HbmCtrl handles it directly)
            sub_done = env.event()
            sub_request = PeDmaMsg(
                correlation_id="pe_internal",
                request_id=f"dma_{id(pe_txn)}",
                src_sip=0, src_cube=0, src_pe=0,
                dst_pa=target_pa, nbytes=cmd.nbytes,
                is_write=is_write,
            )
            sub_txn = Transaction(
                request=sub_request, path=path, step=0,
                nbytes=cmd.nbytes, done=sub_done, drain_ns=drain_ns,
            )
            # Send to next hop (path[0] is pe_dma itself, path[1] is xbar)
            if len(path) > 1:
                yield self.out_ports[path[1]].put(sub_txn.advance())
            else:
                # No next hop: nothing downstream would ever trigger sub_done and
                # the command would hang. Complete locally after the drain time,
                # as the terminal branch of _forward_txn does.
                if drain_ns > 0:
                    yield env.timeout(drain_ns)
                sub_done.succeed()
        # DMA channel released after issue
        # Wait for HBM transfer completion
        yield sub_done
        pe_txn.done.succeed()
    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """Handle external Transaction (PeDmaMsg probe, M_CPU DMA) with channel acquisition."""
        dma_res = self._select_channel(txn)
        with dma_res.request() as req:
            yield req
            next_hop = txn.next_hop
            if next_hop:
                yield self.out_ports[next_hop].put(txn.advance())
            else:
                # Terminal hop: apply the drain time (if any) and complete here.
                drain = getattr(txn, "drain_ns", 0.0)
                if drain > 0:
                    yield env.timeout(drain)
                txn.done.succeed()
    def _select_channel(self, txn: Any) -> simpy.Resource:
        """Select DMA channel based on request type.
        MemoryWriteMsg requests use the WRITE channel; everything else uses READ.
        """
        # NOTE(review): a PeDmaMsg carries an is_write flag that is not consulted
        # here, so such writes take the READ channel — confirm this is intended.
        from kernbench.runtime_api.kernel import MemoryWriteMsg
        assert self._dma_read is not None and self._dma_write is not None
        if isinstance(txn.request, MemoryWriteMsg):
            return self._dma_write
        return self._dma_read
@@ -0,0 +1,90 @@
 from __future__ import annotations
 from collections.abc import Generator
 from typing import TYPE_CHECKING, Any
 import simpy
 from kernbench.components.base import PeEngineBase
 if TYPE_CHECKING:
    from kernbench.common.pe_commands import PeInternalTxn
    from kernbench.components.context import ComponentContext
    from kernbench.topology.types import Node
 # dtype → bit width (for TFLOPS scaling)
 _DTYPE_BITS: dict[str, int] = {
    "f16": 16, "fp16": 16, "float16": 16, "bf16": 16,
    "f32": 32, "fp32": 32, "float32": 32,
    "i8": 8, "int8": 8,
    "i16": 16, "int16": 16,
    "i32": 32, "int32": 32,
 }
 class PeGemmComponent(PeEngineBase):
    """PE_GEMM: matrix multiplication engine sharing accel_slot (ADR-0014 D4).
    Uses a shared compute resource (PE_ACCEL capacity=1) that is mutually
    exclusive with PE_MATH within the same PE.
    Compute latency model:
      FLOPs = 2 * M * K * N
      effective_tflops = peak_tflops_f16 * (16 / dtype_bits)
      compute_ns = FLOPs / (effective_tflops * 1e3)
    """
    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        # Shared accelerator slot; stays None unless init_resources() finds one.
        self._accel: simpy.Resource | None = None
        self._peak_tflops_f16: float = float(node.attrs.get("peak_tflops_f16", 0.0))
    def init_resources(self, env: simpy.Environment) -> None:
        """Look up the shared slot named by the ``shared_resource`` node attribute."""
        shared_name = self.node.attrs.get("shared_resource")
        if not (shared_name and self.ctx):
            return
        self._accel = self.ctx.get_shared_resource(
            env, f"{self._pe_prefix}.{shared_name}"
        )
    def _compute_ns(self, m: int, k: int, n: int, dtype: str) -> float:
        """Return the modelled GEMM latency in nanoseconds.
        Without a positive ``peak_tflops_f16`` the node's ``overhead_ns`` is
        returned instead. Unknown dtypes are treated as 16-bit.
        """
        peak = self._peak_tflops_f16
        if peak <= 0:
            return float(self.node.attrs.get("overhead_ns", 0.0))
        bits = _DTYPE_BITS.get(dtype, 16)
        # 1 TFLOPS equals 1e3 FLOPs per nanosecond.
        flops_per_ns = peak * (16.0 / bits) * 1e3
        return (2.0 * m * k * n) / flops_per_ns
    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Hold the calling process for the node's ``overhead_ns``; ``nbytes`` is not used."""
        delay_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(delay_ns)
    def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
        """Execute one PE-internal command, holding the shared slot when one exists.
        A GemmCmd takes the modelled compute time; any other command takes the
        plain ``run()`` overhead. ``pe_txn.done`` fires after the slot is released.
        """
        from kernbench.common.pe_commands import GemmCmd
        cmd = pe_txn.command
        def occupy() -> Generator:
            # Time spent on the engine for this command.
            if isinstance(cmd, GemmCmd):
                yield env.timeout(self._compute_ns(cmd.m, cmd.k, cmd.n, cmd.a.dtype))
            else:
                yield from self.run(env, 0)
        if self._accel:
            with self._accel.request() as slot:
                yield slot
                yield from occupy()
        else:
            yield from occupy()
        pe_txn.done.succeed()
    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """Transaction forwarding with accel_slot acquisition."""
        if not self._accel:
            yield from super()._forward_txn(env, txn)
            return
        with self._accel.request() as slot:
            yield slot
            yield from super()._forward_txn(env, txn)
@@ -0,0 +1,54 @@
 from __future__ import annotations
 from collections.abc import Generator
 from typing import TYPE_CHECKING, Any
 import simpy
 from kernbench.components.base import PeEngineBase
 if TYPE_CHECKING:
    from kernbench.common.pe_commands import PeInternalTxn
    from kernbench.components.context import ComponentContext
    from kernbench.topology.types import Node
 class PeMathComponent(PeEngineBase):
    """PE_MATH: element-wise computation engine sharing accel_slot (ADR-0014 D4).
    Uses a shared compute resource (PE_ACCEL capacity=1) that is mutually
    exclusive with PE_GEMM within the same PE.
    """
    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        # Shared accelerator slot; stays None unless init_resources() finds one.
        self._accel: simpy.Resource | None = None
    def init_resources(self, env: simpy.Environment) -> None:
        """Look up the shared slot named by the ``shared_resource`` node attribute."""
        shared_name = self.node.attrs.get("shared_resource")
        if not (shared_name and self.ctx):
            return
        self._accel = self.ctx.get_shared_resource(
            env, f"{self._pe_prefix}.{shared_name}"
        )
    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Hold the calling process for the node's ``overhead_ns``; ``nbytes`` is not used."""
        delay_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(delay_ns)
    def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
        """Apply the ``run()`` overhead (inside the shared slot when present), then signal done."""
        if not self._accel:
            yield from self.run(env, 0)
        else:
            with self._accel.request() as slot:
                yield slot
                yield from self.run(env, 0)
        pe_txn.done.succeed()
    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """Transaction forwarding with accel_slot acquisition."""
        if not self._accel:
            yield from super()._forward_txn(env, txn)
            return
        with self._accel.request() as slot:
            yield slot
            yield from super()._forward_txn(env, txn)
@@ -0,0 +1,245 @@
 from __future__ import annotations
 from collections.abc import Generator
 from typing import TYPE_CHECKING, Any
 import simpy
 from kernbench.components.base import ComponentBase
 if TYPE_CHECKING:
    from kernbench.common.pe_commands import PeInternalTxn
    from kernbench.components.context import ComponentContext
    from kernbench.topology.types import Node
 class PeSchedulerComponent(ComponentBase):
    """PE_SCHEDULER: sole dispatcher inside a PE (ADR-0014 D1).
    Receives PeInternalTxn from PE_CPU, routes to the appropriate engine:
      - DmaReadCmd / DmaWriteCmd → PE_DMA
      - GemmCmd                  → PE_GEMM
      - MathCmd                  → PE_MATH
      - CompositeCmd             → tiled pipeline (Stage 3: ADR-0014 D3.2)
    Composite GEMM pipeline (32x64x32 tiles):
      DMA_READ(b_tile_t) → COMPUTE(t) → DMA_WRITE(out_tile_t)
      intended overlap: READ(t+1) || COMPUTE(t) || WRITE(t-1)
      (see _pipeline_gemm for what the current code actually overlaps)
    Applies scheduler overhead_ns before dispatching each command.
    Non-PeInternalTxn messages are forwarded via inherited _forward_txn().
    """
    # Scheduler tile dimensions (ADR-0014 D3.2)
    TILE_M = 32
    TILE_K = 64
    TILE_N = 32
    # Command → engine suffix dispatch table.
    # New engines: add a single entry here (e.g. ConvCmd: "pe_conv").
    # Filled lazily by _ensure_dispatch_table() on first construction.
    _CMD_DISPATCH: dict[type, str] = {}
    @classmethod
    def _ensure_dispatch_table(cls) -> None:
        """Populate ``_CMD_DISPATCH`` once; later calls return immediately."""
        if cls._CMD_DISPATCH:
            return
        from kernbench.common.pe_commands import DmaReadCmd, DmaWriteCmd, GemmCmd, MathCmd
        cls._CMD_DISPATCH = {
            DmaReadCmd: "pe_dma",
            DmaWriteCmd: "pe_dma",
            GemmCmd: "pe_gemm",
            MathCmd: "pe_math",
        }
    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        # Node id without its last dotted component; engine port names hang off it.
        self._pe_prefix = node.id.rsplit(".", 1)[0]
        self._ensure_dispatch_table()
    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Hold the calling process for the node's ``overhead_ns``; ``nbytes`` is not used."""
        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(overhead_ns)
    def _worker(self, env: simpy.Environment) -> Generator:
        """Consume the inbox forever.
        Each PeInternalTxn is dispatched in its own process, so the worker does
        not wait for it; other messages are forwarded inline.
        """
        from kernbench.common.pe_commands import PeInternalTxn
        while True:
            msg: Any = yield self._inbox.get()
            if isinstance(msg, PeInternalTxn):
                env.process(self._dispatch(env, msg))
            else:
                yield from self._forward_txn(env, msg)
    def _dispatch(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
        """Route a PeInternalTxn to the correct engine via dispatch table."""
        from kernbench.common.pe_commands import CompositeCmd
        # Scheduler overhead
        yield from self.run(env, 0)
        cmd = pe_txn.command
        # Check dispatch table first (exact type match, subclasses are not looked up)
        engine_suffix = self._CMD_DISPATCH.get(type(cmd))
        if engine_suffix is not None:
            # The engine that receives the txn is responsible for triggering pe_txn.done.
            yield self.out_ports[f"{self._pe_prefix}.{engine_suffix}"].put(pe_txn)
            return
        # CompositeCmd: tiled pipeline (not a simple forward)
        if isinstance(cmd, CompositeCmd):
            yield from self._dispatch_composite(env, pe_txn)
            return
        # Unknown command — signal done immediately
        pe_txn.done.succeed()
    def _dispatch_composite(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
        """Composite tiled pipeline (ADR-0014 D3.2).
        GEMM: 3-stage pipeline with b-tile streaming from HBM.
        MATH: sequential compute + DMA_WRITE (no tiling).
        Anything that is not a ``"gemm"`` op with a ``b`` operand takes the MATH path.
        """
        from kernbench.common.pe_commands import CompositeCmd
        cmd = pe_txn.command
        assert isinstance(cmd, CompositeCmd)
        if cmd.op == "gemm" and cmd.b is not None:
            yield from self._pipeline_gemm(env, pe_txn, cmd)
        else:
            yield from self._pipeline_math(env, pe_txn, cmd)
    def _pipeline_gemm(self, env: simpy.Environment, pe_txn: PeInternalTxn, cmd: Any) -> Generator:
        """Tiled GEMM pipeline: stream b tiles from HBM, compute, write results.
        Tensor a is in TCM (loaded via tl.load). Tensor b is in HBM (via tl.ref).
        Pipeline: DMA_READ(b_tile_t) -> COMPUTE(t) -> DMA_WRITE(out_tile_t)
        Intended overlap: READ(t+1) || COMPUTE(t) || WRITE(t-1)
        NOTE(review): as written, each iteration waits for its own compute to
        finish before the next tile's read is issued, so only the write of the
        previous tile overlaps with the current read and compute.
        Writes ``dma_ns`` and ``compute_ns`` into ``pe_txn.result_data`` and then
        triggers ``pe_txn.done``. Both figures are wall-clock spans measured
        around the waits below, so they include time spent waiting on the
        previous stage.
        """
        from kernbench.common.pe_commands import (
            DmaReadCmd,
            DmaWriteCmd,
            GemmCmd,
            PeInternalTxn as PeTxn,
            TensorHandle,
        )
        pp = self._pe_prefix
        a = cmd.a   # already in TCM
        b = cmd.b   # HBM reference (via tl.ref)
        M, K_a = a.shape[-2], a.shape[-1]
        K_b, N = b.shape[-2], b.shape[-1]
        dtype = a.dtype
        # Element size derived from b's byte count; 2 bytes when b has no elements.
        dtype_bytes = b.nbytes // (K_b * N) if (K_b * N) > 0 else 2
        # Tile counts (only K and N are tiled; M is passed through whole)
        n_tiles_k = max(1, (K_a + self.TILE_K - 1) // self.TILE_K)
        n_tiles_n = max(1, (N + self.TILE_N - 1) // self.TILE_N)
        n_tiles = n_tiles_k * n_tiles_n
        prev_compute_done = None
        prev_write_done = None
        total_dma_ns = 0.0
        total_compute_ns = 0.0
        for tile_idx in range(n_tiles):
            tk = tile_idx // n_tiles_n
            tn = tile_idx % n_tiles_n
            k_start = tk * self.TILE_K
            n_start = tn * self.TILE_N
            # Edge tiles may be smaller than the nominal tile size.
            tile_k = min(self.TILE_K, K_a - k_start)
            tile_n = min(self.TILE_N, N - n_start)
            tile_nbytes = tile_k * tile_n * dtype_bytes
            # --- Stage 1: DMA_READ b_tile from HBM ---
            read_done = env.event()
            b_tile_pa = b.pa + (k_start * N + n_start) * dtype_bytes
            b_tile_handle = TensorHandle(
                id=f"b_tile_{tile_idx}", pa=b_tile_pa,
                shape=(tile_k, tile_n), dtype=dtype, nbytes=tile_nbytes,
            )
            read_cmd = DmaReadCmd(handle=b_tile_handle, src_pa=b_tile_pa, nbytes=tile_nbytes)
            read_txn = PeTxn(command=read_cmd, done=read_done, pe_prefix=pp)
            t0 = env.now
            yield self.out_ports[f"{pp}.pe_dma"].put(read_txn)
            # Wait for previous compute before starting this tile's compute
            # (already triggered by the time it is reached here).
            if prev_compute_done is not None:
                yield prev_compute_done
            # Wait for this tile's DMA_READ
            yield read_done
            total_dma_ns += env.now - t0
            # --- Stage 2: COMPUTE (GEMM) ---
            compute_done = env.event()
            out_handle = TensorHandle(
                id=f"out_tile_{tile_idx}", pa=0,
                shape=(M, tile_n), dtype=dtype,
                nbytes=M * tile_n * dtype_bytes,
            )
            compute_cmd = GemmCmd(a=a, b=b_tile_handle, out=out_handle,
                                  m=M, k=tile_k, n=tile_n)
            compute_txn = PeTxn(command=compute_cmd, done=compute_done, pe_prefix=pp)
            t0 = env.now
            yield self.out_ports[f"{pp}.pe_gemm"].put(compute_txn)
            # Wait for previous write (DMA_WRITE serialization)
            if prev_write_done is not None:
                yield prev_write_done
            # Wait for compute of THIS tile
            yield compute_done
            total_compute_ns += env.now - t0
            prev_compute_done = compute_done
            # --- Stage 3: DMA_WRITE out_tile to HBM ---
            # Issued without waiting; the next iteration (or the tail) awaits it.
            write_done = env.event()
            out_tile_pa = cmd.out_pa + n_start * dtype_bytes
            write_nbytes = M * tile_n * dtype_bytes
            write_cmd = DmaWriteCmd(handle=out_handle, dst_pa=out_tile_pa, nbytes=write_nbytes)
            write_txn = PeTxn(command=write_cmd, done=write_done, pe_prefix=pp)
            yield self.out_ports[f"{pp}.pe_dma"].put(write_txn)
            prev_write_done = write_done
        # Wait for final write (only this last wait is charged to dma_ns for writes)
        if prev_write_done is not None:
            t0 = env.now
            yield prev_write_done
            total_dma_ns += env.now - t0
        pe_txn.result_data["dma_ns"] = total_dma_ns
        pe_txn.result_data["compute_ns"] = total_compute_ns
        pe_txn.done.succeed()
    def _pipeline_math(self, env: simpy.Environment, pe_txn: PeInternalTxn, cmd: Any) -> Generator:
        """Non-GEMM composite: sequential compute + DMA_WRITE (no tiling).
        Writes ``compute_ns`` (MATH step) and ``dma_ns`` (write step) into
        ``pe_txn.result_data`` — the same keys _pipeline_gemm sets — and then
        triggers ``pe_txn.done``.
        """
        from kernbench.common.pe_commands import (
            DmaWriteCmd,
            MathCmd,
            PeInternalTxn as PeTxn,
        )
        pp = self._pe_prefix
        # Step 1: Compute (MATH); the input handle doubles as the output handle.
        compute_done = env.event()
        compute_cmd = MathCmd(
            op=cmd.math_op or "identity",
            inputs=(cmd.a,), out=cmd.a,
        )
        compute_txn = PeTxn(command=compute_cmd, done=compute_done, pe_prefix=pp)
        t0 = env.now
        yield self.out_ports[f"{pp}.pe_math"].put(compute_txn)
        yield compute_done
        compute_ns = float(env.now - t0)
        # Step 2: DMA_WRITE result to HBM
        write_done = env.event()
        write_cmd = DmaWriteCmd(handle=cmd.a, dst_pa=cmd.out_pa, nbytes=cmd.out_nbytes)
        write_txn = PeTxn(command=write_cmd, done=write_done, pe_prefix=pp)
        t0 = env.now
        yield self.out_ports[f"{pp}.pe_dma"].put(write_txn)
        yield write_done
        dma_ns = float(env.now - t0)
        # Report timings so math composites are no longer counted as zero.
        pe_txn.result_data["dma_ns"] = dma_ns
        pe_txn.result_data["compute_ns"] = compute_ns
        pe_txn.done.succeed()
@@ -0,0 +1,25 @@
 from __future__ import annotations
 from collections.abc import Generator
 from typing import TYPE_CHECKING
 from kernbench.components.base import ComponentBase
 if TYPE_CHECKING:
    from kernbench.components.context import ComponentContext
    from kernbench.topology.types import Node
 class PeTcmComponent(ComponentBase):
    """PE_TCM: tightly-coupled memory / local SRAM staging buffer.
    Terminal storage component for PE-internal dataflow (ADR-0014 D5).
    Phase 0: applies overhead_ns and drain_ns at terminal. Only the
    overhead_ns delay is implemented in this class.
    """
    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
    def run(self, env, nbytes: int) -> Generator:
        """Hold the calling process for the node's ``overhead_ns``; ``nbytes`` is not used."""
        delay_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(delay_ns)
@@ -0,0 +1,59 @@
 from __future__ import annotations
 from collections.abc import Generator
 from typing import TYPE_CHECKING, Any
 import simpy
 from kernbench.components.base import ComponentBase
 from kernbench.sim_engine.transaction import Transaction
 if TYPE_CHECKING:
    from kernbench.components.context import ComponentContext
    from kernbench.topology.types import Node
 class SramComponent(ComponentBase):
    """Cube SRAM: terminal component that models SRAM access latency.
    Applies the ``overhead_ns`` processing delay (from node.attrs) plus the
    transaction's ``drain_ns``, then sends a ResponseMsg back along the
    reversed request path.
    """
    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Hold the calling process for the node's ``overhead_ns``; ``nbytes`` is not used."""
        delay_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(delay_ns)
    def _worker(self, env: simpy.Environment) -> Generator:
        """Terminal worker: process, apply drain, send response."""
        while True:
            txn: Any = yield self._inbox.get()
            yield from self.run(env, txn.nbytes)
            drain_ns = getattr(txn, "drain_ns", 0.0)
            if drain_ns > 0:
                yield env.timeout(drain_ns)
            yield from self._send_response(env, txn)
    def _send_response(self, env: simpy.Environment, txn: Any) -> Generator:
        """Create a ResponseMsg and send it on the reverse path.
        Without a context, or when the path has fewer than two hops, no
        response is sent and ``txn.done`` is triggered directly instead.
        """
        back_path = list(reversed(txn.path))
        if len(back_path) < 2 or not self.ctx:
            txn.done.succeed()
            return
        from kernbench.runtime_api.kernel import ResponseMsg
        # The cube index is taken from the second dotted component of the node id.
        id_parts = self.node.id.split(".")
        cube_id = int(id_parts[1].replace("cube", ""))
        resp_msg = ResponseMsg(
            correlation_id=txn.request.correlation_id,
            request_id=txn.request.request_id,
            src_cube=cube_id, src_pe=-1, success=True,
        )
        resp_txn = Transaction(
            request=resp_msg, path=back_path, step=0,
            nbytes=0, done=env.event(), is_response=True,
        )
        yield self.out_ports[back_path[1]].put(resp_txn.advance())
@@ -0,0 +1,85 @@
 from __future__ import annotations
 from dataclasses import dataclass
 from kernbench.policy.address.phyaddr import PhysAddr
 class AllocationError(Exception):
    """Raised when an allocation request does not fit in the remaining HBM or TCM space."""
@dataclass(frozen=True)
class AddressConfig:
    """Immutable sizing parameters used by the address allocators."""
    # Topology counts.
    sip_count: int
    cubes_per_sip: int
    pes_per_cube: int
    # HBM capacity of one cube and the number of slices it is divided into.
    hbm_bytes_per_cube: int
    hbm_slices_per_cube: int
    # TCM capacity of one PE and the part of it that is not allocatable.
    tcm_bytes_per_pe: int
    tcm_scheduler_reserved_bytes: int
    sram_bytes_per_cube: int
    @property
    def hbm_slice_bytes(self) -> int:
        """Bytes in one HBM slice: cube capacity floor-divided by the slice count."""
        capacity = self.hbm_bytes_per_cube
        slices = self.hbm_slices_per_cube
        return capacity // slices
    @property
    def tcm_allocatable_bytes(self) -> int:
        """TCM bytes left for allocation after the scheduler reservation."""
        reserved = self.tcm_scheduler_reserved_bytes
        return self.tcm_bytes_per_pe - reserved
 class PEMemAllocator:
    """Bump allocator for one PE's HBM slice and TCM region.
    Each allocation returns the address at the current cursor and then advances
    the cursor by the requested size. Nothing is freed or aligned.
    """
    def __init__(
        self, rack_id: int, sip_id: int, cube_id: int, pe_id: int, cfg: AddressConfig,
    ) -> None:
        self._rack_id = rack_id
        self._sip_id = sip_id
        self._cube_id = cube_id
        self._pe_id = pe_id
        self._cfg = cfg
        # Byte offsets of the next free position in each region.
        self._hbm_cursor = 0
        self._tcm_cursor = 0
    @staticmethod
    def _reject_negative(region: str, nbytes: int) -> None:
        """Raise AllocationError for a negative size, which would rewind the cursor."""
        if nbytes < 0:
            raise AllocationError(
                f"{region} allocation size must be non-negative: {nbytes}"
            )
    def alloc_hbm(self, nbytes: int) -> PhysAddr:
        """Reserve ``nbytes`` in this PE's HBM slice and return its start address.
        Raises:
            AllocationError: if ``nbytes`` is negative or exceeds the space left
                in the slice.
        """
        self._reject_negative("HBM", nbytes)
        if self._hbm_cursor + nbytes > self._cfg.hbm_slice_bytes:
            raise AllocationError(
                f"HBM overflow: need {nbytes}, "
                f"available {self._cfg.hbm_slice_bytes - self._hbm_cursor}"
            )
        pa = PhysAddr.pe_hbm_addr(
            rack_id=self._rack_id, sip_id=self._sip_id, cube_id=self._cube_id,
            pe_id=self._pe_id, pe_local_hbm_offset=self._hbm_cursor,
            slice_size_bytes=self._cfg.hbm_slice_bytes,
        )
        # Advance only after the address was built successfully.
        self._hbm_cursor += nbytes
        return pa
    def alloc_tcm(self, nbytes: int) -> PhysAddr:
        """Reserve ``nbytes`` in this PE's allocatable TCM and return its start address.
        Raises:
            AllocationError: if ``nbytes`` is negative or exceeds the space left
                in the allocatable TCM region.
        """
        self._reject_negative("TCM", nbytes)
        if self._tcm_cursor + nbytes > self._cfg.tcm_allocatable_bytes:
            raise AllocationError(
                f"TCM overflow: need {nbytes}, "
                f"available {self._cfg.tcm_allocatable_bytes - self._tcm_cursor}"
            )
        pa = PhysAddr.pe_tcm_addr(
            rack_id=self._rack_id, sip_id=self._sip_id, cube_id=self._cube_id,
            pe_id=self._pe_id, tcm_offset=self._tcm_cursor,
        )
        # Advance only after the address was built successfully.
        self._tcm_cursor += nbytes
        return pa
    @property
    def hbm_used(self) -> int:
        """HBM bytes handed out so far."""
        return self._hbm_cursor
    @property
    def hbm_total(self) -> int:
        """Size of this PE's HBM slice in bytes."""
        return self._cfg.hbm_slice_bytes
    @property
    def tcm_used(self) -> int:
        """TCM bytes handed out so far."""
        return self._tcm_cursor
    @property
    def tcm_total(self) -> int:
        """Allocatable TCM bytes for this PE."""
        return self._cfg.tcm_allocatable_bytes
@@ -0,0 +1,184 @@
 from __future__ import annotations
 from dataclasses import dataclass
 from enum import IntEnum
 from typing import Literal
 # Largest value that fits in 51 bits (all 51 bits set); upper bound for encoded addresses.
 MAX_51 = (1 << 51) - 1
 class PhysAddrError(Exception):
    """Raised when a physical-address field or value is out of its allowed range."""
 def _chk_range(name: str, v: int, bits: int) -> None:
    if not (0 <= v < (1 << bits)):
        raise PhysAddrError(f"{name} out of range for {bits} bits: {v}")
 def _chk_max(name: str, v: int, maxv: int) -> None:
    if not (0 <= v <= maxv):
        raise PhysAddrError(f"{name} out of range (0..{maxv}): {v}")
 class UnitType(IntEnum):
    """Unit selector found in the PE resource window of a physical address.
    PhysAddr.decode builds it from the 3-bit field at bits [36:34] of the
    local offset and rejects values that are not members.
    """
    PE = 0
    MCPU = 1
    SRAM = 2
@dataclass(frozen=True)
class PhysAddr:
    """
    Immutable 51-bit physical address value object.

    Layout:
      [50:47] rack_id  (4)
      [46:43] sip_id   (4)
      [42:38] sip_seg  (5)   # cube_id
      [37:0]  local_offset (38) => each segment is 256GB

    local_offset:
      [37] selector: 1 = HBM window (128GB reserved), 0 = PE resource window

    PE resource window (selector 0), as unpacked by decode():
      [36:34] unit_type  (3)
      [33:30] pe_id      (4)
      [29]    ext        (1)
      [28:0]  sub_offset (29)

    Only rack_id, sip_id, sip_seg and local_offset take part in encode();
    the remaining fields are the decoded view of local_offset that the
    factory methods and decode() fill in.
    """

    rack_id: int
    sip_id: int
    sip_seg: int
    local_offset: int
    kind: Literal["hbm", "pe_resource", "raw"] = "raw"
    cube_id: int = 0
    unit_type: UnitType = UnitType.PE
    pe_id: int = 0
    ext: int = 0
    sub_offset: int = 0
    hbm_offset: int = 0

    HBM_WINDOW_BYTES = 1 << 37  # 128GB

    def encode(self) -> int:
        """Pack rack_id, sip_id, sip_seg and local_offset into one integer.

        Raises:
            PhysAddrError: if any of the four fields does not fit its bit width.
        """
        field_widths = (
            ("rack_id", self.rack_id, 4),
            ("sip_id", self.sip_id, 4),
            ("sip_seg", self.sip_seg, 5),
            ("local_offset", self.local_offset, 38),
        )
        for label, value, width in field_widths:
            _chk_range(label, value, width)

        packed = self.local_offset
        packed |= self.sip_seg << 38
        packed |= self.sip_id << 43
        packed |= self.rack_id << 47
        if not (0 <= packed <= MAX_51):
            raise PhysAddrError("address exceeds 51-bit space")
        return packed

    @staticmethod
    def decode(addr: int) -> PhysAddr:
        """Split a 51-bit integer into a fully populated PhysAddr.

        Bit 37 of the local offset picks the interpretation: 1 yields an
        "hbm" address, 0 a "pe_resource" address.

        Raises:
            PhysAddrError: if ``addr`` is outside 0..MAX_51 or the unit_type
                bits do not name a UnitType member.
        """
        if not (0 <= addr <= MAX_51):
            raise PhysAddrError("addr must be a 51-bit value")

        off = addr & ((1 << 38) - 1)
        segment = (addr >> 38) & 0x1F
        # Fields shared by both interpretations; the cube id is the segment.
        shared = {
            "rack_id": (addr >> 47) & 0xF,
            "sip_id": (addr >> 43) & 0xF,
            "sip_seg": segment,
            "local_offset": off,
            "cube_id": segment,
        }

        if ((off >> 37) & 0x1) == 1:
            return PhysAddr(
                kind="hbm",
                hbm_offset=int(off & ((1 << 37) - 1)),
                **shared,
            )

        # PE resource decode
        raw_ut = int((off >> 34) & 0x7)
        try:
            unit_type = UnitType(raw_ut)
        except ValueError:
            raise PhysAddrError(f"unknown unit_type: {raw_ut}") from None
        return PhysAddr(
            kind="pe_resource",
            unit_type=unit_type,
            pe_id=int((off >> 30) & 0xF),
            ext=int((off >> 29) & 0x1),
            sub_offset=int(off & ((1 << 29) - 1)),
            hbm_offset=0,
            **shared,
        )

    @staticmethod
    def hbm_addr(*, rack_id: int, sip_id: int, cube_id: int, hbm_offset: int) -> PhysAddr:
        """Build an HBM-window address for ``cube_id`` at ``hbm_offset``.

        rack_id and sip_id are stored as given; they are range-checked only
        when encode() is called.

        Raises:
            PhysAddrError: if cube_id is outside 0..31 or hbm_offset does not
                fit in 37 bits.
        """
        _chk_max("cube_id", cube_id, 31)
        _chk_range("hbm_offset", hbm_offset, 37)
        window_offset = int(hbm_offset)
        return PhysAddr(
            rack_id=rack_id,
            sip_id=sip_id,
            sip_seg=cube_id,
            local_offset=(1 << 37) | window_offset,  # bit 37 selects the HBM window
            kind="hbm",
            cube_id=cube_id,
            hbm_offset=window_offset,
        )

    @staticmethod
    def pe_hbm_addr(
        *,
        rack_id: int,
        sip_id: int,
        cube_id: int,
        pe_id: int,
        pe_local_hbm_offset: int,
        slice_size_bytes: int,
    ) -> PhysAddr:
        """Build an HBM address from a PE id and an offset inside that PE's slice.

        The cube-level HBM offset is ``pe_id * slice_size_bytes +
        pe_local_hbm_offset``.

        Raises:
            PhysAddrError: if cube_id or pe_id is out of range, the local
                offset is not inside ``0..slice_size_bytes - 1``, or the
                resulting offset leaves the 128GB HBM window.
        """
        _chk_max("cube_id", cube_id, 31)
        _chk_range("pe_id", pe_id, 4)
        if not (0 <= pe_local_hbm_offset < slice_size_bytes):
            raise PhysAddrError("pe_local_hbm_offset out of PE local slice range")

        slice_base = int(pe_id) * int(slice_size_bytes)
        hbm_offset = slice_base + int(pe_local_hbm_offset)
        if hbm_offset < 0 or hbm_offset >= PhysAddr.HBM_WINDOW_BYTES:
            raise PhysAddrError("HBM offset exceeds reserved 128GB window")
        return PhysAddr.hbm_addr(
            rack_id=rack_id, sip_id=sip_id, cube_id=cube_id, hbm_offset=hbm_offset
        )

    @staticmethod
    def hbm_pe_id(hbm_offset: int, slice_size_bytes: int) -> int:
        """Return the index of the slice that contains ``hbm_offset``."""
        slice_index = hbm_offset // slice_size_bytes
        return slice_index

    @staticmethod
    def cube_sram_addr(
        *, rack_id: int, sip_id: int, cube_id: int, sram_offset: int,
    ) -> PhysAddr:
        """Build a PE-resource-window address with unit type SRAM.

        Raises:
            PhysAddrError: if cube_id is outside 0..31 or sram_offset does not
                fit in 29 bits.
        """
        _chk_max("cube_id", cube_id, 31)
        _chk_range("sram_offset", sram_offset, 29)
        unit_bits = UnitType.SRAM << 34
        return PhysAddr(
            rack_id=rack_id,
            sip_id=sip_id,
            sip_seg=cube_id,
            local_offset=unit_bits | sram_offset,
            kind="pe_resource",
            cube_id=cube_id,
            unit_type=UnitType.SRAM,
            sub_offset=sram_offset,
        )

    @staticmethod
    def pe_tcm_addr(
        *, rack_id: int, sip_id: int, cube_id: int, pe_id: int, tcm_offset: int,
    ) -> PhysAddr:
        """Build a PE-resource-window address with unit type PE for ``pe_id``.

        Raises:
            PhysAddrError: if cube_id is outside 0..31, pe_id does not fit in
                4 bits, or tcm_offset does not fit in 29 bits.
        """
        _chk_max("cube_id", cube_id, 31)
        _chk_range("pe_id", pe_id, 4)
        _chk_range("tcm_offset", tcm_offset, 29)
        unit_bits = UnitType.PE << 34
        pe_bits = pe_id << 30
        return PhysAddr(
            rack_id=rack_id,
            sip_id=sip_id,
            sip_seg=cube_id,
            local_offset=unit_bits | pe_bits | tcm_offset,
            kind="pe_resource",
            cube_id=cube_id,
            unit_type=UnitType.PE,
            pe_id=pe_id,
            sub_offset=tcm_offset,
        )
@@ -0,0 +1,174 @@
 from __future__ import annotations
 from dataclasses import dataclass
 from math import ceil
 from typing import Literal
@dataclass(frozen=True)
 class DPPolicy:
    """Two-level data-parallel policy: cube-level + pe-level.

    ``cube`` selects how a tensor is distributed across cubes and ``pe`` how
    each cube's portion is distributed across the PEs of that cube. Both
    default to a full copy everywhere. The string values are the names that
    ``resolve_dp_policy`` dispatches on.
    """
    # Cube-level split: full copy, split of the M axis, or split of the K axis.
    cube: Literal["replicate", "shard_m", "shard_k"] = "replicate"
    # PE-level split inside one cube; resolved by the function of the same name.
    pe: Literal["replicate", "column_wise", "row_wise"] = "replicate"
 def resolve_dp_policy(
    policy: DPPolicy,
    *,
    shape: tuple[int, int],
    itemsize: int,
    num_pe: int,
    num_cubes: int = 1,
 ) -> list[ShardSpec]:
    """Expand a DPPolicy into the per-PE shard list.

    With ``num_cubes <= 1`` only the pe-level policy is applied and the
    cube-level policy is not looked at. Otherwise the cube-level policy
    fixes the shape each cube receives and a per-cube byte stride; the
    pe-level policy is resolved for every cube and each shard is rewritten
    to the flat index ``cube_id * num_pe + pe_id`` with the cube's byte
    offset added.

    The shard_m / shard_k splits use floor division, so a dimension that is
    not a multiple of ``num_cubes`` loses its remainder.

    Raises:
        ValueError: for an unknown pe-level policy, or (only when
            ``num_cubes > 1``) an unknown cube-level policy.
    """
    pe_resolvers = {
        "replicate": replicate,
        "column_wise": column_wise,
        "row_wise": row_wise,
    }
    resolver = pe_resolvers.get(policy.pe)
    if resolver is None:
        raise ValueError(f"Unknown pe-level policy: {policy.pe}")

    if num_cubes <= 1:
        return resolver(shape=shape, itemsize=itemsize, num_pe=num_pe)

    # Two-level resolution: cube-level → pe-level.
    # The per-cube shape and stride do not depend on the cube id, so they
    # are worked out once up front.
    rows, cols = shape
    if policy.cube == "replicate":
        per_cube_shape = (rows, cols)
        cube_stride = 0
    elif policy.cube == "shard_m":
        rows_per_cube = rows // num_cubes
        per_cube_shape = (rows_per_cube, cols)
        cube_stride = rows_per_cube * cols * itemsize
    elif policy.cube == "shard_k":
        cols_per_cube = cols // num_cubes
        per_cube_shape = (rows, cols_per_cube)
        cube_stride = rows * cols_per_cube * itemsize
    else:
        raise ValueError(f"Unknown cube-level policy: {policy.cube}")

    flat_shards: list[ShardSpec] = []
    for cube_id in range(num_cubes):
        cube_base = cube_id * cube_stride
        first_flat_index = cube_id * num_pe
        for local in resolver(shape=per_cube_shape, itemsize=itemsize, num_pe=num_pe):
            flat_shards.append(ShardSpec(
                pe_index=first_flat_index + local.pe_index,
                offset_bytes=cube_base + local.offset_bytes,
                nbytes=local.nbytes,
            ))
    return flat_shards
@dataclass(frozen=True)
 class ShardSpec:
    """One placement entry: which PE receives how many bytes, from which byte offset."""
    # Target PE index. The single-level resolvers number PEs from 0;
    # resolve_dp_policy rewrites it to cube_id * num_pe + pe_id when num_cubes > 1.
    pe_index: int
    # Byte offset assigned by the placement function (0 for replicated copies).
    offset_bytes: int
    # Size of the shard in bytes.
    nbytes: int
 def column_wise(
    *, shape: tuple[int, int], itemsize: int, num_pe: int,
 ) -> list[ShardSpec]:
    """Give each of ``num_pe`` PEs an (M, K // num_pe) share of the tensor.

    Shard ``i`` is ``M * (K // num_pe) * itemsize`` bytes long and starts
    ``i`` shard-lengths into the tensor. Floor division means that when K is
    not a multiple of ``num_pe`` the last ``K % num_pe`` columns are not
    covered by any shard.
    """
    rows, cols = shape
    per_pe_bytes = rows * (cols // num_pe) * itemsize
    return [
        ShardSpec(pe_index=pe, offset_bytes=pe * per_pe_bytes, nbytes=per_pe_bytes)
        for pe in range(num_pe)
    ]
 def row_wise(
    *, shape: tuple[int, int], itemsize: int, num_pe: int,
 ) -> list[ShardSpec]:
    """Give each of ``num_pe`` PEs an (M // num_pe, K) share of the tensor.

    Shard ``i`` is ``(M // num_pe) * K * itemsize`` bytes long and starts
    ``i`` shard-lengths into the tensor. Floor division means that when M is
    not a multiple of ``num_pe`` the last ``M % num_pe`` rows are not
    covered by any shard.
    """
    rows, cols = shape
    per_pe_bytes = (rows // num_pe) * cols * itemsize
    return [
        ShardSpec(pe_index=pe, offset_bytes=pe * per_pe_bytes, nbytes=per_pe_bytes)
        for pe in range(num_pe)
    ]
 def replicate(
    *, shape: tuple[int, int], itemsize: int, num_pe: int,
 ) -> list[ShardSpec]:
    """Give every PE a complete (M, K) copy.

    Returns one shard per PE, each at offset 0 and ``M * K * itemsize``
    bytes long.
    """
    rows, cols = shape
    copy_bytes = rows * cols * itemsize
    return [
        ShardSpec(pe_index=pe, offset_bytes=0, nbytes=copy_bytes)
        for pe in range(num_pe)
    ]
 def tiled_column_major(
    *, shape: tuple[int, int], itemsize: int, num_pe: int,
    tile_m: int, tile_k: int,
 ) -> list[ShardSpec]:
    """2D tiling, column-major order (K axis first), round-robin across PEs.

    Tiles are enumerated with the M-tile index as the outer loop and the
    K-tile index as the inner loop; the n-th tile goes to PE ``n % num_pe``.
    Each shard's offset is the row-major byte offset of the tile's first
    element.

    NOTE(review): every shard reports the full ``tile_m * tile_k * itemsize``
    size, including edge tiles when M or K is not a multiple of the tile
    size — confirm whether partial tiles are meant to be clipped.
    """
    rows, cols = shape
    grid_m = ceil(rows / tile_m)
    grid_k = ceil(cols / tile_k)
    bytes_per_tile = tile_m * tile_k * itemsize
    row_stride = cols * itemsize
    # Outer index: mi (M tiles); inner index: ki (K tiles).
    tile_coords = ((mi, ki) for mi in range(grid_m) for ki in range(grid_k))
    return [
        ShardSpec(
            pe_index=seq % num_pe,
            offset_bytes=(mi * tile_m * row_stride) + (ki * tile_k * itemsize),
            nbytes=bytes_per_tile,
        )
        for seq, (mi, ki) in enumerate(tile_coords)
    ]
 def tiled_row_major(
    *, shape: tuple[int, int], itemsize: int, num_pe: int,
    tile_m: int, tile_k: int,
 ) -> list[ShardSpec]:
    """2D tiling, row-major order (M axis first), round-robin across PEs.

    Tiles are enumerated with the K-tile index as the outer loop and the
    M-tile index as the inner loop; the n-th tile goes to PE ``n % num_pe``.
    Each shard's offset is the row-major byte offset of the tile's first
    element.

    NOTE(review): every shard reports the full ``tile_m * tile_k * itemsize``
    size, including edge tiles when M or K is not a multiple of the tile
    size — confirm whether partial tiles are meant to be clipped.
    """
    rows, cols = shape
    grid_m = ceil(rows / tile_m)
    grid_k = ceil(cols / tile_k)
    bytes_per_tile = tile_m * tile_k * itemsize
    row_stride = cols * itemsize
    # Outer index: ki (K tiles); inner index: mi (M tiles).
    tile_coords = ((mi, ki) for ki in range(grid_k) for mi in range(grid_m))
    return [
        ShardSpec(
            pe_index=seq % num_pe,
            offset_bytes=(mi * tile_m * row_stride) + (ki * tile_k * itemsize),
            nbytes=bytes_per_tile,
        )
        for seq, (mi, ki) in enumerate(tile_coords)
    ]
@@ -0,0 +1,184 @@
 from __future__ import annotations
 import heapq
 from collections import defaultdict
 from kernbench.policy.address.phyaddr import PhysAddr, UnitType
 from kernbench.topology.types import TopologyGraph
 class RoutingError(Exception):
    """Raised when an address or node cannot be resolved, or no path exists."""
 class AddressResolver:
    """Translate a PhysAddr into the node_id it targets in the compiled graph.

    The class also offers named lookups (find_m_cpu, find_pcie_ep, …) so
    component implementations never have to assemble node_id strings
    themselves. Keeping the naming convention in this one place means a
    single change propagates everywhere (ADR-0015 D4).
    """

    def __init__(self, graph: TopologyGraph) -> None:
        """Snapshot the graph's node ids and derive the HBM slice size.

        The slice size is the cube's HBM capacity in bytes (GB value from
        the spec's memory map times 2**30) floor-divided by the slice count.
        """
        self._node_ids = set(graph.nodes)
        memory_map = graph.spec["cube"]["memory_map"]
        cube_hbm_bytes = memory_map["hbm_total_gb_per_cube"] * (1 << 30)
        self._slice_size_bytes = cube_hbm_bytes // memory_map["hbm_slices_per_cube"]

    # ── Physical-address resolution ──────────────────────────────────
    def resolve(self, addr: PhysAddr) -> str:
        """Return the node_id addressed by ``addr``.

        HBM addresses map to the HBM controller slice containing the
        offset; PE-resource addresses map to the PE's TCM, the cube SRAM or
        the cube M_CPU depending on the unit type. rack_id is not used.

        Raises:
            RoutingError: for an unsupported kind or unit type, or when the
                resulting node is not part of the topology.
        """
        s, c = addr.sip_id, addr.cube_id
        if addr.kind not in ("hbm", "pe_resource"):
            raise RoutingError(f"unsupported address kind: {addr.kind}")

        if addr.kind == "hbm":
            pe_slice = PhysAddr.hbm_pe_id(addr.hbm_offset, self._slice_size_bytes)
            node_id = f"sip{s}.cube{c}.hbm_ctrl.slice{pe_slice}"
        elif addr.unit_type == UnitType.PE:
            node_id = f"sip{s}.cube{c}.pe{addr.pe_id}.pe_tcm"
        elif addr.unit_type == UnitType.SRAM:
            node_id = f"sip{s}.cube{c}.sram"
        elif addr.unit_type == UnitType.MCPU:
            node_id = f"sip{s}.cube{c}.m_cpu"
        else:
            raise RoutingError(f"unsupported unit_type: {addr.unit_type}")

        if node_id in self._node_ids:
            return node_id
        raise RoutingError(f"node {node_id} not found in topology")

    # ── Named node lookups ───────────────────────────────────────────
    def find_m_cpu(self, sip: int, cube: int) -> str:
        """Return the M_CPU node id of ``cube`` on ``sip``; RoutingError if absent."""
        node_id = f"sip{sip}.cube{cube}.m_cpu"
        if node_id in self._node_ids:
            return node_id
        raise RoutingError(f"M_CPU not found: {node_id}")

    def find_pcie_ep(self, sip: int, io_id: str = "io0") -> str:
        """Return the PCIE_EP node id of ``io_id`` on ``sip``; RoutingError if absent."""
        node_id = f"sip{sip}.{io_id}.pcie_ep"
        if node_id in self._node_ids:
            return node_id
        raise RoutingError(f"PCIE_EP not found: {node_id}")

    def find_io_cpu(self, sip: int, io_id: str = "io0") -> str:
        """Return the IO_CPU node id of ``io_id`` on ``sip``; RoutingError if absent."""
        node_id = f"sip{sip}.{io_id}.io_cpu"
        if node_id in self._node_ids:
            return node_id
        raise RoutingError(f"IO_CPU not found: {node_id}")

    def find_all_pcie_eps(self) -> list[str]:
        """Return all PCIE_EP node ids across all SIPs, sorted."""
        endpoints = [nid for nid in self._node_ids if nid.endswith(".pcie_ep")]
        endpoints.sort()
        return endpoints
 class PathRouter:
    """Find data-path from a source PE (or arbitrary node) to a destination node.

    Three adjacency maps are built once from ``graph.edges``:
      _adj          — excludes command edges (used by PE DMA routing, find_path)
      _adj_all      — includes all edges (used by component-to-component routing,
                      find_node_path; required because M_CPU↔NOC links are "command")
      _adj_mcpu_dma — excludes pe_internal / pe_to_xbar edges (used by
                      find_mcpu_dma_path for cross-cube routing)

    Each map goes from a source node id to a list of ``(neighbor, weight)``
    pairs. The weight is the edge's ``routing_weight_mm`` when that is not
    None and its ``distance_mm`` otherwise. Path queries never modify the
    maps.
    """
    # Edge kinds excluded from M_CPU DMA adjacency: prevents routing through
    # PE-internal pipeline nodes when computing DMA paths.
    _MCPU_DMA_EXCLUDE = {"pe_internal", "pe_to_xbar"}

    def __init__(self, graph: TopologyGraph) -> None:
        """Build the three adjacency maps from ``graph.edges``."""
        self._adj: dict[str, list[tuple[str, float]]] = defaultdict(list)
        self._adj_all: dict[str, list[tuple[str, float]]] = defaultdict(list)
        self._adj_mcpu_dma: dict[str, list[tuple[str, float]]] = defaultdict(list)
        for e in graph.edges:
            w = e.routing_weight_mm if e.routing_weight_mm is not None else e.distance_mm
            self._adj_all[e.src].append((e.dst, w))
            if e.kind != "command":
                self._adj[e.src].append((e.dst, w))
            if e.kind not in self._MCPU_DMA_EXCLUDE:
                self._adj_mcpu_dma[e.src].append((e.dst, w))

    def find_path(self, src_pe: str, dst_node: str) -> list[str]:
        """PE DMA routing: prepends .pe_dma, excludes command edges."""
        start = f"{src_pe}.pe_dma"
        return self._run_dijkstra(self._adj, start, dst_node)

    def find_path_with_distance(self, src_pe: str, dst_node: str) -> tuple[list[str], float]:
        """Like find_path, but also return the total weight of the path."""
        start = f"{src_pe}.pe_dma"
        return self._run_dijkstra_with_dist(self._adj, start, dst_node)

    def find_mcpu_dma_path(self, m_cpu_id: str, dst_hbm_slice_id: str) -> list[str]:
        """M_CPU DMA path: never routes through PE-internal nodes (ADR-0015 D5).

        Same-cube: deterministic [m_cpu, noc, xbar.pe_i, hbm_ctrl.slice_i].
        Cross-cube: Dijkstra via _adj_mcpu_dma (pe_internal/pe_to_xbar excluded)
                    → routes through NOC → UCIe → target cube NOC → xbar → HBM.

        The cube of a node is taken to be the first two dot-separated
        components of its id. The same-cube path is assembled from names
        only; it is not checked against the graph.
        """
        m_cube = ".".join(m_cpu_id.split(".")[:2])
        d_cube = ".".join(dst_hbm_slice_id.split(".")[:2])
        if m_cube == d_cube:
            # The slice index is whatever follows the last "slice" in the id.
            slice_idx = int(dst_hbm_slice_id.rsplit("slice", 1)[1])
            return [
                m_cpu_id,
                f"{m_cube}.noc",
                f"{m_cube}.xbar.pe{slice_idx}",
                dst_hbm_slice_id,
            ]
        return self._run_dijkstra(self._adj_mcpu_dma, m_cpu_id, dst_hbm_slice_id)

    def find_node_path(self, src: str, dst: str) -> list[str]:
        """General routing between arbitrary nodes, including command edges.

        Used by components (IoCpuComponent, MCpuComponent) that route through
        M_CPU↔NOC command-kind links.
        """
        return self._run_dijkstra(self._adj_all, src, dst)

    def _run_dijkstra(
        self,
        adj: dict[str, list[tuple[str, float]]],
        start: str,
        goal: str,
    ) -> list[str]:
        """Return only the node list of the cheapest path from start to goal."""
        path, _ = self._run_dijkstra_with_dist(adj, start, goal)
        return path

    def _run_dijkstra_with_dist(
        self,
        adj: dict[str, list[tuple[str, float]]],
        start: str,
        goal: str,
    ) -> tuple[list[str], float]:
        """Dijkstra over ``adj``; return ``(path, total_weight)``.

        The path includes both ``start`` and ``goal``. ``start == goal``
        yields ``([start], 0.0)`` without consulting ``adj``.

        Raises:
            RoutingError: if ``goal`` cannot be reached from ``start``.
        """
        if start == goal:
            return [start], 0.0
        best: dict[str, float] = {start: 0.0}
        prev: dict[str, str] = {}
        heap: list[tuple[float, str]] = [(0.0, start)]
        while heap:
            d, node = heapq.heappop(heap)
            if node == goal:
                # Walk the predecessor chain back to the start.
                path: list[str] = []
                cur = goal
                while cur != start:
                    path.append(cur)
                    cur = prev[cur]
                path.append(start)
                path.reverse()
                return path, d
            if d > best.get(node, float("inf")):
                # Stale heap entry: a cheaper route to this node was already expanded.
                continue
            # .get() rather than adj[node]: the maps are defaultdicts, and
            # indexing would insert an empty list for every dead-end node
            # visited, mutating the router's shared state on a read-only query.
            for neighbor, edge_dist in adj.get(node, ()):
                new_d = d + edge_dist
                if new_d < best.get(neighbor, float("inf")):
                    best[neighbor] = new_d
                    prev[neighbor] = node
                    heapq.heappush(heap, (new_d, neighbor))
        raise RoutingError(f"no path from {start} to {goal}")

    # ── backward-compat shims (used by existing tests) ───────────────
    def _dijkstra(self, start: str, goal: str) -> list[str]:
        """Shim: cheapest path over the command-free adjacency map."""
        return self._run_dijkstra(self._adj, start, goal)

    def _dijkstra_with_dist(self, start: str, goal: str) -> tuple[list[str], float]:
        """Shim: cheapest path and its weight over the command-free adjacency map."""
        return self._run_dijkstra_with_dist(self._adj, start, goal)
@@ -0,0 +1,96 @@
 from __future__ import annotations
 from collections.abc import Callable
 from enum import Enum
 from typing import Any
 from kernbench.common.types import Completion, SimEngine, Trace
 from .context import RuntimeContext
 from .types import BenchResult, DeviceSelector
 class CompletionPolicy(str, Enum):
    """How run_bench derives the overall result from the submitted requests."""
    # Report the completion of the most recently submitted handle.
    LAST_SUBMITTED = "last_submitted"
    LAST_COMPLETED = "last_completed"  # requires trace/timestamps or engine support; stub for now
    # Report the first failing completion in submission order, otherwise ok=True.
    ALL_OK_FAIL_FAST = "all_ok_fail_fast"
 # Benchmark body: receives the RuntimeContext; run_bench ignores its return value.
 BenchFn = Callable[[RuntimeContext], Any]
 # Builds the SimEngine; run_bench calls it as engine_factory(topology, device).
 EngineFactory = Callable[[object, DeviceSelector], SimEngine]
 def run_bench(
    *,
    topology: object,
    bench_fn: BenchFn,
    device: DeviceSelector,
    engine_factory: EngineFactory,
    correlation_id: str = "bench0",
    completion_policy: CompletionPolicy = CompletionPolicy.LAST_SUBMITTED,
 ) -> BenchResult:
    """
    Minimal bench runner.

    Builds the engine, runs ``bench_fn`` against a fresh RuntimeContext,
    waits for every submitted handle and condenses the outcome into one
    BenchResult according to ``completion_policy``.

    - topology: compiled topology object (opaque to runtime here); its
      ``spec`` is read from ``topology.topology_obj`` when that attribute
      exists, else from ``topology`` itself, and is None when absent
    - bench_fn: callable that receives RuntimeContext and submits requests
    - device: DeviceSelector ("all" or "sip:<N>")
    - engine_factory: builds sim_engine for given topology & device
    - completion_policy: how to determine overall completion/result

    A bench that submits nothing yields a failed completion with error code
    "NO_REQUESTS". ALL_OK_FAIL_FAST returns the first non-ok completion in
    submission order, or ok=True when there is none. Every other policy,
    including the LAST_COMPLETED placeholder, reports the last submitted
    handle.
    """
    engine = engine_factory(topology, device)

    # Extract spec from TopologyHandle or TopologyGraph
    topo_obj = getattr(topology, "topology_obj", topology)
    ctx = RuntimeContext(
        engine=engine,
        target_device=device,
        correlation_id=correlation_id,
        spec=getattr(topo_obj, "spec", None),
    )

    bench_fn(ctx)
    ctx.wait_all()

    # An empty trace list is reported as None.
    collected_traces = ctx._traces or None
    submitted = ctx.handles()

    def _result(completion, trace):
        """Wrap a completion/trace pair with this run's shared fields."""
        return BenchResult(
            completion=completion,
            correlation_id=correlation_id,
            trace=trace,
            traces=collected_traces,
        )

    if not submitted:
        nothing_submitted = Completion(
            ok=False, error_code="NO_REQUESTS", error_message="Bench submitted no requests"
        )
        return _result(nothing_submitted, None)

    if completion_policy == CompletionPolicy.ALL_OK_FAIL_FAST:
        latest_trace: Trace | None = None
        for handle in submitted:
            outcome, handle_trace = engine.get_completion(handle)
            if handle_trace is not None:
                latest_trace = handle_trace
            if not outcome.ok:
                return _result(outcome, latest_trace)
        return _result(Completion(ok=True), latest_trace)

    # LAST_SUBMITTED, and LAST_COMPLETED which falls back to it until the
    # engine exposes timing information.
    completion, trace = engine.get_completion(submitted[-1])
    return _result(completion, trace)
@@ -0,0 +1,282 @@
 # kernbench/runtime_api/context.py
 from __future__ import annotations
 from dataclasses import dataclass, field
 from typing import Any
 from kernbench.common.types import Completion, RequestHandle, SimEngine
 from .types import DeviceSelector
@dataclass
 class RuntimeContext:
    """Per-benchmark facade over a SimEngine.

    Records every handle submitted through it, remembers which handles have
    already been waited on, collects annotated trace entries, and offers a
    small tensor/kernel API (zeros, empty, launch) on top of submit/wait.
    """
    # Engine used for submit / wait / get_completion.
    engine: SimEngine
    # Device selector this context was created for (stored; not read in this class).
    target_device: DeviceSelector
    # Copied into every message this context builds.
    correlation_id: str
    # Topology spec dict; only the tensor API needs it (see _ensure_allocators).
    spec: dict | None = None
    # Handles in submission order.
    _handles: list[RequestHandle] = field(default_factory=list, init=False)
    # Handles that wait() has already processed.
    _completed: set[RequestHandle] = field(default_factory=set, init=False)
    # Flat PE index -> allocator, filled lazily by _ensure_allocators.
    _allocators: dict[int, Any] = field(default_factory=dict, init=False)
    # Counter behind auto-generated tensor names ("t1", "t2", ...).
    _tensor_counter: int = field(default=0, init=False)
    # Trace entries recorded by wait() calls that were given _meta.
    _traces: list[dict] = field(default_factory=list, init=False)
    def submit(self, request: Any) -> RequestHandle:
        """Pass ``request`` to the engine and remember the returned handle.

        Raises:
            AttributeError: if the engine has no ``submit`` attribute.
        """
        submit_fn = getattr(self.engine, "submit", None)
        if submit_fn is None:
            raise AttributeError("Engine does not implement submit(request) -> RequestHandle.")
        handle: RequestHandle = submit_fn(request)  # type: ignore[call-arg]
        self._handles.append(handle)
        return handle
    def is_completed(self, handle: RequestHandle) -> bool:
        """Return True once wait() has processed ``handle`` on this context."""
        return handle in self._completed
    def wait(self, handle: RequestHandle, *, _meta: dict | None = None) -> Completion:
        """Wait for ``handle`` and return its Completion.

        The engine's ``wait(handle)`` is called when the engine has such an
        attribute; the completion is then fetched with ``get_completion``
        and the handle is marked completed. If ``_meta`` is given and the
        engine returned a trace, a trace entry (the trace dict, or
        ``{"raw": trace}`` for non-dict traces, updated with ``_meta``) is
        appended to ``_traces``.

        For a handle that was already waited on, only the completion is
        re-fetched; no trace entry is recorded, even if ``_meta`` is given.
        """
        if handle in self._completed:
            completion, trace = self.engine.get_completion(handle)
            return completion
        wait_fn = getattr(self.engine, "wait", None)
        if wait_fn is not None:
            wait_fn(handle)  # type: ignore[misc]
        completion, trace = self.engine.get_completion(handle)
        self._completed.add(handle)
        if _meta is not None and trace is not None:
            # Copy dict traces so the engine's own object is not modified.
            entry = dict(trace) if isinstance(trace, dict) else {"raw": trace}
            entry.update(_meta)
            self._traces.append(entry)
        return completion
    def wait_all(self) -> None:
        """Wait on every submitted handle that has not been waited on yet."""
        for h in self._handles:
            if h not in self._completed:
                self.wait(h)
    def handles(self) -> list[RequestHandle]:
        """Return a copy of the submitted handles, in submission order."""
        return list(self._handles)
    # ── PyTorch-like tensor API ──────────────────────────────────────
    def _ensure_allocators(self) -> dict:
        """Lazily create PEMemAllocator instances from spec.

        Returns the cached mapping when it is already populated. Otherwise
        reads the SIP / cube / PE counts and memory sizes from ``spec``
        (falling back to the defaults written below for missing keys),
        creates one allocator per (sip, cube, pe) and stores
        ``_pes_per_cube``, ``_num_cubes`` and ``_num_sips`` on the instance
        for later use by _create_tensor.

        Raises:
            RuntimeError: if ``spec`` is None.
        """
        if self._allocators:
            return self._allocators
        if self.spec is None:
            raise RuntimeError(
                "RuntimeContext.spec is required for tensor operations. "
                "Pass spec=graph.spec when creating RuntimeContext."
            )
        # Imported here rather than at module level.
        from kernbench.policy.address.allocator import AddressConfig, PEMemAllocator
        system = self.spec.get("system", {})
        cube = self.spec.get("cube", {})
        mm = cube.get("memory_map", {})
        pe_template = cube.get("pe_template", {})
        pe_comps = pe_template.get("components", {})
        tcm_cfg = pe_comps.get("pe_tcm", {}).get("attrs", {})
        sip_count = system.get("sips", {}).get("count", 1)
        cubes_per_sip = system.get("sips", {}).get("cubes_per_sip", 16)
        # PEs per cube = PEs per corner x number of corners.
        pes_per_cube = (
            cube.get("pe_layout", {}).get("pe_per_corner", 2)
            * len(cube.get("pe_layout", {}).get("corners", ["NW", "NE", "SW", "SE"]))
        )
        hbm_gb = mm.get("hbm_total_gb_per_cube", 48)
        hbm_slices = mm.get("hbm_slices_per_cube", 8)
        tcm_mb = tcm_cfg.get("size_mb", 16)
        # NOTE(review): the scheduler-reserved TCM (4 MiB) and SRAM size
        # (32 MiB) are fixed here and not read from spec — confirm intended.
        cfg = AddressConfig(
            sip_count=sip_count,
            cubes_per_sip=cubes_per_sip,
            pes_per_cube=pes_per_cube,
            hbm_bytes_per_cube=hbm_gb * (1 << 30),
            hbm_slices_per_cube=hbm_slices,
            tcm_bytes_per_pe=tcm_mb * (1 << 20),
            tcm_scheduler_reserved_bytes=4 * (1 << 20),
            sram_bytes_per_cube=32 * (1 << 20),
        )
        # Create allocators for all SIPs × cubes × PEs
        # Flat index: sip_id * cubes_per_sip * pes_per_cube + cube_id * pes_per_cube + pe_id
        self._pes_per_cube = pes_per_cube
        self._num_cubes = cubes_per_sip
        self._num_sips = sip_count
        cubes_x_pes = cubes_per_sip * pes_per_cube
        for sip_id in range(sip_count):
            for cube_id in range(cubes_per_sip):
                for pe_id in range(pes_per_cube):
                    flat_idx = sip_id * cubes_x_pes + cube_id * pes_per_cube + pe_id
                    # rack_id is always 0 here.
                    self._allocators[flat_idx] = PEMemAllocator(
                        rack_id=0, sip_id=sip_id, cube_id=cube_id, pe_id=pe_id, cfg=cfg,
                    )
        return self._allocators
    def _next_tensor_name(self) -> str:
        """Return the next auto-generated tensor name ("t1", "t2", ...)."""
        self._tensor_counter += 1
        return f"t{self._tensor_counter}"
    def zeros(
        self,
        shape: tuple[int, ...],
        dtype: str = "f16",
        *,
        placement: list | None = None,
        dp: Any = None,
        name: str | None = None,
    ):
        """Create a tensor and deploy to HBM with zero-fill (like torch.zeros)."""
        return self._create_tensor(shape, dtype, placement, name, pattern="zero", dp=dp)
    def empty(
        self,
        shape: tuple[int, ...],
        dtype: str = "f16",
        *,
        placement: list | None = None,
        dp: Any = None,
        name: str | None = None,
    ):
        """Allocate a tensor in HBM without initialization (like torch.empty)."""
        return self._create_tensor(shape, dtype, placement, name, pattern=None, dp=dp)
    def _create_tensor(
        self,
        shape: tuple[int, ...],
        dtype: str,
        placement: list | None,
        name: str | None,
        pattern: str | None,
        dp: Any = None,
    ):
        """Shared implementation of zeros() and empty().

        Placement is chosen in this order: a DPPolicy passed as ``dp`` is
        resolved across all cubes of all SIPs; otherwise an explicit
        ``placement`` is used as given; otherwise the whole tensor goes to
        PE index 0. The tensor is then deployed through the allocators and,
        when ``pattern`` is not None, one MemoryWriteMsg per shard is
        submitted and waited on before returning.

        NOTE(review): a ``dp`` that is not a DPPolicy instance is ignored
        without any error — confirm this is intended.
        NOTE(review): the DPPolicy path reads shape[0] and shape[1], so it
        needs at least a 2-D shape; extra dimensions are ignored.
        """
        from kernbench.policy.placement.dp import DPPolicy, ShardSpec, resolve_dp_policy
        from kernbench.runtime_api.kernel import MemoryWriteMsg
        from kernbench.runtime_api.tensor import Tensor, deploy_tensor, dtype_itemsize
        tensor_name = name or self._next_tensor_name()
        t = Tensor(shape=shape, dtype=dtype, name=tensor_name)
        dp_policy: DPPolicy | None = None
        # Resolve placement: dp= takes priority over placement=
        if dp is not None and isinstance(dp, DPPolicy):
            dp_policy = dp
            # Called for its side effect too: it sets _num_sips, _num_cubes
            # and _pes_per_cube, which are read just below.
            allocators = self._ensure_allocators()
            itemsize = dtype_itemsize(dtype)
            shape_2d = (shape[0], shape[1])  # type: tuple[int, int]
            total_cubes = self._num_sips * self._num_cubes
            placement = resolve_dp_policy(
                dp, shape=shape_2d, itemsize=itemsize,
                num_pe=self._pes_per_cube, num_cubes=total_cubes,
            )
        elif placement is None:
            placement = [ShardSpec(pe_index=0, offset_bytes=0, nbytes=t.nbytes)]
        # Infer target_pe from placement: multi-PE → "all", single PE → pe_index
        pe_indices = {s.pe_index for s in placement}
        target_pe: int | str = "all" if len(pe_indices) > 1 else next(iter(pe_indices))
        t.to(placement=placement, target_pe=target_pe, dp_policy=dp_policy)
        # Allocate PAs via PEMemAllocator
        allocators = self._ensure_allocators()
        handle = deploy_tensor(
            name=tensor_name,
            shape=shape,
            dtype=dtype,
            placement=placement,
            allocators=allocators,
        )
        t._handle = handle
        # Submit MemoryWriteMsg per shard (deploy data to device)
        if pattern is not None:
            for shard in handle.shards:
                # NOTE(review): request_id contains only the tensor name and
                # shard.pe, so shards with the same pe value on different
                # cubes/SIPs get the same id — confirm the engine does not
                # need it to be unique.
                h = self.submit(MemoryWriteMsg(
                    correlation_id=self.correlation_id,
                    request_id=f"deploy_{tensor_name}_pe{shard.pe}",
                    dst_sip=shard.sip, dst_cube=shard.cube, dst_pe=shard.pe,
                    dst_pa=shard.pa, nbytes=shard.nbytes, pattern=pattern,
                    target_cubes=(shard.cube,), target_pe=shard.pe,
                ))
                # Each write is waited on before the next one is submitted.
                self.wait(h, _meta={
                    "phase": "memory_write", "name": tensor_name,
                    "sip": shard.sip, "cube": shard.cube, "pe": shard.pe,
                    "nbytes": shard.nbytes,
                })
        return t
    def launch(
        self,
        kernel_name: str,
        kernel_fn: Any,
        *args: Any,
        **kwargs: Any,
    ) -> RequestHandle:
        """Register and launch a kernel (like a fused torch op).

        Positional args: Tensor objects become TensorArg, int/float become ScalarArg.
        Keyword args: become ScalarArg (name is discarded, order preserved).

        The launch is submitted and waited on before this method returns,
        so the returned handle has already been processed by wait().

        NOTE(review): positional or keyword values that are neither Tensor
        nor int/float are skipped without any error, which shifts the
        position of later kernel arguments — confirm this is intended.
        NOTE(review): bool passes the int check and is sent as an "i32" scalar.
        """
        from kernbench.runtime_api.kernel import (
            KernelLaunchMsg,
            KernelRef,
            ScalarArg,
        )
        from kernbench.runtime_api.tensor import Tensor
        from kernbench.triton_emu.registry import register_kernel
        # Register kernel (idempotent)
        try:
            register_kernel(kernel_name, kernel_fn)
        except ValueError:
            # NOTE(review): presumably raised for an already registered
            # name — confirm no other ValueError is hidden by this.
            pass
        # Build kernel args from positional + keyword args
        kernel_args: list = []
        target_pe: int | str = 0
        for a in args:
            if isinstance(a, Tensor):
                kernel_args.append(a.to_tensor_arg())
                # Infer target_pe from tensor DP metadata
                if a._dp_metadata is not None:
                    dp_target = a._dp_metadata.target_pe
                    # "all" from any tensor wins and is never overridden;
                    # otherwise the last tensor with an int target decides.
                    if dp_target == "all":
                        target_pe = "all"
                    elif isinstance(dp_target, int) and target_pe != "all":
                        target_pe = dp_target
            elif isinstance(a, (int, float)):
                dtype_str = "f32" if isinstance(a, float) else "i32"
                kernel_args.append(ScalarArg(dtype=dtype_str, value=a))
        for v in kwargs.values():
            if isinstance(v, (int, float)):
                dtype_str = "f32" if isinstance(v, float) else "i32"
                kernel_args.append(ScalarArg(dtype=dtype_str, value=v))
        # Determine target cubes from all tensor shards
        cube_set: set[int] = set()
        for a in args:
            if isinstance(a, Tensor) and a._handle is not None:
                for s in a._handle.shards:
                    cube_set.add(s.cube)
        # Falls back to cube 0 when no tensor argument has been deployed.
        target_cubes = tuple(sorted(cube_set)) if cube_set else (0,)
        # Collect scalar values for GEMM FLOP calculation
        scalar_vals = [a.value for a in kernel_args if hasattr(a, "value")]
        h = self.submit(KernelLaunchMsg(
            correlation_id=self.correlation_id,
            request_id=kernel_name,
            kernel_ref=KernelRef(name=kernel_name, kind="builtin"),
            args=tuple(kernel_args),
            target_cubes=target_cubes,
            target_pe=target_pe,
        ))
        self.wait(h, _meta={
            "phase": "kernel", "name": kernel_name,
            "target_pe": target_pe, "scalars": scalar_vals,
        })
        return h
@@ -0,0 +1,123 @@
 from __future__ import annotations
 from dataclasses import dataclass
 from typing import Literal, TypeAlias
@dataclass(frozen=True)
 class MemoryWriteMsg:
    """Immutable request describing a write of ``nbytes`` into device memory.

    ``GraphEngine._entry_points`` uses ``dst_sip`` to pick the PCIe endpoint /
    IO CPU pair the request is injected at, and ``nbytes`` as the transaction
    payload size.
    """
    correlation_id: str
    request_id: str
    # Destination location (SIP / cube / PE indices).
    dst_sip: int
    dst_cube: int
    dst_pe: int
    dst_pa: int  # NOTE(review): presumably an encoded PhysAddr — confirm against caller
    nbytes: int
    # NOTE(review): presumably `pattern` is only meaningful when src_kind == "pattern".
    src_kind: Literal["pattern", "host_buffer_ref"] = "pattern"
    pattern: str | None = None
    # Either explicit targets or the literal "all".
    target_cubes: tuple[int, ...] | Literal["all"] = "all"
    target_pe: int | Literal["all"] = "all"
    msg_type: Literal["memory_write"] = "memory_write"  # constant discriminator tag
@dataclass(frozen=True)
 class MemoryReadMsg:
    """Immutable request describing a read of ``nbytes`` from device memory.

    ``GraphEngine._entry_points`` uses ``src_sip`` to pick the PCIe endpoint /
    IO CPU pair the request is injected at, and ``nbytes`` as the transaction
    payload size.
    """
    correlation_id: str
    request_id: str
    # Source location (SIP / cube / PE indices).
    src_sip: int
    src_cube: int
    src_pe: int
    src_pa: int  # NOTE(review): presumably an encoded PhysAddr — confirm against caller
    nbytes: int
    # Either explicit targets or the literal "all".
    target_cubes: tuple[int, ...] | Literal["all"] = "all"
    target_pe: int | Literal["all"] = "all"
    msg_type: Literal["memory_read"] = "memory_read"  # constant discriminator tag
@dataclass(frozen=True)
 class KernelRef:
    """Reference to a kernel binary or builtin timing model.
    Kernel binaries must be pre-deployed to device memory via MemoryWriteMsg.
    KernelLaunchMsg references the deployed location by PA — source code or IR
    MUST NOT be embedded in launch messages.
    - "deployed": kernel binary pre-deployed to HBM/SRAM at deploy_pa.
    - "builtin":  simulator built-in timing model, identified by name.
    """
    name: str
    kind: Literal["deployed", "builtin"]
    # Location of the deployed binary. Nothing in this class enforces that
    # deploy_pa is set when kind == "deployed"; callers must ensure it.
    deploy_pa: int | None = None
    deploy_sip: int = 0
    deploy_cube: int = 0
    deploy_pe: int = 0
    nbytes_code: int = 0  # NOTE(review): presumably the size of the deployed binary in bytes
@dataclass(frozen=True)
 class TensorArgShard:
    """One shard of a tensor kernel argument.

    ``Tensor.to_tensor_arg`` builds these by copying the fields of a deployed
    ``TensorShard`` one-to-one.
    """
    # Shard location (SIP / cube / PE indices).
    sip: int
    cube: int
    pe: int
    pa: int            # address value copied from the deployed shard
    nbytes: int        # shard size in bytes
    offset_bytes: int  # offset copied from the deployed shard's placement spec
@dataclass(frozen=True)
 class TensorArg:
    """Tensor kernel argument: the tuple of shards that make up the tensor."""
    shards: tuple[TensorArgShard, ...]
    # Discriminator; GraphEngine._entry_points skips args whose arg_kind != "tensor".
    arg_kind: Literal["tensor"] = "tensor"
@dataclass(frozen=True)
 class ScalarArg:
    """Scalar kernel argument: a numeric value tagged with a dtype string."""
    dtype: str  # free-form dtype tag (not validated here)
    value: float | int
    arg_kind: Literal["scalar"] = "scalar"  # constant discriminator tag
 KernelArg: TypeAlias = TensorArg | ScalarArg
@dataclass(frozen=True)
 class KernelLaunchMsg:
    """Immutable request to launch a kernel with tensor and scalar arguments.

    ``GraphEngine._entry_points`` derives one injection point per distinct
    SIP found among the shards of the tensor arguments in ``args``.
    """
    correlation_id: str
    request_id: str
    kernel_ref: KernelRef
    args: tuple[KernelArg, ...]
    # Either explicit targets or the literal "all".
    target_cubes: tuple[int, ...] | Literal["all"] = "all"
    target_pe: int | Literal["all"] = "all"
    msg_type: Literal["kernel_launch"] = "kernel_launch"  # constant discriminator tag
@dataclass(frozen=True)
 class ResponseMsg:
    """Device→Host response carrying PE execution result."""
    correlation_id: str
    request_id: str
    # Originating cube / PE indices.
    src_cube: int
    src_pe: int
    success: bool
    msg_type: Literal["response"] = "response"  # constant discriminator tag
@dataclass(frozen=True)
 class PeDmaMsg:
    """Direct PE DMA request: host injects a transfer at PE_DMA level.
    Used by the probe utility to measure PE→HBM latency without requiring
    the full PE_CPU → scheduler → DMA pipeline.
    """
    correlation_id: str
    request_id: str
    # The engine builds the PE node prefix "sip{N}.cube{N}.pe{N}" from these.
    src_sip: int
    src_cube: int
    src_pe: int
    dst_pa: int  # encoded physical address; the engine decodes it with PhysAddr.decode
    nbytes: int
    # NOTE(review): not read by GraphEngine._process_pe_dma; presumably consumed
    # by the PE_DMA component — confirm.
    is_write: bool = False
    msg_type: Literal["pe_dma"] = "pe_dma"  # constant discriminator tag
@@ -0,0 +1,166 @@
 from __future__ import annotations
 import math
 from dataclasses import dataclass
 from typing import Literal
 from kernbench.policy.address.allocator import PEMemAllocator
 from kernbench.policy.placement.dp import DPPolicy, ShardSpec
 from kernbench.runtime_api.kernel import TensorArg, TensorArgShard
@dataclass(frozen=True)
 class TensorShard:
    """One deployed piece of a tensor, as produced by ``deploy_tensor``."""
    # Location taken from the allocator that served this shard.
    sip: int
    cube: int
    pe: int
    pa: int            # encoded address (result of ``.encode()`` on the allocation)
    nbytes: int        # shard size in bytes, from the placement spec
    offset_bytes: int  # offset from the placement spec
@dataclass(frozen=True)
 class TensorHandle:
    """Immutable record of a deployed tensor: logical shape/dtype plus shards."""
    name: str
    shape: tuple[int, ...]
    dtype: str
    itemsize: int  # bytes per element (``deploy_tensor`` sets it via ``dtype_itemsize``)
    shards: tuple[TensorShard, ...]
    @property
    def nbytes(self) -> int:
        """Return the logical size in bytes: product of ``shape`` times ``itemsize``."""
        return math.prod(self.shape) * self.itemsize
 # Bytes per element for every dtype spelling the runtime accepts.
 _DTYPE_ITEMSIZE = {
    "fp16": 2, "float16": 2, "f16": 2,
    "fp32": 4, "float32": 4, "f32": 4,
    "bf16": 2,
    "int8": 1, "i8": 1,
    "int16": 2, "i16": 2,
    "int32": 4, "i32": 4,
 }
 def dtype_itemsize(dtype: str) -> int:
    """Return the element size in bytes for *dtype*.

    Raises:
        ValueError: if *dtype* is not one of the known spellings.
    """
    size = _DTYPE_ITEMSIZE.get(dtype)
    if size is None:
        raise ValueError(f"unsupported dtype: {dtype}")
    return size
 def deploy_tensor(
    *,
    name: str,
    shape: tuple[int, ...],
    dtype: str,
    placement: list[ShardSpec],
    allocators: dict[int, PEMemAllocator],
    mem_kind: Literal["hbm", "tcm"] = "hbm",
 ) -> TensorHandle:
    """Allocate one memory region per placement entry and return the handle.

    For each ``ShardSpec`` the allocator registered under ``spec.pe_index`` is
    asked for ``spec.nbytes`` bytes (HBM when ``mem_kind == "hbm"``, TCM for
    any other value). The shard records the allocator's sip/cube/pe ids and
    the encoded address.

    Raises:
        ValueError: if *dtype* is unsupported (checked before any allocation).
        KeyError: if no allocator is registered for a shard's ``pe_index``.
    """
    element_size = dtype_itemsize(dtype)
    use_hbm = mem_kind == "hbm"
    deployed: list[TensorShard] = []
    for shard_spec in placement:
        pe_alloc = allocators[shard_spec.pe_index]
        region = (
            pe_alloc.alloc_hbm(shard_spec.nbytes)
            if use_hbm
            else pe_alloc.alloc_tcm(shard_spec.nbytes)
        )
        shard = TensorShard(
            sip=pe_alloc._sip_id,
            cube=pe_alloc._cube_id,
            pe=pe_alloc._pe_id,
            pa=region.encode(),
            nbytes=shard_spec.nbytes,
            offset_bytes=shard_spec.offset_bytes,
        )
        deployed.append(shard)
    return TensorHandle(
        name=name,
        shape=shape,
        dtype=dtype,
        itemsize=element_size,
        shards=tuple(deployed),
    )
 # ── PyTorch-like Tensor API ──────────────────────────────────────────
@dataclass(frozen=True)
 class DPMetadata:
    """Data-parallel placement metadata (stored as Tensor._dp_metadata)."""
    # Per-shard placement; ``Tensor.to`` defaults it to a single shard on PE 0
    # covering the whole tensor.
    placement: list[ShardSpec]
    dp_policy: DPPolicy | None = None
    sip: int = 0
    cube: int = 0
    target_pe: int | str = 0  # int → single PE, "all" → all PEs
 class Tensor:
    """Lightweight, PyTorch-flavoured tensor descriptor for benchmark code.

    A ``Tensor`` only carries shape/dtype/name plus two optional attachments:
    placement metadata (``_dp_metadata``, set by :meth:`to`) and the deployed
    handle (``_handle``, set externally once memory has been allocated).

    Usage::
        a = ctx.zeros((M, K), dtype="f16")
        a = ctx.zeros((M, K), dtype="f16", placement=dp.replicate(num_pe=8))
        ctx.launch("kernel_name", kernel_fn, a, b, out, M=M, K=K)
    """

    def __init__(
        self,
        shape: tuple[int, ...],
        dtype: str = "f16",
        name: str = "",
    ) -> None:
        self.shape = shape
        self.dtype = dtype
        self.name = name
        # Both stay None until placement / deployment happens.
        self._dp_metadata: DPMetadata | None = None
        self._handle: TensorHandle | None = None

    @property
    def itemsize(self) -> int:
        """Bytes per element, derived from ``dtype``."""
        return dtype_itemsize(self.dtype)

    @property
    def nbytes(self) -> int:
        """Logical size in bytes: element count times ``itemsize``."""
        element_count = math.prod(self.shape)
        return element_count * self.itemsize

    @property
    def pa(self) -> int:
        """Address of the first shard; raises RuntimeError when undeployed."""
        handle = self._handle
        if handle is not None and handle.shards:
            return handle.shards[0].pa
        raise RuntimeError(f"Tensor '{self.name}' is not deployed yet")

    def to(
        self,
        placement: list[ShardSpec] | None = None,
        *,
        dp_policy: DPPolicy | None = None,
        sip: int = 0,
        cube: int = 0,
        target_pe: int | str = 0,
    ) -> Tensor:
        """Attach placement metadata in place and return ``self`` for chaining.

        Without an explicit *placement* the tensor is described as one shard
        on PE index 0 that spans all ``nbytes``.
        """
        shards = placement
        if shards is None:
            whole = ShardSpec(pe_index=0, offset_bytes=0, nbytes=self.nbytes)
            shards = [whole]
        self._dp_metadata = DPMetadata(
            placement=shards,
            dp_policy=dp_policy,
            sip=sip,
            cube=cube,
            target_pe=target_pe,
        )
        return self

    def to_tensor_arg(self) -> TensorArg:
        """Build the ``TensorArg`` for a launch message from the deployed shards.

        Raises:
            RuntimeError: if the tensor has no deployed handle.
        """
        handle = self._handle
        if handle is None:
            raise RuntimeError(f"Tensor '{self.name}' is not deployed yet")
        arg_shards = [
            TensorArgShard(
                sip=shard.sip, cube=shard.cube, pe=shard.pe,
                pa=shard.pa, nbytes=shard.nbytes, offset_bytes=shard.offset_bytes,
            )
            for shard in handle.shards
        ]
        return TensorArg(shards=tuple(arg_shards))
@@ -0,0 +1,71 @@
 from __future__ import annotations
 import re
 from dataclasses import dataclass
 from kernbench.common.types import Completion, Trace
@dataclass(frozen=True)
class BenchResult:
    """Outcome of one benchmark run: completion status plus optional traces."""
    completion: Completion
    correlation_id: str
    trace: Trace | None = None
    traces: list[dict] | None = None

    def summary_text(self) -> str:
        """Return a one-line ``[OK]`` / ``[FAIL:<code>]`` summary.

        A missing error code is shown as ``ERROR``; a missing error message
        leaves no trailing whitespace.
        """
        outcome = self.completion
        if not outcome.ok:
            code = outcome.error_code or "ERROR"
            msg = outcome.error_message or ""
            return f"[FAIL:{code}] correlation_id={self.correlation_id} {msg}".rstrip()
        return f"[OK] correlation_id={self.correlation_id}"
@dataclass(frozen=True)
class DeviceSelector:
    """
    Parsed form of the device selection string.

    Supported values of ``raw``:
      - "all"      : all SIPs in the tray topology
      - "sip:<N>"  : a single SIP index
    """
    raw: str  # "all" or "sip:<N>"

    @property
    def is_all(self) -> bool:
        """True when the selector addresses every SIP."""
        return self.raw == "all"

    @property
    def sip_index(self) -> int:
        """Return ``N`` from ``"sip:<N>"``.

        Raises:
            ValueError: if the selector is ``"all"`` or not of the form ``sip:<N>``.
        """
        if self.raw == "all":
            raise ValueError("DeviceSelector is 'all'; no single sip_index.")
        match = re.fullmatch(r"sip:(\d+)", self.raw)
        if match is None:
            raise ValueError(
                f"Invalid device '{self.raw}'. Expected 'all' or 'sip:<N>' (e.g., sip:0)."
            )
        return int(match.group(1))
 def resolve_device(raw: str | None) -> DeviceSelector:
    """
    Turn the CLI ``--device`` string into a ``DeviceSelector``.

    The input is stripped and lower-cased. ``None``, blank strings and
    "all" select every SIP; otherwise the value must look like "sip:<N>".

    Raises:
        ValueError: for any other value.
    """
    if raw is None:
        return DeviceSelector(raw="all")
    normalized = raw.strip().lower()
    if normalized == "" or normalized == "all":
        return DeviceSelector(raw="all")
    if re.fullmatch(r"sip:(\d+)", normalized) is None:
        raw = normalized
        raise ValueError(f"Invalid device '{raw}'. Expected 'all' or 'sip:<N>' (e.g., sip:0).")
    return DeviceSelector(raw=normalized)
@@ -0,0 +1,31 @@
 # kernbench/engine/dummy.py
 from __future__ import annotations
 from dataclasses import dataclass
 from typing import Any
 from kernbench.common.types import Completion, RequestHandle, SimEngine, Trace
@dataclass
class DummyEngine(SimEngine):
    """Engine stub: every submitted request completes immediately with ok=True.

    The stored "trace" is a dict holding the original request and the device
    string the engine was created with.
    """
    topology: object
    device_raw: str
    _n: int = 0
    _store: dict[str, tuple[Completion, Trace | None]] = None  # type: ignore

    def __post_init__(self) -> None:
        # Always start with a fresh per-instance store.
        self._store = {}

    def submit(self, request: Any) -> RequestHandle:
        """Record an immediate successful completion and return its handle."""
        self._n += 1
        handle = RequestHandle(f"h{self._n}")
        # Request processing / simulation / scheduling would be performed here.
        record = (Completion(ok=True), {"request": request, "device": self.device_raw})
        self._store[str(handle)] = record
        return handle

    def get_completion(self, handle: RequestHandle) -> tuple[Completion, Trace | None]:
        """Return the ``(completion, trace)`` pair stored for *handle*."""
        key = str(handle)
        return self._store[key]

    def wait(self, handle: RequestHandle) -> None:
        """No-op: results are available as soon as ``submit`` returns."""
        return None
@@ -0,0 +1,298 @@
 from __future__ import annotations
 from typing import Any
 import simpy
 from kernbench.common.types import Completion, RequestHandle, Trace
 import kernbench.components.impls  # noqa: F401 — registers built-in implementations
 from kernbench.components.base import ComponentBase, ComponentRegistry
 from kernbench.components.context import ComponentContext
 from kernbench.policy.address.phyaddr import PhysAddr
 from kernbench.policy.routing.router import AddressResolver, PathRouter
 from kernbench.runtime_api.kernel import KernelLaunchMsg, MemoryReadMsg, MemoryWriteMsg, PeDmaMsg
 from kernbench.sim_engine.transaction import Transaction
 from kernbench.topology.types import Edge, TopologyGraph
 class GraphEngine:
    """simpy-based discrete-event simulation engine.
    Phase B: engine injects a Transaction into the PCIE_EP host queue for
    each request. Components handle their own routing:
      Path 1: PCIE_EP → IO_CPU   (engine-computed path, pre-loaded in Transaction)
      Path 2: IO_CPU → M_CPU     (IO_CPU dispatches, fire-and-forget callback)
      Path 3: M_CPU.DMA → HBM   (M_CPU dispatches, fire-and-forget callback)
    Component implementations are DI-injectable via component_overrides (ADR-0007 D3).
    """
    def __init__(
        self,
        graph: TopologyGraph,
        *,
        component_overrides: dict[str, type[ComponentBase]] | None = None,
    ) -> None:
        """Build the simulation for *graph* and start every component.

        Steps, in order: create the simpy environment, resolver and router;
        index edges by ``(src, dst)``; create one component per node through
        ``ComponentRegistry``; wire one Store per directed edge; start one
        wire process per edge; attach "host" injection queues to PCIE_EP and
        PE_DMA components; finally call ``start`` on each component.

        Args:
            graph: compiled topology (nodes, edges, spec).
            component_overrides: optional mapping passed to
                ``ComponentRegistry.create`` to substitute implementations.
        """
        self._env = simpy.Environment()
        self._resolver = AddressResolver(graph)
        self._router = PathRouter(graph)
        self._nodes = graph.nodes
        # (src, dst) → Edge lookup; a later duplicate edge overwrites an earlier one.
        self._edge_map: dict[tuple[str, str], Edge] = {}
        for e in graph.edges:
            self._edge_map[(e.src, e.dst)] = e
        # Propagation cost per mm of wire; falls back to 0.01 when the spec omits it.
        self._ns_per_mm: float = graph.spec.get("system", {}).get("ns_per_mm", 0.01)
        # Per-handle results and completion events, keyed by str(handle).
        self._results: dict[str, tuple[Completion, Trace]] = {}
        self._events: dict[str, simpy.Event] = {}
        self._counter = 0
        overrides = component_overrides or {}
        ctx = ComponentContext(
            router=self._router,
            resolver=self._resolver,
            positions={nid: n.pos_mm for nid, n in graph.nodes.items()},
            ns_per_mm=self._ns_per_mm,
            edge_map=self._edge_map,
            spec=graph.spec,
        )
        self._components: dict[str, ComponentBase] = {
            node_id: ComponentRegistry.create(node, overrides, ctx)
            for node_id, node in graph.nodes.items()
        }
        # Wire ports: one Store per directed edge (ADR-0015 D1)
        # NOTE: the same Store object is registered as both the source's
        # out_port and the destination's in_port for that edge.
        for e in graph.edges:
            src_comp = self._components.get(e.src)
            dst_comp = self._components.get(e.dst)
            if src_comp is None or dst_comp is None:
                continue
            store: simpy.Store = simpy.Store(self._env)
            src_comp.out_ports[e.dst] = store
            dst_comp.in_ports[e.src] = store
        # Wire processes: propagation delay per edge (ADR-0015 D2)
        # Cut-through (wormhole) model: wires apply propagation only.
        # Serialization (drain) is computed per-path and applied once at the terminal.
        for e in graph.edges:
            src_comp = self._components.get(e.src)
            dst_comp = self._components.get(e.dst)
            if src_comp is None or dst_comp is None:
                continue
            prop_ns = e.distance_mm * self._ns_per_mm
            self._env.process(
                self._wire(src_comp.out_ports[e.dst], dst_comp.in_ports[e.src],
                           prop_ns)
            )
        # Attach host queues to PCIE_EP in_ports before start() (ADR-0015 D3)
        self._host_queues: dict[str, simpy.Store] = {}
        for pcie_ep_id in self._resolver.find_all_pcie_eps():
            host_q: simpy.Store = simpy.Store(self._env)
            self._components[pcie_ep_id].in_ports["host"] = host_q
            self._host_queues[pcie_ep_id] = host_q
        # Attach host queues to PE_DMA nodes for direct PE DMA injection
        self._pe_dma_queues: dict[str, simpy.Store] = {}
        for node_id, node in graph.nodes.items():
            if node.kind == "pe_dma":
                host_q = simpy.Store(self._env)
                self._components[node_id].in_ports["host"] = host_q
                self._pe_dma_queues[node_id] = host_q
        # Start components after all ports are wired (ADR-0015 D3)
        for comp in self._components.values():
            comp.start(self._env)
    def submit(self, request: Any) -> RequestHandle:
        """Schedule *request* as a simpy process and return its handle.

        Handles are numbered ``h1``, ``h2``, ... in submission order. Nothing
        is simulated until ``wait`` runs the environment.
        """
        self._counter += 1
        handle = RequestHandle(f"h{self._counter}")
        key = str(handle)
        completion_event = self._env.event()
        self._events[key] = completion_event
        self._env.process(self._process(key, request, completion_event))
        return handle
    def wait(self, handle: RequestHandle) -> None:
        key = str(handle)
        event = self._events[key]
        if not event.triggered:
            self._env.run(until=event)
    def get_completion(self, handle: RequestHandle) -> tuple[Completion, Trace | None]:
        return self._results[str(handle)]
    # ── internal ────────────────────────────────────────────────────
    def _wire(
        self,
        out_port: simpy.Store,
        in_port: simpy.Store,
        prop_ns: float,
    ):
        """SimPy process: relay messages with propagation delay only.
        Cut-through (wormhole) model: serialization (drain) is computed per-path
        and applied once at the terminal component, not at every wire hop.
        """
        while True:
            msg = yield out_port.get()
            if prop_ns > 0:
                yield self._env.timeout(prop_ns)
            yield in_port.put(msg)
    def _process(self, key: str, request: Any, done: simpy.Event):
        """SimPy process driving one submitted request to completion.

        ``PeDmaMsg`` requests are delegated to ``_process_pe_dma``. Any other
        request is injected at the host queue of each entry point returned by
        ``_entry_points``; the process then waits for every injected
        transaction to finish, stores ``(Completion(ok=True), trace)`` under
        *key* and succeeds *done*.

        The trace always contains ``total_ns`` (simulated time elapsed since
        injection) and ``nbytes``; when a root transaction with non-empty
        ``result_data`` exists, those entries are merged in as well.
        """
        if isinstance(request, PeDmaMsg):
            yield from self._process_pe_dma(key, request, done)
            return
        entries = self._entry_points(request)
        if not entries:
            # Nothing to inject (e.g. a launch without tensor args): complete at once.
            self._results[key] = (
                Completion(ok=True),
                {"total_ns": 0.0, "nbytes": 0},
            )
            done.succeed()
            return
        start_ns = self._env.now
        total_nbytes = 0
        root_txn: Transaction | None = None
        if len(entries) == 1:
            # Single-SIP: direct inject (common path, no extra events)
            pcie_ep_id, io_cpu_id, nbytes = entries[0]
            total_nbytes = nbytes
            path = self._router.find_node_path(pcie_ep_id, io_cpu_id)
            txn_done = self._env.event()
            txn = Transaction(request=request, path=path, step=0, nbytes=nbytes, done=txn_done)
            root_txn = txn
            yield self._host_queues[pcie_ep_id].put(txn)
            yield txn_done
        else:
            # Multi-SIP: inject per SIP, aggregate completions (ADR-0007)
            sub_dones: list[simpy.Event] = []
            sub_txns: list[Transaction] = []
            for pcie_ep_id, io_cpu_id, nbytes in entries:
                # Reported nbytes is the largest per-SIP payload, not the sum.
                total_nbytes = max(total_nbytes, nbytes)
                path = self._router.find_node_path(pcie_ep_id, io_cpu_id)
                txn_done = self._env.event()
                txn = Transaction(
                    request=request, path=path, step=0,
                    nbytes=nbytes, done=txn_done,
                )
                yield self._host_queues[pcie_ep_id].put(txn)
                sub_dones.append(txn_done)
                sub_txns.append(txn)
            # Wait for every sub-transaction, in injection order.
            for sd in sub_dones:
                yield sd
            # Aggregate pe_exec_ns from multi-SIP (max)
            # NOTE(review): a root transaction is only chosen when at least one
            # sub-transaction reported pe_exec_ns; other result_data keys are
            # then taken from sub_txns[0] alone.
            pe_vals = [st.result_data.get("pe_exec_ns") for st in sub_txns]
            pe_vals = [v for v in pe_vals if v is not None]
            if pe_vals:
                if root_txn is None:
                    root_txn = sub_txns[0]
                root_txn.result_data["pe_exec_ns"] = max(pe_vals)
        total_ns = self._env.now - start_ns
        result_trace: dict[str, Any] = {"total_ns": total_ns, "nbytes": total_nbytes}
        if root_txn is not None and root_txn.result_data:
            result_trace.update(root_txn.result_data)
        self._results[key] = (
            Completion(ok=True),
            result_trace,
        )
        done.succeed()
    def _process_pe_dma(self, key: str, request: PeDmaMsg, done: simpy.Event):
        """Inject a Transaction directly at PE_DMA for PE→HBM latency measurement.

        The route runs from the PE node ``sip{N}.cube{N}.pe{N}`` to the node
        that owns ``request.dst_pa``. The stored trace carries the simulated
        ``total_ns``, the closed-form ``formula_ns`` for the same path, and
        ``nbytes``.
        """
        pe_node = f"sip{request.src_sip}.cube{request.src_cube}.pe{request.src_pe}"
        queue_id = f"{pe_node}.pe_dma"
        target = self._resolver.resolve(PhysAddr.decode(request.dst_pa))
        route = self._router.find_path(pe_node, target)
        drain = self._path_drain_ns(route, request.nbytes)

        injected_at = self._env.now
        finished = self._env.event()
        txn = Transaction(
            request=request,
            path=route,
            step=0,
            nbytes=request.nbytes,
            done=finished,
            drain_ns=drain,
        )
        yield self._pe_dma_queues[queue_id].put(txn)
        yield finished

        elapsed = self._env.now - injected_at
        trace = {
            "total_ns": elapsed,
            "formula_ns": self._formula_latency(route, request.nbytes),
            "nbytes": request.nbytes,
        }
        self._results[key] = (Completion(ok=True), trace)
        done.succeed()
    def _path_drain_ns(self, path: list[str], nbytes: int) -> float:
        """Wormhole drain time: nbytes / bottleneck_bw along path."""
        min_bw = float("inf")
        for i in range(len(path) - 1):
            edge = self._edge_map.get((path[i], path[i + 1]))
            if edge and edge.bw_gbs:
                min_bw = min(min_bw, edge.bw_gbs)
        if min_bw == float("inf"):
            return 0.0
        return nbytes / min_bw
    def _formula_latency(self, path: list[str], nbytes: int) -> float:
        """Lower-bound formula latency (ADR-0015 D7).
        formula = Σ(wire propagation) + Σ(component overhead_ns) + drain_ns
        Phase 0: formula == actual (no contention).
        Phase 1+: formula <= actual (contention adds queueing).
        """
        total = 0.0
        # Wire propagation delays
        for i in range(len(path) - 1):
            edge = self._edge_map.get((path[i], path[i + 1]))
            if edge:
                total += edge.distance_mm * self._ns_per_mm
        # Component overhead_ns
        for node_id in path:
            node = self._nodes.get(node_id)
            if node:
                total += float(node.attrs.get("overhead_ns", 0.0))
        # Drain
        total += self._path_drain_ns(path, nbytes)
        return total
    def _entry_points(self, request: Any) -> list[tuple[str, str, int]]:
        """Return ``(pcie_ep_id, io_cpu_id, nbytes)`` for every SIP to inject at.

        Memory writes/reads yield exactly one entry (destination / source SIP).
        Kernel launches yield one entry per distinct SIP seen among tensor
        shards, in first-seen order, using the first such shard's ``nbytes``.

        Raises:
            ValueError: for any other request type.
        """
        resolver = self._resolver

        def entry(sip: int, nbytes: int) -> tuple[str, str, int]:
            return (resolver.find_pcie_ep(sip), resolver.find_io_cpu(sip), nbytes)

        if isinstance(request, MemoryWriteMsg):
            return [entry(request.dst_sip, request.nbytes)]
        if isinstance(request, MemoryReadMsg):
            return [entry(request.src_sip, request.nbytes)]
        if not isinstance(request, KernelLaunchMsg):
            raise ValueError(f"unsupported request type: {type(request)}")

        visited: set[int] = set()
        found: list[tuple[str, str, int]] = []
        for arg in request.args:
            if arg.arg_kind == "tensor":
                for shard in arg.shards:
                    if shard.sip in visited:
                        continue
                    visited.add(shard.sip)
                    found.append(entry(shard.sip, shard.nbytes))
        return found
@@ -0,0 +1,49 @@
 from __future__ import annotations
 from dataclasses import dataclass, field
 from typing import Any
 import simpy
@dataclass
class Transaction:
    """In-flight request traversing the device fabric hop-by-hop (ADR-0015 D4).

    One Transaction covers one leg of the fabric. Each component on ``path``
    takes it from its in_port, applies its latency and forwards an advanced
    copy through its out_port; wire processes (ADR-0015 D2) add propagation
    delay between hops.

    Multi-leg flows (e.g. IO_CPU → M_CPU as leg 1, M_CPU.DMA → HBM as leg 2)
    use separate Transactions: the terminal component of leg 1 creates leg 2
    and waits for leg 2's done before succeeding leg 1's done.
    """
    request: Any            # original host request (MemoryReadMsg, KernelLaunchMsg, …)
    path: list[str]         # node_id sequence for this leg
    step: int               # index of the component currently holding this Transaction
    nbytes: int             # payload size (bytes)
    done: simpy.Event       # succeeded when this leg completes
    drain_ns: float = 0.0   # wormhole drain time: nbytes / bottleneck_bw (applied once at terminal)
    is_response: bool = False  # True when carrying ResponseMsg on reverse path
    result_data: dict[str, Any] = field(default_factory=dict)  # PE-level metrics (pe_exec_ns, etc.)

    @property
    def next_hop(self) -> str | None:
        """Node id that follows the current step, or None at the end of the path."""
        upcoming = self.step + 1
        if upcoming < len(self.path):
            return self.path[upcoming]
        return None

    def advance(self) -> Transaction:
        """Return a new Transaction one step further along the same path.

        ``path``, ``done`` and ``result_data`` are shared with the original
        object, not copied.
        """
        following_step = self.step + 1
        return Transaction(
            request=self.request,
            path=self.path,
            step=following_step,
            nbytes=self.nbytes,
            done=self.done,
            drain_ns=self.drain_ns,
            is_response=self.is_response,
            result_data=self.result_data,
        )
@@ -0,0 +1,965 @@
 # kernbench/topology/builder.py
 """
 Topology compiler: parses topology.yaml and produces a fully-instantiated
 TopologyGraph with nodes, edges, and representative view projections.
 """
 from __future__ import annotations
 from pathlib import Path
 from typing import Any
 import yaml
 from .types import Edge, Node, TopologyGraph, TopologyHandle, ViewGraph
 # PE component offsets from PE center (small, intra-PE distances ~0.5mm)
 # Keyed by PE component name; each value is a 2-tuple offset.
 # NOTE(review): presumably (dx, dy) in mm, matching the pos_mm convention — confirm.
 _PE_COMP_OFFSETS = {
    "pe_cpu": (-0.3, 0.0),
    "pe_scheduler": (-0.15, 0.0),
    "pe_dma": (0.0, -0.15),
    "pe_gemm": (0.0, 0.0),
    "pe_math": (0.0, 0.15),
    "pe_tcm": (0.3, 0.0),
 }
 # ── Public API ───────────────────────────────────────────────────────
 def resolve_topology(path_str: str) -> TopologyHandle:
    """Resolve *path_str*, check it names a regular file, and compile it.

    The path is user-expanded (``~``) and made absolute before use.

    Raises:
        FileNotFoundError: if nothing exists at the resolved path.
        ValueError: if the path exists but is not a file.
    """
    resolved = Path(path_str).expanduser().resolve()
    if not resolved.exists():
        raise FileNotFoundError(f"Topology file not found: {resolved}")
    if not resolved.is_file():
        raise ValueError(f"Topology path is not a file: {resolved}")
    return TopologyHandle(path=resolved, topology_obj=load_topology(resolved))
 def load_topology(path: Path) -> TopologyGraph:
    """Read the YAML spec at *path*, validate it, and compile the graph."""
    raw_spec = _read_spec(path)
    _validate_spec(raw_spec)
    compiled = _compile_graph(raw_spec)
    return compiled
 def _read_spec(path: Path) -> dict[str, Any]:
    """Parse the YAML file at *path* and return its top-level mapping.

    Raises:
        ValueError: on a YAML syntax error (with line/column when the parser
            reports them), on an empty document, or when the document root
            is not a mapping.
    """
    try:
        with path.open("r", encoding="utf-8") as handle:
            loaded = yaml.safe_load(handle)
    except yaml.YAMLError as exc:
        msg = f"Failed to parse YAML topology: {path}"
        mark = getattr(exc, "problem_mark", None)
        if mark is not None:
            # Parser positions are 0-based; report them 1-based.
            msg += f" (line {mark.line + 1}, column {mark.column + 1})"
        raise ValueError(msg) from exc

    if loaded is None:
        raise ValueError(f"Topology YAML is empty: {path}")
    if isinstance(loaded, dict):
        return loaded
    raise ValueError(
        f"Topology YAML root must be a mapping/dict: {path} (got {type(loaded).__name__})"
    )
 def _validate_spec(spec: dict) -> None:
    # TODO: schema validation
    return
 # ── Graph Compiler ───────────────────────────────────────────────────
 def _compile_graph(spec: dict) -> TopologyGraph:
    """Expand *spec* into a flat node/edge graph plus the four view projections.

    Cubes are laid out row-major on a ``mesh_w`` × ``mesh_h`` grid; the
    origin of each cube is offset by the cube size plus the inter-cube seam.
    """
    system = spec["system"]
    sip_spec = spec["sip"]
    cube_spec = spec["cube"]

    mesh = sip_spec["cube_mesh"]
    mesh_w, mesh_h = mesh["w"], mesh["h"]
    cube_mm = cube_spec["geometry"]["cube_mm"]
    cube_w, cube_h = cube_mm["w"], cube_mm["h"]
    seam = sip_spec["links"]["inter_cube_mesh"]["distance_mm_across_seam"]
    # Distance between the origins of neighbouring cubes.
    pitch_x = cube_w + seam
    pitch_y = cube_h + seam

    nodes: dict[str, Node] = {}
    edges: list[Edge] = []

    # System-level nodes first.
    _instantiate_system(nodes, system)

    for sip_id in range(system["sips"]["count"]):
        sip_prefix = f"sip{sip_id}"
        # IO chiplets
        _instantiate_io_chiplets(
            nodes, edges, sip_prefix, sip_spec,
            cube_w, cube_h, mesh_w, mesh_h, seam,
        )
        # Cubes + PEs, numbered row-major.
        for row in range(mesh_h):
            for col in range(mesh_w):
                cube_id = row * mesh_w + col
                cube_origin = (col * pitch_x, row * pitch_y)
                _instantiate_cube(
                    nodes, edges, f"{sip_prefix}.cube{cube_id}", cube_spec, cube_origin,
                )
        # Inter-cube UCIe mesh, IO → cube UCIe, switch → IO pcie_ep.
        _add_inter_cube_edges(edges, sip_prefix, mesh_w, mesh_h, sip_spec)
        _add_io_to_cube_edges(edges, sip_prefix, sip_spec, mesh_w)
        _add_system_to_io_edges(edges, sip_prefix, sip_spec, system)

    return TopologyGraph(
        spec=spec,
        nodes=nodes,
        edges=edges,
        system_view=_build_system_view(spec),
        sip_view=_build_sip_view(spec),
        cube_view=_build_cube_view(spec),
        pe_view=_build_pe_view(spec),
    )
 # ── Layout helpers ───────────────────────────────────────────────────
 def _cube_local_positions(cube_w: float, cube_h: float) -> dict[str, tuple[float, float]]:
    """Cube-internal component positions relative to cube origin (0,0) at top-left."""
    cx, cy = cube_w / 2, cube_h / 2
    # UCIe node half-sizes (default 2.0×1.2mm) — inset so edges touch boundary
    uh = 0.6   # half height
    uw = 1.0   # half width
    return {
        "ucie-N": (cx, uh),
        "ucie-S": (cx, cube_h - uh),
        "ucie-W": (uw, cy),
        "ucie-E": (cube_w - uw, cy),
        "m_cpu": (cube_w - 2.5, cy - 1.5),
        "xbar.top": (cx, 3.5),       # Y reference for top-half xbar.pe nodes
        "hbm_ctrl": (cx - 2.0, cy),
        "xbar.bottom": (cx, cube_h - 3.5),  # Y reference for bottom-half xbar.pe nodes
        "bridge.left": (2.5, cy + 2.0),
        "bridge.right": (cube_w - 2.5, cy + 2.0),
        "noc": (cx + 2.0, cy),
        "sram": (2.5, cy - 1.5),
    }
 def _corner_pe_positions(cube_w: float, cube_h: float) -> dict[str, list[tuple[float, float]]]:
    """PE center positions per corner, relative to cube origin."""
    return {
        "NW": [(1.5, 1.5), (4.5, 1.5)],
        "NE": [(cube_w - 4.5, 1.5), (cube_w - 1.5, 1.5)],
        "SW": [(1.5, cube_h - 1.5), (4.5, cube_h - 1.5)],
        "SE": [(cube_w - 4.5, cube_h - 1.5), (cube_w - 1.5, cube_h - 1.5)],
    }
 # ── Instantiation: system ───────────────────────────────────────────
 def _instantiate_system(nodes: dict[str, Node], system: dict) -> None:
    """Register the fabric switch node ``fabric.switch0`` in *nodes*.

    Kind and impl come from ``system["components"]["switch"]``; ``attrs``
    defaults to an empty dict and the node has no position.
    """
    switch_spec = system["components"]["switch"]
    switch_id = "fabric.switch0"
    switch_node = Node(
        id=switch_id,
        kind=switch_spec["kind"],
        impl=switch_spec["impl"],
        attrs=switch_spec.get("attrs", {}),
        pos_mm=None,
        label="Switch",
    )
    nodes[switch_id] = switch_node
 # ── Instantiation: IO chiplets ──────────────────────────────────────
 def _instantiate_io_chiplets(
    nodes: dict[str, Node],
    edges: list[Edge],
    sp: str,
    sip_spec: dict,
    cube_w: float,
    cube_h: float,
    mesh_w: int,
    mesh_h: int,
    seam: float,
 ) -> None:
    """Create the pcie_ep and io_cpu nodes of every IO chiplet instance and
    the internal pcie_ep → io_cpu edge between them.

    Chiplets are centred horizontally over the cube mesh. Side "N" places
    them above the mesh (negative y); any other side places them below it.
    """
    io_spec = sip_spec["iochiplet"]
    comp = io_spec["components"]
    links = io_spec["links"]
    # Overall mesh extent: cubes plus the seams between them.
    mesh_total_w = mesh_w * cube_w + (mesh_w - 1) * seam
    mesh_total_h = mesh_h * cube_h + (mesh_h - 1) * seam
    center_x = mesh_total_w / 2

    for instance in io_spec["instances"]:
        prefix = f"{sp}.{instance['id']}"
        on_north = instance["place"]["side"] == "N"
        pcie_y = -5.0 if on_north else mesh_total_h + 5.0
        cpu_y = -3.0 if on_north else mesh_total_h + 3.0

        # pcie_ep
        ep_spec = comp["pcie_ep"]
        ep_id = f"{prefix}.pcie_ep"
        nodes[ep_id] = Node(
            id=ep_id, kind=ep_spec["kind"], impl=ep_spec["impl"],
            attrs=ep_spec["attrs"], pos_mm=(center_x, pcie_y), label="PCIe EP",
        )

        # io_cpu
        cpu_spec = comp["io_cpu"]
        cpu_id = f"{prefix}.io_cpu"
        nodes[cpu_id] = Node(
            id=cpu_id, kind=cpu_spec["kind"], impl=cpu_spec["impl"],
            attrs=cpu_spec["attrs"], pos_mm=(center_x, cpu_y), label="IO CPU",
        )

        # Internal edge
        internal_link = Edge(
            src=ep_id,
            dst=cpu_id,
            distance_mm=links["pcie_ep_to_io_cpu_mm"],
            bw_gbs=links["pcie_ep_to_io_cpu_bw_gbs"],
            kind="io_internal",
        )
        edges.append(internal_link)
 # ── Instantiation: cube + PEs ───────────────────────────────────────
 def _instantiate_cube(
    nodes: dict[str, Node],
    edges: list[Edge],
    cp: str,
    cube: dict,
    origin: tuple[float, float],
 ) -> None:
    """Materialise one cube (fabric components and PE instances) into the flat graph.

    Every node id is prefixed with *cp*, and every position is the cube-local
    coordinate shifted by *origin*.  Nodes are written into *nodes* and edges
    are appended to *edges*; nothing is returned.
    """

    def link(src: str, dst: str, **attrs) -> None:
        # Single place where edges are appended, so the order stays explicit.
        edges.append(Edge(src=src, dst=dst, **attrs))

    cube_w = cube["geometry"]["cube_mm"]["w"]
    cube_h = cube["geometry"]["cube_mm"]["h"]
    ox, oy = origin
    anchors = _cube_local_positions(cube_w, cube_h)
    wires = cube["links"]
    slice_count = cube["memory_map"]["hbm_slices_per_cube"]
    noc_id = f"{cp}.noc"

    # ── UCIe ports ──
    ucie = cube["ucie"]
    overhead_ns = ucie["overhead_ns"]
    for port in ucie["ports"]:
        port_id = f"{cp}.ucie-{port}"
        lx, ly = anchors[f"ucie-{port}"]
        nodes[port_id] = Node(
            id=port_id,
            kind="ucie_port",
            impl="ucie_v1",
            attrs={"overhead_ns": overhead_ns},
            pos_mm=(ox + lx, oy + ly),
            label=f"UCIe-{port}",
        )

    # ── Named components: noc, m_cpu, sram ──
    for name in ("noc", "m_cpu", "sram"):
        spec = cube["components"][name]
        node_id = f"{cp}.{name}"
        lx, ly = anchors[name]
        nodes[node_id] = Node(
            id=node_id,
            kind=spec["kind"],
            impl=spec["impl"],
            attrs=spec["attrs"],
            pos_mm=(ox + lx, oy + ly),
            label=name.upper().replace("_", " "),
        )

    # ── HBM controller slices: all share the hbm_ctrl anchor position ──
    hbm = cube["components"]["hbm_ctrl"]
    hbm_x, hbm_y = anchors["hbm_ctrl"]
    for idx in range(slice_count):
        slice_id = f"{cp}.hbm_ctrl.slice{idx}"
        nodes[slice_id] = Node(
            id=slice_id,
            kind=hbm["kind"],
            impl=hbm["impl"],
            attrs=hbm["attrs"],
            pos_mm=(ox + hbm_x, oy + hbm_y),
            label=f"HBM SLICE{idx}",
        )

    # ── Bridges ──
    for bridge in cube["components"]["xbar"]["bridges"]:
        tag = bridge["id"]
        node_id = f"{cp}.bridge.{tag}"
        lx, ly = anchors[f"bridge.{tag}"]
        nodes[node_id] = Node(
            id=node_id,
            kind=bridge["kind"],
            impl=bridge["impl"],
            attrs=bridge["attrs"],
            pos_mm=(ox + lx, oy + ly),
            label=f"Bridge {tag.upper()}",
        )

    # ── PE instances + per-PE xbar entry nodes ──
    corners = cube["pe_layout"]["corners"]
    per_corner = cube["pe_layout"]["pe_per_corner"]
    corner_xy = _corner_pe_positions(cube_w, cube_h)
    template = cube["pe_template"]
    template_links = template["links"]
    entry_spec = cube["components"]["xbar"]["pe"]
    top_row_y = anchors["xbar.top"][1]
    bottom_row_y = anchors["xbar.bottom"][1]
    for corner_no, corner in enumerate(corners):
        north = corner in ("NW", "NE")
        entry_y = top_row_y if north else bottom_row_y
        reach_key = "pe_to_xbar_row_n_mm" if north else "pe_to_xbar_row_s_mm"
        for slot in range(per_corner):
            # PEs are numbered consecutively across corners in layout order.
            pe_no = corner_no * per_corner + slot
            pe_base = f"{cp}.pe{pe_no}"
            pe_x, pe_y = corner_xy[corner][slot]

            # Xbar entry node: the PE's x, the xbar row's y.
            entry_id = f"{cp}.xbar.pe{pe_no}"
            nodes[entry_id] = Node(
                id=entry_id,
                kind=entry_spec["kind"],
                impl=entry_spec["impl"],
                attrs=entry_spec["attrs"],
                pos_mm=(ox + pe_x, oy + entry_y),
                label=f"XBAR PE{pe_no}",
            )

            # One node per PE template component, offset from the PE centre.
            for part, part_spec in template["components"].items():
                part_id = f"{pe_base}.{part}"
                dx, dy = _PE_COMP_OFFSETS.get(part, (0.0, 0.0))
                nodes[part_id] = Node(
                    id=part_id,
                    kind=part_spec["kind"],
                    impl=part_spec["impl"],
                    attrs=part_spec["attrs"],
                    pos_mm=(ox + pe_x + dx, oy + pe_y + dy),
                    label=part.upper().replace("_", " "),
                )

            _add_pe_internal_edges(edges, pe_base, template_links)

            dma_id = f"{pe_base}.pe_dma"
            # PE_DMA → xbar.pe_i (HBM data path)
            link(
                dma_id, entry_id,
                distance_mm=wires[reach_key],
                bw_gbs=wires["pe_to_xbar_bw_gbs"],
                kind="pe_to_xbar",
            )
            # PE_DMA → noc (non-HBM data path: SRAM, inter-cube, etc.)
            link(
                dma_id, noc_id,
                distance_mm=wires["pe_dma_to_noc_mm"],
                bw_gbs=wires["pe_dma_to_noc_bw_gbs"],
                kind="pe_to_noc",
            )
            # noc → PE_CPU (command delivery)
            link(
                noc_id, f"{pe_base}.pe_cpu",
                distance_mm=wires["noc_to_pe_cpu_mm"],
                kind="command",
            )

    # ── Cube fabric edges ──
    # xbar.pe_i ↔ hbm_ctrl.slice_i, one edge per direction.
    for idx in range(slice_count):
        entry_id = f"{cp}.xbar.pe{idx}"
        slice_id = f"{cp}.hbm_ctrl.slice{idx}"
        link(
            entry_id, slice_id,
            distance_mm=wires["xbar_to_hbm_mm"],
            bw_gbs=wires["xbar_to_hbm_bw_gbs"],
            kind="xbar_to_hbm",
        )
        link(
            slice_id, entry_id,
            distance_mm=wires["xbar_to_hbm_mm"],
            bw_gbs=wires["xbar_to_hbm_bw_gbs"],
            kind="hbm_to_xbar",
        )

    # xbar chain: neighbouring entries in each half (top row, bottom row) are
    # linked in both directions.  A hop that leaves a corner group uses the
    # inter-corner distance, every other hop the intra-corner distance.
    half = slice_count // 2
    for row_start in (0, half):
        for left in range(row_start, row_start + half - 1):
            leaves_corner = (left - row_start) % per_corner == per_corner - 1
            if leaves_corner:
                hop_mm = wires["xbar_chain_inter_corner_mm"]
            else:
                hop_mm = wires["xbar_chain_intra_corner_mm"]
            for a, b in ((left, left + 1), (left + 1, left)):
                link(
                    f"{cp}.xbar.pe{a}", f"{cp}.xbar.pe{b}",
                    distance_mm=hop_mm,
                    bw_gbs=wires["xbar_x_bw_gbs"],
                    kind="xbar_chain",
                )

    # Bridges join the first entries of both rows ("left") and the last
    # entries of both rows ("right").
    for side, top_pe, bottom_pe in (("left", 0, half), ("right", half - 1, slice_count - 1)):
        bridge_id = f"{cp}.bridge.{side}"
        for pe_i, reach in ((top_pe, "xbar_row_n_to_bridge_mm"),
                            (bottom_pe, "xbar_row_s_to_bridge_mm")):
            entry_id = f"{cp}.xbar.pe{pe_i}"
            link(
                entry_id, bridge_id,
                distance_mm=wires[reach],
                bw_gbs=wires["xbar_to_bridge_bw_gbs"],
                kind="xbar_to_bridge",
            )
            link(
                bridge_id, entry_id,
                distance_mm=wires[reach],
                bw_gbs=wires["xbar_to_bridge_bw_gbs"],
                kind="bridge_to_xbar",
            )

    # ucie ↔ noc: zero wire distance; all inbound edges first, then all outbound.
    ucie_link = wires["noc_to_ucie"]
    for port in ucie["ports"]:
        link(
            f"{cp}.ucie-{port}", noc_id,
            distance_mm=0.0,
            bw_gbs=ucie_link["per_connection_bw_gbs"],
            n_connections=ucie_link["n_connections"],
            kind="ucie_to_noc",
        )
    for port in ucie["ports"]:
        link(
            noc_id, f"{cp}.ucie-{port}",
            distance_mm=0.0,
            bw_gbs=ucie_link["per_connection_bw_gbs"],
            n_connections=ucie_link["n_connections"],
            kind="noc_to_ucie",
        )

    # noc ↔ xbar.pe{i}: wire delay is 0 (NOC traversal latency computed by
    # TwoDMeshNocComponent); routing_weight_mm=50.0 steers PE DMA Dijkstra
    # away from this path (prefer direct pe_dma→xbar).  The bandwidth is
    # optional in the spec and stays None when absent.
    noc_xbar_bw = wires.get("noc_to_xbar", {}).get("per_connection_bw_gbs")
    for idx in range(slice_count):
        entry_id = f"{cp}.xbar.pe{idx}"
        link(
            noc_id, entry_id,
            distance_mm=0.0,
            bw_gbs=noc_xbar_bw,
            routing_weight_mm=50.0,
            kind="noc_to_xbar",
        )
        link(
            entry_id, noc_id,
            distance_mm=0.0,
            bw_gbs=noc_xbar_bw,
            routing_weight_mm=50.0,
            kind="xbar_to_noc",
        )

    # m_cpu ↔ noc (command dispatch, both directions)
    m_cpu_id = f"{cp}.m_cpu"
    for src, dst in ((m_cpu_id, noc_id), (noc_id, m_cpu_id)):
        link(src, dst, distance_mm=wires["m_cpu_to_noc_mm"], kind="command")

    # noc ↔ sram (shared SRAM access); both directions carry kind "noc_to_sram".
    sram_link = wires["noc_to_sram"]
    sram_id = f"{cp}.sram"
    for src, dst in ((noc_id, sram_id), (sram_id, noc_id)):
        link(
            src, dst,
            distance_mm=wires["noc_to_sram_mm"],
            bw_gbs=sram_link["per_connection_bw_gbs"],
            n_connections=sram_link["n_connections"],
            kind="noc_to_sram",
        )
 def _add_pe_internal_edges(edges: list[Edge], pp: str, pe_links: dict) -> None:
    """Append the fixed intra-PE wiring for the PE whose id prefix is *pp*.

    Order of the appended edges: pe_cpu → pe_scheduler, then scheduler → each
    engine (dma, gemm, math), then each engine → pe_tcm.  Only the engine → TCM
    edges carry a bandwidth.
    """
    scheduler = f"{pp}.pe_scheduler"
    tcm = f"{pp}.pe_tcm"
    engines = ("dma", "gemm", "math")

    edges.append(Edge(
        src=f"{pp}.pe_cpu", dst=scheduler,
        distance_mm=pe_links["pe_cpu_to_scheduler_mm"],
        kind="pe_internal",
    ))
    # Link keys follow the pattern "scheduler_to_<engine>_mm".
    for eng in engines:
        edges.append(Edge(
            src=scheduler, dst=f"{pp}.pe_{eng}",
            distance_mm=pe_links[f"scheduler_to_{eng}_mm"],
            kind="pe_internal",
        ))
    # Link keys follow "<engine>_to_tcm_mm" / "<engine>_to_tcm_bw_gbs".
    for eng in engines:
        edges.append(Edge(
            src=f"{pp}.pe_{eng}", dst=tcm,
            distance_mm=pe_links[f"{eng}_to_tcm_mm"],
            bw_gbs=pe_links[f"{eng}_to_tcm_bw_gbs"],
            kind="pe_internal",
        ))
 # ── Inter-cube / IO / system edges ──────────────────────────────────
 def _add_inter_cube_edges(
    edges: list[Edge], sp: str, mesh_w: int, mesh_h: int, sip_spec: dict,
 ) -> None:
    """Connect the UCIe ports of neighbouring cubes inside the SIP *sp*.

    Cubes are numbered row-major (``row * mesh_w + col``).  Each cube is linked
    to its east neighbour (E ↔ W ports) and its south neighbour (S ↔ N ports),
    when they exist, with one ``ucie_mesh`` edge per direction.
    """
    seam_link = sip_spec["links"]["inter_cube_mesh"]
    bandwidth = seam_link["bw_gbs_per_ucie_phy"]
    seam_mm = seam_link["distance_mm_across_seam"]
    for row in range(mesh_h):
        for col in range(mesh_w):
            here = row * mesh_w + col
            # (neighbour cube index, port on this cube, port on the neighbour)
            neighbours = []
            if col + 1 < mesh_w:
                neighbours.append((here + 1, "E", "W"))
            if row + 1 < mesh_h:
                neighbours.append((here + mesh_w, "S", "N"))
            for there, out_port, in_port in neighbours:
                near = f"{sp}.cube{here}.ucie-{out_port}"
                far = f"{sp}.cube{there}.ucie-{in_port}"
                for src, dst in ((near, far), (far, near)):
                    edges.append(Edge(
                        src=src, dst=dst,
                        distance_mm=seam_mm, bw_gbs=bandwidth, kind="ucie_mesh",
                    ))
 def _add_io_to_cube_edges(
    edges: list[Edge], sp: str, sip_spec: dict, mesh_w: int,
 ) -> None:
    """Wire each IO chiplet's io_cpu to the cube UCIe ports it attaches to.

    For every ``cube_ports`` entry of every chiplet instance, two edges are
    appended: ``io_to_cube`` (io_cpu → ucie port) and ``cube_to_io`` (the
    reverse).  The edge distance is the chiplet-wide ``io_cpu_to_ucie_mm`` plus
    the port's own ``distance_mm``.
    """
    chiplet = sip_spec["iochiplet"]
    base_mm = chiplet["links"]["io_cpu_to_ucie_mm"]
    bandwidth = chiplet["links"]["io_cpu_to_ucie_bw_gbs"]
    for instance in chiplet["instances"]:
        cpu = f"{sp}.{instance['id']}.io_cpu"
        for port in instance["cube_ports"]:
            col, row = port["cube"]["xy"]
            # Cube ids are row-major within the mesh.
            ucie = f"{sp}.cube{row * mesh_w + col}.ucie-{port['cube_side']}"
            reach_mm = base_mm + port["distance_mm"]
            for src, dst, kind in ((cpu, ucie, "io_to_cube"), (ucie, cpu, "cube_to_io")):
                edges.append(Edge(
                    src=src, dst=dst,
                    distance_mm=reach_mm,
                    bw_gbs=bandwidth,
                    kind=kind,
                ))
 def _add_system_to_io_edges(
    edges: list[Edge], sp: str, sip_spec: dict, system: dict,
 ) -> None:
    """Append one ``pcie`` edge from the fabric switch to each IO chiplet endpoint.

    The source is always ``fabric.switch0``; the destination is
    ``<sp>.<instance id>.pcie_ep``.  Only the switch → endpoint direction is added.
    """
    uplink = system["links"]["io_ep_to_switch"]
    for instance in sip_spec["iochiplet"]["instances"]:
        edges.append(
            Edge(
                src="fabric.switch0",
                dst=f"{sp}.{instance['id']}.pcie_ep",
                distance_mm=uplink["distance_mm"],
                bw_gbs=uplink["bw_gbs_per_ep"],
                kind="pcie",
            )
        )
 # ── View builders ────────────────────────────────────────────────────
 def _build_system_view(spec: dict) -> ViewGraph:
    """Build the top-level view: one fabric switch, SIP blocks, their IO chiplets.

    SIPs are laid out left to right as fixed-size 71 x 59 mm blocks separated by
    a 30 mm gap, starting 20 mm below the top of the canvas.  Every IO chiplet
    of every SIP gets a node and a ``pcie`` edge from the switch.
    """
    system = spec["system"]
    n_sips = system["sips"]["count"]
    sip_w, sip_h = 71.0, 59.0
    gap = 30.0
    top = 20.0  # y of the upper edge of every SIP block
    canvas_w = n_sips * sip_w + (n_sips - 1) * gap
    canvas_h = sip_h + 20.0

    switch_spec = system["components"]["switch"]
    switch_id = "fabric.switch0"
    nodes: dict[str, Node] = {
        switch_id: Node(
            id=switch_id,
            kind=switch_spec["kind"],
            impl=switch_spec["impl"],
            attrs=switch_spec.get("attrs", {}),
            pos_mm=(canvas_w / 2, 5.0),
            label="Fabric Switch",
        ),
    }
    links: list[Edge] = []

    for index in range(n_sips):
        left = index * (sip_w + gap)
        centre_x = left + sip_w / 2
        sip_id = f"sip{index}"
        nodes[sip_id] = Node(
            id=sip_id,
            kind="sip",
            impl="",
            attrs={"w_mm": sip_w, "h_mm": sip_h},
            pos_mm=(centre_x, top + sip_h / 2),
            label=f"SIP {index}",
        )
        for instance in spec["sip"]["iochiplet"]["instances"]:
            chiplet = instance["id"]
            node_id = f"{sip_id}.{chiplet}"
            # North chiplets sit on the SIP's top edge, all others on its bottom edge.
            y_mm = top if instance["place"]["side"] == "N" else top + sip_h
            nodes[node_id] = Node(
                id=node_id,
                kind="iochiplet",
                impl="",
                attrs={},
                pos_mm=(centre_x, y_mm),
                label=f"IO {chiplet}",
            )
            uplink = system["links"]["io_ep_to_switch"]
            links.append(Edge(
                src=switch_id, dst=node_id,
                distance_mm=uplink["distance_mm"],
                bw_gbs=uplink["bw_gbs_per_ep"],
                kind="pcie",
            ))

    return ViewGraph(
        name="system", nodes=nodes, edges=links,
        width_mm=canvas_w, height_mm=canvas_h,
    )
 def _build_sip_view(spec: dict) -> ViewGraph:
    """Build the SIP view: cubes as opaque blocks, mesh links, IO chiplets.

    Node ids are unprefixed (``cube<N>``, chiplet ids), so the view stands for a
    single SIP.  Mesh links are drawn once per neighbouring pair (towards the
    east and south neighbour), not once per direction.
    """
    sip = spec["sip"]
    cube = spec["cube"]
    cols = sip["cube_mesh"]["w"]
    rows = sip["cube_mesh"]["h"]
    cube_w = cube["geometry"]["cube_mm"]["w"]
    cube_h = cube["geometry"]["cube_mm"]["h"]
    seam = sip["links"]["inter_cube_mesh"]["distance_mm_across_seam"]
    pitch_x = cube_w + seam
    pitch_y = cube_h + seam
    span_x = cols * cube_w + (cols - 1) * seam
    span_y = rows * cube_h + (rows - 1) * seam
    io_margin = 6.0  # strip reserved above and below the mesh for IO chiplets
    canvas_w = span_x
    canvas_h = span_y + 2 * io_margin
    cells = [(row, col) for row in range(rows) for col in range(cols)]

    # Cubes as opaque blocks, numbered row-major.
    nodes: dict[str, Node] = {}
    for row, col in cells:
        name = f"cube{row * cols + col}"
        nodes[name] = Node(
            id=name,
            kind="cube",
            impl="",
            attrs={"w_mm": cube_w, "h_mm": cube_h, "col": col, "row": row},
            pos_mm=(
                col * pitch_x + cube_w / 2,
                io_margin + row * pitch_y + cube_h / 2,
            ),
            label=f"CUBE ({col},{row})",
        )

    # Inter-cube mesh edges: east neighbour first, then south neighbour.
    links: list[Edge] = []
    seam_link = sip["links"]["inter_cube_mesh"]
    for row, col in cells:
        here = row * cols + col
        targets = []
        if col + 1 < cols:
            targets.append(here + 1)
        if row + 1 < rows:
            targets.append(here + cols)
        for there in targets:
            links.append(Edge(
                src=f"cube{here}", dst=f"cube{there}",
                distance_mm=seam_link["distance_mm_across_seam"],
                bw_gbs=seam_link["bw_gbs_per_ucie_phy"],
                kind="ucie_mesh",
            ))

    # IO chiplets: centred horizontally, 2 mm inside the top or bottom canvas edge.
    chiplets = sip["iochiplet"]
    io_links = chiplets["links"]
    for instance in chiplets["instances"]:
        chiplet_id = instance["id"]
        y_mm = 2.0 if instance["place"]["side"] == "N" else canvas_h - 2.0
        nodes[chiplet_id] = Node(
            id=chiplet_id,
            kind="iochiplet",
            impl="",
            attrs={},
            pos_mm=(span_x / 2, y_mm),
            label=f"IO {chiplet_id}",
        )
        for port in instance["cube_ports"]:
            col, row = port["cube"]["xy"]
            links.append(Edge(
                src=chiplet_id, dst=f"cube{row * cols + col}",
                distance_mm=io_links["io_cpu_to_ucie_mm"] + port["distance_mm"],
                bw_gbs=io_links["io_cpu_to_ucie_bw_gbs"],
                kind="io_to_cube",
            ))

    return ViewGraph(
        name="sip", nodes=nodes, edges=links,
        width_mm=canvas_w, height_mm=canvas_h,
    )
 def _build_cube_view(spec: dict) -> ViewGraph:
    """Build the cube view: one cube in local coordinates, PEs as opaque blocks.

    Differences from the flat per-cube graph, as visible here: node ids carry no
    prefix, ``hbm_ctrl`` is a single node that every xbar entry points to, each
    PE is one ``pe`` node instead of its template components, and the
    xbar → hbm and noc → ucie links are drawn in one direction only.
    """
    cube = spec["cube"]
    cube_w = cube["geometry"]["cube_mm"]["w"]
    cube_h = cube["geometry"]["cube_mm"]["h"]
    anchors = _cube_local_positions(cube_w, cube_h)
    wires = cube["links"]
    slice_count = cube["memory_map"]["hbm_slices_per_cube"]
    nodes: dict[str, Node] = {}
    links: list[Edge] = []

    def link(src: str, dst: str, **attrs) -> None:
        # Single place where view edges are appended.
        links.append(Edge(src=src, dst=dst, **attrs))

    # UCIe ports
    for port in cube["ucie"]["ports"]:
        port_id = f"ucie-{port}"
        lx, ly = anchors[port_id]
        nodes[port_id] = Node(
            id=port_id,
            kind="ucie_port",
            impl="ucie_v1",
            attrs={},
            pos_mm=(lx, ly),
            label=f"UCIe-{port}",
        )

    # Named components (hbm_ctrl is one representative node in this view)
    for name in ("noc", "m_cpu", "hbm_ctrl", "sram"):
        comp_spec = cube["components"][name]
        lx, ly = anchors[name]
        nodes[name] = Node(
            id=name,
            kind=comp_spec["kind"],
            impl=comp_spec["impl"],
            attrs=comp_spec["attrs"],
            pos_mm=(lx, ly),
            label=name.upper().replace("_", " "),
        )

    # Bridges
    for bridge in cube["components"]["xbar"]["bridges"]:
        tag = bridge["id"]
        bridge_id = f"bridge.{tag}"
        lx, ly = anchors[bridge_id]
        nodes[bridge_id] = Node(
            id=bridge_id,
            kind=bridge["kind"],
            impl=bridge["impl"],
            attrs=bridge["attrs"],
            pos_mm=(lx, ly),
            label=f"Bridge {tag.upper()}",
        )

    # PEs as opaque blocks + per-PE xbar entry nodes
    corners = cube["pe_layout"]["corners"]
    per_corner = cube["pe_layout"]["pe_per_corner"]
    corner_xy = _corner_pe_positions(cube_w, cube_h)
    entry_spec = cube["components"]["xbar"]["pe"]
    top_row_y = anchors["xbar.top"][1]
    bottom_row_y = anchors["xbar.bottom"][1]
    for corner_no, corner in enumerate(corners):
        north = corner in ("NW", "NE")
        entry_y = top_row_y if north else bottom_row_y
        reach_key = "pe_to_xbar_row_n_mm" if north else "pe_to_xbar_row_s_mm"
        for slot in range(per_corner):
            # PEs are numbered consecutively across corners in layout order.
            pe_no = corner_no * per_corner + slot
            pe_id = f"pe{pe_no}"
            entry_id = f"xbar.pe{pe_no}"
            px, py = corner_xy[corner][slot]
            nodes[pe_id] = Node(
                id=pe_id,
                kind="pe",
                impl="",
                attrs={"corner": corner},
                pos_mm=(px, py),
                label=f"PE{pe_no}",
            )
            # The xbar entry shares the PE's x and sits on the xbar row's y.
            nodes[entry_id] = Node(
                id=entry_id,
                kind=entry_spec["kind"],
                impl=entry_spec["impl"],
                attrs=entry_spec["attrs"],
                pos_mm=(px, entry_y),
                label=f"XBAR PE{pe_no}",
            )
            # PE → xbar.pe_i (HBM data path)
            link(
                pe_id, entry_id,
                distance_mm=wires[reach_key],
                bw_gbs=wires["pe_to_xbar_bw_gbs"],
                kind="pe_to_xbar",
            )
            # PE → noc (non-HBM data path)
            link(
                pe_id, "noc",
                distance_mm=wires["pe_dma_to_noc_mm"],
                bw_gbs=wires["pe_dma_to_noc_bw_gbs"],
                kind="pe_to_noc",
            )
            # noc → PE (command delivery)
            link(
                "noc", pe_id,
                distance_mm=wires["noc_to_pe_cpu_mm"],
                kind="command",
            )

    # Cube fabric edges
    # xbar.pe_i → hbm_ctrl (the single representative node)
    for idx in range(slice_count):
        link(
            f"xbar.pe{idx}", "hbm_ctrl",
            distance_mm=wires["xbar_to_hbm_mm"],
            bw_gbs=wires["xbar_to_hbm_bw_gbs"],
            kind="xbar_to_hbm",
        )

    # xbar chain: neighbours within each half, both directions; a hop that
    # leaves a corner group uses the inter-corner distance.
    half = slice_count // 2
    for row_start in (0, half):
        for left in range(row_start, row_start + half - 1):
            leaves_corner = (left - row_start) % per_corner == per_corner - 1
            if leaves_corner:
                hop_mm = wires["xbar_chain_inter_corner_mm"]
            else:
                hop_mm = wires["xbar_chain_intra_corner_mm"]
            for a, b in ((left, left + 1), (left + 1, left)):
                link(
                    f"xbar.pe{a}", f"xbar.pe{b}",
                    distance_mm=hop_mm,
                    bw_gbs=wires["xbar_x_bw_gbs"],
                    kind="xbar_chain",
                )

    # bridge connections: first entries of both rows ↔ "left", last entries ↔ "right"
    for side, top_pe, bottom_pe in (("left", 0, half), ("right", half - 1, slice_count - 1)):
        bridge_id = f"bridge.{side}"
        for pe_i, reach in ((top_pe, "xbar_row_n_to_bridge_mm"),
                            (bottom_pe, "xbar_row_s_to_bridge_mm")):
            entry_id = f"xbar.pe{pe_i}"
            link(
                entry_id, bridge_id,
                distance_mm=wires[reach],
                bw_gbs=wires["xbar_to_bridge_bw_gbs"],
                kind="xbar_to_bridge",
            )
            link(
                bridge_id, entry_id,
                distance_mm=wires[reach],
                bw_gbs=wires["xbar_to_bridge_bw_gbs"],
                kind="bridge_to_xbar",
            )

    # noc → ucie ports (outbound direction only in this view)
    ucie_link = wires["noc_to_ucie"]
    for port in cube["ucie"]["ports"]:
        link(
            "noc", f"ucie-{port}",
            distance_mm=0.0,
            bw_gbs=ucie_link["per_connection_bw_gbs"],
            n_connections=ucie_link["n_connections"],
            kind="noc_to_ucie",
        )

    # m_cpu ↔ noc (command dispatch, both directions)
    for src, dst in (("m_cpu", "noc"), ("noc", "m_cpu")):
        link(src, dst, distance_mm=wires["m_cpu_to_noc_mm"], kind="command")

    # noc ↔ sram (shared SRAM access); both directions carry kind "noc_to_sram"
    sram_link = wires["noc_to_sram"]
    for src, dst in (("noc", "sram"), ("sram", "noc")):
        link(
            src, dst,
            distance_mm=wires["noc_to_sram_mm"],
            bw_gbs=sram_link["per_connection_bw_gbs"],
            n_connections=sram_link["n_connections"],
            kind="noc_to_sram",
        )

    return ViewGraph(
        name="cube", nodes=nodes, edges=links,
        width_mm=cube_w, height_mm=cube_h,
    )
 def _build_pe_view(spec: dict) -> ViewGraph:
    """Build the PE view: the components of the PE template on a 12 x 8 mm canvas.

    Every component in ``spec["cube"]["pe_template"]["components"]`` must have
    an entry in the fixed position table below; an unknown name raises
    ``KeyError``.  The edges mirror the intra-PE wiring: cpu → scheduler,
    scheduler → engines, engines → TCM.
    """
    template = spec["cube"]["pe_template"]
    wiring = template["links"]
    canvas_w, canvas_h = 12.0, 8.0
    # Fixed positions, left to right: cpu, scheduler, engine column, TCM.
    slots = {
        "pe_cpu": (1.5, 4.0),
        "pe_scheduler": (4.0, 4.0),
        "pe_dma": (7.0, 1.5),
        "pe_gemm": (7.0, 4.0),
        "pe_math": (7.0, 6.5),
        "pe_tcm": (10.0, 4.0),
    }

    nodes: dict[str, Node] = {}
    for part, part_spec in template["components"].items():
        x_mm, y_mm = slots[part]
        nodes[part] = Node(
            id=part,
            kind=part_spec["kind"],
            impl=part_spec["impl"],
            attrs=part_spec["attrs"],
            pos_mm=(x_mm, y_mm),
            label=part.upper().replace("_", " "),
        )

    links: list[Edge] = [
        Edge(
            src="pe_cpu", dst="pe_scheduler",
            distance_mm=wiring["pe_cpu_to_scheduler_mm"],
            kind="pe_internal",
        ),
    ]
    engines = ("pe_dma", "pe_gemm", "pe_math")
    # Control path: the scheduler fans out to every engine.
    dispatch_keys = ("scheduler_to_dma_mm", "scheduler_to_gemm_mm", "scheduler_to_math_mm")
    for engine, length_key in zip(engines, dispatch_keys):
        links.append(Edge(
            src="pe_scheduler", dst=engine,
            distance_mm=wiring[length_key],
            kind="pe_internal",
        ))
    # Data path: every engine reaches the TCM; these are the only edges with a bandwidth.
    tcm_keys = (
        ("dma_to_tcm_mm", "dma_to_tcm_bw_gbs"),
        ("gemm_to_tcm_mm", "gemm_to_tcm_bw_gbs"),
        ("math_to_tcm_mm", "math_to_tcm_bw_gbs"),
    )
    for engine, (length_key, bw_key) in zip(engines, tcm_keys):
        links.append(Edge(
            src=engine, dst="pe_tcm",
            distance_mm=wiring[length_key],
            bw_gbs=wiring[bw_key],
            kind="pe_internal",
        ))

    return ViewGraph(
        name="pe", nodes=nodes, edges=links,
        width_mm=canvas_w, height_mm=canvas_h,
    )
@@ -0,0 +1,56 @@
 from __future__ import annotations
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any
@dataclass
 class Node:
    """A single vertex of a topology graph or view."""
    id: str  # unique node id; used as the key of the owning ``nodes`` dict
    kind: str  # component category (e.g. "noc", "pe_dma", "cube")
    impl: str  # implementation name from the spec; "" for opaque view blocks
    attrs: dict[str, Any]  # free-form attributes taken from the spec
    pos_mm: tuple[float, float] | None  # (x_mm, y_mm); None for abstract nodes
    label: str = ""  # human-readable caption
@dataclass
 class Edge:
    """A directed link between two nodes, identified by their ids."""
    src: str                  # node id
    dst: str                  # node id
    distance_mm: float        # physical wire delay distance (ns = distance_mm * ns_per_mm)
    routing_weight_mm: float | None = None  # Dijkstra cost; None → use distance_mm
    bw_gbs: float | None = None  # bandwidth; None when the link declares none (e.g. command edges)
    n_connections: int | None = None  # multi-connection links; single request uses 1 connection
    kind: str = "link"  # edge category (e.g. "pcie", "ucie_mesh", "command")
@dataclass
 class ViewGraph:
    """A self-contained projection of the topology at one level, with its canvas size."""
    name: str                 # "system" | "sip" | "cube" | "pe"
    nodes: dict[str, Node]    # keyed by node id
    edges: list[Edge]
    width_mm: float           # canvas width
    height_mm: float          # canvas height
@dataclass
 class TopologyGraph:
    """The spec together with the flat instantiated graph and the four per-level views."""
    spec: dict[str, Any]  # the topology specification the graph was built from
    # Full instantiated flat graph (used by sim_engine)
    nodes: dict[str, Node] = field(default_factory=dict)
    edges: list[Edge] = field(default_factory=list)
    # Representative view projections (used by visualizer); None until assigned
    system_view: ViewGraph | None = None
    sip_view:    ViewGraph | None = None
    cube_view:   ViewGraph | None = None
    pe_view:     ViewGraph | None = None
@dataclass(frozen=True)
 class TopologyHandle:
    """Immutable pairing of a path with the graph object associated with it."""
    path: Path  # NOTE(review): presumably the topology file the graph is loaded from; confirm
    topology_obj: TopologyGraph | None  # None until _compile_graph is implemented
@@ -0,0 +1,367 @@
 # kernbench/topology/visualizer.py
 """
 SVG diagram generator for TopologyGraph views.
 Produces mm-accurate, deterministic SVG files for each view level
 (system, SIP, cube, PE) per ADR-0005 and ADR-0006.
 """
 from __future__ import annotations
 from pathlib import Path
 from .types import Edge, Node, TopologyGraph, ViewGraph
 # ── Color palette by component kind ─────────────────────────────────
 # Maps Node.kind to the fill color of the node rectangle. Kinds missing from
 # this table are drawn with the "#e2e8f0" fallback chosen in _draw_node().
 _KIND_COLORS: dict[str, str] = {
    "switch":       "#6366f1",  # indigo
    "sip":          "#e0e7ff",  # light indigo
    "iochiplet":    "#0ea5e9",  # sky blue
    "pcie_ep":      "#0ea5e9",
    "io_cpu":       "#0ea5e9",
    "ucie_port":    "#3b82f6",  # blue
    "noc":          "#a78bfa",  # purple
    "m_cpu":        "#f59e0b",  # amber
    "xbar":         "#f97316",  # orange
    "hbm_ctrl":     "#10b981",  # emerald
    "pe":           "#94a3b8",  # slate
    "pe_cpu":       "#ef4444",  # red
    "pe_scheduler": "#f59e0b",  # amber
    "pe_dma":       "#3b82f6",  # blue
    "pe_gemm":      "#8b5cf6",  # violet
    "pe_math":      "#ec4899",  # pink
    "pe_tcm":       "#10b981",  # emerald
    "sram":         "#f59e0b",  # amber
    "cube":         "#cbd5e1",  # slate-300
 }
 # Maps Edge.kind to the stroke color of the edge. Kinds missing from this
 # table are drawn with the "#94a3b8" fallback chosen in _draw_edge().
 _EDGE_COLORS: dict[str, str] = {
    "pcie":         "#6366f1",
    "io_internal":  "#0ea5e9",
    "io_to_cube":   "#0ea5e9",
    "ucie_mesh":    "#3b82f6",
    "pe_to_xbar":   "#f97316",
    "xbar_to_hbm":  "#10b981",
    "xbar_to_bridge": "#a78bfa",
    "bridge_to_xbar": "#a78bfa",
    "noc_to_ucie":  "#a78bfa",
    "pe_to_noc":    "#a78bfa",
    "noc_to_sram":  "#f59e0b",
    "command":      "#f59e0b",
    "pe_internal":  "#94a3b8",
 }
 # ── Node sizing ──────────────────────────────────────────────────────
 # Box dimensions are expressed in mm and converted to px by
 # _compute_node_sizes(), which also applies per-view overrides.
 _DEFAULT_NODE_W = 2.0  # mm
 _DEFAULT_NODE_H = 1.2  # mm
 # (width_mm, height_mm) for kinds that do not use the default box.
 _KIND_SIZE: dict[str, tuple[float, float]] = {
    "sip":  (60.0, 50.0),
    "cube": (6.0, 4.0),
    "iochiplet": (4.0, 1.5),
    "switch": (5.0, 1.5),
 }
 # ── Public API ───────────────────────────────────────────────────────
 def emit_diagrams(graph: TopologyGraph, out_dir: Path) -> list[Path]:
    """Write one SVG file per populated view of *graph* into *out_dir*.

    The directory (and any missing parents) is created first. Views that are
    None are skipped. Files are named ``<attribute>.svg`` and the returned list
    holds the written paths in system, sip, cube, pe order.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    candidates = (
        ("system_view", graph.system_view),
        ("sip_view", graph.sip_view),
        ("cube_view", graph.cube_view),
        ("pe_view", graph.pe_view),
    )
    written: list[Path] = []
    for stem, view in candidates:
        if view is not None:
            markup = _render_view_svg(view)
            target = out_dir / f"{stem}.svg"
            target.write_text(markup, encoding="utf-8")
            written.append(target)
    return written
 # ── SVG rendering ────────────────────────────────────────────────────
 def _render_view_svg(view: ViewGraph) -> str:
    """Render a ViewGraph to an SVG document string.

    Layers are emitted back to front: background, title, cube-only
    decorations (die outline and HBM block), edges, then nodes, so that
    nodes are painted on top of the edges that reach them.

    Args:
        view: the projection to draw; its ``name`` selects the px-per-mm scale
            and the cube-specific decorations.

    Returns:
        The complete SVG markup, one element per line.
    """
    scale = _pick_scale(view)
    pad = 40  # px padding
    node_sizes = _compute_node_sizes(view, scale)
    # Canvas size in px
    w_px = int(view.width_mm * scale + 2 * pad)
    h_px = int(view.height_mm * scale + 2 * pad)
    parts: list[str] = []
    parts.append(_svg_header(w_px, h_px, view.name))
    # Background
    parts.append(f'  <rect width="{w_px}" height="{h_px}" fill="#f8fafc"/>')
    # Title: the view name is escaped like node labels are, so a name that
    # contains "&", "<" or ">" cannot produce malformed markup.
    parts.append(
        f'  <text x="{w_px // 2}" y="18" text-anchor="middle" '
        f'font-family="monospace" font-size="14" font-weight="bold" fill="#1e293b">'
        f'{_escape(view.name.upper())} VIEW</text>'
    )
    # Special: draw cube boundary + HBM block background in cube view
    if view.name == "cube":
        _draw_cube_boundary(parts, view, scale, pad)
        _draw_hbm_block(parts, view, scale, pad)
    # Edges (draw before nodes so nodes are on top)
    # Track fan-out edges to assign per-edge offsets
    fanout_counter: dict[str, int] = {}
    for edge in view.edges:
        # Edges whose endpoints are not part of this view are not drawn.
        if edge.src in view.nodes and edge.dst in view.nodes:
            _draw_edge(parts, edge, view, node_sizes, scale, pad, fanout_counter)
    # Nodes
    for node in view.nodes.values():
        _draw_node(parts, node, node_sizes, scale, pad)
    parts.append("</svg>")
    return "\n".join(parts)
 def _pick_scale(view: ViewGraph) -> float:
    """Pixels per mm, chosen per view type."""
    return {
        "system": 4.0,
        "sip": 8.0,
        "cube": 28.0,
        "pe": 35.0,
    }.get(view.name, 10.0)
 def _compute_node_sizes(
    view: ViewGraph, scale: float,
 ) -> dict[str, tuple[float, float]]:
    """Map every node id in *view* to its box size ``(w_px, h_px)``.

    Sizes start from the per-kind table (or the default box) in mm. PE nodes
    in the cube view use a smaller box, and every node in the PE view uses
    one fixed box. The mm size is then multiplied by *scale*.
    """
    fallback_mm = (_DEFAULT_NODE_W, _DEFAULT_NODE_H)
    is_cube_view = view.name == "cube"
    is_pe_view = view.name == "pe"
    px_sizes: dict[str, tuple[float, float]] = {}
    for node_id, node in view.nodes.items():
        if is_pe_view:
            # PE view: uniform boxes regardless of kind
            box_mm = (2.5, 1.4)
        elif is_cube_view and node.kind == "pe":
            # Cube view: compact PE boxes
            box_mm = (1.8, 1.0)
        else:
            box_mm = _KIND_SIZE.get(node.kind, fallback_mm)
        px_sizes[node_id] = (box_mm[0] * scale, box_mm[1] * scale)
    return px_sizes
 def _svg_header(w: int, h: int, title: str) -> str:
    """Return the opening ``<svg>`` tag plus a ``<title>`` child element.

    Args:
        w: canvas width in px; also used as the viewBox width.
        h: canvas height in px; also used as the viewBox height.
        title: document title. It is XML-escaped so that "&", "<" and ">"
            in a view name cannot break the markup.
    """
    return (
        f'<svg xmlns="http://www.w3.org/2000/svg" '
        f'width="{w}" height="{h}" viewBox="0 0 {w} {h}">\n'
        f'  <title>{_escape(title)}</title>'
    )
 def _draw_cube_boundary(
    parts: list[str], view: ViewGraph, scale: float, pad: int,
 ) -> None:
    """Draw the cube die outline as a dashed rectangle."""
    bx = pad
    by = pad
    bw = view.width_mm * scale
    bh = view.height_mm * scale
    parts.append(
        f'  <rect x="{bx:.1f}" y="{by:.1f}" '
        f'width="{bw:.1f}" height="{bh:.1f}" '
        f'rx="6" fill="none" stroke="#475569" stroke-width="2" '
        f'stroke-dasharray="8,4"/>'
    )
 def _draw_hbm_block(
    parts: list[str], view: ViewGraph, scale: float, pad: int,
 ) -> None:
    """Draw HBM area as a filled rectangle in cube view."""
    # HBM area: centered at (8.5, 7.0), size 9x5 -> x=[4.0,13.0], y=[4.5,9.5]
    hbm_x = 4.0 * scale + pad
    hbm_y = 4.5 * scale + pad
    hbm_w = 9.0 * scale
    hbm_h = 5.0 * scale
    parts.append(
        f'  <rect x="{hbm_x:.1f}" y="{hbm_y:.1f}" '
        f'width="{hbm_w:.1f}" height="{hbm_h:.1f}" '
        f'rx="4" fill="#d1fae5" stroke="#10b981" stroke-width="1.5" '
        f'stroke-dasharray="6,3" opacity="0.5"/>'
    )
    cx = 8.5 * scale + pad
    cy = 8.5 * scale + pad
    parts.append(
        f'  <text x="{cx:.1f}" y="{cy:.1f}" text-anchor="middle" '
        f'font-family="monospace" font-size="11" fill="#047857" opacity="0.7">'
        f'HBM</text>'
    )
 def _draw_node(
    parts: list[str],
    node: Node,
    sizes: dict[str, tuple[float, float]],
    scale: float,
    pad: int,
 ) -> None:
    """Append a rounded box and a centered caption for *node*.

    Nodes without a position are not drawn. The caption is the node's label,
    or its id when the label is empty, and is XML-escaped. The caption color
    is white on dark fills and dark slate otherwise.
    """
    position = node.pos_mm
    if position is None:
        return
    center_x = position[0] * scale + pad
    center_y = position[1] * scale + pad
    box_w, box_h = sizes.get(node.id, (40, 24))
    fill = _KIND_COLORS.get(node.kind, "#e2e8f0")
    # pos_mm is the box center, so shift by half the size for the top-left corner
    left = center_x - box_w / 2
    top = center_y - box_h / 2
    box_markup = (
        f'  <rect x="{left:.1f}" y="{top:.1f}" width="{box_w:.1f}" height="{box_h:.1f}" '
        f'rx="4" fill="{fill}" stroke="#475569" stroke-width="1"/>'
    )
    parts.append(box_markup)
    caption = node.label if node.label else node.id
    ink = "#ffffff" if _is_dark(fill) else "#1e293b"
    pt = _label_font_size(box_w, caption)
    caption_markup = (
        f'  <text x="{center_x:.1f}" y="{center_y + 4:.1f}" text-anchor="middle" '
        f'font-family="monospace" font-size="{pt}" fill="{ink}">'
        f'{_escape(caption)}</text>'
    )
    parts.append(caption_markup)
 # ── Fan-out edge kinds that need offset routing ─────────────────────
 # _draw_edge() gives these kinds a staggered three-segment route, but only
 # in the cube view; elsewhere they are drawn like any other edge.
 _FANOUT_KINDS = {"pe_to_xbar", "pe_to_noc", "command", "noc_to_ucie"}
 def _draw_edge(
    parts: list[str],
    edge: Edge,
    view: ViewGraph,
    sizes: dict[str, tuple[float, float]],
    scale: float,
    pad: int,
    fanout_counter: dict[str, int],
 ) -> None:
    """Append the SVG for one edge and, when its distance is positive, a label.

    Routing:
      * fan-out kinds in the cube view: a three-segment polyline
        (vertical, horizontal, vertical) whose horizontal channel is
        staggered per edge;
      * other edges whose endpoints differ by more than 1 px on both axes:
        an L-shaped polyline;
      * everything else: a straight line.

    Args:
        parts: output list of SVG fragments; appended to in place.
        edge: the edge to draw. Both endpoints must be keys of ``view.nodes``
            (the caller checks this); a missing id raises KeyError here.
        view: the view being rendered; supplies node positions and the name.
        sizes: per-node box sizes; not read by this function.
        scale: px per mm.
        pad: px offset applied to both axes.
        fanout_counter: running count of fan-out edges per "kind:dst" group,
            mutated here so that successive edges get different channels.
    """
    nodes = view.nodes
    src_node = nodes[edge.src]
    dst_node = nodes[edge.dst]
    # Edges touching an abstract (unpositioned) node cannot be drawn.
    if src_node.pos_mm is None or dst_node.pos_mm is None:
        return
    # Endpoints are the node centers in px.
    x1 = src_node.pos_mm[0] * scale + pad
    y1 = src_node.pos_mm[1] * scale + pad
    x2 = dst_node.pos_mm[0] * scale + pad
    y2 = dst_node.pos_mm[1] * scale + pad
    color = _EDGE_COLORS.get(edge.kind, "#94a3b8")
    width = "1.5" if edge.kind == "pe_internal" else "1"
    opacity = "0.6" if edge.kind in ("command", "noc_to_ucie") else "0.8"
    if edge.kind in _FANOUT_KINDS and view.name == "cube":
        # Orthogonal routing: src→vertical→horizontal→vertical→dst with per-edge offset.
        # Edges are grouped by kind and destination; idx is this edge's
        # 0-based position within its group.
        group_key = f"{edge.kind}:{edge.dst}"
        idx = fanout_counter.get(group_key, 0)
        fanout_counter[group_key] = idx + 1
        # Route: go vertically from src to a staggered horizontal channel,
        # then horizontally to dst x, then vertically to dst.
        # Channels are 10 px apart; the -1.5 shift centers groups of four
        # around the midpoint (assumes about four edges per group — TODO confirm).
        mid_y = (y1 + y2) / 2 + (idx - 1.5) * 10  # spread channels vertically
        parts.append(
            f'  <polyline points="{x1:.1f},{y1:.1f} {x1:.1f},{mid_y:.1f} '
            f'{x2:.1f},{mid_y:.1f} {x2:.1f},{y2:.1f}" '
            f'fill="none" stroke="{color}" stroke-width="{width}" opacity="{opacity}"/>'
        )
        # Label on the horizontal segment
        if edge.distance_mm > 0:
            lx = (x1 + x2) / 2
            label = f"{edge.distance_mm:.1f}mm"
            # Bandwidth is appended only when set and non-zero.
            if edge.bw_gbs:
                label += f" {edge.bw_gbs:.0f}GB/s"
            parts.append(
                f'  <text x="{lx:.1f}" y="{mid_y - 3:.1f}" text-anchor="middle" '
                f'font-family="monospace" font-size="7" fill="#64748b">'
                f'{label}</text>'
            )
        return
    # Non-fanout: orthogonal L-bend
    if abs(x2 - x1) > 1 and abs(y2 - y1) > 1:
        # PE view: vertical-first for left→right edges (scheduler→engines),
        # horizontal-first for right→right edges (engines→tcm)
        if view.name == "pe":
            if src_node.pos_mm[0] < view.width_mm / 2:
                # Source in left half: vertical-first (scheduler fan-out)
                parts.append(
                    f'  <polyline points="{x1:.1f},{y1:.1f} {x1:.1f},{y2:.1f} {x2:.1f},{y2:.1f}" '
                    f'fill="none" stroke="{color}" stroke-width="{width}" opacity="{opacity}"/>'
                )
            else:
                # Source in right half: horizontal-first (dma/math→tcm)
                parts.append(
                    f'  <polyline points="{x1:.1f},{y1:.1f} {x2:.1f},{y1:.1f} {x2:.1f},{y2:.1f}" '
                    f'fill="none" stroke="{color}" stroke-width="{width}" opacity="{opacity}"/>'
                )
        else:
            # All other views: horizontal-first L-bend
            parts.append(
                f'  <polyline points="{x1:.1f},{y1:.1f} {x2:.1f},{y1:.1f} {x2:.1f},{y2:.1f}" '
                f'fill="none" stroke="{color}" stroke-width="{width}" opacity="{opacity}"/>'
            )
    else:
        # Endpoints are (nearly) aligned on one axis: a straight line suffices.
        parts.append(
            f'  <line x1="{x1:.1f}" y1="{y1:.1f}" x2="{x2:.1f}" y2="{y2:.1f}" '
            f'stroke="{color}" stroke-width="{width}" opacity="{opacity}"/>'
        )
    # Distance label at the midpoint between the two endpoints. For an
    # L-bend this point is not on the drawn path.
    if edge.distance_mm > 0:
        mx = (x1 + x2) / 2
        my = (y1 + y2) / 2
        label = f"{edge.distance_mm:.1f}mm"
        if edge.bw_gbs:
            label += f" {edge.bw_gbs:.0f}GB/s"
        parts.append(
            f'  <text x="{mx:.1f}" y="{my - 4:.1f}" text-anchor="middle" '
            f'font-family="monospace" font-size="7" fill="#64748b">'
            f'{label}</text>'
        )
 # ── Helpers ──────────────────────────────────────────────────────────
 def _is_dark(hex_color: str) -> bool:
    """Check if a hex color is dark (for white text)."""
    h = hex_color.lstrip("#")
    r, g, b = int(h[0:2], 16), int(h[2:4], 16), int(h[4:6], 16)
    return (r * 0.299 + g * 0.587 + b * 0.114) < 140
 def _label_font_size(box_width: float, label: str) -> int:
    """Choose font size to fit label in box."""
    char_w = len(label) * 7
    if char_w > box_width * 0.9:
        return max(7, int(box_width * 0.9 / len(label) * 1.4))
    return 10
 def _escape(text: str) -> str:
    """Escape XML special characters."""
    return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
@@ -0,0 +1,11 @@
 """Triton emulator: fake tl module for kernel performance simulation.
 Provides TLContext (the fake `tl` parameter) that kernels use to express
 memory access patterns and compute operations. Kernel functions are plain
 Python — no yield, no async — and generate a PeCommand trace that PE_CPU
 replays through SimPy.
 Usage:
    from kernbench.triton_emu.registry import register_kernel, get_kernel
    from kernbench.triton_emu.tl_context import TLContext
 """
@@ -0,0 +1,30 @@
 """Kernel registry: maps kernel names to plain Python callables.
 Benchmarks register kernel functions here; PE_CPU looks them up by
 KernelRef.name at execution time.
 """
 from __future__ import annotations
 from collections.abc import Callable
 from typing import Any
 # Process-wide name → kernel function table; mutated only through
 # register_kernel() and clear_registry().
 _kernels: dict[str, Callable[..., None]] = {}
 def register_kernel(name: str, fn: Callable[..., None]) -> None:
    """Add *fn* to the registry under *name*.

    Raises:
        ValueError: if *name* is already taken. Existing entries are never
            overwritten.
    """
    if name not in _kernels:
        _kernels[name] = fn
        return
    raise ValueError(f"kernel '{name}' already registered")
 def get_kernel(name: str) -> Callable[..., None]:
    """Return the kernel function registered under *name*.

    Raises:
        KeyError: if nothing was registered under *name*.
    """
    absent = object()  # private sentinel, distinct from any stored value
    found = _kernels.get(name, absent)
    if found is absent:
        raise KeyError(f"kernel '{name}' not registered")
    return found
 def clear_registry() -> None:
    """Empty the kernel registry in place (intended for test isolation)."""
    # Clear the existing dict rather than rebinding it, so every reference
    # to the module-level table sees the reset.
    registry = _kernels
    registry.clear()
@@ -0,0 +1,356 @@
 """TLContext: fake Triton Language module for kernel performance simulation.
 Passed as the `tl` parameter to kernel functions. Each API call records a
 PeCommand in the internal trace. After the kernel returns, PE_CPU replays
 the command list through SimPy.
 Kernel code looks like standard Python — no yield, no async:
    def my_kernel(a_ptr, b_ptr, out_ptr, tl):
        pid = tl.program_id(0)
        a = tl.load(a_ptr, shape=(32, 64), dtype="f16")
        b = tl.load(b_ptr + pid * stride, shape=(64, 32), dtype="f16")
        tl.composite(op="gemm", a=a, b=b, out_ptr=out_ptr)
 """
 from __future__ import annotations
 import math
 from typing import Literal
 from kernbench.common.pe_commands import (
    CompletionHandle,
    CompositeCmd,
    DmaReadCmd,
    DmaWriteCmd,
    GemmCmd,
    MathCmd,
    PeCommand,
    PeCpuOverheadCmd,
    TensorHandle,
    WaitCmd,
 )
 # Element size in bytes per dtype name. Names missing from this table are
 # sized as 2 bytes by TLContext._dtype_bytes().
 _DTYPE_BYTES: dict[str, int] = {
    "f16": 2, "f32": 4, "f64": 8,
    "bf16": 2,
    "i8": 1, "i16": 2, "i32": 4, "i64": 8,
    "u8": 1, "u16": 2, "u32": 4, "u64": 8,
 }
 class TLContext:
    """Fake Triton Language context.

    Every API call that models work appends one or more PeCommand objects to
    an internal list; nothing is executed here. Blocking engine calls are
    preceded by an automatically inserted PeCpuOverheadCmd (see
    ``dispatch_cycles``).

    Args:
        pe_id: program instance index (returned by program_id).
        num_programs: total number of program instances.
        dispatch_cycles: PE_CPU overhead per tl API call (auto-inserted).
    """
    def __init__(
        self,
        pe_id: int = 0,
        num_programs: int = 1,
        dispatch_cycles: int = 1,
    ) -> None:
        self._pe_id = pe_id
        self._num_programs = num_programs
        self._dispatch_cycles = dispatch_cycles
        self._commands: list[PeCommand] = []
        # Monotonic counters behind the "t<N>" tensor ids and "c<N>" completion ids.
        self._handle_counter = 0
        self._completion_counter = 0
    @property
    def commands(self) -> list[PeCommand]:
        """Return the recorded command trace.

        This is the internal list itself, not a copy.
        """
        return self._commands
    # ── helpers ────────────────────────────────────────────────────
    def _next_handle_id(self) -> str:
        """Return the next tensor handle id ("t1", "t2", ...)."""
        self._handle_counter += 1
        return f"t{self._handle_counter}"
    def _next_completion_id(self) -> str:
        """Return the next completion handle id ("c1", "c2", ...)."""
        self._completion_counter += 1
        return f"c{self._completion_counter}"
    def _dtype_bytes(self, dtype: str) -> int:
        """Return the element size of *dtype* in bytes."""
        # NOTE(review): unknown dtype names silently count as 2 bytes, so a
        # typo in a dtype string yields a wrong nbytes instead of an error.
        return _DTYPE_BYTES.get(dtype, 2)
    def _nbytes(self, shape: tuple[int, ...], dtype: str) -> int:
        """Return the byte size of a dense tensor of *shape* and *dtype*."""
        return math.prod(shape) * self._dtype_bytes(dtype)
    def _emit_dispatch_overhead(self) -> None:
        """Record the per-call PE_CPU overhead, unless it is configured as <= 0."""
        if self._dispatch_cycles > 0:
            self._commands.append(PeCpuOverheadCmd(cycles=self._dispatch_cycles))
    def _make_handle(
        self, pa: int, shape: tuple[int, ...], dtype: str,
    ) -> TensorHandle:
        """Create a TensorHandle with a fresh id and a computed nbytes."""
        return TensorHandle(
            id=self._next_handle_id(),
            pa=pa, shape=shape, dtype=dtype,
            nbytes=self._nbytes(shape, dtype),
        )
    # ── Reference (no DMA, metadata only) ────────────────────────
    def ref(
        self, ptr: int, shape: tuple[int, ...], dtype: str = "f16",
    ) -> TensorHandle:
        """Create a TensorHandle referencing HBM data without issuing DMA.
        Used when the scheduler will stream data per-tile (e.g., tensor b
        in a composite GEMM). No command is generated.
        """
        return self._make_handle(pa=ptr, shape=shape, dtype=dtype)
    # ── Data Movement (blocking, DMA engine) ──────────────────────
    def load(
        self, ptr: int, shape: tuple[int, ...], dtype: str = "f16",
    ) -> TensorHandle:
        """Load tensor from HBM to TCM. Returns TensorHandle.

        Records the dispatch overhead followed by a DmaReadCmd for the full
        tensor size.
        """
        self._emit_dispatch_overhead()
        handle = self._make_handle(pa=ptr, shape=shape, dtype=dtype)
        self._commands.append(DmaReadCmd(
            handle=handle, src_pa=ptr, nbytes=handle.nbytes,
        ))
        return handle
    def store(self, ptr: int, handle: TensorHandle) -> None:
        """Store tensor from TCM to HBM.

        Records the dispatch overhead followed by a DmaWriteCmd of
        ``handle.nbytes`` bytes to *ptr*.
        """
        self._emit_dispatch_overhead()
        self._commands.append(DmaWriteCmd(
            handle=handle, dst_pa=ptr, nbytes=handle.nbytes,
        ))
    # ── GEMM Engine (blocking) ────────────────────────────────────
    def dot(self, a: TensorHandle, b: TensorHandle) -> TensorHandle:
        """Matrix multiply: out = a @ b. Both operands must be in TCM.
        a: (M, K), b: (K, N) → out: (M, N)

        Only the last two dimensions are used for M, K and N; any leading
        dimensions of *a* are copied into the output shape. The output takes
        a's dtype.

        Raises:
            ValueError: if either operand has fewer than two dimensions, or
                the inner dimensions differ.
        """
        if len(a.shape) < 2 or len(b.shape) < 2:
            raise ValueError("dot requires 2D tensors")
        m, k = a.shape[-2], a.shape[-1]
        k2, n = b.shape[-2], b.shape[-1]
        if k != k2:
            raise ValueError(f"dot shape mismatch: a.K={k} != b.K={k2}")
        out_shape = (*a.shape[:-2], m, n)
        out_dtype = a.dtype
        # Engine results carry pa=0: they are not tied to an HBM address here.
        out = self._make_handle(pa=0, shape=out_shape, dtype=out_dtype)
        self._emit_dispatch_overhead()
        self._commands.append(GemmCmd(a=a, b=b, out=out, m=m, k=k, n=n))
        return out
    # ── MATH Engine: unary (blocking) ─────────────────────────────
    def _unary_math(self, op: str, x: TensorHandle) -> TensorHandle:
        """Record a one-input MathCmd; the output keeps x's shape and dtype."""
        out = self._make_handle(pa=0, shape=x.shape, dtype=x.dtype)
        self._emit_dispatch_overhead()
        self._commands.append(MathCmd(op=op, inputs=(x,), out=out))
        return out
    def exp(self, x: TensorHandle) -> TensorHandle:
        return self._unary_math("exp", x)
    def log(self, x: TensorHandle) -> TensorHandle:
        return self._unary_math("log", x)
    def sqrt(self, x: TensorHandle) -> TensorHandle:
        return self._unary_math("sqrt", x)
    def abs(self, x: TensorHandle) -> TensorHandle:
        return self._unary_math("abs", x)
    def sigmoid(self, x: TensorHandle) -> TensorHandle:
        return self._unary_math("sigmoid", x)
    def cos(self, x: TensorHandle) -> TensorHandle:
        return self._unary_math("cos", x)
    def sin(self, x: TensorHandle) -> TensorHandle:
        return self._unary_math("sin", x)
    # ── MATH Engine: reduction (blocking) ─────────────────────────
    def _reduction(
        self, op: str, x: TensorHandle, axis: int,
    ) -> TensorHandle:
        """Record a reduction MathCmd along *axis*.

        The reduced axis is kept with size 1 in the output shape. An
        out-of-range axis raises IndexError from the list assignment.
        """
        out_shape = list(x.shape)
        out_shape[axis] = 1
        out = self._make_handle(pa=0, shape=tuple(out_shape), dtype=x.dtype)
        self._emit_dispatch_overhead()
        self._commands.append(MathCmd(op=op, inputs=(x,), out=out, axis=axis))
        return out
    def sum(self, x: TensorHandle, axis: int) -> TensorHandle:
        return self._reduction("sum", x, axis)
    def max(self, x: TensorHandle, axis: int) -> TensorHandle:
        return self._reduction("max", x, axis)
    def min(self, x: TensorHandle, axis: int) -> TensorHandle:
        return self._reduction("min", x, axis)
    # ── MATH Engine: binary (blocking) ────────────────────────────
    def _binary_math(
        self, op: str, a: TensorHandle, b: TensorHandle,
    ) -> TensorHandle:
        """Record a two-input MathCmd.

        The output takes a's shape and dtype; b's shape is not checked and
        no broadcasting is modeled.
        """
        out = self._make_handle(pa=0, shape=a.shape, dtype=a.dtype)
        self._emit_dispatch_overhead()
        self._commands.append(MathCmd(op=op, inputs=(a, b), out=out))
        return out
    def where(
        self, cond: TensorHandle, a: TensorHandle, b: TensorHandle,
    ) -> TensorHandle:
        """Record a three-input "where" MathCmd; the output mirrors a's shape and dtype."""
        out = self._make_handle(pa=0, shape=a.shape, dtype=a.dtype)
        self._emit_dispatch_overhead()
        self._commands.append(MathCmd(op="where", inputs=(cond, a, b), out=out))
        return out
    # ── Index / Scalar (PE_CPU, no engine) ────────────────────────
    def program_id(self, axis: int = 0) -> int:
        """Return program instance index."""
        # NOTE(review): `axis` is ignored; every axis reports the same pe_id.
        return self._pe_id
    def num_programs(self, axis: int = 0) -> int:
        """Return total number of program instances."""
        # NOTE(review): `axis` is ignored here as well.
        return self._num_programs
    def arange(self, start: int, end: int, dtype: str = "i32") -> TensorHandle:
        """Create index range tensor in TCM.

        Only a handle of length ``end - start`` is produced; no command and
        no dispatch overhead are recorded.
        """
        n = end - start
        return self._make_handle(pa=0, shape=(n,), dtype=dtype)
    def zeros(self, shape: tuple[int, ...], dtype: str = "f16") -> TensorHandle:
        """Create zero-filled tensor in TCM.

        Handle only; no command is recorded.
        """
        return self._make_handle(pa=0, shape=shape, dtype=dtype)
    def full(
        self, shape: tuple[int, ...], value: float | int, dtype: str = "f16",
    ) -> TensorHandle:
        """Create constant-filled tensor in TCM.

        Handle only; *value* is accepted but not stored, and no command is
        recorded.
        """
        return self._make_handle(pa=0, shape=shape, dtype=dtype)
    # ── Metadata (no compute, no DMA) ─────────────────────────────
    def trans(self, x: TensorHandle) -> TensorHandle:
        """Transpose — shape change only, no command generated.

        Swaps the last two dimensions. The returned handle reuses x's id, pa,
        nbytes and data.

        Raises:
            ValueError: if *x* has fewer than two dimensions.
        """
        if len(x.shape) < 2:
            raise ValueError("trans requires at least 2D tensor")
        new_shape = (*x.shape[:-2], x.shape[-1], x.shape[-2])
        return TensorHandle(
            id=x.id, pa=x.pa, shape=new_shape,
            dtype=x.dtype, nbytes=x.nbytes, data=x.data,
        )
    # ── Composite + Control ───────────────────────────────────────
    def composite(
        self,
        op: Literal["gemm", "math"],
        a: TensorHandle,
        b: TensorHandle | None = None,
        out_ptr: int = 0,
        math_op: str | None = None,
    ) -> CompletionHandle:
        """Submit a composite command (non-blocking, tiled pipeline).
        Returns CompletionHandle for use with wait().

        The output size is M*N elements of a's dtype for a GEMM with both
        operands given, and ``a.nbytes`` in every other case.
        """
        # Compute output size based on op
        # NOTE(review): op="gemm" with b=None falls through to the else
        # branch and is sized like a math op — confirm that this is intended.
        if op == "gemm" and b is not None:
            # NOTE(review): `k` is unused, and leading (batch) dimensions of
            # `a` are not included in out_nbytes, unlike dot()'s output shape.
            m, k = a.shape[-2], a.shape[-1]
            n = b.shape[-1]
            out_dtype = a.dtype
            out_nbytes = m * n * self._dtype_bytes(out_dtype)
        else:
            out_nbytes = a.nbytes
        completion = CompletionHandle(id=self._next_completion_id())
        self._emit_dispatch_overhead()
        self._commands.append(CompositeCmd(
            completion=completion, op=op,
            a=a, b=b, out_pa=out_ptr, out_nbytes=out_nbytes,
            math_op=math_op,
        ))
        return completion
    def wait(self, handle: CompletionHandle | None = None) -> None:
        """Wait for a specific composite or all pending composites.

        Records a WaitCmd only; no dispatch overhead is inserted.
        """
        self._commands.append(WaitCmd(handle=handle))
    def cycles(self, n: int) -> None:
        """Declare PE_CPU scalar execution overhead (cycles).

        Records exactly one PeCpuOverheadCmd of *n* cycles, with no extra
        dispatch overhead.
        """
        self._commands.append(PeCpuOverheadCmd(cycles=n))
 # ── TensorHandle arithmetic operators ─────────────────────────────
 # Enables: a + b, a * b, a - b, a / b in kernel code.
 # Each operator records a MathCmd through TLContext._binary_math. The
 # context is not stored on the handle: it is looked up from a thread-local
 # slot that run_kernel() sets for the duration of the kernel call.
 def _enable_tensor_ops() -> None:
    """Install ``+ - * /`` on TensorHandle and context accessors on TLContext.

    Each operator records a binary MathCmd ("add", "sub", "mul", "div") on
    whichever TLContext is currently active for the calling thread. The
    active context lives in a thread-local and is exposed through
    ``TLContext._set_active`` / ``TLContext._get_active``.
    """
    import threading
    slot = threading.local()
    def set_active_context(ctx: TLContext | None) -> None:
        slot.ctx = ctx
    def get_active_context() -> TLContext:
        active = getattr(slot, "ctx", None)
        if active is not None:
            return active
        raise RuntimeError("TensorHandle ops require an active TLContext")
    def _binop(op: str):
        def method(self: TensorHandle, other: TensorHandle) -> TensorHandle:
            return get_active_context()._binary_math(op, self, other)
        return method
    # Operator dunder → MathCmd op name
    operator_table = (
        ("__add__", "add"),
        ("__sub__", "sub"),
        ("__mul__", "mul"),
        ("__truediv__", "div"),
    )
    for dunder, op_name in operator_table:
        setattr(TensorHandle, dunder, _binop(op_name))
    # Context management hooks
    setattr(TLContext, "_set_active", staticmethod(set_active_context))
    setattr(TLContext, "_get_active", staticmethod(get_active_context))
 # Import-time side effect: patches TensorHandle and TLContext once.
 _enable_tensor_ops()
 def run_kernel(
    kernel_fn,
    tl_ctx: TLContext,
    *args,
    **kwargs,
 ) -> list[PeCommand]:
    """Execute a kernel function with the given TLContext and return commands.

    Sets tl_ctx as the active context for TensorHandle operators and calls
    ``kernel_fn(*args, tl=tl_ctx, **kwargs)``. Afterwards it restores whatever
    context was active before, even if the kernel raises. Restoring, rather
    than clearing, keeps an outer kernel's operators working when run_kernel
    is invoked again from inside a kernel.

    Args:
        kernel_fn: callable accepting the positional args plus a ``tl`` keyword.
        tl_ctx: context that records the kernel's commands.
        *args: positional arguments forwarded to the kernel.
        **kwargs: keyword arguments forwarded to the kernel (must not
            contain ``tl``).

    Returns:
        The command list recorded on *tl_ctx*.
    """
    # _get_active() raises RuntimeError when no context is active.
    try:
        previous = TLContext._get_active()  # type: ignore[attr-defined]
    except RuntimeError:
        previous = None
    TLContext._set_active(tl_ctx)  # type: ignore[attr-defined]
    try:
        kernel_fn(*args, tl=tl_ctx, **kwargs)
    finally:
        TLContext._set_active(previous)  # type: ignore[attr-defined]
    return tl_ctx.commands
@@ -0,0 +1,22 @@
 import kernbench.cli.main as cli_main
 def test_cli_main_arg_parsing(monkeypatch):
    """`run` arguments are parsed and handed to cmd_run; its return code is propagated."""
    def fake_cmd_run(args) -> int:
        assert args.cmd == "run"
        assert args.topology == "topology.yaml"
        assert args.bench == "qkv_gemm"
        # --device was not passed; compare with `is` because None is a singleton
        assert args.device is None
        return 0
    # monkey patch the handler to test arg parsing without running the actual bench
    monkeypatch.setattr(cli_main, "cmd_run", fake_cmd_run)
    rc = cli_main.main(["run", "--topology", "topology.yaml", "--bench", "qkv_gemm"])
    assert rc == 0
 def test_cli_main():
    """End-to-end smoke test: the real `run` command exits with status 0."""
    from pathlib import Path
    # Anchor the topology file on this test module, as the other test modules
    # do, so the test does not depend on the directory pytest is started from.
    topology = Path(__file__).parent.parent / "topology.yaml"
    rc = cli_main.main(["run", "--topology", str(topology), "--bench", "qkv_gemm"])
    assert rc == 0
@@ -0,0 +1,187 @@
 """Tests for the SimPy component model and DI registry (ADR-0007 D3).
 Phase 1 verification: all tests FAIL until Phase 2 implements production code.
 Latency invariant after refactor:
  total_ns = Σ(wire propagation) + Σ(component.run() overhead_ns) + nbytes / bottleneck_bw
  This is identical to the current formula for Phase 0 (no contention).
 """
 import pytest
 import simpy
 from pathlib import Path
 from kernbench.components.base import ComponentBase, ComponentRegistry
 from kernbench.components.impls.forwarding import TransitComponent
 from kernbench.policy.address.phyaddr import PhysAddr
 from kernbench.runtime_api.kernel import MemoryReadMsg
 from kernbench.sim_engine.engine import GraphEngine
 from kernbench.topology.builder import load_topology
 from kernbench.topology.types import Node
 # topology.yaml one directory above this test module.
 TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
 def _graph():
    """Load a fresh topology graph from TOPOLOGY_PATH."""
    topology = load_topology(TOPOLOGY_PATH)
    return topology
 def _hbm_pa(pe_id: int = 0) -> int:
    """Return an encoded HBM address at offset 0x1000 in *pe_id*'s slice (rack 0, SIP 0, cube 0)."""
    total_hbm_bytes = 48 * (1 << 30)
    per_slice = total_hbm_bytes // 8
    addr = PhysAddr.pe_hbm_addr(
        rack_id=0,
        sip_id=0,
        cube_id=0,
        pe_id=pe_id,
        pe_local_hbm_offset=0x1000,
        slice_size_bytes=per_slice,
    )
    return addr.encode()
 def _node(impl: str, overhead_ns: float = 0.0) -> Node:
    """Build an unpositioned xbar node with id "test", the given impl and overhead."""
    attributes = {"overhead_ns": overhead_ns}
    return Node(
        id="test",
        kind="xbar",
        impl=impl,
        attrs=attributes,
        pos_mm=None,
    )
 # ── 1. unknown impl → error ──────────────────────────────────────────
 def test_registry_unknown_impl_raises_error():
    """ComponentRegistry.create() rejects an impl nobody registered (no silent fallback)."""
    unregistered = _node("totally_unknown_v99", overhead_ns=5.0)
    expectation = pytest.raises(ValueError, match="No component registered")
    with expectation:
        ComponentRegistry.create(unregistered)
 # ── 2. TransitComponent yields exactly overhead_ns via simpy timeout ──
 def test_transit_component_yields_overhead_ns():
    """Running a TransitComponent advances simulated time by its node's overhead_ns."""
    env = simpy.Environment()
    component = TransitComponent(_node("xbar_v1", overhead_ns=3.0))
    def drive():
        yield from component.run(env, nbytes=4096)
    env.process(drive())
    env.run()
    assert env.now == pytest.approx(3.0)
 def test_transit_component_zero_overhead_ns():
    """With overhead_ns=0 the component still completes, and no time elapses."""
    env = simpy.Environment()
    component = TransitComponent(_node("noc_v1", overhead_ns=0.0))
    finished = []
    def drive():
        yield from component.run(env, nbytes=1024)
        # Reached only if run() terminates.
        finished.append(True)
    env.process(drive())
    env.run()
    assert finished == [True]
    assert env.now == pytest.approx(0.0)
 # ── 3. DI override: custom component is invoked by engine ────────────
 def test_engine_component_override_is_called():
    """A class passed through component_overrides is actually run by the engine."""
    class SpyXbar(ComponentBase):
        calls = 0  # class-level counter, bumped on every run()
        def run(self, env, nbytes):
            SpyXbar.calls += 1
            yield env.timeout(0)
    SpyXbar.calls = 0
    engine = GraphEngine(_graph(), component_overrides={"xbar_v1": SpyXbar})
    read_msg = MemoryReadMsg(
        correlation_id="c",
        request_id="r",
        src_sip=0,
        src_cube=0,
        src_pe=0,
        src_pa=_hbm_pa(pe_id=0),
        nbytes=4096,
    )
    handle = engine.submit(read_msg)
    engine.wait(handle)
    # PE0→slice0 path passes through xbar.pe0 (impl=xbar_v1)
    assert SpyXbar.calls > 0
 # ── 4. behavior unchanged: total_ns matches existing formula ─────────
 def test_engine_component_model_same_latency_as_before():
    """Pin total_ns for a 4096 B read of PE0's local HBM slice at ~58.648 ns.

    Wire model is cut-through (wormhole): wires contribute propagation only.
    Serialization (drain) is computed per path and charged once at the terminal.

    Forward path:
      Path 1: pcie_ep(5.0) + wire(1.0mm=0.01) + io_cpu(10.0)
      Path 2: wire(3.5mm=0.035) + ucie-N(1.0)
              + 2DMeshNOC(ucie-N→m_cpu: Manhattan 10.9mm=0.109) + m_cpu(5.0)
      Path 3 DMA (m_cpu→noc→xbar.pe0→hbm_ctrl.slice0):
              + 2DMeshNOC(m_cpu→xbar.pe0: Manhattan 15.0mm=0.15)
              + xbar.pe0(2.0) + wire(2.5mm=0.025) + hbm_ctrl(0.0)
              + drain_ns(4096/128 = 32.0, bottleneck = noc_to_xbar 128 GB/s)

    Response path (reverse, nbytes=0, drain=0):
      DMA response: hbm_ctrl→xbar.pe0→noc→m_cpu (propagation + xbar overhead_ns)
      Command response: m_cpu→noc→ucie-N→io_cpu (propagation + ucie overhead_ns)

    Total:  ~58.648 ns
    """
    engine = GraphEngine(_graph())
    read_msg = MemoryReadMsg(
        correlation_id="c",
        request_id="r",
        src_sip=0,
        src_cube=0,
        src_pe=0,
        src_pa=_hbm_pa(pe_id=0),
        nbytes=4096,
    )
    handle = engine.submit(read_msg)
    engine.wait(handle)
    completion = engine.get_completion(handle)
    trace = completion[1]
    assert trace["total_ns"] == pytest.approx(58.648, rel=1e-4)
 # ── 5. override is scoped: only targeted impl is replaced ────────────
 def test_engine_override_is_scoped_to_impl():
    """Overriding xbar_v1 with a zero-latency impl shortens the read by exactly 4.0 ns.

    xbar.pe0 (overhead_ns=2.0) is crossed once on the forward DMA path and
    once on the response path, so removing its overhead saves 2.0 ns × 2.
    A difference of exactly 4.0 ns shows no other component was replaced.
    """
    class ZeroXbar(ComponentBase):
        def run(self, env, nbytes):
            yield env.timeout(0)
    graph = _graph()
    baseline_engine = GraphEngine(graph)
    patched_engine = GraphEngine(graph, component_overrides={"xbar_v1": ZeroXbar})
    read_msg = MemoryReadMsg(
        correlation_id="c",
        request_id="r",
        src_sip=0,
        src_cube=0,
        src_pe=0,
        src_pa=_hbm_pa(pe_id=0),
        nbytes=4096,
    )
    baseline_handle = baseline_engine.submit(read_msg)
    baseline_engine.wait(baseline_handle)
    baseline_trace = baseline_engine.get_completion(baseline_handle)[1]
    patched_handle = patched_engine.submit(read_msg)
    patched_engine.wait(patched_handle)
    patched_trace = patched_engine.get_completion(patched_handle)[1]
    # ZeroXbar removes overhead_ns=2.0 from xbar.pe0 on forward + response = 4.0 ns faster
    assert patched_trace["total_ns"] < baseline_trace["total_ns"]
    saved_ns = baseline_trace["total_ns"] - patched_trace["total_ns"]
    assert saved_ns == pytest.approx(4.0, rel=1e-6)
@@ -0,0 +1,405 @@
 import pytest
 from pathlib import Path
 from kernbench.common.types import Completion, RequestHandle
 from kernbench.policy.address.phyaddr import PhysAddr
 from kernbench.runtime_api.kernel import (
    KernelLaunchMsg,
    KernelRef,
    MemoryReadMsg,
    MemoryWriteMsg,
    ScalarArg,
    TensorArg,
    TensorArgShard,
 )
 from kernbench.sim_engine.engine import GraphEngine
 from kernbench.topology.builder import load_topology
 # topology.yaml located one directory above the directory containing this file.
 TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
 def _engine():
    """Return a new GraphEngine built from the topology at TOPOLOGY_PATH."""
    return GraphEngine(load_topology(TOPOLOGY_PATH))
 def _hbm_pa(sip: int = 0, cube: int = 0, pe_id: int = 0) -> int:
    """Return the encoded HBM physical address for one PE's HBM slice.

    The address is always built for rack 0 with a PE-local HBM offset of
    0x1000; only the SIP, cube and PE indices vary.
    """
    # 48 GB shared by 8 slices -> 6 GB per slice
    per_slice_bytes = (48 << 30) // 8
    return PhysAddr.pe_hbm_addr(
        rack_id=0,
        sip_id=sip,
        cube_id=cube,
        pe_id=pe_id,
        pe_local_hbm_offset=0x1000,
        slice_size_bytes=per_slice_bytes,
    ).encode()
 def _sram_pa(sip: int = 0, cube: int = 0) -> int:
    """Return the encoded SRAM physical address (rack 0, offset 0x800) of a cube."""
    return PhysAddr.cube_sram_addr(
        rack_id=0,
        sip_id=sip,
        cube_id=cube,
        sram_offset=0x800,
    ).encode()
 # ── 1. submit returns handle ────────────────────────────────────────
 def test_engine_submit_returns_handle():
    """submit() hands back a handle that is a non-empty str."""
    sim = _engine()
    handle = sim.submit(
        MemoryWriteMsg(
            correlation_id="c0",
            request_id="r0",
            dst_sip=0,
            dst_cube=0,
            dst_pe=0,
            dst_pa=_hbm_pa(),
            nbytes=4096,
            pattern="zero",
        )
    )
    assert isinstance(handle, str)
    assert len(handle) > 0
 # ── 2. memory write completion ──────────────────────────────────────
 def test_engine_memory_write_completion():
    """A MemoryWrite request finishes with a completion whose ok flag is True."""
    sim = _engine()
    write_msg = MemoryWriteMsg(
        correlation_id="c0",
        request_id="r1",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(),
        nbytes=4096,
        pattern="zero",
    )
    handle = sim.submit(write_msg)
    sim.wait(handle)
    # Only the completion matters here; the trace is ignored.
    completion, _ = sim.get_completion(handle)
    assert completion.ok is True
 # ── 3. memory read completion ───────────────────────────────────────
 def test_engine_memory_read_completion():
    """A MemoryRead request finishes with a completion whose ok flag is True."""
    sim = _engine()
    read_msg = MemoryReadMsg(
        correlation_id="c0",
        request_id="r2",
        src_sip=0,
        src_cube=0,
        src_pe=0,
        src_pa=_hbm_pa(),
        nbytes=4096,
    )
    handle = sim.submit(read_msg)
    sim.wait(handle)
    # Only the completion matters here; the trace is ignored.
    completion, _ = sim.get_completion(handle)
    assert completion.ok is True
 # ── 4. latency positive ────────────────────────────────────────────
 def test_engine_latency_positive():
    """The trace's total_ns for a completed write is strictly positive (ADR-0002 D4)."""
    sim = _engine()
    handle = sim.submit(
        MemoryWriteMsg(
            correlation_id="c0",
            request_id="r3",
            dst_sip=0,
            dst_cube=0,
            dst_pe=0,
            dst_pa=_hbm_pa(),
            nbytes=4096,
            pattern="zero",
        )
    )
    sim.wait(handle)
    _, timing = sim.get_completion(handle)
    assert timing["total_ns"] > 0
 # ── 5. trace has total_ns and nbytes ───────────────────────────────
 def test_engine_trace_has_total_ns_and_nbytes():
    """The trace exposes 'total_ns' and 'nbytes', and nbytes echoes the request size."""
    sim = _engine()
    write_msg = MemoryWriteMsg(
        correlation_id="c0",
        request_id="r4",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(),
        nbytes=4096,
        pattern="zero",
    )
    handle = sim.submit(write_msg)
    sim.wait(handle)
    _, timing = sim.get_completion(handle)
    for required_key in ("total_ns", "nbytes"):
        assert required_key in timing
    assert timing["nbytes"] == 4096
 # ── 6. latency includes node overhead_ns ────────────────────────────
 def test_engine_latency_includes_node_overhead_ns():
    """Total latency is at least the summed overhead_ns of the nodes on the path."""
    # Expected floor: pcie_ep (5.0) + io_cpu (10.0) + m_cpu (5.0) = 20 ns
    minimum_ns = 20.0
    sim = _engine()
    handle = sim.submit(
        MemoryWriteMsg(
            correlation_id="c0",
            request_id="r7",
            dst_sip=0,
            dst_cube=0,
            dst_pe=0,
            dst_pa=_hbm_pa(),
            nbytes=4096,
            pattern="zero",
        )
    )
    sim.wait(handle)
    _, timing = sim.get_completion(handle)
    assert timing["total_ns"] >= minimum_ns
 # ── 7. concurrent requests ─────────────────────────────────────────
 def test_engine_concurrent_requests():
    """Two writes submitted before any wait both complete and both carry a trace."""
    sim = _engine()
    # One write per PE (pe0, pe1) of cube 0; both are queued before waiting.
    writes = [
        MemoryWriteMsg(
            correlation_id="c0",
            request_id=request_id,
            dst_sip=0,
            dst_cube=0,
            dst_pe=pe,
            dst_pa=_hbm_pa(pe_id=pe),
            nbytes=4096,
            pattern="zero",
        )
        for request_id, pe in (("r9a", 0), ("r9b", 1))
    ]
    handles = [sim.submit(write) for write in writes]
    for handle in handles:
        sim.wait(handle)
    outcomes = [sim.get_completion(handle) for handle in handles]
    for completion, _ in outcomes:
        assert completion.ok is True
    for _, timing in outcomes:
        assert timing["total_ns"] > 0
 # ── 8. kernel launch ───────────────────────────────────────────────
 def test_engine_kernel_launch_simplified():
    """A KernelLaunch over a two-shard tensor completes ok with latency > 0.

    Registers a small "gemm" kernel that loads a 4x4 f16 tile from its pointer
    argument and stores it back, launches it with one tensor argument sharded
    across pe0 and pe1 of cube 0, and checks the completion and trace.

    The kernel registry is cleared before the test and again in a ``finally``
    block, so a failing assertion cannot leave the "gemm" kernel registered
    for tests that run afterwards.
    """
    from kernbench.triton_emu.registry import clear_registry, register_kernel

    def gemm_kernel(a_ptr, tl):
        a = tl.load(a_ptr, shape=(4, 4), dtype="f16")
        tl.store(a_ptr, a)

    clear_registry()
    try:
        register_kernel("gemm", gemm_kernel)
        engine = _engine()
        shard0 = TensorArgShard(
            sip=0, cube=0, pe=0,
            pa=_hbm_pa(pe_id=0), nbytes=4096, offset_bytes=0,
        )
        shard1 = TensorArgShard(
            sip=0, cube=0, pe=1,
            pa=_hbm_pa(pe_id=1), nbytes=4096, offset_bytes=4096,
        )
        msg = KernelLaunchMsg(
            correlation_id="c0", request_id="r10",
            kernel_ref=KernelRef(name="gemm", kind="builtin"),
            args=(TensorArg(shards=(shard0, shard1)),),
        )
        h = engine.submit(msg)
        engine.wait(h)
        comp, trace = engine.get_completion(h)
        assert comp.ok is True
        assert trace["total_ns"] > 0
    finally:
        # Runs on success and on failure alike: never leak the test kernel.
        clear_registry()
 # ── 9. deterministic ───────────────────────────────────────────────
 def test_engine_deterministic():
    """Replaying one request on two fresh engines yields the identical total_ns."""
    write_msg = MemoryWriteMsg(
        correlation_id="c0",
        request_id="r11",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(),
        nbytes=4096,
        pattern="zero",
    )

    def _latency_on_fresh_engine():
        # Each call builds its own engine so the runs share no state.
        sim = _engine()
        handle = sim.submit(write_msg)
        sim.wait(handle)
        _, timing = sim.get_completion(handle)
        return timing["total_ns"]

    first_ns = _latency_on_fresh_engine()
    second_ns = _latency_on_fresh_engine()
    # Exact equality is intended: the simulation must be bit-for-bit repeatable.
    assert first_ns == second_ns
 # ── 10. DMA capacity serializes concurrent writes ──────────────────
 def test_dma_capacity_serializes_concurrent():
    """Concurrent writes to one cube are slower than a lone write.

    The premise (ADR-0014 D4, ADR-0015 D5) is that the cube's DMA engine has
    capacity=1, so two MemoryWrite requests arriving at the same cube's M_CPU
    together are serialized. The slower of the pair must therefore take longer
    than one isolated write measured on a separate engine.
    """
    def _write_to_pe(request_id, pe):
        # 4 KiB zero-pattern write into the HBM slice of `pe` on sip0/cube0.
        return MemoryWriteMsg(
            correlation_id="c0",
            request_id=request_id,
            dst_sip=0,
            dst_cube=0,
            dst_pe=pe,
            dst_pa=_hbm_pa(sip=0, cube=0, pe_id=pe),
            nbytes=4096,
            pattern="zero",
            target_pe=pe,
        )

    # Baseline: one write alone on its own engine.
    solo_engine = _engine()
    solo_handle = solo_engine.submit(_write_to_pe("single", 0))
    solo_engine.wait(solo_handle)
    _, solo_trace = solo_engine.get_completion(solo_handle)
    single_ns = solo_trace["total_ns"]

    # Contended case: two writes to the same cube (different PEs), both
    # submitted before either is waited on.
    shared_engine = _engine()
    pending = [_write_to_pe("conc-a", 0), _write_to_pe("conc-b", 1)]
    handles = [shared_engine.submit(request) for request in pending]
    for handle in handles:
        shared_engine.wait(handle)
    latencies = [shared_engine.get_completion(handle)[1]["total_ns"] for handle in handles]

    # At least one of the pair must have been held up by the other.
    max_ns = max(latencies)
    assert max_ns > single_ns, (
        f"concurrent max ({max_ns:.2f}ns) must > single ({single_ns:.2f}ns) "
        f"due to DMA capacity=1 contention"
    )
 # ── 11. formula latency lower bound ──────────────────────────────
 def test_formula_latency_lower_bound():
    """_formula_latency never exceeds the simulated latency (ADR-0015 D7).

    The formula is evaluated on the routed path from sip0.cube0.pe0 to the
    node that owns pe1's HBM address; the simulation then runs a 4 KiB read
    of that same address on the same engine.
    """
    from kernbench.policy.routing.router import AddressResolver, PathRouter

    graph = load_topology(TOPOLOGY_PATH)
    engine = GraphEngine(graph)

    # Analytic estimate along the PE -> HBM path.
    pa = _hbm_pa(sip=0, cube=0, pe_id=1)
    target_node = AddressResolver(graph).resolve(PhysAddr.decode(pa))
    route = PathRouter(graph).find_path("sip0.cube0.pe0", target_node)
    formula = engine._formula_latency(route, 4096)

    # Simulated latency for the matching read.
    handle = engine.submit(
        MemoryReadMsg(
            correlation_id="c0",
            request_id="formula-lb",
            src_sip=0,
            src_cube=0,
            src_pe=0,
            src_pa=pa,
            nbytes=4096,
            target_pe=1,
        )
    )
    engine.wait(handle)
    _, trace = engine.get_completion(handle)
    actual = trace["total_ns"]

    assert formula <= actual, (
        f"formula ({formula:.2f}) must <= actual ({actual:.2f})"
    )
    assert formula > 0, "formula must be > 0"
 def test_formula_latency_exact_no_contention():
    """Without contention the formula matches the simulated PE DMA latency.

    A single PeDmaMsg has no fan-out or aggregation, so the analytic value
    is expected to agree with the simulation to within 0.01 ns.
    """
    from kernbench.policy.routing.router import AddressResolver, PathRouter
    from kernbench.runtime_api.kernel import PeDmaMsg

    graph = load_topology(TOPOLOGY_PATH)
    engine = GraphEngine(graph)

    # Analytic estimate along the route from pe0 to its own HBM slice.
    pa = _hbm_pa(sip=0, cube=0, pe_id=0)
    target_node = AddressResolver(graph).resolve(PhysAddr.decode(pa))
    route = PathRouter(graph).find_path("sip0.cube0.pe0", target_node)
    formula = engine._formula_latency(route, 4096)

    # Simulated latency of the one-and-only request on this engine.
    handle = engine.submit(
        PeDmaMsg(
            correlation_id="c0",
            request_id="formula-exact",
            src_sip=0,
            src_cube=0,
            src_pe=0,
            dst_pa=pa,
            nbytes=4096,
        )
    )
    engine.wait(handle)
    _, trace = engine.get_completion(handle)
    actual = trace["total_ns"]

    # No contention: the two numbers should coincide.
    assert abs(formula - actual) < 0.01, (
        f"formula ({formula:.4f}) ≈ actual ({actual:.4f}) expected with no contention"
    )
 # ── 12. remote cube access succeeds with higher latency ────────────
 def test_engine_remote_cube_latency_higher():
    """Reading a far cube's HBM takes strictly longer than reading the local one.

    Each read runs on its own engine so neither can contend with the other.
    The remote target is cube15, which the test treats as the far corner of a
    4x4 mesh: reaching it from the IO chiplet needs more UCIe + NOC hops than
    reaching the directly connected cube0.
    """
    def _read_from_cube(request_id, cube):
        # 4 KiB read issued from sip0/cube0/pe0 against pe0's HBM on `cube`.
        return MemoryReadMsg(
            correlation_id="c0",
            request_id=request_id,
            src_sip=0,
            src_cube=0,
            src_pe=0,
            src_pa=_hbm_pa(sip=0, cube=cube, pe_id=0),
            nbytes=4096,
        )

    engine_local = _engine()
    engine_remote = _engine()
    near_read = _read_from_cube("r14a", 0)
    far_read = _read_from_cube("r14b", 15)

    near_handle = engine_local.submit(near_read)
    engine_local.wait(near_handle)
    _, t_local = engine_local.get_completion(near_handle)

    far_handle = engine_remote.submit(far_read)
    engine_remote.wait(far_handle)
    far_completion, t_remote = engine_remote.get_completion(far_handle)

    assert far_completion.ok is True
    assert t_remote is not None and t_local is not None
    assert t_remote["total_ns"] > t_local["total_ns"], (
        f"remote cube {t_remote['total_ns']:.2f} must > local {t_local['total_ns']:.2f}"
    )
@@ -0,0 +1,269 @@
 """Phase A component infrastructure tests (ADR-0015).
 Verifies:
  - TransitComponent, IoCpuComponent apply overhead_ns via run()
  - HbmCtrlComponent and SramComponent act as terminal nodes (succeed done)
  - MCpuComponent forwards when not terminal; completes when terminal + no ctx
  - ComponentRegistry resolves impl strings to correct concrete classes
  - GraphEngine passes ComponentContext to every component
  - ComponentContext.router and .resolver are correctly populated
 """
 from __future__ import annotations
 from pathlib import Path
 from typing import Any
 import pytest
 import simpy
 from kernbench.components.base import ComponentBase, ComponentRegistry
 from kernbench.components.context import ComponentContext
 from kernbench.components.impls import (
    HbmCtrlComponent,
    IoCpuComponent,
    MCpuComponent,
    PcieEpComponent,
    SramComponent,
    TransitComponent,
 )
 from kernbench.sim_engine.engine import GraphEngine
 from kernbench.sim_engine.transaction import Transaction
 from kernbench.topology.builder import load_topology
 from kernbench.topology.types import Node
 # topology.yaml located one directory above the directory containing this file.
 TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
 def _node(impl: str, attrs: dict | None = None) -> Node:
    """Build a stand-alone test Node with id "test.node" and the given impl.

    A missing (or empty) ``attrs`` is replaced by a fresh empty dict.
    """
    node_attrs = attrs or {}
    return Node(
        id="test.node",
        kind="test",
        impl=impl,
        attrs=node_attrs,
        pos_mm=None,
    )
 def _run_worker(comp: ComponentBase, env: simpy.Environment, txn: Transaction) -> None:
    """Drive a component with a single transaction until that transaction is done.

    Attaches a fresh store as the component's "src" in-port, starts the
    component, feeds ``txn`` into the store and runs ``env`` until
    ``txn.done`` fires.
    """
    inbox: simpy.Store = simpy.Store(env)
    comp.in_ports["src"] = inbox
    comp.start(env)
    feeder = _inject(inbox, txn)
    env.process(feeder)
    env.run(until=txn.done)
 def _inject(store: simpy.Store, txn: Transaction):
    yield store.put(txn)
 # ── 1. run() latency: TransitComponent ───────────────────────────────
 def test_transit_component_run_overhead_ns():
    """Running a TransitComponent advances simulated time by its overhead_ns."""
    transit = TransitComponent(_node("forwarding_v1", {"overhead_ns": 7.5}))
    env = simpy.Environment()

    def _drive():
        yield from transit.run(env, nbytes=1024)

    env.process(_drive())
    env.run()
    assert env.now == pytest.approx(7.5)
 def test_transit_component_run_zero_overhead_ns():
    """With overhead_ns=0 the run() generator finishes without advancing time."""
    transit = TransitComponent(_node("noc_v1", {"overhead_ns": 0.0}))
    env = simpy.Environment()
    finished = []

    def _drive():
        yield from transit.run(env, nbytes=512)
        # Only reached once run() has been fully consumed.
        finished.append(True)

    env.process(_drive())
    env.run()
    assert finished == [True]
    assert env.now == pytest.approx(0.0)
 # ── 2. run() latency: IoCpuComponent ────────────────────────────────
 def test_io_cpu_component_run_overhead_ns():
    """Running an IoCpuComponent advances simulated time by its overhead_ns."""
    io_cpu = IoCpuComponent(_node("io_cpu_v1", {"overhead_ns": 10.0}))
    env = simpy.Environment()

    def _drive():
        yield from io_cpu.run(env, nbytes=2048)

    env.process(_drive())
    env.run()
    assert env.now == pytest.approx(10.0)
 # ── 3. Terminal: HbmCtrlComponent succeeds done ──────────────────────
 def test_hbm_ctrl_terminal_succeeds_done():
    """As the last hop of a path, HbmCtrlComponent triggers the transaction's done event."""
    hbm_ctrl = HbmCtrlComponent(_node("hbm_ctrl_v1", {"overhead_ns": 0.0, "capacity": 1}))
    env = simpy.Environment()
    completed = env.event()
    # Single-hop path: the component under test is the only (hence last) node.
    txn = Transaction(
        request=None,
        path=["test.node"],
        step=0,
        nbytes=256,
        done=completed,
    )
    _run_worker(hbm_ctrl, env, txn)
    assert completed.triggered
 def test_hbm_ctrl_resource_serializes_requests():
    """With capacity=1, two queued transactions are served one after the other."""
    hbm_ctrl = HbmCtrlComponent(_node("hbm_ctrl_v1", {"overhead_ns": 5.0, "capacity": 1}))
    env = simpy.Environment()
    inbox: simpy.Store = simpy.Store(env)
    hbm_ctrl.in_ports["src"] = inbox
    hbm_ctrl.start(env)

    first_done = env.event()
    second_done = env.event()
    queued = [
        Transaction(request=None, path=["test.node"], step=0, nbytes=0, done=finished)
        for finished in (first_done, second_done)
    ]

    def _feed():
        for txn in queued:
            yield inbox.put(txn)

    env.process(_feed())
    env.run(until=second_done)

    # Serialized service: first finishes at t=5, second at t=10.
    assert first_done.triggered
    assert second_done.triggered
    assert env.now == pytest.approx(10.0)
 # ── 4. Terminal: SramComponent succeeds done ─────────────────────────
 def test_sram_terminal_succeeds_done():
    """As the last hop, SramComponent triggers done after its 2.0 ns overhead."""
    sram = SramComponent(_node("sram_v1", {"overhead_ns": 2.0}))
    env = simpy.Environment()
    completed = env.event()
    txn = Transaction(
        request=None,
        path=["test.node"],
        step=0,
        nbytes=512,
        done=completed,
    )
    _run_worker(sram, env, txn)
    assert completed.triggered
    assert env.now == pytest.approx(2.0)
 # ── 5. MCpuComponent: forward when not terminal ──────────────────────
 def test_m_cpu_forwards_when_not_terminal():
    """A mid-path MCpuComponent passes the transaction on to the next hop.

    The transaction sits at step 1 of ["src", "test.node", "next"], so the
    component is expected to spend its 5.0 ns overhead and then put the
    transaction, advanced to step 2, on its "next" out-port.
    """
    m_cpu = MCpuComponent(_node("m_cpu_v1", {"overhead_ns": 5.0}))
    env = simpy.Environment()

    # One inbound and one outbound store for the path [src, test.node, next].
    inbox: simpy.Store = simpy.Store(env)
    outbox: simpy.Store = simpy.Store(env)
    m_cpu.in_ports["src"] = inbox
    m_cpu.out_ports["next"] = outbox
    m_cpu.start(env)

    completed = env.event()
    txn = Transaction(
        request=None,
        path=["src", "test.node", "next"],
        step=1,  # positioned at test.node, so the next hop is "next"
        nbytes=128,
        done=completed,
    )
    received: list[Any] = []

    def _next_hop():
        # Stands in for the downstream node: record the txn, then finish it.
        arrived = yield outbox.get()
        received.append(arrived)
        arrived.done.succeed()

    # The receiver is registered before the injector, as in the wiring order above.
    env.process(_next_hop())
    env.process(_inject(inbox, txn))
    env.run(until=completed)

    assert len(received) == 1
    assert received[0].step == 2  # step advanced by the component
    assert env.now == pytest.approx(5.0)
 # ── 6. MCpuComponent: terminal with no ctx just completes ────────────
 def test_m_cpu_terminal_no_ctx_completes():
    """An MCpuComponent built with ctx=None still triggers done when it is the last hop."""
    m_cpu = MCpuComponent(_node("m_cpu_v1", {"overhead_ns": 0.0}), ctx=None)
    env = simpy.Environment()
    completed = env.event()
    txn = Transaction(
        request=None,
        path=["test.node"],
        step=0,
        nbytes=64,
        done=completed,
    )
    _run_worker(m_cpu, env, txn)
    assert completed.triggered
 # ── 7. ComponentRegistry resolves impl strings ───────────────────────
@pytest.mark.parametrize("impl,expected_cls", [
    # The four transit-style impl strings all map to TransitComponent.
    ("forwarding_v1", TransitComponent),
    ("noc_v1",        TransitComponent),
    ("ucie_v1",       TransitComponent),
    ("xbar_v1",       TransitComponent),
    # Every remaining impl string has its own dedicated component class.
    ("pcie_ep_v1",    PcieEpComponent),
    ("io_cpu_v1",     IoCpuComponent),
    ("m_cpu_v1",      MCpuComponent),
    ("hbm_ctrl_v1",   HbmCtrlComponent),
    ("sram_v1",       SramComponent),
 ])
 def test_registry_resolves_impl(impl, expected_cls):
    """ComponentRegistry.create() returns the correct concrete class for each impl.

    Each case builds a test node carrying only ``overhead_ns=0.0`` and checks
    the created component with ``isinstance``, so a subclass of the expected
    class would also pass.
    """
    node = _node(impl, {"overhead_ns": 0.0})
    comp = ComponentRegistry.create(node)
    assert isinstance(comp, expected_cls)
 # ── 8. GraphEngine passes ComponentContext to components ─────────────
 def test_engine_passes_ctx_to_components():
    """Every component owned by a GraphEngine carries a ComponentContext in ``ctx``."""
    engine = GraphEngine(load_topology(TOPOLOGY_PATH))
    for node_id, component in engine._components.items():
        attached_ctx = component.ctx
        assert attached_ctx is not None, f"{node_id}: ctx is None"
        assert isinstance(attached_ctx, ComponentContext), f"{node_id}: ctx wrong type"
 def test_engine_ctx_router_and_resolver_populated():
    """The injected context exposes a PathRouter as ``router`` and an AddressResolver as ``resolver``."""
    from kernbench.policy.routing.router import AddressResolver, PathRouter

    engine = GraphEngine(load_topology(TOPOLOGY_PATH))
    # Spot-check: only the first component in the engine's mapping is inspected.
    sample = next(iter(engine._components.values()))
    assert isinstance(sample.ctx.router, PathRouter)
    assert isinstance(sample.ctx.resolver, AddressResolver)
--- a/Show More
+++ b/Show More
		`@@ -0,0 +1,2 @@`
							`def run(ctx):`
							`print("IPCQ all reduce kernel bench")`