From 6f43807900443cd0ab2abd163ed42447f8140d56 Mon Sep 17 00:00:00 2001
From: Yangwook <ywkang80@gmail.com>
Date: Wed, 18 Mar 2026 11:47:48 -0700
Subject: [PATCH] commit - release 1

---
 .claude/settings.json                         |   15 +
 .claude/settings.local.json                   |   36 +
 .editorconfig                                 |   12 +
 .gitignore                                    |   28 +
 .pylintrc                                     |    3 +
 .vscode/extensions.json                       |   16 +
 .vscode/launch.json                           |   55 +
 .vscode/settings.json                         |   39 +
 .vscode/tasks.json                            |  127 ++
 CLAUDE.md                                     |  196 +++
 README.md                                     |   13 +
 SPEC.md                                       |  327 +++++
 benches/__init__.py                           |    0
 benches/ipcq_allreduce.py                     |    2 +
 benches/loader.py                             |   37 +
 benches/qkv_gemm.py                           |   39 +
 benches/qkv_gemm_multi_pe.py                  |   39 +
 docs/adr/ADR-0001-physaddr-layout.md          |  108 ++
 docs/adr/ADR-0002-routing-distance.md         |  103 ++
 docs/adr/ADR-0003-target-system-hierarchy.md  |   64 +
 .../ADR-0004-memory-semantics-local-hbm.md    |   64 +
 .../ADR-0005-diagram-views-distance-layout.md |  186 +++
 ...6-topology-compilation-distance-diagram.md |  130 ++
 docs/adr/ADR-0007-runtime-api-boundaries.md   |   89 ++
 .../ADR-0008-tensor-deploy-and-allocation.md  |  100 ++
 .../ADR-0009-kernel-execution-messaging.md    |   74 ++
 docs/adr/ADR-0010-cli-device-selection.md     |   62 +
 ...R-0011-memory-addressing-simplification.md |   65 +
 docs/adr/ADR-0012-host-io-message-schema.md   |  232 ++++
 docs/adr/ADR-0013-verification_strategy.md    |  139 ++
 .../ADR-0014-pe-internal-execution-model.md   |  364 +++++
 .../adr/ADR-0015-component-port-wire-model.md |  178 +++
 docs/di-presentation.md                       |  363 +++++
 docs/diagrams/README.md                       |   26 +
 docs/diagrams/cube_view.svg                   |  156 +++
 docs/diagrams/pe_view.svg                     |   31 +
 docs/diagrams/placement_column_wise.svg       |   72 +
 docs/diagrams/placement_replicate.svg         |   47 +
 docs/diagrams/placement_row_wise.svg          |   72 +
 .../diagrams/placement_tiled_column_major.svg |  116 ++
 docs/diagrams/placement_tiled_row_major.svg   |  116 ++
 docs/diagrams/sip_view.svg                    |   95 ++
 docs/diagrams/system_view.svg                 |   19 +
 docs/latency-model.md                         |  381 ++++++
 pyproject.toml                                |   30 +
 scripts/gen_placement_diagrams.py             |  393 ++++++
 src/kernbench/__init__.py                     |    0
 src/kernbench/cli/main.py                     |   64 +
 src/kernbench/cli/probe.py                    |  248 ++++
 src/kernbench/cli/report.py                   |  175 +++
 src/kernbench/common/__init__.py              |    0
 src/kernbench/common/pe_commands.py           |  150 +++
 src/kernbench/common/types.py                 |   29 +
 src/kernbench/components/__init__.py          |    4 +
 src/kernbench/components/base.py              |  167 +++
 src/kernbench/components/context.py           |   52 +
 src/kernbench/components/impls/__init__.py    |   54 +
 src/kernbench/components/impls/forwarding.py  |   27 +
 src/kernbench/components/impls/hbm_ctrl.py    |  101 ++
 src/kernbench/components/impls/io_cpu.py      |  145 ++
 src/kernbench/components/impls/m_cpu.py       |  269 ++++
 src/kernbench/components/impls/noc.py         |  187 +++
 src/kernbench/components/impls/pcie_ep.py     |   27 +
 src/kernbench/components/impls/pe_cpu.py      |  154 +++
 src/kernbench/components/impls/pe_dma.py      |  116 ++
 src/kernbench/components/impls/pe_gemm.py     |   90 ++
 src/kernbench/components/impls/pe_math.py     |   54 +
 .../components/impls/pe_scheduler.py          |  245 ++++
 src/kernbench/components/impls/pe_tcm.py      |   25 +
 src/kernbench/components/impls/sram.py        |   59 +
 src/kernbench/di/registry.py                  |    0
 src/kernbench/policy/address/allocator.py     |   85 ++
 src/kernbench/policy/address/phyaddr.py       |  184 +++
 src/kernbench/policy/placement/dp.py          |  174 +++
 src/kernbench/policy/routing/router.py        |  184 +++
 src/kernbench/runtime_api/__init__.py         |    0
 src/kernbench/runtime_api/bench_runner.py     |   96 ++
 src/kernbench/runtime_api/context.py          |  282 ++++
 src/kernbench/runtime_api/kernel.py           |  123 ++
 src/kernbench/runtime_api/tensor.py           |  166 +++
 src/kernbench/runtime_api/types.py            |   71 +
 src/kernbench/sim_engine/dummy.py             |   31 +
 src/kernbench/sim_engine/engine.py            |  298 +++++
 src/kernbench/sim_engine/transaction.py       |   49 +
 src/kernbench/topology/__init__.py            |    0
 src/kernbench/topology/builder.py             |  965 ++++++++++++++
 src/kernbench/topology/graph.py               |    0
 .../topology/projections/cube_view.py         |    0
 src/kernbench/topology/projections/pe_view.py |    0
 .../topology/projections/sip_view.py          |    0
 src/kernbench/topology/types.py               |   56 +
 src/kernbench/topology/visualizer.py          |  367 +++++
 src/kernbench/triton_emu/__init__.py          |   11 +
 src/kernbench/triton_emu/registry.py          |   30 +
 src/kernbench/triton_emu/tl_context.py        |  356 +++++
 tests/test_cli.py                             |   22 +
 tests/test_component_registry.py              |  187 +++
 tests/test_engine.py                          |  405 ++++++
 tests/test_pe_components.py                   | 1175 +++++++++++++++++
 tests/test_phase_a_components.py              |  269 ++++
 tests/test_phyaddr.py                         |  268 ++++
 tests/test_probe.py                           |  221 ++++
 tests/test_routing.py                         |  226 ++++
 tests/test_tensor.py                          |  282 ++++
 tests/test_topology_compile.py                |  409 ++++++
 tests/test_topology_load.py                   |   60 +
 tests/test_topology_visualize.py              |   81 ++
 tests/test_triton_emu.py                      |  349 +++++
 topology.yaml                                 |  126 ++
 109 files changed, 14909 insertions(+)
 create mode 100644 .claude/settings.json
 create mode 100644 .claude/settings.local.json
 create mode 100644 .editorconfig
 create mode 100644 .gitignore
 create mode 100644 .pylintrc
 create mode 100644 .vscode/extensions.json
 create mode 100644 .vscode/launch.json
 create mode 100644 .vscode/settings.json
 create mode 100644 .vscode/tasks.json
 create mode 100644 CLAUDE.md
 create mode 100644 README.md
 create mode 100644 SPEC.md
 create mode 100644 benches/__init__.py
 create mode 100644 benches/ipcq_allreduce.py
 create mode 100644 benches/loader.py
 create mode 100644 benches/qkv_gemm.py
 create mode 100644 benches/qkv_gemm_multi_pe.py
 create mode 100644 docs/adr/ADR-0001-physaddr-layout.md
 create mode 100644 docs/adr/ADR-0002-routing-distance.md
 create mode 100644 docs/adr/ADR-0003-target-system-hierarchy.md
 create mode 100644 docs/adr/ADR-0004-memory-semantics-local-hbm.md
 create mode 100644 docs/adr/ADR-0005-diagram-views-distance-layout.md
 create mode 100644 docs/adr/ADR-0006-topology-compilation-distance-diagram.md
 create mode 100644 docs/adr/ADR-0007-runtime-api-boundaries.md
 create mode 100644 docs/adr/ADR-0008-tensor-deploy-and-allocation.md
 create mode 100644 docs/adr/ADR-0009-kernel-execution-messaging.md
 create mode 100644 docs/adr/ADR-0010-cli-device-selection.md
 create mode 100644 docs/adr/ADR-0011-memory-addressing-simplification.md
 create mode 100644 docs/adr/ADR-0012-host-io-message-schema.md
 create mode 100644 docs/adr/ADR-0013-verification_strategy.md
 create mode 100644 docs/adr/ADR-0014-pe-internal-execution-model.md
 create mode 100644 docs/adr/ADR-0015-component-port-wire-model.md
 create mode 100644 docs/di-presentation.md
 create mode 100644 docs/diagrams/README.md
 create mode 100644 docs/diagrams/cube_view.svg
 create mode 100644 docs/diagrams/pe_view.svg
 create mode 100644 docs/diagrams/placement_column_wise.svg
 create mode 100644 docs/diagrams/placement_replicate.svg
 create mode 100644 docs/diagrams/placement_row_wise.svg
 create mode 100644 docs/diagrams/placement_tiled_column_major.svg
 create mode 100644 docs/diagrams/placement_tiled_row_major.svg
 create mode 100644 docs/diagrams/sip_view.svg
 create mode 100644 docs/diagrams/system_view.svg
 create mode 100644 docs/latency-model.md
 create mode 100644 pyproject.toml
 create mode 100644 scripts/gen_placement_diagrams.py
 create mode 100644 src/kernbench/__init__.py
 create mode 100644 src/kernbench/cli/main.py
 create mode 100644 src/kernbench/cli/probe.py
 create mode 100644 src/kernbench/cli/report.py
 create mode 100644 src/kernbench/common/__init__.py
 create mode 100644 src/kernbench/common/pe_commands.py
 create mode 100644 src/kernbench/common/types.py
 create mode 100644 src/kernbench/components/__init__.py
 create mode 100644 src/kernbench/components/base.py
 create mode 100644 src/kernbench/components/context.py
 create mode 100644 src/kernbench/components/impls/__init__.py
 create mode 100644 src/kernbench/components/impls/forwarding.py
 create mode 100644 src/kernbench/components/impls/hbm_ctrl.py
 create mode 100644 src/kernbench/components/impls/io_cpu.py
 create mode 100644 src/kernbench/components/impls/m_cpu.py
 create mode 100644 src/kernbench/components/impls/noc.py
 create mode 100644 src/kernbench/components/impls/pcie_ep.py
 create mode 100644 src/kernbench/components/impls/pe_cpu.py
 create mode 100644 src/kernbench/components/impls/pe_dma.py
 create mode 100644 src/kernbench/components/impls/pe_gemm.py
 create mode 100644 src/kernbench/components/impls/pe_math.py
 create mode 100644 src/kernbench/components/impls/pe_scheduler.py
 create mode 100644 src/kernbench/components/impls/pe_tcm.py
 create mode 100644 src/kernbench/components/impls/sram.py
 create mode 100644 src/kernbench/di/registry.py
 create mode 100644 src/kernbench/policy/address/allocator.py
 create mode 100644 src/kernbench/policy/address/phyaddr.py
 create mode 100644 src/kernbench/policy/placement/dp.py
 create mode 100644 src/kernbench/policy/routing/router.py
 create mode 100644 src/kernbench/runtime_api/__init__.py
 create mode 100644 src/kernbench/runtime_api/bench_runner.py
 create mode 100644 src/kernbench/runtime_api/context.py
 create mode 100644 src/kernbench/runtime_api/kernel.py
 create mode 100644 src/kernbench/runtime_api/tensor.py
 create mode 100644 src/kernbench/runtime_api/types.py
 create mode 100644 src/kernbench/sim_engine/dummy.py
 create mode 100644 src/kernbench/sim_engine/engine.py
 create mode 100644 src/kernbench/sim_engine/transaction.py
 create mode 100644 src/kernbench/topology/__init__.py
 create mode 100644 src/kernbench/topology/builder.py
 create mode 100644 src/kernbench/topology/graph.py
 create mode 100644 src/kernbench/topology/projections/cube_view.py
 create mode 100644 src/kernbench/topology/projections/pe_view.py
 create mode 100644 src/kernbench/topology/projections/sip_view.py
 create mode 100644 src/kernbench/topology/types.py
 create mode 100644 src/kernbench/topology/visualizer.py
 create mode 100644 src/kernbench/triton_emu/__init__.py
 create mode 100644 src/kernbench/triton_emu/registry.py
 create mode 100644 src/kernbench/triton_emu/tl_context.py
 create mode 100644 tests/test_cli.py
 create mode 100644 tests/test_component_registry.py
 create mode 100644 tests/test_engine.py
 create mode 100644 tests/test_pe_components.py
 create mode 100644 tests/test_phase_a_components.py
 create mode 100644 tests/test_phyaddr.py
 create mode 100644 tests/test_probe.py
 create mode 100644 tests/test_routing.py
 create mode 100644 tests/test_tensor.py
 create mode 100644 tests/test_topology_compile.py
 create mode 100644 tests/test_topology_load.py
 create mode 100644 tests/test_topology_visualize.py
 create mode 100644 tests/test_triton_emu.py
 create mode 100644 topology.yaml

diff --git a/.claude/settings.json b/.claude/settings.json
new file mode 100644
index 0000000..bb31422
--- /dev/null
+++ b/.claude/settings.json
@@ -0,0 +1,15 @@
+{
+  "permissions": {
+    "allow": [
+      "Bash(python -c \":*)",
+      "Bash(xargs ls -la)",
+      "Bash(wc -l /c/Users/ywkang/kernbench/src/kernbench/policy/address/phyaddr.py /c/Users/ywkang/kernbench/src/kernbench/runtime_api/*.py)",
+      "Bash(python scripts/gen_placement_diagrams.py)",
+      "Bash(python -c:*)",
+      "Bash(python -m kernbench.cli.main probe --topology topology.yaml)",
+      "Bash(xargs grep -l \"class.*ComponentBase\\\\|class.*DefaultComponent\")",
+      "Bash(python -m pytest tests/test_probe.py -v)",
+      "Bash(python -m pytest tests/test_probe.py tests/test_component_registry.py -v)"
+    ]
+  }
+}
diff --git a/.claude/settings.local.json b/.claude/settings.local.json
new file mode 100644
index 0000000..fbd7d37
--- /dev/null
+++ b/.claude/settings.local.json
@@ -0,0 +1,36 @@
+{
+  "permissions": {
+    "allow": [
+      "Bash(python -m ruff check --select I --fix src/kernbench/runtime_api/context.py src/kernbench/runtime_api/bench_runner.py src/kernbench/cli/main.py)",
+      "Bash(python -m pytest tests/ -q)",
+      "Bash(python -m pytest tests/ -v)",
+      "Bash(python -m pytest tests/test_topology_load.py tests/test_cli.py -v)",
+      "Bash(python -c \":*)",
+      "Bash(python -m pytest tests/ -q --tb=no)",
+      "Bash(python -m pytest tests/ -v --tb=short)",
+      "Bash(python -m pytest tests/test_component_registry.py -v --tb=line)",
+      "Bash(python -m pytest tests/ --ignore=tests/test_component_registry.py -q --tb=no)",
+      "Bash(python -m pytest tests/ -q --tb=short)",
+      "Bash(python -m pytest --tb=short -q)",
+      "Bash(python -m pytest tests/test_phase_a_components.py -v)",
+      "Bash(python -m pytest --tb=short)",
+      "Bash(python -m pytest tests/ -x -q)",
+      "Bash(python -m pytest tests/test_probe.py::test_h2d_latency_monotonic -x -q)",
+      "Bash(python -m pytest tests/ -x --tb=short)",
+      "Bash(python -m pytest tests/ --tb=line)",
+      "Bash(python -m pytest tests/ --tb=short)",
+      "Bash(python -m kernbench probe --topology topology.yaml)",
+      "Bash(python -m pytest tests/ --tb=short -q)",
+      "Bash(python -m pytest tests/test_probe.py tests/test_component_registry.py tests/test_topology_compile.py -v)",
+      "Bash(python -m pytest tests/test_tensor.py tests/test_engine.py tests/test_probe.py tests/test_component_registry.py -v)",
+      "Bash(grep -l \"class.*CPU\" \"/c/Users/ywkang/kernbench/src/kernbench/components/impls\"/*.py)",
+      "Bash(grep -n \"^class \" \"/c/Users/ywkang/kernbench/src/kernbench/components/impls\"/*.py)",
+      "Bash(python -m pytest tests/test_engine.py tests/test_probe.py tests/test_component_registry.py -v)",
+      "Bash(grep -E \"\\\\.\\(py|md\\)$\")",
+      "Bash(python -m pytest tests/test_pe_components.py -v)",
+      "Bash(python -m pytest tests/test_triton_emu.py -v)",
+      "Bash(python -m pytest tests/test_pe_components.py tests/test_triton_emu.py -v)",
+      "Bash(python -m pytest tests/test_pe_components.py::test_mcpu_multi_pe_kernel_launch tests/test_pe_components.py::test_qkv_gemm_bench_multi_pe_completes -v)"
+    ]
+  }
+}
diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 0000000..6646a90
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,12 @@
+root = true
+
+[*]
+charset = utf-8
+end_of_line = lf
+insert_final_newline = true
+trim_trailing_whitespace = true
+indent_style = space
+indent_size = 4
+
+[*.md]
+trim_trailing_whitespace = false
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..8599830
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,28 @@
+# OS / Editor
+.DS_Store
+.vscode/.history/
+*.swp
+
+# Python
+__pycache__/
+*.py[cod]
+*.pyd
+.pytest_cache/
+.mypy_cache/
+.ruff_cache/
+
+# Virtualenv
+.venv/
+
+# Packaging
+dist/
+build/
+*.egg-info/
+
+# Env
+.env
+.env.*
+!.env.example
+
+# Logs
+*.log
diff --git a/.pylintrc b/.pylintrc
new file mode 100644
index 0000000..b936b99
--- /dev/null
+++ b/.pylintrc
@@ -0,0 +1,3 @@
+[MESSAGES CONTROL]
+disable=missing-function-docstring,
+        missing-module-docstring
diff --git a/.vscode/extensions.json b/.vscode/extensions.json
new file mode 100644
index 0000000..e738d79
--- /dev/null
+++ b/.vscode/extensions.json
@@ -0,0 +1,16 @@
+{
+  "recommendations": [
+    "ms-python.python",
+    "ms-python.vscode-pylance",
+    "charliermarsh.ruff",
+    "ms-python.debugpy",
+    "eamodio.gitlens",
+    "usernamehw.errorlens",
+    "wayou.vscode-todo-highlight",
+    "christian-kohler.path-intellisense",
+    "editorconfig.editorconfig",
+    "ms-azuretools.vscode-docker",
+    "humao.rest-client",
+    "shd101wyy.markdown-preview-enhanced"
+  ]
+}
\ No newline at end of file
diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 0000000..3147e32
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,55 @@
+{
+  "version": "0.2.0",
+  "configurations": [
+    {
+      "name": "Python: Current File",
+      "type": "python",
+      "request": "launch",
+      "program": "${file}",
+      "console": "integratedTerminal",
+      "justMyCode": true
+    },
+    {
+      "name": "Run kernbench CLI",
+      "type": "python",
+      "request": "launch",
+      "module": "kernbench.cli.main",
+      "console": "integratedTerminal",
+      "justMyCode": true,
+      "args": [
+        "run",
+        "--topology", "topology.yaml",
+        "--bench", "qkv_gemm"
+      ],
+      "env": {
+        "PYTHONPATH": "${workspaceFolder}/src"
+      }
+    },
+    {
+      "name": "Run KernBench Probe",
+      "type": "python",
+      "request": "launch",
+      "module": "kernbench.cli.main",
+      "console": "integratedTerminal",
+      "justMyCode": true,
+      "args": [
+        "probe",
+        "--topology", "topology.yaml"
+      ],
+      "env": {
+        "PYTHONPATH": "${workspaceFolder}/src"
+      }
+    },
+    {
+      "name": "Pytest: Run All (terminal)",
+      "type": "python",
+      "request": "launch",
+      "module": "pytest",
+      "args": [
+        "-q"
+      ],
+      "console": "integratedTerminal",
+      "justMyCode": true
+    }
+  ]
+}
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..70ca63f
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,39 @@
+{
+  "editor.formatOnSave": true,
+  "editor.formatOnSaveMode": "file",
+  "editor.tabSize": 4,
+  "editor.insertSpaces": true,
+  "editor.rulers": [
+    100
+  ],
+  "editor.minimap.enabled": false,
+  "files.eol": "\n",
+  "files.trimTrailingWhitespace": true,
+  "files.insertFinalNewline": true,
+  "search.useIgnoreFiles": true,
+  "git.autofetch": true,
+  "python.venvFolders": [".venv"],
+  "python.testing.pytestEnabled": true,
+  "python.testing.unittestEnabled": false,
+  "python.testing.autoTestDiscoverOnSaveEnabled": true,
+  "python.testing.pytestArgs": [
+    "tests", "-q", "-vv", "-s", "--tb=short", "--no-header"
+  ],
+  "python.analysis.typeCheckingMode": "basic",
+  "python.analysis.autoImportCompletions": true,
+  "python.terminal.activateEnvironment": true,
+  "ruff.lint.enable": true,
+  "ruff.format.enable": true,
+  "editor.defaultFormatter": "charliermarsh.ruff",
+  "[python]": {
+    "editor.defaultFormatter": "charliermarsh.ruff"
+  },
+  "editor.codeActionsOnSave": {
+    "source.fixAll": "explicit",
+    "source.organizeImports": "explicit"
+  },
+  "python.analysis.extraPaths": [
+    "${workspaceFolder}/src"
+  ]
+
+}
diff --git a/.vscode/tasks.json b/.vscode/tasks.json
new file mode 100644
index 0000000..9f3a1e2
--- /dev/null
+++ b/.vscode/tasks.json
@@ -0,0 +1,127 @@
+{
+  "version": "2.0.0",
+  "tasks": [
+
+    {
+      "label": "venv: create",
+      "type": "shell",
+      "command": "python -m venv .venv",
+      "problemMatcher": []
+    },
+
+    {
+      "label": "deps: install",
+      "type": "shell",
+      "command": "${command:python.interpreterPath}",
+      "args": [
+        "-m",
+        "pip",
+        "install",
+        "-U",
+        "pip",
+        "&&",
+        "${command:python.interpreterPath}",
+        "-m",
+        "pip",
+        "install",
+        "-e",
+        ".[dev]"
+      ],
+      "problemMatcher": [],
+      "dependsOn": "venv: create"
+    },
+
+    {
+      "label": "Run KernBench CLI",
+      "type": "shell",
+      "command": "${command:python.interpreterPath}",
+      "args": [
+        "-m",
+        "kernbench.cli.main",
+        "run",
+        "--topology",
+        "topology.yaml",
+        "--bench",
+        "qkv_gemm"
+      ],
+      "options": { "cwd": "${workspaceFolder}" },
+      "problemMatcher": [],
+      "presentation": {
+        "reveal": "silent",
+        "panel": "shared",
+        "clear": true
+      }
+    },
+
+    {
+      "label": "Run KernBench Probe",
+      "type": "shell",
+      "command": "${command:python.interpreterPath}",
+      "args": [
+        "-m",
+        "kernbench.cli.main",
+        "probe",
+        "--topology",
+        "topology.yaml"
+      ],
+      "options": { "cwd": "${workspaceFolder}" },
+      "problemMatcher": [],
+      "presentation": {
+        "reveal": "silent",
+        "panel": "shared",
+        "clear": true
+      }
+    },
+    {
+      "label": "Pytest: Run All (terminal)",
+      "type": "shell",
+      "command": "${command:python.interpreterPath}",
+      "args": [
+        "-m",
+        "pytest",
+        "-q",
+        "-s"
+      ],
+      "problemMatcher": [],
+      "group": "test"
+    },
+
+    {
+      "label": "lint",
+      "type": "shell",
+      "command": "${command:python.interpreterPath}",
+      "args": [
+        "-m",
+        "ruff",
+        "check",
+        "."
+      ],
+      "problemMatcher": []
+    },
+
+    {
+      "label": "format",
+      "type": "shell",
+      "command": "${command:python.interpreterPath}",
+      "args": [
+        "-m",
+        "ruff",
+        "format",
+        "."
+      ],
+      "problemMatcher": []
+    },
+
+    {
+      "label": "which python",
+      "type": "shell",
+      "command": "${command:python.interpreterPath}",
+      "args": [
+        "-c",
+        "import sys;print(sys.executable)"
+      ],
+      "problemMatcher": []
+    }
+
+  ]
+}
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..e4787e5
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,196 @@
+# Claude Code Instructions (Repo)
+
+This repository uses Claude Code with strict architectural and verification rules.
+SPEC.md and ADRs are the source of truth.
+
+---
+
+## Terminology
+
+- runtime API:
+  Host-facing public API used by benchmarks and user code (e.g., tensor deployment, kernel launch).
+- simulation engine (sim_engine):
+  Discrete-event engine responsible for request injection, scheduling, and completion tracking.
+- components:
+  Device-side nodes modeling hardware behavior (IO_CPU, M_CPU, PE_CPU, routers, engines, etc.).
+
+## Authority & Scope
+
+- SPEC.md defines the architectural contract.
+- ADRs (docs/adr/ADR-*.md) define non-trivial architectural decisions.
+- If a change conflicts with SPEC.md or an ADR:
+  - STOP.
+  - Explain the conflict.
+  - Propose options (keep spec, update ADR, or narrow scope).
+- Do NOT silently change architecture.
+- The repository structure reflects architectural intent; Claude Code MUST respect existing module boundaries and file locations.
+
+---
+
+## Design Questions
+
+- Design / architecture questions are ALWAYS allowed.
+- Design questions MUST NOT modify:
+  - production code
+  - test code
+  - SPEC.md
+  - ADRs
+- If a design question implies a change, default to Phase 1.
+
+---
+
+## Change & Test Protocol (Mandatory)
+
+All non-trivial changes MUST follow a two-phase process.
+Design discussion is always allowed; production code changes require Phase 2 approval.
+
+---
+
+### Phase 1 — Proposal + Verification
+
+(No Production Code Changes)
+
+#### Purpose
+
+- Decide *what* to change and *how it will be validated*
+- Establish verification coverage BEFORE touching production code
+
+#### Phase 1 MUST include
+
+1) **Design Proposal**
+
+- Explain the design change.
+- Explain why the change is needed.
+- Explain consistency with SPEC.md and relevant ADRs.
+
+2) **Verification Plan**
+
+- SPEC requirement(s) / ADR(s) affected (e.g., R1/R2/R5, ADR-0002).
+- Tests that validate the change:
+  - existing tests to run, and/or
+  - new tests to add.
+- Concrete input cases used by the tests:
+  - topology (SIP / CUBE / PE layout)
+  - request parameters (src, dst, size_bytes).
+- Expected observable assertions, such as:
+  - hop trace contains key waypoints,
+  - latency invariants (e.g., > 0, monotonic increase),
+  - deterministic route selection,
+  - **expected changes (or no changes) in generated diagrams**, if applicable.
+
+If the Verification Plan is missing or vague, STOP.
+
+#### Allowed in Phase 1
+
+- Creating or modifying **test code only**
+- Running tests and reporting results
+
+#### Forbidden in Phase 1
+
+- Any production code changes
+- Any SPEC.md or ADR modifications
+- Any production diff output
+
+#### Phase 1 Output
+
+- Proposal + Verification Plan
+- Tests added/modified (if any)
+- Test execution results (PASS / FAIL)
+- Clear recommendation:
+  - "No Phase 2 needed" OR
+  - "Await approval for Phase 2"
+
+---
+
+### Phase 2 — Apply + Verify + Rollback
+
+#### Trigger
+
+Phase 2 is triggered ONLY by the exact user approval phrase:
+
+**"ok"**
+
+#### Phase 2 Rules
+
+- Output **minimal unified diffs only**
+- Modify ONLY production files declared in Phase 1
+- Do NOT include explanations, comments, or unchanged code
+- Automatically apply the diff to the working tree
+
+#### Mandatory Verification
+
+- Run the tests defined in the Phase 1 Verification Plan
+
+#### Success Path
+
+If ALL tests PASS:
+
+- Keep the applied changes
+- Ensure generated diagrams (if affected) are consistent
+- Report success concisely
+
+#### Failure Path (Mandatory)
+
+If ANY test FAILS:
+
+- Immediately rollback ALL Phase 2 changes
+- Do NOT keep partial changes
+- Report:
+  - failing test names
+  - error messages / assertions
+  - brief hypothesis of the root cause
+- Return to Phase 1 state
+
+Tests must NEVER be weakened, removed, or altered to force Phase 2 to pass.
+
+---
+
+## What Counts as "Non-Trivial"
+
+(Protocol Required)
+
+Any of the following:
+
+- routing policy or ordering changes
+- topology builder changes (nodes, links, parameters)
+- address decoding / PhysAddr behavior
+- latency composition rules
+- changes affecting determinism or connectivity
+- changes touching two or more production files
+
+---
+
+## Allowed Exceptions
+
+(Protocol Still Required)
+
+- comments or docstrings
+- formatting-only changes
+- type annotation changes with no runtime behavior change
+
+In exceptions, Phase 1 MUST explicitly state:
+**"No behavior change; tests unchanged."**
+
+---
+
+## CLI Semantics
+
+- `kernbench run --device <id>` runs the benchmark on a single device.
+- Omitting `--device` runs the benchmark on all devices discovered in the topology (logically parallel).
+- Device enumeration is handled by the CLI only; benchmarks MUST remain single-device.
+
+## Derived Artifacts (Clarification)
+
+- Generated diagrams under `docs/diagrams/` are **derived artifacts**, not production code.
+- Creating or updating files in `docs/diagrams/`:
+  - does NOT count as a production code change,
+  - does NOT require Phase 2 approval,
+  - MUST be consistent with SPEC.md and ADRs.
+
+## Enforcement Defaults
+
+- If unsure whether a change is non-trivial → treat it as non-trivial.
+- If unsure whether Phase 2 is allowed → STOP and ask.
+- SPEC.md and ADRs are the final authority.
+- runtime API MUST NOT hardcode topology/routing or internal hop sequences.
+- sim_engine MUST remain independent of runtime API semantics (no tensor/kernel policy logic).
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b276a9f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,13 @@
+# Python Project (VS Code Template)
+
+## Quick start
+1. Create venv + install dev deps (editable):
+   - VS Code: Run Task → `deps: install`
+2. Run tests:
+   - VS Code: Run Task → `Pytest: Run All (terminal)`
+3. Lint / format:
+   - `lint`, `format` tasks
+
+## Structure
+- `src/` app code
+- `tests/` pytest
diff --git a/SPEC.md b/SPEC.md
new file mode 100644
index 0000000..e881bbe
--- /dev/null
+++ b/SPEC.md
@@ -0,0 +1,327 @@
+# KernBench System-Level Simulator — SPEC
+
+This document defines the architectural contract for the KernBench
+system-level discrete-event simulator for our AI Accelerator SIP-based systems.
+All implementations, tests, and changes MUST conform to this SPEC.
+
+---
+
+## 0. Goal
+
+Build a **system-level, discrete-event simulator** to evaluate the performance of
+**LLM kernels running on our AI Accelerator SIP-based systems**, under varying
+**SIP architectures, topologies, and interconnect configurations**.
+
+The simulator models **data-movement and control paths across the full hardware
+hierarchy** and computes **end-to-end execution latency** for kernel executions
+dispatched to Processing Elements (PEs).
+
+Primary objectives:
+
+- compare LLM kernel execution latency under different system configurations
+- model PE↔HBM, PE↔PE, CUBE↔CUBE, and SIP↔SIP communication and control paths
+- guarantee deterministic, verifiable behavior with strong debuggability
+- support visual inspection of the modeled system at multiple abstraction levels
+
+---
+
+## 0.1 Golden Invariants (Must NOT be violated)
+
+- End-to-end latency is computed **strictly by explicit traversal** over modeled
+  components and links.
+- Every routed request MUST incur **latency > 0**.
+- Routing decisions MUST be **deterministic** given
+  (topology + routing policy + request).
+- All valid request flows MUST have explicit connectivity in the model.
+- No hidden shortcuts, implicit bypasses, or magic paths are allowed.
+- Architectural decisions documented in ADRs override local optimizations.
+
+---
+
+## 0.2 Architectural References (ADRs)
+
+Major architectural decisions are documented in ADRs and referenced by number.
+
+- ADR-0001: PhysAddr layout & address decoding contract
+- ADR-0002: Routing distance, ordering, and bypass rules
+- ADR-0003: Target system hierarchy & modeling scope (Tray / SIP / CUBE / PE / IO chiplet)
+- ADR-0004: Memory semantics & local-HBM bandwidth guarantee contract
+- ADR-0005: Diagram views (SIP / CUBE / PE) and distance-aware layout rules
+- ADR-0006: Topology compilation, distance extraction, and automatic diagram generation
+- ADR-0007: runtime_api vs sim_engine responsibility boundaries
+- ADR-0008: Tensor deployment and allocation (Host allocator, PA-first)
+- ADR-0009: Kernel execution fan-out and completion semantics
+- ADR-0010: CLI device selection and multi-device execution semantics
+- ADR-0011: Memory addressing simplification (PA-first)
+- ADR-0012: Host ↔ IO_CPU message schema (PA-first, PE-tagged shards)
+- ADR-0013: Verification strategy and Phase 1 test plan
+
+SPEC MUST remain consistent with accepted ADRs.
+
+---
+
+## 1. Core Requirements
+
+### R1. Correct Routing and Control Path
+
+- A request MUST traverse the correct sequence of components based on:
+  - source location,
+  - destination address or placement tags,
+  - routing policy and available topology connectivity.
+- Local vs remote traffic MUST be distinguishable:
+  - same SIP vs different SIP,
+  - same CUBE vs different CUBE,
+  - (optional) same PE-group vs cross PE-group.
+- Routing behavior MUST be reproducible and deterministic.
+
+---
+
+### R2. Latency is Computed by Traversal
+
+End-to-end latency is the sum of:
+
+- per-node fixed latency (processing / router delay),
+- per-link latency (fixed and/or size-aware serialization: bytes / BW),
+- per-service latency (e.g., memory controller service time).
+
+The simulator MUST:
+
+- support both fixed and size-aware latency,
+- emit hop-by-hop traces with timestamps and component identifiers.
+
+---
+
+### R3. Topology is Configurable and Variable
+
+Topology MUST NOT be hardcoded.
+
+The simulator MUST accept multiple topologies (YAML / JSON / dict), varying:
+
+- SIP count,
+- CUBE count per SIP,
+- PE count per CUBE,
+- on-chip fabric structure (e.g., mesh / NoC / XBAR),
+- IO chiplets and interconnects,
+- link bandwidth, latency, and capacity parameters.
+
+Given a topology:
+
+- all required request flows MUST have valid connectivity,
+- missing links are a topology construction error, not a routing error.
+
+---
+
+### R4. DI-First Component Design (Swappable Implementations)
+
+All components MUST be replaceable behind stable interfaces, including:
+
+- routers and fabrics (NoC, bridges, switches),
+- XBAR-like selectors,
+- DMA engines and queues,
+- memory controllers and services (HBM, TCM, queues),
+- management and control processors (modeled components).
+
+The simulator MUST:
+
+- use dependency injection (DI) to bind node specifications to implementation classes,
+- allow component swapping without changing test logic,
+- avoid leaking routing or policy logic into unrelated components.
+
+---
+
+### R5. Multi-Domain Communication Modeling
+
+The simulator MUST model communication across hierarchical domains, including:
+
+- PE ↔ local HBM
+- PE ↔ remote HBM in the same CUBE
+- PE ↔ remote HBM in other CUBEs within the same SIP
+- PE ↔ remote HBM in other SIPs
+- PE ↔ PE messaging (e.g., IPCQ)
+- PE ↔ IO chiplets
+- CUBE ↔ CUBE (e.g., via UCIe)
+- SIP ↔ SIP (e.g., via PCIe or UAL)
+
+Policy-based bypass is allowed ONLY if:
+
+- the bypass path is explicitly represented in the model,
+- the bypass incurs non-zero latency,
+- the bypass is visible in traces and diagrams.
+
+---
+
+### R6. Verification-Driven Development
+
+Development MUST follow a verification-driven workflow:
+
+- behavior is validated by tests with meaningful input cases,
+- tests encode SPEC-defined invariants, not incidental implementation details,
+- changes without clear verification coverage are not allowed.
+
+---
+
+### R7. Runtime API
+
+The simulator MUST provide a host-facing runtime API that:
+
+- exposes tensor deployment and kernel execution operations,
+- submits requests only to endpoint components (e.g., IO_CPU),
+- owns host-side tensor handles and allocation metadata as PA shard maps,
+- remains topology-agnostic and does not perform routing or fan-out.
+
+Tensor deployment in Phase 0 produces **device physical-address (PA) shard mappings**.
+Each shard explicitly identifies its target `(sip, cube, pe)` and PA range.
+No separate host-visible allocation RPC (e.g., AllocateTensorMeta) exists.
+
+---
+
+### R8. Simulation Engine
+
+The simulator MUST include a discrete-event simulation engine that:
+
+- injects requests into the system graph,
+- schedules events deterministically,
+- tracks completion via correlation identifiers,
+- decomposes runtime API operations into explicit graph requests
+  (e.g., MemoryWrite, MemoryRead, KernelLaunch).
+
+---
+
+### R9. CLI Execution Semantics
+
+The CLI MUST support executing benchmarks:
+
+- on a specified device.
+
+Benchmarks are executed once per invocation within a single simulation instance.
+If multiple devices are present in the topology, a benchmark MAY interact with
+multiple devices internally, but the CLI does not launch multiple independent
+benchmark instances by default.
+
+---
+
+### R10. Memory Addressing (Phase 0)
+
+In Phase 0, the simulator uses a **PA-first memory model**:
+
+- All memory operations use device physical addresses (PA) only.
+- Virtual addressing, MMU/IOMMU, and address translation latency are out of scope.
+- Tensor placement is represented as a list of PA shards, each explicitly tagged
+  with `(sip, cube, pe)`.
+
+All memory access latency MUST be modeled explicitly via graph traversal.
+No implicit translation or hidden latency is allowed.
+
+---
+
+## 2. Model Concepts
+
+### 2.1 Graph Execution Model
+
+- Nodes represent modeled components (PE blocks, XBAR, NoC, bridges,
+  HBM controllers, IO components, etc.).
+- Directed edges represent interconnect links with latency and bandwidth attributes.
+- Execution model:
+  - a node receives a request,
+  - incurs node or service latency,
+  - emits the request to the next hop via a link,
+  - repeats until the destination service completes.
+
+---
+
+### 2.2 Routing
+
+Routing MAY be implemented as:
+
+- policy-based routing (code-driven),
+- routing tables (config-driven),
+- topology-driven routing (e.g., mesh XY),
+- or a hybrid approach.
+
+Routing MUST:
+
+- consume decoded address domains or explicit placement tags,
+- operate only on explicit topology connectivity,
+- remain deterministic.
+
+Kernel execution requests reference tensors via PA shard mappings.
+Each shard explicitly identifies its target PE, allowing IO_CPU to
+deterministically fan-out execution without relying on PA decoding.
+
+---
+
+## 3. Inputs and Identity
+
+### 3.1 Node Identity Scheme
+
+Nodes MUST have stable, parsable identifiers sufficient for domain inference
+and trace-based debugging.
+
+Example patterns:
+
+- `tray.host_cpu`
+- `sip{S}.io{I}.pcie_ep`
+- `sip{S}.cube{C}.fabric`
+- `sip{S}.cube{C}.pe{P}`
+- `sip{S}.cube{C}.hbm_ctrl`
+
+---
+
+### 3.2 Link Specifications
+
+A link MAY include:
+
+- fixed latency (ns),
+- bandwidth (GB/s) for serialization latency,
+- optional capacity for contention modeling.
+
+Topology builders MUST ensure:
+
+- required links exist,
+- link parameters are consistent with topology intent.
+
+---
+
+## 4. Output, Debuggability, and Diagrams
+
+The simulator MUST provide:
+
+- per-request hop-by-hop traces with timestamps,
+- clear error messages for missing connectivity
+  (e.g., "no link for A → B"),
+- reproducible, inspectable representations of the modeled system.
+
+Diagrams are **derived artifacts** of the simulator model:
+
+- They MUST be generatable from the **compiled topology** and **distance metadata**
+  used by execution and routing.
+- Generation MAY be performed lazily or cached by the implementation,
+  as long as outputs remain consistent with the compiled topology.
+
+Diagram abstraction levels and distance-aware layout rules are defined in ADR-0005.
+Automatic diagram generation and output conventions are defined in ADR-0006.
+
+By default, generated diagrams are written under:
+
+- `docs/diagrams/`
+
+---
+
+## 5. Non-Goals (for now)
+
+The following are explicitly out of scope:
+
+- cycle-accurate microarchitecture modeling,
+- detailed cache coherence protocols,
+- full PCIe / CXL protocol correctness.
+
+These MAY be layered later via additional components and policies.
+
+---
+
+## 6. Decision Boundaries
+
+- SPEC.md defines architectural intent and invariants.
+- Code implements SPEC and MUST NOT introduce hidden invariants.
+- Tests validate SPEC-defined behavior and MUST NOT encode fixed topology assumptions.
+- ADRs record non-trivial architectural decisions and MUST be referenced when relevant.
diff --git a/benches/__init__.py b/benches/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/benches/ipcq_allreduce.py b/benches/ipcq_allreduce.py
new file mode 100644
index 0000000..99e5217
--- /dev/null
+++ b/benches/ipcq_allreduce.py
@@ -0,0 +1,2 @@
+def run(ctx):  # Placeholder bench: only prints a banner; ctx is not used.
+    print("IPCQ all reduce kernel bench")
diff --git a/benches/loader.py b/benches/loader.py
new file mode 100644
index 0000000..e78e1a2
--- /dev/null
+++ b/benches/loader.py
@@ -0,0 +1,37 @@
+from __future__ import annotations
+
+import importlib
+from collections.abc import Callable
+from typing import Any
+
+from kernbench.runtime_api.context import RuntimeContext
+
+BenchFn = Callable[[RuntimeContext], Any]
+
+
+def resolve_bench(bench_id: str) -> BenchFn:
+    """
+    Resolve a bench id into the callable ``run`` of module ``benches.<bench_id>``.
+
+    Expected layout (repo root): benches/<bench_id>.py defining run(ctx: RuntimeContext).
+    Surrounding whitespace in bench_id is stripped before the lookup.
+    Raises ValueError for an empty id, a module that is not found, or a bad ``run``.
+    """
+    bench_id = bench_id.strip()
+    if not bench_id:
+        raise ValueError("Bench id is empty.")
+
+    module_path = f"benches.{bench_id}"
+    # NOTE(review): a missing import inside the bench itself also lands in this except.
+    try:
+        mod = importlib.import_module(module_path)
+    except ModuleNotFoundError as e:
+        raise ValueError(f"Unknown bench '{bench_id}'. Expected module {module_path}.py") from e
+
+    run_fn = getattr(mod, "run", None)
+    if run_fn is None:
+        raise ValueError(f"Bench module {module_path} must define a 'run(ctx)' function.")
+    if not callable(run_fn):
+        raise ValueError(f"'run' in {module_path} is not callable.")
+
+    return run_fn
diff --git a/benches/qkv_gemm.py b/benches/qkv_gemm.py
new file mode 100644
index 0000000..7c92569
--- /dev/null
+++ b/benches/qkv_gemm.py
@@ -0,0 +1,39 @@
+"""QKV GEMM benchmark: Q*K^T projection on a single PE.
+
+Demonstrates the full host-to-PE kernel launch pipeline:
+  Host → PCIE_EP → IO_CPU → M_CPU → NOC → PE_CPU → PE_SCHEDULER → engines
+
+Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait()
+  - Tensor a is loaded into TCM via DMA
+  - Tensor b stays in HBM; PE_SCHEDULER streams it per-tile (32x64x32)
+"""
+from kernbench.policy.placement.dp import DPPolicy
+
+# GEMM dimensions: (M, K) x (K, N) → (M, N)
+M, K, N = 128, 256, 128
+DTYPE = "f16"
+
+
+def _gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"):
+    """QKV GEMM kernel: out = a @ b, with a of shape (M, K) and b of shape (K, N).
+
+    a is loaded into TCM (DMA_READ); b is referenced in HBM (tl.ref, no DMA —
+    scheduler streams per-tile). One composite "gemm" is issued, then awaited.
+    """
+    a = tl.load(a_ptr, shape=(M, K), dtype=DTYPE)
+    b = tl.ref(b_ptr, shape=(K, N), dtype=DTYPE)
+    handle = tl.composite(op="gemm", a=a, b=b, out_ptr=out_ptr)
+    tl.wait(handle)
+
+
+def run(ctx):
+    """Run the QKV GEMM benchmark: allocate a, b, out via ctx, then launch the kernel."""
+    # DP placement: a=replicate (cube-level), b/out=column_wise (N-axis, single PE)
+    a = ctx.zeros((M, K), dtype=DTYPE, dp=DPPolicy(cube="replicate", pe="replicate"), name="a")
+    b = ctx.zeros((K, N), dtype=DTYPE, dp=DPPolicy(cube="replicate", pe="column_wise"), name="b")
+    out = ctx.empty(
+        (M, N), dtype=DTYPE, dp=DPPolicy(cube="replicate", pe="column_wise"), name="out",
+    )
+
+    # Launch GEMM kernel; a, b, out, M, K, N presumably bind to _gemm_kernel's leading params
+    ctx.launch("qkv_gemm", _gemm_kernel, a, b, out, M, K, N)
diff --git a/benches/qkv_gemm_multi_pe.py b/benches/qkv_gemm_multi_pe.py
new file mode 100644
index 0000000..2b7bd87
--- /dev/null
+++ b/benches/qkv_gemm_multi_pe.py
@@ -0,0 +1,39 @@
+"""QKV GEMM benchmark: Q*K^T projection on all PEs in a cube (multi-PE).
+
+Column-parallel GEMM: a is replicated (cube-level), b/out are column-sharded.
+M_CPU fans out KernelLaunchMsg to all 8 PE_CPUs (ADR-0009 D3).
+
+Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait()
+  - Tensor a is loaded into TCM via DMA
+  - Tensor b stays in HBM; PE_SCHEDULER streams it per-tile (32x64x32)
+"""
+from kernbench.policy.placement.dp import DPPolicy
+
+# GEMM dimensions: (M, K) x (K, N) -> (M, N)
+M, K, N = 128, 256, 128
+DTYPE = "f16"
+
+
+def _gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"):
+    """QKV GEMM kernel: out = a @ b, with a of shape (M, K) and b of shape (K, N).
+
+    a is loaded into TCM (DMA_READ); b is referenced in HBM (tl.ref, no DMA --
+    scheduler streams per-tile). One composite "gemm" is issued, then awaited.
+    """
+    a = tl.load(a_ptr, shape=(M, K), dtype=DTYPE)
+    b = tl.ref(b_ptr, shape=(K, N), dtype=DTYPE)
+    handle = tl.composite(op="gemm", a=a, b=b, out_ptr=out_ptr)
+    tl.wait(handle)
+
+
+def run(ctx):
+    """Run the multi-PE QKV GEMM benchmark: allocate a, b, out via ctx, then launch."""
+    # DP placement: a=replicate (cube-level), b/out=column_wise (N-axis split)
+    a = ctx.zeros((M, K), dtype=DTYPE, dp=DPPolicy(cube="replicate", pe="replicate"), name="a")
+    b = ctx.zeros((K, N), dtype=DTYPE, dp=DPPolicy(cube="replicate", pe="column_wise"), name="b")
+    out = ctx.empty(
+        (M, N), dtype=DTYPE, dp=DPPolicy(cube="replicate", pe="column_wise"), name="out",
+    )
+
+    # Launch GEMM kernel; a, b, out, M, K, N presumably bind to _gemm_kernel's leading params
+    ctx.launch("qkv_gemm_multi", _gemm_kernel, a, b, out, M, K, N)
diff --git a/docs/adr/ADR-0001-physaddr-layout.md b/docs/adr/ADR-0001-physaddr-layout.md
new file mode 100644
index 0000000..9158f7a
--- /dev/null
+++ b/docs/adr/ADR-0001-physaddr-layout.md
@@ -0,0 +1,108 @@
+# ADR-0001: PhysAddr Layout & Address Decoding Contract
+
+## Status
+
+Accepted
+
+## Date
+
+2026-02-27
+
+## Context
+
+The KernBench Graph Latency Simulator must route requests deterministically and compute end-to-end latency strictly by graph traversal.
+To model local vs remote traffic (same/different SIP, same/different CUBE, optional PE-group), requests need a stable, parsable address/location scheme that:
+
+- can be decoded into routing domains (SIP/CUBE/HBM/PE-resource, etc.)
+- remains topology-agnostic (no hardcoded counts)
+- supports swappable policy and DI-first components without leaking topology assumptions into node implementations
+
+## Decision
+
+We define a **PhysAddr value object** and an **address decoding contract** that converts an integer address into routing domains.
+
+### D1. PhysAddr is an immutable value object
+
+- PhysAddr is immutable and comparable as a pure value.
+- Any allocator returns a **fully specified PhysAddr** (not partial metadata).
+- No global state may be required to interpret a PhysAddr.
+
+### D2. PhysAddr fields (logical contract)
+
+PhysAddr must be able to represent at least:
+
+- `rack_id` (optional but reserved for scale-out)
+- `sip_id`  (device / SIP domain)
+- `sip_seg` (SIP-level segment/window selection, e.g., cube window)
+- `local_offset` (offset within the chosen segment/window)
+
+Decoded/derived fields may include (optional):
+
+- `cube_id`
+- `kind` (e.g., HBM vs PE-resource vs raw)
+- `unit_type` / `pe_id` (if PE-level addressing is modeled)
+
+**Important:** The exact bit allocation may evolve, but the *semantic fields above* must remain decodable without hidden assumptions.
+
+### D3. Decoding is deterministic and policy-compatible
+
+- Decoding must deterministically map an integer address to:
+  - destination SIP domain (`sip_id`)
+  - destination sub-domain (`cube_id` if applicable)
+  - destination target kind (HBM/PE-resource/other)
+- Decoding must not depend on runtime topology sizes; it may depend on **explicit topology parameters** provided through configuration (e.g., segment size, slice size), and those parameters must live in the topology/config layer (not in random components).
+
+### D4. Topology-derived constants live in the topology layer
+
+Constants such as segment sizes (e.g., HBM slice size / window size) are derived from topology configuration (YAML/JSON/dict) and are provided to the decoder via DI/config.
+They must not be hardcoded in node implementations.
+
+### D5. Routing consumes decoded domains, not raw bits
+
+Routing policy uses decoded domains:
+
+- `src` location (sip/cube/pe or node_id)
+- `dst` domains derived from PhysAddr decoding
+- `size_bytes` for size-aware link latency
+Routing must not inspect raw bit-fields directly except inside the decoding module.
+
+## Alternatives Considered
+
+1) **Use raw integers everywhere, decode ad-hoc in routing**
+
+- Rejected: leads to duplicated logic, inconsistent routing, and hidden assumptions embedded in multiple components.
+
+2) **Hardcode topology sizes (SIP/CUBE/PE counts) into decoding**
+
+- Rejected: violates SPEC (R3) and breaks swappability and configuration-driven topologies.
+
+3) **Put decoding inside memory controllers or routers**
+
+- Rejected: leaks policy into components and undermines DI-first, swappable implementations (SPEC R4).
+
+## Consequences
+
+### Positive
+
+- Deterministic routing domains enable clear test invariants for local vs remote paths (SPEC R1, R5).
+- Keeps topology variability (SPEC R3) while preserving consistent semantics.
+- DI-first: decoder can be swapped or extended without changing components or tests (SPEC R4).
+
+### Tradeoffs / Costs
+
+- Requires explicit configuration for any topology-derived sizes.
+- Introduces a single “blessed” decoding module that must remain stable and well-tested.
+
+## Implementation Notes (Non-normative)
+
+- Recommended module boundary:
+  - `src/kernbench/policy/address/phyaddr.py`
+
+- Tests should cover:
+  - deterministic decoding
+  - local vs remote classification from decoded fields
+  - invariants: “allocator returns full PhysAddr”, “decoding requires no global state”
+
+## Links
+
+- SPEC.md: R1 (routing), R3 (configurable topology), R4 (DI-first), R5 (multi-domain comm)
diff --git a/docs/adr/ADR-0002-routing-distance.md b/docs/adr/ADR-0002-routing-distance.md
new file mode 100644
index 0000000..2c28f41
--- /dev/null
+++ b/docs/adr/ADR-0002-routing-distance.md
@@ -0,0 +1,103 @@
+# ADR-0002: Routing Distance, Ordering & Bypass Rules
+
+## Status
+Accepted
+
+## Date
+2026-02-27
+
+## Context
+The KernBench Graph Latency Simulator must compare kernel execution time
+across different architectures and topologies by computing end-to-end
+latency from graph traversal.
+
+To support meaningful comparison:
+- routing must be deterministic
+- latency must reflect actual interconnect structure
+- local vs remote traffic must be distinguishable
+- “bypass” optimizations must not undermine debuggability or correctness
+
+The simulator also aims to avoid software-managed metadata and hidden
+shortcuts that obscure control paths.
+
+## Decision
+
+### D1. Distance is accumulated latency, not hop count
+- Routing “distance” is defined as the **sum of per-node and per-link latency**.
+- Hop count alone must not be used for ordering or path selection.
+- Size-aware serialization latency (bytes / BW) contributes to distance.
+
+### D2. Routing order is derived from graph traversal
+- The chosen route is the path with minimum accumulated latency
+  given the constructed graph and routing policy.
+- Deterministic ordering must be guaranteed for identical inputs
+  (topology + policy + request).
+
+### D3. Bypass is explicit and graph-represented
+- Any bypass (e.g., local cube HBM access via XBAR instead of NOC) must be:
+  - explicitly represented as a graph path, and
+  - subject to latency accumulation like any other path.
+- Example: PE_DMA has dual egress — one to XBAR (HBM path) and one to NOC (non-HBM path).
+  Both are explicit graph edges; neither is a “bypass” — they are distinct data paths
+  serving different memory domains.
+- Implicit or “magic” bypass paths are disallowed.
+
+### D4. No zero-latency end-to-end paths
+
+- Every routed request must incur **end-to-end** latency > 0.
+- Individual fabric segments (e.g., NOC hops) MAY have distance_mm = 0
+  when the fabric is distributed and distance is not meaningful at that granularity.
+  This is allowed because other components on the same path (e.g., PE_DMA, SRAM,
+  UCIe endpoints) contribute non-zero latency, ensuring the end-to-end invariant holds.
+- Fully zero-latency end-to-end paths are disallowed, except for explicit
+  test-only stubs clearly marked as such.
+
+### D5. Policy vs topology responsibility split
+- Topology builder:
+  - defines nodes and links and their latency/BW parameters
+- Routing policy:
+  - selects among available graph paths based on decoded domains
+- Routing policy must not assume missing links; missing connectivity
+  is a topology construction error.
+
+### D6. No software-managed routing metadata
+- Routing decisions must not rely on per-request software-managed metadata
+  that tracks distance, hop count, or ordering outside the graph model.
+- All distance/order computation is derived from traversal itself.
+
+## Alternatives Considered
+
+1) **Hop-count based routing**
+- Rejected: ignores heterogeneous latency/BW and misrepresents
+  architectural differences.
+
+2) **Implicit local shortcuts**
+- Rejected: breaks debuggability and violates traversal-based latency.
+
+3) **Software-managed distance metadata**
+- Rejected: increases control overhead and obscures routing semantics.
+
+## Consequences
+
+### Positive
+- Clear, debuggable hop-by-hop traces (SPEC R2, §4).
+- Architecture comparisons reflect real interconnect structure.
+- Routing behavior is reproducible and deterministic.
+
+### Tradeoffs / Costs
+- Graph construction must be correct and complete.
+- Bypass modeling requires explicit graph representation,
+  which slightly increases topology description complexity.
+
+## Implementation Notes (Non-normative)
+- Recommended responsibilities:
+  - Graph builder: ensure all required paths exist.
+  - Router: select next hop based on decoded domains and policy.
+- Tests should assert:
+  - non-zero end-to-end latency
+  - deterministic routing for identical inputs
+  - bypass paths appear explicitly in emitted traces
+
+## Links
+- SPEC.md: R1 (routing), R2 (latency), R3 (topology), R5 (multi-domain comm)
+- ADR-0001: PhysAddr layout & decoding contract
diff --git a/docs/adr/ADR-0003-target-system-hierarchy.md b/docs/adr/ADR-0003-target-system-hierarchy.md
new file mode 100644
index 0000000..4a685d8
--- /dev/null
+++ b/docs/adr/ADR-0003-target-system-hierarchy.md
@@ -0,0 +1,64 @@
+# ADR-0003: Target System Hierarchy & Modeling Scope
+
+## Status
+
+Accepted
+
+## Context
+
+We need a system-level simulator to evaluate LLM kernel performance on our AI Accelerator platform.
+The platform is organized as a compute tray containing multiple identical SIPs connected via PCIe or UAL
+through switching fabrics, with a host CPU issuing commands/kernels.
+
+## Decision
+
+We model the system hierarchy explicitly:
+
+### D1. Tray-level
+
+- A compute tray contains:
+  - Host CPU (issues requests / coordinates runtime & data placement)
+  - Multiple identical SIPs (accelerators)
+  - Interconnect fabric between SIPs (PCIe and/or UAL via switches)
+
+### D2. SIP-level
+
+- A SIP is a multi-die package composed of:
+  - Multiple CUBEs (HBM die + compute PEs + UCIe)
+  - One or more IO chiplets (host/SIP interfaces)
+- IO chiplets:
+  - provide interfaces: PCIe-EP, IO_CPU, optionally UAL-EP
+  - can be multiple per SIP
+  - placement constrained to SIP shoreline (top/bottom/left/right); each shoreline may host 1–2 IO chiplets
+
+### D3. CUBE-level
+
+- A CUBE contains:
+  - HBM + memory controller (HBM_CTRL)
+  - XBAR (top/bottom): HBM pseudo-channel crossbar, PE's dedicated path to HBM
+  - Bridge (left/right): connects XBAR.top ↔ XBAR.bottom for cross-half HBM access
+  - NOC: distributed on-die fabric spanning the entire cube (distance modeled as 0);
+    carries non-HBM traffic including inter-cube (UCIe), command (M_CPU↔PE_CPU), and shared SRAM access
+  - Shared SRAM: cube-level shared memory accessible by all PEs via NOC
+  - management/control CPU (M_CPU) coordinating PE command distribution and completion aggregation
+  - multiple PEs
+  - up to 4 UCIe endpoints (N/E/W/S) for CUBE↔CUBE and CUBE↔IO connectivity
+
+### D4. PE-level
+
+- A PE can execute one kernel instance
+- PE contains internal control + accelerators (modeled at PE view granularity):
+  - PE_CPU, command handler, PE_TCM, DMA/GEMM/MATH engines, internal queues
+
+## Consequences
+
+- The simulator supports abstraction by “views”:
+  - SIP view hides PE internals
+  - CUBE view treats each PE as a single block
+  - PE view expands PE internals
+- Topology remains parameterized; sizes/counts/links come from configuration.
+
+## Links
+
+- SPEC R3/R5
+- ADR-0005 (diagram views)
diff --git a/docs/adr/ADR-0004-memory-semantics-local-hbm.md b/docs/adr/ADR-0004-memory-semantics-local-hbm.md
new file mode 100644
index 0000000..ed91e7d
--- /dev/null
+++ b/docs/adr/ADR-0004-memory-semantics-local-hbm.md
@@ -0,0 +1,64 @@
+# ADR-0004: Memory Semantics & Local-HBM Bandwidth Guarantee
+
+## Status
+
+Accepted
+
+## Context
+
+Accurately modeling PE↔HBM behavior is essential for kernel latency estimation.
+Each PE has a notion of “local HBM” that must guarantee full HBM bandwidth, independent of intervening on-die fabric bandwidth.
+
+## Decision
+
+### D1. Local HBM definition
+
+- Each PE is assigned a logically defined “local HBM” region.
+- Local HBM corresponds to the pseudo-channel subset directly attached to that PE’s DMA path
+  via the XBAR (top or bottom, depending on PE corner placement).
+- The path is: PE_DMA → XBAR.top/bottom → HBM_CTRL.
+- The mapping (HBM pseudo-channels → PE local regions) is derived from topology configuration.
+
+### D2. Local HBM bandwidth guarantee contract
+
+- Accesses from a PE to its local HBM MUST guarantee full HBM read/write bandwidth
+  independent of intervening fabric bandwidth limits.
+- This guarantee is modeled by:
+  - a dedicated logical path and/or service model that enforces HBM BW at the PE-local-HBM interaction point,
+  - while still incurring non-zero latency along explicitly modeled components.
+
+### D3. Cross-half HBM semantics
+
+- A PE connected to XBAR.bottom that accesses HBM pseudo-channels on the XBAR.top half
+  (or vice versa) traverses a bridge:
+  - PE_DMA → XBAR.bottom → bridge → XBAR.top → HBM_CTRL
+- Bridge bandwidth may limit cross-half HBM access relative to local-half access.
+
+### D4. Non-local HBM semantics (inter-cube / inter-SIP)
+
+- Accesses from a PE to HBM in a different cube or SIP MAY be limited by:
+  - NOC bandwidth within the cube,
+  - inter-cube UCIe links,
+  - inter-SIP fabric (PCIe/UAL).
+- These paths MUST be explicit and traceable.
+
+### D5. Shared SRAM semantics
+
+- Each CUBE contains a shared SRAM accessible by all PEs in that CUBE.
+- Access path: PE_DMA → NOC → shared SRAM.
+- Shared SRAM bandwidth is limited by the NOC↔SRAM link bandwidth.
+- Shared SRAM is not part of the HBM address space; it is a separate memory domain.
+
+## Verification Notes
+
+Tests should cover:
+
+- local-HBM case: BW matches HBM BW regardless of fabric BW parameter
+- cross-half HBM case: latency includes bridge traversal
+- non-local cases (inter-cube/inter-SIP): BW/latency respond to fabric/link parameters
+- shared SRAM case: access via NOC with correct BW
+
+## Links
+
+- SPEC R2/R5
+- ADR-0002 (distance/order & explicit bypass)
diff --git a/docs/adr/ADR-0005-diagram-views-distance-layout.md b/docs/adr/ADR-0005-diagram-views-distance-layout.md
new file mode 100644
index 0000000..918afbe
--- /dev/null
+++ b/docs/adr/ADR-0005-diagram-views-distance-layout.md
@@ -0,0 +1,186 @@
+# ADR-0005: Diagram Views & Distance-Aware Layout Rules
+
+## Status
+
+Accepted
+
+## Context
+
+We require verifiable and inspectable system modeling for a large-scale,
+parameterized AI Accelerator system.
+
+Humans must be able to:
+
+- visually inspect the modeled topology,
+- reason about communication structure and relative distance,
+- do so at multiple abstraction levels without being overwhelmed by detail.
+
+The simulator models distance (accumulated latency) as a first-class concept.
+Diagrams must reflect this distance by default.
+
+---
+
+## Global Defaults
+
+- All diagrams MUST be **distance-aware by default**.
+- All diagrams MUST render **representative views** of the architecture.
+- Instance indices (e.g., sip0, cube2, pe3) MUST NOT be required for diagram generation.
+- Instance indices MAY be used ONLY:
+  - to define a distance anchor in asymmetric or debugging scenarios, or
+  - when explicitly requested.
+
+---
+
+## Representative Rendering Rule
+
+- All CUBEs share the same internal structure.
+- All PEs share the same internal structure.
+
+Therefore:
+
+- SIP-level diagrams render representative CUBEs and IO chiplets.
+- CUBE-level diagrams render representative PEs as opaque blocks.
+- PE-level diagrams render a representative PE with fully expanded internals.
+
+Diagrams MUST NOT depend on specific SIP, CUBE, or PE indices
+unless explicitly requested.
+
+---
+
+## Diagram Views
+
+### View A — SIP-Level Diagram
+
+**Purpose**
+Explain system-scale structure and connectivity.
+
+**Visible elements**
+
+- SIP boundaries (optional)
+- CUBEs (opaque blocks)
+- IO chiplets (opaque blocks)
+- Optional UCIe stubs only if needed to clarify connectivity
+
+**Hidden elements**
+
+- PE internals
+- CUBE internal fabric
+- IO chiplet internals
+
+**Visible links**
+
+- Host ↔ IO chiplets (PCIe)
+- SIP ↔ SIP (PCIe / UAL via switches)
+- IO ↔ CUBE (on-package links)
+
+---
+
+### View B — CUBE-Level Diagram
+
+**Purpose**
+Explain cube-internal structure and data/control flow.
+
+**Visible elements**
+
+- XBAR (top/bottom): HBM pseudo-channel crossbar
+- Bridge (left/right): cross-half HBM connectors between XBAR.top and XBAR.bottom
+- NOC: distributed on-die fabric for non-HBM traffic
+- HBM subsystem (HBM_CTRL)
+- Shared SRAM: cube-level shared memory
+- Management CPU (M_CPU)
+- PEs as opaque blocks (PE[0..N−1])
+- UCIe endpoints (N/E/W/S) as ports
+
+**Hidden elements**
+
+- PE internals
+
+**Visible links**
+
+- PE → XBAR (HBM data path, top or bottom by corner placement)
+- PE → NOC (non-HBM data path)
+- XBAR ↔ bridge ↔ XBAR (cross-half HBM access)
+- XBAR → HBM_CTRL
+- NOC ↔ UCIe endpoints
+- NOC ↔ shared SRAM
+- M_CPU ↔ NOC (command path)
+- NOC → PE_CPU (command delivery, collapsed into PE block)
+
+---
+
+### View C — PE-Level Diagram
+
+**Purpose**
+Explain internal PE behavior and execution structure.
+
+**Visible elements**
+
+- PE_CPU
+- Command handler / scheduler
+- PE_TCM (local SRAM)
+- HW accelerators (DMA, GEMM, MATH, etc.)
+- Local HBM interface
+- Optional IPCQ / messaging endpoints
+
+**Visible links**
+
+- Control paths (CPU → scheduler → engines)
+- Data paths (engines ↔ TCM, DMA ↔ local HBM)
+- External fabric ports as abstract ports only
+
+---
+
+## Distance-Aware Layout (Default)
+
+### Distance definition
+
+- Distance is defined as **accumulated latency**, consistent with ADR-0002.
+- Distance is computed from a single anchor node.
+
+### Default anchor selection
+
+- SIP view: IO chiplet (or Host CPU if present)
+- CUBE view: a representative PE
+- PE view: PE_CPU or Command Handler
+
+Anchors are **implicit defaults** and MUST NOT be required to be specified.
+
+### Layout rules
+
+- Diagrams MUST be laid out in layers based on distance buckets.
+- Layout direction MUST be consistent within a view type
+  (preferred: left-to-right).
+- Nodes with equal distance MUST have stable ordering
+  (by role or identifier, deterministically).
+
+Cycles MAY be rendered using dashed or curved edges for readability,
+without affecting distance semantics.
+
+---
+
+## Generation Contract (for Tools / Claude Code)
+
+When generating diagrams:
+
+- Assume distance-aware layout by default.
+- Assume representative rendering by default.
+- Do NOT ask for SIP/CUBE/PE indices unless required.
+- Do NOT expand hidden abstraction levels.
+- Prefer architectural clarity over micro-hop fidelity.
+
+---
+
+## Consequences
+
+- Diagrams are stable across topology scaling.
+- Changes in distance or routing policy are reflected visually.
+- Diagrams serve as verifiable artifacts derived from the simulator model,
+  not as hand-maintained documentation.
+
+---
+
+## Links
+
+- SPEC Section 4 (Output, Debuggability, and Diagrams)
+- ADR-0002 (Routing distance semantics)
+- ADR-0006 (Topology compilation & automatic diagram generation)
diff --git a/docs/adr/ADR-0006-topology-compilation-distance-diagram.md b/docs/adr/ADR-0006-topology-compilation-distance-diagram.md
new file mode 100644
index 0000000..b9c8fe1
--- /dev/null
+++ b/docs/adr/ADR-0006-topology-compilation-distance-diagram.md
@@ -0,0 +1,130 @@
+# ADR-0006: Topology Compilation, Distance Extraction, and Automatic Diagram Generation
+
+## Status
+
+Accepted
+
+## Context
+
+The simulator compiles topology configuration (e.g., topology.yaml) into an explicit model graph,
+and computes routing and accumulated latency (distance).
+Diagrams should be generated from these authoritative artifacts to ensure consistency and avoid
+hand-maintained topology drawings.
+
+Additionally, for usability, diagrams should be emitted automatically into a stable location
+so that developers can preview them immediately in the repository.
+
+---
+
+## Decision
+
+### D1. Topology compilation is the single source of truth
+
+- topology.yaml (or equivalent config) is compiled into:
+  - an explicit system graph,
+  - node/link attributes,
+  - routing policies.
+- This compiled graph is the authoritative representation of the system.
+
+### D2. Distance extraction during compilation
+
+- During or immediately after topology compilation, the simulator MUST compute distance metadata
+  (accumulated latency) consistent with ADR-0002.
+- Distance metadata MUST be sufficient to support distance-aware diagram layout as defined in ADR-0005.
+- Distributed fabric segments (e.g., NOC) MAY have distance_mm = 0 per ADR-0002 D4;
+  layout placement for such nodes uses explicit position metadata rather than distance buckets.
+
+### D3. Diagram generation is a derived artifact
+
+- Diagrams MUST be generated from:
+  - the compiled topology graph,
+  - extracted distance metadata,
+  - view/layout rules defined in ADR-0005.
+- Diagram generation MUST NOT require additional hand-written topology descriptions.
+
+### D4. Automatic diagram emission to the repository
+
+- As part of topology compilation, the implementation MUST produce the following diagrams by default:
+  - SIP-level diagram (representative, distance-aware)
+  - CUBE-level diagram (representative, distance-aware)
+  - PE-level diagram (representative, distance-aware)
+- The default output directory is:
+  - `docs/diagrams/`
+- The generator MUST overwrite/update only when the compiled topology (or diagram rules) changes.
+
+### D5. View-specific projection and layout
+
+For each view (SIP / CUBE / PE):
+
+- The generator MUST project the compiled graph into a reduced view graph:
+  - hide/collapse nodes according to ADR-0005,
+  - preserve connectivity semantics relevant to that view,
+  - compute distance buckets and assign layout layers deterministically.
+- CUBE-level projection MUST include:
+  - XBAR (top/bottom), bridge (left/right), NOC, HBM_CTRL, shared SRAM, M_CPU, UCIe ports,
+    and PEs as opaque blocks.
+  - Distinct edge kinds for HBM path (PE→XBAR) vs non-HBM path (PE→NOC).
+- Default anchors are implicit (ADR-0005) and MUST NOT require instance indices.
+
+### D6. Output formats and determinism
+
+- The generator MUST output at least one of:
+  - Mermaid (Markdown-native)
+  - Graphviz DOT (rank-based control)
+  - SVG (mm-accurate layout, no external dependencies)
+- SVG is preferred when mm-accurate position metadata is available from the compiled topology.
+- Output MUST be deterministic:
+  - same topology + same rules → identical diagram text
+- File naming MUST be deterministic and stable (see "Output Conventions").
+
+### D7. Performance and caching
+
+- Diagram generation MAY be lazy and/or cached, as long as the outputs in `docs/diagrams/`
+  remain consistent with the compiled topology.
+- The implementation SHOULD use a cache key based on:
+  - topology content hash,
+  - routing policy version,
+  - diagram rules version,
+  - view type (SIP/CUBE/PE).
+
+---
+
+## Output Conventions
+
+### Directory
+
+- `docs/diagrams/` is the canonical output directory for generated diagrams.
+
+### File names (recommended, deterministic)
+
+- `system_view.svg` / `system_view.mmd` / `system_view.dot`
+- `sip_view.svg` / `sip_view.mmd` / `sip_view.dot`
+- `cube_view.svg` / `cube_view.mmd` / `cube_view.dot`
+- `pe_view.svg` / `pe_view.mmd` / `pe_view.dot`
+
+Optionally, for multi-topology workflows:
+
+- `sip_view__{topology_id}.svg`
+- `cube_view__{topology_id}.svg`
+- `pe_view__{topology_id}.svg`
+
+### Repository policy
+
+- Generated diagram files MAY be committed to the repository to enable diff-based review.
+- If committed, they MUST be reproducible from topology compilation.
+
+---
+
+## Consequences
+
+- Diagrams are always consistent with simulator behavior.
+- Architectural changes automatically propagate to visualizations.
+- Diagram diffs become meaningful indicators of architectural change.
+
+---
+
+## Links
+
+- SPEC Section 4 (Output, Debuggability, and Diagrams)
+- ADR-0002 (Distance semantics)
+- ADR-0005 (Diagram views and layout rules)
diff --git a/docs/adr/ADR-0007-runtime-api-boundaries.md b/docs/adr/ADR-0007-runtime-api-boundaries.md
new file mode 100644
index 0000000..51975be
--- /dev/null
+++ b/docs/adr/ADR-0007-runtime-api-boundaries.md
@@ -0,0 +1,89 @@
+# ADR-0007: Runtime API and Simulation Engine Boundaries
+
+## Status
+
+Accepted
+
+## Context
+
+The simulator consists of multiple layers with distinct responsibilities:
+
+- a host-facing API layer used by benchmarks and user code,
+- a discrete-event simulation engine that executes requests,
+- device components that model hardware behavior.
+
+Without strict boundaries, orchestration logic can leak into components,
+or simulation internals can become entangled with user-facing APIs.
+
+This ADR defines clear responsibility boundaries between:
+
+- runtime API,
+- simulation engine (sim_engine),
+- hardware components.
+
+---
+
+## Decision
+
+### D1. Runtime API is host-facing orchestration only
+
+The runtime API represents host/driver-level behavior and MUST:
+
+- expose high-level operations (tensor deployment, kernel launch),
+- submit requests only to endpoint components (e.g., IO_CPU),
+- await completion via futures/handles,
+- own and persist host-side metadata (tensor allocation maps, kernel bindings).
+
+The runtime API MUST NOT:
+
+- hardcode hop-by-hop routing or fan-out,
+- directly invoke internal components (M_CPU, PE_CPU, engines),
+- embed topology- or routing-specific assumptions.
+
+---
+
+### D2. Simulation engine executes and schedules requests
+
+The simulation engine (sim_engine) MUST:
+
+- inject requests into the compiled topology graph,
+- schedule and execute events using a discrete-event model,
+- manage correlation ids and completion tracking,
+- decompose operations into low-level requests when required
+  (e.g., MemoryWrite events).
+
+The simulation engine MUST NOT:
+
+- define tensor semantics,
+- define kernel execution policies,
+- expose internal graph details to the runtime API.
+
+---
+
+### D3. Components own fan-out and aggregation
+
+Device-side components MUST:
+
+- fan out requests to downstream domains
+  (IO_CPU → M_CPU → PE_CPU → schedulers/engines),
+- aggregate completion and failure signals,
+- propagate results deterministically upstream.
+
+The runtime API and the simulation engine MUST NOT orchestrate
+component-level fan-out explicitly.
+
+---
+
+## Consequences
+
+- Runtime APIs remain stable as topology and routing evolve.
+- Simulation internals can change without affecting user-facing code.
+- Component implementations remain swappable via DI.
+
+---
+
+## Links
+
+- SPEC R4, R7, R8
+- ADR-0008 (Tensor deployment)
+- ADR-0009 (Kernel execution)
diff --git a/docs/adr/ADR-0008-tensor-deploy-and-allocation.md b/docs/adr/ADR-0008-tensor-deploy-and-allocation.md
new file mode 100644
index 0000000..2ef5e82
--- /dev/null
+++ b/docs/adr/ADR-0008-tensor-deploy-and-allocation.md
@@ -0,0 +1,100 @@
+# ADR-0008: Tensor Deployment and Allocation (Host Allocator, PA-first)
+
+## Status
+
+Accepted
+
+## Context
+
+Benchmarks require PyTorch-like tensor semantics:
+
+- tensor creation (empty, fill),
+- deployment to accelerator devices (tensor.to()).
+
+In a realistic system, host software manages allocation/mapping and installs
+mappings for DMA/MMU. For Phase 0 we simplify (ADR-0011):
+
+- device memory operations use PA only,
+- VA/MMU/IOMMU is not modeled.
+
+To keep the host↔device interface minimal, we avoid a separate
+AllocateTensorMeta message. Instead, host allocation produces a PA shard map
+that is used directly by MemoryWrite/Read and KernelLaunch.
+
+---
+
+## Decision
+
+### D1. Tensor is a host-owned handle with PA shard mapping
+
+A Tensor object is a host-owned handle that encapsulates:
+
+- shape and dtype,
+- initialization intent,
+- device placement and allocation metadata as a PA shard map.
+
+After deployment, the Tensor handle MUST contain:
+
+- a list of shards, each with (sip,cube,pe,pa,nbytes,offset_bytes).
+
+This PA shard mapping is the single source of truth for kernel argument binding.
+
+---
+
+### D2. Deployment uses a host allocator (Phase 0)
+
+In Phase 0, tensor deployment produces PA shard mappings via a host allocator:
+
+- placement (split/replicate/hybrid) is decided by a DP policy,
+- allocation assigns PA ranges at the PE level and returns shard mappings,
+- the Tensor handle stores the resulting shard list deterministically.
+
+No separate host-visible device allocation RPC is required in Phase 0.
+
+---
+
+### D3. Data initialization and transfer uses MemoryWrite/Read only
+
+Any data initialization or transfer implied by a tensor (e.g., fill, copy)
+MUST be represented using Host ↔ IO_CPU messages only:
+
+- MemoryWrite
+- MemoryRead
+
+Rules:
+
+- MemoryWrite/Read MUST reference PA + (sip,cube,pe) tags (ADR-0012).
+- Allocation metadata MUST NOT be embedded as a separate allocation message.
+- Bulk tensor data MUST NOT be embedded in Phase 0 messages.
+
+The simulation engine schedules MemoryWrite/Read through the graph so that
+latency is computed by explicit traversal.
+
+---
+
+### D4. Extension path (non-breaking)
+
+Future ADRs MAY introduce optional VA/MMU/IOMMU modeling by adding:
+
+- virtual addressing in tensor handles,
+- mapping install steps,
+- translation latency/page granularity.
+
+The Phase 0 PA shard map remains a valid fast-path configuration.
+
+---
+
+## Consequences
+
+- Host↔IO_CPU contract remains minimal (MemoryRead/Write + KernelLaunch).
+- KernelLaunch can pass per-PE data placement explicitly via shard tags.
+- Early implementation stays simple and testable.
+
+---
+
+## Links
+
+- ADR-0011 (PA-first)
+- ADR-0012 (Host↔IO_CPU schema)
+- ADR-0007 (runtime_api vs sim_engine boundaries)
+- ADR-0009 (Kernel execution)
diff --git a/docs/adr/ADR-0009-kernel-execution-messaging.md b/docs/adr/ADR-0009-kernel-execution-messaging.md
new file mode 100644
index 0000000..91ca443
--- /dev/null
+++ b/docs/adr/ADR-0009-kernel-execution-messaging.md
@@ -0,0 +1,74 @@
+# ADR-0009: Kernel Execution Messaging and Completion Semantics
+
+## Status
+
+Accepted
+
+## Context
+
+Kernel execution is initiated by the host and proceeds through
+device control components:
+
+Host → IO_CPU → M_CPU → PE_CPU → schedulers → engines
+
+Completion propagates in reverse order.
+
+To keep benchmarks simple and topology-agnostic,
+kernel execution must be endpoint-driven with deterministic aggregation.
+
+---
+
+## Decision
+
+### D1. Kernel launch is an endpoint request
+
+A kernel launch is initiated by submitting a single KernelLaunch request
+to the IO_CPU endpoint.
+
+The runtime API MUST:
+
+- construct the kernel launch request,
+- submit it to IO_CPU,
+- await a single completion result.
+
+The runtime API MUST NOT orchestrate internal fan-out.
+
+---
+
+### D2. Tensor arguments are passed by metadata
+
+KernelLaunch requests MUST reference tensor arguments via:
+
+- host-owned tensor handles, or
+- resolved device address maps derived from those handles.
+
+Bulk tensor data MUST NOT be embedded in kernel launch messages.
+
+---
+
+### D3. Fan-out and aggregation are component responsibilities
+
+- IO_CPU fans out work to M_CPUs.
+- M_CPU fans out work to PE_CPUs.
+- PE_CPU manages kernel execution and engine dispatch.
+
+Completion semantics:
+
+- M_CPU completes when all targeted PEs complete or a failure policy triggers.
+- IO_CPU completes when all targeted CUBEs complete or a failure policy triggers.
+
+---
+
+### D4. Completion and failure propagation
+
+- All messages MUST carry correlation identifiers.
+- Completion and failure MUST propagate deterministically to the host.
+- The simulation engine provides futures/handles to observe completion.
+
+---
+
+## Links
+
+- SPEC R1, R2, R7, R8
+- ADR-0007 (Runtime API boundaries)
+- ADR-0008 (Tensor deployment)
diff --git a/docs/adr/ADR-0010-cli-device-selection.md b/docs/adr/ADR-0010-cli-device-selection.md
new file mode 100644
index 0000000..bed601b
--- /dev/null
+++ b/docs/adr/ADR-0010-cli-device-selection.md
@@ -0,0 +1,62 @@
+# ADR-0010: CLI Device Selection and Multi-Device Execution Semantics
+
+## Status
+
+Accepted
+
+## Context
+
+Benchmarks represent device-agnostic workloads that operate on a single device.
+Users may want to run a benchmark:
+
+- on a specific device, or
+- across all devices in the system.
+
+Device enumeration must not leak into benchmarks or runtime APIs.
+
+---
+
+## Decision
+
+### D1. Benchmarks are single-device by design
+
+- A benchmark MUST define behavior for a single device only.
+- A benchmark MUST accept a device identifier as input.
+- Benchmarks MUST NOT enumerate or loop over multiple devices.
+
+---
+
+### D2. CLI controls device selection
+
+The `kernbench run` command supports an optional `--device` argument:
+
+- If `--device <id>` is specified:
+  - the benchmark executes once for the specified device.
+
+- If `--device` is omitted:
+  - the benchmark executes once using all the SIPs discovered in the topology.
+
+---
+
+### D3. Multi-device execution is logically parallel
+
+When running on multiple devices:
+
+- benchmark executions are submitted to a single simulation engine instance,
+- executions are logically parallel in simulation time,
+- inter-device contention is naturally modeled.
+
+---
+
+### D4. Runtime API and simulation engine remain device-scoped
+
+- Runtime API calls operate on one device per invocation.
+- The simulation engine schedules all requests deterministically.
+- Neither layer enumerates devices.
+
+---
+
+## Links
+
+- SPEC R7, R8
+- ADR-0007 (Runtime API boundaries)
diff --git a/docs/adr/ADR-0011-memory-addressing-simplification.md b/docs/adr/ADR-0011-memory-addressing-simplification.md
new file mode 100644
index 0000000..3fa7003
--- /dev/null
+++ b/docs/adr/ADR-0011-memory-addressing-simplification.md
@@ -0,0 +1,65 @@
+# ADR-0011: Memory Addressing Simplification (PA-first)
+
+## Status
+
+Accepted
+
+## Context
+
+A realistic system uses host-side virtual addressing and an MMU/IOMMU-style
+translation path for DMA: host allocates physical memory at PE level, maps it
+into a virtual address space, installs mappings, and DMA requests use virtual
+addresses that are translated to physical addresses.
+
+For early development, we want a minimal, deterministic model that enables:
+
+- correct routing and latency accounting through the graph,
+- stable tensor deployment and kernel execution semantics,
+- future extension toward VA/MMU without rewriting workflows.
+
+---
+
+## Decision
+
+### D1. Phase 0 model is PA-only
+
+The simulator uses a PA-first model:
+
+- All device memory accesses (MemoryRead/MemoryWrite) operate on device physical
+  addresses (PA) plus size.
+- Tensor handles store PA-based shard mappings after deployment.
+- KernelLaunch passes tensor arguments as PA-based mappings (or references to them).
+- MMU/IOMMU concepts (virtual address spaces, page tables, translation latency)
+  are NOT modeled in Phase 0.
+
+### D2. Allocation produces PA mappings
+
+Device allocation selects PE-local memory regions and returns PA mappings
+sufficient to execute kernels and issue DMA requests.
+
+### D3. Extension path (non-breaking)
+
+A future ADR MAY introduce an optional VA/MMU layer by:
+
+- introducing virtual addresses in tensor handles,
+- adding a mapping-install step,
+- modeling translation latency and page granularity.
+
+The Phase 0 PA model remains a valid fast-path configuration.
+
+---
+
+## Consequences
+
+- Early implementation stays simple and testable.
+- All latency remains explicit via graph traversal, not hidden translation.
+- Future VA/MMU modeling can be added without breaking existing benchmarks.
+
+---
+
+## Links
+
+- ADR-0007 (runtime_api vs sim_engine boundaries)
+- ADR-0008 (tensor deployment)
+- ADR-0009 (kernel execution)
+- SPEC R2 (latency by traversal)
diff --git a/docs/adr/ADR-0012-host-io-message-schema.md b/docs/adr/ADR-0012-host-io-message-schema.md
new file mode 100644
index 0000000..b3f4c6f
--- /dev/null
+++ b/docs/adr/ADR-0012-host-io-message-schema.md
@@ -0,0 +1,232 @@
+# ADR-0012: Host ↔ IO_CPU Message Schema (PA-first, PE-tagged)
+
+## Status
+
+Accepted
+
+## Context
+
+Phase 0 uses a PA-first memory model (ADR-0011):
+
+- memory operations use device physical addresses (PA) only,
+- VA/MMU/IOMMU is not modeled.
+
+The host-facing runtime API interacts with the device via the IO_CPU endpoint.
+We define stable, minimal message schemas for Host ↔ IO_CPU so that:
+
+- benchmarks remain stable,
+- IO_CPU-internal fan-out/aggregation can evolve independently,
+- completion and failure propagation is deterministic.
+
+We also require PE-tagging (Option A): each shard explicitly carries (sip,cube,pe)
+so IO_CPU can deterministically route/fan-out without relying on PA decoding.
+
+---
+
+## Decision
+
+### D1. Contract scope
+
+This schema is the stable contract ONLY for Host ↔ IO_CPU.
+
+Messages beyond IO_CPU (to M_CPU, PE_CPU, schedulers, engines) are component-internal
+and are NOT part of this host contract in Phase 0.
+
+---
+
+### D2. Required message set
+
+The runtime API MUST use only these message types for Host ↔ IO_CPU:
+
+- MemoryWrite
+- MemoryRead
+- KernelLaunch
+
+All operations required by benchmarks (tensor init/copy, kernel run) MUST be expressible
+with these messages.
+
+---
+
+### D3. Common envelope (mandatory for all requests)
+
+All Host ↔ IO_CPU requests MUST include:
+
+- `msg_type: str`
+- `correlation_id: str`
+  - generated by the host
+  - used to match responses deterministically
+- `request_id: str`
+  - unique within a correlation_id
+- `target_device: str`
+  - device identifier (e.g., "sip:0")
+- `timestamp_tag: str | None` (optional)
+  - debug tag only; MUST NOT affect determinism
+
+All Host ↔ IO_CPU responses MUST include:
+
+- `correlation_id: str`
+- `request_id: str`
+- `completion: Completion`
+
+---
+
+### D4. Completion schema (mandatory)
+
+`Completion` MUST have:
+
+- `ok: bool`
+- `error_code: str | None`
+- `error_message: str | None`
+
+Rules:
+
+- If `ok == true` then `error_code` and `error_message` MUST be null.
+- If `ok == false` then `error_code` MUST be non-null.
+- Completion semantics MUST be deterministic.
+
+---
+
+### D5. MemoryWrite schema (PA-first, PE-tagged)
+
+`MemoryWrite` represents a host-initiated write/initialize operation to device memory.
+
+Mandatory fields:
+
+- common envelope fields (D3)
+- destination placement tags (Option A):
+  - `dst_sip: int`
+  - `dst_cube: int`
+  - `dst_pe: int`
+- `dst_pa: int`
+  - destination physical address in the destination PE's address space
+- `nbytes: int`
+- `src_kind: "pattern" | "host_buffer_ref"`
+  - Phase 0 MUST support "pattern"
+- `pattern: Pattern | None`
+  - required if `src_kind == "pattern"`
+
+`Pattern` (Phase 0 mandatory support):
+
+- `pattern_kind: "zero" | "fill_u8" | "fill_u16" | "fill_u32" | "fill_fp16" | "fill_fp32"`
+- `value: number | None`
+  - required for fill_*; ignored for zero
+
+Optional fields:
+
+- `dst_mem_kind: "HBM" | "TCM" | "AUTO"` (default "AUTO")
+- `debug_label: str | None`
+
+Notes:
+
+- This message MUST NOT embed bulk tensor data in Phase 0.
+- All latency MUST come from explicit graph traversal and modeled components.
+
+---
+
+### D6. MemoryRead schema (PA-first, PE-tagged)
+
+`MemoryRead` represents a host-initiated read from device memory.
+
+Mandatory fields:
+
+- common envelope fields (D3)
+- source placement tags (Option A):
+  - `src_sip: int`
+  - `src_cube: int`
+  - `src_pe: int`
+- `src_pa: int`
+- `nbytes: int`
+
+Optional fields:
+
+- `dst_kind: "host_sink" | "discard"` (default "host_sink")
+- `debug_label: str | None`
+
+Response payload:
+
+- actual bytes are NOT required in Phase 0 (latency/traces focus)
+- implementations MAY return lightweight stats or hashes later via a new ADR
+
+---
+
+### D7. KernelLaunch schema (PA-first, PE-tagged shards)
+
+`KernelLaunch` represents launching a kernel on a target device via IO_CPU.
+
+Mandatory fields:
+
+- common envelope fields (D3)
+- `kernel_ref: KernelRef`
+- `args: list[KernelArg]`
+
+`KernelRef` MUST have:
+
+- `name: str`
+- `kind: "deployed" | "builtin"`
+- `deploy_pa: int | None` — PA where kernel binary was deployed (required for "deployed")
+- `deploy_sip: int` — SIP where binary resides
+- `deploy_cube: int` — cube where binary resides
+- `deploy_pe: int` — PE where binary resides
+- `nbytes_code: int` — kernel binary size (for BW modeling)
+
+Kernel binaries MUST be pre-deployed to device memory via MemoryWrite.
+KernelLaunch MUST NOT embed kernel source code or IR in the launch message.
+
+`KernelArg` supports tensor args by PA mapping and scalars by value.
+
+Tensor arg (mandatory):
+
+- `arg_kind: "tensor"`
+- `tensor_pa_map: TensorPAMap`
+
+`TensorPAMap` MUST have:
+
+- `shards: list[TensorShard]`
+
+`TensorShard` MUST have (Option A, mandatory):
+
+- `sip: int`
+- `cube: int`
+- `pe: int`
+- `pa: int`
+- `nbytes: int`
+- `offset_bytes: int`
+
+Scalar arg (mandatory):
+
+- `arg_kind: "scalar"`
+- `dtype: "i32" | "i64" | "fp16" | "fp32" | "bool"`
+- `value: number | bool`
+
+Optional KernelLaunch fields:
+
+- `grid: dict | None`
+- `meta: dict | None`
+- `failure_policy: "fail_fast" | "collect_all"` (default "fail_fast")
+- `debug_label: str | None`
+
+Notes:
+
+- KernelLaunch MUST NOT embed bulk tensor data.
+- KernelLaunch MUST be submitted only to the IO_CPU endpoint.
+- IO_CPU MUST fan out work internally using the shard (sip,cube,pe) tags.
+
+---
+
+## Verification Notes
+
+Tests SHOULD validate:
+
+- schema validation rejects missing mandatory fields,
+- deterministic correlation/response matching,
+- MemoryWrite/Read/KernelLaunch produce explicit hop traces,
+- all routed requests incur latency > 0.
+
+---
+
+## Links
+
+- ADR-0011 (PA-first memory addressing)
+- ADR-0007 (runtime_api vs sim_engine boundaries)
+- ADR-0009 (kernel execution fan-out/aggregation)
+- SPEC R2, R7, R8
diff --git a/docs/adr/ADR-0013-verification_strategy.md b/docs/adr/ADR-0013-verification_strategy.md
new file mode 100644
index 0000000..3f83712
--- /dev/null
+++ b/docs/adr/ADR-0013-verification_strategy.md
@@ -0,0 +1,139 @@
+# ADR-0013: Verification Strategy and Phase 1 Test Plan
+
+## Status
+
+Accepted
+
+## Context
+
+KernBench is a system-level simulator whose correctness is defined by:
+
+- adherence to SPEC-defined invariants,
+- determinism and debuggability,
+- explicit modeling of routing and latency.
+
+Given the evolving implementation, we need a stable verification strategy
+that prevents architectural drift while allowing incremental development.
+
+This ADR defines the Phase 1 verification plan and what constitutes
+"correct behavior" for early implementations.
+
+---
+
+## Decision
+
+### D1. Verification is contract-based
+
+Verification MUST be derived from:
+
+- SPEC requirements,
+- accepted ADRs.
+
+Tests MUST validate architectural contracts, not incidental implementation details.
+
+---
+
+### D2. Phase 1 verification scope
+
+Phase 1 verification focuses on:
+
+- message contract validity (ADR-0012),
+- routing and fan-out semantics at the IO_CPU boundary (ADR-0009),
+- PA-first memory addressing and shard tagging (ADR-0011),
+- core latency and trace invariants (SPEC 0.1, R2).
+
+Microarchitectural accuracy, bandwidth contention, and cycle-level behavior
+are explicitly out of scope in Phase 1.
+
+---
+
+### D3. Required Phase 1 verification cases
+
+The following verification cases MUST be supported by the implementation:
+
+#### V1. Message schema validation
+
+- KernelLaunch requests missing `(sip, cube, pe)` in any tensor shard MUST be rejected.
+- MemoryWrite/MemoryRead requests missing destination/source placement tags MUST be rejected.
+- Completion results MUST follow the `ok / error_code / error_message` contract.
+
+#### V2. IO_CPU fan-out and aggregation
+
+Given:
+
+- a topology with one SIP, one CUBE, and two PEs,
+- a KernelLaunch request containing two tensor shards targeting different PEs,
+
+The system MUST:
+
+- submit a single KernelLaunch to IO_CPU,
+- fan out work internally to both PEs,
+- aggregate completion and return a single deterministic completion to the host.
+
+#### V3. Latency and trace invariants
+
+For any valid request:
+
+- the hop-by-hop trace MUST be non-empty,
+- total latency MUST be greater than zero,
+- repeated runs with identical inputs MUST produce identical traces.
+
+#### V4. Topology independence and cross-domain coverage
+
+Verification cases MUST pass for multiple topology shapes, including:
+
+- minimal: (1 SIP, 1 CUBE, 1 PE)
+- multi-PE: (1 SIP, 1 CUBE, N PEs)
+- multi-CUBE within a SIP: (1 SIP, M CUBEs, ≥1 PE per CUBE)
+- multi-SIP tray: (K SIPs, ≥1 CUBE per SIP, ≥1 PE per CUBE)
+
+For multi-CUBE and multi-SIP topologies, Phase 1 verification focuses on:
+
+- explicit connectivity (required links exist),
+- deterministic routing and control-path traversal,
+- non-empty traces and latency > 0 for representative cross-domain requests
+  (inter-CUBE and inter-SIP paths).
+
+Tests MUST NOT hardcode topology sizes, node ids, or link counts. Instead, tests MUST derive expectations from the compiled topology metadata.
+
+---
+
+### D4. Phase 1 artifacts
+
+Phase 1 MAY include:
+
+- verification-only test code,
+- topology fixtures,
+- trace inspection utilities.
+
+Phase 1 MUST NOT require:
+
+- production code changes solely to satisfy tests,
+- weakening or removing tests to allow progress.
+
+---
+
+### D5. Phase 2 enforcement
+
+Phase 2 (Apply) MUST:
+
+- run the Phase 1 verification cases,
+- roll back all changes if any verification fails,
+- preserve tests as authoritative contracts.
+
+---
+
+## Consequences
+
+- Architectural correctness is enforced early.
+- Tests serve as executable documentation of system behavior.
+- Implementation remains flexible without losing rigor.
+
+---
+
+## Links
+
+- SPEC 0.1, R2, R6
+- ADR-0011 (PA-first memory addressing)
+- ADR-0012 (Host ↔ IO_CPU message schema)
+- ADR-0009 (Kernel execution semantics)
diff --git a/docs/adr/ADR-0014-pe-internal-execution-model.md b/docs/adr/ADR-0014-pe-internal-execution-model.md
new file mode 100644
index 0000000..99023a0
--- /dev/null
+++ b/docs/adr/ADR-0014-pe-internal-execution-model.md
@@ -0,0 +1,364 @@
+# ADR-0014: PE Internal Execution Model (PE_CPU, PE_SCHEDULER, and Composite Commands)
+
+## Status
+
+Proposed
+
+## Context
+
+ADR-0003 (system hierarchy) and ADR-0009 (kernel execution semantics) reference PE internals but do not define:
+
+- the dispatch model inside a PE,
+- the responsibilities of PE_SCHEDULER,
+- the PE_TCM-centric dataflow contract used by accelerator engines.
+
+We need a deterministic and debuggable PE-internal execution contract that supports:
+
+- simple single-engine commands
+- composite commands that build a tiled pipeline across DMA and accelerator engines
+
+The simulator must produce deterministic traces and allow modeling of PE-internal pipelining without introducing nondeterministic engine scheduling.
+
+## Decision
+
+### D1. PE internal component roles
+
+Each PE contains the following logical components.
+
+**PE_CPU**
+
+- Executes kernel instruction stream or kernel control logic.
+- Generates PE commands.
+- Submits commands to PE_SCHEDULER.
+- PE_CPU does NOT enqueue work directly into engine queues.
+
+**PE_SCHEDULER**
+
+- The sole dispatcher inside a PE.
+- Receives commands from PE_CPU.
+- Expands composite commands into sub-commands.
+- Tracks dependencies and command state.
+- Dispatches work to engine queues.
+- Manages tile scheduling for composite commands.
+
+**PE_DMA**
+
+- Handles memory transfers between PE_TCM and external memory domains.
+- PE_DMA has **dual egress** at the CUBE level:
+  - **→ XBAR**: dedicated path to HBM (local and cross-half via bridge)
+  - **→ NOC**: path to non-HBM destinations (shared SRAM, inter-cube UCIe, etc.)
+- Supported directions include:
+  - HBM → PE_TCM (via XBAR)
+  - PE_TCM → HBM (via XBAR)
+  - PE_TCM → shared SRAM (via NOC)
+  - PE_TCM → other memory domains (via NOC, if supported by topology)
+
+**PE_GEMM**
+
+- Matrix multiplication engine.
+- Reads activations from PE_TCM.
+- May stream weights directly from HBM.
+
+**PE_MATH**
+
+- Element-wise computation engine.
+- Reads and writes PE_TCM.
+
+**PE_TCM**
+
+- Local SRAM used as the staging memory for accelerator operations.
+
+---
+
+### D2. Command lifecycle and queues
+
+PE_SCHEDULER maintains three logical structures.
+
+**SubmissionQueue**
+
+- Written by PE_CPU.
+- Contains incoming PE commands waiting to be processed.
+
+**InflightTable**
+
+- Owned and mutated only by PE_SCHEDULER.
+- Tracks:
+  - expanded sub-commands
+  - dependency state
+  - engine assignment
+  - completion status
+
+**CompletionQueue**
+
+- Written by PE_SCHEDULER.
+- Contains final completion records for commands.
+
+**Single-writer rule**
+
+- Only PE_SCHEDULER is allowed to mutate command completion state.
+- Engine components must report completion via explicit completion events/messages.
+
+**Command completion**
+
+A command becomes DONE when both of the following have happened:
+
+- all of its sub-commands have completed, and
+- PE_SCHEDULER has published a completion record to CompletionQueue.
+
+---
+
+### D3. Dispatch modes
+
+PE commands are divided into two categories.
+
+#### D3.1 Simple command
+
+A simple command expands to exactly one engine sub-command.
+
+Examples include:
+
+- DMA transfer
+- GEMM compute
+- MATH compute
+
+Execution flow:
+
+```
+PE_CPU → SubmissionQueue → PE_SCHEDULER → engine queue → engine execution → completion event → PE_SCHEDULER → CompletionQueue
+```
+
+#### D3.2 Composite command (tiled pipeline)
+
+Composite commands implement tiled pipelined execution across engines.
+
+Each tile executes the following pipeline:
+
+```
+Input DMA (READ)
+→ Compute (GEMM or MATH)
+→ Output DMA (WRITE)
+```
+
+**Tiling rule**
+
+If the DMA payload exceeds the hardware tile size, PE_SCHEDULER splits the transfer into tiles.
+Each tile is assigned a monotonically increasing `tile_id`.
+
+**Tile dependency rules**
+
+For tile `t`:
+
+- Compute must wait for input DMA: `DMA_READ(t) → COMPUTE(t)`
+- Output DMA must wait for compute: `COMPUTE(t) → DMA_WRITE(t)`
+- All dependencies are enforced by PE_SCHEDULER.
+
+**Overlap policy (Phase 0 default)**
+
+Operations for different tiles may overlap when engine resources permit.
+
+Allowed overlaps:
+
+```
+DMA_READ(t+1) ∥ COMPUTE(t)
+DMA_WRITE(t−1) ∥ COMPUTE(t)
+DMA_READ(t+1) ∥ DMA_WRITE(t)
+```
+
+Disallowed overlaps:
+
+```
+GEMM(t) ∥ GEMM(t′)
+MATH(t) ∥ MATH(t′)
+GEMM(t) ∥ MATH(t′)
+```
+
+---
+
+### D4. Engine execution model (Phase 0 default)
+
+Each engine behaves as a deterministic service resource.
+
+**DMA engine**
+
+PE_DMA contains two independent channels.
+
+```
+DMA_READ capacity  = 1
+DMA_WRITE capacity = 1
+```
+
+Rules:
+
+- DMA_READ and DMA_WRITE may execute concurrently.
+- Multiple READs cannot overlap.
+- Multiple WRITEs cannot overlap.
+
+Example allowed:
+
+```
+DMA_READ(t+1) ∥ DMA_WRITE(t)
+```
+
+Example not allowed:
+
+```
+DMA_READ(t) ∥ DMA_READ(t+1)
+DMA_WRITE(t) ∥ DMA_WRITE(t+1)
+```
+
+**Compute engine**
+
+Compute operations share a single compute resource.
+
+```
+PE_ACCEL capacity = 1
+```
+
+Both GEMM and MATH require this shared compute slot.
+
+Consequences:
+
+- GEMM ∥ GEMM not allowed
+- MATH ∥ MATH not allowed
+- GEMM ∥ MATH not allowed
+
+Only one compute operation can run in a PE at a time.
+
+**Compute opcode restriction**
+
+Composite commands contain one compute opcode only.
+
+Examples:
+
+```
+COMPOSITE_GEMM
+COMPOSITE_MATH
+```
+
+Mixed compute pipelines such as `GEMM → MATH` are not supported in Phase 0.
+
+**Engine completion signaling**
+
+Every engine emits a completion event when a sub-command finishes.
+Completion events are delivered to PE_SCHEDULER.
+
+---
+
+### D5. Dataflow model
+
+Compute operations use a TCM-centric dataflow model.
+
+**Input path (HBM)**
+
+```
+HBM → XBAR → PE_DMA (DMA_READ) → PE_TCM
+```
+
+**Input path (shared SRAM)**
+
+```
+Shared SRAM → NOC → PE_DMA (DMA_READ) → PE_TCM
+```
+
+**Compute stage**
+
+Compute engines read input tensors from PE_TCM.
+
+```
+PE_TCM → GEMM / MATH
+```
+
+Weights for GEMM may optionally stream directly from HBM (via XBAR).
+
+**Output path (HBM)**
+
+Compute results are written to PE_TCM, then DMA writes to HBM.
+
+```
+PE_TCM → PE_DMA (DMA_WRITE) → XBAR → HBM
+```
+
+**Output path (shared SRAM)**
+
+```
+PE_TCM → PE_DMA (DMA_WRITE) → NOC → Shared SRAM
+```
+
+#### D5.1 PE_TCM partitioning and ownership boundary
+
+The PE_TCM address space is partitioned into two logical regions.
+
+**SchedulerReservedTCM**
+
+- A staging region owned exclusively by PE_SCHEDULER.
+- This region is used for composite command tile buffers.
+- PE_SCHEDULER:
+  - partitions this region into tile buffers
+  - assigns buffers for DMA_READ, COMPUTE, and DMA_WRITE stages
+  - guarantees input/output buffer separation
+  - manages tile buffer lifetime
+
+**AllocatableTCM**
+
+- General-purpose region managed by PEMemAllocator.
+- Used by host or DP-visible allocations.
+
+**Visibility rule (hard isolation)**
+
+- PEMemAllocator must not see or allocate memory inside SchedulerReservedTCM.
+- SchedulerReservedTCM is excluded from allocator-managed ranges by construction.
+- This prevents DP or host allocations from interfering with scheduler staging buffers.
+
+**Tile buffer rules**
+
+Within SchedulerReservedTCM:
+
+- input buffers and output buffers must not overlap
+- PE_SCHEDULER assigns tile buffers for DMA and compute stages
+- tile buffers remain valid until the corresponding DMA_WRITE completes
+- Buffer reuse is allowed only after the tile lifetime finishes.
+
+---
+
+### D6. Observability and trace contract
+
+The simulator must emit deterministic trace events.
+
+Required events include:
+
+- `command_submitted`
+- `sub_command_dispatched`
+- `engine_start`
+- `engine_complete`
+- `tile_ready`
+- `command_complete`
+
+Trace ordering must be deterministic for identical inputs.
+
+---
+
+### D7. Topology representation
+
+PE internal components are declared in `cube.pe_template`.
+
+The template is instantiated once per PE.
+
+PE instances are derived from `cube.pe_layout`.
+
+External connectivity such as:
+
+- PE_DMA → XBAR (HBM data path)
+- PE_DMA → NOC (non-HBM data path: shared SRAM, inter-cube UCIe)
+- NOC → PE_CPU (command path from M_CPU)
+
+is modeled at the CUBE level (see ADR-0003 D3).
+
+---
+
+## Links
+
+- SPEC R3, R4
+- ADR-0003 D4 (PE-level system hierarchy)
+- ADR-0005 View C (PE-level diagram)
+- ADR-0008 D2 (PA-level allocation at PE scope; PEMemAllocator is the per-PE allocator instance)
+- ADR-0009 D3 (kernel execution fan-out and PE_CPU dispatch)
diff --git a/docs/adr/ADR-0015-component-port-wire-model.md b/docs/adr/ADR-0015-component-port-wire-model.md
new file mode 100644
index 0000000..a9c4a94
--- /dev/null
+++ b/docs/adr/ADR-0015-component-port-wire-model.md
@@ -0,0 +1,178 @@
+# ADR-0015: Component Port/Wire Model and Fabric Routing
+
+## Status
+
+Proposed
+
+## Context
+
+ADR-0007 D2 assigns path-walking and low-level request decomposition to the simulation engine.
+In practice, the engine iterates the topology path and calls `run()` on each component
+sequentially — conflating routing policy with component behavior and preventing realistic
+hardware modeling (queues, contention, fan-out).
+
+ADR-0007 D3 already states that components own fan-out and aggregation, but the current
+implementation does not enforce this for fabric traversal.
+
+This ADR defines:
+
+- how components communicate via typed port queues,
+- how propagation delay is modeled (wire processes),
+- the fabric path for Memory R/W through M_CPU.DMA,
+- the reduced role of the simulation engine,
+- M_CPU.DMA as an internal subcomponent of M_CPU.
+
+---
+
+## Decision
+
+### D1. Component port model
+
+Each component has typed input/output ports modeled as SimPy Stores:
+
+```
+in_ports:  dict[str, simpy.Store]   # keyed by source node_id
+out_ports: dict[str, simpy.Store]   # keyed by destination node_id
+```
+
+Ports are created at engine initialization based on graph edges.
+Each directed edge (src → dst) results in:
+
+- `src.out_ports[dst]`  — the sending end
+- `dst.in_ports[src]`   — the receiving end
+
+---
+
+### D2. Wire process (propagation delay)
+
+For each directed edge (src, dst) in the topology graph, a SimPy wire process
+models propagation delay:
+
+```python
+def wire_process(env, out_port, in_port, delay_ns):
+    while True:
+        cmd = yield out_port.get()
+        yield env.timeout(delay_ns)
+        yield in_port.put(cmd)
+```
+
+Wire processes are started at engine initialization.
+BW constraints are enforced by the sending component's out_port capacity or token model,
+not by the wire process itself.
+
+---
+
+### D3. Engine role (reduced)
+
+The simulation engine MUST:
+
+- wire components at initialization (create port Stores, start wire processes),
+- identify the entry component for each request type (PCIE_EP),
+- put the request into the entry component's in_port,
+- wait for a completion event.
+
+The simulation engine MUST NOT:
+
+- walk the topology path during request execution,
+- call component `run()` methods directly,
+- track per-hop latency or decompose fan-out.
+
+This supersedes ADR-0007 D2's "decompose operations into low-level requests" clause.
+ADR-0007 D2 must be amended accordingly.
+
+---
+
+### D4. Unified fabric path for Memory R/W and Kernel Launch
+
+Both Memory R/W and Kernel Launch use the same fabric path to reach the target cube's M_CPU.
+The difference is what M_CPU does upon receiving the request.
+
+**Forward path (IO_CPU → target M_CPU):**
+
+```
+IO_CPU
+  → [transit cubes: ucie_out → wire → ucie_in → noc → ucie_out]  (zero or more)
+  → target cube: ucie_in → noc → M_CPU
+```
+
+**At M_CPU (diverges by operation type):**
+
+```
+Memory R/W:     M_CPU → M_CPU.DMA → noc → hbm_ctrl
+Kernel Launch:  M_CPU → PE[0..n] (parallel fan-out)
+```
+
+**Completion path (reverse, same fabric):**
+
+```
+Memory R/W:     hbm_ctrl → noc → M_CPU.DMA → M_CPU
+Kernel Launch:  PE[0..n] all complete → M_CPU (aggregation)
+
+M_CPU → [transit cubes: ucie → noc → ucie] → IO_CPU → runtime_api
+```
+
+---
+
+### D5. M_CPU.DMA is an internal subcomponent of M_CPU
+
+M_CPU.DMA is NOT a separate topology node.
+It is an internal subcomponent owned by the M_CPU component implementation.
+
+M_CPU.DMA:
+
+- owns the DMA READ and DMA WRITE queues (capacity=1 each, per ADR-0014 D4),
+- issues memory requests over the NOC to hbm_ctrl,
+- receives completion from hbm_ctrl via the NOC,
+- reports completion to M_CPU,
+- is created and managed inside M_CPU's `__init__` and `run()`.
+
+M_CPU.DMA does not appear as a node in the compiled topology graph.
+
+---
+
+### D6. Transit cube forwarding
+
+A cube that is not the target of a memory or kernel request acts as a transit node.
+Transit cubes forward requests without consuming them:
+
+```
+ucie_in (from upstream) → noc → ucie_out (to downstream)
+```
+
+The transit forwarding decision is made entirely within the ucie_in component.
+The noc and ucie_out components in a transit cube then pass the packet along without modification.
+
+---
+
+### D7. _formula_latency is preserved as a lower-bound cross-check
+
+The path-based formula latency function (`_formula_latency`) is preserved in the engine
+as a lower bound for correctness verification.
+
+Invariant:
+
+- Phase 0: `_formula_latency == component model total_ns`
+- Phase 1+: `_formula_latency <= component model total_ns` (contention adds queueing)
+
+This function is independent of the port/wire model and requires only the topology graph.
+It is used for shard comparison in `_route_kernel` and as a regression guard.
+
+---
+
+## Consequences
+
+- Components model realistic hardware behavior (queues, contention, fan-out).
+- Propagation delay is modeled accurately per edge.
+- Engine is decoupled from routing policy.
+- Component implementations remain swappable via DI (ADR-0007 D3).
+- ADR-0007 D2 must be amended to remove path-walking from engine responsibilities.
+- ADR-0009 D3 should be updated to reference the unified fabric path (D4 above).
+
+---
+
+## Links
+
+- ADR-0007 D2 (to be amended: engine path-walking clause)
+- ADR-0009 D3 (kernel execution fan-out; fabric path to be referenced)
+- ADR-0014 D4 (DMA engine capacity=1)
+- ADR-0012 D1 (host ↔ IO_CPU message schema; M_CPU.DMA is component-internal)
diff --git a/docs/di-presentation.md b/docs/di-presentation.md
new file mode 100644
index 0000000..5f64572
--- /dev/null
+++ b/docs/di-presentation.md
@@ -0,0 +1,363 @@
+# 실무 DI 패턴: kernbench 구현으로 배우는 Dependency Injection
+
+---
+
+## 슬라이드 1 — 오늘 이야기할 것
+
+**질문:** 코드를 어떻게 설계해야 테스트하기 쉽고, 갈아끼우기 쉬울까?
+
+**답:** Dependency Injection (DI)
+
+오늘은 이론이 아니라 **실제로 돌아가는 시뮬레이터 코드**를 보면서 배웁니다.
+
+```
+kernbench
+└── AI 가속기 하드웨어를 Python으로 시뮬레이션하는 프레임워크
+    - 수십 개의 하드웨어 컴포넌트 (NOC, HBM, PE, CPU...)
+    - 각 컴포넌트는 런타임에 교체 가능
+    - 테스트에서 Mock 컴포넌트로 즉시 대체 가능
+```
+
+---
+
+## 슬라이드 2 — DI가 없으면 어떤 일이 생기나
+
+```python
+# ❌ DI 없는 코드
+class IoCpuComponent:
+    def run(self, env, nbytes):
+        router = PathRouter()        # 직접 생성 — 교체 불가
+        hbm = HbmCtrlComponent()    # 직접 생성 — 교체 불가
+        yield env.timeout(10.0)
+```
+
+**문제:**
+- 테스트할 때 실제 `PathRouter`와 `HbmCtrl`이 항상 따라온다
+- 컴포넌트를 Mock으로 바꾸려면 **소스 코드를 수정**해야 한다
+- 다른 topology(다른 라우팅 전략)를 쓰고 싶으면 **또 수정**
+
+> 클래스가 자기 의존성을 스스로 만들면, 그 클래스는 의존성과 결합된다
+
+---
+
+## 슬라이드 3 — DI의 핵심 원칙
+
+**의존성은 밖에서 만들어서 안으로 넣어준다**
+
+```
+┌────────────────────────────┐
+│  조립자 (Assembler)         │  ← 누가 무엇을 쓸지 결정
+│  GraphEngine.__init__      │
+└────────────┬───────────────┘
+             │ ctx 주입
+             ▼
+┌────────────────────────────┐
+│  컴포넌트 (Component)       │  ← 어떻게 동작하는지만 알면 됨
+│  IoCpuComponent            │
+│    self.ctx.router.find_path(...)  ← 그냥 사용
+└────────────────────────────┘
+```
+
+**세 가지 역할 분리:**
+1. **Interface** — 무엇을 할 수 있는가 (`ComponentBase`)
+2. **Implementation** — 어떻게 하는가 (`IoCpuComponent`, `HbmCtrlComponent`, ...)
+3. **Assembler** — 무엇을 연결할 것인가 (`GraphEngine`)
+
+---
+
+## 슬라이드 4 — 패턴 1: Constructor Injection
+
+> 생성자로 의존성을 받는다
+
+```python
+# kernbench/components/base.py
+
+class ComponentBase(ABC):
+    def __init__(self, node: Node, ctx: ComponentContext | None = None):
+        self.node = node
+        self.ctx = ctx          # 외부에서 주입받은 의존성
+        self.in_ports: dict[str, simpy.Store] = {}
+        self.out_ports: dict[str, simpy.Store] = {}
+```
+
+```python
+# 사용 측 — ctx를 직접 만들지 않는다
+class IoCpuComponent(ComponentBase):
+    def _dispatch(self, env, txn):
+        path = self.ctx.router.find_node_path(...)   # ctx는 이미 들어와 있음
+        yield self.out_ports[next_hop].put(...)
+```
+
+**언제 쓰나:**
+- 컴포넌트가 살아있는 동안 의존성이 바뀌지 않을 때
+- 의존성 없이는 컴포넌트가 동작하지 않을 때 (필수 의존성)
+
+---
+
+## 슬라이드 5 — Context Object 패턴
+
+> 의존성이 많아지면 묶어서 하나로
+
+```python
+# kernbench/components/context.py
+
+@dataclass
+class ComponentContext:
+    router: PathRouter              # 라우팅 정책
+    resolver: AddressResolver       # 주소 해석
+    positions: dict[str, ...]       # 물리적 위치 정보
+    ns_per_mm: float                # 전파 지연 상수
+    edge_map: dict[...]             # 엣지 정보
+    spec: dict                      # 토폴로지 스펙
+```
+
+**왜 Context로 묶나?**
+- 생성자 인자가 6개면 → 컴포넌트 추가할 때마다 시그니처 변경
+- Context 하나면 → 새 필드 추가해도 기존 컴포넌트 무영향
+- 컴포넌트는 **필요한 것만 꺼내 쓴다**
+
+```python
+class TwoDMeshNocComponent(ComponentBase):
+    def _route(self, env, txn):
+        src_pos = self.ctx.positions.get(prev_hop)   # 위치만 사용
+        ns_per_mm = self.ctx.ns_per_mm               # 상수만 사용
+        # router, resolver 등은 건드리지 않음
+```
+
+---
+
+## 슬라이드 6 — 패턴 2: Registry + Factory
+
+> 문자열 키 → 클래스 매핑으로 런타임 교체
+
+```python
+# kernbench/components/base.py
+
+class ComponentRegistry:
+    _registry: dict[str, type[ComponentBase]] = {}
+
+    @classmethod
+    def register(cls, impl: str, component_cls: type[ComponentBase]):
+        cls._registry[impl] = component_cls
+
+    @classmethod
+    def create(cls, node, overrides=None, ctx=None) -> ComponentBase:
+        if overrides and node.impl in overrides:
+            return overrides[node.impl](node, ctx)   # 1순위: 호출자 override
+        if node.impl in cls._registry:
+            return cls._registry[node.impl](node, ctx)  # 2순위: 등록된 구현
+        return DefaultComponent(node, ctx)           # 3순위: 기본값 fallback
+```
+
+**Resolution 우선순위:**
+```
+overrides[impl]        ← 테스트/실험용 주입
+  ↓ (없으면)
+_registry[impl]        ← 프로덕션 구현
+  ↓ (없으면)
+DefaultComponent       ← 안전한 fallback
+```
+
+---
+
+## 슬라이드 7 — Registry 등록 방식
+
+```python
+# kernbench/components/impls/__init__.py
+
+from kernbench.components.base import ComponentRegistry
+from kernbench.components.impls.noc import TwoDMeshNocComponent
+from kernbench.components.impls.io_cpu import IoCpuComponent
+# ...
+
+ComponentRegistry.register("noc_2d_mesh_v1", TwoDMeshNocComponent)
+ComponentRegistry.register("io_cpu_v1",       IoCpuComponent)
+ComponentRegistry.register("hbm_ctrl_v1",     HbmCtrlComponent)
+# ...
+```
+
+**topology.yaml (설정 파일)**
+```yaml
+nodes:
+  - id: sip0.cube0.noc
+    impl: noc_2d_mesh_v1    # ← 이 문자열이 Registry 키
+```
+
+**흐름:**
+```
+YAML → impl 문자열 → Registry.create() → 실제 컴포넌트 인스턴스
+```
+
+impl 문자열만 바꾸면 동작이 바뀐다. 코드 수정 없음.
+
+---
+
+## 슬라이드 8 — 패턴 3: Override Injection (테스트용)
+
+> 호출자가 특정 impl만 갈아끼운다
+
+```python
+# tests/test_component_registry.py
+
+class SpyXbar(ComponentBase):
+    calls = 0
+
+    def run(self, env, nbytes):
+        SpyXbar.calls += 1
+        yield env.timeout(0)
+
+
+# 테스트에서 xbar_v1만 SpyXbar로 교체
+engine = GraphEngine(
+    graph,
+    component_overrides={"xbar_v1": SpyXbar}   # ← 이것만 추가
+)
+
+result = engine.run(msg)
+assert SpyXbar.calls > 0    # Xbar가 실제로 호출됐는지 검증
+```
+
+**핵심:** 테스트 코드가 프로덕션 코드를 **수정하지 않는다**
+
+---
+
+## 슬라이드 9 — 조립자: GraphEngine
+
+> 컴포넌트를 생성하고 연결하는 유일한 곳
+
+```python
+# kernbench/sim_engine/engine.py
+
+class GraphEngine:
+    def __init__(self, graph, component_overrides=None):
+
+        # 1. 공유 의존성 생성
+        ctx = ComponentContext(
+            router=PathRouter(graph),
+            resolver=AddressResolver(graph),
+            positions={nid: n.pos_mm for nid, n in graph.nodes.items()},
+            ns_per_mm=...,
+        )
+
+        # 2. 컴포넌트 생성 (DI: ctx 주입)
+        self._components = {
+            node_id: ComponentRegistry.create(node, component_overrides, ctx)
+            for node_id, node in graph.nodes.items()
+        }
+
+        # 3. 포트 연결 (배선)
+        for e in graph.edges:
+            store = simpy.Store(self._env)
+            self._components[e.src].out_ports[e.dst] = store
+            self._components[e.dst].in_ports[e.src] = store
+```
+
+**생성 → 주입 → 연결** — 이 세 단계가 한 곳에서만 일어난다
+
+---
+
+## 슬라이드 10 — 전체 구조 한눈에 보기
+
+```
+topology.yaml
+    │ impl: "noc_2d_mesh_v1"
+    ▼
+GraphEngine.__init__()                     ← 조립자
+    │
+    ├── ComponentContext 생성               ← 공유 의존성 묶음
+    │     ├── PathRouter
+    │     ├── AddressResolver
+    │     └── positions, ns_per_mm, ...
+    │
+    ├── ComponentRegistry.create(node, overrides, ctx)
+    │     ├── overrides["noc_2d_mesh_v1"]? → SpyNoc (테스트)
+    │     ├── registry["noc_2d_mesh_v1"]?  → TwoDMeshNocComponent (프로덕션)
+    │     └── fallback                     → DefaultComponent
+    │
+    └── 포트 배선: out_ports / in_ports 연결
+
+Component (TwoDMeshNocComponent)
+    └── self.ctx.positions, self.ctx.ns_per_mm 사용
+        (라우터, 리졸버는 건드리지 않음 — 필요한 것만)
+```
+
+---
+
+## 슬라이드 11 — 무엇을 얻었나
+
+| 상황 | DI 없이 | DI 적용 시 |
+|------|---------|---------|
+| NOC 알고리즘 교체 | 소스 코드 수정 | YAML에서 impl 문자열 변경 |
+| Xbar 동작 검증 | 실제 HW 전부 구동 | `component_overrides={"xbar_v1": SpyXbar}` |
+| 새 컴포넌트 추가 | 기존 코드 수정 | `register("new_v1", NewComp)` |
+| 컨텍스트 필드 추가 | 모든 생성자 수정 | `ComponentContext`에 필드 추가 |
+| 테스트 격리 | 불가능 | 필요한 것만 override |
+
+---
+
+## 슬라이드 12 — 실무 적용 체크리스트
+
+**설계할 때 물어볼 것:**
+
+1. **이 클래스가 직접 `new`(생성)하는 것은 무엇인가?**
+   → 생성하는 것 = 교체할 수 없는 것. 생성자로 받을 수 없는지 검토.
+
+2. **의존성이 3개 이상이면?**
+   → Context Object로 묶어라.
+
+3. **테스트에서 이 클래스를 단독으로 실행할 수 있는가?**
+   → 없다면 DI가 필요하다는 신호.
+
+4. **설정(YAML/config)으로 동작을 바꾸고 싶은가?**
+   → Registry + 문자열 키 패턴.
+
+5. **누가 조립하는가?**
+   → 조립자는 하나여야 한다. 컴포넌트 안에 조립 로직이 있으면 안 된다.
+
+---
+
+## 슬라이드 13 — 안티패턴: 이것은 하지 말자
+
+```python
+# ❌ 서비스 로케이터 (컴포넌트 안에서 registry 호출)
+class BadComponent(ComponentBase):
+    def run(self, env, nbytes):
+        router = ComponentRegistry.get("router")  # 컴포넌트가 직접 찾는다
+        ...
+
+# ❌ 전역 싱글톤 직접 참조
+class BadComponent(ComponentBase):
+    def run(self, env, nbytes):
+        router = GlobalRouter.instance()          # 교체 불가
+        ...
+
+# ❌ 생성자 안에서 의존성 생성
+class BadComponent(ComponentBase):
+    def __init__(self, node):
+        self.router = PathRouter(node.graph)      # 테스트에서 격리 불가
+```
+
+**공통 문제:** 컴포넌트가 자기 의존성을 스스로 해결한다 → 결합도 증가
+
+---
+
+## 슬라이드 14 — 요약
+
+> **DI = 의존성의 생성과 사용을 분리하는 것**
+
+```
+생성  →  Registry / Assembler (GraphEngine)
+사용  →  Component (IoCpuComponent, TwoDMeshNocComponent, ...)
+```
+
+**kernbench에서 배운 패턴 3가지:**
+
+1. **Constructor Injection** — 필수 의존성은 생성자로
+2. **Context Object** — 의존성 묶음을 하나의 dataclass로
+3. **Registry + Override** — 문자열 키로 구현체 선택, 테스트에서 교체
+
+**결과:** 141개 테스트, YAML 한 줄로 컴포넌트 교체, 프로덕션 코드 수정 없이 Mock 주입
+
+---
+
+*참고 코드: kernbench/src/kernbench/components/*
diff --git a/docs/diagrams/README.md b/docs/diagrams/README.md
new file mode 100644
index 0000000..d151243
--- /dev/null
+++ b/docs/diagrams/README.md
@@ -0,0 +1,26 @@
+# Generated Diagrams
+
+This directory contains diagrams generated from topology compilation.
+
+## What these files are
+- Derived artifacts generated from:
+  - compiled topology graph
+  - distance (accumulated latency) metadata
+  - view/layout rules (ADR-0005)
+
+These files are meant for quick visual inspection and review.
+
+## Default outputs
+- SIP view: `sip_view.mmd` (and/or `sip_view.dot`)
+- CUBE view: `cube_view.mmd` (and/or `cube_view.dot`)
+- PE view: `pe_view.mmd` (and/or `pe_view.dot`)
+
+## How to preview
+- In VS Code:
+  - open `.mmd` or `.md` containing Mermaid blocks and use Markdown Preview
+  - for `.dot`, use a Graphviz preview extension or `dot -Tpng`
+
+## Notes
+- Diagrams are representative and distance-aware by default.
+- Instance indices are not required unless debugging asymmetry.
+- Outputs should be deterministic for the same topology and rules.
diff --git a/docs/diagrams/cube_view.svg b/docs/diagrams/cube_view.svg
new file mode 100644
index 0000000..ebf8c05
--- /dev/null
+++ b/docs/diagrams/cube_view.svg
@@ -0,0 +1,156 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="556" height="472" viewBox="0 0 556 472">
+  <title>cube</title>
+  <rect width="556" height="472" fill="#f8fafc"/>
+  <text x="278" y="18" text-anchor="middle" font-family="monospace" font-size="14" font-weight="bold" fill="#1e293b">CUBE VIEW</text>
+  <rect x="40.0" y="40.0" width="476.0" height="392.0" rx="6" fill="none" stroke="#475569" stroke-width="2" stroke-dasharray="8,4"/>
+  <rect x="152.0" y="166.0" width="252.0" height="140.0" rx="4" fill="#d1fae5" stroke="#10b981" stroke-width="1.5" stroke-dasharray="6,3" opacity="0.5"/>
+  <text x="278.0" y="278.0" text-anchor="middle" font-family="monospace" font-size="11" fill="#047857" opacity="0.7">HBM</text>
+  <polyline points="82.0,82.0 82.0,95.0 82.0,95.0 82.0,138.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
+  <text x="82.0" y="92.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
+  <polyline points="82.0,82.0 82.0,144.0 334.0,144.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
+  <polyline points="334.0,236.0 334.0,144.0 82.0,144.0 82.0,82.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
+  <polyline points="166.0,82.0 166.0,95.0 166.0,95.0 166.0,138.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
+  <text x="166.0" y="92.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
+  <polyline points="166.0,82.0 166.0,154.0 334.0,154.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
+  <polyline points="334.0,236.0 334.0,144.0 166.0,144.0 166.0,82.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
+  <polyline points="390.0,82.0 390.0,95.0 390.0,95.0 390.0,138.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
+  <text x="390.0" y="92.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
+  <polyline points="390.0,82.0 390.0,164.0 334.0,164.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
+  <polyline points="334.0,236.0 334.0,144.0 390.0,144.0 390.0,82.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
+  <polyline points="474.0,82.0 474.0,95.0 474.0,95.0 474.0,138.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
+  <text x="474.0" y="92.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
+  <polyline points="474.0,82.0 474.0,174.0 334.0,174.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
+  <polyline points="334.0,236.0 334.0,144.0 474.0,144.0 474.0,82.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
+  <polyline points="82.0,390.0 82.0,347.0 82.0,347.0 82.0,334.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
+  <text x="82.0" y="344.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
+  <polyline points="82.0,390.0 82.0,338.0 334.0,338.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
+  <polyline points="334.0,236.0 334.0,298.0 82.0,298.0 82.0,390.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
+  <polyline points="166.0,390.0 166.0,347.0 166.0,347.0 166.0,334.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
+  <text x="166.0" y="344.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
+  <polyline points="166.0,390.0 166.0,348.0 334.0,348.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
+  <polyline points="334.0,236.0 334.0,298.0 166.0,298.0 166.0,390.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
+  <polyline points="390.0,390.0 390.0,347.0 390.0,347.0 390.0,334.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
+  <text x="390.0" y="344.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
+  <polyline points="390.0,390.0 390.0,358.0 334.0,358.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
+  <polyline points="334.0,236.0 334.0,298.0 390.0,298.0 390.0,390.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
+  <polyline points="474.0,390.0 474.0,347.0 474.0,347.0 474.0,334.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
+  <text x="474.0" y="344.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
+  <polyline points="474.0,390.0 474.0,368.0 334.0,368.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
+  <polyline points="334.0,236.0 334.0,298.0 474.0,298.0 474.0,390.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
+  <polyline points="82.0,138.0 222.0,138.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
+  <text x="152.0" y="183.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
+  <polyline points="166.0,138.0 222.0,138.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
+  <text x="194.0" y="183.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
+  <polyline points="390.0,138.0 222.0,138.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
+  <text x="306.0" y="183.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
+  <polyline points="474.0,138.0 222.0,138.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
+  <text x="348.0" y="183.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
+  <polyline points="82.0,334.0 222.0,334.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
+  <text x="152.0" y="281.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
+  <polyline points="166.0,334.0 222.0,334.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
+  <text x="194.0" y="281.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
+  <polyline points="390.0,334.0 222.0,334.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
+  <text x="306.0" y="281.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
+  <polyline points="474.0,334.0 222.0,334.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
+  <text x="348.0" y="281.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
+  <line x1="82.0" y1="138.0" x2="166.0" y2="138.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
+  <text x="124.0" y="134.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
+  <line x1="166.0" y1="138.0" x2="82.0" y2="138.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
+  <text x="124.0" y="134.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
+  <line x1="166.0" y1="138.0" x2="390.0" y2="138.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
+  <text x="278.0" y="134.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">10.0mm 128GB/s</text>
+  <line x1="390.0" y1="138.0" x2="166.0" y2="138.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
+  <text x="278.0" y="134.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">10.0mm 128GB/s</text>
+  <line x1="390.0" y1="138.0" x2="474.0" y2="138.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
+  <text x="432.0" y="134.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
+  <line x1="474.0" y1="138.0" x2="390.0" y2="138.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
+  <text x="432.0" y="134.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
+  <line x1="82.0" y1="334.0" x2="166.0" y2="334.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
+  <text x="124.0" y="330.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
+  <line x1="166.0" y1="334.0" x2="82.0" y2="334.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
+  <text x="124.0" y="330.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
+  <line x1="166.0" y1="334.0" x2="390.0" y2="334.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
+  <text x="278.0" y="330.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">10.0mm 128GB/s</text>
+  <line x1="390.0" y1="334.0" x2="166.0" y2="334.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
+  <text x="278.0" y="330.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">10.0mm 128GB/s</text>
+  <line x1="390.0" y1="334.0" x2="474.0" y2="334.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
+  <text x="432.0" y="330.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
+  <line x1="474.0" y1="334.0" x2="390.0" y2="334.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
+  <text x="432.0" y="330.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
+  <polyline points="82.0,138.0 110.0,138.0 110.0,292.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
+  <text x="96.0" y="211.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
+  <polyline points="110.0,292.0 82.0,292.0 82.0,138.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
+  <text x="96.0" y="211.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
+  <polyline points="82.0,334.0 110.0,334.0 110.0,292.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
+  <text x="96.0" y="309.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
+  <polyline points="110.0,292.0 82.0,292.0 82.0,334.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
+  <text x="96.0" y="309.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
+  <polyline points="474.0,138.0 446.0,138.0 446.0,292.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
+  <text x="460.0" y="211.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
+  <polyline points="446.0,292.0 474.0,292.0 474.0,138.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
+  <text x="460.0" y="211.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
+  <polyline points="474.0,334.0 446.0,334.0 446.0,292.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
+  <text x="460.0" y="309.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
+  <polyline points="446.0,292.0 474.0,292.0 474.0,334.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
+  <text x="460.0" y="309.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
+  <polyline points="334.0,236.0 334.0,131.4 278.0,131.4 278.0,56.8" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.6"/>
+  <polyline points="334.0,236.0 334.0,310.6 278.0,310.6 278.0,415.2" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.6"/>
+  <polyline points="334.0,236.0 334.0,221.0 488.0,221.0 488.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.6"/>
+  <polyline points="334.0,236.0 334.0,221.0 68.0,221.0 68.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.6"/>
+  <polyline points="446.0,194.0 446.0,200.0 334.0,200.0 334.0,236.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
+  <polyline points="334.0,236.0 334.0,200.0 446.0,200.0 446.0,194.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
+  <polyline points="334.0,236.0 110.0,236.0 110.0,194.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.8"/>
+  <polyline points="110.0,194.0 334.0,194.0 334.0,236.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.8"/>
+  <rect x="250.0" y="40.0" width="56.0" height="33.6" rx="4" fill="#3b82f6" stroke="#475569" stroke-width="1"/>
+  <text x="278.0" y="60.8" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">UCIe-N</text>
+  <rect x="250.0" y="398.4" width="56.0" height="33.6" rx="4" fill="#3b82f6" stroke="#475569" stroke-width="1"/>
+  <text x="278.0" y="419.2" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">UCIe-S</text>
+  <rect x="460.0" y="219.2" width="56.0" height="33.6" rx="4" fill="#3b82f6" stroke="#475569" stroke-width="1"/>
+  <text x="488.0" y="240.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">UCIe-E</text>
+  <rect x="40.0" y="219.2" width="56.0" height="33.6" rx="4" fill="#3b82f6" stroke="#475569" stroke-width="1"/>
+  <text x="68.0" y="240.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">UCIe-W</text>
+  <rect x="306.0" y="219.2" width="56.0" height="33.6" rx="4" fill="#a78bfa" stroke="#475569" stroke-width="1"/>
+  <text x="334.0" y="240.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">NOC</text>
+  <rect x="418.0" y="177.2" width="56.0" height="33.6" rx="4" fill="#f59e0b" stroke="#475569" stroke-width="1"/>
+  <text x="446.0" y="198.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">M CPU</text>
+  <rect x="194.0" y="219.2" width="56.0" height="33.6" rx="4" fill="#10b981" stroke="#475569" stroke-width="1"/>
+  <text x="222.0" y="240.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#ffffff">HBM CTRL</text>
+  <rect x="82.0" y="177.2" width="56.0" height="33.6" rx="4" fill="#f59e0b" stroke="#475569" stroke-width="1"/>
+  <text x="110.0" y="198.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">SRAM</text>
+  <rect x="82.0" y="275.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
+  <text x="110.0" y="296.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">Bridge LEFT</text>
+  <rect x="418.0" y="275.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
+  <text x="446.0" y="296.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">Bridge RIGHT</text>
+  <rect x="56.8" y="68.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
+  <text x="82.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE0</text>
+  <rect x="54.0" y="121.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
+  <text x="82.0" y="142.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE0</text>
+  <rect x="140.8" y="68.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
+  <text x="166.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE1</text>
+  <rect x="138.0" y="121.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
+  <text x="166.0" y="142.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE1</text>
+  <rect x="364.8" y="68.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
+  <text x="390.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE2</text>
+  <rect x="362.0" y="121.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
+  <text x="390.0" y="142.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE2</text>
+  <rect x="448.8" y="68.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
+  <text x="474.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE3</text>
+  <rect x="446.0" y="121.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
+  <text x="474.0" y="142.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE3</text>
+  <rect x="56.8" y="376.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
+  <text x="82.0" y="394.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE4</text>
+  <rect x="54.0" y="317.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
+  <text x="82.0" y="338.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE4</text>
+  <rect x="140.8" y="376.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
+  <text x="166.0" y="394.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE5</text>
+  <rect x="138.0" y="317.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
+  <text x="166.0" y="338.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE5</text>
+  <rect x="364.8" y="376.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
+  <text x="390.0" y="394.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE6</text>
+  <rect x="362.0" y="317.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
+  <text x="390.0" y="338.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE6</text>
+  <rect x="448.8" y="376.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
+  <text x="474.0" y="394.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE7</text>
+  <rect x="446.0" y="317.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
+  <text x="474.0" y="338.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE7</text>
+</svg>
\ No newline at end of file
diff --git a/docs/diagrams/pe_view.svg b/docs/diagrams/pe_view.svg
new file mode 100644
index 0000000..6142e2f
--- /dev/null
+++ b/docs/diagrams/pe_view.svg
@@ -0,0 +1,31 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="500" height="360" viewBox="0 0 500 360">
+  <title>pe</title>
+  <rect width="500" height="360" fill="#f8fafc"/>
+  <text x="250" y="18" text-anchor="middle" font-family="monospace" font-size="14" font-weight="bold" fill="#1e293b">PE VIEW</text>
+  <line x1="92.5" y1="180.0" x2="180.0" y2="180.0" stroke="#94a3b8" stroke-width="1.5" opacity="0.8"/>
+  <text x="136.2" y="176.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">0.5mm</text>
+  <polyline points="180.0,180.0 180.0,92.5 285.0,92.5" fill="none" stroke="#94a3b8" stroke-width="1.5" opacity="0.8"/>
+  <text x="232.5" y="132.2" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">0.5mm</text>
+  <line x1="180.0" y1="180.0" x2="285.0" y2="180.0" stroke="#94a3b8" stroke-width="1.5" opacity="0.8"/>
+  <text x="232.5" y="176.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">0.5mm</text>
+  <polyline points="180.0,180.0 180.0,267.5 285.0,267.5" fill="none" stroke="#94a3b8" stroke-width="1.5" opacity="0.8"/>
+  <text x="232.5" y="219.8" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">0.5mm</text>
+  <polyline points="285.0,92.5 390.0,92.5 390.0,180.0" fill="none" stroke="#94a3b8" stroke-width="1.5" opacity="0.8"/>
+  <text x="337.5" y="132.2" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">0.5mm 512GB/s</text>
+  <line x1="285.0" y1="180.0" x2="390.0" y2="180.0" stroke="#94a3b8" stroke-width="1.5" opacity="0.8"/>
+  <text x="337.5" y="176.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">0.5mm 512GB/s</text>
+  <polyline points="285.0,267.5 390.0,267.5 390.0,180.0" fill="none" stroke="#94a3b8" stroke-width="1.5" opacity="0.8"/>
+  <text x="337.5" y="219.8" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">0.5mm 512GB/s</text>
+  <rect x="48.8" y="155.5" width="87.5" height="49.0" rx="4" fill="#ef4444" stroke="#475569" stroke-width="1"/>
+  <text x="92.5" y="184.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">PE CPU</text>
+  <rect x="136.2" y="155.5" width="87.5" height="49.0" rx="4" fill="#f59e0b" stroke="#475569" stroke-width="1"/>
+  <text x="180.0" y="184.0" text-anchor="middle" font-family="monospace" font-size="9" fill="#1e293b">PE SCHEDULER</text>
+  <rect x="241.2" y="68.0" width="87.5" height="49.0" rx="4" fill="#3b82f6" stroke="#475569" stroke-width="1"/>
+  <text x="285.0" y="96.5" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">PE DMA</text>
+  <rect x="241.2" y="155.5" width="87.5" height="49.0" rx="4" fill="#8b5cf6" stroke="#475569" stroke-width="1"/>
+  <text x="285.0" y="184.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">PE GEMM</text>
+  <rect x="241.2" y="243.0" width="87.5" height="49.0" rx="4" fill="#ec4899" stroke="#475569" stroke-width="1"/>
+  <text x="285.0" y="271.5" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">PE MATH</text>
+  <rect x="346.2" y="155.5" width="87.5" height="49.0" rx="4" fill="#10b981" stroke="#475569" stroke-width="1"/>
+  <text x="390.0" y="184.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">PE TCM</text>
+</svg>
\ No newline at end of file
diff --git a/docs/diagrams/placement_column_wise.svg b/docs/diagrams/placement_column_wise.svg
new file mode 100644
index 0000000..ffae2ae
--- /dev/null
+++ b/docs/diagrams/placement_column_wise.svg
@@ -0,0 +1,72 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="820" height="500" viewBox="0 0 820 500" font-family="monospace">
+<rect width="820" height="500" fill="#f8fafc" rx="6"/>
+<text x="410" y="32" text-anchor="middle" font-size="16" font-weight="bold" fill="#1e293b">Placement: column_wise</text>
+<text x="410.0" y="54.0" text-anchor="middle" font-size="12" fill="#475569" font-weight="normal">Tensor (1024×512) fp16  →  K axis split into 8 parts</text>
+<text x="320.0" y="82.0" text-anchor="middle" font-size="11" fill="#475569" font-weight="normal">← K=512 →</text>
+<text x="68.0" y="250.0" text-anchor="middle" font-size="11" fill="#475569" transform="rotate(-90 68.0 250.0)">↑ M=1024 ↓</text>
+<rect x="80.0" y="90.0" width="60.0" height="320.0" fill="#3b82f6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="110.0" y="246.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE0</text>
+<text x="110.0" y="262.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(1024×64)</text>
+<rect x="140.0" y="90.0" width="60.0" height="320.0" fill="#10b981" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="170.0" y="246.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE1</text>
+<text x="170.0" y="262.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(1024×64)</text>
+<rect x="200.0" y="90.0" width="60.0" height="320.0" fill="#f59e0b" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="230.0" y="246.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE2</text>
+<text x="230.0" y="262.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">(1024×64)</text>
+<rect x="260.0" y="90.0" width="60.0" height="320.0" fill="#ef4444" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="290.0" y="246.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE3</text>
+<text x="290.0" y="262.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(1024×64)</text>
+<rect x="320.0" y="90.0" width="60.0" height="320.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="350.0" y="246.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE4</text>
+<text x="350.0" y="262.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(1024×64)</text>
+<rect x="380.0" y="90.0" width="60.0" height="320.0" fill="#ec4899" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="410.0" y="246.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE5</text>
+<text x="410.0" y="262.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(1024×64)</text>
+<rect x="440.0" y="90.0" width="60.0" height="320.0" fill="#06b6d4" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="470.0" y="246.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE6</text>
+<text x="470.0" y="262.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">(1024×64)</text>
+<rect x="500.0" y="90.0" width="60.0" height="320.0" fill="#f97316" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="530.0" y="246.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE7</text>
+<text x="530.0" y="262.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(1024×64)</text>
+<rect x="80.0" y="90.0" width="480.0" height="320.0" fill="none" stroke="#1e293b" stroke-width="2" fill-opacity="1.0" rx="2"/>
+<text x="110.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=0 B</text>
+<text x="110.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
+<text x="170.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=128 KB</text>
+<text x="170.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
+<text x="230.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=256 KB</text>
+<text x="230.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
+<text x="290.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=384 KB</text>
+<text x="290.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
+<text x="350.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=512 KB</text>
+<text x="350.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
+<text x="410.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=640 KB</text>
+<text x="410.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
+<text x="470.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=768 KB</text>
+<text x="470.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
+<text x="530.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=896 KB</text>
+<text x="530.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
+<text x="670.0" y="100.0" text-anchor="middle" font-size="12" fill="#1e293b" font-weight="bold">PE Legend</text>
+<rect x="620.0" y="106.0" width="16.0" height="16.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="642.0" y="118.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE0</text>
+<rect x="620.0" y="128.0" width="16.0" height="16.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="642.0" y="140.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE1</text>
+<rect x="620.0" y="150.0" width="16.0" height="16.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="642.0" y="162.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE2</text>
+<rect x="620.0" y="172.0" width="16.0" height="16.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="642.0" y="184.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE3</text>
+<rect x="620.0" y="194.0" width="16.0" height="16.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="642.0" y="206.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE4</text>
+<rect x="620.0" y="216.0" width="16.0" height="16.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="642.0" y="228.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE5</text>
+<rect x="620.0" y="238.0" width="16.0" height="16.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="642.0" y="250.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE6</text>
+<rect x="620.0" y="260.0" width="16.0" height="16.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="642.0" y="272.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE7</text>
+<rect x="620.0" y="320.0" width="167.0" height="120.0" fill="#e2e8f0" stroke="#94a3b8" stroke-width="1" fill-opacity="1.0" rx="2"/>
+<text x="630.0" y="338.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Strategy: column_wise</text>
+<text x="630.0" y="356.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Split axis: K</text>
+<text x="630.0" y="374.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Shards: 8</text>
+<text x="630.0" y="392.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Each: (1024, 64)</text>
+<text x="630.0" y="410.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Each: 128 KB</text>
+<text x="630.0" y="428.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Total: 1 MB</text>
+</svg>
diff --git a/docs/diagrams/placement_replicate.svg b/docs/diagrams/placement_replicate.svg
new file mode 100644
index 0000000..09115f5
--- /dev/null
+++ b/docs/diagrams/placement_replicate.svg
@@ -0,0 +1,47 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="820" height="500" viewBox="0 0 820 500" font-family="monospace">
+<rect width="820" height="500" fill="#f8fafc" rx="6"/>
+<text x="410" y="32" text-anchor="middle" font-size="16" font-weight="bold" fill="#1e293b">Placement: replicate</text>
+<text x="410.0" y="54.0" text-anchor="middle" font-size="12" fill="#475569" font-weight="normal">Tensor (1024×512) fp16  →  full copy to each PE</text>
+<rect x="60.0" y="90.0" width="163.0" height="162.0" fill="#3b82f6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="141.5" y="157.0" text-anchor="middle" font-size="14" fill="#fff" font-weight="bold">PE0</text>
+<text x="141.5" y="177.0" text-anchor="middle" font-size="11" fill="#fff" font-weight="normal">(1024×512)</text>
+<text x="141.5" y="193.0" text-anchor="middle" font-size="10" fill="#fff" font-weight="normal">1 MB</text>
+<text x="141.5" y="207.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">offset=0</text>
+<rect x="239.0" y="90.0" width="163.0" height="162.0" fill="#10b981" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="320.5" y="157.0" text-anchor="middle" font-size="14" fill="#fff" font-weight="bold">PE1</text>
+<text x="320.5" y="177.0" text-anchor="middle" font-size="11" fill="#fff" font-weight="normal">(1024×512)</text>
+<text x="320.5" y="193.0" text-anchor="middle" font-size="10" fill="#fff" font-weight="normal">1 MB</text>
+<text x="320.5" y="207.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">offset=0</text>
+<rect x="418.0" y="90.0" width="163.0" height="162.0" fill="#f59e0b" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="499.5" y="157.0" text-anchor="middle" font-size="14" fill="#000" font-weight="bold">PE2</text>
+<text x="499.5" y="177.0" text-anchor="middle" font-size="11" fill="#000" font-weight="normal">(1024×512)</text>
+<text x="499.5" y="193.0" text-anchor="middle" font-size="10" fill="#000" font-weight="normal">1 MB</text>
+<text x="499.5" y="207.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">offset=0</text>
+<rect x="597.0" y="90.0" width="163.0" height="162.0" fill="#ef4444" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="678.5" y="157.0" text-anchor="middle" font-size="14" fill="#fff" font-weight="bold">PE3</text>
+<text x="678.5" y="177.0" text-anchor="middle" font-size="11" fill="#fff" font-weight="normal">(1024×512)</text>
+<text x="678.5" y="193.0" text-anchor="middle" font-size="10" fill="#fff" font-weight="normal">1 MB</text>
+<text x="678.5" y="207.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">offset=0</text>
+<rect x="60.0" y="268.0" width="163.0" height="162.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="141.5" y="335.0" text-anchor="middle" font-size="14" fill="#fff" font-weight="bold">PE4</text>
+<text x="141.5" y="355.0" text-anchor="middle" font-size="11" fill="#fff" font-weight="normal">(1024×512)</text>
+<text x="141.5" y="371.0" text-anchor="middle" font-size="10" fill="#fff" font-weight="normal">1 MB</text>
+<text x="141.5" y="385.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">offset=0</text>
+<rect x="239.0" y="268.0" width="163.0" height="162.0" fill="#ec4899" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="320.5" y="335.0" text-anchor="middle" font-size="14" fill="#fff" font-weight="bold">PE5</text>
+<text x="320.5" y="355.0" text-anchor="middle" font-size="11" fill="#fff" font-weight="normal">(1024×512)</text>
+<text x="320.5" y="371.0" text-anchor="middle" font-size="10" fill="#fff" font-weight="normal">1 MB</text>
+<text x="320.5" y="385.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">offset=0</text>
+<rect x="418.0" y="268.0" width="163.0" height="162.0" fill="#06b6d4" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="499.5" y="335.0" text-anchor="middle" font-size="14" fill="#000" font-weight="bold">PE6</text>
+<text x="499.5" y="355.0" text-anchor="middle" font-size="11" fill="#000" font-weight="normal">(1024×512)</text>
+<text x="499.5" y="371.0" text-anchor="middle" font-size="10" fill="#000" font-weight="normal">1 MB</text>
+<text x="499.5" y="385.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">offset=0</text>
+<rect x="597.0" y="268.0" width="163.0" height="162.0" fill="#f97316" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="678.5" y="335.0" text-anchor="middle" font-size="14" fill="#fff" font-weight="bold">PE7</text>
+<text x="678.5" y="355.0" text-anchor="middle" font-size="11" fill="#fff" font-weight="normal">(1024×512)</text>
+<text x="678.5" y="371.0" text-anchor="middle" font-size="10" fill="#fff" font-weight="normal">1 MB</text>
+<text x="678.5" y="385.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">offset=0</text>
+<rect x="60.0" y="450.0" width="496.0" height="30.0" fill="#e2e8f0" stroke="#94a3b8" stroke-width="1" fill-opacity="1.0" rx="2"/>
+<text x="70.0" y="468.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Strategy: replicate  |  Shards: 8  |  Each: 1 MB  |  Total mem: 8 MB</text>
+</svg>
diff --git a/docs/diagrams/placement_row_wise.svg b/docs/diagrams/placement_row_wise.svg
new file mode 100644
index 0000000..2750eb8
--- /dev/null
+++ b/docs/diagrams/placement_row_wise.svg
@@ -0,0 +1,72 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="820" height="560" viewBox="0 0 820 560" font-family="monospace">
+<rect width="820" height="560" fill="#f8fafc" rx="6"/>
+<text x="410" y="32" text-anchor="middle" font-size="16" font-weight="bold" fill="#1e293b">Placement: row_wise</text>
+<text x="410.0" y="54.0" text-anchor="middle" font-size="12" fill="#475569" font-weight="normal">Tensor (1024×512) fp16  →  M axis split into 8 parts</text>
+<text x="240.0" y="82.0" text-anchor="middle" font-size="11" fill="#475569" font-weight="normal">← K=512 →</text>
+<text x="68.0" y="290.0" text-anchor="middle" font-size="11" fill="#475569" transform="rotate(-90 68.0 290.0)">↑ M=1024 ↓</text>
+<rect x="80.0" y="90.0" width="320.0" height="50.0" fill="#3b82f6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="240.0" y="111.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE0</text>
+<text x="240.0" y="127.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(128×512)</text>
+<rect x="80.0" y="140.0" width="320.0" height="50.0" fill="#10b981" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="240.0" y="161.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE1</text>
+<text x="240.0" y="177.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(128×512)</text>
+<rect x="80.0" y="190.0" width="320.0" height="50.0" fill="#f59e0b" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="240.0" y="211.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE2</text>
+<text x="240.0" y="227.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">(128×512)</text>
+<rect x="80.0" y="240.0" width="320.0" height="50.0" fill="#ef4444" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="240.0" y="261.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE3</text>
+<text x="240.0" y="277.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(128×512)</text>
+<rect x="80.0" y="290.0" width="320.0" height="50.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="240.0" y="311.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE4</text>
+<text x="240.0" y="327.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(128×512)</text>
+<rect x="80.0" y="340.0" width="320.0" height="50.0" fill="#ec4899" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="240.0" y="361.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE5</text>
+<text x="240.0" y="377.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(128×512)</text>
+<rect x="80.0" y="390.0" width="320.0" height="50.0" fill="#06b6d4" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="240.0" y="411.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE6</text>
+<text x="240.0" y="427.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">(128×512)</text>
+<rect x="80.0" y="440.0" width="320.0" height="50.0" fill="#f97316" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="240.0" y="461.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE7</text>
+<text x="240.0" y="477.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(128×512)</text>
+<rect x="80.0" y="90.0" width="320.0" height="400.0" fill="none" stroke="#1e293b" stroke-width="2" fill-opacity="1.0" rx="2"/>
+<text x="410.0" y="111.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=0 B</text>
+<text x="410.0" y="125.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
+<text x="410.0" y="161.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=128 KB</text>
+<text x="410.0" y="175.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
+<text x="410.0" y="211.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=256 KB</text>
+<text x="410.0" y="225.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
+<text x="410.0" y="261.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=384 KB</text>
+<text x="410.0" y="275.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
+<text x="410.0" y="311.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=512 KB</text>
+<text x="410.0" y="325.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
+<text x="410.0" y="361.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=640 KB</text>
+<text x="410.0" y="375.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
+<text x="410.0" y="411.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=768 KB</text>
+<text x="410.0" y="425.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
+<text x="410.0" y="461.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=896 KB</text>
+<text x="410.0" y="475.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
+<text x="630.0" y="100.0" text-anchor="middle" font-size="12" fill="#1e293b" font-weight="bold">PE Legend</text>
+<rect x="580.0" y="106.0" width="16.0" height="16.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="602.0" y="118.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE0</text>
+<rect x="580.0" y="128.0" width="16.0" height="16.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="602.0" y="140.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE1</text>
+<rect x="580.0" y="150.0" width="16.0" height="16.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="602.0" y="162.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE2</text>
+<rect x="580.0" y="172.0" width="16.0" height="16.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="602.0" y="184.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE3</text>
+<rect x="580.0" y="194.0" width="16.0" height="16.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="602.0" y="206.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE4</text>
+<rect x="580.0" y="216.0" width="16.0" height="16.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="602.0" y="228.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE5</text>
+<rect x="580.0" y="238.0" width="16.0" height="16.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="602.0" y="250.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE6</text>
+<rect x="580.0" y="260.0" width="16.0" height="16.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="602.0" y="272.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE7</text>
+<rect x="580.0" y="320.0" width="146.0" height="120.0" fill="#e2e8f0" stroke="#94a3b8" stroke-width="1" fill-opacity="1.0" rx="2"/>
+<text x="590.0" y="338.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Strategy: row_wise</text>
+<text x="590.0" y="356.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Split axis: M</text>
+<text x="590.0" y="374.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Shards: 8</text>
+<text x="590.0" y="392.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Each: (128, 512)</text>
+<text x="590.0" y="410.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Each: 128 KB</text>
+<text x="590.0" y="428.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Total: 1 MB</text>
+</svg>
diff --git a/docs/diagrams/placement_tiled_column_major.svg b/docs/diagrams/placement_tiled_column_major.svg
new file mode 100644
index 0000000..f94b6b9
--- /dev/null
+++ b/docs/diagrams/placement_tiled_column_major.svg
@@ -0,0 +1,116 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="820" height="620" viewBox="0 0 820 620" font-family="monospace">
+<rect width="820" height="620" fill="#f8fafc" rx="6"/>
+<text x="410" y="32" text-anchor="middle" font-size="16" font-weight="bold" fill="#1e293b">Placement: tiled_column_major</text>
+<text x="410.0" y="54.0" text-anchor="middle" font-size="11" fill="#475569" font-weight="normal">Tensor (1024×512) fp16, tile=(256×128)  →  4×4=16 tiles, column-major (K first)</text>
+<text x="280.0" y="82.0" text-anchor="middle" font-size="11" fill="#475569" font-weight="normal">← K=512 →</text>
+<text x="68.0" y="290.0" text-anchor="middle" font-size="11" fill="#475569" transform="rotate(-90 68.0 290.0)">↑ M=1024 ↓</text>
+<rect x="80.0" y="90.0" width="100.0" height="100.0" fill="#3b82f6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="130.0" y="136.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE0</text>
+<text x="130.0" y="152.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t0</text>
+<rect x="180.0" y="90.0" width="100.0" height="100.0" fill="#10b981" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="230.0" y="136.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE1</text>
+<text x="230.0" y="152.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t1</text>
+<rect x="280.0" y="90.0" width="100.0" height="100.0" fill="#f59e0b" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="330.0" y="136.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE2</text>
+<text x="330.0" y="152.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t2</text>
+<rect x="380.0" y="90.0" width="100.0" height="100.0" fill="#ef4444" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="430.0" y="136.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE3</text>
+<text x="430.0" y="152.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t3</text>
+<rect x="80.0" y="190.0" width="100.0" height="100.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="130.0" y="236.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE4</text>
+<text x="130.0" y="252.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t4</text>
+<rect x="180.0" y="190.0" width="100.0" height="100.0" fill="#ec4899" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="230.0" y="236.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE5</text>
+<text x="230.0" y="252.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t5</text>
+<rect x="280.0" y="190.0" width="100.0" height="100.0" fill="#06b6d4" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="330.0" y="236.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE6</text>
+<text x="330.0" y="252.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t6</text>
+<rect x="380.0" y="190.0" width="100.0" height="100.0" fill="#f97316" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="430.0" y="236.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE7</text>
+<text x="430.0" y="252.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t7</text>
+<rect x="80.0" y="290.0" width="100.0" height="100.0" fill="#3b82f6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="130.0" y="336.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE0</text>
+<text x="130.0" y="352.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t8</text>
+<rect x="180.0" y="290.0" width="100.0" height="100.0" fill="#10b981" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="230.0" y="336.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE1</text>
+<text x="230.0" y="352.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t9</text>
+<rect x="280.0" y="290.0" width="100.0" height="100.0" fill="#f59e0b" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="330.0" y="336.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE2</text>
+<text x="330.0" y="352.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t10</text>
+<rect x="380.0" y="290.0" width="100.0" height="100.0" fill="#ef4444" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="430.0" y="336.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE3</text>
+<text x="430.0" y="352.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t11</text>
+<rect x="80.0" y="390.0" width="100.0" height="100.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="130.0" y="436.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE4</text>
+<text x="130.0" y="452.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t12</text>
+<rect x="180.0" y="390.0" width="100.0" height="100.0" fill="#ec4899" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="230.0" y="436.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE5</text>
+<text x="230.0" y="452.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t13</text>
+<rect x="280.0" y="390.0" width="100.0" height="100.0" fill="#06b6d4" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="330.0" y="436.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE6</text>
+<text x="330.0" y="452.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t14</text>
+<rect x="380.0" y="390.0" width="100.0" height="100.0" fill="#f97316" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="430.0" y="436.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE7</text>
+<text x="430.0" y="452.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t15</text>
+<rect x="80.0" y="90.0" width="400.0" height="400.0" fill="none" stroke="#1e293b" stroke-width="2" fill-opacity="1.0" rx="2"/>
+<text x="130.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=0..127</text>
+<text x="230.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=128..255</text>
+<text x="330.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=256..383</text>
+<text x="430.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=384..511</text>
+<text x="64.0" y="140.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=0..255</text>
+<text x="64.0" y="240.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=256..511</text>
+<text x="64.0" y="340.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=512..767</text>
+<text x="64.0" y="440.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=768..1023</text>
+<text x="590.0" y="90.0" text-anchor="middle" font-size="12" fill="#1e293b" font-weight="bold">PE Legend</text>
+<rect x="540.0" y="96.0" width="16.0" height="16.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="562.0" y="108.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE0</text>
+<rect x="540.0" y="118.0" width="16.0" height="16.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="562.0" y="130.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE1</text>
+<rect x="540.0" y="140.0" width="16.0" height="16.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="562.0" y="152.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE2</text>
+<rect x="540.0" y="162.0" width="16.0" height="16.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="562.0" y="174.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE3</text>
+<rect x="540.0" y="184.0" width="16.0" height="16.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="562.0" y="196.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE4</text>
+<rect x="540.0" y="206.0" width="16.0" height="16.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="562.0" y="218.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE5</text>
+<rect x="540.0" y="228.0" width="16.0" height="16.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="562.0" y="240.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE6</text>
+<rect x="540.0" y="250.0" width="16.0" height="16.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="562.0" y="262.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE7</text>
+<text x="540.0" y="310.0" text-anchor="middle" font-size="12" fill="#1e293b" font-weight="bold">Tile Assignment Order</text>
+<rect x="540.0" y="318.0" width="12.0" height="12.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="328.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 0 → PE0  (0,0)  off=0 B</text>
+<rect x="540.0" y="334.0" width="12.0" height="12.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="344.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 1 → PE1  (0,1)  off=256 B</text>
+<rect x="540.0" y="350.0" width="12.0" height="12.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="360.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 2 → PE2  (0,2)  off=512 B</text>
+<rect x="540.0" y="366.0" width="12.0" height="12.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="376.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 3 → PE3  (0,3)  off=768 B</text>
+<rect x="540.0" y="382.0" width="12.0" height="12.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="392.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 4 → PE4  (1,0)  off=256 KB</text>
+<rect x="540.0" y="398.0" width="12.0" height="12.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="408.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 5 → PE5  (1,1)  off=256 KB</text>
+<rect x="540.0" y="414.0" width="12.0" height="12.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="424.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 6 → PE6  (1,2)  off=256 KB</text>
+<rect x="540.0" y="430.0" width="12.0" height="12.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="440.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 7 → PE7  (1,3)  off=256 KB</text>
+<rect x="540.0" y="446.0" width="12.0" height="12.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="456.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 8 → PE0  (2,0)  off=512 KB</text>
+<rect x="540.0" y="462.0" width="12.0" height="12.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="472.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 9 → PE1  (2,1)  off=512 KB</text>
+<rect x="540.0" y="478.0" width="12.0" height="12.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="488.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t10 → PE2  (2,2)  off=512 KB</text>
+<rect x="540.0" y="494.0" width="12.0" height="12.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="504.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t11 → PE3  (2,3)  off=512 KB</text>
+<rect x="540.0" y="510.0" width="12.0" height="12.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="520.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t12 → PE4  (3,0)  off=768 KB</text>
+<rect x="540.0" y="526.0" width="12.0" height="12.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="536.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t13 → PE5  (3,1)  off=768 KB</text>
+<rect x="540.0" y="542.0" width="12.0" height="12.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="552.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t14 → PE6  (3,2)  off=768 KB</text>
+<rect x="540.0" y="558.0" width="12.0" height="12.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="568.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t15 → PE7  (3,3)  off=768 KB</text>
+<rect x="80.0" y="560.0" width="608.0" height="30.0" fill="#e2e8f0" stroke="#94a3b8" stroke-width="1" fill-opacity="1.0" rx="2"/>
+<text x="90.0" y="578.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Strategy: tiled_column_major  |  Tile: (256×128)=64 KB  |  Tiles: 16  |  Total: 1 MB</text>
+</svg>
diff --git a/docs/diagrams/placement_tiled_row_major.svg b/docs/diagrams/placement_tiled_row_major.svg
new file mode 100644
index 0000000..346df72
--- /dev/null
+++ b/docs/diagrams/placement_tiled_row_major.svg
@@ -0,0 +1,116 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="820" height="620" viewBox="0 0 820 620" font-family="monospace">
+<rect width="820" height="620" fill="#f8fafc" rx="6"/>
+<text x="410" y="32" text-anchor="middle" font-size="16" font-weight="bold" fill="#1e293b">Placement: tiled_row_major</text>
+<text x="410.0" y="54.0" text-anchor="middle" font-size="11" fill="#475569" font-weight="normal">Tensor (1024×512) fp16, tile=(256×128)  →  4×4=16 tiles, row-major (M first)</text>
+<text x="280.0" y="82.0" text-anchor="middle" font-size="11" fill="#475569" font-weight="normal">← K=512 →</text>
+<text x="68.0" y="290.0" text-anchor="middle" font-size="11" fill="#475569" transform="rotate(-90 68.0 290.0)">↑ M=1024 ↓</text>
+<rect x="80.0" y="90.0" width="100.0" height="100.0" fill="#3b82f6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="130.0" y="136.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE0</text>
+<text x="130.0" y="152.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t0</text>
+<rect x="80.0" y="190.0" width="100.0" height="100.0" fill="#10b981" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="130.0" y="236.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE1</text>
+<text x="130.0" y="252.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t1</text>
+<rect x="80.0" y="290.0" width="100.0" height="100.0" fill="#f59e0b" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="130.0" y="336.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE2</text>
+<text x="130.0" y="352.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t2</text>
+<rect x="80.0" y="390.0" width="100.0" height="100.0" fill="#ef4444" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="130.0" y="436.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE3</text>
+<text x="130.0" y="452.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t3</text>
+<rect x="180.0" y="90.0" width="100.0" height="100.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="230.0" y="136.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE4</text>
+<text x="230.0" y="152.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t4</text>
+<rect x="180.0" y="190.0" width="100.0" height="100.0" fill="#ec4899" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="230.0" y="236.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE5</text>
+<text x="230.0" y="252.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t5</text>
+<rect x="180.0" y="290.0" width="100.0" height="100.0" fill="#06b6d4" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="230.0" y="336.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE6</text>
+<text x="230.0" y="352.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t6</text>
+<rect x="180.0" y="390.0" width="100.0" height="100.0" fill="#f97316" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="230.0" y="436.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE7</text>
+<text x="230.0" y="452.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t7</text>
+<rect x="280.0" y="90.0" width="100.0" height="100.0" fill="#3b82f6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="330.0" y="136.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE0</text>
+<text x="330.0" y="152.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t8</text>
+<rect x="280.0" y="190.0" width="100.0" height="100.0" fill="#10b981" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="330.0" y="236.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE1</text>
+<text x="330.0" y="252.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t9</text>
+<rect x="280.0" y="290.0" width="100.0" height="100.0" fill="#f59e0b" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="330.0" y="336.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE2</text>
+<text x="330.0" y="352.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t10</text>
+<rect x="280.0" y="390.0" width="100.0" height="100.0" fill="#ef4444" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="330.0" y="436.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE3</text>
+<text x="330.0" y="452.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t11</text>
+<rect x="380.0" y="90.0" width="100.0" height="100.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="430.0" y="136.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE4</text>
+<text x="430.0" y="152.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t12</text>
+<rect x="380.0" y="190.0" width="100.0" height="100.0" fill="#ec4899" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="430.0" y="236.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE5</text>
+<text x="430.0" y="252.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t13</text>
+<rect x="380.0" y="290.0" width="100.0" height="100.0" fill="#06b6d4" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="430.0" y="336.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE6</text>
+<text x="430.0" y="352.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t14</text>
+<rect x="380.0" y="390.0" width="100.0" height="100.0" fill="#f97316" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
+<text x="430.0" y="436.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE7</text>
+<text x="430.0" y="452.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t15</text>
+<rect x="80.0" y="90.0" width="400.0" height="400.0" fill="none" stroke="#1e293b" stroke-width="2" fill-opacity="1.0" rx="2"/>
+<text x="130.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=0..127</text>
+<text x="230.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=128..255</text>
+<text x="330.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=256..383</text>
+<text x="430.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=384..511</text>
+<text x="64.0" y="140.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=0..255</text>
+<text x="64.0" y="240.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=256..511</text>
+<text x="64.0" y="340.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=512..767</text>
+<text x="64.0" y="440.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=768..1023</text>
+<text x="590.0" y="90.0" text-anchor="middle" font-size="12" fill="#1e293b" font-weight="bold">PE Legend</text>
+<rect x="540.0" y="96.0" width="16.0" height="16.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="562.0" y="108.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE0</text>
+<rect x="540.0" y="118.0" width="16.0" height="16.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="562.0" y="130.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE1</text>
+<rect x="540.0" y="140.0" width="16.0" height="16.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="562.0" y="152.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE2</text>
+<rect x="540.0" y="162.0" width="16.0" height="16.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="562.0" y="174.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE3</text>
+<rect x="540.0" y="184.0" width="16.0" height="16.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="562.0" y="196.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE4</text>
+<rect x="540.0" y="206.0" width="16.0" height="16.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="562.0" y="218.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE5</text>
+<rect x="540.0" y="228.0" width="16.0" height="16.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="562.0" y="240.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE6</text>
+<rect x="540.0" y="250.0" width="16.0" height="16.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="562.0" y="262.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE7</text>
+<text x="540.0" y="310.0" text-anchor="middle" font-size="12" fill="#1e293b" font-weight="bold">Tile Assignment Order</text>
+<rect x="540.0" y="318.0" width="12.0" height="12.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="328.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 0 → PE0  (0,0)  off=0 B</text>
+<rect x="540.0" y="334.0" width="12.0" height="12.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="344.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 1 → PE1  (1,0)  off=256 KB</text>
+<rect x="540.0" y="350.0" width="12.0" height="12.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="360.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 2 → PE2  (2,0)  off=512 KB</text>
+<rect x="540.0" y="366.0" width="12.0" height="12.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="376.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 3 → PE3  (3,0)  off=768 KB</text>
+<rect x="540.0" y="382.0" width="12.0" height="12.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="392.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 4 → PE4  (0,1)  off=256 B</text>
+<rect x="540.0" y="398.0" width="12.0" height="12.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="408.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 5 → PE5  (1,1)  off=256 KB</text>
+<rect x="540.0" y="414.0" width="12.0" height="12.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="424.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 6 → PE6  (2,1)  off=512 KB</text>
+<rect x="540.0" y="430.0" width="12.0" height="12.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="440.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 7 → PE7  (3,1)  off=768 KB</text>
+<rect x="540.0" y="446.0" width="12.0" height="12.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="456.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 8 → PE0  (0,2)  off=512 B</text>
+<rect x="540.0" y="462.0" width="12.0" height="12.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="472.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 9 → PE1  (1,2)  off=256 KB</text>
+<rect x="540.0" y="478.0" width="12.0" height="12.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="488.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t10 → PE2  (2,2)  off=512 KB</text>
+<rect x="540.0" y="494.0" width="12.0" height="12.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="504.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t11 → PE3  (3,2)  off=768 KB</text>
+<rect x="540.0" y="510.0" width="12.0" height="12.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="520.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t12 → PE4  (0,3)  off=768 B</text>
+<rect x="540.0" y="526.0" width="12.0" height="12.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="536.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t13 → PE5  (1,3)  off=256 KB</text>
+<rect x="540.0" y="542.0" width="12.0" height="12.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="552.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t14 → PE6  (2,3)  off=512 KB</text>
+<rect x="540.0" y="558.0" width="12.0" height="12.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
+<text x="558.0" y="568.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t15 → PE7  (3,3)  off=768 KB</text>
+<rect x="80.0" y="580.0" width="587.0" height="30.0" fill="#e2e8f0" stroke="#94a3b8" stroke-width="1" fill-opacity="1.0" rx="2"/>
+<text x="90.0" y="598.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Strategy: tiled_row_major  |  Tile: (256×128)=64 KB  |  Tiles: 16  |  Total: 1 MB</text>
+</svg>
diff --git a/docs/diagrams/sip_view.svg b/docs/diagrams/sip_view.svg
new file mode 100644
index 0000000..c1faf21
--- /dev/null
+++ b/docs/diagrams/sip_view.svg
@@ -0,0 +1,95 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="648" height="648" viewBox="0 0 648 648">
+  <title>sip</title>
+  <rect width="648" height="648" fill="#f8fafc"/>
+  <text x="324" y="18" text-anchor="middle" font-family="monospace" font-size="14" font-weight="bold" fill="#1e293b">SIP VIEW</text>
+  <line x1="108.0" y1="144.0" x2="252.0" y2="144.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
+  <text x="180.0" y="140.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
+  <line x1="108.0" y1="144.0" x2="108.0" y2="264.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
+  <text x="108.0" y="200.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
+  <line x1="252.0" y1="144.0" x2="396.0" y2="144.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
+  <text x="324.0" y="140.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
+  <line x1="252.0" y1="144.0" x2="252.0" y2="264.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
+  <text x="252.0" y="200.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
+  <line x1="396.0" y1="144.0" x2="540.0" y2="144.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
+  <text x="468.0" y="140.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
+  <line x1="396.0" y1="144.0" x2="396.0" y2="264.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
+  <text x="396.0" y="200.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
+  <line x1="540.0" y1="144.0" x2="540.0" y2="264.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
+  <text x="540.0" y="200.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
+  <line x1="108.0" y1="264.0" x2="252.0" y2="264.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
+  <text x="180.0" y="260.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
+  <line x1="108.0" y1="264.0" x2="108.0" y2="384.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
+  <text x="108.0" y="320.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
+  <line x1="252.0" y1="264.0" x2="396.0" y2="264.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
+  <text x="324.0" y="260.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
+  <line x1="252.0" y1="264.0" x2="252.0" y2="384.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
+  <text x="252.0" y="320.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
+  <line x1="396.0" y1="264.0" x2="540.0" y2="264.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
+  <text x="468.0" y="260.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
+  <line x1="396.0" y1="264.0" x2="396.0" y2="384.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
+  <text x="396.0" y="320.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
+  <line x1="540.0" y1="264.0" x2="540.0" y2="384.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
+  <text x="540.0" y="320.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
+  <line x1="108.0" y1="384.0" x2="252.0" y2="384.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
+  <text x="180.0" y="380.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
+  <line x1="108.0" y1="384.0" x2="108.0" y2="504.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
+  <text x="108.0" y="440.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
+  <line x1="252.0" y1="384.0" x2="396.0" y2="384.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
+  <text x="324.0" y="380.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
+  <line x1="252.0" y1="384.0" x2="252.0" y2="504.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
+  <text x="252.0" y="440.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
+  <line x1="396.0" y1="384.0" x2="540.0" y2="384.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
+  <text x="468.0" y="380.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
+  <line x1="396.0" y1="384.0" x2="396.0" y2="504.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
+  <text x="396.0" y="440.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
+  <line x1="540.0" y1="384.0" x2="540.0" y2="504.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
+  <text x="540.0" y="440.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
+  <line x1="108.0" y1="504.0" x2="252.0" y2="504.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
+  <text x="180.0" y="500.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
+  <line x1="252.0" y1="504.0" x2="396.0" y2="504.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
+  <text x="324.0" y="500.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
+  <line x1="396.0" y1="504.0" x2="540.0" y2="504.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
+  <text x="468.0" y="500.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
+  <polyline points="324.0,56.0 108.0,56.0 108.0,144.0" fill="none" stroke="#0ea5e9" stroke-width="1" opacity="0.8"/>
+  <text x="216.0" y="96.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.5mm 512GB/s</text>
+  <polyline points="324.0,56.0 252.0,56.0 252.0,144.0" fill="none" stroke="#0ea5e9" stroke-width="1" opacity="0.8"/>
+  <text x="288.0" y="96.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.5mm 512GB/s</text>
+  <polyline points="324.0,56.0 396.0,56.0 396.0,144.0" fill="none" stroke="#0ea5e9" stroke-width="1" opacity="0.8"/>
+  <text x="360.0" y="96.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.5mm 512GB/s</text>
+  <polyline points="324.0,56.0 540.0,56.0 540.0,144.0" fill="none" stroke="#0ea5e9" stroke-width="1" opacity="0.8"/>
+  <text x="432.0" y="96.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.5mm 512GB/s</text>
+  <rect x="84.0" y="128.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
+  <text x="108.0" y="148.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (0,0)</text>
+  <rect x="228.0" y="128.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
+  <text x="252.0" y="148.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (1,0)</text>
+  <rect x="372.0" y="128.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
+  <text x="396.0" y="148.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (2,0)</text>
+  <rect x="516.0" y="128.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
+  <text x="540.0" y="148.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (3,0)</text>
+  <rect x="84.0" y="248.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
+  <text x="108.0" y="268.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (0,1)</text>
+  <rect x="228.0" y="248.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
+  <text x="252.0" y="268.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (1,1)</text>
+  <rect x="372.0" y="248.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
+  <text x="396.0" y="268.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (2,1)</text>
+  <rect x="516.0" y="248.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
+  <text x="540.0" y="268.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (3,1)</text>
+  <rect x="84.0" y="368.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
+  <text x="108.0" y="388.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (0,2)</text>
+  <rect x="228.0" y="368.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
+  <text x="252.0" y="388.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (1,2)</text>
+  <rect x="372.0" y="368.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
+  <text x="396.0" y="388.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (2,2)</text>
+  <rect x="516.0" y="368.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
+  <text x="540.0" y="388.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (3,2)</text>
+  <rect x="84.0" y="488.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
+  <text x="108.0" y="508.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (0,3)</text>
+  <rect x="228.0" y="488.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
+  <text x="252.0" y="508.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (1,3)</text>
+  <rect x="372.0" y="488.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
+  <text x="396.0" y="508.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (2,3)</text>
+  <rect x="516.0" y="488.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
+  <text x="540.0" y="508.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (3,3)</text>
+  <rect x="308.0" y="50.0" width="32.0" height="12.0" rx="4" fill="#0ea5e9" stroke="#475569" stroke-width="1"/>
+  <text x="324.0" y="60.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#ffffff">IO io0</text>
+</svg>
\ No newline at end of file
diff --git a/docs/diagrams/system_view.svg b/docs/diagrams/system_view.svg
new file mode 100644
index 0000000..fa7102d
--- /dev/null
+++ b/docs/diagrams/system_view.svg
@@ -0,0 +1,19 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="768" height="396" viewBox="0 0 768 396">
+  <title>system</title>
+  <rect width="768" height="396" fill="#f8fafc"/>
+  <text x="384" y="18" text-anchor="middle" font-family="monospace" font-size="14" font-weight="bold" fill="#1e293b">SYSTEM VIEW</text>
+  <polyline points="384.0,60.0 182.0,60.0 182.0,120.0" fill="none" stroke="#6366f1" stroke-width="1" opacity="0.8"/>
+  <text x="283.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">20.0mm 256GB/s</text>
+  <polyline points="384.0,60.0 586.0,60.0 586.0,120.0" fill="none" stroke="#6366f1" stroke-width="1" opacity="0.8"/>
+  <text x="485.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">20.0mm 256GB/s</text>
+  <rect x="374.0" y="57.0" width="20.0" height="6.0" rx="4" fill="#6366f1" stroke="#475569" stroke-width="1"/>
+  <text x="384.0" y="64.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#ffffff">Fabric Switch</text>
+  <rect x="62.0" y="138.0" width="240.0" height="200.0" rx="4" fill="#e0e7ff" stroke="#475569" stroke-width="1"/>
+  <text x="182.0" y="242.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">SIP 0</text>
+  <rect x="174.0" y="117.0" width="16.0" height="6.0" rx="4" fill="#0ea5e9" stroke="#475569" stroke-width="1"/>
+  <text x="182.0" y="124.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#ffffff">IO io0</text>
+  <rect x="466.0" y="138.0" width="240.0" height="200.0" rx="4" fill="#e0e7ff" stroke="#475569" stroke-width="1"/>
+  <text x="586.0" y="242.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">SIP 1</text>
+  <rect x="578.0" y="117.0" width="16.0" height="6.0" rx="4" fill="#0ea5e9" stroke="#475569" stroke-width="1"/>
+  <text x="586.0" y="124.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#ffffff">IO io0</text>
+</svg>
\ No newline at end of file
diff --git a/docs/latency-model.md b/docs/latency-model.md
new file mode 100644
index 0000000..3f3cb07
--- /dev/null
+++ b/docs/latency-model.md
@@ -0,0 +1,381 @@
+# Latency Model
+
+## Overview
+
+kernbench uses a discrete-event simulation (SimPy) to compute end-to-end latency.
+Every request flows through a graph of **components** connected by **wires**.
+The total latency reported is the **actual SimPy simulated time** (`env.now` delta),
+not a static formula—so contention and queueing are captured automatically.
+
+```
+total_ns (actual) = wire_prop + component_overhead + drain + queueing
+                    ├── deterministic ──────────────────┘       │
+                    └── contention-dependent ────────────────────┘
+```
+
+## Three Deterministic Cost Components
+
+### 1. Wire Propagation
+
+```
+wire_ns = distance_mm × ns_per_mm       (global: 0.01 = 10 ps/mm)
+```
+
+Every edge in the topology graph has a `distance_mm`. A SimPy wire process
+delays each message by `wire_ns` before delivering it to the next component.
+For on-chip silicon this is ~10 ps/mm; the same constant applies everywhere
+since all links are on-die or interposer. Wire propagation is typically <1 ns
+and negligible compared to other costs.
+
+### 2. Component Overhead (`overhead_ns`)
+
+```
+component_ns = node.attrs["overhead_ns"]
+```
+
+Each component on the path adds a fixed processing delay via `yield env.timeout(overhead_ns)`.
+This models arbitration, protocol processing, pipeline stages, etc.
+
+| Component | overhead_ns | Meaning |
+|-----------|-------------|---------|
+| pcie_ep | 5.0 | PCIe protocol processing |
+| io_cpu | 10.0 | Command decode / dispatch |
+| m_cpu | 5.0 | DMA scheduling |
+| fabric switch | 5.0 | Packet arbitration |
+| xbar | 2.0 | Crossbar arbitration |
+| xbar bridge | 1.0 | Bridge traversal between xbar halves |
+| ucie | 1.0 | UCIe protocol overhead per port |
+| noc (2D mesh) | 0.0 | Hop delay modeled internally via manhattan distance |
+| hbm_ctrl | 0.0 | Access time captured in drain_ns |
+| pe_cpu | 2.0 | Command dispatch |
+| pe_scheduler | 1.0 | PE-internal scheduling |
+| pe_gemm/math | 0.0 | Placeholder; will use flops-based model |
+
+### 3. Drain (Serialization Delay)
+
+```
+drain_ns = nbytes / bottleneck_bw_gbs
+```
+
+**Wormhole (cut-through) model**: data flows through intermediate nodes as a
+pipeline. Serialization cost is paid **once** at the terminal node, not at
+every hop. The bottleneck is the minimum `bw_gbs` across all edges in the path.
+
+Example: 4096 bytes through a path with bottleneck 128 GB/s → `4096 / 128 = 32.0 ns`.
+
+### Formula (Theoretical Lower Bound)
+
+```
+formula_ns = Σ(wire_prop) + Σ(overhead_ns) + drain_ns
+```
+
+This is the latency with **zero contention**—no other request competing for
+any resource. The engine provides `_formula_latency()` for verification.
+With no contention: `actual == formula`. With contention: `actual > formula`.
+
+### Diagram: PE DMA Read (pe0 → local slice0, 4096 bytes)
+
+```mermaid
+sequenceDiagram
+    participant D as pe_dma
+    participant X as xbar.pe0
+    participant H as hbm_ctrl.slice0
+
+    D->>X: txn (4096B)
+    Note over X: overhead 2.0 ns
+    X->>H: txn (wire 0.025 ns)
+    Note over H: acquire Resource
+    Note over H: overhead 0 ns
+    Note over H: drain 4096/256 = 16.0 ns
+    Note over H: release Resource
+    H-->>D: done.succeed()
+
+    Note over D,H: total_ns = 18.09 ns<br/>formula = wire(0.025) + ovhd(2.0) + drain(16.0) = 18.025 ns<br/>actual ≈ formula (no contention)
+```
+
+### Diagram: Two Requests — No Contention vs HOL Blocking
+
+#### Case 1: Different slices (parallel, no contention)
+
+```mermaid
+sequenceDiagram
+    participant A as Requests A and B
+    participant S0 as hbm_ctrl.slice0<br/>Resource(cap=1)
+    participant S1 as hbm_ctrl.slice1<br/>Resource(cap=1)
+
+    Note over A,S1: t=2 ns — both requests arrive at their own slice
+    A->>S0: A (4KB)
+    A->>S1: B (4KB)
+    Note over S0: acquire (immediate)
+    Note over S1: acquire (immediate)
+    Note over S0: drain 16.0 ns
+    Note over S1: drain 16.0 ns
+    Note over S0: t=18 release
+    Note over S1: t=18 release
+
+    Note over A,S1: A actual = 18 ns, B actual = 18 ns<br/>No waiting — separate Resources
+```
+
+#### Case 2: Same slice (HOL blocking)
+
+```mermaid
+sequenceDiagram
+    participant A as Request A (4KB)
+    participant Q as hbm_ctrl.slice0<br/>Resource(cap=1)
+    participant B as Request B (64B)
+
+    Note over A,B: t=0 — A arrives first
+    A->>Q: acquire (immediate)
+    Note over Q: drain A = 16.0 ns
+
+    Note over B,Q: t=5 — B arrives, yield req → BLOCKED
+    B--xQ: waiting...
+
+    Note over Q: t=16 — A drain done, release
+    Q->>B: B acquires resource
+    Note over Q: drain B = 0.25 ns
+    Note over Q: t=16.25 — B done, release
+
+    Note over A,B: A actual = 16.0 ns (== formula)<br/>B actual = 11.25 ns (formula 0.25 + queueing 11.0)<br/>HOL blocking: short request waits behind long drain
+```
+
+---
+
+## How SimPy Tracks Latency
+
+### Measurement
+
+```python
+start_ns = env.now
+yield txn_done          # wait for the transaction to complete
+total_ns = env.now - start_ns     # ← this is what probe reports
+```
+
+`env.now` is SimPy's simulation clock. It only advances when a process `yield`s
+a timeout or waits on a resource/store. The delta between start and done captures
+**everything**: wire delays, component overheads, drain, and any queueing.
+
+### Component Pipeline
+
+Each component is a SimPy process:
+
+```
+_fan_in (per in_port)  →  _inbox (Store)  →  _worker  →  out_ports
+```
+
+1. **`_fan_in`**: relays messages from each `in_port` into a shared `_inbox` Store.
+2. **`_worker`**: pulls from `_inbox`, spawns `_forward_txn` per message.
+3. **`_forward_txn`**: calls `run()` (overhead), then puts to `out_ports[next_hop]`.
+
+The worker uses `env.process()` (pipeline model), so multiple messages can be
+in-flight through the same component concurrently. Contention happens when
+they compete for shared resources (e.g., `simpy.Resource` in hbm_ctrl).
+
+### Wire Process
+
+```python
+while True:
+    msg = yield out_port.get()      # wait for sender
+    yield env.timeout(prop_ns)      # propagation delay
+    yield in_port.put(msg)          # deliver to receiver
+```
+
+Each directed edge has its own wire process. Messages are delayed by exactly
+`distance_mm × ns_per_mm`.
+
+---
+
+## Contention and Queueing
+
+Queueing delay is **not a separate formula term**—it emerges from SimPy's
+event scheduling when multiple requests compete for the same resource.
+
+### Where Contention Occurs
+
+| Resource | SimPy Type | Capacity | Effect |
+|----------|-----------|----------|--------|
+| hbm_ctrl | `simpy.Resource` | 1 | Serializes HBM access |
+| m_cpu DMA read engine | `simpy.Resource` | 1 | Serializes DMA reads |
+| m_cpu DMA write engine | `simpy.Resource` | 1 | Serializes DMA writes |
+| pe_dma channels | `simpy.Resource` | configurable | Serializes PE DMA ops |
+| component inbox | `simpy.Store` | unbounded | No backpressure (FIFO) |
+
+### How Queueing Works
+
+```python
+# hbm_ctrl._worker
+with self._resource.request() as req:
+    yield req                     # ← BLOCKS if resource is occupied
+    yield from self.run(env, txn.nbytes)
+    yield env.timeout(drain_ns)
+```
+
+If request A holds the resource and request B arrives:
+- B's `yield req` blocks until A releases the resource
+- B resumes only after the shared simulation clock (`env.now`) has advanced by A's remaining service time
+- This "extra" time shows up in B's `total_ns` automatically
+
+```
+No contention:  actual_ns == formula_ns
+Contention:     actual_ns  > formula_ns
+                queueing_delay = actual_ns - formula_ns
+```
+
+### Head-of-Line (HOL) Blocking at hbm_ctrl
+
+The `simpy.Resource` is held for the **entire** `with` block—both overhead and
+drain. The resource is NOT released between overhead and drain:
+
+```python
+with self._resource.request() as req:
+    yield req                              # acquire (or wait)
+    yield from self.run(env, txn.nbytes)   # overhead_ns  ─┐
+    yield env.timeout(drain_ns)            # drain_ns      │ resource held
+# ← resource released here ───────────────────────────────┘
+```
+
+This means a short request arriving during a long request's drain must wait
+for the full remaining drain time—classic head-of-line blocking:
+
+```
+Request A: 4 KB,  drain = 16.0 ns   (arrives at t=0)
+Request B: 64 B,  drain = 0.25 ns   (arrives at t=5)
+
+Timeline:
+  t=0.00   A acquires resource
+  t=0.00   A: overhead (0 ns)
+  t=0.00   A: drain starts (16.0 ns)
+  t=5.00   B arrives → yield req → BLOCKED (A holds resource)
+  t=16.00  A: drain done → resource released
+  t=16.00  B acquires resource
+  t=16.00  B: overhead (0 ns)
+  t=16.25  B: drain done → resource released
+
+  B actual  = 11.25 ns (waited 11.0 + own 0.25)
+  B formula = 0.25 ns
+  B queueing = 11.0 ns  ← HOL blocking penalty
+```
+
+**Why this is physically realistic**: An HBM channel processes one burst at a
+time. While data is being serialized onto the channel (drain), no other request
+can use that channel. The FIFO ordering (`simpy.Resource` default) reflects
+the simplest controller scheduling policy.
+
+**Alternative: priority scheduling**: If needed, `simpy.PriorityResource` can
+prioritize shorter requests (Shortest Job First), but this is not currently
+used since FIFO matches typical HBM controller behavior.
+
+---
+
+## Worked Example: Two Concurrent PE DMA Reads
+
+Setup: PE0 and PE1 in cube0 both read 4096 bytes from their local HBM slices
+(slice0 and slice1), submitted to the **same engine** at the same time.
+
+### Paths
+
+```
+DMA A: pe0.pe_dma → xbar.pe0 → hbm_ctrl.slice0
+DMA B: pe1.pe_dma → xbar.pe1 → hbm_ctrl.slice1
+```
+
+### No Contention (different HBM slices)
+
+Since slice0 and slice1 are **separate** hbm_ctrl instances, each with its own
+`simpy.Resource(capacity=1)`, there is no resource competition.
+
+```
+DMA A timeline:
+  t=0.00   pe_dma dequeues txn
+  t=0.00   xbar.pe0: overhead_ns=2.0 → t=2.00
+  t=2.00   wire prop (2.5mm × 0.01) → t=2.025
+  t=2.025  hbm_ctrl.slice0: yield req → immediate (no contention)
+  t=2.025  hbm_ctrl.slice0: overhead_ns=0 → t=2.025
+  t=2.025  drain_ns = 4096/256 = 16.0 → t=18.025
+  t=18.025 done
+
+DMA B timeline: (identical, on its own slice)
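+  t=0.00   → ... → t=18.025 done
+```
+
+Both complete at ~18.03 ns in this simplified timeline (the probe table below reports 18.09 ns, with a Wire total of 0.08 ns rather than 0.025 ns). `actual == formula` for both.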
+
+### With Contention (same HBM slice)
+
+Now suppose both PE0 and PE1 read from **slice0**:
+
+```
+DMA A: pe0.pe_dma → xbar.pe0 → hbm_ctrl.slice0
+DMA B: pe1.pe_dma → xbar.pe1 → xbar.pe0 → hbm_ctrl.slice0
+                                (chain traversal to reach slice0)
+```
+
+```
+DMA A timeline:
+  t=0.00   xbar.pe0(2.0) → wire → hbm_ctrl.slice0
+  t=2.025  yield req → immediate (first to arrive)
+  t=18.025 drain 16.0 → release resource → done
+  actual_A = 18.025 ns (== formula)
+
+DMA B timeline:
+  t=0.00   xbar.pe1(2.0) → xbar.pe0(2.0) → wire → hbm_ctrl.slice0
+  t=4.035  yield req → BLOCKED (A holds resource until t=18.025)
+  t=18.025 acquire resource
+  t=50.025 drain 32.0 → release → done
+  actual_B = 50.025 ns
+
+  formula_B = wire(0.035) + overhead(4.0) + drain(32.0) = 36.035 ns
+  B's drain is 32.0 ns (not A's 16.0 ns) because drain uses the bottleneck BW of
+  B's own path (128 GB/s), while A's path has BW 256 GB/s. Step by step:
+
+  B's bottleneck: xbar_x_bw = 128 GB/s → drain = 4096/128 = 32.0 ns
+  formula_B = 0.035 + 4.0 + 32.0 = 36.035 ns
+  queueing  = 18.025 − 4.035 = 13.99 ns (time waiting for A to release hbm_ctrl)
+  actual_B  = 36.035 + queueing = 50.025 ns
+```
+
+The key insight: **queueing delay is not in the formula**. It only appears in
+the actual SimPy simulation when resources are contested. The probe reports
+`actual_ns`, which includes all queueing. To see pure queueing overhead,
+compare `actual_ns` vs `formula_ns` (available in PE DMA traces).
+
+---
+
+## Probe Output Explained
+
+```
+=== PE DMA Latency ===
+Case                Target              Actual  Ovhd  Drain  Wire  Ovhd% Drain%  Eff.BW   BN.BW   Util%
+pe-local-hbm        c0.pe0->c0.slice0    18.09   2.0  16.0  0.08  11.1% 88.5%   226.49   256.0   88.5%
+pe-cross-half-hbm   c0.pe0->c0.slice4    37.14   5.0  32.0  0.14  13.5% 86.1%   110.27   128.0   86.1%
+```
+
+| Column | Meaning |
+|--------|---------|
+| **Actual** | SimPy measured `env.now` delta (includes contention if any) |
+| **Ovhd** | Sum of `overhead_ns` for all components on the forward path |
+| **Drain** | `nbytes / bottleneck_bw` — serialization at terminal |
+| **Wire** | Sum of `distance_mm × ns_per_mm` for all edges |
+| **Ovhd%** | `Ovhd / Actual × 100` — fraction of time spent in component processing |
+| **Drain%** | `Drain / Actual × 100` — fraction of time spent in data transfer |
+| **Eff.BW** | `nbytes / Actual` — achieved bandwidth |
+| **BN.BW** | Bottleneck bandwidth (min `bw_gbs` on path) |
+| **Util%** | `Eff.BW / BN.BW × 100` — how close to theoretical max BW |
+
+### Why Util% < 100%
+
+`Util% = Drain% = drain_ns / actual_ns`. The gap from 100% is the overhead
+fraction. For small transfers (4KB), overhead is significant relative to drain.
+For large transfers, drain dominates and utilization approaches 100%.
+
+```
+  4 KB:  Ovhd=2.0, Drain=16.0  → Util=88.5%   (overhead is 11% of time)
+ 64 KB:  Ovhd=2.0, Drain=256.0 → Util=99.2%   (overhead is <1% of time)
+```
+
+### H2D Path: Why Ovhd% is ~40%
+
+H2D traverses many components (pcie_ep → io_cpu → ucie → noc → m_cpu → noc →
+xbar → hbm_ctrl + response path). Total forward overhead is ~23 ns vs drain
+of 32 ns for 4KB, so overhead is comparable to data transfer time—resulting
+in ~55% utilization. This is expected for small command-path transfers.
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..579aa33
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,30 @@
+[build-system]
+requires = ["setuptools>=68", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "kernbench"
+version = "0.1.0"
+requires-python = ">=3.10"
+dependencies = ["pytest", "simpy", "pyyaml"]
+
+[project.scripts]
+kernbench = "kernbench.cli.main:main"
+
+[project.optional-dependencies]
+dev = [
+  "pytest>=7",
+  "ruff>=0.4.0",
+]
+
+[tool.ruff]
+line-length = 100
+target-version = "py310"
+fix = false
+
+[tool.ruff.lint]
+select = ["E", "F", "I", "B", "UP"]
+ignore = ["F401"]
+
+[tool.pytest.ini_options]
+addopts = ["--disable-warnings"]
diff --git a/scripts/gen_placement_diagrams.py b/scripts/gen_placement_diagrams.py
new file mode 100644
index 0000000..81036bc
--- /dev/null
+++ b/scripts/gen_placement_diagrams.py
@@ -0,0 +1,393 @@
+#!/usr/bin/env python3
+"""Generate SVG diagrams illustrating each placement strategy.
+
+Example tensor: (M=1024, K=512) fp16 (itemsize=2), 8 PEs.
+Tiled variants use tile_m=256, tile_k=128.
+
+Output: docs/diagrams/placement_*.svg
+"""
+from __future__ import annotations
+
+import math
+from pathlib import Path
+
+# ── Diagram parameters ──────────────────────────────────────────────
+M, K = 1024, 512
+ITEMSIZE = 2
+NUM_PE = 8
+TILE_M, TILE_K = 256, 128
+
+PE_COLORS = [
+    "#3b82f6",  # PE0 blue
+    "#10b981",  # PE1 emerald
+    "#f59e0b",  # PE2 amber
+    "#ef4444",  # PE3 red
+    "#8b5cf6",  # PE4 violet
+    "#ec4899",  # PE5 pink
+    "#06b6d4",  # PE6 cyan
+    "#f97316",  # PE7 orange
+]
+PE_TEXT_COLORS = [
+    "#fff", "#fff", "#000", "#fff",
+    "#fff", "#fff", "#000", "#fff",
+]
+
+OUT_DIR = Path(__file__).parent.parent / "docs" / "diagrams"
+
+# ── SVG helpers ─────────────────────────────────────────────────────
+
+def _svg_header(w: int, h: int, title: str) -> str:  # opens <svg>; pair with _svg_footer()
+    return (
+        f'<svg xmlns="http://www.w3.org/2000/svg" width="{w}" height="{h}"'
+        f' viewBox="0 0 {w} {h}" font-family="monospace">\n'
+        f'<rect width="{w}" height="{h}" fill="#f8fafc" rx="6"/>\n'  # full-canvas background
+        f'<text x="{w // 2}" y="32" text-anchor="middle" font-size="16"'
+        f' font-weight="bold" fill="#1e293b">{title}</text>\n'  # title is inserted unescaped
+    )
+
+def _svg_footer() -> str:  # closing tag for the element opened by _svg_header()
+    return "</svg>\n"
+
+def _rect(x: float, y: float, w: float, h: float, fill: str,
+          stroke: str = "#334155", sw: float = 1.0, opacity: float = 1.0) -> str:
+    return (  # one <rect> element; x/y/width/height are formatted to 1 decimal place
+        f'<rect x="{x:.1f}" y="{y:.1f}" width="{w:.1f}" height="{h:.1f}"'
+        f' fill="{fill}" stroke="{stroke}" stroke-width="{sw}"'
+        f' fill-opacity="{opacity}" rx="2"/>\n'  # opacity applies to the fill only
+    )
+
+def _text(x: float, y: float, txt: str, size: int = 11,
+          anchor: str = "middle", fill: str = "#1e293b",
+          weight: str = "normal") -> str:
+    return (  # one <text> element; txt is inserted as-is (no XML escaping here)
+        f'<text x="{x:.1f}" y="{y:.1f}" text-anchor="{anchor}"'
+        f' font-size="{size}" fill="{fill}" font-weight="{weight}">{txt}</text>\n'
+    )
+
+def _line(x1: float, y1: float, x2: float, y2: float,
+          stroke: str = "#94a3b8", sw: float = 1) -> str:
+    return (  # one <line> element from (x1, y1) to (x2, y2), coordinates to 1 decimal place
+        f'<line x1="{x1:.1f}" y1="{y1:.1f}" x2="{x2:.1f}" y2="{y2:.1f}"'
+        f' stroke="{stroke}" stroke-width="{sw}"/>\n'
+    )
+
+def _format_bytes(n: int) -> str:
+    """Format a byte count as B/KB/MB; inexact values keep up to 2 decimals."""
+    for shift, unit in ((20, "MB"), (10, "KB")):
+        if n >= (1 << shift):  # :.2f + rstrip keeps exact multiples as e.g. "256 KB"
+            return f"{n / (1 << shift):.2f}".rstrip("0").rstrip(".") + f" {unit}"
+    return f"{n} B"
+
+def _legend(x: float, y0: float, num_pe: int = NUM_PE) -> str:
+    """Return a "PE Legend" heading followed by one colour swatch + label per PE."""
+    parts = [_text(x + 50, y0, "PE Legend", size=12, weight="bold")]
+    for pe, row_y in enumerate(y0 + 18 + i * 22 for i in range(num_pe)):
+        parts.append(_rect(x, row_y - 12, 16, 16, PE_COLORS[pe]))
+        parts.append(_text(x + 22, row_y, f"PE{pe}", size=11, anchor="start"))
+    return "".join(parts)
+
+def _axes(gx: float, gy: float, gw: float, gh: float,
+          m_label: str = "M=1024", k_label: str = "K=512") -> str:
+    """Return the K label centred above the grid and the rotated M label to its left."""
+    s = ""
+    # K axis (horizontal) label, 8 px above the grid's top edge
+    s += _text(gx + gw / 2, gy - 8, f"← {k_label} →", size=11, fill="#475569")
+    # M axis (vertical) label, 12 px left of the grid and vertically centred
+    mx = gx - 12
+    my = gy + gh / 2
+    s += (  # hand-built: needs a rotate(-90) transform, which _text() does not offer
+        f'<text x="{mx:.1f}" y="{my:.1f}" text-anchor="middle"'
+        f' font-size="11" fill="#475569"'
+        f' transform="rotate(-90 {mx:.1f} {my:.1f})">↑ {m_label} ↓</text>\n'
+    )
+    return s
+
+def _info_box(x: float, y: float, lines: list[str]) -> str:
+    """Return a box sized to fit ``lines`` (7 px/char, 18 px/row); ``lines`` may be empty."""
+    bw = max((len(line) for line in lines), default=0) * 7 + 20
+    bh = len(lines) * 18 + 12
+    s = _rect(x, y, bw, bh, "#e2e8f0", stroke="#94a3b8", sw=1)
+    for i, line in enumerate(lines):
+        s += _text(x + 10, y + 18 + i * 18, line, size=10, anchor="start", fill="#334155")
+    return s
+
+# ── Grid drawing ────────────────────────────────────────────────────
+
+def _draw_grid(
+    gx: float, gy: float, gw: float, gh: float,
+    cells: list[dict],  # each: row, col, pe; optional rspan, cspan, label (other keys ignored)
+    rows: int, cols: int,
+    cell_labels: bool = True,
+) -> str:
+    """Render shard cells as PE-coloured rectangles inside a bordered grid.
+
+    Each cell spans ``cspan`` × ``rspan`` grid units (default 1 × 1)."""
+    cell_w, cell_h = gw / cols, gh / rows
+    parts: list[str] = []
+    for cell in cells:
+        left = gx + cell["col"] * cell_w
+        top = gy + cell["row"] * cell_h
+        width = cell.get("cspan", 1) * cell_w
+        height = cell.get("rspan", 1) * cell_h
+        pe = cell["pe"]
+        parts.append(_rect(left, top, width, height, PE_COLORS[pe], stroke="#334155", sw=1.5))
+        # Centre of the cell: PE name sits just above it, optional label just below.
+        mid_x, mid_y = left + width / 2, top + height / 2
+        ink = PE_TEXT_COLORS[pe]
+        parts.append(_text(mid_x, mid_y - 4, f"PE{pe}", size=12, fill=ink, weight="bold"))
+        if not cell_labels or "label" not in cell:
+            continue
+        parts.append(_text(mid_x, mid_y + 12, cell["label"], size=9, fill=ink))
+    # Outline is emitted last so it paints over the cell strokes.
+    parts.append(_rect(gx, gy, gw, gh, "none", stroke="#1e293b", sw=2))
+    return "".join(parts)
+
+
+# ── Strategy-specific generators ────────────────────────────────────
+
+def gen_column_wise() -> str:
+    """Return the SVG for column_wise placement: K is split into NUM_PE equal chunks."""
+    W, H = 820, 500
+    s = _svg_header(W, H, "Placement: column_wise")
+    s += _text(W // 2, 54, f"Tensor ({M}×{K}) fp16  →  K axis split into {NUM_PE} parts",
+               size=12, fill="#475569")
+
+    gx, gy, gw, gh = 80, 90, 480, 320
+    chunk_k = K // NUM_PE  # 64
+    chunk_bytes = M * chunk_k * ITEMSIZE
+
+    s += _axes(gx, gy, gw, gh)
+    cells = [
+        {
+            "row": 0, "col": i, "rspan": 1, "cspan": 1,
+            "pe": i, "label": f"({M}×{chunk_k})",
+        }
+        for i in range(NUM_PE)
+    ]
+    s += _draw_grid(gx, gy, gw, gh, cells, rows=1, cols=NUM_PE)
+
+    # Per-column byte offset and shard size, printed under the grid
+    cw = gw / NUM_PE
+    for i in range(NUM_PE):
+        cx = gx + i * cw + cw / 2
+        off = i * chunk_bytes
+        s += _text(cx, gy + gh + 16, f"off={_format_bytes(off)}", size=9, fill="#475569")
+        s += _text(cx, gy + gh + 30, _format_bytes(chunk_bytes), size=9, fill="#64748b")
+
+    s += _legend(620, 100)
+    s += _info_box(620, 320, [
+        "Strategy: column_wise",
+        "Split axis: K",
+        f"Shards: {NUM_PE}",
+        f"Each: ({M}, {chunk_k})",
+        f"Each: {_format_bytes(chunk_bytes)}",
+        f"Total: {_format_bytes(M * K * ITEMSIZE)}",
+    ])
+    s += _svg_footer()
+    return s
+
+
+def gen_row_wise() -> str:
+    """Return the SVG for row_wise placement: M is split into NUM_PE equal chunks."""
+    W, H = 820, 560
+    s = _svg_header(W, H, "Placement: row_wise")
+    s += _text(W // 2, 54, f"Tensor ({M}×{K}) fp16  →  M axis split into {NUM_PE} parts",
+               size=12, fill="#475569")
+
+    gx, gy, gw, gh = 80, 90, 320, 400
+    chunk_m = M // NUM_PE  # 128
+    chunk_bytes = chunk_m * K * ITEMSIZE
+
+    s += _axes(gx, gy, gw, gh)
+    cells = [
+        {
+            "row": i, "col": 0, "rspan": 1, "cspan": 1,
+            "pe": i, "label": f"({chunk_m}×{K})",
+        }
+        for i in range(NUM_PE)
+    ]
+    s += _draw_grid(gx, gy, gw, gh, cells, rows=NUM_PE, cols=1)
+
+    # Per-row byte offset and shard size, printed to the right of the grid
+    ch = gh / NUM_PE
+    for i in range(NUM_PE):
+        cy = gy + i * ch + ch / 2
+        off = i * chunk_bytes
+        s += _text(gx + gw + 10, cy - 4, f"off={_format_bytes(off)}",
+                   size=9, anchor="start", fill="#475569")
+        s += _text(gx + gw + 10, cy + 10, _format_bytes(chunk_bytes),
+                   size=9, anchor="start", fill="#64748b")
+
+    s += _legend(580, 100)
+    s += _info_box(580, 320, [
+        "Strategy: row_wise",
+        "Split axis: M",
+        f"Shards: {NUM_PE}",
+        f"Each: ({chunk_m}, {K})",
+        f"Each: {_format_bytes(chunk_bytes)}",
+        f"Total: {_format_bytes(M * K * ITEMSIZE)}",
+    ])
+    s += _svg_footer()
+    return s
+
+
+def gen_replicate() -> str:
+    """Return the SVG for replicate placement: every PE holds a full copy of the tensor."""
+    W, H = 820, 500
+    s = _svg_header(W, H, "Placement: replicate")
+    s += _text(W // 2, 54, f"Tensor ({M}×{K}) fp16  →  full copy to each PE",
+               size=12, fill="#475569")
+
+    full_bytes = M * K * ITEMSIZE
+    # One box per PE in a fixed 4-column × 2-row layout (sized for NUM_PE == 8)
+    cols, rows = 4, 2
+    margin_x, margin_y = 60, 90
+    gap = 16
+    bw = (700 - (cols - 1) * gap) / cols
+    bh = (340 - (rows - 1) * gap) / rows
+
+    for i in range(NUM_PE):
+        r = i // cols  # row-by-row fill: PE0..PE3 on the first row, PE4..PE7 on the second
+        c = i % cols
+        bx = margin_x + c * (bw + gap)
+        by = margin_y + r * (bh + gap)
+        s += _rect(bx, by, bw, bh, PE_COLORS[i], stroke="#334155", sw=1.5)
+        s += _text(bx + bw / 2, by + bh / 2 - 14, f"PE{i}",
+                   size=14, fill=PE_TEXT_COLORS[i], weight="bold")
+        s += _text(bx + bw / 2, by + bh / 2 + 6, f"({M}×{K})",
+                   size=11, fill=PE_TEXT_COLORS[i])
+        s += _text(bx + bw / 2, by + bh / 2 + 22, f"{_format_bytes(full_bytes)}",
+                   size=10, fill=PE_TEXT_COLORS[i])
+        s += _text(bx + bw / 2, by + bh / 2 + 36, "offset=0",
+                   size=9, fill=PE_TEXT_COLORS[i])
+
+    s += _info_box(60, 450, [  # the two f-strings below concatenate into ONE summary line
+        f"Strategy: replicate  |  Shards: {NUM_PE}  |  Each: {_format_bytes(full_bytes)}"
+        f"  |  Total mem: {_format_bytes(full_bytes * NUM_PE)}",
+    ])
+    s += _svg_footer()
+    return s
+
+
+def gen_tiled(column_major: bool) -> str:
+    """Return the SVG for a 2D tiled placement; column_major=True → tiled_column_major."""
+    name = "tiled_column_major" if column_major else "tiled_row_major"
+    order = "column-major (K first)" if column_major else "row-major (M first)"
+
+    tiles_m = M // TILE_M  # 4
+    tiles_k = K // TILE_K  # 4
+    total_tiles = tiles_m * tiles_k  # 16
+    tile_bytes = TILE_M * TILE_K * ITEMSIZE
+
+    W, H = 820, 620
+    s = _svg_header(W, H, f"Placement: {name}")
+    s += _text(W // 2, 54,
+               f"Tensor ({M}×{K}) fp16, tile=({TILE_M}×{TILE_K})  →  "
+               f"{tiles_m}×{tiles_k}={total_tiles} tiles, {order}",
+               size=11, fill="#475569")
+
+    gx, gy, gw, gh = 80, 90, 400, 400
+    s += _axes(gx, gy, gw, gh)
+
+    # Build tile → PE mapping (round-robin: tile idx % NUM_PE)
+    cells = []
+    idx = 0
+    if column_major:
+        # Despite the name, this branch walks the grid one M row at a time:
+        # the K index (ki) varies fastest, so t0..t3 fill row 0 left to right.
+        # NOTE(review): said to mirror the placement implementation's loop
+        # order (for mi in tiles_m: for ki in tiles_k) — confirm against it.
+        for mi in range(tiles_m):
+            for ki in range(tiles_k):
+                pe = idx % NUM_PE
+                row_bytes = K * ITEMSIZE
+                offset = (mi * TILE_M * row_bytes) + (ki * TILE_K * ITEMSIZE)
+                cells.append({
+                    "row": mi, "col": ki, "rspan": 1, "cspan": 1,
+                    "pe": pe,
+                    "label": f"t{idx}",
+                    "offset": offset,
+                    "idx": idx,
+                })
+                idx += 1
+    else:
+        # "row-major" here: ki is the outer loop, so mi varies fastest (column by column)
+        for ki in range(tiles_k):
+            for mi in range(tiles_m):
+                pe = idx % NUM_PE
+                row_bytes = K * ITEMSIZE
+                offset = (mi * TILE_M * row_bytes) + (ki * TILE_K * ITEMSIZE)
+                cells.append({
+                    "row": mi, "col": ki, "rspan": 1, "cspan": 1,
+                    "pe": pe,
+                    "label": f"t{idx}",
+                    "offset": offset,
+                    "idx": idx,
+                })
+                idx += 1
+
+    s += _draw_grid(gx, gy, gw, gh, cells, rows=tiles_m, cols=tiles_k)
+
+    # K-range labels below the grid (y = gy + gh + 16)
+    cw = gw / tiles_k
+    for ki in range(tiles_k):
+        cx = gx + ki * cw + cw / 2
+        s += _text(cx, gy + gh + 16, f"k={ki * TILE_K}..{(ki + 1) * TILE_K - 1}",
+                   size=9, fill="#475569")
+
+    # M-range labels to the left of the grid
+    ch = gh / tiles_m
+    for mi in range(tiles_m):
+        cy = gy + mi * ch + ch / 2
+        s += _text(gx - 16, cy, f"m={mi * TILE_M}..{(mi + 1) * TILE_M - 1}",
+                   size=9, anchor="end", fill="#475569")
+
+    s += _legend(540, 90)
+
+    # Assignment table: one row per tile, in assignment (idx) order
+    table_y = 310
+    s += _text(540, table_y, "Tile Assignment Order", size=12, weight="bold")
+    # Sort cells by idx for table
+    sorted_cells = sorted(cells, key=lambda c: c["idx"])
+    for i, c in enumerate(sorted_cells):
+        ty = table_y + 18 + i * 16
+        if ty > H - 20:  # stop once a row would run past the bottom of the canvas
+            break
+        pe = c["pe"]
+        s += _rect(540, ty - 10, 12, 12, PE_COLORS[pe])
+        s += _text(558, ty,
+                   f"t{c['idx']:>2d} → PE{pe}  ({c['row']},{c['col']})"
+                   f"  off={_format_bytes(c['offset'])}",
+                   size=9, anchor="start", fill="#334155")
+
+    s += _info_box(80, H - 60, [  # the two f-strings below concatenate into ONE summary line
+        f"Strategy: {name}  |  Tile: ({TILE_M}×{TILE_K})={_format_bytes(tile_bytes)}"
+        f"  |  Tiles: {total_tiles}  |  Total: {_format_bytes(M * K * ITEMSIZE)}",
+    ])
+    s += _svg_footer()
+    return s
+
+
+# ── Main ────────────────────────────────────────────────────────────
+
+def main() -> None:  # render every placement diagram and write it under OUT_DIR
+    OUT_DIR.mkdir(parents=True, exist_ok=True)
+
+    diagrams = {  # output file name → SVG text (all generated before anything is written)
+        "placement_column_wise.svg": gen_column_wise(),
+        "placement_row_wise.svg": gen_row_wise(),
+        "placement_replicate.svg": gen_replicate(),
+        "placement_tiled_column_major.svg": gen_tiled(column_major=True),
+        "placement_tiled_row_major.svg": gen_tiled(column_major=False),
+    }
+
+    for name, svg in diagrams.items():
+        path = OUT_DIR / name
+        path.write_text(svg, encoding="utf-8")  # replaces any existing file of that name
+        print(f"  wrote {path}")
+
+    print(f"\nGenerated {len(diagrams)} placement diagrams.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/kernbench/__init__.py b/src/kernbench/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/kernbench/cli/main.py b/src/kernbench/cli/main.py
new file mode 100644
index 0000000..93c724e
--- /dev/null
+++ b/src/kernbench/cli/main.py
@@ -0,0 +1,64 @@
+import argparse
+import sys
+
+from benches.loader import resolve_bench
+from kernbench.cli.probe import cmd_probe
+from kernbench.cli.report import format_report
+from kernbench.common.types import SimEngine
+from kernbench.runtime_api.bench_runner import run_bench
+from kernbench.runtime_api.types import DeviceSelector, resolve_device
+from kernbench.sim_engine.engine import GraphEngine
+from kernbench.topology.builder import resolve_topology
+
+
+def build_parser() -> argparse.ArgumentParser:  # CLI with two subcommands: run, probe
+    p = argparse.ArgumentParser(prog="kernbench")
+    sub = p.add_subparsers(dest="cmd", required=True)  # a subcommand must be given
+
+    runp = sub.add_parser("run", help="Run a benchmark")
+    runp.add_argument("--topology", required=True)
+    runp.add_argument("--bench", required=True)
+    runp.add_argument(
+        "--device", default=None, help="Target device: 'all' or 'sip:<N>' (default: all)"
+    )
+    runp.set_defaults(_handler=cmd_run)  # main() dispatches through args._handler
+
+    probep = sub.add_parser("probe", help="Probe latency and BW for predefined traffic patterns")
+    probep.add_argument("--topology", required=True)
+    probep.add_argument("--case", default="all", help="Case name or 'all' (default: all)")
+    probep.set_defaults(_handler=cmd_probe)
+
+    return p
+
+
+def engine_factory(topology: object, device: DeviceSelector) -> SimEngine:
+    topo_obj = getattr(topology, "topology_obj", topology)  # unwrap if wrapped, else use as-is
+    return GraphEngine(topo_obj)  # NOTE: `device` is accepted but not used here
+
+
+def cmd_run(args) -> int:  # handler for `kernbench run`; the return value is the exit code
+    print("> Running benchmark with:", args)
+
+    topo = resolve_topology(args.topology)
+    bench = resolve_bench(args.bench)
+    device = resolve_device(args.device)
+
+    result = run_bench(topology=topo, bench_fn=bench, device=device, engine_factory=engine_factory)
+
+    topo_obj = getattr(topo, "topology_obj", topo)  # same unwrapping as engine_factory()
+    spec = getattr(topo_obj, "spec", None)  # None when the topology object carries no spec
+    if result.traces:  # the per-trace report is printed only when traces were recorded
+        print(format_report(result.traces, title=args.bench, spec=spec))
+    print(result.summary_text())
+
+    return 0 if result.completion.ok else 1
+
+
+def main(argv=None) -> int:  # argv=None lets argparse fall back to sys.argv[1:]
+    parser = build_parser()
+    args = parser.parse_args(argv)
+    return int(args._handler(args))  # _handler is set per subcommand in build_parser()
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/src/kernbench/cli/probe.py b/src/kernbench/cli/probe.py
new file mode 100644
index 0000000..8e81f12
--- /dev/null
+++ b/src/kernbench/cli/probe.py
@@ -0,0 +1,248 @@
+"""kernbench probe: latency and BW verification utility.
+
+Runs predefined traffic patterns through the simulation engine and reports
+latency, effective bandwidth, bottleneck bandwidth, and utilization for each
+case.  Validates monotonicity invariants across hop counts and access types.
+"""
+from __future__ import annotations
+
+from pathlib import Path
+
+from kernbench.policy.address.phyaddr import PhysAddr
+from kernbench.policy.routing.router import AddressResolver, PathRouter
+from kernbench.runtime_api.kernel import MemoryWriteMsg, PeDmaMsg
+from kernbench.sim_engine.engine import GraphEngine
+from kernbench.topology.builder import load_topology
+from kernbench.topology.types import TopologyGraph
+
+
+# -- Helpers ----------------------------------------------------------
+
+
+def _hbm_pa(sip: int, cube: int, pe_id: int, spec: dict) -> int:
+    """Encode PhysAddr.pe_hbm_addr for the given sip/cube/PE at local offset 0x1000 (rack 0)."""
+    layout = spec["cube"]["memory_map"]
+    slice_size = layout["hbm_total_gb_per_cube"] * (1 << 30) // layout["hbm_slices_per_cube"]
+    return PhysAddr.pe_hbm_addr(
+        rack_id=0, sip_id=sip, cube_id=cube, pe_id=pe_id,
+        pe_local_hbm_offset=0x1000, slice_size_bytes=slice_size,
+    ).encode()
+
+
+def _build_edge_map(graph: TopologyGraph) -> dict[tuple[str, str], object]:
+    return {(edge.src, edge.dst): edge for edge in graph.edges}  # later duplicates win
+
+
+def _formula_breakdown(
+    path: list[str], nbytes: int, edge_map: dict, graph: TopologyGraph,
+) -> tuple[float, float, float, float]:
+    """Return (wire_ns, overhead_ns, drain_ns, formula_ns) for a path.
+
+    wire = sum of edge distance_mm * ns_per_mm, overhead = sum of node
+    ``overhead_ns`` attrs, drain = nbytes / smallest truthy edge bw_gbs, and
+    formula is the sum of the three.
+    """
+    ns_per_mm = graph.spec.get("system", {}).get("ns_per_mm", 0.01)
+    # Hops missing from edge_map and node ids missing from the graph are skipped.
+    hops = [edge_map.get(pair) for pair in zip(path, path[1:])]
+    edges = [edge for edge in hops if edge]
+    wire_ns = sum((edge.distance_mm * ns_per_mm for edge in edges), 0.0)
+    nodes = [graph.nodes.get(node_id) for node_id in path]
+    overhead_ns = sum((float(n.attrs.get("overhead_ns", 0.0)) for n in nodes if n), 0.0)
+    rates = [edge.bw_gbs for edge in edges if edge.bw_gbs]
+    drain_ns = nbytes / min(rates) if rates else 0.0
+    return wire_ns, overhead_ns, drain_ns, wire_ns + overhead_ns + drain_ns
+
+
+def _bottleneck_bw(path: list[str], edge_map: dict) -> float | None:
+    """Per-request bottleneck: single request uses one connection.
+
+    Returns the smallest truthy ``bw_gbs`` among the path's consecutive hops
+    found in ``edge_map``, or None when no hop contributes a bandwidth.
+    """
+    hops = (edge_map.get(pair) for pair in zip(path, path[1:]))
+    return min((hop.bw_gbs for hop in hops if hop and hop.bw_gbs), default=None)
+
+
+
+def _fmt_bw(bw: float | None) -> str:
+    return "-" if bw is None else f"{bw:.1f}"  # one decimal place; "-" stands for "no value"
+
+
+def _fmt_util(eff: float, bn: float | None) -> str:
+    """Format eff/bn as a percentage with one decimal; "-" when bn is None or <= 0."""
+    unusable = bn is None or bn <= 0
+    return "-" if unusable else f"{eff / bn * 100:.1f}%"
+
+
+def _short_name(node_id: str) -> str:
+    """Return the last two dot-separated segments of ``node_id`` (xbar.pe0, not just pe0)."""
+    # rsplit with maxsplit=2 leaves ids that have fewer than two dots in one piece per segment.
+    return ".".join(node_id.rsplit(".", 2)[-2:])
+
+
+def _short_path(path: list[str]) -> str:
+    return " -> ".join(map(_short_name, path))  # each node shortened by _short_name
+
+
+# -- Probe runner -----------------------------------------------------
+
+
+def run_probe(topology_path: str, case_filter: str | None = None) -> int:
+    """Run the probe cases selected by ``case_filter``, print their tables, and return 0."""
+    path = Path(topology_path).expanduser().resolve()
+    graph = load_topology(path)
+    edge_map = _build_edge_map(graph)
+    spec = graph.spec
+    resolver = AddressResolver(graph)
+    router = PathRouter(graph)
+    # Every case moves one 4096-byte payload; None or "all" selects every case.
+    nbytes = 4096
+    show_all = case_filter is None or case_filter == "all"
+
+    # === H2D Write ===
+    h2d_cases = [
+        ("h2d-1hop", 0, 1),
+        ("h2d-2hop", 4, 2),
+        ("h2d-3hop", 8, 3),
+        ("h2d-4hop", 12, 4),
+    ]
+    h2d_results: list[tuple[str, int, float, float, float | None]] = []
+    h2d_paths: list[tuple[str, list[str], list[str], list[str]]] = []
+
+    print()
+    print("=== H2D Write Latency (IO->HBM, varying hop count) ===")
+    print(f"  {'Case':<14} {'Target':<16} {'Hops':>4}   {'Actual':>8}"
+          f"  {'Ovhd':>6} {'Drain':>6} {'Wire':>5} {'Ovhd%':>6} {'Drain%':>7}"
+          f"   {'Eff.BW':>8}   {'BN.BW':>8}   {'Util%':>6}")
+    print("  " + "-" * 115)
+
+    for name, cube, hops in h2d_cases:
+        if not show_all and case_filter != name:
+            continue
+        engine = GraphEngine(graph)
+        pa = _hbm_pa(sip=0, cube=cube, pe_id=0, spec=spec)
+        msg = MemoryWriteMsg(
+            correlation_id="probe", request_id=name,
+            dst_sip=0, dst_cube=cube, dst_pe=0,
+            dst_pa=pa, nbytes=nbytes, pattern="zero",
+        )
+        h = engine.submit(msg)
+        engine.wait(h)
+        _, trace = engine.get_completion(h)
+        total_ns = trace["total_ns"]
+        eff_bw = nbytes / total_ns if total_ns > 0 else 0.0
+        # Rebuild the route by hand: PCIe EP -> IO CPU -> M CPU -> destination node.
+        pa_obj = PhysAddr.decode(pa)
+        dst_node = resolver.resolve(pa_obj)
+
+        pcie_ep = resolver.find_pcie_ep(0)
+        io_cpu = resolver.find_io_cpu(0)
+        m_cpu = resolver.find_m_cpu(0, cube)
+        leg1 = router.find_node_path(pcie_ep, io_cpu)
+        leg2 = router.find_node_path(io_cpu, m_cpu)
+        leg3 = router.find_mcpu_dma_path(m_cpu, dst_node)
+        full_path = leg1 + leg2[1:] + leg3[1:]
+        bn_bw = _bottleneck_bw(full_path, edge_map)
+        # Forward path breakdown only (response path is implicit in actual_ns)
+        fwd_path = leg1 + leg2[1:] + leg3[1:]
+        wire, ovhd, drain, formula = _formula_breakdown(fwd_path, nbytes, edge_map, graph)
+
+        ovhd_pct = ovhd / total_ns * 100 if total_ns > 0 else 0
+        drain_pct = drain / total_ns * 100 if total_ns > 0 else 0
+
+        h2d_results.append((name, hops, total_ns, eff_bw, bn_bw))
+        h2d_paths.append((name, leg1, leg2, leg3))
+        print(f"  {name:<14} cube{cube}.pe0{'':<8} {hops:>4}   {total_ns:>8.2f}"
+              f"  {ovhd:>6.1f} {drain:>6.1f} {wire:>5.2f} {ovhd_pct:>5.1f}% {drain_pct:>5.1f}%"
+              f"   {eff_bw:>8.2f}   {_fmt_bw(bn_bw):>8}   {_fmt_util(eff_bw, bn_bw):>6}")
+    # Latency must rise strictly from each executed case to the next one in list order.
+    if len(h2d_results) >= 2:
+        lats = [r[2] for r in h2d_results]
+        mono = all(lats[i] < lats[i + 1] for i in range(len(lats) - 1))
+        sym = "[v]" if mono else "[x]"
+        print(f"  {sym} Monotonic increase: {'PASS' if mono else 'FAIL'}")
+
+    if h2d_paths:
+        print()
+        print("  Route Details:")
+        print(f"  {'Case':<14} {'Leg':>4}  Path")
+        print("  " + "-" * 80)
+        for name, leg1, leg2, leg3 in h2d_paths:
+            print(f"  {name:<14} {'L1':>4}  {_short_path(leg1)}")
+            print(f"  {'':<14} {'L2':>4}  {_short_path(leg2)}")
+            print(f"  {'':<14} {'L3':>4}  {_short_path(leg3)}")
+
+    # === PE DMA → HBM (direct PE-level injection) ===
+    # (name, sip, src_cube, src_pe, dst_cube, dst_pe)
+    pe_cases = [
+        ("pe-local-hbm", 0, 0, 0, 0, 0),        # pe0 → slice0 (local, 256 GB/s)
+        ("pe-same-half-hbm", 0, 0, 0, 0, 1),     # pe0 → slice1 (xbar chain, 128 GB/s)
+        ("pe-cross-half-hbm", 0, 0, 0, 0, 4),    # pe0 → slice4 (xbar chain, 128 GB/s)
+        ("pe-cross-cube-hbm", 0, 0, 0, 1, 0),    # cube0.pe0 → cube1.slice0 (NOC, 128 GB/s)
+    ]
+    pe_results: list[tuple[str, float, float, float | None]] = []
+    pe_paths: list[tuple[str, list[str]]] = []
+
+    print()
+    print("=== PE DMA Latency (pe_dma -> xbar -> HBM, direct injection) ===")
+    print(f"  {'Case':<22} {'Target':<28} {'Actual':>8}"
+          f"  {'Ovhd':>6} {'Drain':>6} {'Wire':>5} {'Ovhd%':>6} {'Drain%':>7}"
+          f"   {'Eff.BW':>8}   {'BN.BW':>8}   {'Util%':>6}")
+    print("  " + "-" * 120)
+
+    for name, sip, src_cube, src_pe, dst_cube, dst_pe in pe_cases:
+        if not show_all and case_filter != name:
+            continue
+        engine = GraphEngine(graph)
+        dst_pa = _hbm_pa(sip=sip, cube=dst_cube, pe_id=dst_pe, spec=spec)
+        msg = PeDmaMsg(
+            correlation_id="probe", request_id=name,
+            src_sip=sip, src_cube=src_cube, src_pe=src_pe,
+            dst_pa=dst_pa, nbytes=nbytes,
+        )
+        h = engine.submit(msg)
+        engine.wait(h)
+        _, trace = engine.get_completion(h)
+        total_ns = trace["total_ns"]
+        eff_bw = nbytes / total_ns if total_ns > 0 else 0.0
+        # Re-derive the DMA route from the source PE to the node that dst_pa resolves to.
+        pe_ref = f"sip{sip}.cube{src_cube}.pe{src_pe}"
+        pa_obj = PhysAddr.decode(dst_pa)
+        dst_node = resolver.resolve(pa_obj)
+        dma_path = router.find_path(pe_ref, dst_node)
+        bn_bw = _bottleneck_bw(dma_path, edge_map)
+
+        wire, ovhd, drain, formula = _formula_breakdown(dma_path, nbytes, edge_map, graph)
+
+        ovhd_pct = ovhd / total_ns * 100 if total_ns > 0 else 0
+        drain_pct = drain / total_ns * 100 if total_ns > 0 else 0
+
+        target_str = f"c{src_cube}.pe{src_pe}->c{dst_cube}.slice{dst_pe}"
+        pe_results.append((name, total_ns, eff_bw, bn_bw))
+        pe_paths.append((name, dma_path))
+        print(f"  {name:<22} {target_str:<28} {total_ns:>8.2f}"
+              f"  {ovhd:>6.1f} {drain:>6.1f} {wire:>5.2f} {ovhd_pct:>5.1f}% {drain_pct:>5.1f}%"
+              f"   {eff_bw:>8.2f}   {_fmt_bw(bn_bw):>8}   {_fmt_util(eff_bw, bn_bw):>6}")
+    # Print the bottleneck of the first "local" case next to the first non-local one.
+    if len(pe_results) >= 2:
+        local = [r for r in pe_results if "local" in r[0]]
+        chain = [r for r in pe_results if "local" not in r[0]]
+        if local and chain:
+            print(f"  * Local BN: {_fmt_bw(local[0][3])} GB/s, "
+                  f"Chain/NOC BN: {_fmt_bw(chain[0][3])} GB/s")
+
+    if pe_paths:
+        print()
+        print("  Route Details:")
+        print(f"  {'Case':<22}  Path")
+        print("  " + "-" * 80)
+        for name, dma_path in pe_paths:
+            print(f"  {name:<22}  {_short_path(dma_path)}")
+
+    print()
+    return 0
+
+
+def cmd_probe(args) -> int:
+    return run_probe(args.topology, case_filter=getattr(args, "case", "all"))  # ``probe`` handler
diff --git a/src/kernbench/cli/report.py b/src/kernbench/cli/report.py
new file mode 100644
index 0000000..815501a
--- /dev/null
+++ b/src/kernbench/cli/report.py
@@ -0,0 +1,175 @@
+"""Performance report formatter for bench results."""
+from __future__ import annotations
+
+# Bit width per dtype name; spelling aliases share a width.  Not referenced in this module.
+_DTYPE_BITS: dict[str, int] = {
+    "f16": 16, "fp16": 16, "float16": 16, "bf16": 16,
+    "f32": 32, "fp32": 32, "float32": 32,
+    "i8": 8, "int8": 8, "i16": 16, "int16": 16, "i32": 32, "int32": 32,
+}
+
+
+def format_report(
+    traces: list[dict],
+    title: str = "Benchmark",
+    spec: dict | None = None,
+) -> str:
+    """Format collected traces into a human-readable performance report.
+
+    Traces whose "phase" is "kernel" fill the kernel table; every other trace
+    fills the command table.  spec: topology spec dict for peak TFLOPS / BW extraction.
+    """
+    peak_tflops_f16, peak_hbm_bw_gbs = _extract_peaks(spec)
+    num_pes = _count_pes(spec)
+    lines: list[str] = []
+    title_line = f"-- {title} Performance Report "
+
+    deploy_entries = [t for t in traces if t.get("phase") not in ("kernel",)]
+    kernel_entries = [t for t in traces if t.get("phase") == "kernel"]
+
+    # ── Title ──
+    # Compute max header width for consistent separator lengths
+    _cmd_hdr = (f"{'Cmd':<10} {'Name':<12} {'SIP':>4} {'Cube':>5} {'PE':>4} {'Bytes':>10} "
+                f"{'Lat(ns)':>10} {'Xfer(ns)':>10} {'Proc(ns)':>10} "
+                f"{'BW(GB/s)':>10} {'MinBW':>10} {'Util%':>7}")
+    report_width = len(_cmd_hdr)
+    lines.append(title_line + "-" * max(0, report_width - len(title_line)))
+
+    # ── Command summary ──
+    if deploy_entries:
+        lines.append("")
+        hdr = (f"{'Cmd':<10} {'Name':<12} {'SIP':>4} {'Cube':>5} {'PE':>4} {'Bytes':>10} "
+               f"{'Lat(ns)':>10} {'Xfer(ns)':>10} {'Proc(ns)':>10} "
+               f"{'BW(GB/s)':>10} {'MinBW':>10} {'Util%':>7}")
+        lines.append(hdr)
+        lines.append("-" * len(hdr))
+        for e in deploy_entries:
+            lat = e.get("total_ns", 0.0)
+            nb = e.get("nbytes", 0)
+            sip = e.get("sip", "-")
+            pe = e.get("pe", "-")
+            cube = e.get("cube", "-")
+            cmd = e.get("phase", "deploy")
+            xfer_ns = e.get("xfer_ns", 0.0)
+            proc_ns = lat - xfer_ns if xfer_ns > 0 else 0.0  # latency left after the transfer
+            bw = nb / lat if lat > 0 else 0.0
+            min_bw = nb / xfer_ns if xfer_ns > 0 else 0.0
+            util = (xfer_ns / lat * 100) if lat > 0 and xfer_ns > 0 else 0.0
+            lines.append(
+                f"{cmd:<10} {e.get('name', '?'):<12} {str(sip):>4} {str(cube):>5} {str(pe):>4} {nb:>10} "
+                f"{lat:>10.1f} {xfer_ns:>10.1f} {proc_ns:>10.1f} "
+                f"{bw:>10.1f} {min_bw:>10.1f} {util:>6.1f}%"
+            )
+
+    # ── Kernel summary ──
+    if kernel_entries:
+        lines.append("")
+        k_hdr = (f"{'Phase':<10} {'Name':<12} {'PE':>4} {'E2E(ns)':>10} "
+                 f"{'PE(ns)':>10} {'DMA(ns)':>10} {'Comp(ns)':>10} "
+                 f"{'Bound':<8} {'TFLOPS':>8} {'Peak':>8} {'Util%':>7}")
+        lines.append(k_hdr)
+        lines.append("-" * len(k_hdr))
+        for e in kernel_entries:
+            e2e_ns = e.get("total_ns", 0.0)
+            pe_ns = e.get("pe_exec_ns", e2e_ns)
+            dma_ns = e.get("dma_ns", 0.0)
+            compute_ns = e.get("compute_ns", 0.0)
+            target_pe = e.get("target_pe", "-")
+            scalars = e.get("scalars", [])
+            pe_str = "all" if target_pe == "all" else str(target_pe)
+            n_active = num_pes if target_pe == "all" else 1
+
+            # Bound indicator based on measured DMA vs compute time
+            if dma_ns > 0 or compute_ns > 0:
+                bound = "memory" if dma_ns >= compute_ns else "compute"
+            else:
+                bound = "-"
+
+            achieved = _calc_tflops(scalars, pe_ns)
+            peak_total = peak_tflops_f16 * n_active
+            util = (achieved / peak_total * 100) if peak_total > 0 else 0.0
+            lines.append(
+                f"{'kernel':<10} {e.get('name', '?'):<12} {pe_str:>4} {e2e_ns:>10.1f} "
+                f"{pe_ns:>10.1f} {dma_ns:>10.1f} {compute_ns:>10.1f} "
+                f"{bound:<8} {achieved:>8.3f} {peak_total:>8.1f} {util:>6.1f}%"
+            )
+
+    # ── Per-PE summary ──
+    pe_deploy = _per_pe_deploy(deploy_entries)
+    if len(pe_deploy) > 1:  # only shown when deploy traces cover more than one PE key
+        lines.append("")
+        pe_title = (f"-- Per-PE Summary (peak: {peak_tflops_f16:.1f} TFLOPS/PE, "
+                     f"{peak_hbm_bw_gbs:.0f} GB/s HBM BW) ")
+        pe_hdr = (f"{'PE':>4}  {'Deploy(ns)':>10}  {'BW(GB/s)':>10}  {'BW Util':>8}  "
+                   f"{'Kernel(ns)':>10}  {'TFLOPS':>8}  {'Util':>7}")
+        pe_width = max(len(pe_title), len(pe_hdr))
+        lines.append(pe_title + "-" * max(0, pe_width - len(pe_title)))
+        lines.append(pe_hdr)
+        lines.append("-" * pe_width)
+        # NOTE(review): time is summed over all kernels, scalars come from the first — confirm.
+        k_ns = sum(e.get("pe_exec_ns", e.get("total_ns", 0.0)) for e in kernel_entries)
+        k_scalars = kernel_entries[0].get("scalars", []) if kernel_entries else []
+        n_active = len(pe_deploy)
+        total_achieved = _calc_tflops(k_scalars, k_ns)
+        per_pe_tflops = total_achieved / n_active if n_active > 0 else 0.0
+        pe_util = (per_pe_tflops / peak_tflops_f16 * 100) if peak_tflops_f16 > 0 else 0.0
+
+        for pe_id in sorted(pe_deploy):
+            d_ns, d_bytes = pe_deploy[pe_id]
+            d_bw = d_bytes / d_ns if d_ns > 0 else 0.0
+            d_util = (d_bw / peak_hbm_bw_gbs * 100) if peak_hbm_bw_gbs > 0 else 0.0
+            lines.append(
+                f"{pe_id:>4}  {d_ns:>10.1f}  {d_bw:>10.1f}  {d_util:>7.1f}%  "
+                f"{k_ns:>10.1f}  {per_pe_tflops:>8.3f}  {pe_util:>6.1f}%"
+            )
+        lines.append("")
+
+    return "\n".join(lines)
+
+
+def _extract_peaks(spec: dict | None) -> tuple[float, float]:
+    """Extract peak TFLOPS (f16) and HBM BW (GB/s) from spec.
+
+    Reads cube.pe_template.components.pe_gemm.attrs.peak_tflops_f16 and
+    cube.links.xbar_to_hbm_bw_gbs; a None spec or a missing key yields 0.0.
+    """
+    if spec is None:
+        return 0.0, 0.0
+    cube_cfg = spec.get("cube", {})
+    gemm = cube_cfg.get("pe_template", {}).get("components", {}).get("pe_gemm", {})
+    peak = gemm.get("attrs", {}).get("peak_tflops_f16", 0.0)
+    return float(peak), float(cube_cfg.get("links", {}).get("xbar_to_hbm_bw_gbs", 0.0))
+
+
+def _count_pes(spec: dict | None) -> int:
+    """Return pe_per_corner * number of corners from cube.pe_layout (8 for a None spec)."""
+    if spec is None:
+        return 8
+    layout = spec.get("cube", {}).get("pe_layout", {})
+    # Missing keys fall back to 2 PEs per corner and the four corners NW/NE/SW/SE.
+    corner_names = layout.get("corners", ["NW", "NE", "SW", "SE"])
+    return layout.get("pe_per_corner", 2) * len(corner_names)
+
+
+def _calc_tflops(scalars: list, latency_ns: float) -> float:
+    """Return achieved TFLOPS for scalar args [M, K, N, ...] over ``latency_ns``."""
+    if len(scalars) < 3 or latency_ns <= 0:
+        return 0.0
+    m, k, n = scalars[:3]
+    # 2*M*K*N floating-point operations, divided by seconds, scaled to tera.
+    return 2.0 * m * k * n / (latency_ns * 1e-9) / 1e12
+
+
+def _per_pe_deploy(deploy_entries: list[dict]) -> dict[int, tuple[float, int]]:
+    """Aggregate deploy latency and bytes per PE.
+
+    Entries are grouped by their "pe" key (0 when absent); each group's
+    "total_ns" and "nbytes" values are summed in entry order.
+    """
+    totals: dict[int, tuple[float, int]] = {}
+    for entry in deploy_entries:
+        pe_key = entry.get("pe", 0)
+        lat, nb = entry.get("total_ns", 0.0), entry.get("nbytes", 0)
+        prev = totals.get(pe_key)
+        totals[pe_key] = (lat, nb) if prev is None else (prev[0] + lat, prev[1] + nb)
+    return totals
diff --git a/src/kernbench/common/__init__.py b/src/kernbench/common/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/kernbench/common/pe_commands.py b/src/kernbench/common/pe_commands.py
new file mode 100644
index 0000000..d1d2c39
--- /dev/null
+++ b/src/kernbench/common/pe_commands.py
@@ -0,0 +1,150 @@
+"""PE-internal command types and handles (ADR-0014).
+
+Generated by triton_emu (TLContext) and consumed by PE component
+implementations (PE_CPU, PE_SCHEDULER, PE_DMA, PE_GEMM, PE_MATH).
+
+Command lifecycle:
+  Triton kernel → TLContext → [PeCommand list] → PE_CPU → PE_SCHEDULER → engines
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any, Literal
+
+if TYPE_CHECKING:
+    import simpy
+
+
+# ── Handles ───────────────────────────────────────────────────────
+
+
+@dataclass(frozen=True)
+class TensorHandle:
+    """Opaque reference to a tensor residing in PE_TCM.
+
+    Returned by tl.load, tl.dot, tl.exp, etc.
+    Carries metadata for command generation; ``data`` is reserved for a future
+    validate mode (numpy array) and is left out of ``==`` and ``hash``.
+    """
+
+    id: str
+    pa: int                          # physical address in HBM/TCM
+    shape: tuple[int, ...]
+    dtype: str
+    nbytes: int                      # total byte size
+    data: object = field(default=None, compare=False)  # payload; unhashable arrays stay usable
+
+
+@dataclass(frozen=True)
+class CompletionHandle:
+    """Opaque handle for a non-blocking composite command.
+
+    Returned by tl.composite, consumed by tl.wait.
+    """
+
+    id: str  # the only field, so two handles are equal (and hash alike) when ids match
+
+
+# ── PE Commands ───────────────────────────────────────────────────
+
+
+@dataclass(frozen=True)
+class DmaReadCmd:
+    """DMA READ: HBM → PE_TCM."""
+
+    handle: TensorHandle  # tensor the read belongs to — presumably the TCM-side result; confirm
+    src_pa: int           # address read from (the HBM side, per the docstring direction)
+    nbytes: int           # transfer size in bytes — presumably equals handle.nbytes; confirm
+
+
+@dataclass(frozen=True)
+class DmaWriteCmd:
+    """DMA WRITE: PE_TCM → HBM."""
+
+    handle: TensorHandle  # tensor being written out — presumably the TCM-side source; confirm
+    dst_pa: int           # address written to (the HBM side, per the docstring direction)
+    nbytes: int           # transfer size in bytes — presumably equals handle.nbytes; confirm
+
+
+@dataclass(frozen=True)
+class GemmCmd:
+    """GEMM engine command: matrix multiply on TCM data.
+
+    out = a @ b, all operands in TCM.
+    """
+
+    a: TensorHandle
+    b: TensorHandle
+    out: TensorHandle
+    m: int  # NOTE(review): presumably a is (m, k), b is (k, n), out is (m, n) — confirm
+    k: int
+    n: int
+
+
+@dataclass(frozen=True)
+class MathCmd:
+    """MATH engine command: unary/binary/reduction on TCM data.
+
+    op: "exp", "log", "sqrt", "abs", "sigmoid", "cos", "sin",
+        "add", "sub", "mul", "div", "where",
+        "sum", "max", "min"
+    """
+
+    op: str                          # plain str: the names listed above are not enforced here
+    inputs: tuple[TensorHandle, ...]  # operand count presumably follows op arity — confirm
+    out: TensorHandle
+    axis: int | None = None          # for reductions
+
+
+@dataclass(frozen=True)
+class CompositeCmd:
+    """Composite command: tiled pipeline of DMA_READ + COMPUTE + DMA_WRITE.
+
+    Non-blocking — submitted to PE_SCHEDULER which manages tile splitting
+    and pipeline overlaps (ADR-0014 D3.2).
+    """
+
+    completion: CompletionHandle     # handle a later WaitCmd can name
+    op: Literal["gemm", "math"]
+    a: TensorHandle
+    b: TensorHandle | None           # may be None — presumably for single-operand math; confirm
+    out_pa: int
+    out_nbytes: int
+    math_op: str | None = None       # for op="math": which math operation
+
+
+@dataclass(frozen=True)
+class WaitCmd:
+    """Wait for one composite (``handle`` set) or for all pending composites (``handle`` None)."""
+
+    handle: CompletionHandle | None = None  # None = wait all
+
+
+@dataclass(frozen=True)
+class PeCpuOverheadCmd:
+    """PE_CPU scalar execution overhead (cycles)."""
+
+    cycles: int  # overhead in cycles; the cycle-to-time conversion is not defined here
+
+
+# Union of every command class above; this is the type of PeInternalTxn.command.
+PeCommand = (
+    DmaReadCmd | DmaWriteCmd | GemmCmd | MathCmd
+    | CompositeCmd | WaitCmd | PeCpuOverheadCmd
+)
+
+
+@dataclass
+class PeInternalTxn:
+    """PE-internal message flowing PE_CPU → PE_SCHEDULER → engines.
+
+    Carries a single PeCommand and a completion event.  PE_CPU creates one
+    PeInternalTxn per command during the replay phase and sends it to
+    PE_SCHEDULER, which routes it to the appropriate engine (PE_DMA,
+    PE_GEMM, PE_MATH).  The engine signals ``done`` on completion.
+    """
+
+    command: PeCommand
+    done: simpy.Event  # succeeded when the engine completes this command
+    pe_prefix: str = ""  # e.g. "sip0.cube0.pe0" — needed by PE_DMA for path resolution
+    result_data: dict[str, Any] = field(default_factory=dict)  # fresh empty dict per txn
diff --git a/src/kernbench/common/types.py b/src/kernbench/common/types.py
new file mode 100644
index 0000000..c6ff2bb
--- /dev/null
+++ b/src/kernbench/common/types.py
@@ -0,0 +1,29 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any, NewType, Protocol, TypeAlias
+
+RequestHandle = NewType("RequestHandle", str)
+
+Trace: TypeAlias = Any
+
+
+@dataclass(frozen=True)
+class Completion:
+    ok: bool  # success flag; cmd_run turns it into exit code 0 (ok) or 1 (not ok)
+    error_code: str | None = None  # optional; None by default
+    error_message: str | None = None  # optional; None by default
+
+
+class SimEngine(Protocol):
+    """
+    Backend simulation/runner engine contract.
+
+    Engine must be able to:
+      - accept requests created by RuntimeContext (submit/dispatch)
+      - report completion and optional trace for a given handle
+    """
+    # The probe CLI calls these as submit(request) -> wait(handle) -> get_completion(handle).
+    def get_completion(self, handle: RequestHandle) -> tuple[Completion, Trace | None]: ...
+    def submit(self, request: Any) -> RequestHandle: ...
+    def wait(self, handle: RequestHandle) -> None: ...
diff --git a/src/kernbench/components/__init__.py b/src/kernbench/components/__init__.py
new file mode 100644
index 0000000..b4f2653
--- /dev/null
+++ b/src/kernbench/components/__init__.py
@@ -0,0 +1,4 @@
+from kernbench.components.base import ComponentBase, ComponentRegistry
+from kernbench.components.context import ComponentContext
+
+__all__ = ["ComponentBase", "ComponentRegistry", "ComponentContext"]
diff --git a/src/kernbench/components/base.py b/src/kernbench/components/base.py
new file mode 100644
index 0000000..5d633d8
--- /dev/null
+++ b/src/kernbench/components/base.py
@@ -0,0 +1,167 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from collections.abc import Generator
+from typing import TYPE_CHECKING, Any
+
+import simpy
+
+if TYPE_CHECKING:
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+class ComponentBase(ABC):
+    """Base class for all SimPy component implementations (ADR-0007 D3, ADR-0015).
+
+    Each component corresponds to one node in the compiled topology graph.
+    It models the processing overhead at that node as a SimPy generator,
+    allowing future implementations to add queueing and contention.
+
+    Port model (ADR-0015 D1):
+      in_ports[src_node_id]  — SimPy Store for incoming messages from src
+      out_ports[dst_node_id] — SimPy Store for outgoing messages to dst
+    Ports are wired by GraphEngine at initialization; wire processes model
+    propagation delay between connected ports (ADR-0015 D2).
+
+    Context (ADR-0015 D4):
+      ctx — ComponentContext with router and resolver.
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        self.node = node
+        self.ctx = ctx
+        self.in_ports: dict[str, simpy.Store] = {}  # both port maps start empty
+        self.out_ports: dict[str, simpy.Store] = {}
+
+    def start(self, env: simpy.Environment) -> None:
+        """Called once after all ports are wired.
+
+        Default: starts a fan-in collector and a generic forwarding worker.
+        The worker calls self.run() for per-component latency, then routes the
+        Transaction to the next hop or signals done (duck-typed; no direct
+        Transaction import to avoid circular dependencies).
+
+        Override in components that need custom fan-out / aggregation logic
+        (e.g. MCpuComponent, IoCpuComponent for kernel launch).
+        """
+        if not self.in_ports:
+            return  # no inputs: neither ``_inbox`` nor any process is created
+        self._inbox: simpy.Store = simpy.Store(env)
+        for port in self.in_ports.values():
+            env.process(self._fan_in(port))  # one relay process per input port
+        env.process(self._worker(env))
+
+    def _fan_in(self, port: simpy.Store) -> Generator:
+        """Relay messages from one in_port into the shared inbox."""
+        while True:
+            msg = yield port.get()
+            yield self._inbox.put(msg)
+
+    def _worker(self, env: simpy.Environment) -> Generator:
+        """Generic forwarding worker: spawns _forward_txn per message (pipeline)."""
+        while True:
+            txn: Any = yield self._inbox.get()
+            env.process(self._forward_txn(env, txn))  # not awaited: loop takes the next message
+
+    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
+        """Apply run() latency, then forward to next hop or drain at terminal."""
+        yield from self.run(env, txn.nbytes)
+        next_hop = txn.next_hop          # duck-typed: Transaction.next_hop
+        if next_hop:
+            yield self.out_ports[next_hop].put(txn.advance())  # KeyError if hop is not wired
+        else:
+            drain = getattr(txn, "drain_ns", 0.0)  # txns without drain_ns skip the wait
+            if drain > 0:
+                yield env.timeout(drain)
+            txn.done.succeed()
+
+    @abstractmethod
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        """SimPy process: yield one or more events for this node's processing.
+
+        Subclasses yield env.timeout(overhead_ns) or compute latency dynamically.
+        Called by _forward_txn and subclass-specific handlers.
+        """
+        ...
+
+
+class PeEngineBase(ComponentBase):
+    """Base class for PE-internal engines (PE_DMA, PE_GEMM, PE_MATH).
+
+    Provides:
+      - ``_pe_prefix``: extracted from node.id (e.g. "sip0.cube0.pe0")
+      - Dual-message ``_worker``: dispatches PeInternalTxn to
+        ``handle_command()`` and Transaction to inherited ``_forward_txn()``.
+      - ``init_resources(env)``: hook for subclass resource initialization,
+        called by ``start()`` before the worker is spawned.
+
+    Subclass contract:
+      1. Override ``handle_command(env, pe_txn)`` — process a PeInternalTxn.
+      2. Override ``run(env, nbytes)`` — yield component latency.
+      3. Optionally override ``init_resources(env)`` for DMA channels, etc.
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        super().__init__(node, ctx)
+        self._pe_prefix: str = node.id.rsplit(".", 1)[0]  # node id minus its last dot-segment
+
+    def start(self, env: simpy.Environment) -> None:
+        self.init_resources(env)  # runs even when the base start() returns early (no in_ports)
+        super().start(env)
+
+    def init_resources(self, env: simpy.Environment) -> None:
+        """Hook for subclass resource initialization. Called before worker spawn."""
+
+    def _worker(self, env: simpy.Environment) -> Generator:
+        """Dual-message dispatch: PeInternalTxn → handle_command, Transaction → _forward_txn."""
+        from kernbench.common.pe_commands import PeInternalTxn  # function-local import
+
+        while True:
+            msg: Any = yield self._inbox.get()
+            if isinstance(msg, PeInternalTxn):
+                env.process(self.handle_command(env, msg))
+            else:
+                env.process(self._forward_txn(env, msg))  # anything else is handled as a Transaction
+
+    @abstractmethod
+    def handle_command(self, env: simpy.Environment, pe_txn: Any) -> Generator:
+        """Process a PE-internal command (PeInternalTxn).
+
+        Subclass must:
+          - Perform engine-specific work (acquire resources, compute, etc.)
+          - Call ``pe_txn.done.succeed()`` on completion.
+        """
+        ...
+
+
+class ComponentRegistry:
+    """Dependency-injection table from ``node.impl`` strings to ComponentBase subclasses.
+
+    ``create(node, overrides, ctx)`` looks the impl up in this order:
+      1. overrides[node.impl]   — class injected by the caller
+      2. _registry[node.impl]   — class registered globally via ``register``
+      3. ValueError              — there is no fallback implementation
+    """
+
+    _registry: dict[str, type[ComponentBase]] = {}
+
+    @classmethod
+    def register(cls, impl: str, component_cls: type[ComponentBase]) -> None:
+        """Bind ``impl`` to ``component_cls``, replacing any earlier binding."""
+        cls._registry.update({impl: component_cls})
+
+    @classmethod
+    def create(
+        cls,
+        node: Node,
+        overrides: dict[str, type[ComponentBase]] | None = None,
+        ctx: ComponentContext | None = None,
+    ) -> ComponentBase:
+        """Instantiate the class bound to ``node.impl`` as ``component_cls(node, ctx)``."""
+        for table in (overrides or {}, cls._registry):
+            if node.impl in table:
+                return table[node.impl](node, ctx)
+        message = f"No component registered for impl '{node.impl}' (node: {node.id}). "
+        message += "Register it in kernbench.components.impls.__init__."
+        raise ValueError(message)
diff --git a/src/kernbench/components/context.py b/src/kernbench/components/context.py
new file mode 100644
index 0000000..98a6f93
--- /dev/null
+++ b/src/kernbench/components/context.py
@@ -0,0 +1,52 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+import simpy
+
+from kernbench.policy.routing.router import AddressResolver, PathRouter
+
+
+@dataclass
+class ComponentContext:
+    """Topology services handed to every component implementation.
+
+    Components that route or resolve addresses (IoCpuComponent, MCpuComponent, …)
+    need it; TransitComponent ignores ctx.
+
+    Supplied through ComponentRegistry.create(node, overrides, ctx=ctx).
+    """
+
+    router: PathRouter
+    resolver: AddressResolver
+    positions: dict[str, tuple[float, float] | None]  # node_id → pos_mm
+    ns_per_mm: float  # wire propagation constant (from topology spec)
+    edge_map: dict[tuple[str, str], Any] = field(default_factory=dict)
+    spec: dict = field(default_factory=dict)  # topology spec (cube layout, PE count, etc.)
+
+    def get_shared_resource(
+        self, env: simpy.Environment, key: str, capacity: int = 1,
+    ) -> simpy.Resource:
+        """Return the SimPy Resource stored under ``key``, creating it on first use.
+
+        Lets engines of one PE share a resource (e.g. accel_slot shared by
+        PE_GEMM and PE_MATH); scope the key per PE, e.g.
+        "sip0.cube0.pe0.accel_slot".  ``capacity`` only matters on creation.
+        """
+        pool = self.__dict__.setdefault("_shared_resources", {})
+        try:
+            return pool[key]
+        except KeyError:
+            pool[key] = simpy.Resource(env, capacity=capacity)
+            return pool[key]
+
+    def compute_drain_ns(self, path: list[str], nbytes: int) -> float:
+        """Wormhole drain time: nbytes / bottleneck_bw along path."""
+        unbounded = float("inf")
+        hops = (self.edge_map.get(pair) for pair in zip(path, path[1:]))
+        # Hops with no edge entry or no truthy bw_gbs do not constrain the result;
+        # when nothing constrains it, the drain time is 0.0.
+        rates = [hop.bw_gbs for hop in hops if hop and getattr(hop, "bw_gbs", None)]
+        slowest = min([unbounded, *rates])
+        return 0.0 if slowest == unbounded else nbytes / slowest
diff --git a/src/kernbench/components/impls/__init__.py b/src/kernbench/components/impls/__init__.py
new file mode 100644
index 0000000..f4edf9c
--- /dev/null
+++ b/src/kernbench/components/impls/__init__.py
@@ -0,0 +1,54 @@
+"""Concrete component implementations.
+
+Built-in components are registered with ComponentRegistry below, at package import.
+Import this package to activate all built-in implementations.
+"""
+
+from kernbench.components.base import ComponentRegistry
+from kernbench.components.impls.forwarding import TransitComponent
+from kernbench.components.impls.hbm_ctrl import HbmCtrlComponent
+from kernbench.components.impls.io_cpu import IoCpuComponent
+from kernbench.components.impls.m_cpu import MCpuComponent
+from kernbench.components.impls.noc import TwoDMeshNocComponent
+from kernbench.components.impls.pcie_ep import PcieEpComponent
+from kernbench.components.impls.pe_cpu import PeCpuComponent
+from kernbench.components.impls.pe_dma import PeDmaComponent
+from kernbench.components.impls.pe_gemm import PeGemmComponent
+from kernbench.components.impls.pe_math import PeMathComponent
+from kernbench.components.impls.pe_scheduler import PeSchedulerComponent
+from kernbench.components.impls.pe_tcm import PeTcmComponent
+from kernbench.components.impls.sram import SramComponent
+
+ComponentRegistry.register("forwarding_v1", TransitComponent)  # delay-only transit impl
+ComponentRegistry.register("switch_v1", TransitComponent)  # shares TransitComponent
+ComponentRegistry.register("noc_v1", TransitComponent)  # plain NOC: fixed overhead only
+ComponentRegistry.register("noc_2d_mesh_v1", TwoDMeshNocComponent)  # position-aware mesh NOC
+ComponentRegistry.register("ucie_v1", TransitComponent)  # shares TransitComponent
+ComponentRegistry.register("xbar_v1", TransitComponent)  # shares TransitComponent
+ComponentRegistry.register("pcie_ep_v1", PcieEpComponent)
+ComponentRegistry.register("io_cpu_v1", IoCpuComponent)
+ComponentRegistry.register("m_cpu_v1", MCpuComponent)
+ComponentRegistry.register("hbm_ctrl_v1", HbmCtrlComponent)
+ComponentRegistry.register("sram_v1", SramComponent)
+ComponentRegistry.register("pe_cpu_v1", PeCpuComponent)
+ComponentRegistry.register("pe_scheduler_v1", PeSchedulerComponent)
+ComponentRegistry.register("pe_dma_v1", PeDmaComponent)
+ComponentRegistry.register("pe_gemm_v1", PeGemmComponent)
+ComponentRegistry.register("pe_math_v1", PeMathComponent)
+ComponentRegistry.register("pe_tcm_v1", PeTcmComponent)
+
+__all__ = [  # component classes only; ComponentRegistry is not listed here
+    "HbmCtrlComponent",
+    "IoCpuComponent",
+    "MCpuComponent",
+    "PcieEpComponent",
+    "PeCpuComponent",
+    "PeDmaComponent",
+    "PeGemmComponent",
+    "PeMathComponent",
+    "PeSchedulerComponent",
+    "PeTcmComponent",
+    "TransitComponent",
+    "TwoDMeshNocComponent",
+    "SramComponent",
+]
diff --git a/src/kernbench/components/impls/forwarding.py b/src/kernbench/components/impls/forwarding.py
new file mode 100644
index 0000000..1fa8eee
--- /dev/null
+++ b/src/kernbench/components/impls/forwarding.py
@@ -0,0 +1,27 @@
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING
+
+import simpy
+
+from kernbench.components.base import ComponentBase
+
+if TYPE_CHECKING:
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+class TransitComponent(ComponentBase):
+    """Pass-through node (NOC, UCIe, XBAR) that only charges a fixed delay.
+
+    ``run`` waits ``overhead_ns`` (read from ``node.attrs``, default 0.0); moving
+    the Transaction to the next hop is left to the inherited _forward_txn().
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        super().__init__(node, ctx)
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        delay = self.node.attrs.get("overhead_ns", 0.0)
+        yield env.timeout(float(delay))
diff --git a/src/kernbench/components/impls/hbm_ctrl.py b/src/kernbench/components/impls/hbm_ctrl.py
new file mode 100644
index 0000000..3fa21cb
--- /dev/null
+++ b/src/kernbench/components/impls/hbm_ctrl.py
@@ -0,0 +1,101 @@
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING, Any
+
+import simpy
+
+from kernbench.components.base import ComponentBase
+from kernbench.sim_engine.transaction import Transaction
+
+if TYPE_CHECKING:
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+class HbmCtrlComponent(ComponentBase):
+    """Terminal HBM controller with independent read and write channels.
+
+    Reads and writes each get their own simpy.Resource, sized by the node's
+    ``capacity`` attr (default 1), so a read and a write may overlap while
+    requests of the same kind compete for that channel's slots.
+
+    After service, non-PeDmaMsg requests are answered with a ResponseMsg sent
+    along the reversed path; a PeDmaMsg completes ``txn.done`` directly.
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        super().__init__(node, ctx)
+        self._read: simpy.Resource | None = None  # created in start()
+        self._write: simpy.Resource | None = None  # created in start()
+
+    def start(self, env: simpy.Environment) -> None:
+        """Create both channel resources, then start the base component."""
+        slots = int(self.node.attrs.get("capacity", 1))
+        self._read, self._write = (simpy.Resource(env, capacity=slots) for _ in range(2))
+        super().start(env)
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        delay = self.node.attrs.get("overhead_ns", 0.0)
+        yield env.timeout(float(delay))
+
+    def _select_channel(self, txn: Any) -> simpy.Resource:
+        """Pick the write channel for write-type requests, the read channel otherwise."""
+        from kernbench.runtime_api.kernel import MemoryWriteMsg, PeDmaMsg
+
+        assert self._read is not None and self._write is not None
+        msg = txn.request
+        # A write is a MemoryWriteMsg, or a PeDmaMsg whose is_write flag is truthy.
+        wants_write = isinstance(msg, MemoryWriteMsg) or (
+            isinstance(msg, PeDmaMsg) and msg.is_write
+        )
+        return self._write if wants_write else self._read
+
+    def _worker(self, env: simpy.Environment) -> Generator:
+        """Pull txns off the inbox forever, handing each to its own SimPy process."""
+        while True:
+            incoming: Any = yield self._inbox.get()
+            env.process(self._handle_txn(env, incoming))
+
+    def _handle_txn(self, env: simpy.Environment, txn: Any) -> Generator:
+        """Serve one txn: hold its channel for overhead plus drain, then respond."""
+        with self._select_channel(txn).request() as grant:
+            yield grant
+            yield from self.run(env, txn.nbytes)
+            drain_ns = getattr(txn, "drain_ns", 0.0)
+            if drain_ns > 0:
+                yield env.timeout(drain_ns)
+        # The channel has been released by the time the response is sent.
+        yield from self._send_response(env, txn)
+
+    def _send_response(self, env: simpy.Environment, txn: Any) -> Generator:
+        """Complete *txn*: directly, or via a ResponseMsg sent on the reverse path.
+
+        PeDmaMsg requests, paths shorter than two hops, and a falsy ``ctx`` all
+        succeed ``txn.done`` in place; otherwise a response Transaction is emitted.
+        """
+        from kernbench.runtime_api.kernel import PeDmaMsg
+
+        if isinstance(txn.request, PeDmaMsg):
+            txn.done.succeed()
+            return
+
+        reverse_path = list(reversed(txn.path))
+        if len(reverse_path) < 2 or not self.ctx:
+            txn.done.succeed()
+            return
+
+        from kernbench.runtime_api.kernel import ResponseMsg
+        id_fields = self.node.id.split(".")  # field 1 is "cube<N>", field 3 is "slice<N>"
+        cube_no = int(id_fields[1].replace("cube", ""))
+        slice_no = int(id_fields[3].replace("slice", ""))
+        reply = ResponseMsg(
+            correlation_id=txn.request.correlation_id,
+            request_id=txn.request.request_id,
+            src_cube=cube_no, src_pe=slice_no, success=True,
+        )
+        reply_txn = Transaction(
+            request=reply, path=reverse_path, step=0,
+            nbytes=0, done=env.event(), is_response=True,
+        )
+        yield self.out_ports[reverse_path[1]].put(reply_txn.advance())
diff --git a/src/kernbench/components/impls/io_cpu.py b/src/kernbench/components/impls/io_cpu.py
new file mode 100644
index 0000000..ad123a6
--- /dev/null
+++ b/src/kernbench/components/impls/io_cpu.py
@@ -0,0 +1,145 @@
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING, Any
+
+import simpy
+
+from kernbench.components.base import ComponentBase
+from kernbench.sim_engine.transaction import Transaction
+
+if TYPE_CHECKING:
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+class IoCpuComponent(ComponentBase):
+    """IO_CPU component: multi-cube fan-out with response aggregation.
+
+    Forward path:
+      1. Applies overhead_ns processing overhead.
+      2. Resolves target cube(s) from request.target_cubes.
+      3. Fans out sub-Transactions to each routable target cube's M_CPU.
+
+    Response path:
+      Collects ResponseMsg from each M_CPU. When all cube responses are
+      received, succeeds the parent txn.done.
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        super().__init__(node, ctx)
+        # Pending fan-out tracking: request_id → (expected, received, parent_txn_done)
+        self._pending: dict[str, tuple[int, int, simpy.Event]] = {}
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
+        yield env.timeout(overhead_ns)
+
+    def _worker(self, env: simpy.Environment) -> Generator:
+        while True:
+            txn: Any = yield self._inbox.get()
+            if getattr(txn, "is_response", False):
+                self._collect_response(txn)
+            else:
+                yield from self.run(env, txn.nbytes)
+                env.process(self._dispatch_to_m_cpus(env, txn))
+
+    def _collect_response(self, resp_txn: Any) -> None:
+        """Receive a cube response and increment the aggregation counter."""
+        key = resp_txn.request.request_id
+        if key not in self._pending:
+            return
+        expected, received, parent_done = self._pending[key]
+        received += 1
+        if received >= expected:
+            parent_done.succeed()
+            del self._pending[key]
+        else:
+            self._pending[key] = (expected, received, parent_done)
+
+    def _dispatch_to_m_cpus(self, env: simpy.Environment, txn: Any) -> Generator:
+        """Fan *txn* out to the M_CPU of every routable target cube.
+
+        Routes are resolved before the aggregation entry is created, so the
+        expected count equals the sub-Transactions actually sent; counting an
+        unroutable cube would leave ``txn.done`` pending forever.
+        """
+        request = txn.request
+        try:
+            cube_targets = self._resolve_cube_targets(request)
+        except Exception:
+            cube_targets = []  # best-effort: treated like "no targets" below
+        routes: list[list[str]] = []
+        for sip, cube in cube_targets:
+            try:
+                m_cpu_id = self.ctx.resolver.find_m_cpu(sip, cube)
+                path = self.ctx.router.find_node_path(self.node.id, m_cpu_id)
+            except Exception:
+                continue  # best-effort: skip cubes that cannot be routed
+            if len(path) >= 2:
+                routes.append(path)
+        if not routes:
+            txn.done.succeed()  # nothing dispatched, so nothing to wait for
+            return
+
+        self._pending[request.request_id] = (len(routes), 0, txn.done)
+        for path in routes:
+            sub_txn = Transaction(
+                request=request, path=path, step=0,
+                nbytes=txn.nbytes, done=env.event(),
+                result_data=txn.result_data,
+            )
+            yield self.out_ports[path[1]].put(sub_txn.advance())
+
+    def _resolve_cube_targets(self, request: Any) -> list[tuple[int, int]]:
+        """Return list of (sip, cube) pairs to fan out to."""
+        from kernbench.runtime_api.kernel import KernelLaunchMsg, MemoryReadMsg, MemoryWriteMsg
+
+        target_cubes = getattr(request, "target_cubes", "all")
+
+        if isinstance(request, MemoryWriteMsg):
+            sip = request.dst_sip
+            if target_cubes == "all":
+                cube = self._cube_from_pa(request.dst_pa, fallback=request.dst_cube)
+                return [(sip, cube)]
+            return [(sip, c) for c in target_cubes]
+
+        if isinstance(request, MemoryReadMsg):
+            sip = request.src_sip
+            if target_cubes == "all":
+                cube = self._cube_from_pa(request.src_pa, fallback=request.src_cube)
+                return [(sip, cube)]
+            return [(sip, c) for c in target_cubes]
+
+        if isinstance(request, KernelLaunchMsg):
+            my_sip = self._my_sip()
+            if target_cubes != "all":
+                return [(my_sip, c) for c in target_cubes]
+            # "all": derive from tensor shards, filtered to this SIP
+            seen: set[tuple[int, int]] = set()
+            targets: list[tuple[int, int]] = []
+            for arg in request.args:
+                if arg.arg_kind != "tensor":
+                    continue
+                for shard in arg.shards:
+                    if shard.sip != my_sip:
+                        continue
+                    key = (shard.sip, shard.cube)
+                    if key not in seen:
+                        seen.add(key)
+                        targets.append(key)
+            return targets
+
+        return []
+
+    def _cube_from_pa(self, pa_val: int, fallback: int) -> int:
+        """Extract cube_id from a physical address, with fallback."""
+        from kernbench.policy.address.phyaddr import PhysAddr
+        try:
+            return PhysAddr.decode(pa_val).cube_id
+        except Exception:
+            return fallback
+
+    def _my_sip(self) -> int:
+        """Extract this IO_CPU's SIP ID from its node ID (e.g. 'sip0.io0.io_cpu' → 0)."""
+        return int(self.node.id.split(".")[0].replace("sip", ""))
diff --git a/src/kernbench/components/impls/m_cpu.py b/src/kernbench/components/impls/m_cpu.py
new file mode 100644
index 0000000..8bf955e
--- /dev/null
+++ b/src/kernbench/components/impls/m_cpu.py
@@ -0,0 +1,269 @@
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING, Any
+
+import simpy
+
+from kernbench.components.base import ComponentBase
+from kernbench.sim_engine.transaction import Transaction
+
+if TYPE_CHECKING:
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+class MCpuComponent(ComponentBase):
+    """M_CPU component: multi-PE DMA fan-out with response aggregation.
+
+    Forward path (ADR-0015 D5):
+      When a forward Transaction arrives at m_cpu (terminal hop), M_CPU fans out
+      DMA sub-Transactions to target PEs' HBM slices. target_pe on the request
+      controls fan-out: int → single PE, "all" → all PEs in the cube.
+
+    Response path:
+      ResponseMsg from each hbm_ctrl arrives back at m_cpu. Once all PE responses
+      are collected, m_cpu sends an aggregate ResponseMsg on the reverse command
+      path back to io_cpu.
+
+    Transit:
+      When m_cpu is NOT the terminal hop (transit or response relay), the
+      Transaction is forwarded normally to the next hop.
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        super().__init__(node, ctx)
+        # Pending fan-out tracking: request_id → (expected, received, all_done_event)
+        self._pending: dict[str, tuple[int, int, simpy.Event]] = {}
+        # Store parent txn for response sending: request_id → parent_txn
+        self._parent_txns: dict[str, Any] = {}
+        # DMA engine resources (ADR-0015 D5, ADR-0014 D4): capacity=1 each
+        self._dma_write: simpy.Resource | None = None
+        self._dma_read: simpy.Resource | None = None
+
+    def start(self, env: simpy.Environment) -> None:
+        self._dma_write = simpy.Resource(env, capacity=1)
+        self._dma_read = simpy.Resource(env, capacity=1)
+        super().start(env)
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
+        yield env.timeout(overhead_ns)
+
+    def _worker(self, env: simpy.Environment) -> Generator:
+        """Dispatch forward txns, collect response txns."""
+        from kernbench.runtime_api.kernel import KernelLaunchMsg
+
+        while True:
+            txn: Any = yield self._inbox.get()
+            if getattr(txn, "is_response", False):
+                self._collect_response(txn)
+            else:
+                yield from self.run(env, txn.nbytes)
+                next_hop = txn.next_hop
+                if next_hop:
+                    yield self.out_ports[next_hop].put(txn.advance())
+                elif self.ctx is not None and txn.request is not None:
+                    if isinstance(txn.request, KernelLaunchMsg):
+                        env.process(self._kernel_launch_fanout(env, txn))
+                    else:
+                        env.process(self._dma_fanout(env, txn))
+                else:
+                    txn.done.succeed()
+
+    def _collect_response(self, resp_txn: Any) -> None:
+        """Receive a PE response and increment the aggregation counter."""
+        key = resp_txn.request.request_id
+        if key not in self._pending:
+            return
+        expected, received, all_done = self._pending[key]
+        received += 1
+        if received >= expected:
+            all_done.succeed()
+            del self._pending[key]
+        else:
+            self._pending[key] = (expected, received, all_done)
+
+    def _dma_fanout(self, env: simpy.Environment, txn: Any) -> Generator:
+        """Fan out DMA sub-Transactions to target PE(s), wait for responses,
+        then send aggregate response on reverse command path.
+
+        Each DMA transfer acquires the DMA resource (capacity=1 per ADR-0014 D4),
+        so multi-PE fan-out is serialized through the DMA engine.
+        """
+        from kernbench.runtime_api.kernel import MemoryWriteMsg
+
+        request = txn.request
+        target_pe = getattr(request, "target_pe", "all")
+
+        # Resolve routes first so the expected count only covers sub-txns really sent.
+        dma_paths: list[list[str]] = []
+        for dst_node in self._resolve_dma_destinations(request, target_pe):
+            try:
+                dma_path = self.ctx.router.find_mcpu_dma_path(self.node.id, dst_node)
+            except Exception:
+                continue  # best-effort: unroutable destinations are skipped
+            if len(dma_path) >= 2:
+                dma_paths.append(dma_path)
+        if not dma_paths:
+            txn.done.succeed()
+            return
+
+        # Setup aggregation; the DMA engine is selected by operation type
+        all_done = env.event()
+        self._pending[request.request_id] = (len(dma_paths), 0, all_done)
+        self._parent_txns[request.request_id] = txn
+        dma_res = self._dma_write if isinstance(request, MemoryWriteMsg) else self._dma_read
+
+        # Fan out DMA sub-txns (serialized through DMA resource)
+        max_drain_ns = 0.0
+        for dma_path in dma_paths:
+            drain_ns = self.ctx.compute_drain_ns(dma_path, txn.nbytes)
+            max_drain_ns = max(max_drain_ns, drain_ns)
+            sub_txn = Transaction(
+                request=request, path=dma_path, step=0,
+                nbytes=txn.nbytes, done=env.event(),
+                drain_ns=drain_ns,
+            )
+            with dma_res.request() as req:
+                yield req
+                yield self.out_ports[dma_path[1]].put(sub_txn.advance())
+
+        # Wait for all PE responses
+        yield all_done
+        txn.result_data["xfer_ns"] = max_drain_ns
+        del self._parent_txns[request.request_id]
+
+        # Send aggregate response on reverse command path
+        reverse_path = list(reversed(txn.path))
+        if len(reverse_path) >= 2:
+            from kernbench.runtime_api.kernel import ResponseMsg
+
+            parts = self.node.id.split(".")
+            cube_id = int(parts[1].replace("cube", ""))
+            resp_msg = ResponseMsg(
+                correlation_id=request.correlation_id,
+                request_id=request.request_id,
+                src_cube=cube_id, src_pe=-1, success=True,
+            )
+            resp_txn = Transaction(
+                request=resp_msg, path=reverse_path, step=0,
+                nbytes=0, done=env.event(), is_response=True,
+            )
+            yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
+        else:
+            txn.done.succeed()
+
+    def _kernel_launch_fanout(self, env: simpy.Environment, txn: Any) -> Generator:
+        """Fan out KernelLaunchMsg to target PE_CPU(s) via NOC (ADR-0009 D3).
+
+        Routes through find_node_path (M_CPU → NOC → PE_CPU command edges).
+        Waits for sub_txn.done directly — no ResponseMsg needed for PE direction.
+        Then sends aggregate ResponseMsg back to IO_CPU on the reverse path.
+        """
+        request = txn.request
+        target_pe = getattr(request, "target_pe", "all")
+        cube_prefix = self.node.id.rsplit(".", 1)[0]  # e.g. "sip0.cube0"
+        pe_ids = self._resolve_pe_ids(target_pe)
+
+        if not pe_ids:
+            txn.done.succeed()
+            return
+
+        # Fan out to each PE_CPU and collect done events
+        sub_dones: list[simpy.Event] = []
+        sub_txns: list[Transaction] = []
+        for pe_id in pe_ids:
+            pe_cpu_id = f"{cube_prefix}.pe{pe_id}.pe_cpu"
+            try:
+                path = self.ctx.router.find_node_path(self.node.id, pe_cpu_id)
+            except Exception:
+                continue
+            if len(path) < 2:
+                continue
+            sub_done = env.event()
+            sub_txn = Transaction(
+                request=request, path=path, step=0,
+                nbytes=0, done=sub_done,
+            )
+            yield self.out_ports[path[1]].put(sub_txn.advance())
+            sub_dones.append(sub_done)
+            sub_txns.append(sub_txn)
+
+        if not sub_dones:
+            txn.done.succeed()
+            return
+
+        # Wait for all PE_CPUs to complete
+        for sd in sub_dones:
+            yield sd
+
+        # Aggregate PE-internal metrics (max across PEs)
+        pe_exec_values = [st.result_data.get("pe_exec_ns", 0.0) for st in sub_txns]
+        if pe_exec_values:
+            txn.result_data["pe_exec_ns"] = max(pe_exec_values)
+        dma_values = [st.result_data.get("dma_ns", 0.0) for st in sub_txns]
+        if dma_values:
+            txn.result_data["dma_ns"] = max(dma_values)
+        compute_values = [st.result_data.get("compute_ns", 0.0) for st in sub_txns]
+        if compute_values:
+            txn.result_data["compute_ns"] = max(compute_values)
+
+        # Send aggregate response on reverse command path back to IO_CPU
+        reverse_path = list(reversed(txn.path))
+        if len(reverse_path) >= 2:
+            from kernbench.runtime_api.kernel import ResponseMsg
+
+            parts = self.node.id.split(".")
+            cube_id = int(parts[1].replace("cube", ""))
+            resp_msg = ResponseMsg(
+                correlation_id=request.correlation_id,
+                request_id=request.request_id,
+                src_cube=cube_id, src_pe=-1, success=True,
+            )
+            resp_txn = Transaction(
+                request=resp_msg, path=reverse_path, step=0,
+                nbytes=0, done=env.event(), is_response=True,
+            )
+            yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
+        else:
+            txn.done.succeed()
+
+    def _resolve_dma_destinations(self, request: Any, target_pe: int | str) -> list[str]:
+        """Return list of HBM destination node_ids for DMA fan-out.
+
+        PA-based resolution picks the real target cube/slice (possibly a remote cube).
+        """
+        cube_prefix = self.node.id.rsplit(".", 1)[0]  # e.g. "sip0.cube0"
+        if isinstance(target_pe, int):
+            return [f"{cube_prefix}.hbm_ctrl.slice{target_pe}"]
+
+        # PA-based resolution; "is None" (not truthiness) keeps address 0 valid.
+        pa_val = getattr(request, "dst_pa", None)
+        if pa_val is None:
+            pa_val = getattr(request, "src_pa", None)
+        if pa_val is not None:
+            from kernbench.policy.address.phyaddr import PhysAddr
+            try:
+                pa = PhysAddr.decode(pa_val)
+                return [self.ctx.resolver.resolve(pa)]
+            except Exception:
+                pass  # best-effort: fall back to the local-cube fan-out below
+
+        # "all" without PA (KernelLaunch): all slices in local cube
+        n_slices = 8
+        if self.ctx and self.ctx.spec:
+            mm = self.ctx.spec.get("cube", {}).get("memory_map", {})
+            n_slices = mm.get("hbm_slices_per_cube", 8)
+        return [f"{cube_prefix}.hbm_ctrl.slice{i}" for i in range(n_slices)]
+
+    def _resolve_pe_ids(self, target_pe: int | str) -> list[int]:
+        """Return list of PE IDs to fan out to (used by kernel launch fan-out)."""
+        if isinstance(target_pe, int):
+            return [target_pe]
+        # "all": all PEs in local cube
+        n_slices = 8
+        if self.ctx and self.ctx.spec:
+            mm = self.ctx.spec.get("cube", {}).get("memory_map", {})
+            n_slices = mm.get("hbm_slices_per_cube", 8)
+        return list(range(n_slices))
diff --git a/src/kernbench/components/impls/noc.py b/src/kernbench/components/impls/noc.py
new file mode 100644
index 0000000..0c7af1f
--- /dev/null
+++ b/src/kernbench/components/impls/noc.py
@@ -0,0 +1,187 @@
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING, Any
+
+import simpy
+
+from kernbench.components.base import ComponentBase
+
+if TYPE_CHECKING:
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+class TwoDMeshNocComponent(ComponentBase):
+    """2D mesh NOC modeled as a single smart node.
+
+    Latency model:
+      - Traversal latency = ns_per_mm × Manhattan distance between the prev_hop
+        and next_hop positions, split into XY segments, traversed with pipeline.
+      - overhead_ns (from node.attrs) is added once per traversal.
+
+    Contention model:
+      - Each directed XY segment is a simpy.Resource(capacity=1).
+      - Pipeline: next segment's resource is requested before the current
+        segment's timeout completes, so a free downstream segment is acquired
+        immediately (wormhole-style cut-through).
+      - Two transactions sharing a segment (same row or column band) contend.
+
+    Concurrency:
+      - _worker spawns an independent SimPy process per transaction, so the
+        NOC is never serialized at the node level — only at segment resources.
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        super().__init__(node, ctx)
+        self._env: simpy.Environment | None = None  # set in start()
+        self._links: dict[tuple, simpy.Resource] = {}  # link_key → lazily created Resource
+        self._x_grid: list[float] = []  # sorted distinct x positions (see _build_grid)
+        self._y_grid: list[float] = []  # sorted distinct y positions (see _build_grid)
+
+    def start(self, env: simpy.Environment) -> None:
+        self._env = env
+        self._build_grid()
+        super().start(env)
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        yield env.timeout(0)  # zero delay; _route() applies overhead_ns itself
+
+    # ── Grid construction ────────────────────────────────────────────
+
+    def _build_grid(self) -> None:
+        if not self.ctx:
+            return
+        cube_prefix = self.node.id.rsplit(".", 1)[0]  # node id minus its last dotted field
+        xs: set[float] = set()
+        ys: set[float] = set()
+        for node_id, pos in self.ctx.positions.items():
+            if node_id.startswith(cube_prefix + ".") and pos is not None:
+                xs.add(round(pos[0], 2))  # coordinates are bucketed to 2 decimals
+                ys.add(round(pos[1], 2))
+        self._x_grid = sorted(xs)
+        self._y_grid = sorted(ys)
+
+    def _get_link(self, key: tuple) -> simpy.Resource:
+        if key not in self._links:
+            assert self._env is not None
+            self._links[key] = simpy.Resource(self._env, capacity=1)
+        return self._links[key]
+
+    # ── Worker ───────────────────────────────────────────────────────
+
+    def _worker(self, env: simpy.Environment) -> Generator:
+        while True:
+            txn: Any = yield self._inbox.get()
+            env.process(self._route(env, txn))  # one process per txn; no node-level queueing
+
+    def _route(self, env: simpy.Environment, txn: Any) -> Generator:
+        prev_hop = txn.path[txn.step - 1] if txn.step > 0 else None
+        next_hop = txn.next_hop
+        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
+
+        links: list[tuple[tuple, float]] = []
+        if prev_hop and next_hop and self.ctx:
+            src_pos = self.ctx.positions.get(prev_hop)
+            dst_pos = self.ctx.positions.get(next_hop)
+            if src_pos and dst_pos:
+                links = self._xy_links(src_pos, dst_pos)
+
+        if links:
+            yield from self._traverse(env, links, overhead_ns)
+        else:
+            yield env.timeout(overhead_ns)  # no usable positions: charge overhead only
+
+        if next_hop:
+            yield self.out_ports[next_hop].put(txn.advance())
+        else:
+            drain = getattr(txn, "drain_ns", 0.0)
+            if drain > 0:
+                yield env.timeout(drain)
+            txn.done.succeed()
+
+    # ── XY routing and pipelined link traversal ──────────────────────
+
+    def _traverse(
+        self,
+        env: simpy.Environment,
+        links: list[tuple[tuple, float]],
+        overhead_ns: float,
+    ) -> Generator:
+        """Pipeline: request next segment before current timeout finishes."""
+        ns_per_mm = self.ctx.ns_per_mm  # type: ignore[union-attr]
+
+        # Acquire first link
+        first_key, _ = links[0]
+        current_resource = self._get_link(first_key)
+        current_req = current_resource.request()
+        yield current_req
+
+        for i, (_, dist_mm) in enumerate(links):
+            # Request next link before current timeout (pipeline)
+            if i + 1 < len(links):
+                next_key, _ = links[i + 1]
+                next_resource = self._get_link(next_key)
+                next_req = next_resource.request()
+
+            yield env.timeout(dist_mm * ns_per_mm + (overhead_ns if i == 0 else 0.0))
+            current_resource.release(current_req)  # freed before waiting on the next segment
+
+            if i + 1 < len(links):
+                yield next_req  # usually already fulfilled (pipeline)
+                current_resource = next_resource
+                current_req = next_req
+
+    def _xy_links(
+        self,
+        src: tuple[float, float],
+        dst: tuple[float, float],
+    ) -> list[tuple[tuple, float]]:
+        """XY routing: horizontal segment first, then vertical.
+
+        Returns list of (link_key, dist_mm) pairs, where link_key uniquely
+        identifies a directed segment shared across concurrent transactions.
+        """
+        x0, y0 = src
+        x1, y1 = dst
+        links: list[tuple[tuple, float]] = []
+
+        # Horizontal segment at y≈y0
+        if abs(x0 - x1) > 1e-9:
+            y_band = self._snap(y0, self._y_grid)
+            for xa, xb in self._segments(x0, x1, self._x_grid):
+                d = abs(xb - xa)
+                if d > 1e-9:
+                    lo, hi = (xa, xb) if xa < xb else (xb, xa)
+                    dir_h = "E" if xb > xa else "W"
+                    links.append((("H", round(y_band, 2), round(lo, 2), round(hi, 2), dir_h), d))
+
+        # Vertical segment at x≈x1
+        if abs(y0 - y1) > 1e-9:
+            x_band = self._snap(x1, self._x_grid)
+            for ya, yb in self._segments(y0, y1, self._y_grid):
+                d = abs(yb - ya)
+                if d > 1e-9:
+                    lo, hi = (ya, yb) if ya < yb else (yb, ya)
+                    dir_v = "S" if yb > ya else "N"
+                    links.append((("V", round(x_band, 2), round(lo, 2), round(hi, 2), dir_v), d))
+
+        return links
+
+    @staticmethod
+    def _snap(val: float, grid: list[float]) -> float:
+        if not grid:
+            return val
+        return min(grid, key=lambda g: abs(g - val))  # nearest grid value to val
+
+    @staticmethod
+    def _segments(a: float, b: float, grid: list[float]) -> list[tuple[float, float]]:
+        """Consecutive (p_i, p_{i+1}) pairs covering range [a, b] using grid waypoints."""
+        if abs(a - b) < 1e-9:
+            return []
+        lo, hi = (a, b) if a < b else (b, a)
+        pts = [lo] + [g for g in grid if lo + 1e-9 < g < hi - 1e-9] + [hi]
+        pairs = [(pts[i], pts[i + 1]) for i in range(len(pts) - 1)]
+        if a > b:
+            pairs = [(p2, p1) for p1, p2 in reversed(pairs)]  # flip into travel direction
+        return pairs
diff --git a/src/kernbench/components/impls/pcie_ep.py b/src/kernbench/components/impls/pcie_ep.py
new file mode 100644
index 0000000..53faac0
--- /dev/null
+++ b/src/kernbench/components/impls/pcie_ep.py
@@ -0,0 +1,27 @@
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING
+
+import simpy
+
+from kernbench.components.base import ComponentBase
+
+if TYPE_CHECKING:
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+class PcieEpComponent(ComponentBase):
+    """PCIe endpoint that charges a fixed protocol-handling delay per message.
+
+    ``run`` waits ``overhead_ns`` (read from ``node.attrs``, default 0.0);
+    forwarding is not overridden here and comes from ComponentBase.
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        super().__init__(node, ctx)
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        delay_ns = float(self.node.attrs.get("overhead_ns", 0.0))
+        yield env.timeout(delay_ns)
diff --git a/src/kernbench/components/impls/pe_cpu.py b/src/kernbench/components/impls/pe_cpu.py
new file mode 100644
index 0000000..746856f
--- /dev/null
+++ b/src/kernbench/components/impls/pe_cpu.py
@@ -0,0 +1,154 @@
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING, Any
+
+import simpy
+
+from kernbench.components.base import ComponentBase
+
+if TYPE_CHECKING:
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+class PeCpuComponent(ComponentBase):
+    """PE_CPU: kernel execution controller (Stage 2).
+
+    Two-phase kernel execution (ADR-0014 D1):
+      Phase 1 (compile): look up kernel from registry, run it with TLContext
+                         to generate a PeCommand list.
+      Phase 2 (replay):  iterate commands, dispatch to PE_SCHEDULER via
+                         PeInternalTxn, wait for blocking commands.
+
+    Non-kernel Transactions are forwarded normally.
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        """Derive the PE prefix and the (sip, cube, pe) indices from the dotted node id."""
+        super().__init__(node, ctx)
+
+        def _index_or_zero(extract) -> int:
+            # A missing or non-numeric id field falls back to index 0.
+            try:
+                return int(extract())
+            except (IndexError, ValueError):
+                return 0
+
+        fields = node.id.split(".")
+        self._pe_prefix = node.id.rsplit(".", 1)[0]  # e.g. "sip0.cube0.pe0"
+        self._pe_idx = _index_or_zero(lambda: self._pe_prefix.rsplit("pe", 1)[1])
+        # sip/cube indices are used for multi-SIP/cube shard matching
+        self._sip_idx = _index_or_zero(lambda: fields[0].replace("sip", ""))
+        self._cube_idx = _index_or_zero(lambda: fields[1].replace("cube", ""))
+
+    def _find_shard(self, shards: tuple) -> Any:
+        """Shard matching this PE's (sip, cube, pe); else shards[pe_idx] clamped to the last."""
+        if not shards:  # would otherwise surface as an opaque "tuple index out of range"
+            raise IndexError(f"{self._pe_prefix}: tensor argument has no shards")
+        me, last = (self._sip_idx, self._cube_idx, self._pe_idx), len(shards) - 1
+        return next((s for s in shards if (s.sip, s.cube, s.pe) == me), shards[min(me[2], last)])
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        delay_ns = float(self.node.attrs.get("overhead_ns", 0.0))  # nbytes is not used here
+        yield env.timeout(delay_ns)
+
+    def _worker(self, env: simpy.Environment) -> Generator:
+        """Drain the inbox forever: kernel launches are executed here, the rest forwarded."""
+        from kernbench.runtime_api.kernel import KernelLaunchMsg
+
+        while True:
+            txn: Any = yield self._inbox.get()
+            is_launch = isinstance(getattr(txn, "request", None), KernelLaunchMsg)
+            handler = self._execute_kernel if is_launch else self._forward_txn
+            yield from handler(env, txn)
+
+    def _execute_kernel(self, env: simpy.Environment, txn: Any) -> Generator:
+        """Run a KernelLaunchMsg: trace the kernel into commands, replay them, record timings."""
+        from kernbench.common.pe_commands import (
+            CompositeCmd,
+            PeCpuOverheadCmd,
+            PeInternalTxn,
+            WaitCmd,
+        )
+        from kernbench.triton_emu.registry import get_kernel
+        from kernbench.triton_emu.tl_context import TLContext, run_kernel
+
+        request = txn.request
+
+        # Phase 1: Compile — charge this node's overhead_ns, then trace the kernel into commands
+        yield from self.run(env, 0)
+
+        kernel_fn = get_kernel(request.kernel_ref.name)
+        tl = TLContext(pe_id=self._pe_idx, dispatch_cycles=0)
+
+        # Positional kernel args: "tensor" → PA of this PE's shard, "scalar" → its value.
+        # NOTE(review): any other arg_kind is silently skipped, shifting later args — confirm.
+        kernel_args: list = []
+        for arg in request.args:
+            if arg.arg_kind == "tensor":
+                shard = self._find_shard(arg.shards)
+                kernel_args.append(shard.pa)
+            elif arg.arg_kind == "scalar":
+                kernel_args.append(arg.value)
+
+        run_kernel(kernel_fn, tl, *kernel_args)
+        commands = tl.commands
+
+        # Phase 2: Replay — dispatch commands to PE_SCHEDULER (timed from here, after Phase 1)
+        pe_exec_start = env.now
+        scheduler_id = f"{self._pe_prefix}.pe_scheduler"
+        pending: dict[str, simpy.Event] = {}  # completion_id → done event
+        composite_results: list[dict] = []  # collect result_data from CompositeCmd txns
+
+        for cmd in commands:
+            if isinstance(cmd, PeCpuOverheadCmd):
+                yield env.timeout(cmd.cycles)  # NOTE(review): cycles used directly as sim time
+            elif isinstance(cmd, WaitCmd):
+                if cmd.handle is not None:
+                    evt = pending.pop(cmd.handle.id, None)
+                    if evt:  # ids that are unknown or already waited on are ignored
+                        yield evt
+                else:
+                    # No handle given: wait for every outstanding completion, then forget them
+                    for evt in pending.values():
+                        yield evt
+                    pending.clear()
+            elif isinstance(cmd, CompositeCmd):
+                # Non-blocking: hand to the scheduler and remember the completion event
+                done_evt = env.event()
+                pe_txn = PeInternalTxn(
+                    command=cmd, done=done_evt,
+                    pe_prefix=self._pe_prefix,
+                )
+                composite_results.append(pe_txn.result_data)  # dict reference, read at the end
+                yield self.out_ports[scheduler_id].put(pe_txn)
+                pending[cmd.completion.id] = done_evt
+            else:
+                # Every other command type is blocking: dispatch and wait for completion
+                done_evt = env.event()
+                pe_txn = PeInternalTxn(
+                    command=cmd, done=done_evt,
+                    pe_prefix=self._pe_prefix,
+                )
+                yield self.out_ports[scheduler_id].put(pe_txn)
+                yield done_evt
+
+        # Wait for any completions the kernel never explicitly waited on
+        for evt in pending.values():
+            yield evt
+
+        # Record replay duration (excludes the Phase 1 overhead charged above)
+        txn.result_data["pe_exec_ns"] = env.now - pe_exec_start
+
+        # Sum dma_ns / compute_ns over CompositeCmd results; missing keys count as 0.0
+        total_dma_ns = 0.0
+        total_compute_ns = 0.0
+        for rd in composite_results:
+            total_dma_ns += rd.get("dma_ns", 0.0)
+            total_compute_ns += rd.get("compute_ns", 0.0)
+        txn.result_data["dma_ns"] = total_dma_ns
+        txn.result_data["compute_ns"] = total_compute_ns
+
+        # Signal original Transaction done
+        txn.done.succeed()
diff --git a/src/kernbench/components/impls/pe_dma.py b/src/kernbench/components/impls/pe_dma.py
new file mode 100644
index 0000000..71ce8aa
--- /dev/null
+++ b/src/kernbench/components/impls/pe_dma.py
@@ -0,0 +1,116 @@
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING, Any
+
+import simpy
+
+from kernbench.components.base import PeEngineBase
+from kernbench.sim_engine.transaction import Transaction
+
+if TYPE_CHECKING:
+    from kernbench.common.pe_commands import PeInternalTxn
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+class PeDmaComponent(PeEngineBase):
+    """PE_DMA: dual-channel DMA engine with READ and WRITE resources.
+
+    Each channel is a capacity=1 resource (ADR-0014 D4):
+      - DMA_READ and DMA_WRITE may execute concurrently.
+      - Per channel, command issue is serialized (released after hand-off in handle_command).
+
+    Handles two message types:
+      - Transaction: external fabric messages (PeDmaMsg probes, M_CPU DMA)
+      - PeInternalTxn: PE-internal commands from PE_SCHEDULER
+        (DmaReadCmd → HBM read, DmaWriteCmd → HBM write)
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        super().__init__(node, ctx)
+        self._dma_read: simpy.Resource | None = None  # created in init_resources()
+        self._dma_write: simpy.Resource | None = None
+
+    def init_resources(self, env: simpy.Environment) -> None:
+        self._dma_read = simpy.Resource(env, capacity=1)
+        self._dma_write = simpy.Resource(env, capacity=1)
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        yield env.timeout(0)
+
+    def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
+        """Handle PE-internal DMA command: resolve PA → HBM path → transfer, then signal done."""
+        from kernbench.common.pe_commands import DmaReadCmd, DmaWriteCmd
+        from kernbench.policy.address.phyaddr import PhysAddr
+        from kernbench.runtime_api.kernel import PeDmaMsg
+
+        cmd = pe_txn.command
+        assert self._dma_read is not None and self._dma_write is not None
+
+        # Determine channel, target PA and direction; other commands complete immediately
+        if isinstance(cmd, DmaReadCmd):
+            dma_res, target_pa, is_write = self._dma_read, cmd.src_pa, False
+        elif isinstance(cmd, DmaWriteCmd):
+            dma_res, target_pa, is_write = self._dma_write, cmd.dst_pa, True
+        else:
+            pe_txn.done.succeed()
+            return
+
+        # Resolve PA → HBM node and compute path
+        pa = PhysAddr.decode(target_pa)
+        dst_node = self.ctx.resolver.resolve(pa)
+        path = self.ctx.router.find_path(self._pe_prefix, dst_node)
+        drain_ns = self.ctx.compute_drain_ns(path, cmd.nbytes)
+
+        # Acquire DMA channel (command issue serialization)
+        with dma_res.request() as req:
+            yield req
+            # Create sub-Transaction with PeDmaMsg (HbmCtrl handles it directly)
+            sub_done = env.event()
+            sub_request = PeDmaMsg(
+                correlation_id="pe_internal",
+                request_id=f"dma_{id(pe_txn)}",
+                src_sip=0, src_cube=0, src_pe=0,
+                dst_pa=target_pa, nbytes=cmd.nbytes,
+                is_write=is_write,
+            )
+            sub_txn = Transaction(
+                request=sub_request, path=path, step=0,
+                nbytes=cmd.nbytes, done=sub_done, drain_ns=drain_ns,
+            )
+            # Send to next hop (path[0] is pe_dma itself, path[1] is xbar)
+            if len(path) > 1:
+                yield self.out_ports[path[1]].put(sub_txn.advance())
+            else:
+                # No hop after pe_dma: nothing downstream would ever trigger sub_done and
+                # the wait below would hang, so apply the drain here like _forward_txn does.
+                if drain_ns > 0:
+                    yield env.timeout(drain_ns)
+                sub_done.succeed()
+        # DMA channel released after issue; now wait for the transfer itself to complete
+        yield sub_done
+        pe_txn.done.succeed()
+
+    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
+        """Handle external Transaction (PeDmaMsg probe, M_CPU DMA) with channel acquisition."""
+        dma_res = self._select_channel(txn)
+        with dma_res.request() as req:
+            yield req
+            next_hop = txn.next_hop
+            if next_hop:
+                yield self.out_ports[next_hop].put(txn.advance())
+            else:
+                drain = getattr(txn, "drain_ns", 0.0)
+                if drain > 0:
+                    yield env.timeout(drain)
+                txn.done.succeed()
+
+    def _select_channel(self, txn: Any) -> simpy.Resource:
+        """Select DMA channel: WRITE for MemoryWriteMsg requests, READ for everything else."""
+        from kernbench.runtime_api.kernel import MemoryWriteMsg
+
+        assert self._dma_read is not None and self._dma_write is not None
+        if isinstance(txn.request, MemoryWriteMsg):
+            return self._dma_write
+        return self._dma_read  # NOTE(review): PeDmaMsg(is_write=True) also lands here — intended?
diff --git a/src/kernbench/components/impls/pe_gemm.py b/src/kernbench/components/impls/pe_gemm.py
new file mode 100644
index 0000000..3fc74e3
--- /dev/null
+++ b/src/kernbench/components/impls/pe_gemm.py
@@ -0,0 +1,90 @@
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING, Any
+
+import simpy
+
+from kernbench.components.base import PeEngineBase
+
+if TYPE_CHECKING:
+    from kernbench.common.pe_commands import PeInternalTxn
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+# dtype → bit width (for TFLOPS scaling)
+_DTYPE_BITS: dict[str, int] = {
+    "f16": 16, "fp16": 16, "float16": 16, "bf16": 16,
+    "f32": 32, "fp32": 32, "float32": 32,
+    "i8": 8, "int8": 8,
+    "i16": 16, "int16": 16,
+    "i32": 32, "int32": 32,
+}
+
+
+class PeGemmComponent(PeEngineBase):
+    """PE_GEMM: matrix-multiply engine that shares the PE's accel slot (ADR-0014 D4).
+
+    When attrs["shared_resource"] is set, work runs under that shared resource
+    (PE_ACCEL, capacity=1 per the ADR), which is mutually exclusive with PE_MATH.
+
+    Latency model for a GemmCmd:
+      FLOPs            = 2 * M * K * N
+      effective_tflops = peak_tflops_f16 * (16 / dtype_bits)
+      compute_ns       = FLOPs / (effective_tflops * 1e3)
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        super().__init__(node, ctx)
+        self._peak_tflops_f16: float = float(node.attrs.get("peak_tflops_f16", 0.0))
+        self._accel: simpy.Resource | None = None  # bound in init_resources()
+
+    def init_resources(self, env: simpy.Environment) -> None:
+        """Bind the PE-scoped shared resource named by attrs["shared_resource"], if any."""
+        shared_name = self.node.attrs.get("shared_resource")
+        if not (shared_name and self.ctx):
+            return
+        self._accel = self.ctx.get_shared_resource(env, f"{self._pe_prefix}.{shared_name}")
+
+    def _compute_ns(self, m: int, k: int, n: int, dtype: str) -> float:
+        """GEMM latency in ns: 2*m*k*n FLOPs over the dtype-scaled peak throughput."""
+        peak = self._peak_tflops_f16
+        if peak <= 0:
+            return float(self.node.attrs.get("overhead_ns", 0.0))  # no peak configured
+        scale = 16.0 / _DTYPE_BITS.get(dtype, 16)  # unlisted dtypes are treated as 16-bit
+        total_flops = 2.0 * m * k * n
+        return total_flops / (peak * scale * 1e3)
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        delay_ns = float(self.node.attrs.get("overhead_ns", 0.0))
+        yield env.timeout(delay_ns)
+
+    def _gemm_or_overhead(self, env: simpy.Environment, cmd: Any) -> Generator:
+        """Spend the engine time for one command: GEMM latency, else plain overhead_ns."""
+        from kernbench.common.pe_commands import GemmCmd
+
+        if isinstance(cmd, GemmCmd):
+            yield env.timeout(self._compute_ns(cmd.m, cmd.k, cmd.n, cmd.a.dtype))
+        else:
+            yield from self.run(env, 0)
+
+    def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
+        """Execute one PE-internal command, under the accel slot when bound; signal done."""
+        cmd = pe_txn.command
+        if self._accel:
+            with self._accel.request() as slot:
+                yield slot
+                yield from self._gemm_or_overhead(env, cmd)
+        else:
+            yield from self._gemm_or_overhead(env, cmd)
+        pe_txn.done.succeed()
+
+    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
+        """Forward a Transaction, holding the accel slot for the duration when one is bound."""
+        if not self._accel:
+            yield from super()._forward_txn(env, txn)
+            return
+        with self._accel.request() as slot:
+            yield slot
+            yield from super()._forward_txn(env, txn)
diff --git a/src/kernbench/components/impls/pe_math.py b/src/kernbench/components/impls/pe_math.py
new file mode 100644
index 0000000..c3c3a83
--- /dev/null
+++ b/src/kernbench/components/impls/pe_math.py
@@ -0,0 +1,54 @@
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING, Any
+
+import simpy
+
+from kernbench.components.base import PeEngineBase
+
+if TYPE_CHECKING:
+    from kernbench.common.pe_commands import PeInternalTxn
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+class PeMathComponent(PeEngineBase):
+    """PE_MATH: element-wise computation engine sharing accel_slot (ADR-0014 D4).
+
+    Work runs under attrs["shared_resource"] (PE_ACCEL, capacity=1 per the ADR), shared with PE_GEMM.
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        super().__init__(node, ctx)
+        self._accel: simpy.Resource | None = None  # bound in init_resources()
+
+    def init_resources(self, env: simpy.Environment) -> None:
+        """Bind the PE-scoped shared resource named by attrs["shared_resource"], if any."""
+        shared_name = self.node.attrs.get("shared_resource")
+        if not (shared_name and self.ctx):
+            return
+        self._accel = self.ctx.get_shared_resource(env, f"{self._pe_prefix}.{shared_name}")
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        delay_ns = float(self.node.attrs.get("overhead_ns", 0.0))
+        yield env.timeout(delay_ns)
+
+    def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
+        """Spend overhead_ns (under the accel slot when bound), then mark the command done."""
+        if not self._accel:
+            yield from self.run(env, 0)
+        else:
+            with self._accel.request() as slot:
+                yield slot
+                yield from self.run(env, 0)
+        pe_txn.done.succeed()
+
+    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
+        """Forward a Transaction, holding the accel slot for the duration when one is bound."""
+        if not self._accel:
+            yield from super()._forward_txn(env, txn)
+            return
+        with self._accel.request() as slot:
+            yield slot
+            yield from super()._forward_txn(env, txn)
diff --git a/src/kernbench/components/impls/pe_scheduler.py b/src/kernbench/components/impls/pe_scheduler.py
new file mode 100644
index 0000000..d196759
--- /dev/null
+++ b/src/kernbench/components/impls/pe_scheduler.py
@@ -0,0 +1,245 @@
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING, Any
+
+import simpy
+
+from kernbench.components.base import ComponentBase
+
+if TYPE_CHECKING:
+    from kernbench.common.pe_commands import PeInternalTxn
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+class PeSchedulerComponent(ComponentBase):
+    """PE_SCHEDULER: sole dispatcher inside a PE (ADR-0014 D1).
+
+    Receives PeInternalTxn from PE_CPU, routes to the appropriate engine:
+      - DmaReadCmd / DmaWriteCmd → PE_DMA
+      - GemmCmd                  → PE_GEMM
+      - MathCmd                  → PE_MATH
+      - CompositeCmd             → tiled pipeline (Stage 3: ADR-0014 D3.2)
+
+    Composite GEMM pipeline (32x64x32 tiles):
+      DMA_READ(b_tile_t) → COMPUTE(t) → DMA_WRITE(out_tile_t)
+      with overlap: READ(t+1) || COMPUTE(t) || WRITE(t-1)
+
+    Applies scheduler overhead_ns before dispatching each command.
+    Non-PeInternalTxn messages are forwarded via inherited _forward_txn().
+    """
+
+    # Scheduler tile dimensions (ADR-0014 D3.2)
+    TILE_M = 32
+    TILE_K = 64
+    TILE_N = 32
+
+    # Command → engine suffix dispatch table.
+    # New engines: add a single entry here (e.g. ConvCmd: "pe_conv").
+    _CMD_DISPATCH: dict[type, str] = {}
+
+    @classmethod
+    def _ensure_dispatch_table(cls) -> None:
+        """Populate _CMD_DISPATCH once; later calls see a non-empty table and return."""
+        if cls._CMD_DISPATCH:
+            return
+        # Function-scope import, like the other pe_commands imports in this class.
+        from kernbench.common.pe_commands import DmaReadCmd, DmaWriteCmd, GemmCmd, MathCmd
+
+        table: dict[type, str] = dict.fromkeys((DmaReadCmd, DmaWriteCmd), "pe_dma")
+        table[GemmCmd] = "pe_gemm"
+        table[MathCmd] = "pe_math"
+        cls._CMD_DISPATCH = table
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        super().__init__(node, ctx)
+        self._ensure_dispatch_table()  # class-level table, filled on first construction
+        self._pe_prefix = node.id.rsplit(".", 1)[0]  # node id without its last dotted field
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        delay_ns = float(self.node.attrs.get("overhead_ns", 0.0))  # nbytes is not used here
+        yield env.timeout(delay_ns)
+
+    def _worker(self, env: simpy.Environment) -> Generator:
+        """Inbox loop: PE-internal commands get their own process, others are forwarded inline."""
+        from kernbench.common.pe_commands import PeInternalTxn
+        while True:
+            msg: Any = yield self._inbox.get()
+            if not isinstance(msg, PeInternalTxn):
+                yield from self._forward_txn(env, msg)
+                continue
+            env.process(self._dispatch(env, msg))  # not awaited: the inbox keeps draining
+
+    def _dispatch(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
+        """Apply scheduler overhead, then route a PeInternalTxn.
+
+        Commands listed in _CMD_DISPATCH are forwarded to their engine, a
+        CompositeCmd runs the tiled pipeline, anything else completes at once.
+        """
+        from kernbench.common.pe_commands import CompositeCmd
+
+        yield from self.run(env, 0)  # scheduler overhead
+
+        cmd = pe_txn.command
+        suffix = self._CMD_DISPATCH.get(type(cmd))  # keyed on the exact command type
+        if suffix is not None:
+            # Single-engine command: hand the transaction over unchanged.
+            engine_port = self.out_ports[f"{self._pe_prefix}.{suffix}"]
+            yield engine_port.put(pe_txn)
+        elif isinstance(cmd, CompositeCmd):
+            # Not a simple forward: the scheduler drives the pipeline itself.
+            yield from self._dispatch_composite(env, pe_txn)
+        else:
+            # Unknown command — signal done immediately
+            pe_txn.done.succeed()
+
+    def _dispatch_composite(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
+        """Run a CompositeCmd through the matching pipeline (ADR-0014 D3.2).
+
+        op == "gemm" with a b operand → tiled GEMM pipeline; everything else →
+        the sequential MATH + DMA_WRITE path.
+        """
+        from kernbench.common.pe_commands import CompositeCmd
+
+        cmd = pe_txn.command
+        assert isinstance(cmd, CompositeCmd)
+
+        is_gemm = cmd.op == "gemm" and cmd.b is not None
+        pipeline = self._pipeline_gemm if is_gemm else self._pipeline_math
+        yield from pipeline(env, pe_txn, cmd)
+
+    def _pipeline_gemm(self, env: simpy.Environment, pe_txn: PeInternalTxn, cmd: Any) -> Generator:
+        """Tiled GEMM pipeline: stream b tiles from HBM, compute, write results.
+
+        Tensor a is in TCM (loaded via tl.load). Tensor b is in HBM (via tl.ref).
+        Per tile: DMA_READ(b_tile) -> COMPUTE -> DMA_WRITE(out_tile); goal READ(t+1)||COMPUTE(t)||WRITE(t-1).
+        NOTE(review): the loop awaits COMPUTE(t) before issuing READ(t+1); only WRITE(t) overlaps.
+        """
+        from kernbench.common.pe_commands import (
+            DmaReadCmd,
+            DmaWriteCmd,
+            GemmCmd,
+            PeInternalTxn as PeTxn,
+            TensorHandle,
+        )
+
+        pp = self._pe_prefix
+        a = cmd.a   # already in TCM
+        b = cmd.b   # HBM reference (via tl.ref)
+
+        M, K_a = a.shape[-2], a.shape[-1]
+        K_b, N = b.shape[-2], b.shape[-1]  # K_b is only used for dtype_bytes; not checked against K_a
+        dtype = a.dtype
+        dtype_bytes = b.nbytes // (K_b * N) if (K_b * N) > 0 else 2  # element size derived from b
+
+        # Tile counts (ceil division, at least one tile per axis); M is not tiled
+        n_tiles_k = max(1, (K_a + self.TILE_K - 1) // self.TILE_K)
+        n_tiles_n = max(1, (N + self.TILE_N - 1) // self.TILE_N)
+        n_tiles = n_tiles_k * n_tiles_n
+
+        prev_compute_done = None
+        prev_write_done = None
+        total_dma_ns = 0.0
+        total_compute_ns = 0.0
+
+        for tile_idx in range(n_tiles):
+            tk = tile_idx // n_tiles_n  # K-tile index (outer)
+            tn = tile_idx % n_tiles_n  # N-tile index (inner)
+
+            k_start = tk * self.TILE_K
+            n_start = tn * self.TILE_N
+            tile_k = min(self.TILE_K, K_a - k_start)  # edge tiles may be smaller
+            tile_n = min(self.TILE_N, N - n_start)
+            tile_nbytes = tile_k * tile_n * dtype_bytes
+
+            # --- Stage 1: DMA_READ b_tile from HBM ---
+            read_done = env.event()
+            b_tile_pa = b.pa + (k_start * N + n_start) * dtype_bytes  # assumes row-major b — TODO confirm
+            b_tile_handle = TensorHandle(
+                id=f"b_tile_{tile_idx}", pa=b_tile_pa,
+                shape=(tile_k, tile_n), dtype=dtype, nbytes=tile_nbytes,
+            )
+            read_cmd = DmaReadCmd(handle=b_tile_handle, src_pa=b_tile_pa, nbytes=tile_nbytes)
+            read_txn = PeTxn(command=read_cmd, done=read_done, pe_prefix=pp)
+            t0 = env.now
+            yield self.out_ports[f"{pp}.pe_dma"].put(read_txn)
+
+            # Previous compute was already awaited at the end of the last iteration
+            if prev_compute_done is not None:
+                yield prev_compute_done
+
+            # Wait for this tile's DMA_READ; dma_ns counts from issue to completion
+            yield read_done
+            total_dma_ns += env.now - t0
+
+            # --- Stage 2: COMPUTE (GEMM) ---
+            compute_done = env.event()
+            out_handle = TensorHandle(
+                id=f"out_tile_{tile_idx}", pa=0,
+                shape=(M, tile_n), dtype=dtype,
+                nbytes=M * tile_n * dtype_bytes,
+            )
+            compute_cmd = GemmCmd(a=a, b=b_tile_handle, out=out_handle,
+                                  m=M, k=tile_k, n=tile_n)
+            compute_txn = PeTxn(command=compute_cmd, done=compute_done, pe_prefix=pp)
+            t0 = env.now
+            yield self.out_ports[f"{pp}.pe_gemm"].put(compute_txn)
+
+            # Wait for previous write (DMA_WRITE serialization)
+            if prev_write_done is not None:
+                yield prev_write_done
+
+            # Wait for compute of THIS tile; compute_ns also spans the write wait just above
+            yield compute_done
+            total_compute_ns += env.now - t0
+            prev_compute_done = compute_done
+
+            # --- Stage 3: DMA_WRITE out_tile to HBM (issued, not awaited here) ---
+            write_done = env.event()
+            out_tile_pa = cmd.out_pa + n_start * dtype_bytes  # same address for every K tile of this tn
+            write_nbytes = M * tile_n * dtype_bytes
+            write_cmd = DmaWriteCmd(handle=out_handle, dst_pa=out_tile_pa, nbytes=write_nbytes)
+            write_txn = PeTxn(command=write_cmd, done=write_done, pe_prefix=pp)
+            t0 = env.now  # NOTE(review): dead store — overwritten before it is read
+            yield self.out_ports[f"{pp}.pe_dma"].put(write_txn)
+            prev_write_done = write_done
+
+        # Wait for final write; only this remaining wait is added to dma_ns for writes
+        if prev_write_done is not None:
+            t0 = env.now
+            yield prev_write_done
+            total_dma_ns += env.now - t0
+
+        pe_txn.result_data["dma_ns"] = total_dma_ns
+        pe_txn.result_data["compute_ns"] = total_compute_ns
+        pe_txn.done.succeed()
+
+    def _pipeline_math(self, env: simpy.Environment, pe_txn: PeInternalTxn, cmd: Any) -> Generator:
+        """Non-GEMM composite: sequential compute + DMA_WRITE (no tiling).
+
+        Stores compute_ns / dma_ns in pe_txn.result_data, as the GEMM pipeline does.
+        """
+        from kernbench.common.pe_commands import DmaWriteCmd, MathCmd, PeInternalTxn as PeTxn
+
+        pp = self._pe_prefix
+
+        # Step 1: Compute (MATH); cmd.a is both the input and the output handle
+        compute_done = env.event()
+        compute_cmd = MathCmd(op=cmd.math_op or "identity", inputs=(cmd.a,), out=cmd.a)
+        compute_txn = PeTxn(command=compute_cmd, done=compute_done, pe_prefix=pp)
+        t0 = env.now
+        yield self.out_ports[f"{pp}.pe_math"].put(compute_txn)
+        yield compute_done
+        pe_txn.result_data["compute_ns"] = env.now - t0
+
+        # Step 2: DMA_WRITE result to HBM
+        write_done = env.event()
+        write_cmd = DmaWriteCmd(handle=cmd.a, dst_pa=cmd.out_pa, nbytes=cmd.out_nbytes)
+        write_txn = PeTxn(command=write_cmd, done=write_done, pe_prefix=pp)
+        t0 = env.now
+        yield self.out_ports[f"{pp}.pe_dma"].put(write_txn)
+        yield write_done
+        pe_txn.result_data["dma_ns"] = env.now - t0
+
+        pe_txn.done.succeed()
diff --git a/src/kernbench/components/impls/pe_tcm.py b/src/kernbench/components/impls/pe_tcm.py
new file mode 100644
index 0000000..6458d56
--- /dev/null
+++ b/src/kernbench/components/impls/pe_tcm.py
@@ -0,0 +1,25 @@
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING
+
+from kernbench.components.base import ComponentBase
+
+if TYPE_CHECKING:
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+class PeTcmComponent(ComponentBase):
+    """PE_TCM: tightly-coupled memory / local SRAM staging buffer.
+
+    Terminal storage component for PE-internal dataflow (ADR-0014 D5).
+    Phase 0: run() charges overhead_ns only; drain_ns is not handled in this class.
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        super().__init__(node, ctx)
+
+    def run(self, env, nbytes: int) -> Generator:
+        delay_ns = float(self.node.attrs.get("overhead_ns", 0.0))
+        yield env.timeout(delay_ns)
diff --git a/src/kernbench/components/impls/sram.py b/src/kernbench/components/impls/sram.py
new file mode 100644
index 0000000..d631ec4
--- /dev/null
+++ b/src/kernbench/components/impls/sram.py
@@ -0,0 +1,59 @@
+from __future__ import annotations
+
+from collections.abc import Generator
+from typing import TYPE_CHECKING, Any
+
+import simpy
+
+from kernbench.components.base import ComponentBase
+from kernbench.sim_engine.transaction import Transaction
+
+if TYPE_CHECKING:
+    from kernbench.components.context import ComponentContext
+    from kernbench.topology.types import Node
+
+
+class SramComponent(ComponentBase):
+    """Cube SRAM: terminal component that models SRAM access latency.
+
+    Each request costs overhead_ns (node.attrs) plus the transaction's drain_ns,
+    after which a ResponseMsg is sent back along the reversed request path.
+    """
+
+    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
+        super().__init__(node, ctx)
+
+    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
+        delay_ns = float(self.node.attrs.get("overhead_ns", 0.0))
+        yield env.timeout(delay_ns)
+
+    def _worker(self, env: simpy.Environment) -> Generator:
+        """Terminal loop: access latency, then the transfer drain, then the response."""
+        while True:
+            txn: Any = yield self._inbox.get()
+            yield from self.run(env, txn.nbytes)
+            drain_ns = getattr(txn, "drain_ns", 0.0)
+            if drain_ns > 0:
+                yield env.timeout(drain_ns)
+            yield from self._send_response(env, txn)
+
+    def _send_response(self, env: simpy.Environment, txn: Any) -> Generator:
+        """Send a ResponseMsg along the reversed path; without one, complete txn in place."""
+        back_path = list(txn.path)
+        back_path.reverse()
+        if len(back_path) < 2 or not self.ctx:
+            txn.done.succeed()
+            return
+        from kernbench.runtime_api.kernel import ResponseMsg
+
+        # The second dotted field of the node id is parsed as "cube<N>".
+        cube_id = int(self.node.id.split(".")[1].replace("cube", ""))
+        response = ResponseMsg(
+            correlation_id=txn.request.correlation_id, request_id=txn.request.request_id,
+            src_cube=cube_id, src_pe=-1, success=True,
+        )
+        reply = Transaction(
+            request=response, path=back_path, step=0,
+            nbytes=0, done=env.event(), is_response=True,
+        )
+        yield self.out_ports[back_path[1]].put(reply.advance())
diff --git a/src/kernbench/di/registry.py b/src/kernbench/di/registry.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/kernbench/policy/address/allocator.py b/src/kernbench/policy/address/allocator.py
new file mode 100644
index 0000000..5d10bb4
--- /dev/null
+++ b/src/kernbench/policy/address/allocator.py
@@ -0,0 +1,85 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from kernbench.policy.address.phyaddr import PhysAddr
+
+
+class AllocationError(Exception):
+    """Raised by PEMemAllocator when a request does not fit in the remaining space."""
+
+
+@dataclass(frozen=True)
+class AddressConfig:  # immutable sizing parameters consumed by PEMemAllocator
+    sip_count: int
+    cubes_per_sip: int
+    pes_per_cube: int
+    hbm_bytes_per_cube: int  # total HBM bytes of one cube, divided into slices below
+    hbm_slices_per_cube: int
+    tcm_bytes_per_pe: int
+    tcm_scheduler_reserved_bytes: int  # subtracted from tcm_bytes_per_pe below
+    sram_bytes_per_cube: int
+
+    @property
+    def hbm_slice_bytes(self) -> int:
+        return self.hbm_bytes_per_cube // self.hbm_slices_per_cube  # floor: remainder bytes are dropped
+
+    @property
+    def tcm_allocatable_bytes(self) -> int:
+        return self.tcm_bytes_per_pe - self.tcm_scheduler_reserved_bytes  # not clamped at zero
+
+
+class PEMemAllocator:
+    """Bump allocator over one PE's HBM slice and TCM; allocations are never freed."""
+
+    def __init__(
+        self, rack_id: int, sip_id: int, cube_id: int, pe_id: int, cfg: AddressConfig,
+    ) -> None:
+        self._rack_id = rack_id
+        self._sip_id = sip_id
+        self._cube_id = cube_id
+        self._pe_id = pe_id
+        self._cfg = cfg
+        self._hbm_cursor = self._tcm_cursor = 0  # next free byte offset in each region
+
+    def alloc_hbm(self, nbytes: int) -> PhysAddr:
+        """Reserve ``nbytes`` of HBM; raise AllocationError if negative or too large."""
+        free = self._cfg.hbm_slice_bytes - self._hbm_cursor
+        # A negative size would rewind the cursor and alias earlier allocations.
+        if not 0 <= nbytes <= free:
+            raise AllocationError(f"HBM overflow: need {nbytes}, available {free}")
+        pa = PhysAddr.pe_hbm_addr(
+            rack_id=self._rack_id, sip_id=self._sip_id, cube_id=self._cube_id,
+            pe_id=self._pe_id, pe_local_hbm_offset=self._hbm_cursor,
+            slice_size_bytes=self._cfg.hbm_slice_bytes,
+        )
+        self._hbm_cursor += nbytes
+        return pa
+
+    def alloc_tcm(self, nbytes: int) -> PhysAddr:
+        """Reserve ``nbytes`` of TCM; raise AllocationError if negative or too large."""
+        free = self._cfg.tcm_allocatable_bytes - self._tcm_cursor
+        if not 0 <= nbytes <= free:
+            raise AllocationError(f"TCM overflow: need {nbytes}, available {free}")
+        pa = PhysAddr.pe_tcm_addr(
+            rack_id=self._rack_id, sip_id=self._sip_id, cube_id=self._cube_id,
+            pe_id=self._pe_id, tcm_offset=self._tcm_cursor,
+        )
+        self._tcm_cursor += nbytes
+        return pa
+
+    @property
+    def hbm_used(self) -> int:
+        return self._hbm_cursor
+
+    @property
+    def hbm_total(self) -> int:
+        return self._cfg.hbm_slice_bytes
+
+    @property
+    def tcm_used(self) -> int:
+        return self._tcm_cursor
+
+    @property
+    def tcm_total(self) -> int:
+        return self._cfg.tcm_allocatable_bytes
diff --git a/src/kernbench/policy/address/phyaddr.py b/src/kernbench/policy/address/phyaddr.py
new file mode 100644
index 0000000..b19e3cc
--- /dev/null
+++ b/src/kernbench/policy/address/phyaddr.py
@@ -0,0 +1,184 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from enum import IntEnum
+from typing import Literal
+
+MAX_51 = (1 << 51) - 1  # largest value that fits in the 51-bit address space
+
+
+class PhysAddrError(Exception):
+    """Raised when an address field or a raw address value is out of range."""
+
+
+def _chk_range(name: str, v: int, bits: int) -> None:  # v must fit in `bits` unsigned bits
+    if v < 0 or v >= 1 << bits:
+        raise PhysAddrError(f"{name} out of range for {bits} bits: {v}")
+
+
+def _chk_max(name: str, v: int, maxv: int) -> None:  # v must lie in the closed range 0..maxv
+    if v < 0 or v > maxv:
+        raise PhysAddrError(f"{name} out of range (0..{maxv}): {v}")
+
+
+class UnitType(IntEnum):  # value stored in bits [36:34] of a PE-resource local_offset
+    PE = 0  # per-PE resource; pe_tcm_addr() builds TCM addresses with this type
+    MCPU = 1
+    SRAM = 2  # cube-level SRAM; used by cube_sram_addr()
+
+
+@dataclass(frozen=True)
+class PhysAddr:
+    """
+    51-bit physical address value object.
+
+    Layout:
+      [50:47] rack_id  (4)
+      [46:43] sip_id   (4)
+      [42:38] sip_seg  (5)   # cube_id
+      [37:0]  local_offset (38) => each segment is 256GB
+    local_offset:
+      [37] selector: 1 = HBM window (128GB reserved), 0 = PE resource window
+      PE resource window: [36:34] unit_type, [33:30] pe_id, [29] ext, [28:0] sub_offset
+    """
+
+    rack_id: int
+    sip_id: int
+    sip_seg: int
+    local_offset: int
+
+    kind: Literal["hbm", "pe_resource", "raw"] = "raw"  # this and the fields below are derived views; encode() ignores them
+    cube_id: int = 0
+    unit_type: UnitType = UnitType.PE
+    pe_id: int = 0
+    ext: int = 0
+    sub_offset: int = 0
+    hbm_offset: int = 0
+
+    HBM_WINDOW_BYTES = 1 << 37  # 128GB
+
+    def encode(self) -> int:  # validates field widths, then packs the four layout fields
+        _chk_range("rack_id", self.rack_id, 4)
+        _chk_range("sip_id", self.sip_id, 4)
+        _chk_range("sip_seg", self.sip_seg, 5)
+        _chk_range("local_offset", self.local_offset, 38)
+        addr = (self.rack_id << 47) | (self.sip_id << 43) | (self.sip_seg << 38) | self.local_offset
+        if not (0 <= addr <= MAX_51):  # defensive re-check of the packed value
+            raise PhysAddrError("address exceeds 51-bit space")
+        return addr
+
+    @staticmethod
+    def decode(addr: int) -> PhysAddr:  # splits a raw address and fills the derived fields
+        if not (0 <= addr <= MAX_51):
+            raise PhysAddrError("addr must be a 51-bit value")
+        rack = (addr >> 47) & 0xF
+        sip_id = (addr >> 43) & 0xF
+        sip_seg = (addr >> 38) & 0x1F
+        off = addr & ((1 << 38) - 1)
+        cube_id = sip_seg
+        sel = (off >> 37) & 0x1  # bit 37: 1 = HBM window, 0 = PE resource window
+        if sel == 1:
+            hbm_offset = int(off & ((1 << 37) - 1))
+            return PhysAddr(
+                rack_id=rack,
+                sip_id=sip_id,
+                sip_seg=sip_seg,
+                local_offset=off,
+                kind="hbm",
+                cube_id=cube_id,
+                hbm_offset=hbm_offset,
+            )
+        # PE resource decode: [36:34] unit_type, [33:30] pe_id, [29] ext, [28:0] sub_offset
+        raw_ut = int((off >> 34) & 0x7)
+        try:
+            unit_type = UnitType(raw_ut)
+        except ValueError:
+            raise PhysAddrError(f"unknown unit_type: {raw_ut}") from None  # hide the enum's ValueError
+        pe_id = int((off >> 30) & 0xF)
+        ext = int((off >> 29) & 0x1)
+        sub_offset = int(off & ((1 << 29) - 1))
+        return PhysAddr(
+            rack_id=rack,
+            sip_id=sip_id,
+            sip_seg=sip_seg,
+            local_offset=off,
+            kind="pe_resource",
+            cube_id=cube_id,
+            unit_type=unit_type,
+            pe_id=pe_id,
+            ext=ext,
+            sub_offset=sub_offset,
+            hbm_offset=0,
+        )
+
+    @staticmethod
+    def hbm_addr(*, rack_id: int, sip_id: int, cube_id: int, hbm_offset: int) -> PhysAddr:
+        _chk_max("cube_id", cube_id, 31)
+        _chk_range("hbm_offset", hbm_offset, 37)
+        sip_seg = cube_id
+        local_offset = (1 << 37) | int(hbm_offset)  # set the selector bit: HBM window
+        return PhysAddr(
+            rack_id=rack_id,
+            sip_id=sip_id,
+            sip_seg=sip_seg,
+            local_offset=local_offset,
+            kind="hbm",
+            cube_id=cube_id,
+            hbm_offset=int(hbm_offset),
+        )
+
+    @staticmethod
+    def pe_hbm_addr(
+        *,
+        rack_id: int,
+        sip_id: int,
+        cube_id: int,
+        pe_id: int,
+        pe_local_hbm_offset: int,
+        slice_size_bytes: int,
+    ) -> PhysAddr:
+        _chk_max("cube_id", cube_id, 31)
+        _chk_range("pe_id", pe_id, 4)
+        if not (0 <= pe_local_hbm_offset < slice_size_bytes):
+            raise PhysAddrError("pe_local_hbm_offset out of PE local slice range")
+        hbm_offset = int(pe_id) * int(slice_size_bytes) + int(pe_local_hbm_offset)  # slices are back to back, one per pe_id
+        if not (0 <= hbm_offset < PhysAddr.HBM_WINDOW_BYTES):
+            raise PhysAddrError("HBM offset exceeds reserved 128GB window")
+        return PhysAddr.hbm_addr(
+            rack_id=rack_id, sip_id=sip_id, cube_id=cube_id, hbm_offset=hbm_offset
+        )
+
+    @staticmethod
+    def hbm_pe_id(hbm_offset: int, slice_size_bytes: int) -> int:
+        return hbm_offset // slice_size_bytes  # inverse of the pe_id * slice_size mapping in pe_hbm_addr
+
+    @staticmethod
+    def cube_sram_addr(
+        *, rack_id: int, sip_id: int, cube_id: int, sram_offset: int,
+    ) -> PhysAddr:
+        _chk_max("cube_id", cube_id, 31)
+        _chk_range("sram_offset", sram_offset, 29)
+        sip_seg = cube_id
+        local_offset = (UnitType.SRAM << 34) | sram_offset  # selector bit 37 stays 0; pe_id and ext stay 0
+        return PhysAddr(
+            rack_id=rack_id, sip_id=sip_id, sip_seg=sip_seg,
+            local_offset=local_offset,
+            kind="pe_resource", cube_id=cube_id,
+            unit_type=UnitType.SRAM, sub_offset=sram_offset,
+        )
+
+    @staticmethod
+    def pe_tcm_addr(
+        *, rack_id: int, sip_id: int, cube_id: int, pe_id: int, tcm_offset: int,
+    ) -> PhysAddr:
+        _chk_max("cube_id", cube_id, 31)
+        _chk_range("pe_id", pe_id, 4)
+        _chk_range("tcm_offset", tcm_offset, 29)
+        sip_seg = cube_id
+        local_offset = (UnitType.PE << 34) | (pe_id << 30) | tcm_offset  # ext bit 29 is left at 0
+        return PhysAddr(
+            rack_id=rack_id, sip_id=sip_id, sip_seg=sip_seg,
+            local_offset=local_offset,
+            kind="pe_resource", cube_id=cube_id,
+            unit_type=UnitType.PE, pe_id=pe_id, sub_offset=tcm_offset,
+        )
diff --git a/src/kernbench/policy/placement/dp.py b/src/kernbench/policy/placement/dp.py
new file mode 100644
index 0000000..8860d7f
--- /dev/null
+++ b/src/kernbench/policy/placement/dp.py
@@ -0,0 +1,174 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from math import ceil
+from typing import Literal
+
+
+@dataclass(frozen=True)
+class DPPolicy:
+    """Two-level data-parallel policy: cube-level + pe-level."""
+
+    cube: Literal["replicate", "shard_m", "shard_k"] = "replicate"  # split across cubes
+    pe: Literal["replicate", "column_wise", "row_wise"] = "replicate"  # split within a cube
+
+
+def resolve_dp_policy(
+    policy: DPPolicy,
+    *,
+    shape: tuple[int, int],
+    itemsize: int,
+    num_pe: int,
+    num_cubes: int = 1,
+) -> list[ShardSpec]:
+    """Expand ``policy`` into concrete shards, first per cube and then per PE.
+
+    With ``num_cubes <= 1`` only the pe-level resolver runs and ``policy.cube``
+    is not consulted. Otherwise the cube-level policy fixes the per-cube shape
+    and byte stride, the pe-level resolver splits that shape inside every
+    cube, and each shard is re-indexed to the flat ``cube_id * num_pe + pe_id``.
+
+    Sharded cube policies use floor division, so rows/columns left over when
+    the axis is not a multiple of ``num_cubes`` belong to no shard.
+
+    Returns:
+        Shards ordered by cube, then in the pe-level resolver's order.
+
+    Raises:
+        ValueError: If a consulted policy name is not recognised.
+    """
+    if policy.pe == "replicate":
+        resolver = replicate
+    elif policy.pe == "column_wise":
+        resolver = column_wise
+    elif policy.pe == "row_wise":
+        resolver = row_wise
+    else:
+        raise ValueError(f"Unknown pe-level policy: {policy.pe}")
+
+    if num_cubes <= 1:
+        return resolver(shape=shape, itemsize=itemsize, num_pe=num_pe)
+
+    # Per-cube shape and byte stride are the same for every cube, so derive them once.
+    rows, cols = shape
+    if policy.cube == "replicate":
+        cube_shape, cube_stride = (rows, cols), 0
+    elif policy.cube == "shard_m":
+        cube_shape = (rows // num_cubes, cols)
+        cube_stride = cube_shape[0] * cols * itemsize
+    elif policy.cube == "shard_k":
+        cube_shape = (rows, cols // num_cubes)
+        cube_stride = rows * cube_shape[1] * itemsize
+    else:
+        raise ValueError(f"Unknown cube-level policy: {policy.cube}")
+
+    return [
+        ShardSpec(
+            pe_index=cube_id * num_pe + ps.pe_index,
+            offset_bytes=cube_id * cube_stride + ps.offset_bytes,
+            nbytes=ps.nbytes,
+        )
+        for cube_id in range(num_cubes)
+        for ps in resolver(shape=cube_shape, itemsize=itemsize, num_pe=num_pe)
+    ]
+
+
+@dataclass(frozen=True)
+class ShardSpec:
+    pe_index: int  # target PE; resolve_dp_policy uses the flat index cube_id * num_pe + pe_id
+    offset_bytes: int  # byte offset of the shard as computed by the placement function
+    nbytes: int  # shard size in bytes
+
+
+def column_wise(
+    *, shape: tuple[int, int], itemsize: int, num_pe: int,
+) -> list[ShardSpec]:
+    """Split the K axis evenly: PE ``i`` gets an (M, K // num_pe) chunk.
+
+    Chunks are placed back to back, so shard ``i`` starts at ``i * nbytes``.
+    ``K // num_pe`` floors, so leftover columns are not covered by any shard.
+    """
+    rows, cols = shape
+    shard_nbytes = rows * (cols // num_pe) * itemsize
+    # One equally sized shard per PE, in PE order.
+    return [
+        ShardSpec(pe_index=pe, offset_bytes=pe * shard_nbytes, nbytes=shard_nbytes)
+        for pe in range(num_pe)
+    ]
+
+
+def row_wise(
+    *, shape: tuple[int, int], itemsize: int, num_pe: int,
+) -> list[ShardSpec]:
+    """Split the M axis evenly: PE ``i`` gets an (M // num_pe, K) chunk.
+
+    Chunks are placed back to back, so shard ``i`` starts at ``i * nbytes``.
+    ``M // num_pe`` floors, so leftover rows are not covered by any shard.
+    """
+    rows, cols = shape
+    shard_nbytes = (rows // num_pe) * cols * itemsize
+    # One equally sized shard per PE, in PE order.
+    return [
+        ShardSpec(pe_index=pe, offset_bytes=pe * shard_nbytes, nbytes=shard_nbytes)
+        for pe in range(num_pe)
+    ]
+
+
+def replicate(
+    *, shape: tuple[int, int], itemsize: int, num_pe: int,
+) -> list[ShardSpec]:
+    """Give every PE a complete (M, K) copy: one offset-0 shard per PE."""
+    rows, cols = shape
+    copy_nbytes = rows * cols * itemsize
+    # Same byte range for everyone; only pe_index differs between shards.
+    return [
+        ShardSpec(pe_index=pe, offset_bytes=0, nbytes=copy_nbytes) for pe in range(num_pe)
+    ]
+
+
+def tiled_column_major(
+    *, shape: tuple[int, int], itemsize: int, num_pe: int,
+    tile_m: int, tile_k: int,
+) -> list[ShardSpec]:
+    """Cut (M, K) into tile_m x tile_k tiles dealt round-robin to PEs, K index fastest.
+
+    Offsets address a tile's first element in a row-major buffer; edge tiles keep full size.
+    """
+    rows, cols = shape
+    tiles_m = ceil(rows / tile_m)
+    tiles_k = ceil(cols / tile_k)
+    tile_bytes = tile_m * tile_k * itemsize
+    row_bytes = cols * itemsize
+    return [
+        ShardSpec(
+            pe_index=(mi * tiles_k + ki) % num_pe,
+            offset_bytes=mi * tile_m * row_bytes + ki * tile_k * itemsize,
+            nbytes=tile_bytes,
+        )
+        for mi in range(tiles_m)
+        for ki in range(tiles_k)
+    ]
+
+
+def tiled_row_major(
+    *, shape: tuple[int, int], itemsize: int, num_pe: int,
+    tile_m: int, tile_k: int,
+) -> list[ShardSpec]:
+    """Cut (M, K) into tile_m x tile_k tiles dealt round-robin to PEs, M index fastest.
+
+    Offsets address a tile's first element in a row-major buffer; edge tiles keep full size.
+    """
+    rows, cols = shape
+    tiles_m = ceil(rows / tile_m)
+    tiles_k = ceil(cols / tile_k)
+    tile_bytes = tile_m * tile_k * itemsize
+    row_bytes = cols * itemsize
+    return [
+        ShardSpec(
+            pe_index=(ki * tiles_m + mi) % num_pe,
+            offset_bytes=mi * tile_m * row_bytes + ki * tile_k * itemsize,
+            nbytes=tile_bytes,
+        )
+        for ki in range(tiles_k)
+        for mi in range(tiles_m)
+    ]
diff --git a/src/kernbench/policy/routing/router.py b/src/kernbench/policy/routing/router.py
new file mode 100644
index 0000000..5565e45
--- /dev/null
+++ b/src/kernbench/policy/routing/router.py
@@ -0,0 +1,184 @@
+from __future__ import annotations
+
+import heapq
+from collections import defaultdict
+
+from kernbench.policy.address.phyaddr import PhysAddr, UnitType
+from kernbench.topology.types import TopologyGraph
+
+
+class RoutingError(Exception):
+    """Raised when an address or node cannot be resolved, or no path exists."""
+
+
+class AddressResolver:
+    """Map a PhysAddr onto the node_id that owns it in the compiled graph.
+
+    Named lookups (find_m_cpu, find_pcie_ep, …) live here as well, so component
+    implementations never assemble node_id strings themselves. Keeping the
+    naming convention in one place lets a single change propagate everywhere
+    (ADR-0015 D4).
+    """
+
+    def __init__(self, graph: TopologyGraph) -> None:
+        self._node_ids = set(graph.nodes)
+        memory_map = graph.spec["cube"]["memory_map"]
+        hbm_bytes = memory_map["hbm_total_gb_per_cube"] * (1 << 30)
+        self._slice_size_bytes = hbm_bytes // memory_map["hbm_slices_per_cube"]
+
+    def _require(self, node_id: str, message: str) -> str:
+        """Return ``node_id`` when the topology has it, else raise RoutingError."""
+        if node_id not in self._node_ids:
+            raise RoutingError(message)
+        return node_id
+
+    # ── Physical-address resolution ──────────────────────────────────
+
+    def resolve(self, addr: PhysAddr) -> str:
+        """Return the node_id that ``addr`` targets; raise RoutingError otherwise."""
+        cube_prefix = f"sip{addr.sip_id}.cube{addr.cube_id}"
+        if addr.kind == "hbm":
+            # HBM offsets are grouped into equal slices; the slice index names the node.
+            pe_slice = PhysAddr.hbm_pe_id(addr.hbm_offset, self._slice_size_bytes)
+            leaf = f"hbm_ctrl.slice{pe_slice}"
+        elif addr.kind != "pe_resource":
+            raise RoutingError(f"unsupported address kind: {addr.kind}")
+        elif addr.unit_type == UnitType.PE:
+            leaf = f"pe{addr.pe_id}.pe_tcm"
+        elif addr.unit_type == UnitType.SRAM:
+            leaf = "sram"
+        elif addr.unit_type == UnitType.MCPU:
+            leaf = "m_cpu"
+        else:
+            raise RoutingError(f"unsupported unit_type: {addr.unit_type}")
+        node_id = f"{cube_prefix}.{leaf}"
+        return self._require(node_id, f"node {node_id} not found in topology")
+
+    # ── Named node lookups ───────────────────────────────────────────
+
+    def find_m_cpu(self, sip: int, cube: int) -> str:
+        node_id = f"sip{sip}.cube{cube}.m_cpu"
+        return self._require(node_id, f"M_CPU not found: {node_id}")
+
+    def find_pcie_ep(self, sip: int, io_id: str = "io0") -> str:
+        node_id = f"sip{sip}.{io_id}.pcie_ep"
+        return self._require(node_id, f"PCIE_EP not found: {node_id}")
+
+    def find_io_cpu(self, sip: int, io_id: str = "io0") -> str:
+        node_id = f"sip{sip}.{io_id}.io_cpu"
+        return self._require(node_id, f"IO_CPU not found: {node_id}")
+
+    def find_all_pcie_eps(self) -> list[str]:
+        """List every node id ending in ``.pcie_ep``, in sorted order."""
+        return sorted(nid for nid in self._node_ids if nid.endswith(".pcie_ep"))
+
+
+class PathRouter:
+    """Find data-path from a source PE (or arbitrary node) to a destination node.
+
+    Three adjacency graphs are maintained:
+      _adj          — excludes command edges (used by PE DMA routing, find_path)
+      _adj_all      — includes all edges (find_node_path; M_CPU↔NOC links are "command")
+      _adj_mcpu_dma — excludes pe_internal / pe_to_xbar edges (find_mcpu_dma_path)
+    """
+
+    # Edge kinds excluded from M_CPU DMA adjacency: prevents routing through
+    # PE-internal pipeline nodes when computing DMA paths.
+    _MCPU_DMA_EXCLUDE = {"pe_internal", "pe_to_xbar"}
+
+    def __init__(self, graph: TopologyGraph) -> None:
+        self._adj: dict[str, list[tuple[str, float]]] = defaultdict(list)
+        self._adj_all: dict[str, list[tuple[str, float]]] = defaultdict(list)
+        self._adj_mcpu_dma: dict[str, list[tuple[str, float]]] = defaultdict(list)
+        for e in graph.edges:  # edge weight: routing_weight_mm when set, else distance_mm
+            w = e.routing_weight_mm if e.routing_weight_mm is not None else e.distance_mm
+            self._adj_all[e.src].append((e.dst, w))
+            if e.kind != "command":
+                self._adj[e.src].append((e.dst, w))
+            if e.kind not in self._MCPU_DMA_EXCLUDE:
+                self._adj_mcpu_dma[e.src].append((e.dst, w))
+
+    def find_path(self, src_pe: str, dst_node: str) -> list[str]:
+        """PE DMA routing: prepends .pe_dma, excludes command edges."""
+        start = f"{src_pe}.pe_dma"
+        return self._run_dijkstra(self._adj, start, dst_node)
+
+    def find_path_with_distance(self, src_pe: str, dst_node: str) -> tuple[list[str], float]:
+        start = f"{src_pe}.pe_dma"  # same start node and graph as find_path; also returns the distance
+        return self._run_dijkstra_with_dist(self._adj, start, dst_node)
+
+    def find_mcpu_dma_path(self, m_cpu_id: str, dst_hbm_slice_id: str) -> list[str]:
+        """M_CPU DMA path: never routes through PE-internal nodes (ADR-0015 D5).
+
+        Same-cube: deterministic [m_cpu, noc, xbar.pe_i, hbm_ctrl.slice_i].
+        Cross-cube: Dijkstra via _adj_mcpu_dma (pe_internal/pe_to_xbar excluded)
+                    → routes through NOC → UCIe → target cube NOC → xbar → HBM.
+        """
+        m_cube = ".".join(m_cpu_id.split(".")[:2])  # first two dotted fields, e.g. "sip0.cube1"
+        d_cube = ".".join(dst_hbm_slice_id.split(".")[:2])
+        if m_cube == d_cube:
+            slice_idx = int(dst_hbm_slice_id.rsplit("slice", 1)[1])  # digits after the last "slice"
+            return [
+                m_cpu_id,
+                f"{m_cube}.noc",
+                f"{m_cube}.xbar.pe{slice_idx}",
+                dst_hbm_slice_id,
+            ]
+        return self._run_dijkstra(self._adj_mcpu_dma, m_cpu_id, dst_hbm_slice_id)
+
+    def find_node_path(self, src: str, dst: str) -> list[str]:
+        """General routing between arbitrary nodes, including command edges.
+
+        Used by components (IoCpuComponent, MCpuComponent) that route through
+        M_CPU↔NOC command-kind links.
+        """
+        return self._run_dijkstra(self._adj_all, src, dst)
+
+    def _run_dijkstra(
+        self,
+        adj: dict[str, list[tuple[str, float]]],
+        start: str,
+        goal: str,
+    ) -> list[str]:
+        path, _ = self._run_dijkstra_with_dist(adj, start, goal)  # distance is discarded
+        return path
+
+    def _run_dijkstra_with_dist(
+        self,
+        adj: dict[str, list[tuple[str, float]]],
+        start: str,
+        goal: str,
+    ) -> tuple[list[str], float]:
+        if start == goal:
+            return [start], 0.0
+        best: dict[str, float] = {start: 0.0}  # lowest known distance per node
+        prev: dict[str, str] = {}  # predecessor on the best known path
+        heap: list[tuple[float, str]] = [(0.0, start)]
+        while heap:
+            d, node = heapq.heappop(heap)
+            if node == goal:
+                path: list[str] = []
+                cur = goal
+                while cur != start:  # walk predecessors back to the start
+                    path.append(cur)
+                    cur = prev[cur]
+                path.append(start)
+                path.reverse()
+                return path, d
+            if d > best.get(node, float("inf")):  # stale heap entry: a shorter route was already found
+                continue
+            for neighbor, edge_dist in adj[node]:
+                new_d = d + edge_dist
+                if new_d < best.get(neighbor, float("inf")):
+                    best[neighbor] = new_d
+                    prev[neighbor] = node
+                    heapq.heappush(heap, (new_d, neighbor))
+        raise RoutingError(f"no path from {start} to {goal}")
+
+    # ── backward-compat shims (used by existing tests) ───────────────
+
+    def _dijkstra(self, start: str, goal: str) -> list[str]:
+        return self._run_dijkstra(self._adj, start, goal)
+
+    def _dijkstra_with_dist(self, start: str, goal: str) -> tuple[list[str], float]:
+        return self._run_dijkstra_with_dist(self._adj, start, goal)
diff --git a/src/kernbench/runtime_api/__init__.py b/src/kernbench/runtime_api/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/kernbench/runtime_api/bench_runner.py b/src/kernbench/runtime_api/bench_runner.py
new file mode 100644
index 0000000..164a6a7
--- /dev/null
+++ b/src/kernbench/runtime_api/bench_runner.py
@@ -0,0 +1,96 @@
+from __future__ import annotations
+
+from collections.abc import Callable
+from enum import Enum
+from typing import Any
+
+from kernbench.common.types import Completion, SimEngine, Trace
+
+from .context import RuntimeContext
+from .types import BenchResult, DeviceSelector
+
+
+class CompletionPolicy(str, Enum):
+    LAST_SUBMITTED = "last_submitted"  # report the completion of the last submitted handle
+    LAST_COMPLETED = "last_completed"  # requires trace/timestamps or engine support; stub for now
+    ALL_OK_FAIL_FAST = "all_ok_fail_fast"  # report the first failed completion, else ok=True
+
+
+BenchFn = Callable[[RuntimeContext], Any]  # bench body; run_bench ignores its return value
+EngineFactory = Callable[[object, DeviceSelector], SimEngine]  # called as engine_factory(topology, device)
+
+
+def run_bench(
+    *,
+    topology: object,
+    bench_fn: BenchFn,
+    device: DeviceSelector,
+    engine_factory: EngineFactory,
+    correlation_id: str = "bench0",
+    completion_policy: CompletionPolicy = CompletionPolicy.LAST_SUBMITTED,
+) -> BenchResult:
+    """Run one bench function against a fresh engine and summarise the outcome.
+
+    The engine comes from ``engine_factory(topology, device)``. ``bench_fn`` then
+    receives a RuntimeContext bound to that engine, and every handle it
+    submitted is waited on before the result is assembled.
+
+    Args:
+        topology: Compiled topology object (opaque here). Its ``spec`` is read
+            from ``topology.topology_obj`` when present, else from ``topology``.
+        bench_fn: Callable that receives the RuntimeContext and submits requests.
+        device: DeviceSelector ("all" or "sip:<N>").
+        engine_factory: Builds the sim engine for the given topology and device.
+        correlation_id: Id stored on the context and on the returned result.
+        completion_policy: How the overall completion is chosen.
+
+    Returns:
+        A BenchResult. With no submitted requests its completion is a failure
+        with error code ``NO_REQUESTS``. LAST_SUBMITTED reports the last handle;
+        ALL_OK_FAIL_FAST reports the first failing completion, or ``ok=True``
+        when none failed; LAST_COMPLETED currently behaves like LAST_SUBMITTED.
+        ``traces`` holds the entries the context collected, or ``None`` when
+        there are none.
+    """
+    engine = engine_factory(topology, device)
+    # A TopologyHandle wraps the graph in ``topology_obj``; a bare graph is used as is.
+    spec = getattr(getattr(topology, "topology_obj", topology), "spec", None)
+    ctx = RuntimeContext(
+        engine=engine, target_device=device,
+        correlation_id=correlation_id, spec=spec,
+    )
+
+    bench_fn(ctx)
+    # Drain whatever the bench left in flight before reading completions.
+    ctx.wait_all()
+
+    collected_traces = ctx._traces or None
+
+    def _result(completion: Completion, trace: Trace | None) -> BenchResult:
+        """Wrap a completion/trace pair with the run-wide ids and traces."""
+        return BenchResult(
+            completion=completion, correlation_id=correlation_id,
+            trace=trace, traces=collected_traces,
+        )
+
+    handles = ctx.handles()
+    if not handles:
+        no_requests = Completion(
+            ok=False, error_code="NO_REQUESTS", error_message="Bench submitted no requests"
+        )
+        return _result(no_requests, None)
+
+    if completion_policy == CompletionPolicy.ALL_OK_FAIL_FAST:
+        last_trace: Trace | None = None
+        for handle in handles:
+            completion, trace = engine.get_completion(handle)
+            # Remember the most recent non-None trace, including the failing one.
+            if trace is not None:
+                last_trace = trace
+            if not completion.ok:
+                return _result(completion, last_trace)
+        return _result(Completion(ok=True), last_trace)
+
+    # LAST_SUBMITTED, and LAST_COMPLETED until the engine exposes completion timing.
+    completion, trace = engine.get_completion(handles[-1])
+    return _result(completion, trace)
diff --git a/src/kernbench/runtime_api/context.py b/src/kernbench/runtime_api/context.py
new file mode 100644
index 0000000..e9cf270
--- /dev/null
+++ b/src/kernbench/runtime_api/context.py
@@ -0,0 +1,282 @@
+# kernbench/runtime_api/context.py
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+from kernbench.common.types import Completion, RequestHandle, SimEngine
+
+from .types import DeviceSelector
+
+
+@dataclass
+class RuntimeContext:
+    engine: SimEngine
+    target_device: DeviceSelector
+    correlation_id: str
+    spec: dict | None = None
+
+    _handles: list[RequestHandle] = field(default_factory=list, init=False)
+    _completed: set[RequestHandle] = field(default_factory=set, init=False)
+    _allocators: dict[int, Any] = field(default_factory=dict, init=False)
+    _tensor_counter: int = field(default=0, init=False)
+    _traces: list[dict] = field(default_factory=list, init=False)
+
+    def submit(self, request: Any) -> RequestHandle:
+        """Submit ``request`` to the engine and record the handle it returns."""
+        engine_submit = getattr(self.engine, "submit", None)
+        if engine_submit is None:
+            raise AttributeError("Engine does not implement submit(request) -> RequestHandle.")
+        self._handles.append(engine_submit(request))
+        return self._handles[-1]
+
+    def is_completed(self, handle: RequestHandle) -> bool:
+        return handle in self._completed  # membership in the set that wait() adds finished handles to
+
+    def wait(self, handle: RequestHandle, *, _meta: dict | None = None) -> Completion:
+        """Wait for ``handle`` (once) and return its completion; ``_meta`` tags its trace."""
+        first_wait = handle not in self._completed
+        if first_wait:
+            engine_wait = getattr(self.engine, "wait", None)
+            if engine_wait is not None:  # engines without wait() are only queried
+                engine_wait(handle)  # type: ignore[misc]
+        completion, trace = self.engine.get_completion(handle)
+        if not first_wait:
+            return completion
+        self._completed.add(handle)
+        if _meta is not None and trace is not None:
+            entry = dict(trace) if isinstance(trace, dict) else {"raw": trace}
+            entry.update(_meta)
+            self._traces.append(entry)
+        return completion
+
+    def wait_all(self) -> None:
+        """Wait on every submitted handle that has not completed yet, in order."""
+        for pending in (h for h in self._handles if h not in self._completed):
+            self.wait(pending)
+
+    def handles(self) -> list[RequestHandle]:
+        return list(self._handles)  # shallow copy: callers cannot mutate the internal list
+
+    # ── PyTorch-like tensor API ──────────────────────────────────────
+
+    def _ensure_allocators(self) -> dict:
+        """Build the per-PE allocator table from ``spec`` on first use, then reuse it.
+
+        Keys are flat PE indices, counted SIP-major, then cube, then PE:
+        ``sip_id * cubes_per_sip * pes_per_cube + cube_id * pes_per_cube + pe_id``.
+        Raises RuntimeError when the context has no ``spec``.
+        """
+        if self._allocators:
+            return self._allocators
+        if self.spec is None:
+            raise RuntimeError(
+                "RuntimeContext.spec is required for tensor operations. "
+                "Pass spec=graph.spec when creating RuntimeContext."
+            )
+        from kernbench.policy.address.allocator import AddressConfig, PEMemAllocator
+
+        cube = self.spec.get("cube", {})
+        sips = self.spec.get("system", {}).get("sips", {})
+        pe_layout = cube.get("pe_layout", {})
+        memory_map = cube.get("memory_map", {})
+        pe_components = cube.get("pe_template", {}).get("components", {})
+        tcm_attrs = pe_components.get("pe_tcm", {}).get("attrs", {})
+
+        # Topology sizes, with the defaults used when the spec omits a key.
+        sip_count = sips.get("count", 1)
+        cubes_per_sip = sips.get("cubes_per_sip", 16)
+        corners = pe_layout.get("corners", ["NW", "NE", "SW", "SE"])
+        pes_per_cube = pe_layout.get("pe_per_corner", 2) * len(corners)
+
+        cfg = AddressConfig(
+            sip_count=sip_count,
+            cubes_per_sip=cubes_per_sip,
+            pes_per_cube=pes_per_cube,
+            hbm_bytes_per_cube=memory_map.get("hbm_total_gb_per_cube", 48) * (1 << 30),
+            hbm_slices_per_cube=memory_map.get("hbm_slices_per_cube", 8),
+            tcm_bytes_per_pe=tcm_attrs.get("size_mb", 16) * (1 << 20),
+            tcm_scheduler_reserved_bytes=4 * (1 << 20),
+            sram_bytes_per_cube=32 * (1 << 20),
+        )
+        self._pes_per_cube = pes_per_cube
+        self._num_cubes = cubes_per_sip
+        self._num_sips = sip_count
+
+        flat_idx = 0  # running counter equals the flat-index formula in the docstring
+        for sip_id in range(sip_count):
+            for cube_id in range(cubes_per_sip):
+                for pe_id in range(pes_per_cube):
+                    self._allocators[flat_idx] = PEMemAllocator(
+                        rack_id=0, sip_id=sip_id, cube_id=cube_id, pe_id=pe_id, cfg=cfg,
+                    )
+                    flat_idx += 1
+        return self._allocators
+
+    def _next_tensor_name(self) -> str:
+        self._tensor_counter += 1  # incremented before use: the counter starts at 0, so names start at "t1"
+        return f"t{self._tensor_counter}"
+
+    def zeros(
+        self,
+        shape: tuple[int, ...],
+        dtype: str = "f16",
+        *,
+        placement: list | None = None,  # list of ShardSpec; defaults to one shard on PE 0
+        dp: Any = None,  # a DPPolicy here takes priority over placement=
+        name: str | None = None,  # auto-generated ("t1", "t2", …) when omitted
+    ):
+        """Create a tensor and deploy to HBM with zero-fill (like torch.zeros)."""
+        return self._create_tensor(shape, dtype, placement, name, pattern="zero", dp=dp)  # a non-None pattern submits one MemoryWriteMsg per shard
+
+    def empty(
+        self,
+        shape: tuple[int, ...],
+        dtype: str = "f16",
+        *,
+        placement: list | None = None,  # list of ShardSpec; defaults to one shard on PE 0
+        dp: Any = None,  # a DPPolicy here takes priority over placement=
+        name: str | None = None,  # auto-generated ("t1", "t2", …) when omitted
+    ):
+        """Allocate a tensor in HBM without initialization (like torch.empty)."""
+        return self._create_tensor(shape, dtype, placement, name, pattern=None, dp=dp)  # pattern=None: no MemoryWriteMsg is submitted
+
+    def _create_tensor(
+        self,
+        shape: tuple[int, ...],
+        dtype: str,
+        placement: list | None,
+        name: str | None,
+        pattern: str | None,
+        dp: Any = None,
+    ):
+        from kernbench.policy.placement.dp import DPPolicy, ShardSpec, resolve_dp_policy
+        from kernbench.runtime_api.kernel import MemoryWriteMsg
+        from kernbench.runtime_api.tensor import Tensor, deploy_tensor, dtype_itemsize
+
+        tensor_name = name or self._next_tensor_name()  # empty-string name also gets a generated one
+        t = Tensor(shape=shape, dtype=dtype, name=tensor_name)
+
+        dp_policy: DPPolicy | None = None
+
+        # Resolve placement: a DPPolicy given as dp= takes priority over placement=; any other dp value is ignored
+        if dp is not None and isinstance(dp, DPPolicy):
+            dp_policy = dp
+            allocators = self._ensure_allocators()  # result unused here; called before the topology counts below are read
+            itemsize = dtype_itemsize(dtype)
+            shape_2d = (shape[0], shape[1])  # type: tuple[int, int]
+            total_cubes = self._num_sips * self._num_cubes  # cubes across all SIPs
+            placement = resolve_dp_policy(  # only shape[0:2] is forwarded; a rank-1 shape raises IndexError above
+                dp, shape=shape_2d, itemsize=itemsize,
+                num_pe=self._pes_per_cube, num_cubes=total_cubes,
+            )
+        elif placement is None:
+            placement = [ShardSpec(pe_index=0, offset_bytes=0, nbytes=t.nbytes)]  # default: whole tensor on allocator index 0
+
+        # Infer target_pe from placement: multi-PE → "all", single PE → pe_index
+        pe_indices = {s.pe_index for s in placement}
+        target_pe: int | str = "all" if len(pe_indices) > 1 else next(iter(pe_indices))
+        t.to(placement=placement, target_pe=target_pe, dp_policy=dp_policy)
+
+        # Allocate PAs via PEMemAllocator
+        allocators = self._ensure_allocators()
+        handle = deploy_tensor(
+            name=tensor_name,
+            shape=shape,
+            dtype=dtype,
+            placement=placement,
+            allocators=allocators,
+        )
+        t._handle = handle
+
+        # Submit MemoryWriteMsg per shard (deploy data to device); each write is waited on before the next is submitted
+        if pattern is not None:
+            for shard in handle.shards:
+                h = self.submit(MemoryWriteMsg(
+                    correlation_id=self.correlation_id,
+                    request_id=f"deploy_{tensor_name}_pe{shard.pe}",  # NOTE(review): omits sip/cube, so shards with equal pe ids in different cubes share an id — confirm
+                    dst_sip=shard.sip, dst_cube=shard.cube, dst_pe=shard.pe,
+                    dst_pa=shard.pa, nbytes=shard.nbytes, pattern=pattern,
+                    target_cubes=(shard.cube,), target_pe=shard.pe,
+                ))
+                self.wait(h, _meta={
+                    "phase": "memory_write", "name": tensor_name,
+                    "sip": shard.sip, "cube": shard.cube, "pe": shard.pe,
+                    "nbytes": shard.nbytes,
+                })
+
+        return t
+
+    def launch(
+        self,
+        kernel_name: str,
+        kernel_fn: Any,
+        *args: Any,
+        **kwargs: Any,
+    ) -> RequestHandle:
+        """Register and launch a kernel (like a fused torch op).
+
+        Positional args: Tensor objects become TensorArg, int/float become ScalarArg.
+        Keyword args: become ScalarArg (name is discarded, order preserved).
+        """
+        from kernbench.runtime_api.kernel import (
+            KernelLaunchMsg,
+            KernelRef,
+            ScalarArg,
+        )
+        from kernbench.runtime_api.tensor import Tensor
+        from kernbench.triton_emu.registry import register_kernel
+
+        # Register kernel (idempotent): a ValueError from register_kernel is swallowed — presumably "already registered", confirm
+        try:
+            register_kernel(kernel_name, kernel_fn)
+        except ValueError:
+            pass
+
+        # Build kernel args from positional + keyword args
+        kernel_args: list = []
+        target_pe: int | str = 0  # stays 0 unless a tensor's DP metadata says otherwise
+
+        for a in args:
+            if isinstance(a, Tensor):
+                kernel_args.append(a.to_tensor_arg())  # raises RuntimeError if the tensor has no handle yet
+                # Infer target_pe from tensor DP metadata
+                if a._dp_metadata is not None:
+                    dp_target = a._dp_metadata.target_pe
+                    if dp_target == "all":
+                        target_pe = "all"
+                    elif isinstance(dp_target, int) and target_pe != "all":  # "all" is sticky; otherwise the last int seen wins
+                        target_pe = dp_target
+            elif isinstance(a, (int, float)):  # bool is an int subclass → "i32"; any other type is skipped silently
+                dtype_str = "f32" if isinstance(a, float) else "i32"
+                kernel_args.append(ScalarArg(dtype=dtype_str, value=a))
+
+        for v in kwargs.values():  # non-numeric values (Tensors included) are dropped without error
+            if isinstance(v, (int, float)):
+                dtype_str = "f32" if isinstance(v, float) else "i32"
+                kernel_args.append(ScalarArg(dtype=dtype_str, value=v))
+
+        # Determine target cubes from all tensor shards (positional tensors only; SIP ids are not part of the set)
+        cube_set: set[int] = set()
+        for a in args:
+            if isinstance(a, Tensor) and a._handle is not None:
+                for s in a._handle.shards:
+                    cube_set.add(s.cube)
+        target_cubes = tuple(sorted(cube_set)) if cube_set else (0,)  # falls back to cube 0 when no positional tensor was passed
+
+        # Collect scalar values for GEMM FLOP calculation
+        scalar_vals = [a.value for a in kernel_args if hasattr(a, "value")]
+
+        h = self.submit(KernelLaunchMsg(
+            correlation_id=self.correlation_id,
+            request_id=kernel_name,  # NOTE(review): launching the same kernel twice reuses this id — confirm acceptable
+            kernel_ref=KernelRef(name=kernel_name, kind="builtin"),
+            args=tuple(kernel_args),
+            target_cubes=target_cubes,
+            target_pe=target_pe,
+        ))
+        self.wait(h, _meta={
+            "phase": "kernel", "name": kernel_name,
+            "target_pe": target_pe, "scalars": scalar_vals,
+        })
+        return h
diff --git a/src/kernbench/runtime_api/kernel.py b/src/kernbench/runtime_api/kernel.py
new file mode 100644
index 0000000..433d976
--- /dev/null
+++ b/src/kernbench/runtime_api/kernel.py
@@ -0,0 +1,123 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Literal, TypeAlias
+
+
+@dataclass(frozen=True)
+class MemoryWriteMsg:
+    correlation_id: str
+    request_id: str
+    dst_sip: int  # GraphEngine._entry_points uses this to pick the PCIE_EP / IO_CPU pair
+    dst_cube: int
+    dst_pe: int
+    dst_pa: int  # encoded physical address; the tensor-creation path passes TensorShard.pa
+    nbytes: int
+    src_kind: Literal["pattern", "host_buffer_ref"] = "pattern"
+    pattern: str | None = None  # e.g. "zero", as sent for zero-filled tensors
+    target_cubes: tuple[int, ...] | Literal["all"] = "all"
+    target_pe: int | Literal["all"] = "all"
+    msg_type: Literal["memory_write"] = "memory_write"  # fixed tag identifying the message kind
+
+
+@dataclass(frozen=True)
+class MemoryReadMsg:
+    correlation_id: str
+    request_id: str
+    src_sip: int  # GraphEngine._entry_points uses this to pick the PCIE_EP / IO_CPU pair
+    src_cube: int
+    src_pe: int
+    src_pa: int  # presumably an encoded physical address like MemoryWriteMsg.dst_pa — confirm
+    nbytes: int
+    target_cubes: tuple[int, ...] | Literal["all"] = "all"
+    target_pe: int | Literal["all"] = "all"
+    msg_type: Literal["memory_read"] = "memory_read"  # fixed tag identifying the message kind
+
+
+@dataclass(frozen=True)
+class KernelRef:
+    """Reference to a kernel binary or builtin timing model.
+
+    Kernel binaries must be pre-deployed to device memory via MemoryWriteMsg.
+    KernelLaunchMsg references the deployed location by PA — source code or IR
+    MUST NOT be embedded in launch messages.
+
+    - "deployed": kernel binary pre-deployed to HBM/SRAM at deploy_pa.
+    - "builtin":  simulator built-in timing model, identified by name.
+    """
+
+    name: str
+    kind: Literal["deployed", "builtin"]  # the launch() helper in this patch always sends "builtin"
+    deploy_pa: int | None = None  # left at None by launch(); presumably required for "deployed" — confirm
+    deploy_sip: int = 0
+    deploy_cube: int = 0
+    deploy_pe: int = 0
+    nbytes_code: int = 0
+
+
+@dataclass(frozen=True)
+class TensorArgShard:  # same six fields as TensorShard; Tensor.to_tensor_arg() copies them one-to-one
+    sip: int  # GraphEngine._entry_points groups kernel-launch injections by this value
+    cube: int
+    pe: int
+    pa: int
+    nbytes: int
+    offset_bytes: int
+
+
+@dataclass(frozen=True)
+class TensorArg:
+    shards: tuple[TensorArgShard, ...]
+    arg_kind: Literal["tensor"] = "tensor"  # discriminator; GraphEngine._entry_points skips args whose kind is not "tensor"
+
+
+@dataclass(frozen=True)
+class ScalarArg:
+    dtype: str  # launch() emits "f32" for float values and "i32" for int values
+    value: float | int
+    arg_kind: Literal["scalar"] = "scalar"  # discriminator, counterpart of TensorArg.arg_kind
+
+
+KernelArg: TypeAlias = TensorArg | ScalarArg
+
+
+@dataclass(frozen=True)
+class KernelLaunchMsg:
+    correlation_id: str
+    request_id: str
+    kernel_ref: KernelRef
+    args: tuple[KernelArg, ...]  # the distinct shard.sip values of tensor args decide which SIPs GraphEngine injects into
+    target_cubes: tuple[int, ...] | Literal["all"] = "all"
+    target_pe: int | Literal["all"] = "all"
+    msg_type: Literal["kernel_launch"] = "kernel_launch"  # fixed tag identifying the message kind
+
+
+@dataclass(frozen=True)
+class ResponseMsg:
+    """Device→Host response carrying PE execution result."""
+
+    correlation_id: str
+    request_id: str
+    src_cube: int  # NOTE(review): there is no src_sip field, unlike the request messages — confirm intended
+    src_pe: int
+    success: bool  # NOTE(review): no error code/message field accompanies a failure — confirm where detail travels
+    msg_type: Literal["response"] = "response"
+
+
+@dataclass(frozen=True)
+class PeDmaMsg:
+    """Direct PE DMA request: host injects a transfer at PE_DMA level.
+
+    Used by the probe utility to measure PE→HBM latency without requiring
+    the full PE_CPU → scheduler → DMA pipeline.
+    """
+
+    correlation_id: str
+    request_id: str
+    src_sip: int
+    src_cube: int
+    src_pe: int  # with src_sip/src_cube forms the node id "sip{S}.cube{C}.pe{P}.pe_dma" in GraphEngine
+    dst_pa: int  # GraphEngine decodes this with PhysAddr.decode() and resolves it to the destination node
+    nbytes: int
+    is_write: bool = False  # NOTE(review): not read by GraphEngine._process_pe_dma; presumably used by the PE_DMA component — confirm
+    msg_type: Literal["pe_dma"] = "pe_dma"
diff --git a/src/kernbench/runtime_api/tensor.py b/src/kernbench/runtime_api/tensor.py
new file mode 100644
index 0000000..26d4749
--- /dev/null
+++ b/src/kernbench/runtime_api/tensor.py
@@ -0,0 +1,166 @@
+from __future__ import annotations
+
+import math
+from dataclasses import dataclass
+from typing import Literal
+
+from kernbench.policy.address.allocator import PEMemAllocator
+from kernbench.policy.placement.dp import DPPolicy, ShardSpec
+from kernbench.runtime_api.kernel import TensorArg, TensorArgShard
+
+
+@dataclass(frozen=True)
+class TensorShard:
+    sip: int
+    cube: int
+    pe: int
+    pa: int  # integer from .encode() on the allocator's result (see deploy_tensor)
+    nbytes: int
+    offset_bytes: int  # copied from the ShardSpec this shard was allocated for
+
+
+@dataclass(frozen=True)
+class TensorHandle:
+    name: str
+    shape: tuple[int, ...]
+    dtype: str
+    itemsize: int  # bytes per element, from dtype_itemsize(dtype)
+    shards: tuple[TensorShard, ...]  # one entry per placement ShardSpec, in placement order
+
+    @property
+    def nbytes(self) -> int:
+        return math.prod(self.shape) * self.itemsize  # logical size from shape, independent of the shards' nbytes
+
+
+_DTYPE_ITEMSIZE = {
+    "fp16": 2, "float16": 2, "f16": 2,
+    "fp32": 4, "float32": 4, "f32": 4,
+    "bf16": 2,
+    "int8": 1, "i8": 1,
+    "int16": 2, "i16": 2,
+    "int32": 4, "i32": 4,
+}
+
+
+def dtype_itemsize(dtype: str) -> int:
+    if (size := _DTYPE_ITEMSIZE.get(dtype)) is None:  # bytes per element; None means the name is unknown
+        raise ValueError(f"unsupported dtype: {dtype}")
+    return size
+
+
+def deploy_tensor(
+    *,
+    name: str,
+    shape: tuple[int, ...],
+    dtype: str,
+    placement: list[ShardSpec],
+    allocators: dict[int, PEMemAllocator],
+    mem_kind: Literal["hbm", "tcm"] = "hbm",
+) -> TensorHandle:
+    """Allocate one region per placement entry and describe the result.
+
+    Each ShardSpec is served by ``allocators[spec.pe_index]``; the returned
+    TensorHandle lists the shards in placement order. An unsupported dtype
+    raises ValueError before anything is allocated.
+    """
+    itemsize = dtype_itemsize(dtype)
+    use_hbm = mem_kind == "hbm"  # every other value selects the TCM allocator
+    deployed: list[TensorShard] = []
+    for spec in placement:
+        pe_alloc = allocators[spec.pe_index]
+        region = pe_alloc.alloc_hbm(spec.nbytes) if use_hbm else pe_alloc.alloc_tcm(spec.nbytes)
+        # The allocator's private id fields supply the shard coordinates.
+        shard = TensorShard(
+            sip=pe_alloc._sip_id, cube=pe_alloc._cube_id, pe=pe_alloc._pe_id,
+            pa=region.encode(), nbytes=spec.nbytes, offset_bytes=spec.offset_bytes,
+        )
+        deployed.append(shard)
+    handle = TensorHandle(
+        name=name, shape=shape, dtype=dtype,
+        itemsize=itemsize, shards=tuple(deployed),
+    )
+    return handle
+
+
+# ── PyTorch-like Tensor API ──────────────────────────────────────────
+
+
+@dataclass(frozen=True)
+class DPMetadata:
+    """Data-parallel placement metadata (stored as Tensor._dp_metadata)."""
+
+    placement: list[ShardSpec]  # list field: frozen only blocks rebinding, and hash() of an instance raises TypeError
+    dp_policy: DPPolicy | None = None  # None unless the placement was derived from a DPPolicy
+    sip: int = 0
+    cube: int = 0
+    target_pe: int | str = 0  # int → single PE, "all" → all PEs
+
+
+class Tensor:
+    """PyTorch-like tensor for benchmark code.
+
+    Usage::
+
+        a = ctx.zeros((M, K), dtype="f16")
+        a = ctx.zeros((M, K), dtype="f16", placement=dp.replicate(num_pe=8))
+        ctx.launch("kernel_name", kernel_fn, a, b, out, M=M, K=K)
+    """
+
+    def __init__(
+        self,
+        shape: tuple[int, ...],
+        dtype: str = "f16",
+        name: str = "",
+    ) -> None:
+        self.shape = shape
+        self.dtype = dtype  # not validated here; itemsize/nbytes raise ValueError for an unsupported name
+        self.name = name
+        self._dp_metadata: DPMetadata | None = None  # filled in by to()
+        self._handle: TensorHandle | None = None  # assigned from outside once deploy_tensor() has run
+
+    @property
+    def itemsize(self) -> int:
+        return dtype_itemsize(self.dtype)
+
+    @property
+    def nbytes(self) -> int:
+        return math.prod(self.shape) * self.itemsize  # math.prod(()) is 1, so an empty shape counts as one element
+
+    @property
+    def pa(self) -> int:
+        """Primary PA (first shard). Used as kernel pointer argument."""
+        if self._handle is None or not self._handle.shards:  # a handle with zero shards is also treated as undeployed
+            raise RuntimeError(f"Tensor '{self.name}' is not deployed yet")
+        return self._handle.shards[0].pa
+
+    def to(
+        self,
+        placement: list[ShardSpec] | None = None,
+        *,
+        dp_policy: DPPolicy | None = None,
+        sip: int = 0,
+        cube: int = 0,
+        target_pe: int | str = 0,
+    ) -> Tensor:
+        """Set DP placement metadata (like torch.Tensor.to())."""
+        if placement is None:
+            placement = [ShardSpec(pe_index=0, offset_bytes=0, nbytes=self.nbytes)]  # default: one shard covering the whole tensor
+        self._dp_metadata = DPMetadata(  # metadata only: nothing is allocated or moved here
+            placement=placement, dp_policy=dp_policy,
+            sip=sip, cube=cube, target_pe=target_pe,
+        )
+        return self  # the same object, mutated in place (not a copy)
+
+    def to_tensor_arg(self) -> TensorArg:
+        """Convert deployed shards to KernelLaunchMsg TensorArg."""
+        if self._handle is None:
+            raise RuntimeError(f"Tensor '{self.name}' is not deployed yet")
+        return TensorArg(
+            shards=tuple(
+                TensorArgShard(
+                    sip=s.sip, cube=s.cube, pe=s.pe,
+                    pa=s.pa, nbytes=s.nbytes, offset_bytes=s.offset_bytes,
+                )
+                for s in self._handle.shards
+            ),
+        )
diff --git a/src/kernbench/runtime_api/types.py b/src/kernbench/runtime_api/types.py
new file mode 100644
index 0000000..00072a0
--- /dev/null
+++ b/src/kernbench/runtime_api/types.py
@@ -0,0 +1,71 @@
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+
+from kernbench.common.types import Completion, Trace
+
+
+@dataclass(frozen=True)
+class BenchResult:
+    completion: Completion
+    correlation_id: str
+    trace: Trace | None = None
+    traces: list[dict] | None = None
+
+    def summary_text(self) -> str:
+        if self.completion.ok:
+            return f"[OK] correlation_id={self.correlation_id}"
+        code = self.completion.error_code or "ERROR"  # a missing/empty code is reported as "ERROR"
+        msg = self.completion.error_message or ""
+        return f"[FAIL:{code}] correlation_id={self.correlation_id} {msg}".rstrip()  # rstrip removes the trailing space left by an empty message
+
+
+@dataclass(frozen=True)
+class DeviceSelector:
+    """
+    Device selector.
+
+    Supported:
+      - "all"      : all SIPs in the tray topology
+      - "sip:<N>"  : a single SIP index
+    """
+
+    raw: str  # "all" or "sip:<N>"
+
+    @property
+    def is_all(self) -> bool:
+        return self.raw == "all"  # exact match: no stripping or case folding is done here
+
+    @property
+    def sip_index(self) -> int:
+        if self.is_all:
+            raise ValueError("DeviceSelector is 'all'; no single sip_index.")
+        m = re.fullmatch(r"sip:(\d+)", self.raw)  # validated on access; the constructor itself accepts any string
+        if not m:
+            raise ValueError(
+                f"Invalid device '{self.raw}'. Expected 'all' or 'sip:<N>' (e.g., sip:0)."
+            )
+        return int(m.group(1))
+
+
+def resolve_device(raw: str | None) -> DeviceSelector:
+    """Turn the CLI --device value into a DeviceSelector.
+
+    A missing or blank value selects "all". Anything else is stripped and
+    lower-cased, and must then be exactly "all" or "sip:<N>".
+
+    Raises ValueError for any other input.
+    """
+    # Normalise once; None is treated like an empty string.
+    raw = "" if raw is None else raw.strip().lower()
+
+    # Blank input and an explicit "all" both mean every SIP.
+    if raw in ("", "all"):
+        return DeviceSelector(raw="all")
+
+    # Everything else has to be "sip:" followed by digits.
+    if re.fullmatch(r"sip:(\d+)", raw) is None:
+        raise ValueError(f"Invalid device '{raw}'. Expected 'all' or 'sip:<N>' (e.g., sip:0).")
+
+    return DeviceSelector(raw=raw)
diff --git a/src/kernbench/sim_engine/dummy.py b/src/kernbench/sim_engine/dummy.py
new file mode 100644
index 0000000..52f2d31
--- /dev/null
+++ b/src/kernbench/sim_engine/dummy.py
@@ -0,0 +1,31 @@
+# kernbench/sim_engine/dummy.py
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any
+
+from kernbench.common.types import Completion, RequestHandle, SimEngine, Trace
+
+
+@dataclass
+class DummyEngine(SimEngine):
+    topology: object
+    device_raw: str
+    _n: int = 0  # number of requests submitted so far; used to build handle names
+    _store: dict[str, tuple[Completion, Trace | None]] = None  # type: ignore
+
+    def __post_init__(self) -> None:
+        self._store = {}  # replaces the None placeholder (and any value passed to __init__)
+
+    def submit(self, request: Any) -> RequestHandle:
+        self._n += 1
+        h = RequestHandle(f"h{self._n}")
+        # Request handling / simulation / scheduling would be performed here
+        self._store[str(h)] = (Completion(ok=True), {"request": request, "device": self.device_raw})
+        return h
+
+    def get_completion(self, handle: RequestHandle) -> tuple[Completion, Trace | None]:
+        return self._store[str(handle)]  # KeyError for a handle this engine did not issue
+
+    def wait(self, handle: RequestHandle) -> None:
+        pass  # nothing to wait for: submit() has already stored the completion
diff --git a/src/kernbench/sim_engine/engine.py b/src/kernbench/sim_engine/engine.py
new file mode 100644
index 0000000..962730e
--- /dev/null
+++ b/src/kernbench/sim_engine/engine.py
@@ -0,0 +1,298 @@
+from __future__ import annotations
+
+from typing import Any
+
+import simpy
+
+from kernbench.common.types import Completion, RequestHandle, Trace
+import kernbench.components.impls  # noqa: F401 — registers built-in implementations
+from kernbench.components.base import ComponentBase, ComponentRegistry
+from kernbench.components.context import ComponentContext
+from kernbench.policy.address.phyaddr import PhysAddr
+from kernbench.policy.routing.router import AddressResolver, PathRouter
+from kernbench.runtime_api.kernel import KernelLaunchMsg, MemoryReadMsg, MemoryWriteMsg, PeDmaMsg
+from kernbench.sim_engine.transaction import Transaction
+from kernbench.topology.types import Edge, TopologyGraph
+
+
+class GraphEngine:
+    """simpy-based discrete-event simulation engine.
+
+    Phase B: engine injects a Transaction into the PCIE_EP host queue for
+    each request. Components handle their own routing:
+      Path 1: PCIE_EP → IO_CPU   (engine-computed path, pre-loaded in Transaction)
+      Path 2: IO_CPU → M_CPU     (IO_CPU dispatches, fire-and-forget callback)
+      Path 3: M_CPU.DMA → HBM   (M_CPU dispatches, fire-and-forget callback)
+
+    Component implementations are DI-injectable via component_overrides (ADR-0007 D3).
+    """
+
+    def __init__(
+        self,
+        graph: TopologyGraph,
+        *,
+        component_overrides: dict[str, type[ComponentBase]] | None = None,
+    ) -> None:
+        self._env = simpy.Environment()
+        self._resolver = AddressResolver(graph)
+        self._router = PathRouter(graph)
+        self._nodes = graph.nodes
+        self._edge_map: dict[tuple[str, str], Edge] = {}
+        for e in graph.edges:
+            self._edge_map[(e.src, e.dst)] = e  # a later edge with the same (src, dst) replaces an earlier one
+        self._ns_per_mm: float = graph.spec.get("system", {}).get("ns_per_mm", 0.01)  # 0.01 when the spec has no system.ns_per_mm
+        self._results: dict[str, tuple[Completion, Trace]] = {}
+        self._events: dict[str, simpy.Event] = {}
+        self._counter = 0
+        overrides = component_overrides or {}
+        ctx = ComponentContext(
+            router=self._router,
+            resolver=self._resolver,
+            positions={nid: n.pos_mm for nid, n in graph.nodes.items()},
+            ns_per_mm=self._ns_per_mm,
+            edge_map=self._edge_map,
+            spec=graph.spec,
+        )
+        self._components: dict[str, ComponentBase] = {
+            node_id: ComponentRegistry.create(node, overrides, ctx)
+            for node_id, node in graph.nodes.items()
+        }
+
+        # Wire ports: one Store per directed edge (ADR-0015 D1)
+        for e in graph.edges:
+            src_comp = self._components.get(e.src)
+            dst_comp = self._components.get(e.dst)
+            if src_comp is None or dst_comp is None:
+                continue
+            store: simpy.Store = simpy.Store(self._env)  # the same object becomes src out_port and dst in_port
+            src_comp.out_ports[e.dst] = store
+            dst_comp.in_ports[e.src] = store
+
+        # Wire processes: propagation delay per edge (ADR-0015 D2)
+        # Cut-through (wormhole) model: wires apply propagation only.
+        # Serialization (drain) is computed per-path and applied once at the terminal.
+        for e in graph.edges:
+            src_comp = self._components.get(e.src)
+            dst_comp = self._components.get(e.dst)
+            if src_comp is None or dst_comp is None:
+                continue
+            prop_ns = e.distance_mm * self._ns_per_mm
+            self._env.process(  # NOTE(review): both ports passed below are the one shared Store, so _wire() gets from and puts back into it — confirm intended
+                self._wire(src_comp.out_ports[e.dst], dst_comp.in_ports[e.src],
+                           prop_ns)
+            )
+
+        # Attach host queues to PCIE_EP in_ports before start() (ADR-0015 D3)
+        self._host_queues: dict[str, simpy.Store] = {}
+        for pcie_ep_id in self._resolver.find_all_pcie_eps():
+            host_q: simpy.Store = simpy.Store(self._env)
+            self._components[pcie_ep_id].in_ports["host"] = host_q
+            self._host_queues[pcie_ep_id] = host_q
+
+        # Attach host queues to PE_DMA nodes for direct PE DMA injection
+        self._pe_dma_queues: dict[str, simpy.Store] = {}
+        for node_id, node in graph.nodes.items():
+            if node.kind == "pe_dma":
+                host_q = simpy.Store(self._env)
+                self._components[node_id].in_ports["host"] = host_q
+                self._pe_dma_queues[node_id] = host_q
+
+        # Start components after all ports are wired (ADR-0015 D3)
+        for comp in self._components.values():
+            comp.start(self._env)
+
+    def submit(self, request: Any) -> RequestHandle:
+        self._counter += 1
+        handle = RequestHandle(f"h{self._counter}")
+        event = self._env.event()
+        self._events[str(handle)] = event
+        self._env.process(self._process(str(handle), request, event))  # only scheduled; in this class the clock advances in wait()
+        return handle
+
+    def wait(self, handle: RequestHandle) -> None:
+        key = str(handle)
+        event = self._events[key]
+        if not event.triggered:
+            self._env.run(until=event)  # advances the shared simpy clock until this request's event fires
+
+    def get_completion(self, handle: RequestHandle) -> tuple[Completion, Trace | None]:
+        return self._results[str(handle)]  # KeyError until the request's process has stored its result
+
+    # ── internal ────────────────────────────────────────────────────
+
+    def _wire(
+        self,
+        out_port: simpy.Store,
+        in_port: simpy.Store,
+        prop_ns: float,
+    ):
+        """SimPy process: relay messages with propagation delay only.
+
+        Cut-through (wormhole) model: serialization (drain) is computed per-path
+        and applied once at the terminal component, not at every wire hop.
+        """
+        while True:
+            msg = yield out_port.get()
+            if prop_ns > 0:  # zero-length edges relay without yielding a timeout
+                yield self._env.timeout(prop_ns)
+            yield in_port.put(msg)
+
+    def _process(self, key: str, request: Any, done: simpy.Event):
+        if isinstance(request, PeDmaMsg):
+            yield from self._process_pe_dma(key, request, done)
+            return
+
+        entries = self._entry_points(request)
+        if not entries:  # e.g. a KernelLaunchMsg without tensor args: completes at once with zero cost
+            self._results[key] = (
+                Completion(ok=True),
+                {"total_ns": 0.0, "nbytes": 0},
+            )
+            done.succeed()
+            return
+
+        start_ns = self._env.now
+        total_nbytes = 0
+
+        root_txn: Transaction | None = None
+        if len(entries) == 1:
+            # Single-SIP: direct inject (common path, no extra events)
+            pcie_ep_id, io_cpu_id, nbytes = entries[0]
+            total_nbytes = nbytes
+            path = self._router.find_node_path(pcie_ep_id, io_cpu_id)
+            txn_done = self._env.event()
+            txn = Transaction(request=request, path=path, step=0, nbytes=nbytes, done=txn_done)
+            root_txn = txn
+            yield self._host_queues[pcie_ep_id].put(txn)
+            yield txn_done
+        else:
+            # Multi-SIP: inject per SIP, aggregate completions (ADR-0007)
+            sub_dones: list[simpy.Event] = []
+            sub_txns: list[Transaction] = []
+            for pcie_ep_id, io_cpu_id, nbytes in entries:
+                total_nbytes = max(total_nbytes, nbytes)  # reports the largest per-SIP entry, not the sum
+                path = self._router.find_node_path(pcie_ep_id, io_cpu_id)
+                txn_done = self._env.event()
+                txn = Transaction(
+                    request=request, path=path, step=0,
+                    nbytes=nbytes, done=txn_done,
+                )
+                yield self._host_queues[pcie_ep_id].put(txn)
+                sub_dones.append(txn_done)
+                sub_txns.append(txn)
+            for sd in sub_dones:  # waits for every SIP; elapsed time is set by the slowest one
+                yield sd
+            # Aggregate pe_exec_ns from multi-SIP (max)
+            pe_vals = [st.result_data.get("pe_exec_ns") for st in sub_txns]
+            pe_vals = [v for v in pe_vals if v is not None]
+            if pe_vals:
+                if root_txn is None:  # always true here: root_txn is only assigned in the single-SIP branch
+                    root_txn = sub_txns[0]
+                root_txn.result_data["pe_exec_ns"] = max(pe_vals)
+
+        total_ns = self._env.now - start_ns
+        result_trace: dict[str, Any] = {"total_ns": total_ns, "nbytes": total_nbytes}
+        if root_txn is not None and root_txn.result_data:  # multi-SIP without any pe_exec_ns leaves root_txn None, so no result_data is merged
+            result_trace.update(root_txn.result_data)
+        self._results[key] = (
+            Completion(ok=True),  # NOTE(review): always ok=True; no failure path is recorded here
+            result_trace,
+        )
+        done.succeed()
+
+    def _process_pe_dma(self, key: str, request: PeDmaMsg, done: simpy.Event):
+        """Inject a Transaction directly at PE_DMA for PE→HBM latency measurement."""
+        pe_prefix = f"sip{request.src_sip}.cube{request.src_cube}.pe{request.src_pe}"
+        pe_dma_id = f"{pe_prefix}.pe_dma"
+        pa = PhysAddr.decode(request.dst_pa)
+        dst_node = self._resolver.resolve(pa)
+        path = self._router.find_path(pe_prefix, dst_node)  # NOTE(review): routed from the PE prefix while the txn is injected at pe_dma_id — confirm path[0] matches
+        drain_ns = self._path_drain_ns(path, request.nbytes)
+
+        start_ns = self._env.now
+        txn_done = self._env.event()
+        txn = Transaction(request=request, path=path, step=0, nbytes=request.nbytes,
+                          done=txn_done, drain_ns=drain_ns)
+        yield self._pe_dma_queues[pe_dma_id].put(txn)  # KeyError if the topology has no such pe_dma node
+        yield txn_done
+        total_ns = self._env.now - start_ns
+        formula_ns = self._formula_latency(path, request.nbytes)
+        self._results[key] = (
+            Completion(ok=True),
+            {"total_ns": total_ns, "formula_ns": formula_ns, "nbytes": request.nbytes},
+        )
+        done.succeed()
+
+    def _path_drain_ns(self, path: list[str], nbytes: int) -> float:
+        """Wormhole drain time: nbytes / bottleneck_bw along path."""
+        min_bw = float("inf")
+        for i in range(len(path) - 1):
+            edge = self._edge_map.get((path[i], path[i + 1]))
+            if edge and edge.bw_gbs:  # hops with no edge or a falsy bandwidth do not constrain the bottleneck
+                min_bw = min(min_bw, edge.bw_gbs)
+        if min_bw == float("inf"):
+            return 0.0
+        return nbytes / min_bw  # presumably bytes / (GB/s) → ns — confirm the unit of bw_gbs
+
+    def _formula_latency(self, path: list[str], nbytes: int) -> float:
+        """Lower-bound formula latency (ADR-0015 D7).
+
+        formula = Σ(wire propagation) + Σ(component overhead_ns) + drain_ns
+
+        Phase 0: formula == actual (no contention).
+        Phase 1+: formula <= actual (contention adds queueing).
+        """
+        total = 0.0
+        # Wire propagation delays
+        for i in range(len(path) - 1):
+            edge = self._edge_map.get((path[i], path[i + 1]))
+            if edge:
+                total += edge.distance_mm * self._ns_per_mm
+        # Component overhead_ns
+        for node_id in path:
+            node = self._nodes.get(node_id)
+            if node:
+                total += float(node.attrs.get("overhead_ns", 0.0))  # nodes without the attribute contribute 0
+        # Drain
+        total += self._path_drain_ns(path, nbytes)
+        return total
+
+    def _entry_points(self, request: Any) -> list[tuple[str, str, int]]:
+        """Return list of (pcie_ep_id, io_cpu_id, nbytes) per target SIP.
+
+        For Memory{Write,Read}: single SIP entry.
+        For KernelLaunchMsg: one entry per distinct SIP in tensor shards.
+        """
+        if isinstance(request, MemoryWriteMsg):
+            sip = request.dst_sip
+            return [(
+                self._resolver.find_pcie_ep(sip),
+                self._resolver.find_io_cpu(sip),
+                request.nbytes,
+            )]
+
+        if isinstance(request, MemoryReadMsg):
+            sip = request.src_sip
+            return [(
+                self._resolver.find_pcie_ep(sip),
+                self._resolver.find_io_cpu(sip),
+                request.nbytes,
+            )]
+
+        if isinstance(request, KernelLaunchMsg):
+            seen: set[int] = set()
+            entries: list[tuple[str, str, int]] = []
+            for arg in request.args:
+                if arg.arg_kind != "tensor":
+                    continue
+                for shard in arg.shards:
+                    if shard.sip not in seen:
+                        seen.add(shard.sip)
+                        entries.append((
+                            self._resolver.find_pcie_ep(shard.sip),
+                            self._resolver.find_io_cpu(shard.sip),
+                            shard.nbytes,  # size of the first shard seen on that SIP only
+                        ))
+            return entries
+
+        raise ValueError(f"unsupported request type: {type(request)}")
diff --git a/src/kernbench/sim_engine/transaction.py b/src/kernbench/sim_engine/transaction.py
new file mode 100644
index 0000000..8341aa5
--- /dev/null
+++ b/src/kernbench/sim_engine/transaction.py
@@ -0,0 +1,49 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+import simpy
+
+
+@dataclass
+class Transaction:
+    """In-flight request traversing the device fabric hop-by-hop (ADR-0015 D4).
+
+    A Transaction carries a host request through one leg of the device fabric.
+    Each component on the path reads from its in_port, processes (overhead_ns or
+    other latency), and advances the Transaction to the next hop via out_port.
+    Wire processes (ADR-0015 D2) model propagation delay between hops.
+
+    Multi-leg flows (e.g. IO_CPU → M_CPU as leg 1, M_CPU.DMA → HBM as leg 2)
+    use separate Transactions: the terminal component of leg 1 creates leg 2
+    and waits for leg 2's done before succeeding leg 1's done.
+    """
+
+    request: Any            # original host request (MemoryReadMsg, KernelLaunchMsg, …)
+    path: list[str]         # node_id sequence for this leg
+    step: int               # index of the component currently holding this Transaction
+    nbytes: int             # payload size (bytes)
+    done: simpy.Event       # succeeded when this leg completes
+    drain_ns: float = 0.0   # wormhole drain time: nbytes / bottleneck_bw (applied once at terminal)
+    is_response: bool = False  # True when carrying ResponseMsg on reverse path
+    result_data: dict[str, Any] = field(default_factory=dict)  # PE-level metrics (pe_exec_ns, etc.)
+
+    @property
+    def next_hop(self) -> str | None:
+        """Node id of the next component, or None if this is the terminal hop."""
+        nxt = self.step + 1
+        return self.path[nxt] if nxt < len(self.path) else None
+
+    def advance(self) -> Transaction:
+        """Return a copy of this Transaction advanced one step along the path."""
+        return Transaction(
+            request=self.request,
+            path=self.path,  # same list object, not a copy
+            step=self.step + 1,  # not bounds-checked; may run past the end of path
+            nbytes=self.nbytes,
+            done=self.done,  # shared event: the copy completes the same leg
+            drain_ns=self.drain_ns,
+            is_response=self.is_response,
+            result_data=self.result_data,  # shared dict: writes through a copy are visible via the original
+        )
diff --git a/src/kernbench/topology/__init__.py b/src/kernbench/topology/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/kernbench/topology/builder.py b/src/kernbench/topology/builder.py
new file mode 100644
index 0000000..49f1aa8
--- /dev/null
+++ b/src/kernbench/topology/builder.py
@@ -0,0 +1,965 @@
+# kernbench/topology/builder.py
+"""
+Topology compiler: parses topology.yaml and produces a fully-instantiated
+TopologyGraph with nodes, edges, and representative view projections.
+"""
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+import yaml
+
+from .types import Edge, Node, TopologyGraph, TopologyHandle, ViewGraph
+
+
+# (dx, dy) offsets from the PE center for each PE template component (small, intra-PE distances ~0.5mm)
+_PE_COMP_OFFSETS = {
+    "pe_cpu": (-0.3, 0.0),
+    "pe_scheduler": (-0.15, 0.0),
+    "pe_dma": (0.0, -0.15),
+    "pe_gemm": (0.0, 0.0),  # sits exactly on the PE center
+    "pe_math": (0.0, 0.15),
+    "pe_tcm": (0.3, 0.0),
+}
+
+
+# ── Public API ───────────────────────────────────────────────────────
+
+
+def resolve_topology(path_str: str) -> TopologyHandle:
+    """Resolve *path_str* to an existing file, compile it, and wrap the result in a TopologyHandle."""
+    p = Path(path_str).expanduser().resolve()  # expand "~" and make the path absolute
+    if not p.exists():
+        raise FileNotFoundError(f"Topology file not found: {p}")
+    if not p.is_file():  # exists but is not a regular file, e.g. a directory
+        raise ValueError(f"Topology path is not a file: {p}")
+    graph = load_topology(p)
+    return TopologyHandle(path=p, topology_obj=graph)
+
+
+def load_topology(path: Path) -> TopologyGraph:
+    """Read the YAML spec at *path*, pass it through _validate_spec, and compile it into a TopologyGraph."""
+    spec = _read_spec(path)
+    _validate_spec(spec)
+    return _compile_graph(spec)
+
+
+def _read_spec(path: Path) -> dict[str, Any]:
+    """Parse the YAML file at *path* and return its root mapping; raise ValueError if unparsable, empty, or not a mapping."""
+    try:
+        with path.open("r", encoding="utf-8") as f:
+            data = yaml.safe_load(f)
+    except yaml.YAMLError as e:
+        msg = f"Failed to parse YAML topology: {path}"
+        mark = getattr(e, "problem_mark", None)  # position info is treated as optional
+        if mark is not None:
+            msg += f" (line {mark.line + 1}, column {mark.column + 1})"  # +1: PyYAML marks are 0-based
+        raise ValueError(msg) from e
+    # Parsing succeeded: now reject a document with no content or a non-mapping root.
+    if data is None:
+        raise ValueError(f"Topology YAML is empty: {path}")
+    if not isinstance(data, dict):
+        raise ValueError(
+            f"Topology YAML root must be a mapping/dict: {path} (got {type(data).__name__})"
+        )
+    return data
+
+
+def _validate_spec(spec: dict) -> None:  # TODO: full schema validation; only top-level sections are checked
+    if missing := [k for k in ("system", "sip", "cube") if k not in spec]:
+        raise ValueError(f"Topology spec is missing required top-level section(s): {', '.join(missing)}")
+
+
+# ── Graph Compiler ───────────────────────────────────────────────────
+
+
+def _compile_graph(spec: dict) -> TopologyGraph:
+    """Instantiate every node and edge of the flat graph from *spec* and attach the four view projections."""
+    nodes: dict[str, Node] = {}
+    edges: list[Edge] = []
+
+    system = spec["system"]
+    sip_spec = spec["sip"]
+    cube_spec = spec["cube"]
+
+    mesh_w = sip_spec["cube_mesh"]["w"]
+    mesh_h = sip_spec["cube_mesh"]["h"]
+    cube_w = cube_spec["geometry"]["cube_mm"]["w"]
+    cube_h = cube_spec["geometry"]["cube_mm"]["h"]
+    seam = sip_spec["links"]["inter_cube_mesh"]["distance_mm_across_seam"]
+    stride_x = cube_w + seam  # origin-to-origin pitch between horizontally adjacent cubes
+    stride_y = cube_h + seam  # origin-to-origin pitch between vertically adjacent cubes
+
+    # System-level (a single fabric switch, created once)
+    _instantiate_system(nodes, system)
+
+    # Per-SIP: the same sip/cube spec is instantiated once per SIP under the "sip{N}" id prefix
+    for sip_id in range(system["sips"]["count"]):
+        sp = f"sip{sip_id}"
+
+        # IO chiplets
+        _instantiate_io_chiplets(
+            nodes, edges, sp, sip_spec,
+            cube_w, cube_h, mesh_w, mesh_h, seam,
+        )
+
+        # Cubes + PEs (row-major cube ids; origin does not depend on sip_id, so positions are SIP-local)
+        for row in range(mesh_h):
+            for col in range(mesh_w):
+                cid = row * mesh_w + col
+                cp = f"{sp}.cube{cid}"
+                origin = (col * stride_x, row * stride_y)
+                _instantiate_cube(nodes, edges, cp, cube_spec, origin)
+
+        # Inter-cube UCIe mesh
+        _add_inter_cube_edges(edges, sp, mesh_w, mesh_h, sip_spec)
+
+        # IO → cube UCIe
+        _add_io_to_cube_edges(edges, sp, sip_spec, mesh_w)
+
+        # Switch → IO pcie_ep
+        _add_system_to_io_edges(edges, sp, sip_spec, system)
+
+    # Build views (each is derived from the spec alone, independent of the flat nodes/edges above)
+    return TopologyGraph(
+        spec=spec,
+        nodes=nodes,
+        edges=edges,
+        system_view=_build_system_view(spec),
+        sip_view=_build_sip_view(spec),
+        cube_view=_build_cube_view(spec),
+        pe_view=_build_pe_view(spec),
+    )
+
+
+# ── Layout helpers ───────────────────────────────────────────────────
+
+
+def _cube_local_positions(cube_w: float, cube_h: float) -> dict[str, tuple[float, float]]:
+    """Map each cube-internal component name to its (x, y) position relative to the cube origin (0,0) at top-left."""
+    cx, cy = cube_w / 2, cube_h / 2  # cube center
+    # UCIe node half-sizes (default 2.0×1.2mm) — inset so edges touch boundary
+    uh = 0.6   # half height
+    uw = 1.0   # half width
+    return {
+        "ucie-N": (cx, uh),
+        "ucie-S": (cx, cube_h - uh),
+        "ucie-W": (uw, cy),
+        "ucie-E": (cube_w - uw, cy),
+        "m_cpu": (cube_w - 2.5, cy - 1.5),
+        "xbar.top": (cx, 3.5),       # Y reference for top-half xbar.pe nodes
+        "hbm_ctrl": (cx - 2.0, cy),
+        "xbar.bottom": (cx, cube_h - 3.5),  # Y reference for bottom-half xbar.pe nodes
+        "bridge.left": (2.5, cy + 2.0),
+        "bridge.right": (cube_w - 2.5, cy + 2.0),
+        "noc": (cx + 2.0, cy),
+        "sram": (2.5, cy - 1.5),
+    }
+
+
+def _corner_pe_positions(cube_w: float, cube_h: float) -> dict[str, list[tuple[float, float]]]:
+    """Return exactly two PE center positions per corner (NW/NE/SW/SE), relative to cube origin."""
+    return {
+        "NW": [(1.5, 1.5), (4.5, 1.5)],
+        "NE": [(cube_w - 4.5, 1.5), (cube_w - 1.5, 1.5)],
+        "SW": [(1.5, cube_h - 1.5), (4.5, cube_h - 1.5)],
+        "SE": [(cube_w - 4.5, cube_h - 1.5), (cube_w - 1.5, cube_h - 1.5)],
+    }
+
+
+# ── Instantiation: system ───────────────────────────────────────────
+
+
+def _instantiate_system(nodes: dict[str, Node], system: dict) -> None:
+    """Add the system-level fabric switch node to *nodes* under the fixed id "fabric.switch0"."""
+    sw = system["components"]["switch"]
+    sw_id = "fabric.switch0"
+    nodes[sw_id] = Node(
+        id=sw_id, kind=sw["kind"], impl=sw["impl"],
+        attrs=sw.get("attrs", {}), pos_mm=None, label="Switch",  # attrs optional here; no position assigned
+    )
+
+
+# ── Instantiation: IO chiplets ──────────────────────────────────────
+
+
+def _instantiate_io_chiplets(
+    nodes: dict[str, Node],
+    edges: list[Edge],
+    sp: str,
+    sip_spec: dict,
+    cube_w: float,
+    cube_h: float,
+    mesh_w: int,
+    mesh_h: int,
+    seam: float,
+) -> None:
+    """Add each IO chiplet's pcie_ep and io_cpu nodes under prefix *sp*, plus the pcie_ep → io_cpu edge joining them."""
+    io_spec = sip_spec["iochiplet"]
+    comp = io_spec["components"]
+    links = io_spec["links"]
+    mesh_total_w = mesh_w * cube_w + (mesh_w - 1) * seam  # cubes plus the seams between them
+    mesh_total_h = mesh_h * cube_h + (mesh_h - 1) * seam
+
+    for inst in io_spec["instances"]:
+        iid = inst["id"]
+        prefix = f"{sp}.{iid}"
+        side = inst["place"]["side"]
+        cx = mesh_total_w / 2  # horizontally centered on the cube mesh
+        if side == "N":
+            pcie_y, cpu_y = -5.0, -3.0  # above the mesh (negative y), pcie_ep furthest out
+        else:
+            pcie_y, cpu_y = mesh_total_h + 5.0, mesh_total_h + 3.0  # any other side value: below the mesh
+
+        # pcie_ep
+        ep = comp["pcie_ep"]
+        ep_id = f"{prefix}.pcie_ep"
+        nodes[ep_id] = Node(
+            id=ep_id, kind=ep["kind"], impl=ep["impl"],
+            attrs=ep["attrs"], pos_mm=(cx, pcie_y), label="PCIe EP",
+        )
+
+        # io_cpu
+        cpu = comp["io_cpu"]
+        cpu_id = f"{prefix}.io_cpu"
+        nodes[cpu_id] = Node(
+            id=cpu_id, kind=cpu["kind"], impl=cpu["impl"],
+            attrs=cpu["attrs"], pos_mm=(cx, cpu_y), label="IO CPU",
+        )
+
+        # Internal edge (only the pcie_ep → io_cpu direction is added here)
+        edges.append(Edge(
+            src=ep_id, dst=cpu_id,
+            distance_mm=links["pcie_ep_to_io_cpu_mm"],
+            bw_gbs=links["pcie_ep_to_io_cpu_bw_gbs"],
+            kind="io_internal",
+        ))
+
+
+# ── Instantiation: cube + PEs ───────────────────────────────────────
+
+
+def _instantiate_cube(
+    nodes: dict[str, Node],
+    edges: list[Edge],
+    cp: str,
+    cube: dict,
+    origin: tuple[float, float],
+) -> None:
+    """Add one cube's nodes and edges (including its PEs) under id prefix *cp*, offsetting local positions by *origin*."""
+    cube_w = cube["geometry"]["cube_mm"]["w"]
+    cube_h = cube["geometry"]["cube_mm"]["h"]
+    ox, oy = origin
+    local_pos = _cube_local_positions(cube_w, cube_h)
+    clinks = cube["links"]
+    n_slices = cube["memory_map"]["hbm_slices_per_cube"]
+
+    # ── UCIe ports ──
+    ucie_ns = cube["ucie"]["overhead_ns"]
+    for port in cube["ucie"]["ports"]:
+        pid = f"{cp}.ucie-{port}"
+        lx, ly = local_pos[f"ucie-{port}"]
+        nodes[pid] = Node(
+            id=pid, kind="ucie_port", impl="ucie_v1",
+            attrs={"overhead_ns": ucie_ns}, pos_mm=(ox + lx, oy + ly),
+            label=f"UCIe-{port}",
+        )
+
+    # ── Named components: noc, m_cpu, sram ──
+    for name in ("noc", "m_cpu", "sram"):
+        c = cube["components"][name]
+        nid = f"{cp}.{name}"
+        lx, ly = local_pos[name]
+        nodes[nid] = Node(
+            id=nid, kind=c["kind"], impl=c["impl"],
+            attrs=c["attrs"], pos_mm=(ox + lx, oy + ly),
+            label=name.upper().replace("_", " "),
+        )
+
+    # ── HBM controller slices (count from memory_map; paired 1:1 with xbar.pe{i} below) ──
+    hbm_spec = cube["components"]["hbm_ctrl"]
+    hbm_lx, hbm_ly = local_pos["hbm_ctrl"]
+    for sl in range(n_slices):
+        sid = f"{cp}.hbm_ctrl.slice{sl}"
+        nodes[sid] = Node(
+            id=sid, kind=hbm_spec["kind"], impl=hbm_spec["impl"],
+            attrs=hbm_spec["attrs"], pos_mm=(ox + hbm_lx, oy + hbm_ly),  # every slice shares this position
+            label=f"HBM SLICE{sl}",
+        )
+
+    # ── Bridges ──
+    for br in cube["components"]["xbar"]["bridges"]:
+        bname = br["id"]
+        nid = f"{cp}.bridge.{bname}"
+        lx, ly = local_pos[f"bridge.{bname}"]
+        nodes[nid] = Node(
+            id=nid, kind=br["kind"], impl=br["impl"],
+            attrs=br["attrs"], pos_mm=(ox + lx, oy + ly),
+            label=f"Bridge {bname.upper()}",
+        )
+
+    # ── PE instances + per-PE xbar entry nodes ──
+    corners = cube["pe_layout"]["corners"]
+    pe_per_corner = cube["pe_layout"]["pe_per_corner"]
+    corner_pos = _corner_pe_positions(cube_w, cube_h)
+    pe_tmpl = cube["pe_template"]
+    pe_links = pe_tmpl["links"]
+
+    xbar_pe_spec = cube["components"]["xbar"]["pe"]
+    xbar_top_y = local_pos["xbar.top"][1]
+    xbar_bot_y = local_pos["xbar.bottom"][1]
+
+    pe_idx = 0  # running PE index across corners, in the order corners are listed
+    for corner in corners:
+        is_top = corner in ("NW", "NE")
+        xbar_y = xbar_top_y if is_top else xbar_bot_y
+        mm_key = "pe_to_xbar_row_n_mm" if is_top else "pe_to_xbar_row_s_mm"
+        for ci in range(pe_per_corner):
+            pp = f"{cp}.pe{pe_idx}"
+            pe_cx, pe_cy = corner_pos[corner][ci]  # only two positions per corner are defined
+
+            # Per-PE xbar entry node (same x as its PE, y taken from the xbar row reference)
+            xbar_nid = f"{cp}.xbar.pe{pe_idx}"
+            nodes[xbar_nid] = Node(
+                id=xbar_nid, kind=xbar_pe_spec["kind"], impl=xbar_pe_spec["impl"],
+                attrs=xbar_pe_spec["attrs"], pos_mm=(ox + pe_cx, oy + xbar_y),
+                label=f"XBAR PE{pe_idx}",
+            )
+
+            # PE template components
+            for comp_name, comp_spec in pe_tmpl["components"].items():
+                cid = f"{pp}.{comp_name}"
+                dx, dy = _PE_COMP_OFFSETS.get(comp_name, (0.0, 0.0))  # unlisted components sit at the PE center
+                nodes[cid] = Node(
+                    id=cid, kind=comp_spec["kind"], impl=comp_spec["impl"],
+                    attrs=comp_spec["attrs"],  # same dict object for every PE instance of this component
+                    pos_mm=(ox + pe_cx + dx, oy + pe_cy + dy),
+                    label=comp_name.upper().replace("_", " "),
+                )
+
+            # PE-internal edges
+            _add_pe_internal_edges(edges, pp, pe_links)
+
+            # PE_DMA → xbar.pe_i (HBM data path)
+            edges.append(Edge(
+                src=f"{pp}.pe_dma", dst=xbar_nid,
+                distance_mm=clinks[mm_key],
+                bw_gbs=clinks["pe_to_xbar_bw_gbs"],
+                kind="pe_to_xbar",
+            ))
+
+            # PE_DMA → noc (non-HBM data path: SRAM, inter-cube, etc.)
+            edges.append(Edge(
+                src=f"{pp}.pe_dma", dst=f"{cp}.noc",
+                distance_mm=clinks["pe_dma_to_noc_mm"],
+                bw_gbs=clinks["pe_dma_to_noc_bw_gbs"],
+                kind="pe_to_noc",
+            ))
+
+            # noc → PE_CPU (command delivery)
+            edges.append(Edge(
+                src=f"{cp}.noc", dst=f"{pp}.pe_cpu",
+                distance_mm=clinks["noc_to_pe_cpu_mm"],
+                kind="command",
+            ))
+
+            pe_idx += 1
+
+    # ── Cube fabric edges ──
+    # NOTE(review): the loops below index xbar.pe{i} by n_slices, so they assume n_slices == PE count — confirm
+    # xbar.pe_i ↔ hbm_ctrl.slice_i (local Y-path, bidirectional for response)
+    for i in range(n_slices):
+        edges.append(Edge(
+            src=f"{cp}.xbar.pe{i}", dst=f"{cp}.hbm_ctrl.slice{i}",
+            distance_mm=clinks["xbar_to_hbm_mm"],
+            bw_gbs=clinks["xbar_to_hbm_bw_gbs"],
+            kind="xbar_to_hbm",
+        ))
+        edges.append(Edge(
+            src=f"{cp}.hbm_ctrl.slice{i}", dst=f"{cp}.xbar.pe{i}",
+            distance_mm=clinks["xbar_to_hbm_mm"],
+            bw_gbs=clinks["xbar_to_hbm_bw_gbs"],
+            kind="hbm_to_xbar",
+        ))
+
+    # xbar chain per half: pe0↔pe1↔pe2↔pe3 (top), pe4↔pe5↔pe6↔pe7 (bottom) when n_slices == 8
+    half = n_slices // 2
+    for half_start in (0, half):
+        for i in range(half_start, half_start + half - 1):
+            intra = ((i - half_start) % pe_per_corner) != (pe_per_corner - 1)  # False: link i→i+1 crosses corners
+            x_dist = clinks["xbar_chain_intra_corner_mm"] if intra else clinks["xbar_chain_inter_corner_mm"]
+            for a, b in [(i, i + 1), (i + 1, i)]:
+                edges.append(Edge(
+                    src=f"{cp}.xbar.pe{a}", dst=f"{cp}.xbar.pe{b}",
+                    distance_mm=x_dist,
+                    bw_gbs=clinks["xbar_x_bw_gbs"],
+                    kind="xbar_chain",
+                ))
+
+    # bridge connections: pe0↔bridge.left↔pe4, pe3↔bridge.right↔pe7 (indices shown for n_slices == 8)
+    for bname, pe_top, pe_bot in [("left", 0, half), ("right", half - 1, n_slices - 1)]:
+        br_node = f"{cp}.bridge.{bname}"
+        for pe_i, br_mm_key in [(pe_top, "xbar_row_n_to_bridge_mm"),
+                                 (pe_bot, "xbar_row_s_to_bridge_mm")]:
+            xbar_node = f"{cp}.xbar.pe{pe_i}"
+            edges.append(Edge(
+                src=xbar_node, dst=br_node,
+                distance_mm=clinks[br_mm_key],
+                bw_gbs=clinks["xbar_to_bridge_bw_gbs"],
+                kind="xbar_to_bridge",
+            ))
+            edges.append(Edge(
+                src=br_node, dst=xbar_node,
+                distance_mm=clinks[br_mm_key],
+                bw_gbs=clinks["xbar_to_bridge_bw_gbs"],
+                kind="bridge_to_xbar",
+            ))
+
+    # ucie ↔ noc (UCIe-NOC boundary; bandwidth and connection count are read from links.noc_to_ucie)
+    _noc_ucie = clinks["noc_to_ucie"]
+    for port in cube["ucie"]["ports"]:
+        edges.append(Edge(
+            src=f"{cp}.ucie-{port}", dst=f"{cp}.noc",
+            distance_mm=0.0,
+            bw_gbs=_noc_ucie["per_connection_bw_gbs"],
+            n_connections=_noc_ucie["n_connections"],
+            kind="ucie_to_noc",
+        ))
+
+    for port in cube["ucie"]["ports"]:
+        edges.append(Edge(
+            src=f"{cp}.noc", dst=f"{cp}.ucie-{port}",
+            distance_mm=0.0,
+            bw_gbs=_noc_ucie["per_connection_bw_gbs"],
+            n_connections=_noc_ucie["n_connections"],
+            kind="noc_to_ucie",
+        ))
+
+    # noc ↔ xbar.pe{i}: wire delay is 0 (NOC traversal latency computed by TwoDMeshNocComponent);
+    # routing_weight_mm=50.0 steers PE DMA Dijkstra away from this path (prefer direct pe_dma→xbar)
+    _noc_xbar = clinks.get("noc_to_xbar", {})  # optional section; bandwidth stays None when it is absent
+    _noc_xbar_bw = _noc_xbar.get("per_connection_bw_gbs")
+    for i in range(n_slices):
+        edges.append(Edge(
+            src=f"{cp}.noc", dst=f"{cp}.xbar.pe{i}",
+            distance_mm=0.0,
+            bw_gbs=_noc_xbar_bw,
+            routing_weight_mm=50.0,
+            kind="noc_to_xbar",
+        ))
+        edges.append(Edge(
+            src=f"{cp}.xbar.pe{i}", dst=f"{cp}.noc",
+            distance_mm=0.0,
+            bw_gbs=_noc_xbar_bw,
+            routing_weight_mm=50.0,
+            kind="xbar_to_noc",
+        ))
+
+    # m_cpu ↔ noc (command dispatch, both directions)
+    edges.append(Edge(
+        src=f"{cp}.m_cpu", dst=f"{cp}.noc",
+        distance_mm=clinks["m_cpu_to_noc_mm"],
+        kind="command",
+    ))
+    edges.append(Edge(
+        src=f"{cp}.noc", dst=f"{cp}.m_cpu",
+        distance_mm=clinks["m_cpu_to_noc_mm"],
+        kind="command",
+    ))
+
+    # noc ↔ sram (shared SRAM access; bandwidth and connection count are read from links.noc_to_sram)
+    _noc_sram = clinks["noc_to_sram"]
+    edges.append(Edge(
+        src=f"{cp}.noc", dst=f"{cp}.sram",
+        distance_mm=clinks["noc_to_sram_mm"],
+        bw_gbs=_noc_sram["per_connection_bw_gbs"],
+        n_connections=_noc_sram["n_connections"],
+        kind="noc_to_sram",
+    ))
+    edges.append(Edge(
+        src=f"{cp}.sram", dst=f"{cp}.noc",
+        distance_mm=clinks["noc_to_sram_mm"],
+        bw_gbs=_noc_sram["per_connection_bw_gbs"],
+        n_connections=_noc_sram["n_connections"],
+        kind="noc_to_sram",  # NOTE(review): reverse edge reuses the forward kind — confirm intended
+    ))
+
+
+def _add_pe_internal_edges(edges: list[Edge], pp: str, pe_links: dict) -> None:
+    """Append the seven one-way edges inside the PE with id prefix *pp*: cpu→scheduler→engines→tcm."""
+    edges.append(Edge(
+        src=f"{pp}.pe_cpu", dst=f"{pp}.pe_scheduler",
+        distance_mm=pe_links["pe_cpu_to_scheduler_mm"],
+        kind="pe_internal",
+    ))
+    for eng, key in [("pe_dma", "scheduler_to_dma_mm"),  # scheduler → each engine (no bandwidth set)
+                     ("pe_gemm", "scheduler_to_gemm_mm"),
+                     ("pe_math", "scheduler_to_math_mm")]:
+        edges.append(Edge(
+            src=f"{pp}.pe_scheduler", dst=f"{pp}.{eng}",
+            distance_mm=pe_links[key],
+            kind="pe_internal",
+        ))
+    for eng, mm_key, bw_key in [("pe_dma", "dma_to_tcm_mm", "dma_to_tcm_bw_gbs"),  # each engine → tcm
+                                 ("pe_gemm", "gemm_to_tcm_mm", "gemm_to_tcm_bw_gbs"),
+                                 ("pe_math", "math_to_tcm_mm", "math_to_tcm_bw_gbs")]:
+        edges.append(Edge(
+            src=f"{pp}.{eng}", dst=f"{pp}.pe_tcm",
+            distance_mm=pe_links[mm_key],
+            bw_gbs=pe_links[bw_key],
+            kind="pe_internal",
+        ))
+
+
+# ── Inter-cube / IO / system edges ──────────────────────────────────
+
+
+def _add_inter_cube_edges(
+    edges: list[Edge], sp: str, mesh_w: int, mesh_h: int, sip_spec: dict,
+) -> None:
+    """Append a pair of opposite-direction UCIe edges for every pair of adjacent cubes in SIP *sp*."""
+    mesh = sip_spec["links"]["inter_cube_mesh"]
+    bw = mesh["bw_gbs_per_ucie_phy"]
+    dist = mesh["distance_mm_across_seam"]
+    for row in range(mesh_h):
+        for col in range(mesh_w):
+            cid = row * mesh_w + col  # row-major cube id
+            if col + 1 < mesh_w:  # east neighbour exists: link E port to its W port
+                nid = row * mesh_w + (col + 1)
+                edges.append(Edge(
+                    src=f"{sp}.cube{cid}.ucie-E", dst=f"{sp}.cube{nid}.ucie-W",
+                    distance_mm=dist, bw_gbs=bw, kind="ucie_mesh",
+                ))
+                edges.append(Edge(
+                    src=f"{sp}.cube{nid}.ucie-W", dst=f"{sp}.cube{cid}.ucie-E",
+                    distance_mm=dist, bw_gbs=bw, kind="ucie_mesh",
+                ))
+            if row + 1 < mesh_h:  # south neighbour exists: link S port to its N port
+                nid = (row + 1) * mesh_w + col
+                edges.append(Edge(
+                    src=f"{sp}.cube{cid}.ucie-S", dst=f"{sp}.cube{nid}.ucie-N",
+                    distance_mm=dist, bw_gbs=bw, kind="ucie_mesh",
+                ))
+                edges.append(Edge(
+                    src=f"{sp}.cube{nid}.ucie-N", dst=f"{sp}.cube{cid}.ucie-S",
+                    distance_mm=dist, bw_gbs=bw, kind="ucie_mesh",
+                ))
+
+
+def _add_io_to_cube_edges(
+    edges: list[Edge], sp: str, sip_spec: dict, mesh_w: int,
+) -> None:
+    """Append io_cpu ↔ cube UCIe edges (one per direction) for every cube port of every IO chiplet in *sp*."""
+    io_links = sip_spec["iochiplet"]["links"]
+    io_to_ucie_mm = io_links["io_cpu_to_ucie_mm"]
+    io_to_ucie_bw = io_links["io_cpu_to_ucie_bw_gbs"]
+    for inst in sip_spec["iochiplet"]["instances"]:
+        iid = inst["id"]
+        io_cpu_id = f"{sp}.{iid}.io_cpu"
+        for port in inst["cube_ports"]:
+            cube_col, cube_row = port["cube"]["xy"]  # xy is (column, row)
+            cube_id = cube_row * mesh_w + cube_col  # row-major, matching the ids used in _compile_graph
+            cube_side = port["cube_side"]
+            ucie_id = f"{sp}.cube{cube_id}.ucie-{cube_side}"
+            edges.append(Edge(
+                src=io_cpu_id, dst=ucie_id,
+                distance_mm=io_to_ucie_mm + port["distance_mm"],  # shared base distance + per-port distance
+                bw_gbs=io_to_ucie_bw,
+                kind="io_to_cube",
+            ))
+            edges.append(Edge(
+                src=ucie_id, dst=io_cpu_id,
+                distance_mm=io_to_ucie_mm + port["distance_mm"],
+                bw_gbs=io_to_ucie_bw,
+                kind="cube_to_io",
+            ))
+
+
+def _add_system_to_io_edges(
+    edges: list[Edge], sp: str, sip_spec: dict, system: dict,
+) -> None:
+    """Append one fabric switch → pcie_ep edge per IO chiplet of SIP *sp* (no reverse edge is added here)."""
+    sw_id = "fabric.switch0"  # must match the id used in _instantiate_system
+    sys_link = system["links"]["io_ep_to_switch"]
+    for inst in sip_spec["iochiplet"]["instances"]:
+        pcie_ep_id = f"{sp}.{inst['id']}.pcie_ep"
+        edges.append(Edge(
+            src=sw_id, dst=pcie_ep_id,
+            distance_mm=sys_link["distance_mm"],
+            bw_gbs=sys_link["bw_gbs_per_ep"],
+            kind="pcie",
+        ))
+
+
+# ── View builders ────────────────────────────────────────────────────
+
+
+def _build_system_view(spec: dict) -> ViewGraph:
+    """Build the system-level view: one block per SIP, its IO chiplets, and the fabric switch."""
+    system = spec["system"]
+    sip_count = system["sips"]["count"]
+    sip_w, sip_h = 71.0, 59.0  # fixed block size for a SIP in this view (not derived from the spec)
+    gap = 30.0  # horizontal spacing between SIP blocks
+    canvas_w = sip_count * sip_w + (sip_count - 1) * gap
+    canvas_h = sip_h + 20.0  # SIP block height plus the 20.0 offset used for sy below
+
+    nodes: dict[str, Node] = {}
+    view_edges: list[Edge] = []
+
+    sw = system["components"]["switch"]
+    sw_id = "fabric.switch0"
+    nodes[sw_id] = Node(
+        id=sw_id, kind=sw["kind"], impl=sw["impl"],
+        attrs=sw.get("attrs", {}), pos_mm=(canvas_w / 2, 5.0), label="Fabric Switch",
+    )
+
+    for s in range(sip_count):
+        sx = s * (sip_w + gap)  # left edge of this SIP block
+        sy = 20.0  # top edge of every SIP block
+        sip_id = f"sip{s}"
+
+        nodes[sip_id] = Node(
+            id=sip_id, kind="sip", impl="",
+            attrs={"w_mm": sip_w, "h_mm": sip_h},
+            pos_mm=(sx + sip_w / 2, sy + sip_h / 2),  # block center
+            label=f"SIP {s}",
+        )
+
+        for inst in spec["sip"]["iochiplet"]["instances"]:
+            iid = inst["id"]
+            io_nid = f"{sip_id}.{iid}"
+            side = inst["place"]["side"]
+            iy = sy if side == "N" else sy + sip_h  # top edge for "N", bottom edge for anything else
+            nodes[io_nid] = Node(
+                id=io_nid, kind="iochiplet", impl="",
+                attrs={}, pos_mm=(sx + sip_w / 2, iy), label=f"IO {iid}",
+            )
+            view_edges.append(Edge(
+                src=sw_id, dst=io_nid,
+                distance_mm=system["links"]["io_ep_to_switch"]["distance_mm"],
+                bw_gbs=system["links"]["io_ep_to_switch"]["bw_gbs_per_ep"],
+                kind="pcie",
+            ))
+
+    return ViewGraph(
+        name="system", nodes=nodes, edges=view_edges,
+        width_mm=canvas_w, height_mm=canvas_h,
+    )
+
+
+def _build_sip_view(spec: dict) -> ViewGraph:
+    """Build the SIP-level view: cubes as opaque blocks in a mesh plus the IO chiplets (ids carry no sip prefix)."""
+    sip_spec = spec["sip"]
+    cube_spec = spec["cube"]
+    mesh_w = sip_spec["cube_mesh"]["w"]
+    mesh_h = sip_spec["cube_mesh"]["h"]
+    cube_w = cube_spec["geometry"]["cube_mm"]["w"]
+    cube_h = cube_spec["geometry"]["cube_mm"]["h"]
+    seam = sip_spec["links"]["inter_cube_mesh"]["distance_mm_across_seam"]
+    stride_x = cube_w + seam
+    stride_y = cube_h + seam
+    mesh_total_w = mesh_w * cube_w + (mesh_w - 1) * seam
+    mesh_total_h = mesh_h * cube_h + (mesh_h - 1) * seam
+    io_margin = 6.0  # vertical band reserved above and below the mesh for IO chiplets
+    canvas_w = mesh_total_w
+    canvas_h = mesh_total_h + 2 * io_margin
+
+    nodes: dict[str, Node] = {}
+    view_edges: list[Edge] = []
+
+    # Cubes as opaque blocks (positioned at their centers, shifted down by io_margin)
+    for row in range(mesh_h):
+        for col in range(mesh_w):
+            cid = row * mesh_w + col
+            cx = col * stride_x + cube_w / 2
+            cy = io_margin + row * stride_y + cube_h / 2
+            nid = f"cube{cid}"
+            nodes[nid] = Node(
+                id=nid, kind="cube", impl="",
+                attrs={"w_mm": cube_w, "h_mm": cube_h, "col": col, "row": row},
+                pos_mm=(cx, cy), label=f"CUBE ({col},{row})",
+            )
+
+    # Inter-cube mesh edges (one edge per adjacent pair, unlike the flat graph's two)
+    mesh_link = sip_spec["links"]["inter_cube_mesh"]
+    for row in range(mesh_h):
+        for col in range(mesh_w):
+            cid = row * mesh_w + col
+            if col + 1 < mesh_w:
+                nid = row * mesh_w + (col + 1)
+                view_edges.append(Edge(
+                    src=f"cube{cid}", dst=f"cube{nid}",
+                    distance_mm=mesh_link["distance_mm_across_seam"],
+                    bw_gbs=mesh_link["bw_gbs_per_ucie_phy"],
+                    kind="ucie_mesh",
+                ))
+            if row + 1 < mesh_h:
+                nid = (row + 1) * mesh_w + col
+                view_edges.append(Edge(
+                    src=f"cube{cid}", dst=f"cube{nid}",
+                    distance_mm=mesh_link["distance_mm_across_seam"],
+                    bw_gbs=mesh_link["bw_gbs_per_ucie_phy"],
+                    kind="ucie_mesh",
+                ))
+
+    # IO chiplets (one node each, with one edge per attached cube port)
+    io_links = sip_spec["iochiplet"]["links"]
+    for inst in sip_spec["iochiplet"]["instances"]:
+        iid = inst["id"]
+        side = inst["place"]["side"]
+        iy = 2.0 if side == "N" else canvas_h - 2.0  # inside the top margin for "N", bottom margin otherwise
+        nodes[iid] = Node(
+            id=iid, kind="iochiplet", impl="",
+            attrs={}, pos_mm=(mesh_total_w / 2, iy), label=f"IO {iid}",
+        )
+        for port in inst["cube_ports"]:
+            cube_col, cube_row = port["cube"]["xy"]
+            cube_id = cube_row * mesh_w + cube_col
+            view_edges.append(Edge(
+                src=iid, dst=f"cube{cube_id}",
+                distance_mm=io_links["io_cpu_to_ucie_mm"] + port["distance_mm"],
+                bw_gbs=io_links["io_cpu_to_ucie_bw_gbs"],
+                kind="io_to_cube",
+            ))
+
+    return ViewGraph(
+        name="sip", nodes=nodes, edges=view_edges,
+        width_mm=canvas_w, height_mm=canvas_h,
+    )
+
+
+def _build_cube_view(spec: dict) -> ViewGraph:
+    """Build the cube-level view of a single cube at origin (0,0): PEs as opaque blocks, hbm_ctrl as one node."""
+    cube = spec["cube"]
+    cube_w = cube["geometry"]["cube_mm"]["w"]
+    cube_h = cube["geometry"]["cube_mm"]["h"]
+    local_pos = _cube_local_positions(cube_w, cube_h)
+    clinks = cube["links"]
+    n_slices = cube["memory_map"]["hbm_slices_per_cube"]
+
+    nodes: dict[str, Node] = {}
+    view_edges: list[Edge] = []
+
+    # UCIe ports
+    for port in cube["ucie"]["ports"]:
+        pid = f"ucie-{port}"
+        lx, ly = local_pos[pid]
+        nodes[pid] = Node(
+            id=pid, kind="ucie_port", impl="ucie_v1",
+            attrs={}, pos_mm=(lx, ly), label=f"UCIe-{port}",
+        )
+
+    # Named components (hbm_ctrl as single representative node in view)
+    for name in ("noc", "m_cpu", "hbm_ctrl", "sram"):
+        c = cube["components"][name]
+        lx, ly = local_pos[name]
+        nodes[name] = Node(
+            id=name, kind=c["kind"], impl=c["impl"],
+            attrs=c["attrs"], pos_mm=(lx, ly),
+            label=name.upper().replace("_", " "),
+        )
+
+    # Bridges
+    for br in cube["components"]["xbar"]["bridges"]:
+        bname = br["id"]
+        bid = f"bridge.{bname}"
+        lx, ly = local_pos[bid]
+        nodes[bid] = Node(
+            id=bid, kind=br["kind"], impl=br["impl"],
+            attrs=br["attrs"], pos_mm=(lx, ly),
+            label=f"Bridge {bname.upper()}",
+        )
+
+    # PEs as opaque blocks + per-PE xbar entry nodes
+    corners = cube["pe_layout"]["corners"]
+    pe_per_corner = cube["pe_layout"]["pe_per_corner"]
+    corner_pos = _corner_pe_positions(cube_w, cube_h)
+    xbar_pe_spec = cube["components"]["xbar"]["pe"]
+    xbar_top_y = local_pos["xbar.top"][1]
+    xbar_bot_y = local_pos["xbar.bottom"][1]
+
+    pe_idx = 0  # running PE index across corners, in the order corners are listed
+    for corner in corners:
+        is_top = corner in ("NW", "NE")
+        xbar_y = xbar_top_y if is_top else xbar_bot_y
+        mm_key = "pe_to_xbar_row_n_mm" if is_top else "pe_to_xbar_row_s_mm"
+        for ci in range(pe_per_corner):
+            pid = f"pe{pe_idx}"
+            xbar_id = f"xbar.pe{pe_idx}"
+            px, py = corner_pos[corner][ci]  # only two positions per corner are defined
+
+            nodes[pid] = Node(
+                id=pid, kind="pe", impl="",
+                attrs={"corner": corner}, pos_mm=(px, py),
+                label=f"PE{pe_idx}",
+            )
+            nodes[xbar_id] = Node(
+                id=xbar_id, kind=xbar_pe_spec["kind"], impl=xbar_pe_spec["impl"],
+                attrs=xbar_pe_spec["attrs"], pos_mm=(px, xbar_y),
+                label=f"XBAR PE{pe_idx}",
+            )
+
+            # PE → xbar.pe_i (HBM data path)
+            view_edges.append(Edge(
+                src=pid, dst=xbar_id,
+                distance_mm=clinks[mm_key],
+                bw_gbs=clinks["pe_to_xbar_bw_gbs"],
+                kind="pe_to_xbar",
+            ))
+            # PE → noc (non-HBM data path)
+            view_edges.append(Edge(
+                src=pid, dst="noc",
+                distance_mm=clinks["pe_dma_to_noc_mm"],
+                bw_gbs=clinks["pe_dma_to_noc_bw_gbs"],
+                kind="pe_to_noc",
+            ))
+            # noc → PE (command delivery)
+            view_edges.append(Edge(
+                src="noc", dst=pid,
+                distance_mm=clinks["noc_to_pe_cpu_mm"],
+                kind="command",
+            ))
+            pe_idx += 1
+
+    # Cube fabric edges (loops below index xbar.pe{i} by n_slices — assumes it equals the PE count; confirm)
+    # xbar.pe_i → hbm_ctrl (single representative node in view)
+    for i in range(n_slices):
+        view_edges.append(Edge(
+            src=f"xbar.pe{i}", dst="hbm_ctrl",
+            distance_mm=clinks["xbar_to_hbm_mm"],
+            bw_gbs=clinks["xbar_to_hbm_bw_gbs"],
+            kind="xbar_to_hbm",
+        ))
+
+    # xbar chain (neighbouring xbar entries within each half, both directions)
+    half = n_slices // 2
+    for half_start in (0, half):
+        for i in range(half_start, half_start + half - 1):
+            intra = ((i - half_start) % pe_per_corner) != (pe_per_corner - 1)  # False: link i→i+1 crosses corners
+            x_dist = clinks["xbar_chain_intra_corner_mm"] if intra else clinks["xbar_chain_inter_corner_mm"]
+            for a, b in [(i, i + 1), (i + 1, i)]:
+                view_edges.append(Edge(
+                    src=f"xbar.pe{a}", dst=f"xbar.pe{b}",
+                    distance_mm=x_dist,
+                    bw_gbs=clinks["xbar_x_bw_gbs"],
+                    kind="xbar_chain",
+                ))
+
+    # bridge connections (first/last xbar entry of each half to the left/right bridge, both directions)
+    for bname, pe_top, pe_bot in [("left", 0, half), ("right", half - 1, n_slices - 1)]:
+        br_id = f"bridge.{bname}"
+        for pe_i, br_mm_key in [(pe_top, "xbar_row_n_to_bridge_mm"),
+                                 (pe_bot, "xbar_row_s_to_bridge_mm")]:
+            xbar_id = f"xbar.pe{pe_i}"
+            view_edges.append(Edge(
+                src=xbar_id, dst=br_id,
+                distance_mm=clinks[br_mm_key],
+                bw_gbs=clinks["xbar_to_bridge_bw_gbs"],
+                kind="xbar_to_bridge",
+            ))
+            view_edges.append(Edge(
+                src=br_id, dst=xbar_id,
+                distance_mm=clinks[br_mm_key],
+                bw_gbs=clinks["xbar_to_bridge_bw_gbs"],
+                kind="bridge_to_xbar",
+            ))
+    # noc → ucie (only this direction is drawn in the view)
+    _noc_ucie_v = clinks["noc_to_ucie"]
+    for port in cube["ucie"]["ports"]:
+        view_edges.append(Edge(
+            src="noc", dst=f"ucie-{port}",
+            distance_mm=0.0,
+            bw_gbs=_noc_ucie_v["per_connection_bw_gbs"],
+            n_connections=_noc_ucie_v["n_connections"],
+            kind="noc_to_ucie",
+        ))
+
+    # m_cpu ↔ noc (command dispatch, both directions)
+    view_edges.append(Edge(
+        src="m_cpu", dst="noc",
+        distance_mm=clinks["m_cpu_to_noc_mm"],
+        kind="command",
+    ))
+    view_edges.append(Edge(
+        src="noc", dst="m_cpu",
+        distance_mm=clinks["m_cpu_to_noc_mm"],
+        kind="command",
+    ))
+
+    # noc ↔ sram (shared SRAM access, bidirectional)
+    _noc_sram_v = clinks["noc_to_sram"]
+    view_edges.append(Edge(
+        src="noc", dst="sram",
+        distance_mm=clinks["noc_to_sram_mm"],
+        bw_gbs=_noc_sram_v["per_connection_bw_gbs"],
+        n_connections=_noc_sram_v["n_connections"],
+        kind="noc_to_sram",
+    ))
+    view_edges.append(Edge(
+        src="sram", dst="noc",
+        distance_mm=clinks["noc_to_sram_mm"],
+        bw_gbs=_noc_sram_v["per_connection_bw_gbs"],
+        n_connections=_noc_sram_v["n_connections"],
+        kind="noc_to_sram",  # NOTE(review): reverse edge reuses the forward kind — confirm intended
+    ))
+
+    return ViewGraph(
+        name="cube", nodes=nodes, edges=view_edges,
+        width_mm=cube_w, height_mm=cube_h,
+    )
+
+
+def _build_pe_view(spec: dict) -> ViewGraph:
+    """Build the PE-level view: one PE with every template component and the PE-internal edges."""
+    pe_tmpl = spec["cube"]["pe_template"]
+    pe_links = pe_tmpl["links"]
+    canvas_w, canvas_h = 12.0, 8.0
+    # Fixed drawing positions on the canvas for the known template components.
+    positions = {
+        "pe_cpu": (1.5, 4.0),
+        "pe_scheduler": (4.0, 4.0),
+        "pe_dma": (7.0, 1.5),
+        "pe_gemm": (7.0, 4.0),
+        "pe_math": (7.0, 6.5),
+        "pe_tcm": (10.0, 4.0),
+    }
+
+    nodes: dict[str, Node] = {}
+    view_edges: list[Edge] = []
+    # Components missing from the table go to the canvas center instead of raising KeyError.
+    for comp_name, comp_spec in pe_tmpl["components"].items():
+        px, py = positions.get(comp_name, (canvas_w / 2, canvas_h / 2))
+        nodes[comp_name] = Node(
+            id=comp_name, kind=comp_spec["kind"], impl=comp_spec["impl"],
+            attrs=comp_spec["attrs"], pos_mm=(px, py),
+            label=comp_name.upper().replace("_", " "),
+        )
+
+    view_edges.append(Edge(
+        src="pe_cpu", dst="pe_scheduler",
+        distance_mm=pe_links["pe_cpu_to_scheduler_mm"],
+        kind="pe_internal",
+    ))
+    for eng, key in [("pe_dma", "scheduler_to_dma_mm"),  # scheduler → each engine (no bandwidth set)
+                     ("pe_gemm", "scheduler_to_gemm_mm"),
+                     ("pe_math", "scheduler_to_math_mm")]:
+        view_edges.append(Edge(
+            src="pe_scheduler", dst=eng,
+            distance_mm=pe_links[key],
+            kind="pe_internal",
+        ))
+    for eng, mm_key, bw_key in [("pe_dma", "dma_to_tcm_mm", "dma_to_tcm_bw_gbs"),  # each engine → tcm
+                                 ("pe_gemm", "gemm_to_tcm_mm", "gemm_to_tcm_bw_gbs"),
+                                 ("pe_math", "math_to_tcm_mm", "math_to_tcm_bw_gbs")]:
+        view_edges.append(Edge(
+            src=eng, dst="pe_tcm",
+            distance_mm=pe_links[mm_key],
+            bw_gbs=pe_links[bw_key],
+            kind="pe_internal",
+        ))
+
+    return ViewGraph(
+        name="pe", nodes=nodes, edges=view_edges,
+        width_mm=canvas_w, height_mm=canvas_h,
+    )
diff --git a/src/kernbench/topology/graph.py b/src/kernbench/topology/graph.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/kernbench/topology/projections/cube_view.py b/src/kernbench/topology/projections/cube_view.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/kernbench/topology/projections/pe_view.py b/src/kernbench/topology/projections/pe_view.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/kernbench/topology/projections/sip_view.py b/src/kernbench/topology/projections/sip_view.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/kernbench/topology/types.py b/src/kernbench/topology/types.py
new file mode 100644
index 0000000..6365338
--- /dev/null
+++ b/src/kernbench/topology/types.py
@@ -0,0 +1,56 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+
+@dataclass
+class Node:  # one component in a topology graph or in a view projection
+    id: str  # identifier that Edge.src / Edge.dst refer to
+    kind: str  # component category; the visualizer picks fill color and box size from it
+    impl: str  # implementation name (presumably what the component registry looks up — confirm)
+    attrs: dict[str, Any]  # free-form per-component attributes
+    pos_mm: tuple[float, float] | None  # (x_mm, y_mm); None for abstract nodes
+    label: str = ""  # display text; the visualizer falls back to id when this is empty
+
+
+@dataclass
+class Edge:  # link from node `src` to node `dst`
+    src: str                  # node id
+    dst: str                  # node id
+    distance_mm: float        # physical wire delay distance (ns = distance_mm * ns_per_mm)
+    routing_weight_mm: float | None = None  # Dijkstra cost; None → use distance_mm
+    bw_gbs: float | None = None  # bandwidth; the visualizer prints it as "<n>GB/s" when truthy
+    n_connections: int | None = None  # multi-connection links; single request uses 1 connection
+    kind: str = "link"  # link category; the visualizer picks color and routing style from it
+
+
+@dataclass
+class ViewGraph:  # one drawable projection of the topology
+    name: str                 # "system" | "sip" | "cube" | "pe"
+    nodes: dict[str, Node]  # looked up by edge.src / edge.dst, so keys are node ids
+    edges: list[Edge]  # the visualizer skips edges whose endpoints are not in `nodes`
+    width_mm: float  # canvas width in mm (scaled to px by the visualizer)
+    height_mm: float  # canvas height in mm (scaled to px by the visualizer)
+
+
+@dataclass
+class TopologyGraph:  # flat graph plus up to four view projections
+    spec: dict[str, Any]  # presumably the parsed topology description — confirm against the builder
+
+    # Full instantiated flat graph (used by sim_engine)
+    nodes: dict[str, Node] = field(default_factory=dict)
+    edges: list[Edge] = field(default_factory=list)
+
+    # Representative view projections (used by visualizer)
+    system_view: ViewGraph | None = None  # a view left as None is skipped by emit_diagrams
+    sip_view:    ViewGraph | None = None
+    cube_view:   ViewGraph | None = None
+    pe_view:     ViewGraph | None = None
+
+
+@dataclass(frozen=True)
+class TopologyHandle:  # immutable pairing of a path with an optional compiled graph
+    path: Path  # presumably the topology file the graph is built from — confirm
+    topology_obj: TopologyGraph | None  # None until _compile_graph is implemented
diff --git a/src/kernbench/topology/visualizer.py b/src/kernbench/topology/visualizer.py
new file mode 100644
index 0000000..075b081
--- /dev/null
+++ b/src/kernbench/topology/visualizer.py
@@ -0,0 +1,367 @@
+# kernbench/topology/visualizer.py
+"""
+SVG diagram generator for TopologyGraph views.
+
+Produces mm-accurate, deterministic SVG files for each view level
+(system, SIP, cube, PE) per ADR-0005 and ADR-0006.
+"""
+from __future__ import annotations
+
+from pathlib import Path
+
+from .types import Edge, Node, TopologyGraph, ViewGraph
+
+# ── Color palette by component kind ─────────────────────────────────
+
+_KIND_COLORS: dict[str, str] = {  # node kind -> fill color; _draw_node uses "#e2e8f0" for other kinds
+    "switch":       "#6366f1",  # indigo
+    "sip":          "#e0e7ff",  # light indigo
+    "iochiplet":    "#0ea5e9",  # sky blue
+    "pcie_ep":      "#0ea5e9",
+    "io_cpu":       "#0ea5e9",
+    "ucie_port":    "#3b82f6",  # blue
+    "noc":          "#a78bfa",  # purple
+    "m_cpu":        "#f59e0b",  # amber
+    "xbar":         "#f97316",  # orange
+    "hbm_ctrl":     "#10b981",  # emerald
+    "pe":           "#94a3b8",  # slate
+    "pe_cpu":       "#ef4444",  # red
+    "pe_scheduler": "#f59e0b",  # amber
+    "pe_dma":       "#3b82f6",  # blue
+    "pe_gemm":      "#8b5cf6",  # violet
+    "pe_math":      "#ec4899",  # pink
+    "pe_tcm":       "#10b981",  # emerald
+    "sram":         "#f59e0b",  # amber
+    "cube":         "#cbd5e1",  # slate-300
+}
+
+_EDGE_COLORS: dict[str, str] = {  # edge kind -> stroke color; _draw_edge uses "#94a3b8" for other kinds
+    "pcie":         "#6366f1",
+    "io_internal":  "#0ea5e9",
+    "io_to_cube":   "#0ea5e9",
+    "ucie_mesh":    "#3b82f6",
+    "pe_to_xbar":   "#f97316",
+    "xbar_to_hbm":  "#10b981",
+    "xbar_to_bridge": "#a78bfa",
+    "bridge_to_xbar": "#a78bfa",
+    "noc_to_ucie":  "#a78bfa",
+    "pe_to_noc":    "#a78bfa",
+    "noc_to_sram":  "#f59e0b",
+    "command":      "#f59e0b",
+    "pe_internal":  "#94a3b8",
+}
+
+# ── Node sizing ──────────────────────────────────────────────────────
+
+_DEFAULT_NODE_W = 2.0  # mm
+_DEFAULT_NODE_H = 1.2  # mm
+
+_KIND_SIZE: dict[str, tuple[float, float]] = {  # node kind -> (width_mm, height_mm); others use the defaults
+    "sip":  (60.0, 50.0),
+    "cube": (6.0, 4.0),
+    "iochiplet": (4.0, 1.5),
+    "switch": (5.0, 1.5),
+}
+
+
+# ── Public API ───────────────────────────────────────────────────────
+
+
+def emit_diagrams(graph: TopologyGraph, out_dir: Path) -> list[Path]:
+    """Write one SVG file per populated view of ``graph`` into ``out_dir``.
+
+    ``out_dir`` is created (with parents) when missing. Views that are
+    ``None`` are skipped. Each file is named ``<attribute name>.svg`` and
+    is written as UTF-8.
+
+    Returns:
+        Paths of the files written, in system, sip, cube, pe order.
+    """
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    written: list[Path] = []
+    for attr in ("system_view", "sip_view", "cube_view", "pe_view"):
+        projection = getattr(graph, attr)
+        if projection is not None:
+            target = out_dir / f"{attr}.svg"
+            target.write_text(_render_view_svg(projection), encoding="utf-8")
+            written.append(target)
+
+    return written
+
+
+# ── SVG rendering ────────────────────────────────────────────────────
+
+
+def _render_view_svg(view: ViewGraph) -> str:
+    """Render ``view`` as a complete SVG document and return it as a string.
+
+    Node positions are converted from mm to px with the per-view scale from
+    ``_pick_scale`` and shifted by a fixed 40 px margin. Elements are emitted
+    back to front: background, heading, cube decorations (cube view only),
+    edges, then nodes. Edges whose endpoints are not both in ``view.nodes``
+    are skipped. The view name is XML-escaped in the heading <text> element.
+    """
+    scale = _pick_scale(view)
+    pad = 40  # px margin on every side of the mm canvas
+    node_sizes = _compute_node_sizes(view, scale)
+    w_px = int(view.width_mm * scale + 2 * pad)
+    h_px = int(view.height_mm * scale + 2 * pad)
+    heading = _escape(view.name.upper())
+
+    parts: list[str] = [
+        _svg_header(w_px, h_px, view.name),
+        f'  <rect width="{w_px}" height="{h_px}" fill="#f8fafc"/>',
+        f'  <text x="{w_px // 2}" y="18" text-anchor="middle" '
+        f'font-family="monospace" font-size="14" font-weight="bold" fill="#1e293b">'
+        f'{heading} VIEW</text>',
+    ]
+
+    if view.name == "cube":
+        # Die outline and HBM area go underneath edges and nodes.
+        _draw_cube_boundary(parts, view, scale, pad)
+        _draw_hbm_block(parts, view, scale, pad)
+
+    # Edges first so node rectangles paint over the line ends. The counter is
+    # shared across edges so fan-out edges get distinct channel offsets.
+    fanout_counter: dict[str, int] = {}
+    for edge in view.edges:
+        if edge.src in view.nodes and edge.dst in view.nodes:
+            _draw_edge(parts, edge, view, node_sizes, scale, pad, fanout_counter)
+
+    for node in view.nodes.values():
+        _draw_node(parts, node, node_sizes, scale, pad)
+
+    parts.append("</svg>")
+    return "\n".join(parts)
+
+
+def _pick_scale(view: ViewGraph) -> float:
+    """Return the px-per-mm factor for ``view.name`` (10.0 for unknown names)."""
+    px_per_mm = {"system": 4.0, "sip": 8.0, "cube": 28.0, "pe": 35.0}
+    try:
+        return px_per_mm[view.name]
+    except KeyError:
+        # Unrecognized view names still render, at a fallback scale.
+        return 10.0
+
+
+def _compute_node_sizes(view: ViewGraph, scale: float) -> dict[str, tuple[float, float]]:
+    """Map every key of ``view.nodes`` to its box size ``(w_px, h_px)``."""
+    in_pe_view = view.name == "pe"  # all boxes share one size in the PE view
+    in_cube_view = view.name == "cube"  # PEs are drawn smaller in the cube view
+    sizes: dict[str, tuple[float, float]] = {}
+    for key, item in view.nodes.items():
+        if in_pe_view:
+            dims_mm = (2.5, 1.4)
+        elif in_cube_view and item.kind == "pe":
+            dims_mm = (1.8, 1.0)
+        else:
+            dims_mm = _KIND_SIZE.get(item.kind, (_DEFAULT_NODE_W, _DEFAULT_NODE_H))
+        sizes[key] = (dims_mm[0] * scale, dims_mm[1] * scale)
+    return sizes
+
+
+def _svg_header(w: int, h: int, title: str) -> str:
+    """Return the opening <svg> tag (w x h px) plus an XML-escaped <title> line."""
+    return (
+        f'<svg xmlns="http://www.w3.org/2000/svg" width="{w}" height="{h}" '
+        f'viewBox="0 0 {w} {h}">\n  <title>{_escape(title)}</title>'
+    )
+
+
+def _draw_cube_boundary(
+    parts: list[str], view: ViewGraph, scale: float, pad: int,
+) -> None:
+    """Append a dashed, unfilled <rect> outlining the view canvas to ``parts``.
+
+    The rectangle starts at the (pad, pad) corner and spans
+    ``view.width_mm`` x ``view.height_mm`` converted to px with ``scale``.
+    """
+    width_px = view.width_mm * scale
+    height_px = view.height_mm * scale
+    origin = f'x="{pad:.1f}" y="{pad:.1f}"'
+    extent = f'width="{width_px:.1f}" height="{height_px:.1f}"'
+    look = 'rx="6" fill="none" stroke="#475569" stroke-width="2" stroke-dasharray="8,4"'
+    parts.append(f"  <rect {origin} {extent} {look}/>")
+
+
+def _draw_hbm_block(
+    parts: list[str], view: ViewGraph, scale: float, pad: int,
+) -> None:
+    """Append the HBM area (<rect> plus "HBM" <text>) for the cube view.
+
+    The geometry is hard-coded in mm and does not read ``view``: the block
+    covers x=[4.0, 13.0], y=[4.5, 9.5], i.e. 9x5 mm centered at (8.5, 7.0).
+    """
+    def to_px(mm: float) -> float:
+        return mm * scale + pad
+
+    box = f'x="{to_px(4.0):.1f}" y="{to_px(4.5):.1f}" width="{9.0 * scale:.1f}" height="{5.0 * scale:.1f}"'
+    parts.append(
+        f'  <rect {box} rx="4" fill="#d1fae5" stroke="#10b981" stroke-width="1.5" '
+        f'stroke-dasharray="6,3" opacity="0.5"/>'
+    )
+    # NOTE(review): the label sits at y=8.5 mm, below the block center
+    # (7.0 mm) — confirm this offset is intentional.
+    parts.append(
+        f'  <text x="{to_px(8.5):.1f}" y="{to_px(8.5):.1f}" text-anchor="middle" '
+        f'font-family="monospace" font-size="11" fill="#047857" opacity="0.7">HBM</text>'
+    )
+
+
+def _draw_node(
+    parts: list[str],
+    node: Node,
+    sizes: dict[str, tuple[float, float]],
+    scale: float,
+    pad: int,
+) -> None:
+    """Append a rounded <rect> and a centered <text> label for ``node``.
+
+    Nodes without a position are not drawn. ``node.pos_mm`` is the box
+    center; the box size is ``sizes[node.id]`` (40x24 px when missing).
+    The label is ``node.label``, or ``node.id`` when the label is empty.
+    """
+    if node.pos_mm is None:
+        return
+    center_x = node.pos_mm[0] * scale + pad
+    center_y = node.pos_mm[1] * scale + pad
+    box_w, box_h = sizes.get(node.id, (40, 24))
+    fill = _KIND_COLORS.get(node.kind, "#e2e8f0")
+    ink = "#ffffff" if _is_dark(fill) else "#1e293b"  # keep the label readable
+    caption = node.label or node.id
+    parts.append(
+        f'  <rect x="{center_x - box_w / 2:.1f}" y="{center_y - box_h / 2:.1f}" '
+        f'width="{box_w:.1f}" height="{box_h:.1f}" '
+        f'rx="4" fill="{fill}" stroke="#475569" stroke-width="1"/>'
+    )
+    parts.append(
+        f'  <text x="{center_x:.1f}" y="{center_y + 4:.1f}" text-anchor="middle" '
+        f'font-family="monospace" font-size="{_label_font_size(box_w, caption)}" '
+        f'fill="{ink}">{_escape(caption)}</text>'
+    )
+
+
+# ── Fan-out edge kinds that need offset routing ─────────────────────
+
+_FANOUT_KINDS = {"pe_to_xbar", "pe_to_noc", "command", "noc_to_ucie"}  # only applied in the cube view
+
+
+def _draw_edge(
+    parts: list[str],
+    edge: Edge,
+    view: ViewGraph,
+    sizes: dict[str, tuple[float, float]],
+    scale: float,
+    pad: int,
+    fanout_counter: dict[str, int],
+) -> None:
+    """Append the SVG for one edge (a path plus an optional label) to ``parts``.
+
+    Endpoints are the centers of ``view.nodes[edge.src]`` and
+    ``view.nodes[edge.dst]``, converted from mm to px with ``scale`` and
+    ``pad``. Nothing is drawn when either node has no position.
+
+    Routing:
+      * Fan-out kinds (``_FANOUT_KINDS``) in the cube view use three
+        segments: vertical, a horizontal channel, vertical. The channel is
+        shifted 10 px per edge already drawn with the same kind and
+        destination, tracked in ``fanout_counter``.
+      * Otherwise, endpoints that differ by more than 1 px on both axes get
+        an L-shaped bend. It is vertical-first only in the PE view when the
+        source lies in the left half of the canvas, horizontal-first in
+        every other case.
+      * Remaining edges are a single straight <line>.
+
+    Edges with ``distance_mm > 0`` are labelled "<distance>mm", followed by
+    " <bw>GB/s" when ``edge.bw_gbs`` is truthy.
+
+    Args:
+        parts: SVG fragments collected so far; appended to in place.
+        edge: Edge to draw; both endpoints must be keys of ``view.nodes``.
+        view: View that owns the nodes and supplies ``name`` / ``width_mm``.
+        sizes: Node box sizes in px (accepted but not used here).
+        scale: Pixels per mm.
+        pad: Margin in px added to every coordinate.
+        fanout_counter: Per "<kind>:<dst>" count of fan-out edges drawn;
+            updated in place.
+    """
+    src_node = view.nodes[edge.src]
+    dst_node = view.nodes[edge.dst]
+    if src_node.pos_mm is None or dst_node.pos_mm is None:
+        return
+
+    x1 = src_node.pos_mm[0] * scale + pad
+    y1 = src_node.pos_mm[1] * scale + pad
+    x2 = dst_node.pos_mm[0] * scale + pad
+    y2 = dst_node.pos_mm[1] * scale + pad
+
+    color = _EDGE_COLORS.get(edge.kind, "#94a3b8")
+    width = "1.5" if edge.kind == "pe_internal" else "1"
+    opacity = "0.6" if edge.kind in ("command", "noc_to_ucie") else "0.8"
+    stroke = f'stroke="{color}" stroke-width="{width}" opacity="{opacity}"'
+
+    def polyline(*corners: tuple[float, float]) -> None:
+        # One unfilled <polyline> through the given px points.
+        points = " ".join(f"{cx:.1f},{cy:.1f}" for cx, cy in corners)
+        parts.append(f'  <polyline points="{points}" fill="none" {stroke}/>')
+
+    def annotate(at_x: float, at_y: float) -> None:
+        # Distance (and bandwidth) text centered horizontally on at_x.
+        if not edge.distance_mm > 0:
+            return
+        text = f"{edge.distance_mm:.1f}mm"
+        if edge.bw_gbs:
+            text += f" {edge.bw_gbs:.0f}GB/s"
+        parts.append(
+            f'  <text x="{at_x:.1f}" y="{at_y:.1f}" text-anchor="middle" '
+            f'font-family="monospace" font-size="7" fill="#64748b">'
+            f'{text}</text>'
+        )
+
+    if edge.kind in _FANOUT_KINDS and view.name == "cube":
+        group_key = f"{edge.kind}:{edge.dst}"
+        idx = fanout_counter.get(group_key, 0)
+        fanout_counter[group_key] = idx + 1
+        # With four edges per group the -1.5 offset centers the channels.
+        mid_y = (y1 + y2) / 2 + (idx - 1.5) * 10
+        polyline((x1, y1), (x1, mid_y), (x2, mid_y), (x2, y2))
+        annotate((x1 + x2) / 2, mid_y - 3)  # label rides on the channel
+        return
+
+    if abs(x2 - x1) > 1 and abs(y2 - y1) > 1:
+        if view.name == "pe" and src_node.pos_mm[0] < view.width_mm / 2:
+            polyline((x1, y1), (x1, y2), (x2, y2))  # vertical-first
+        else:
+            polyline((x1, y1), (x2, y1), (x2, y2))  # horizontal-first
+    else:
+        parts.append(
+            f'  <line x1="{x1:.1f}" y1="{y1:.1f}" x2="{x2:.1f}" y2="{y2:.1f}" {stroke}/>'
+        )
+
+    # Label at the midpoint between the two endpoints.
+    annotate((x1 + x2) / 2, (y1 + y2) / 2 - 4)
+
+
+# ── Helpers ──────────────────────────────────────────────────────────
+
+
+def _is_dark(hex_color: str) -> bool:
+    """Return True when ``#rrggbb`` has a weighted brightness below 140 (use white text)."""
+    digits = hex_color.lstrip("#")
+    red, green, blue = (int(digits[i:i + 2], 16) for i in (0, 2, 4))
+    return (red * 0.299 + green * 0.587 + blue * 0.114) < 140
+
+
+def _label_font_size(box_width: float, label: str) -> int:
+    """Return 10, or a smaller size (never below 7) when the label would overflow."""
+    usable = box_width * 0.9
+    # A label overflows when 7 px per character exceeds 90% of the box width.
+    overflows = len(label) * 7 > usable
+    return max(7, int(usable / len(label) * 1.4)) if overflows else 10
+
+
+def _escape(text: str) -> str:
+    """Replace ``&``, ``<`` and ``>`` with XML entities; quotes are left unchanged."""
+    return text.translate(str.maketrans({"&": "&amp;", "<": "&lt;", ">": "&gt;"}))
diff --git a/src/kernbench/triton_emu/__init__.py b/src/kernbench/triton_emu/__init__.py
new file mode 100644
index 0000000..ba436fb
--- /dev/null
+++ b/src/kernbench/triton_emu/__init__.py
@@ -0,0 +1,11 @@
+"""Triton emulator: fake tl module for kernel performance simulation.
+
+Provides TLContext (the fake `tl` parameter) that kernels use to express
+memory access patterns and compute operations. Kernel functions are plain
+Python — no yield, no async — and generate a PeCommand trace that PE_CPU
+replays through SimPy.
+
+Usage:
+    from kernbench.triton_emu.registry import register_kernel, get_kernel
+    from kernbench.triton_emu.tl_context import TLContext
+"""
diff --git a/src/kernbench/triton_emu/registry.py b/src/kernbench/triton_emu/registry.py
new file mode 100644
index 0000000..017e39d
--- /dev/null
+++ b/src/kernbench/triton_emu/registry.py
@@ -0,0 +1,30 @@
+"""Kernel registry: maps kernel names to plain Python kernel functions.
+
+Benchmarks register kernel functions here; PE_CPU looks them up by
+KernelRef.name at execution time.
+"""
+from __future__ import annotations
+
+from collections.abc import Callable
+from typing import Any
+
+_kernels: dict[str, Callable[..., None]] = {}
+
+
+def register_kernel(name: str, fn: Callable[..., None]) -> None:
+    """Add ``fn`` under ``name``; raise ValueError if the name is already taken."""
+    if name in _kernels:  # duplicates are an error, never a silent overwrite
+        raise ValueError(f"kernel '{name}' already registered")
+    _kernels.update({name: fn})
+
+
+def get_kernel(name: str) -> Callable[..., None]:
+    """Return the function registered as ``name``; raise KeyError if there is none."""
+    if name in _kernels:
+        return _kernels[name]
+    raise KeyError(f"kernel '{name}' not registered")
+
+
+def clear_registry() -> None:
+    """Remove every registered kernel, emptying the module-level table in place (for testing)."""
+    _kernels.clear()
diff --git a/src/kernbench/triton_emu/tl_context.py b/src/kernbench/triton_emu/tl_context.py
new file mode 100644
index 0000000..4d5296c
--- /dev/null
+++ b/src/kernbench/triton_emu/tl_context.py
@@ -0,0 +1,356 @@
+"""TLContext: fake Triton Language module for kernel performance simulation.
+
+Passed as the `tl` parameter to kernel functions. Each API call records a
+PeCommand in the internal trace. After the kernel returns, PE_CPU replays
+the command list through SimPy.
+
+Kernel code looks like standard Python — no yield, no async:
+
+    def my_kernel(a_ptr, b_ptr, out_ptr, tl):
+        pid = tl.program_id(0)
+        a = tl.load(a_ptr, shape=(32, 64), dtype="f16")
+        b = tl.load(b_ptr + pid * stride, shape=(64, 32), dtype="f16")
+        tl.composite(op="gemm", a=a, b=b, out_ptr=out_ptr)
+"""
+from __future__ import annotations
+
+import math
+from typing import Literal
+
+from kernbench.common.pe_commands import (
+    CompletionHandle,
+    CompositeCmd,
+    DmaReadCmd,
+    DmaWriteCmd,
+    GemmCmd,
+    MathCmd,
+    PeCommand,
+    PeCpuOverheadCmd,
+    TensorHandle,
+    WaitCmd,
+)
+
+_DTYPE_BYTES: dict[str, int] = {  # dtype name -> bytes per element
+    "f16": 2, "f32": 4, "f64": 8,
+    "bf16": 2,
+    "i8": 1, "i16": 2, "i32": 4, "i64": 8,
+    "u8": 1, "u16": 2, "u32": 4, "u64": 8,
+}
+
+
+class TLContext:
+    """Fake Triton Language context that records a PeCommand trace.
+
+    Args:
+        pe_id: program instance index (returned by program_id).
+        num_programs: total number of program instances.
+        dispatch_cycles: PE_CPU cycles recorded before each load/store/dot/math/composite call; <= 0 disables.
+    """
+
+    def __init__(
+        self,
+        pe_id: int = 0,
+        num_programs: int = 1,
+        dispatch_cycles: int = 1,
+    ) -> None:
+        self._pe_id = pe_id
+        self._num_programs = num_programs
+        self._dispatch_cycles = dispatch_cycles
+        self._commands: list[PeCommand] = []  # recorded trace, in call order
+        self._handle_counter = 0  # number of TensorHandle ids issued so far
+        self._completion_counter = 0  # number of CompletionHandle ids issued so far
+
+    @property
+    def commands(self) -> list[PeCommand]:
+        """Return the recorded command trace (the internal list itself, not a copy)."""
+        return self._commands
+
+    # ── helpers ────────────────────────────────────────────────────
+
+    def _next_handle_id(self) -> str:  # "t1", "t2", ... in creation order
+        self._handle_counter += 1
+        return f"t{self._handle_counter}"
+
+    def _next_completion_id(self) -> str:  # "c1", "c2", ... in creation order
+        self._completion_counter += 1
+        return f"c{self._completion_counter}"
+
+    def _dtype_bytes(self, dtype: str) -> int:
+        return _DTYPE_BYTES.get(dtype, 2)  # NOTE: unknown dtype names silently count as 2 bytes
+
+    def _nbytes(self, shape: tuple[int, ...], dtype: str) -> int:
+        return math.prod(shape) * self._dtype_bytes(dtype)  # element count * element size
+
+    def _emit_dispatch_overhead(self) -> None:  # record the per-call PE_CPU cost, if enabled
+        if self._dispatch_cycles > 0:
+            self._commands.append(PeCpuOverheadCmd(cycles=self._dispatch_cycles))
+
+    def _make_handle(
+        self, pa: int, shape: tuple[int, ...], dtype: str,
+    ) -> TensorHandle:  # new handle with a fresh id; nbytes is derived from shape and dtype
+        return TensorHandle(
+            id=self._next_handle_id(),
+            pa=pa, shape=shape, dtype=dtype,
+            nbytes=self._nbytes(shape, dtype),
+        )
+
+    # ── Reference (no DMA, metadata only) ────────────────────────
+
+    def ref(
+        self, ptr: int, shape: tuple[int, ...], dtype: str = "f16",
+    ) -> TensorHandle:
+        """Create a TensorHandle referencing HBM data without issuing DMA.
+
+        Used when the scheduler will stream data per-tile (e.g., tensor b
+        in a composite GEMM). No command is generated.
+        """
+        return self._make_handle(pa=ptr, shape=shape, dtype=dtype)
+
+    # ── Data Movement (blocking, DMA engine) ──────────────────────
+
+    def load(
+        self, ptr: int, shape: tuple[int, ...], dtype: str = "f16",
+    ) -> TensorHandle:
+        """Load tensor from HBM to TCM. Returns TensorHandle."""
+        self._emit_dispatch_overhead()
+        handle = self._make_handle(pa=ptr, shape=shape, dtype=dtype)
+        self._commands.append(DmaReadCmd(
+            handle=handle, src_pa=ptr, nbytes=handle.nbytes,
+        ))
+        return handle
+
+    def store(self, ptr: int, handle: TensorHandle) -> None:
+        """Store tensor from TCM to HBM."""
+        self._emit_dispatch_overhead()
+        self._commands.append(DmaWriteCmd(
+            handle=handle, dst_pa=ptr, nbytes=handle.nbytes,
+        ))
+
+    # ── GEMM Engine (blocking) ────────────────────────────────────
+
+    def dot(self, a: TensorHandle, b: TensorHandle) -> TensorHandle:
+        """Matrix multiply: out = a @ b. Both operands must be in TCM.
+
+        a: (M, K), b: (K, N) → out: (M, N)
+        """
+        if len(a.shape) < 2 or len(b.shape) < 2:
+            raise ValueError("dot requires 2D tensors")  # the check accepts any rank >= 2
+        m, k = a.shape[-2], a.shape[-1]
+        k2, n = b.shape[-2], b.shape[-1]
+        if k != k2:
+            raise ValueError(f"dot shape mismatch: a.K={k} != b.K={k2}")
+        out_shape = (*a.shape[:-2], m, n)  # leading dims are taken from a only
+        out_dtype = a.dtype  # b.dtype is not consulted
+        out = self._make_handle(pa=0, shape=out_shape, dtype=out_dtype)  # results are created with pa=0
+        self._emit_dispatch_overhead()
+        self._commands.append(GemmCmd(a=a, b=b, out=out, m=m, k=k, n=n))
+        return out
+
+    # ── MATH Engine: unary (blocking) ─────────────────────────────
+
+    def _unary_math(self, op: str, x: TensorHandle) -> TensorHandle:  # output has x's shape and dtype
+        out = self._make_handle(pa=0, shape=x.shape, dtype=x.dtype)
+        self._emit_dispatch_overhead()
+        self._commands.append(MathCmd(op=op, inputs=(x,), out=out))
+        return out
+
+    def exp(self, x: TensorHandle) -> TensorHandle:
+        return self._unary_math("exp", x)
+
+    def log(self, x: TensorHandle) -> TensorHandle:
+        return self._unary_math("log", x)
+
+    def sqrt(self, x: TensorHandle) -> TensorHandle:
+        return self._unary_math("sqrt", x)
+
+    def abs(self, x: TensorHandle) -> TensorHandle:
+        return self._unary_math("abs", x)
+
+    def sigmoid(self, x: TensorHandle) -> TensorHandle:
+        return self._unary_math("sigmoid", x)
+
+    def cos(self, x: TensorHandle) -> TensorHandle:
+        return self._unary_math("cos", x)
+
+    def sin(self, x: TensorHandle) -> TensorHandle:
+        return self._unary_math("sin", x)
+
+    # ── MATH Engine: reduction (blocking) ─────────────────────────
+
+    def _reduction(
+        self, op: str, x: TensorHandle, axis: int,
+    ) -> TensorHandle:  # the reduced axis is kept with length 1
+        out_shape = list(x.shape)
+        out_shape[axis] = 1  # IndexError if axis is out of range for x.shape
+        out = self._make_handle(pa=0, shape=tuple(out_shape), dtype=x.dtype)
+        self._emit_dispatch_overhead()
+        self._commands.append(MathCmd(op=op, inputs=(x,), out=out, axis=axis))
+        return out
+
+    def sum(self, x: TensorHandle, axis: int) -> TensorHandle:
+        return self._reduction("sum", x, axis)
+
+    def max(self, x: TensorHandle, axis: int) -> TensorHandle:
+        return self._reduction("max", x, axis)
+
+    def min(self, x: TensorHandle, axis: int) -> TensorHandle:
+        return self._reduction("min", x, axis)
+
+    # ── MATH Engine: binary (blocking) ────────────────────────────
+
+    def _binary_math(
+        self, op: str, a: TensorHandle, b: TensorHandle,
+    ) -> TensorHandle:  # output takes a's shape and dtype; b's shape is not checked
+        out = self._make_handle(pa=0, shape=a.shape, dtype=a.dtype)
+        self._emit_dispatch_overhead()
+        self._commands.append(MathCmd(op=op, inputs=(a, b), out=out))
+        return out
+
+    def where(
+        self, cond: TensorHandle, a: TensorHandle, b: TensorHandle,
+    ) -> TensorHandle:  # output takes a's shape and dtype; cond and b are not checked
+        out = self._make_handle(pa=0, shape=a.shape, dtype=a.dtype)
+        self._emit_dispatch_overhead()
+        self._commands.append(MathCmd(op="where", inputs=(cond, a, b), out=out))
+        return out
+
+    # ── Index / Scalar (PE_CPU, no engine) ────────────────────────
+
+    def program_id(self, axis: int = 0) -> int:
+        """Return program instance index (the same value for every ``axis``)."""
+        return self._pe_id
+
+    def num_programs(self, axis: int = 0) -> int:
+        """Return total number of program instances (the same value for every ``axis``)."""
+        return self._num_programs
+
+    def arange(self, start: int, end: int, dtype: str = "i32") -> TensorHandle:
+        """Create index range tensor in TCM. Metadata only: no command is recorded."""
+        n = end - start  # not validated; end < start yields a negative length
+        return self._make_handle(pa=0, shape=(n,), dtype=dtype)
+
+    def zeros(self, shape: tuple[int, ...], dtype: str = "f16") -> TensorHandle:
+        """Create zero-filled tensor in TCM. Metadata only: no command is recorded."""
+        return self._make_handle(pa=0, shape=shape, dtype=dtype)
+
+    def full(
+        self, shape: tuple[int, ...], value: float | int, dtype: str = "f16",
+    ) -> TensorHandle:
+        """Create constant-filled tensor in TCM. Metadata only: ``value`` is not stored."""
+        return self._make_handle(pa=0, shape=shape, dtype=dtype)
+
+    # ── Metadata (no compute, no DMA) ─────────────────────────────
+
+    def trans(self, x: TensorHandle) -> TensorHandle:
+        """Transpose — shape change only, no command generated."""
+        if len(x.shape) < 2:
+            raise ValueError("trans requires at least 2D tensor")
+        new_shape = (*x.shape[:-2], x.shape[-1], x.shape[-2])  # swap the last two dims
+        return TensorHandle(
+            id=x.id, pa=x.pa, shape=new_shape,  # same id as x: no new handle id is issued
+            dtype=x.dtype, nbytes=x.nbytes, data=x.data,
+        )
+
+    # ── Composite + Control ───────────────────────────────────────
+
+    def composite(
+        self,
+        op: Literal["gemm", "math"],
+        a: TensorHandle,
+        b: TensorHandle | None = None,
+        out_ptr: int = 0,
+        math_op: str | None = None,
+    ) -> CompletionHandle:
+        """Submit a composite command (non-blocking, tiled pipeline).
+
+        Returns CompletionHandle for use with wait().
+        """
+        # gemm with b given: M x N elements of a.dtype; any other call: same size as a
+        if op == "gemm" and b is not None:
+            m, k = a.shape[-2], a.shape[-1]  # k is unused; a.K vs b.K is not checked here
+            n = b.shape[-1]
+            out_dtype = a.dtype
+            out_nbytes = m * n * self._dtype_bytes(out_dtype)  # leading (batch) dims are not counted
+        else:
+            out_nbytes = a.nbytes
+
+        completion = CompletionHandle(id=self._next_completion_id())
+        self._emit_dispatch_overhead()
+        self._commands.append(CompositeCmd(
+            completion=completion, op=op,
+            a=a, b=b, out_pa=out_ptr, out_nbytes=out_nbytes,
+            math_op=math_op,
+        ))
+        return completion
+
+    def wait(self, handle: CompletionHandle | None = None) -> None:
+        """Wait for a specific composite or all pending composites."""
+        self._commands.append(WaitCmd(handle=handle))  # no dispatch overhead is recorded for wait
+
+    def cycles(self, n: int) -> None:
+        """Declare PE_CPU scalar execution overhead (cycles)."""
+        self._commands.append(PeCpuOverheadCmd(cycles=n))  # recorded as given, even when n <= 0
+
+
+# ── TensorHandle arithmetic operators ─────────────────────────────
+# Enables: a + b, a * b, a - b, a / b in kernel code.
+# Each operator records a MathCmd on the TLContext that is active for the
+# current thread; run_kernel() activates the context around the kernel call.
+
+
+def _enable_tensor_ops() -> None:
+    """Install ``+ - * /`` on TensorHandle and context accessors on TLContext.
+
+    Runs once when this module is imported. Each operator forwards to
+    ``_binary_math`` ("add", "sub", "mul", "div") of the TLContext that is
+    active on the calling thread, which records a MathCmd. The active
+    context is kept in a ``threading.local`` captured by the closures below
+    and is reachable as ``TLContext._set_active`` / ``TLContext._get_active``.
+    """
+    import threading
+
+    _local = threading.local()
+
+    def set_active_context(ctx: TLContext | None) -> None:
+        _local.ctx = ctx
+
+    def get_active_context() -> TLContext:
+        active = getattr(_local, "ctx", None)
+        if active is not None:
+            return active
+        raise RuntimeError("TensorHandle ops require an active TLContext")
+
+    def _binop(op: str):
+        def method(self: TensorHandle, other: TensorHandle) -> TensorHandle:
+            return get_active_context()._binary_math(op, self, other)
+        return method
+
+    # Only the forward operators are installed; no reflected (__radd__ ...) forms.
+    for dunder, op in (("__add__", "add"), ("__sub__", "sub"),
+                       ("__mul__", "mul"), ("__truediv__", "div")):
+        setattr(TensorHandle, dunder, _binop(op))
+
+    TLContext._set_active = staticmethod(set_active_context)  # type: ignore[attr-defined]
+    TLContext._get_active = staticmethod(get_active_context)  # type: ignore[attr-defined]
+
+
+_enable_tensor_ops()
+
+
+def run_kernel(kernel_fn, tl_ctx: TLContext, *args, **kwargs) -> list[PeCommand]:
+    """Call ``kernel_fn(*args, tl=tl_ctx, **kwargs)`` and return ``tl_ctx.commands``.
+
+    ``tl_ctx`` is the active context for TensorHandle operators during the
+    call. Afterwards the previously active context (if any) is restored, even
+    when the kernel raises, so a nested run_kernel call does not clear the outer one.
+    """
+    try:
+        previous = TLContext._get_active()  # type: ignore[attr-defined]
+    except RuntimeError:  # raised by _get_active when no context is active
+        previous = None
+    TLContext._set_active(tl_ctx)  # type: ignore[attr-defined]
+    try:
+        kernel_fn(*args, tl=tl_ctx, **kwargs)
+    finally:
+        TLContext._set_active(previous)  # type: ignore[attr-defined]
+    return tl_ctx.commands
diff --git a/tests/test_cli.py b/tests/test_cli.py
new file mode 100644
index 0000000..b1f8df9
--- /dev/null
+++ b/tests/test_cli.py
@@ -0,0 +1,22 @@
+import kernbench.cli.main as cli_main
+
+
+def test_cli_main_arg_parsing(monkeypatch):
+    """Parse a `run` command line and check the arguments handed to cmd_run."""
+    def fake_cmd_run(args) -> int:
+        assert args.cmd == "run"
+        assert args.topology == "topology.yaml"
+        assert args.bench == "qkv_gemm"
+        assert args.device is None
+        return 0
+
+    # Swap in the fake handler so only argument parsing runs, not the bench.
+    monkeypatch.setattr(cli_main, "cmd_run", fake_cmd_run)
+    rc = cli_main.main(["run", "--topology", "topology.yaml", "--bench", "qkv_gemm"])
+    assert rc == 0
+
+
+def test_cli_main():
+    """Run the unpatched `run` command and expect exit code 0."""
+    argv = ["run", "--topology", "topology.yaml", "--bench", "qkv_gemm"]
+    assert cli_main.main(argv) == 0
diff --git a/tests/test_component_registry.py b/tests/test_component_registry.py
new file mode 100644
index 0000000..2e6bb82
--- /dev/null
+++ b/tests/test_component_registry.py
@@ -0,0 +1,187 @@
+"""Tests for the SimPy component model and DI registry (ADR-0007 D3).
+
+Phase 1 verification: all tests FAIL until Phase 2 implements production code.
+
+Latency invariant after refactor:
+  total_ns = Σ(wire propagation) + Σ(component.run() overhead_ns) + nbytes / bottleneck_bw
+  This is identical to the current formula for Phase 0 (no contention).
+"""
+
+import pytest
+import simpy
+
+from pathlib import Path
+
+from kernbench.components.base import ComponentBase, ComponentRegistry
+from kernbench.components.impls.forwarding import TransitComponent
+from kernbench.policy.address.phyaddr import PhysAddr
+from kernbench.runtime_api.kernel import MemoryReadMsg
+from kernbench.sim_engine.engine import GraphEngine
+from kernbench.topology.builder import load_topology
+from kernbench.topology.types import Node
+
+TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
+
+
+def _graph():
+    return load_topology(TOPOLOGY_PATH)
+
+
+def _hbm_pa(pe_id: int = 0) -> int:
+    slice_bytes = 48 * (1 << 30) // 8
+    pa = PhysAddr.pe_hbm_addr(
+        rack_id=0, sip_id=0, cube_id=0, pe_id=pe_id,
+        pe_local_hbm_offset=0x1000, slice_size_bytes=slice_bytes,
+    )
+    return pa.encode()
+
+
+def _node(impl: str, overhead_ns: float = 0.0) -> Node:
+    return Node(id="test", kind="xbar", impl=impl, attrs={"overhead_ns": overhead_ns}, pos_mm=None)
+
+
+# ── 1. unknown impl → error ──────────────────────────────────────────
+
+
+def test_registry_unknown_impl_raises_error():
+    """Unregistered impl raises ValueError (no fallback)."""
+    node = _node("totally_unknown_v99", overhead_ns=5.0)
+    with pytest.raises(ValueError, match="No component registered"):
+        ComponentRegistry.create(node)
+
+
+# ── 2. TransitComponent yields exactly overhead_ns via simpy timeout ──
+
+
+def test_transit_component_yields_overhead_ns():
+    """TransitComponent.run() yields exactly node.attrs['overhead_ns'] ns."""
+    node = _node("xbar_v1", overhead_ns=3.0)
+    comp = TransitComponent(node)
+    env = simpy.Environment()
+
+    def proc():
+        yield from comp.run(env, nbytes=4096)
+
+    env.process(proc())
+    env.run()
+    assert env.now == pytest.approx(3.0)
+
+
+def test_transit_component_zero_overhead_ns():
+    """TransitComponent with overhead_ns=0 still yields (no infinite loop)."""
+    node = _node("noc_v1", overhead_ns=0.0)
+    comp = TransitComponent(node)
+    env = simpy.Environment()
+
+    done = []
+
+    def proc():
+        yield from comp.run(env, nbytes=1024)
+        done.append(True)
+
+    env.process(proc())
+    env.run()
+    assert done == [True]
+    assert env.now == pytest.approx(0.0)
+
+
+# ── 3. DI override: custom component is invoked by engine ────────────
+
+
+def test_engine_component_override_is_called():
+    """Custom component injected via component_overrides is invoked during simulation."""
+
+    class SpyXbar(ComponentBase):
+        calls = 0
+
+        def run(self, env, nbytes):
+            SpyXbar.calls += 1
+            yield env.timeout(0)
+
+    SpyXbar.calls = 0
+    graph = _graph()
+    engine = GraphEngine(graph, component_overrides={"xbar_v1": SpyXbar})
+    msg = MemoryReadMsg(
+        correlation_id="c", request_id="r",
+        src_sip=0, src_cube=0, src_pe=0,
+        src_pa=_hbm_pa(pe_id=0), nbytes=4096,
+    )
+    h = engine.submit(msg)
+    engine.wait(h)
+    # PE0→slice0 path passes through xbar.pe0 (impl=xbar_v1)
+    assert SpyXbar.calls > 0
+
+
+# ── 4. behavior unchanged: total_ns matches existing formula ─────────
+
+
+def test_engine_component_model_same_latency_as_before():
+    """Phase B component model total_ns for PE0→slice0 local HBM (4096B).
+
+    Cut-through (wormhole) wire model: wires apply propagation only.
+    Serialization (drain) is computed per-path and applied once at the terminal.
+
+    Forward path:
+      Path 1: pcie_ep(5.0) + wire(1.0mm=0.01) + io_cpu(10.0)
+      Path 2: wire(3.5mm=0.035) + ucie-N(1.0)
+              + 2DMeshNOC(ucie-N→m_cpu: Manhattan 10.9mm=0.109) + m_cpu(5.0)
+      Path 3 DMA (m_cpu→noc→xbar.pe0→hbm_ctrl.slice0):
+              + 2DMeshNOC(m_cpu→xbar.pe0: Manhattan 15.0mm=0.15)
+              + xbar.pe0(2.0) + wire(2.5mm=0.025) + hbm_ctrl(0.0)
+              + drain_ns(4096/128 = 32.0, bottleneck = noc_to_xbar 128 GB/s)
+
+    Response path (reverse, nbytes=0, drain=0):
+      DMA response: hbm_ctrl→xbar.pe0→noc→m_cpu (propagation + xbar overhead_ns)
+      Command response: m_cpu→noc→ucie-N→io_cpu (propagation + ucie overhead_ns)
+
+    Total:  ~58.648 ns
+    """
+    graph = _graph()
+    engine = GraphEngine(graph)
+    msg = MemoryReadMsg(
+        correlation_id="c", request_id="r",
+        src_sip=0, src_cube=0, src_pe=0,
+        src_pa=_hbm_pa(pe_id=0), nbytes=4096,
+    )
+    h = engine.submit(msg)
+    engine.wait(h)
+    _, trace = engine.get_completion(h)
+    assert trace["total_ns"] == pytest.approx(58.648, rel=1e-4)
+
+
+# ── 5. override is scoped: only targeted impl is replaced ────────────
+
+
+def test_engine_override_is_scoped_to_impl():
+    """xbar_v1 override (ZeroXbar, no overhead_ns) reduces total_ns by exactly 4.0 ns.
+
+    xbar.pe0 has overhead_ns=2.0. It is traversed on both the forward DMA path
+    and the reverse response path, so replacing it with a zero-latency impl
+    removes 2.0 ns × 2 = 4.0 ns; all other components are unchanged.
+    """
+
+    class ZeroXbar(ComponentBase):
+        def run(self, env, nbytes):
+            yield env.timeout(0)
+
+    graph = _graph()
+    engine_default = GraphEngine(graph)
+    engine_override = GraphEngine(graph, component_overrides={"xbar_v1": ZeroXbar})
+
+    msg = MemoryReadMsg(
+        correlation_id="c", request_id="r",
+        src_sip=0, src_cube=0, src_pe=0,
+        src_pa=_hbm_pa(pe_id=0), nbytes=4096,
+    )
+
+    h_d = engine_default.submit(msg)
+    engine_default.wait(h_d)
+    _, t_default = engine_default.get_completion(h_d)
+
+    h_o = engine_override.submit(msg)
+    engine_override.wait(h_o)
+    _, t_override = engine_override.get_completion(h_o)
+
+    # ZeroXbar removes overhead_ns=2.0 from xbar.pe0 on forward + response = 4.0 ns faster
+    assert t_override["total_ns"] < t_default["total_ns"]
+    assert t_default["total_ns"] - t_override["total_ns"] == pytest.approx(4.0, rel=1e-6)
diff --git a/tests/test_engine.py b/tests/test_engine.py
new file mode 100644
index 0000000..50f550b
--- /dev/null
+++ b/tests/test_engine.py
@@ -0,0 +1,405 @@
+import pytest
+
+from pathlib import Path
+
+from kernbench.common.types import Completion, RequestHandle
+from kernbench.policy.address.phyaddr import PhysAddr
+from kernbench.runtime_api.kernel import (
+    KernelLaunchMsg,
+    KernelRef,
+    MemoryReadMsg,
+    MemoryWriteMsg,
+    ScalarArg,
+    TensorArg,
+    TensorArgShard,
+)
+from kernbench.sim_engine.engine import GraphEngine
+from kernbench.topology.builder import load_topology
+
+TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
+
+
+def _engine():
+    graph = load_topology(TOPOLOGY_PATH)
+    return GraphEngine(graph)
+
+
+def _hbm_pa(sip: int = 0, cube: int = 0, pe_id: int = 0) -> int:
+    """Create an HBM physical address targeting a specific PE's HBM slice."""
+    # 48 GB / 8 slices = 6 GB per slice
+    slice_bytes = 48 * (1 << 30) // 8
+    pa = PhysAddr.pe_hbm_addr(
+        rack_id=0, sip_id=sip, cube_id=cube, pe_id=pe_id,
+        pe_local_hbm_offset=0x1000, slice_size_bytes=slice_bytes,
+    )
+    return pa.encode()
+
+
+def _sram_pa(sip: int = 0, cube: int = 0) -> int:
+    """Create an SRAM physical address."""
+    pa = PhysAddr.cube_sram_addr(rack_id=0, sip_id=sip, cube_id=cube, sram_offset=0x800)
+    return pa.encode()
+
+
+# ── 1. submit returns handle ────────────────────────────────────────
+
+
+def test_engine_submit_returns_handle():
+    """submit() must return a RequestHandle (non-empty string)."""
+    engine = _engine()
+    msg = MemoryWriteMsg(
+        correlation_id="c0", request_id="r0",
+        dst_sip=0, dst_cube=0, dst_pe=0,
+        dst_pa=_hbm_pa(), nbytes=4096, pattern="zero",
+    )
+    handle = engine.submit(msg)
+    assert isinstance(handle, str)
+    assert len(handle) > 0
+
+
+# ── 2. memory write completion ──────────────────────────────────────
+
+
+def test_engine_memory_write_completion():
+    """MemoryWrite must complete with ok=True."""
+    engine = _engine()
+    msg = MemoryWriteMsg(
+        correlation_id="c0", request_id="r1",
+        dst_sip=0, dst_cube=0, dst_pe=0,
+        dst_pa=_hbm_pa(), nbytes=4096, pattern="zero",
+    )
+    h = engine.submit(msg)
+    engine.wait(h)
+    comp, trace = engine.get_completion(h)
+    assert comp.ok is True
+
+
+# ── 3. memory read completion ───────────────────────────────────────
+
+
+def test_engine_memory_read_completion():
+    """MemoryRead must complete with ok=True."""
+    engine = _engine()
+    msg = MemoryReadMsg(
+        correlation_id="c0", request_id="r2",
+        src_sip=0, src_cube=0, src_pe=0,
+        src_pa=_hbm_pa(), nbytes=4096,
+    )
+    h = engine.submit(msg)
+    engine.wait(h)
+    comp, trace = engine.get_completion(h)
+    assert comp.ok is True
+
+
+# ── 4. latency positive ────────────────────────────────────────────
+
+
+def test_engine_latency_positive():
+    """Trace total_ns must be > 0 (ADR-0002 D4)."""
+    engine = _engine()
+    msg = MemoryWriteMsg(
+        correlation_id="c0", request_id="r3",
+        dst_sip=0, dst_cube=0, dst_pe=0,
+        dst_pa=_hbm_pa(), nbytes=4096, pattern="zero",
+    )
+    h = engine.submit(msg)
+    engine.wait(h)
+    _, trace = engine.get_completion(h)
+    assert trace["total_ns"] > 0
+
+
+# ── 5. trace has total_ns and nbytes ───────────────────────────────
+
+
+def test_engine_trace_has_total_ns_and_nbytes():
+    """Trace must contain 'total_ns' and 'nbytes'."""
+    engine = _engine()
+    msg = MemoryWriteMsg(
+        correlation_id="c0", request_id="r4",
+        dst_sip=0, dst_cube=0, dst_pe=0,
+        dst_pa=_hbm_pa(), nbytes=4096, pattern="zero",
+    )
+    h = engine.submit(msg)
+    engine.wait(h)
+    _, trace = engine.get_completion(h)
+    assert "total_ns" in trace
+    assert "nbytes" in trace
+    assert trace["nbytes"] == 4096
+
+
+# ── 6. latency includes node overhead_ns ────────────────────────────
+
+
+def test_engine_latency_includes_node_overhead_ns():
+    """Path traverses components with overhead_ns > 0, so total >= some minimum."""
+    engine = _engine()
+    msg = MemoryWriteMsg(
+        correlation_id="c0", request_id="r7",
+        dst_sip=0, dst_cube=0, dst_pe=0,
+        dst_pa=_hbm_pa(), nbytes=4096, pattern="zero",
+    )
+    h = engine.submit(msg)
+    engine.wait(h)
+    _, trace = engine.get_completion(h)
+    # pcie_ep (5.0) + io_cpu (10.0) + m_cpu (5.0) = at least 20 ns
+    assert trace["total_ns"] >= 20.0
+
+
+# ── 7. concurrent requests ─────────────────────────────────────────
+
+
+def test_engine_concurrent_requests():
+    """Two requests submitted before wait must both complete with traces."""
+    engine = _engine()
+    msg1 = MemoryWriteMsg(
+        correlation_id="c0", request_id="r9a",
+        dst_sip=0, dst_cube=0, dst_pe=0,
+        dst_pa=_hbm_pa(), nbytes=4096, pattern="zero",
+    )
+    msg2 = MemoryWriteMsg(
+        correlation_id="c0", request_id="r9b",
+        dst_sip=0, dst_cube=0, dst_pe=1,
+        dst_pa=_hbm_pa(pe_id=1), nbytes=4096, pattern="zero",
+    )
+    h1 = engine.submit(msg1)
+    h2 = engine.submit(msg2)
+    engine.wait(h1)
+    engine.wait(h2)
+    comp1, trace1 = engine.get_completion(h1)
+    comp2, trace2 = engine.get_completion(h2)
+    assert comp1.ok is True
+    assert comp2.ok is True
+    assert trace1["total_ns"] > 0
+    assert trace2["total_ns"] > 0
+
+
+# ── 8. kernel launch ───────────────────────────────────────────────
+
+
+def test_engine_kernel_launch_simplified():
+    """KernelLaunch of a registered kernel completes ok with total_ns > 0."""
+    from kernbench.triton_emu.registry import clear_registry, register_kernel
+
+    def gemm_kernel(a_ptr, tl):
+        a = tl.load(a_ptr, shape=(4, 4), dtype="f16")
+        tl.store(a_ptr, a)
+
+    clear_registry()
+    register_kernel("gemm", gemm_kernel)
+    try:
+        engine = _engine()
+        shard0 = TensorArgShard(
+            sip=0, cube=0, pe=0,
+            pa=_hbm_pa(pe_id=0), nbytes=4096, offset_bytes=0,
+        )
+        shard1 = TensorArgShard(
+            sip=0, cube=0, pe=1,
+            pa=_hbm_pa(pe_id=1), nbytes=4096, offset_bytes=4096,
+        )
+        msg = KernelLaunchMsg(
+            correlation_id="c0", request_id="r10",
+            kernel_ref=KernelRef(name="gemm", kind="builtin"),
+            args=(TensorArg(shards=(shard0, shard1)),),
+        )
+        h = engine.submit(msg)
+        engine.wait(h)
+        comp, trace = engine.get_completion(h)
+    finally:
+        clear_registry()  # always unregister, even when the launch raises
+
+    assert comp.ok is True
+    assert trace["total_ns"] > 0
+
+
+# ── 9. deterministic ───────────────────────────────────────────────
+
+
+def test_engine_deterministic():
+    """Same request on two engines must produce identical latency."""
+    msg = MemoryWriteMsg(
+        correlation_id="c0", request_id="r11",
+        dst_sip=0, dst_cube=0, dst_pe=0,
+        dst_pa=_hbm_pa(), nbytes=4096, pattern="zero",
+    )
+    e1 = _engine()
+    h1 = e1.submit(msg)
+    e1.wait(h1)
+    _, t1 = e1.get_completion(h1)
+
+    e2 = _engine()
+    h2 = e2.submit(msg)
+    e2.wait(h2)
+    _, t2 = e2.get_completion(h2)
+
+    assert t1["total_ns"] == t2["total_ns"]
+
+
+# ── 10. DMA capacity serializes concurrent writes ──────────────────
+
+
+def test_dma_capacity_serializes_concurrent():
+    """Two concurrent DMA writes to the same cube must contend at DMA capacity=1.
+
+    When two MemoryWrite requests target the same cube's M_CPU simultaneously,
+    the DMA engine (capacity=1) serializes them. The slower request must take
+    longer than a single isolated request (ADR-0014 D4, ADR-0015 D5).
+    """
+    # Single isolated write baseline
+    engine_single = _engine()
+    msg_single = MemoryWriteMsg(
+        correlation_id="c0", request_id="single",
+        dst_sip=0, dst_cube=0, dst_pe=0,
+        dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096,
+        pattern="zero", target_pe=0,
+    )
+    h1 = engine_single.submit(msg_single)
+    engine_single.wait(h1)
+    _, t1 = engine_single.get_completion(h1)
+    single_ns = t1["total_ns"]
+
+    # Two concurrent writes to same cube (different PEs) → DMA contention
+    engine_conc = _engine()
+    msg_a = MemoryWriteMsg(
+        correlation_id="c0", request_id="conc-a",
+        dst_sip=0, dst_cube=0, dst_pe=0,
+        dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096,
+        pattern="zero", target_pe=0,
+    )
+    msg_b = MemoryWriteMsg(
+        correlation_id="c0", request_id="conc-b",
+        dst_sip=0, dst_cube=0, dst_pe=1,
+        dst_pa=_hbm_pa(sip=0, cube=0, pe_id=1), nbytes=4096,
+        pattern="zero", target_pe=1,
+    )
+    ha = engine_conc.submit(msg_a)
+    hb = engine_conc.submit(msg_b)
+    engine_conc.wait(ha)
+    engine_conc.wait(hb)
+    _, ta = engine_conc.get_completion(ha)
+    _, tb = engine_conc.get_completion(hb)
+
+    # At least one must be delayed by DMA contention
+    max_ns = max(ta["total_ns"], tb["total_ns"])
+    assert max_ns > single_ns, (
+        f"concurrent max ({max_ns:.2f}ns) must > single ({single_ns:.2f}ns) "
+        f"due to DMA capacity=1 contention"
+    )
+
+
+# ── 11. formula latency lower bound ──────────────────────────────
+
+
+def test_formula_latency_lower_bound():
+    """_formula_latency must be <= actual latency (ADR-0015 D7).
+
+    The formula uses the PE DMA path (pe0 -> HBM slice of PE1), which is fully
+    known at engine level; the actual value is the simulated total_ns of a
+    MemoryReadMsg that reads the same address.
+    """
+    from kernbench.policy.routing.router import AddressResolver, PathRouter
+
+    graph = load_topology(TOPOLOGY_PATH)
+    engine = GraphEngine(graph)
+    resolver = AddressResolver(graph)
+    router = PathRouter(graph)
+
+    pa = _hbm_pa(sip=0, cube=0, pe_id=1)
+    pa_obj = PhysAddr.decode(pa)
+    dst_node = resolver.resolve(pa_obj)
+    pe_ref = "sip0.cube0.pe0"
+    path = router.find_path(pe_ref, dst_node)
+    formula = engine._formula_latency(path, 4096)
+
+    # Run actual simulation
+    msg = MemoryReadMsg(
+        correlation_id="c0", request_id="formula-lb",
+        src_sip=0, src_cube=0, src_pe=0,
+        src_pa=pa, nbytes=4096, target_pe=1,
+    )
+    h = engine.submit(msg)
+    engine.wait(h)
+    _, trace = engine.get_completion(h)
+    actual = trace["total_ns"]
+
+    assert formula <= actual, (
+        f"formula ({formula:.2f}) must <= actual ({actual:.2f})"
+    )
+    assert formula > 0, "formula must be > 0"
+
+
+def test_formula_latency_exact_no_contention():
+    """With no contention, formula should approximate actual for PE DMA.
+
+    PE DMA is single-request with no fan-out or aggregation,
+    so formula ≈ actual (within small tolerance for SimPy scheduling).
+    The formula is evaluated on the routed path from pe0 to its own HBM slice
+    and compared with a PeDmaMsg issued by the same PE to the same address.
+    """
+    from kernbench.runtime_api.kernel import PeDmaMsg
+    from kernbench.policy.routing.router import AddressResolver, PathRouter
+
+    graph = load_topology(TOPOLOGY_PATH)
+    engine = GraphEngine(graph)
+    resolver = AddressResolver(graph)
+    router = PathRouter(graph)
+
+    pa = _hbm_pa(sip=0, cube=0, pe_id=0)
+    pa_obj = PhysAddr.decode(pa)
+    dst_node = resolver.resolve(pa_obj)
+    pe_ref = "sip0.cube0.pe0"
+    path = router.find_path(pe_ref, dst_node)
+    formula = engine._formula_latency(path, 4096)
+
+    msg = PeDmaMsg(
+        correlation_id="c0", request_id="formula-exact",
+        src_sip=0, src_cube=0, src_pe=0,
+        dst_pa=pa, nbytes=4096,
+    )
+    h = engine.submit(msg)
+    engine.wait(h)
+    _, trace = engine.get_completion(h)
+    actual = trace["total_ns"]
+
+    # No contention: formula should equal actual
+    assert abs(formula - actual) < 0.01, (
+        f"formula ({formula:.4f}) ≈ actual ({actual:.4f}) expected with no contention"
+    )
+
+
+# ── 12. remote cube access succeeds with higher latency ────────────
+
+
+def test_engine_remote_cube_latency_higher():
+    """Accessing a distant cube's HBM must have strictly higher latency than local.
+
+    Uses separate engines to avoid contention effects.
+    cube15 (far corner of 4x4 mesh) requires multiple UCIe + NOC hops
+    from IO chiplet compared to cube0 (directly connected).
+    """
+    engine_local = _engine()
+    engine_remote = _engine()
+    msg_local = MemoryReadMsg(
+        correlation_id="c0", request_id="r14a",
+        src_sip=0, src_cube=0, src_pe=0,
+        src_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096,
+    )
+    msg_remote = MemoryReadMsg(
+        correlation_id="c0", request_id="r14b",
+        src_sip=0, src_cube=0, src_pe=0,
+        src_pa=_hbm_pa(sip=0, cube=15, pe_id=0), nbytes=4096,
+    )
+    h_local = engine_local.submit(msg_local)
+    engine_local.wait(h_local)
+    _, t_local = engine_local.get_completion(h_local)
+
+    h_remote = engine_remote.submit(msg_remote)
+    engine_remote.wait(h_remote)
+    comp_remote, t_remote = engine_remote.get_completion(h_remote)
+
+    assert comp_remote.ok is True
+    assert t_remote is not None and t_local is not None
+    assert t_remote["total_ns"] > t_local["total_ns"], (
+        f"remote cube {t_remote['total_ns']:.2f} must > local {t_local['total_ns']:.2f}"
+    )
+
+
diff --git a/tests/test_pe_components.py b/tests/test_pe_components.py
new file mode 100644
index 0000000..4d6a1f9
--- /dev/null
+++ b/tests/test_pe_components.py
@@ -0,0 +1,1175 @@
+"""Tests for PE internal component implementations (ADR-0014).
+
+Validates:
+  - Registry resolves all 6 PE component impl strings
+  - PE_DMA dual-channel concurrency (READ ∥ WRITE allowed)
+  - PE_DMA same-channel serialization (READ ∥ READ blocked)
+  - PE_GEMM / PE_MATH shared accel_slot (capacity=1)
+  - PeDmaMsg probe regression (latency unchanged)
+  - Stage 2: PE_CPU kernel execution + PE_SCHEDULER dispatch
+"""
+from pathlib import Path
+
+import simpy
+
+from kernbench.common.pe_commands import (
+    DmaReadCmd,
+    GemmCmd,
+    MathCmd,
+    PeInternalTxn,
+    TensorHandle,
+)
+from kernbench.components.base import ComponentRegistry
+from kernbench.components.impls.pe_cpu import PeCpuComponent
+from kernbench.components.impls.pe_dma import PeDmaComponent
+from kernbench.components.impls.pe_gemm import PeGemmComponent
+from kernbench.components.impls.pe_math import PeMathComponent
+from kernbench.components.impls.pe_scheduler import PeSchedulerComponent
+from kernbench.components.impls.pe_tcm import PeTcmComponent
+from kernbench.policy.address.phyaddr import PhysAddr
+from kernbench.runtime_api.kernel import (
+    KernelLaunchMsg,
+    KernelRef,
+    MemoryReadMsg,
+    MemoryWriteMsg,
+    PeDmaMsg,
+    TensorArg,
+    TensorArgShard,
+)
+from kernbench.sim_engine.engine import GraphEngine
+from kernbench.sim_engine.transaction import Transaction
+from kernbench.topology.builder import load_topology
+from kernbench.topology.types import Node
+from kernbench.triton_emu.registry import clear_registry, register_kernel
+
+TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
+
+
+def _engine():
+    return GraphEngine(load_topology(TOPOLOGY_PATH))
+
+
+def _hbm_pa(sip: int = 0, cube: int = 0, pe_id: int = 0) -> int:
+    slice_bytes = 48 * (1 << 30) // 8
+    pa = PhysAddr.pe_hbm_addr(
+        rack_id=0, sip_id=sip, cube_id=cube, pe_id=pe_id,
+        pe_local_hbm_offset=0x1000, slice_size_bytes=slice_bytes,
+    )
+    return pa.encode()
+
+
+# ── 1. Registry resolves all PE impl strings ──────────────────────
+
+
+def test_pe_registry_resolves_all():
+    """All 6 PE component impl strings must resolve to their specific classes."""
+    expected = {
+        "pe_cpu_v1": PeCpuComponent,
+        "pe_scheduler_v1": PeSchedulerComponent,
+        "pe_dma_v1": PeDmaComponent,
+        "pe_gemm_v1": PeGemmComponent,
+        "pe_math_v1": PeMathComponent,
+        "pe_tcm_v1": PeTcmComponent,
+    }
+    for impl, cls in expected.items():
+        node = Node(id=f"test.{impl}", kind=impl.replace("_v1", ""),
+                     impl=impl, pos_mm=None, attrs={})
+        comp = ComponentRegistry.create(node)
+        assert isinstance(comp, cls), f"{impl} resolved to {type(comp)}, expected {cls}"
+
+
+# ── 2. PE_DMA dual-channel: READ ∥ WRITE allowed ──────────────────
+
+
+def test_pe_dma_dual_channel_concurrent():
+    """PE_DMA READ and WRITE channels are independent (ADR-0014 D4).
+
+    Two concurrent DMA operations on different channels should not block
+    each other — both should complete at the same time as a single op.
+    """
+    env = simpy.Environment()
+    node = Node(id="sip0.cube0.pe0.pe_dma", kind="pe_dma",
+                impl="pe_dma_v1", pos_mm=None,
+                attrs={"rd_engines": 1, "wr_engines": 1})
+    comp = PeDmaComponent(node)
+
+    # Create minimal ports: just need inbox via start()
+    # We'll use a sink store as out_port
+    sink = simpy.Store(env)
+    comp.out_ports["next"] = sink
+    comp.in_ports["src"] = simpy.Store(env)
+    comp.start(env)
+
+    results = []
+
+    def submit_and_track(request, label):
+        done = env.event()
+        txn = Transaction(
+            request=request, path=["sip0.cube0.pe0.pe_dma", "next"],
+            step=0, nbytes=4096, done=done,
+        )
+        yield comp._inbox.put(txn)
+        yield done
+        results.append((label, env.now))
+
+    # Drain sink so transactions don't block
+    def drain_sink():
+        while True:
+            txn = yield sink.get()
+            txn.done.succeed()
+
+    env.process(drain_sink())
+
+    read_req = MemoryReadMsg(
+        correlation_id="c", request_id="r1",
+        src_sip=0, src_cube=0, src_pe=0,
+        src_pa=_hbm_pa(), nbytes=4096,
+    )
+    write_req = MemoryWriteMsg(
+        correlation_id="c", request_id="r2",
+        dst_sip=0, dst_cube=0, dst_pe=0,
+        dst_pa=_hbm_pa(), nbytes=4096, pattern="zero",
+    )
+
+    env.process(submit_and_track(read_req, "read"))
+    env.process(submit_and_track(write_req, "write"))
+    env.run()
+
+    assert len(results) == 2
+    # Both should complete at same time (no contention between channels)
+    assert results[0][1] == results[1][1], (
+        f"READ and WRITE should not block each other: "
+        f"{results[0]} vs {results[1]}"
+    )
+
+
+# ── 3. PE_DMA same-channel serializes ─────────────────────────────
+
+
+def test_pe_dma_same_channel_serializes():
+    """Two READ operations on the same PE_DMA must serialize (capacity=1)."""
+    env = simpy.Environment()
+    node = Node(id="sip0.cube0.pe0.pe_dma", kind="pe_dma",
+                impl="pe_dma_v1", pos_mm=None,
+                attrs={"rd_engines": 1, "wr_engines": 1})
+    comp = PeDmaComponent(node)
+
+    sink = simpy.Store(env)
+    comp.out_ports["next"] = sink
+    comp.in_ports["src"] = simpy.Store(env)
+    comp.start(env)
+
+    completions = []
+
+    def submit_read(req_id):
+        done = env.event()
+        req = MemoryReadMsg(
+            correlation_id="c", request_id=req_id,
+            src_sip=0, src_cube=0, src_pe=0,
+            src_pa=_hbm_pa(), nbytes=4096,
+        )
+        txn = Transaction(
+            request=req, path=["sip0.cube0.pe0.pe_dma", "next"],
+            step=0, nbytes=4096, done=done,
+        )
+        yield comp._inbox.put(txn)
+        yield done
+        completions.append((req_id, env.now))
+
+    # Sink completes transactions after 10ns delay (simulates downstream)
+    def drain_sink():
+        while True:
+            txn = yield sink.get()
+            yield env.timeout(10)
+            txn.done.succeed()
+
+    env.process(drain_sink())
+    env.process(submit_read("r1"))
+    env.process(submit_read("r2"))
+    env.run()
+
+    assert len(completions) == 2
+    # Second read must complete later due to serialization
+    t1 = completions[0][1]
+    t2 = completions[1][1]
+    assert t2 > t1, (
+        f"Second READ ({t2}) must complete after first ({t1}) "
+        f"due to DMA capacity=1"
+    )
+
+
+# ── 4. PE_GEMM / PE_MATH shared accel_slot ────────────────────────
+
+
+def test_pe_accel_shared_slot():
+    """GEMM and MATH share PE_ACCEL capacity=1 — cannot overlap (ADR-0014 D4)."""
+    from kernbench.components.context import ComponentContext
+    from kernbench.policy.routing.router import AddressResolver, PathRouter
+
+    graph = load_topology(TOPOLOGY_PATH)
+    env = simpy.Environment()
+    ctx = ComponentContext(
+        router=PathRouter(graph),
+        resolver=AddressResolver(graph),
+        positions={},
+        ns_per_mm=0.01,
+        spec=graph.spec,
+    )
+
+    pe_prefix = "sip0.cube0.pe0"
+    gemm_node = Node(
+        id=f"{pe_prefix}.pe_gemm", kind="pe_gemm", impl="pe_gemm_v1",
+        pos_mm=None, attrs={"overhead_ns": 10.0, "shared_resource": "accel_slot"},
+    )
+    math_node = Node(
+        id=f"{pe_prefix}.pe_math", kind="pe_math", impl="pe_math_v1",
+        pos_mm=None, attrs={"overhead_ns": 10.0, "shared_resource": "accel_slot"},
+    )
+    gemm = PeGemmComponent(gemm_node, ctx)
+    math_comp = PeMathComponent(math_node, ctx)
+
+    # Wire minimal ports
+    gemm.in_ports["src"] = simpy.Store(env)
+    math_comp.in_ports["src"] = simpy.Store(env)
+    gemm.start(env)
+    math_comp.start(env)
+
+    completions = []
+
+    def submit(comp_inst, label):
+        done = env.event()
+        txn = Transaction(
+            request=None, path=[comp_inst.node.id],
+            step=0, nbytes=0, done=done,
+        )
+        yield comp_inst._inbox.put(txn)
+        yield done
+        completions.append((label, env.now))
+
+    env.process(submit(gemm, "gemm"))
+    env.process(submit(math_comp, "math"))
+    env.run()
+
+    assert len(completions) == 2
+    t1 = completions[0][1]
+    t2 = completions[1][1]
+    # One completes at 10ns, the other at 20ns (serialized)
+    assert t1 == 10.0, f"First should complete at 10ns, got {t1}"
+    assert t2 == 20.0, f"Second should complete at 20ns, got {t2}"
+
+
+# ── 5. PeDmaMsg probe regression ──────────────────────────────────
+
+
+def test_pe_dma_probe_regression():
+    """PeDmaMsg probe must still complete with same latency after PE component registration."""
+    msg = PeDmaMsg(
+        correlation_id="probe", request_id="regression",
+        src_sip=0, src_cube=0, src_pe=0,
+        dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096,
+    )
+    e1 = _engine()
+    h1 = e1.submit(msg)
+    e1.wait(h1)
+    comp1, trace1 = e1.get_completion(h1)
+    assert comp1.ok is True
+    assert trace1["total_ns"] > 0
+
+    # Deterministic: two engines produce same result
+    e2 = _engine()
+    h2 = e2.submit(msg)
+    e2.wait(h2)
+    _, trace2 = e2.get_completion(h2)
+    assert trace1["total_ns"] == trace2["total_ns"]
+
+
+# ── 6. PE_GEMM handles PeInternalTxn ────────────────────────────
+
+
+def test_pe_gemm_handles_pe_internal_txn():
+    """PE_GEMM receives PeInternalTxn with GemmCmd, acquires accel, signals done."""
+    from kernbench.components.context import ComponentContext
+    from kernbench.policy.routing.router import AddressResolver, PathRouter
+
+    graph = load_topology(TOPOLOGY_PATH)
+    env = simpy.Environment()
+    ctx = ComponentContext(
+        router=PathRouter(graph),
+        resolver=AddressResolver(graph),
+        positions={}, ns_per_mm=0.01, spec=graph.spec,
+    )
+
+    pe_prefix = "sip0.cube0.pe0"
+    gemm_node = Node(
+        id=f"{pe_prefix}.pe_gemm", kind="pe_gemm", impl="pe_gemm_v1",
+        pos_mm=None, attrs={"overhead_ns": 5.0, "shared_resource": "accel_slot"},
+    )
+    gemm = PeGemmComponent(gemm_node, ctx)
+    gemm.in_ports["src"] = simpy.Store(env)
+    gemm.start(env)
+
+    a = TensorHandle(id="t1", pa=0, shape=(4, 8), dtype="f16", nbytes=64)
+    b = TensorHandle(id="t2", pa=0, shape=(8, 4), dtype="f16", nbytes=64)
+    out = TensorHandle(id="t3", pa=0, shape=(4, 4), dtype="f16", nbytes=32)
+    cmd = GemmCmd(a=a, b=b, out=out, m=4, k=8, n=4)
+    done = env.event()
+    pe_txn = PeInternalTxn(command=cmd, done=done, pe_prefix=pe_prefix)
+
+    def submit():
+        yield gemm._inbox.put(pe_txn)
+        yield done
+
+    env.process(submit())
+    env.run()
+    assert env.now == 5.0  # overhead_ns from node attrs
+
+
+# ── 7. PE_MATH handles PeInternalTxn ────────────────────────────
+
+
+def test_pe_math_handles_pe_internal_txn():
+    """PE_MATH receives PeInternalTxn with MathCmd, acquires accel, signals done."""
+    from kernbench.components.context import ComponentContext
+    from kernbench.policy.routing.router import AddressResolver, PathRouter
+
+    graph = load_topology(TOPOLOGY_PATH)
+    env = simpy.Environment()
+    ctx = ComponentContext(
+        router=PathRouter(graph),
+        resolver=AddressResolver(graph),
+        positions={}, ns_per_mm=0.01, spec=graph.spec,
+    )
+
+    pe_prefix = "sip0.cube0.pe0"
+    math_node = Node(
+        id=f"{pe_prefix}.pe_math", kind="pe_math", impl="pe_math_v1",
+        pos_mm=None, attrs={"overhead_ns": 3.0, "shared_resource": "accel_slot"},
+    )
+    math_comp = PeMathComponent(math_node, ctx)
+    math_comp.in_ports["src"] = simpy.Store(env)
+    math_comp.start(env)
+
+    x = TensorHandle(id="t1", pa=0, shape=(4, 4), dtype="f16", nbytes=32)
+    out = TensorHandle(id="t2", pa=0, shape=(4, 4), dtype="f16", nbytes=32)
+    cmd = MathCmd(op="exp", inputs=(x,), out=out)
+    done = env.event()
+    pe_txn = PeInternalTxn(command=cmd, done=done, pe_prefix=pe_prefix)
+
+    def submit():
+        yield math_comp._inbox.put(pe_txn)
+        yield done
+
+    env.process(submit())
+    env.run()
+    assert env.now == 3.0  # overhead_ns from node attrs
+
+
+# ── 8. PE_CPU kernel execution e2e (load-only kernel) ────────────
+
+
+def test_pe_kernel_e2e_load_only():
+    """PE_CPU runs a registered load-only kernel injected directly at its inbox.
+
+    Kernel: tl.load(hbm_pa, shape=(4,4), dtype='f16')
+    Expected: simulated time has advanced (> 0 ns) once the event queue drains.
+    """
+    clear_registry()
+    hbm_pa = _hbm_pa(sip=0, cube=0, pe_id=0)
+
+    def load_kernel(tl):
+        tl.load(hbm_pa, shape=(4, 4), dtype="f16")
+
+    register_kernel("test_load_kernel", load_kernel)
+
+    engine = _engine()
+    pe_cpu_id = "sip0.cube0.pe0.pe_cpu"
+
+    # Build the launch message for the kernel registered above (no arguments)
+    launch_msg = KernelLaunchMsg(
+        correlation_id="test", request_id="load_e2e",
+        kernel_ref=KernelRef(name="test_load_kernel", kind="builtin"),
+        args=(),
+    )
+
+    # Single-node path: the Transaction is put straight into the PE_CPU inbox
+    done = engine._env.event()
+    txn = Transaction(
+        request=launch_msg,
+        path=[pe_cpu_id],
+        step=0, nbytes=0, done=done,
+    )
+
+    start_ns = engine._env.now
+
+    def inject():
+        yield engine._components[pe_cpu_id]._inbox.put(txn)
+        yield done
+
+    engine._env.process(inject())
+    engine._env.run()
+
+    total_ns = engine._env.now - start_ns
+    assert total_ns > 0, f"Kernel should take > 0ns, got {total_ns}"
+    clear_registry()
+
+
+# ── 9. PE_CPU kernel execution e2e (load + store) ────────────────
+
+
+def test_pe_kernel_e2e_load_store():
+    """PE_CPU: load→store kernel; asserts only that simulated time advanced (> 0 ns)."""
+    clear_registry()
+    hbm_pa = _hbm_pa(sip=0, cube=0, pe_id=0)
+
+    def load_store_kernel(tl):
+        a = tl.load(hbm_pa, shape=(4, 4), dtype="f16")
+        tl.store(hbm_pa + 0x10000, a)  # destination is 0x10000 above the load address
+
+    register_kernel("test_load_store", load_store_kernel)
+
+    engine = _engine()
+    pe_cpu_id = "sip0.cube0.pe0.pe_cpu"
+    launch_msg = KernelLaunchMsg(
+        correlation_id="test", request_id="ls_e2e",
+        kernel_ref=KernelRef(name="test_load_store", kind="builtin"),
+        args=(),
+    )
+
+    done = engine._env.event()
+    txn = Transaction(
+        request=launch_msg, path=[pe_cpu_id],
+        step=0, nbytes=0, done=done,
+    )
+
+    def inject():
+        yield engine._components[pe_cpu_id]._inbox.put(txn)
+        yield done
+
+    engine._env.process(inject())
+    engine._env.run()
+
+    total_ns = engine._env.now  # NOTE(review): no load-only baseline is measured in this test
+    assert total_ns > 0, f"load+store should take > 0ns, got {total_ns}"
+    clear_registry()
+
+
+# ── 10. PE_CPU kernel with overhead timing ───────────────────────
+
+
+def test_pe_cpu_overhead_timing():
+    """PeCpuOverheadCmd cycles are added to total kernel time (compared within a tolerance)."""
+    clear_registry()
+    hbm_pa = _hbm_pa(sip=0, cube=0, pe_id=0)
+    pe_cpu_id = "sip0.cube0.pe0.pe_cpu"
+
+    def no_overhead_kernel(tl):
+        tl.load(hbm_pa, shape=(4, 4), dtype="f16")
+
+    def overhead_kernel(tl):
+        tl.cycles(100)
+        tl.load(hbm_pa, shape=(4, 4), dtype="f16")
+
+    # Baseline: same load, no tl.cycles() call, on its own fresh engine
+    register_kernel("test_no_overhead", no_overhead_kernel)
+    engine1 = _engine()
+    done1 = engine1._env.event()
+    txn1 = Transaction(
+        request=KernelLaunchMsg(
+            correlation_id="t", request_id="r1",
+            kernel_ref=KernelRef(name="test_no_overhead", kind="builtin"),
+            args=(),
+        ),
+        path=[pe_cpu_id], step=0, nbytes=0, done=done1,
+    )
+
+    def inject1():
+        yield engine1._components[pe_cpu_id]._inbox.put(txn1)
+        yield done1
+
+    engine1._env.process(inject1())
+    engine1._env.run()
+    base_ns = engine1._env.now
+
+    # Second run: tl.cycles(100) precedes the load, again on a fresh engine
+    clear_registry()
+    register_kernel("test_overhead", overhead_kernel)
+    engine2 = _engine()
+    done2 = engine2._env.event()
+    txn2 = Transaction(
+        request=KernelLaunchMsg(
+            correlation_id="t", request_id="r2",
+            kernel_ref=KernelRef(name="test_overhead", kind="builtin"),
+            args=(),
+        ),
+        path=[pe_cpu_id], step=0, nbytes=0, done=done2,
+    )
+
+    def inject2():
+        yield engine2._components[pe_cpu_id]._inbox.put(txn2)
+        yield done2
+
+    engine2._env.process(inject2())
+    engine2._env.run()
+    overhead_ns = engine2._env.now
+
+    # Expect +100 (test treats 1 cycle as 1 ns); tolerance since timestamps are summed floats
+    assert abs(overhead_ns - (base_ns + 100)) < 1e-6, (
+        f"Expected {base_ns + 100}ns with overhead, got {overhead_ns}ns"
+    )
+    clear_registry()
+
+
+# ── 11. PE_CPU kernel with GEMM (dot) ────────────────────────────
+
+
+def test_pe_kernel_e2e_gemm():
+    """PE_CPU: kernel with tl.dot (GemmCmd/PE_GEMM path presumed, not asserted) ends at t > 0."""
+    clear_registry()
+    hbm_pa = _hbm_pa(sip=0, cube=0, pe_id=0)
+
+    def gemm_kernel(tl):
+        a = tl.load(hbm_pa, shape=(4, 8), dtype="f16")
+        b = tl.load(hbm_pa + 0x10000, shape=(8, 4), dtype="f16")
+        out = tl.dot(a, b)
+        tl.store(hbm_pa + 0x20000, out)  # a, b and out use three addresses 0x10000 apart
+
+    register_kernel("test_gemm", gemm_kernel)
+
+    engine = _engine()
+    pe_cpu_id = "sip0.cube0.pe0.pe_cpu"
+    launch_msg = KernelLaunchMsg(
+        correlation_id="test", request_id="gemm_e2e",
+        kernel_ref=KernelRef(name="test_gemm", kind="builtin"),
+        args=(),
+    )
+
+    done = engine._env.event()
+    txn = Transaction(
+        request=launch_msg, path=[pe_cpu_id],
+        step=0, nbytes=0, done=done,
+    )
+
+    def inject():
+        yield engine._components[pe_cpu_id]._inbox.put(txn)
+        yield done
+
+    engine._env.process(inject())
+    engine._env.run()
+
+    total_ns = engine._env.now
+    assert total_ns > 0, f"GEMM kernel should take > 0ns, got {total_ns}"
+    clear_registry()
+
+
+# ── 12. PE_CPU kernel with math ops ──────────────────────────────
+
+
+def test_pe_kernel_e2e_math():
+    """PE_CPU: kernel with tl.exp (MathCmd/PE_MATH path presumed, not asserted) ends at t > 0."""
+    clear_registry()
+    hbm_pa = _hbm_pa(sip=0, cube=0, pe_id=0)
+
+    def math_kernel(tl):
+        x = tl.load(hbm_pa, shape=(4, 4), dtype="f16")
+        y = tl.exp(x)
+        tl.store(hbm_pa + 0x10000, y)  # result goes 0x10000 above the input address
+
+    register_kernel("test_math", math_kernel)
+
+    engine = _engine()
+    pe_cpu_id = "sip0.cube0.pe0.pe_cpu"
+    launch_msg = KernelLaunchMsg(
+        correlation_id="test", request_id="math_e2e",
+        kernel_ref=KernelRef(name="test_math", kind="builtin"),
+        args=(),
+    )
+
+    done = engine._env.event()
+    txn = Transaction(
+        request=launch_msg, path=[pe_cpu_id],
+        step=0, nbytes=0, done=done,
+    )
+
+    def inject():
+        yield engine._components[pe_cpu_id]._inbox.put(txn)
+        yield done
+
+    engine._env.process(inject())
+    engine._env.run()
+
+    total_ns = engine._env.now
+    assert total_ns > 0, f"Math kernel should take > 0ns, got {total_ns}"
+    clear_registry()
+
+
+# ── 13. Deterministic: same kernel → same latency ───────────────
+
+
+def test_pe_kernel_deterministic():
+    """Two fresh engines running the same kernel on the same PE end at the identical time."""
+    clear_registry()
+    hbm_pa = _hbm_pa(sip=0, cube=0, pe_id=0)
+
+    def det_kernel(tl):
+        a = tl.load(hbm_pa, shape=(4, 4), dtype="f16")
+        tl.store(hbm_pa + 0x10000, a)
+
+    register_kernel("test_det", det_kernel)
+
+    results = []  # final env.now of each run
+    pe_cpu_id = "sip0.cube0.pe0.pe_cpu"
+    for _ in range(2):
+        engine = _engine()  # new engine per iteration, so runs share no simulation state
+        done = engine._env.event()
+        txn = Transaction(
+            request=KernelLaunchMsg(
+                correlation_id="t", request_id="det",
+                kernel_ref=KernelRef(name="test_det", kind="builtin"),
+                args=(),
+            ),
+            path=[pe_cpu_id], step=0, nbytes=0, done=done,
+        )
+
+        def inject(e=engine, d=done, t=txn):  # defaults bind this iteration's objects
+            yield e._components[pe_cpu_id]._inbox.put(t)
+            yield d
+
+        engine._env.process(inject())
+        engine._env.run()
+        results.append(engine._env.now)
+
+    assert results[0] == results[1], (
+        f"Determinism violation: {results[0]} != {results[1]}"
+    )
+    clear_registry()
+
+
+# ── 14. Stage 3: Composite GEMM pipeline with tiling ─────────────
+
+
+def test_composite_gemm_pipeline():
+    """Composite GEMM kernel (tl.load + tl.ref + tl.composite + tl.wait) runs at PE_CPU.
+
+    Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait()
+    Validates: simulated time > 0 after the run (tiling itself is not inspected here).
+    """
+    clear_registry()
+    hbm_pa_a = _hbm_pa(sip=0, cube=0, pe_id=0)
+    hbm_pa_b = _hbm_pa(sip=0, cube=0, pe_id=0)
+    out_pa = _hbm_pa(sip=0, cube=0, pe_id=0)  # NOTE(review): same args as a/b — likely aliases; confirm
+
+    def composite_gemm_kernel(tl):
+        a = tl.load(hbm_pa_a, shape=(32, 64), dtype="f16")
+        b = tl.ref(hbm_pa_b, shape=(64, 32), dtype="f16")
+        h = tl.composite(op="gemm", a=a, b=b, out_ptr=out_pa)
+        tl.wait(h)
+
+    register_kernel("test_composite_gemm", composite_gemm_kernel)
+
+    engine = _engine()
+    pe_cpu_id = "sip0.cube0.pe0.pe_cpu"
+    launch_msg = KernelLaunchMsg(
+        correlation_id="test", request_id="composite_gemm",
+        kernel_ref=KernelRef(name="test_composite_gemm", kind="builtin"),
+        args=(),
+    )
+
+    done = engine._env.event()
+    txn = Transaction(
+        request=launch_msg, path=[pe_cpu_id],
+        step=0, nbytes=0, done=done,
+    )
+
+    def inject():
+        yield engine._components[pe_cpu_id]._inbox.put(txn)
+        yield done
+
+    engine._env.process(inject())
+    engine._env.run()
+
+    total_ns = engine._env.now
+    assert total_ns > 0, f"Composite GEMM should take > 0ns, got {total_ns}"
+    clear_registry()
+
+
+# ── 15. Stage 3: Composite generates multiple tiles ───────────────
+
+
+def test_composite_gemm_multi_tile():
+    """A K=128 composite GEMM must end later than a K=64 one (assumes TILE_K=64 — confirm).
+
+    Each variant runs on its own fresh engine; only the final env.now values are compared.
+    """
+    clear_registry()
+    hbm_pa_a = _hbm_pa(sip=0, cube=0, pe_id=0)
+    hbm_pa_b = _hbm_pa(sip=0, cube=0, pe_id=0)
+    out_pa = _hbm_pa(sip=0, cube=0, pe_id=0)
+
+    # Smaller case: K=64 (presumed to fit one K-tile)
+    def single_tile_kernel(tl):
+        a = tl.load(hbm_pa_a, shape=(32, 64), dtype="f16")
+        b = tl.ref(hbm_pa_b, shape=(64, 32), dtype="f16")
+        h = tl.composite(op="gemm", a=a, b=b, out_ptr=out_pa)
+        tl.wait(h)
+
+    register_kernel("test_single_tile", single_tile_kernel)
+    engine1 = _engine()
+    pe_cpu_id = "sip0.cube0.pe0.pe_cpu"
+    done1 = engine1._env.event()
+    txn1 = Transaction(
+        request=KernelLaunchMsg(
+            correlation_id="t", request_id="st",
+            kernel_ref=KernelRef(name="test_single_tile", kind="builtin"),
+            args=(),
+        ),
+        path=[pe_cpu_id], step=0, nbytes=0, done=done1,
+    )
+
+    def inject1():
+        yield engine1._components[pe_cpu_id]._inbox.put(txn1)
+        yield done1
+
+    engine1._env.process(inject1())
+    engine1._env.run()
+    single_ns = engine1._env.now
+
+    # Larger case: K=128 (presumed to need two K-tiles); registry is reset first
+    clear_registry()
+
+    def multi_tile_kernel(tl):
+        a = tl.load(hbm_pa_a, shape=(32, 128), dtype="f16")
+        b = tl.ref(hbm_pa_b, shape=(128, 32), dtype="f16")
+        h = tl.composite(op="gemm", a=a, b=b, out_ptr=out_pa)
+        tl.wait(h)
+
+    register_kernel("test_multi_tile", multi_tile_kernel)
+    engine2 = _engine()
+    done2 = engine2._env.event()
+    txn2 = Transaction(
+        request=KernelLaunchMsg(
+            correlation_id="t", request_id="mt",
+            kernel_ref=KernelRef(name="test_multi_tile", kind="builtin"),
+            args=(),
+        ),
+        path=[pe_cpu_id], step=0, nbytes=0, done=done2,
+    )
+
+    def inject2():
+        yield engine2._components[pe_cpu_id]._inbox.put(txn2)
+        yield done2
+
+    engine2._env.process(inject2())
+    engine2._env.run()
+    multi_ns = engine2._env.now
+
+    assert multi_ns > single_ns, (
+        f"Multi-tile ({multi_ns}ns) should take longer than single-tile ({single_ns}ns)"
+    )
+    clear_registry()
+
+
+# ── 16. Stage 3: tl.ref() generates no DMA command ───────────────
+
+
+def test_tl_ref_no_dma():
+    """tl.ref() returns a handle with the given pa/shape and leaves tl.commands empty."""
+    from kernbench.triton_emu.tl_context import TLContext
+
+    tl = TLContext(pe_id=0, dispatch_cycles=0)  # standalone context; no engine involved
+    handle = tl.ref(0x1000, shape=(4, 4), dtype="f16")
+    assert handle.pa == 0x1000
+    assert handle.shape == (4, 4)
+    assert len(tl.commands) == 0, f"tl.ref should emit 0 commands, got {len(tl.commands)}"
+
+
+# ── 17. Stage 4: M_CPU kernel launch fan-out ──────────────────────
+
+
+def test_mcpu_kernel_launch_fanout():
+    """A KernelLaunchMsg submitted through engine.submit() completes ok with total_ns > 0.
+
+    Intended pipeline (not asserted here): Host → PCIE_EP → IO_CPU → M_CPU → NOC → PE_CPU
+    """
+    clear_registry()
+    hbm_pa = _hbm_pa(sip=0, cube=0, pe_id=0)
+
+    def simple_kernel(a_ptr, tl):
+        tl.load(a_ptr, shape=(4, 4), dtype="f16")
+
+    register_kernel("test_mcpu_kernel", simple_kernel)
+
+    engine = _engine()
+
+    shard = TensorArgShard(
+        sip=0, cube=0, pe=0,
+        pa=hbm_pa, nbytes=32, offset_bytes=0,
+    )
+    launch_msg = KernelLaunchMsg(
+        correlation_id="test", request_id="mcpu_launch",
+        kernel_ref=KernelRef(name="test_mcpu_kernel", kind="builtin"),
+        args=(TensorArg(shards=(shard,)),),
+        target_cubes=(0,), target_pe=0,
+    )
+
+    h = engine.submit(launch_msg)  # public engine API, unlike the direct-inbox tests
+    engine.wait(h)
+    comp, trace = engine.get_completion(h)
+
+    assert comp.ok is True
+    assert trace["total_ns"] > 0, f"Kernel launch should take > 0ns, got {trace['total_ns']}"
+    clear_registry()
+
+
+# ── 18. Stage 4: M_CPU kernel launch with composite GEMM ──────────
+
+
+def test_mcpu_kernel_launch_composite():
+    """Composite-GEMM kernel launched via engine.submit() completes ok with total_ns > 0."""
+    clear_registry()
+    hbm_pa = _hbm_pa(sip=0, cube=0, pe_id=0)
+
+    def gemm_kernel(a_ptr, tl):
+        a = tl.load(a_ptr, shape=(32, 64), dtype="f16")
+        b = tl.ref(a_ptr, shape=(64, 32), dtype="f16")
+        h = tl.composite(op="gemm", a=a, b=b, out_ptr=a_ptr)  # a, b and out share a_ptr
+        tl.wait(h)
+
+    register_kernel("test_mcpu_composite", gemm_kernel)
+
+    engine = _engine()
+    shard = TensorArgShard(
+        sip=0, cube=0, pe=0,
+        pa=hbm_pa, nbytes=32, offset_bytes=0,  # NOTE(review): 32 B vs a (32, 64) f16 load — confirm
+    )
+    launch_msg = KernelLaunchMsg(
+        correlation_id="test", request_id="mcpu_composite",
+        kernel_ref=KernelRef(name="test_mcpu_composite", kind="builtin"),
+        args=(TensorArg(shards=(shard,)),),
+        target_cubes=(0,), target_pe=0,
+    )
+
+    h = engine.submit(launch_msg)
+    engine.wait(h)
+    comp, trace = engine.get_completion(h)
+
+    assert comp.ok is True
+    assert trace["total_ns"] > 0
+    clear_registry()
+
+
+# ── 19. Stage 5: QKV GEMM benchmark completion ────────────────────
+
+
+def test_qkv_gemm_bench_completes():
+    """The qkv_gemm benchmark completes every handle and records deploy + kernel traces."""
+    clear_registry()
+    from benches.qkv_gemm import run as bench_run
+    from kernbench.runtime_api.context import RuntimeContext
+
+    graph = load_topology(TOPOLOGY_PATH)
+    engine = GraphEngine(graph)
+    ctx = RuntimeContext(
+        engine=engine,
+        target_device="sip0",
+        correlation_id="bench_test",
+        spec=graph.spec,
+    )
+    bench_run(ctx)
+    ctx.wait_all()
+
+    # Every handle the benchmark created must report ok with a positive latency
+    for h in ctx.handles():
+        comp, trace = engine.get_completion(h)
+        assert comp.ok is True
+        assert trace["total_ns"] > 0
+
+    # Trace collection: split ctx._traces by phase ("deploy"/"memory_write" vs "kernel")
+    assert len(ctx._traces) > 0
+    deploy_traces = [t for t in ctx._traces if t["phase"] in ("deploy", "memory_write")]
+    kernel_traces = [t for t in ctx._traces if t["phase"] == "kernel"]
+    assert len(deploy_traces) >= 2  # at least a, b (out is empty, no deploy)
+    assert len(kernel_traces) == 1
+    assert kernel_traces[0]["name"] == "qkv_gemm"
+    assert kernel_traces[0]["total_ns"] > 0
+    # Scalars presumably hold M, K, N; only the count (>= 3) is checked
+    assert len(kernel_traces[0]["scalars"]) >= 3
+
+    clear_registry()
+
+
+# ── 20. Stage 4: M_CPU multi-PE kernel launch fan-out ─────────────
+
+
+def test_mcpu_multi_pe_kernel_launch():
+    """Compare a target_pe=0 launch with a target_pe='all' launch of the same kernel.
+
+    Validates:
+      - Both launches complete ok with total_ns > 0
+      - Multi-PE total_ns >= single-PE total_ns (per-PE execution is not inspected)
+    """
+    clear_registry()
+    hbm_pa = _hbm_pa(sip=0, cube=0, pe_id=0)
+
+    def simple_kernel(a_ptr, tl):
+        tl.load(a_ptr, shape=(4, 4), dtype="f16")
+
+    register_kernel("test_multi_pe", simple_kernel)
+
+    # Single PE baseline
+    engine1 = _engine()
+    shard = TensorArgShard(
+        sip=0, cube=0, pe=0,
+        pa=hbm_pa, nbytes=32, offset_bytes=0,
+    )
+    h1 = engine1.submit(KernelLaunchMsg(
+        correlation_id="t", request_id="single",
+        kernel_ref=KernelRef(name="test_multi_pe", kind="builtin"),
+        args=(TensorArg(shards=(shard,)),),
+        target_cubes=(0,), target_pe=0,
+    ))
+    engine1.wait(h1)
+    comp1, trace1 = engine1.get_completion(h1)
+    single_ns = trace1["total_ns"]
+
+    # Multi PE: same single pe=0 shard, but target_pe="all", on a fresh engine
+    engine2 = _engine()
+    h2 = engine2.submit(KernelLaunchMsg(
+        correlation_id="t", request_id="multi",
+        kernel_ref=KernelRef(name="test_multi_pe", kind="builtin"),
+        args=(TensorArg(shards=(shard,)),),
+        target_cubes=(0,), target_pe="all",
+    ))
+    engine2.wait(h2)
+    comp2, trace2 = engine2.get_completion(h2)
+    multi_ns = trace2["total_ns"]
+
+    assert comp1.ok is True
+    assert comp2.ok is True
+    assert single_ns > 0
+    assert multi_ns > 0
+    assert multi_ns >= single_ns, (
+        f"Multi-PE ({multi_ns}ns) should be >= single-PE ({single_ns}ns)"
+    )
+    clear_registry()
+
+
+# ── 21. Stage 5: QKV GEMM multi-PE benchmark completion ──────────
+
+
+def test_qkv_gemm_bench_multi_pe_completes():
+    """The qkv_gemm_multi_pe benchmark completes every handle and emits one 'all'-PE kernel trace."""
+    clear_registry()
+    from benches.qkv_gemm_multi_pe import run as bench_run
+    from kernbench.runtime_api.context import RuntimeContext
+
+    graph = load_topology(TOPOLOGY_PATH)
+    engine = GraphEngine(graph)
+    ctx = RuntimeContext(
+        engine=engine,
+        target_device="sip0",
+        correlation_id="bench_multi_pe",
+        spec=graph.spec,
+    )
+    bench_run(ctx)
+    ctx.wait_all()
+
+    for h in ctx.handles():
+        comp, trace = engine.get_completion(h)
+        assert comp.ok is True
+        assert trace["total_ns"] > 0
+
+    # Multi-PE: deploy traces per PE plus a single kernel trace
+    deploy_traces = [t for t in ctx._traces if t["phase"] in ("deploy", "memory_write")]
+    kernel_traces = [t for t in ctx._traces if t["phase"] == "kernel"]
+    assert len(deploy_traces) >= 8  # NOTE(review): replicate(a)*8 + column_wise(b)*8 is 16, bound is 8 — confirm
+    assert len(kernel_traces) == 1
+    assert kernel_traces[0]["target_pe"] == "all"
+
+    clear_registry()
+
+
+def test_report_format():
+    """format_report output names the title and phases and shows the expected TFLOPS / BW."""
+    from kernbench.cli.report import format_report
+
+    rows = [
+        {"phase": "deploy", "name": "a", "pe": 0, "nbytes": 65536, "total_ns": 25.0},
+        {"phase": "deploy", "name": "b", "pe": 0, "nbytes": 65536, "total_ns": 25.0},
+        {"phase": "kernel", "name": "qkv_gemm", "target_pe": 0,
+         "scalars": [128, 256, 128], "total_ns": 100.0},
+    ]
+    text = format_report(rows, title="qkv_gemm")
+    assert "qkv_gemm" in text
+    assert "deploy" in text
+    assert "kernel" in text
+    assert "TFLOPS" in text
+    # Kernel row: 2 * 128 * 256 * 128 FLOP in 100 ns -> 83.886 TFLOPS
+    assert "83.886" in text
+    # Deploy rows: 65536 bytes in 25 ns -> 2621.4 bytes/ns (GB/s)
+    assert "2621.4" in text
+    # A utilisation column header is expected as well
+    assert "Util" in text
+
+
+# ── 22. Multi-CUBE kernel launch (ADR-0013 V4) ──────────────────
+
+
+def test_multi_cube_kernel_launch():
+    """Launch one kernel with shards in two cubes of sip0 and compare with a one-cube launch.
+
+    Validates ADR-0013 V4 (multi-CUBE within SIP):
+      - Shards in cube=0 and cube=1, each targeting pe=0
+      - Completion ok=True, total_ns > 0
+      - Multi-cube latency >= single-cube latency (presumably inter-cube UCIe cost; not isolated)
+      - Deterministic across runs
+    """
+    clear_registry()
+    hbm_pa_c0 = _hbm_pa(sip=0, cube=0, pe_id=0)
+    hbm_pa_c1 = _hbm_pa(sip=0, cube=1, pe_id=0)
+
+    def simple_kernel(a_ptr, tl):
+        tl.load(a_ptr, shape=(4, 4), dtype="f16")
+
+    register_kernel("test_multi_cube", simple_kernel)
+
+    # Single-cube baseline
+    engine1 = _engine()
+    shard_single = TensorArgShard(
+        sip=0, cube=0, pe=0,
+        pa=hbm_pa_c0, nbytes=32, offset_bytes=0,
+    )
+    h1 = engine1.submit(KernelLaunchMsg(
+        correlation_id="t", request_id="single_cube",
+        kernel_ref=KernelRef(name="test_multi_cube", kind="builtin"),
+        args=(TensorArg(shards=(shard_single,)),),
+        target_cubes=(0,), target_pe=0,
+    ))
+    engine1.wait(h1)
+    comp1, trace1 = engine1.get_completion(h1)
+    single_ns = trace1["total_ns"]
+
+    # Multi-cube: shards in cube=0 and cube=1
+    engine2 = _engine()
+    shard_c0 = TensorArgShard(
+        sip=0, cube=0, pe=0,
+        pa=hbm_pa_c0, nbytes=32, offset_bytes=0,
+    )
+    shard_c1 = TensorArgShard(
+        sip=0, cube=1, pe=0,
+        pa=hbm_pa_c1, nbytes=32, offset_bytes=0,
+    )
+    h2 = engine2.submit(KernelLaunchMsg(
+        correlation_id="t", request_id="multi_cube",
+        kernel_ref=KernelRef(name="test_multi_cube", kind="builtin"),
+        args=(TensorArg(shards=(shard_c0, shard_c1)),),
+        target_pe=0,  # target_cubes is omitted here, unlike the single-cube launch
+    ))
+    engine2.wait(h2)
+    comp2, trace2 = engine2.get_completion(h2)
+    multi_ns = trace2["total_ns"]
+
+    assert comp1.ok is True
+    assert comp2.ok is True
+    assert single_ns > 0
+    assert multi_ns > 0
+    assert multi_ns >= single_ns, (
+        f"Multi-cube ({multi_ns}ns) should be >= single-cube ({single_ns}ns)"
+    )
+
+    # Determinism check: repeat the multi-cube launch on a third fresh engine
+    engine3 = _engine()
+    h3 = engine3.submit(KernelLaunchMsg(
+        correlation_id="t", request_id="multi_cube_det",
+        kernel_ref=KernelRef(name="test_multi_cube", kind="builtin"),
+        args=(TensorArg(shards=(shard_c0, shard_c1)),),
+        target_pe=0,
+    ))
+    engine3.wait(h3)
+    _, trace3 = engine3.get_completion(h3)
+    assert trace2["total_ns"] == trace3["total_ns"], (
+        f"Determinism violation: {trace2['total_ns']} != {trace3['total_ns']}"
+    )
+    clear_registry()
+
+
+# ── 23. Multi-SIP kernel launch (ADR-0013 V4) ───────────────────
+
+
+def test_multi_sip_kernel_launch():
+    """Launch one kernel with shards in sip0 and sip1 and compare with a one-SIP launch.
+
+    Validates ADR-0013 V4 (multi-SIP tray):
+      - Shards in sip=0/cube=0/pe=0 and sip=1/cube=0/pe=0
+      - Completion ok=True, total_ns > 0
+      - Multi-SIP latency >= single-SIP latency
+      - Deterministic across runs
+    """
+    clear_registry()
+    hbm_pa_s0 = _hbm_pa(sip=0, cube=0, pe_id=0)
+    hbm_pa_s1 = _hbm_pa(sip=1, cube=0, pe_id=0)
+
+    # Kernel body is only tl.cycles(50): it ignores a_ptr, so no HBM address is dereferenced
+    # (a real multi-SIP kernel would use SIP-local PA for each PE)
+    def simple_kernel(a_ptr, tl):
+        tl.cycles(50)
+
+    register_kernel("test_multi_sip", simple_kernel)
+
+    # Single-SIP baseline
+    engine1 = _engine()
+    shard_single = TensorArgShard(
+        sip=0, cube=0, pe=0,
+        pa=hbm_pa_s0, nbytes=32, offset_bytes=0,
+    )
+    h1 = engine1.submit(KernelLaunchMsg(
+        correlation_id="t", request_id="single_sip",
+        kernel_ref=KernelRef(name="test_multi_sip", kind="builtin"),
+        args=(TensorArg(shards=(shard_single,)),),
+        target_cubes=(0,), target_pe=0,
+    ))
+    engine1.wait(h1)
+    comp1, trace1 = engine1.get_completion(h1)
+    single_ns = trace1["total_ns"]
+
+    # Multi-SIP: shards in sip=0 and sip=1
+    engine2 = _engine()
+    shard_s0 = TensorArgShard(
+        sip=0, cube=0, pe=0,
+        pa=hbm_pa_s0, nbytes=32, offset_bytes=0,
+    )
+    shard_s1 = TensorArgShard(
+        sip=1, cube=0, pe=0,
+        pa=hbm_pa_s1, nbytes=32, offset_bytes=0,
+    )
+    h2 = engine2.submit(KernelLaunchMsg(
+        correlation_id="t", request_id="multi_sip",
+        kernel_ref=KernelRef(name="test_multi_sip", kind="builtin"),
+        args=(TensorArg(shards=(shard_s0, shard_s1)),),
+        target_pe=0,  # target_cubes is omitted here, unlike the single-SIP launch
+    ))
+    engine2.wait(h2)
+    comp2, trace2 = engine2.get_completion(h2)
+    multi_ns = trace2["total_ns"]
+
+    assert comp1.ok is True
+    assert comp2.ok is True
+    assert single_ns > 0
+    assert multi_ns > 0
+    assert multi_ns >= single_ns, (
+        f"Multi-SIP ({multi_ns}ns) should be >= single-SIP ({single_ns}ns)"
+    )
+
+    # Determinism check: repeat the multi-SIP launch on a third fresh engine
+    engine3 = _engine()
+    h3 = engine3.submit(KernelLaunchMsg(
+        correlation_id="t", request_id="multi_sip_det",
+        kernel_ref=KernelRef(name="test_multi_sip", kind="builtin"),
+        args=(TensorArg(shards=(shard_s0, shard_s1)),),
+        target_pe=0,
+    ))
+    engine3.wait(h3)
+    _, trace3 = engine3.get_completion(h3)
+    assert trace2["total_ns"] == trace3["total_ns"], (
+        f"Determinism violation: {trace2['total_ns']} != {trace3['total_ns']}"
+    )
+    clear_registry()
diff --git a/tests/test_phase_a_components.py b/tests/test_phase_a_components.py
new file mode 100644
index 0000000..4360832
--- /dev/null
+++ b/tests/test_phase_a_components.py
@@ -0,0 +1,269 @@
+"""Phase A component infrastructure tests (ADR-0015).
+
+Verifies:
+  - TransitComponent, IoCpuComponent apply overhead_ns via run()
+  - HbmCtrlComponent and SramComponent act as terminal nodes (succeed done)
+  - MCpuComponent forwards when not terminal; completes when terminal + no ctx
+  - ComponentRegistry resolves impl strings to correct concrete classes
+  - GraphEngine passes ComponentContext to every component
+  - ComponentContext.router and .resolver are correctly populated
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+import pytest
+import simpy
+
+from kernbench.components.base import ComponentBase, ComponentRegistry
+from kernbench.components.context import ComponentContext
+from kernbench.components.impls import (
+    HbmCtrlComponent,
+    IoCpuComponent,
+    MCpuComponent,
+    PcieEpComponent,
+    SramComponent,
+    TransitComponent,
+)
+from kernbench.sim_engine.engine import GraphEngine
+from kernbench.sim_engine.transaction import Transaction
+from kernbench.topology.builder import load_topology
+from kernbench.topology.types import Node
+
+TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"  # one directory above tests/
+
+
+def _node(impl: str, attrs: dict | None = None) -> Node:  # fixed-id Node stub for unit tests
+    return Node(pos_mm=None, attrs=attrs if attrs else {}, impl=impl, kind="test", id="test.node")
+
+
+def _run_worker(comp: ComponentBase, env: simpy.Environment, txn: Transaction) -> None:
+    """Feed *txn* to *comp* through one "src" in-port and run *env* until ``txn.done``."""
+    inbox = comp.in_ports["src"] = simpy.Store(env)  # the only inbound port
+    comp.start(env)
+    producer = _inject(inbox, txn)
+    env.process(producer)
+    env.run(until=txn.done)
+
+
+def _inject(store: simpy.Store, txn: Transaction):  # generator; callers wrap it in env.process()
+    yield store.put(txn)  # single put, then the generator ends
+
+
+# ── 1. run() latency: TransitComponent ───────────────────────────────
+
+
+def test_transit_component_run_overhead_ns():
+    """Running a TransitComponent advances simulated time by its overhead_ns (7.5)."""
+    sim = simpy.Environment()
+    transit = TransitComponent(_node("forwarding_v1", {"overhead_ns": 7.5}))
+    expected_ns = 7.5
+
+    def drive():
+        yield from transit.run(sim, nbytes=1024)
+
+    sim.process(drive())
+    sim.run()
+    assert sim.now == pytest.approx(expected_ns)
+
+
+def test_transit_component_run_zero_overhead_ns():
+    """A zero-overhead TransitComponent.run() finishes without advancing the clock."""
+    sim = simpy.Environment()
+    transit = TransitComponent(_node("noc_v1", {"overhead_ns": 0.0}))
+    # The flag list shows the generator ran to its end, not merely that time stayed at 0.
+    finished: list[bool] = []
+
+    def drive():
+        yield from transit.run(sim, nbytes=512)
+        finished.append(True)
+
+    sim.process(drive())
+    sim.run()
+    assert finished == [True]
+    assert sim.now == pytest.approx(0.0)
+
+
+# ── 2. run() latency: IoCpuComponent ────────────────────────────────
+
+
+def test_io_cpu_component_run_overhead_ns():
+    """Running an IoCpuComponent advances simulated time by its overhead_ns (10.0)."""
+    sim = simpy.Environment()
+    io_cpu = IoCpuComponent(_node("io_cpu_v1", {"overhead_ns": 10.0}))
+    expected_ns = 10.0
+
+    def drive():
+        yield from io_cpu.run(sim, nbytes=2048)
+
+    sim.process(drive())
+    sim.run()
+    assert sim.now == pytest.approx(expected_ns)
+
+
+# ── 3. Terminal: HbmCtrlComponent succeeds done ──────────────────────
+
+
+def test_hbm_ctrl_terminal_succeeds_done():
+    """As the last hop on the path, HbmCtrlComponent triggers the transaction's done event."""
+    sim = simpy.Environment()
+    hbm = HbmCtrlComponent(_node("hbm_ctrl_v1", {"overhead_ns": 0.0, "capacity": 1}))
+    completion = sim.event()
+    txn = Transaction(request=None, path=["test.node"], step=0, nbytes=256, done=completion)
+
+    # Single-node path at step 0: this component is the terminal hop.
+    _run_worker(hbm, sim, txn)
+
+    assert completion.triggered
+
+
+def test_hbm_ctrl_resource_serializes_requests():
+    """With capacity=1 and overhead_ns=5, two queued requests take 10 ns in total."""
+    sim = simpy.Environment()
+    hbm = HbmCtrlComponent(_node("hbm_ctrl_v1", {"overhead_ns": 5.0, "capacity": 1}))
+    inbox: simpy.Store = simpy.Store(sim)
+    hbm.in_ports["src"] = inbox
+    hbm.start(sim)
+    first_done, second_done = sim.event(), sim.event()
+
+    def _txn(done):
+        return Transaction(request=None, path=["test.node"], step=0, nbytes=0, done=done)
+
+    first, second = _txn(first_done), _txn(second_done)
+
+    def feed():
+        for txn in (first, second):
+            yield inbox.put(txn)
+
+    sim.process(feed())
+    sim.run(until=second_done)
+
+    # Serialized service: the second request ends at 2 x overhead_ns, not at overhead_ns.
+    assert first_done.triggered
+    assert second_done.triggered
+    assert sim.now == pytest.approx(10.0)
+
+
+# ── 4. Terminal: SramComponent succeeds done ─────────────────────────
+
+
+def test_sram_terminal_succeeds_done():
+    """As the last hop, SramComponent triggers done after its 2.0 ns overhead."""
+    sim = simpy.Environment()
+    sram = SramComponent(_node("sram_v1", {"overhead_ns": 2.0}))
+    completion = sim.event()
+    # Single-node path at step 0: this component is the terminal hop.
+    txn = Transaction(request=None, path=["test.node"], step=0, nbytes=512, done=completion)
+
+    _run_worker(sram, sim, txn)
+
+    assert completion.triggered
+    assert sim.now == pytest.approx(2.0)
+
+
+# ── 5. MCpuComponent: forward when not terminal ──────────────────────
+
+
+def test_m_cpu_forwards_when_not_terminal():
+    """A mid-path MCpuComponent hands the Transaction to its "next" out-port after 5 ns."""
+    sim = simpy.Environment()
+    mcpu = MCpuComponent(_node("m_cpu_v1", {"overhead_ns": 5.0}))
+
+    # Ports for the middle node of the path src -> test.node -> next.
+    inbox: simpy.Store = simpy.Store(sim)
+    outbox: simpy.Store = simpy.Store(sim)
+    mcpu.in_ports["src"] = inbox
+    mcpu.out_ports["next"] = outbox
+    mcpu.start(sim)
+
+    completion = sim.event()
+    txn = Transaction(
+        request=None,
+        path=["src", "test.node", "next"],
+        step=1,  # index of test.node in the path, so the next hop is "next"
+        nbytes=128,
+        done=completion,
+    )
+    received: list[Any] = []
+
+    def downstream():
+        # Stand-in for the next hop: record the message, then complete it.
+        arrived = yield outbox.get()
+        received.append(arrived)
+        arrived.done.succeed()
+
+    def feed():
+        yield inbox.put(txn)
+
+    # The receiver process is registered before the producer process.
+    sim.process(downstream())
+    sim.process(feed())
+    sim.run(until=completion)
+
+    # Expected: exactly one forwarded message, step advanced to 2, 5 ns elapsed.
+    assert len(received) == 1
+    assert received[0].step == 2
+    assert sim.now == pytest.approx(5.0)
+
+
+# ── 6. MCpuComponent: terminal with no ctx just completes ────────────
+
+
+def test_m_cpu_terminal_no_ctx_completes():
+    """With ctx=None and a single-node path, MCpuComponent triggers the done event."""
+    sim = simpy.Environment()
+    mcpu = MCpuComponent(_node("m_cpu_v1", {"overhead_ns": 0.0}), ctx=None)
+    completion = sim.event()
+    txn = Transaction(request=None, path=["test.node"], step=0, nbytes=64, done=completion)
+
+    # Single-node path at step 0: this component is the terminal hop.
+    _run_worker(mcpu, sim, txn)
+
+    assert completion.triggered
+
+
+# ── 7. ComponentRegistry resolves impl strings ───────────────────────
+
+
+@pytest.mark.parametrize("impl,expected_cls", [
+    ("forwarding_v1", TransitComponent),
+    ("noc_v1", TransitComponent),
+    ("ucie_v1", TransitComponent),
+    ("xbar_v1", TransitComponent),
+    ("pcie_ep_v1", PcieEpComponent),
+    ("io_cpu_v1", IoCpuComponent),
+    ("m_cpu_v1", MCpuComponent),
+    ("hbm_ctrl_v1", HbmCtrlComponent),
+    ("sram_v1", SramComponent),
+])
+def test_registry_resolves_impl(impl, expected_cls):
+    """Each impl string maps, via ComponentRegistry.create(), to the expected class."""
+    created = ComponentRegistry.create(_node(impl, {"overhead_ns": 0.0}))
+    # isinstance (not type equality), so a subclass of the expected class also passes.
+    assert isinstance(created, expected_cls)
+
+
+# ── 8. GraphEngine passes ComponentContext to components ─────────────
+
+
+def test_engine_passes_ctx_to_components():
+    """Every component built by GraphEngine carries a ComponentContext (never None)."""
+    graph = load_topology(TOPOLOGY_PATH)
+    engine = GraphEngine(graph)
+    for node_id, comp in engine._components.items():  # node id -> component instance
+        assert comp.ctx is not None, f"{node_id}: ctx is None"
+        assert isinstance(comp.ctx, ComponentContext), f"{node_id}: ctx wrong type"
+
+
+def test_engine_ctx_router_and_resolver_populated():
+    """A component's ctx exposes a PathRouter as .router and an AddressResolver as .resolver."""
+    from kernbench.policy.routing.router import AddressResolver, PathRouter
+
+    engine = GraphEngine(load_topology(TOPOLOGY_PATH))
+    # Spot check: only the first component in engine._components is inspected.
+    sample = next(iter(engine._components.values()))
+    sample_ctx = sample.ctx
+    assert isinstance(sample_ctx.router, PathRouter)
+    assert isinstance(sample_ctx.resolver, AddressResolver)
diff --git a/tests/test_phyaddr.py b/tests/test_phyaddr.py
new file mode 100644
index 0000000..2cbb252
--- /dev/null
+++ b/tests/test_phyaddr.py
@@ -0,0 +1,268 @@
+import pytest
+
+from kernbench.policy.address.allocator import AddressConfig, AllocationError, PEMemAllocator
+from kernbench.policy.address.phyaddr import PhysAddr, PhysAddrError, UnitType
+
+_MB = 1 << 20
+_GB = 1 << 30
+
+# Topology-matching config: 48GB HBM / 8 slices / 16MB TCM / 4MB reserved / 32MB SRAM
+_CFG = AddressConfig(
+    sip_count=2,
+    cubes_per_sip=16,
+    pes_per_cube=8,
+    hbm_bytes_per_cube=48 * _GB,
+    hbm_slices_per_cube=8,
+    tcm_bytes_per_pe=16 * _MB,
+    tcm_scheduler_reserved_bytes=4 * _MB,
+    sram_bytes_per_cube=32 * _MB,
+)
+
+
+# ── Immutability & value semantics ──────────────────────────────────
+
+
+def test_physaddr_immutable():
+    """PhysAddr rejects attribute assignment and has value equality and hashing."""
+    pa = PhysAddr.hbm_addr(rack_id=0, sip_id=0, cube_id=0, hbm_offset=0)
+    with pytest.raises(AttributeError):
+        pa.rack_id = 1  # type: ignore[misc]
+    pa2 = PhysAddr.hbm_addr(rack_id=0, sip_id=0, cube_id=0, hbm_offset=0)
+    assert pa == pa2  # comparable by value
+    # hashable: building the set raises TypeError otherwise; equal values collapse
+    assert len({pa, pa2}) == 1
+
+
+# ── HBM encode/decode roundtrip ────────────────────────────────────
+
+
+def test_hbm_encode_decode_roundtrip():
+    pa = PhysAddr.hbm_addr(rack_id=2, sip_id=3, cube_id=5, hbm_offset=0x1000)
+    raw = pa.encode()
+    dec = PhysAddr.decode(raw)
+    assert dec.rack_id == 2
+    assert dec.sip_id == 3
+    assert dec.cube_id == 5
+    assert dec.kind == "hbm"
+    assert dec.hbm_offset == 0x1000
+
+
+# ── PE resource encode/decode roundtrip ─────────────────────────────
+
+
+def test_pe_resource_encode_decode_roundtrip():
+    """A hand-packed PE-resource local_offset must decode back into its fields."""
+    unit_type, pe_id, ext, sub_offset = UnitType.PE, 3, 1, 0xFF
+    # manually build local_offset matching bit layout:
+    #   unit_type << 34 | pe_id << 30 | ext << 29 | sub_offset
+    local_offset = (unit_type << 34) | (pe_id << 30) | (ext << 29) | sub_offset
+    pa = PhysAddr(
+        rack_id=1, sip_id=2, sip_seg=7, local_offset=local_offset,
+        kind="pe_resource", cube_id=7,
+        unit_type=unit_type, pe_id=pe_id, ext=ext, sub_offset=sub_offset,
+    )
+
+    raw = pa.encode()
+    dec = PhysAddr.decode(raw)
+    # Every packed field must survive the encode -> decode roundtrip.
+    assert dec.kind == "pe_resource"
+    assert dec.unit_type == unit_type
+    assert dec.pe_id == pe_id
+    assert dec.ext == ext
+    assert dec.sub_offset == sub_offset
+
+
+# ── pe_hbm_addr factory ────────────────────────────────────────────
+
+
+def test_pe_hbm_addr_factory():
+    SLICE = 6 * (1 << 30)  # 6 GB per PE slice
+    pa = PhysAddr.pe_hbm_addr(
+        rack_id=0, sip_id=0, cube_id=0,
+        pe_id=2, pe_local_hbm_offset=1024, slice_size_bytes=SLICE,
+    )
+    assert pa.kind == "hbm"
+    assert pa.cube_id == 0
+    assert pa.hbm_offset == 2 * SLICE + 1024
+
+
+def test_pe_hbm_addr_overflow():
+    SLICE = 6 * (1 << 30)
+    with pytest.raises(PhysAddrError, match="pe_local_hbm_offset"):
+        PhysAddr.pe_hbm_addr(
+            rack_id=0, sip_id=0, cube_id=0,
+            pe_id=0, pe_local_hbm_offset=SLICE, slice_size_bytes=SLICE,
+        )
+
+
+# ── Invalid unit_type decode (fix #1) ──────────────────────────────
+
+
+def test_invalid_unit_type_raises():
+    # Craft a PE-resource address with unit_type=7 (invalid)
+    local_offset = (7 << 34) | (0 << 30) | 0
+    pa_raw = PhysAddr(
+        rack_id=0, sip_id=0, sip_seg=0, local_offset=local_offset,
+    )
+    raw = pa_raw.encode()
+    with pytest.raises(PhysAddrError, match="unit_type"):
+        PhysAddr.decode(raw)
+
+
+# ── hbm_pe_id utility (fix #3) ─────────────────────────────────────
+
+
+def test_hbm_pe_id_utility():
+    SLICE = 6 * (1 << 30)  # 6 GB
+    pa = PhysAddr.pe_hbm_addr(
+        rack_id=0, sip_id=0, cube_id=0,
+        pe_id=5, pe_local_hbm_offset=256, slice_size_bytes=SLICE,
+    )
+    assert PhysAddr.hbm_pe_id(pa.hbm_offset, SLICE) == 5
+
+
+# ── UnitType.SRAM exists (fix #5) ──────────────────────────────────
+
+
+def test_sram_unit_type_exists():
+    assert UnitType.SRAM == 2
+
+
+# ── cube_sram_addr factory + roundtrip ──────────────────────────────
+
+
+def test_cube_sram_addr_roundtrip():
+    pa = PhysAddr.cube_sram_addr(
+        rack_id=0, sip_id=1, cube_id=3, sram_offset=0x800,
+    )
+    assert pa.kind == "pe_resource"
+    assert pa.unit_type == UnitType.SRAM
+    assert pa.cube_id == 3
+    assert pa.sub_offset == 0x800
+    # encode → decode roundtrip
+    dec = PhysAddr.decode(pa.encode())
+    assert dec.unit_type == UnitType.SRAM
+    assert dec.cube_id == 3
+    assert dec.sub_offset == 0x800
+
+
+def test_cube_sram_addr_range_check():
+    with pytest.raises(PhysAddrError):
+        PhysAddr.cube_sram_addr(
+            rack_id=0, sip_id=0, cube_id=0,
+            sram_offset=(1 << 29),  # exceeds 29-bit sub_offset
+        )
+
+
+# ── pe_tcm_addr factory + roundtrip ────────────────────────────────
+
+
+def test_pe_tcm_addr_roundtrip():
+    pa = PhysAddr.pe_tcm_addr(
+        rack_id=0, sip_id=0, cube_id=2, pe_id=7, tcm_offset=0x400,
+    )
+    assert pa.kind == "pe_resource"
+    assert pa.unit_type == UnitType.PE
+    assert pa.pe_id == 7
+    assert pa.cube_id == 2
+    assert pa.sub_offset == 0x400
+    # encode → decode roundtrip
+    dec = PhysAddr.decode(pa.encode())
+    assert dec.unit_type == UnitType.PE
+    assert dec.pe_id == 7
+    assert dec.sub_offset == 0x400
+
+
+def test_pe_tcm_addr_range_check():
+    with pytest.raises(PhysAddrError):
+        PhysAddr.pe_tcm_addr(
+            rack_id=0, sip_id=0, cube_id=0, pe_id=0,
+            tcm_offset=(1 << 29),  # exceeds 29-bit sub_offset
+        )
+
+
+# ── AddressConfig ───────────────────────────────────────────────────
+
+
+def test_address_config_derived_sizes():
+    assert _CFG.hbm_slice_bytes == 6 * _GB
+    assert _CFG.tcm_allocatable_bytes == 12 * _MB
+
+
+# ── PEMemAllocator: HBM ────────────────────────────────────────────
+
+
+def _make_alloc(pe_id: int = 0) -> PEMemAllocator:
+    return PEMemAllocator(rack_id=0, sip_id=0, cube_id=0, pe_id=pe_id, cfg=_CFG)
+
+
+def test_allocator_hbm_basic():
+    a = _make_alloc(pe_id=3)
+    pa = a.alloc_hbm(4096)
+    assert pa.kind == "hbm"
+    assert pa.sip_id == 0
+    assert pa.cube_id == 0
+    # hbm_offset should be pe3's slice start
+    assert pa.hbm_offset == 3 * 6 * _GB
+
+
+def test_allocator_hbm_sequential():
+    a = _make_alloc()
+    pa1 = a.alloc_hbm(1024)
+    pa2 = a.alloc_hbm(2048)
+    assert pa1.hbm_offset == 0  # pe0 slice start + 0
+    assert pa2.hbm_offset == 1024  # pe0 slice start + 1024
+
+
+def test_allocator_hbm_overflow():
+    a = _make_alloc()
+    a.alloc_hbm(6 * _GB - 256)
+    with pytest.raises(AllocationError, match="HBM"):
+        a.alloc_hbm(512)
+
+
+# ── PEMemAllocator: TCM ────────────────────────────────────────────
+
+
+def test_allocator_tcm_basic():
+    a = _make_alloc(pe_id=5)
+    pa = a.alloc_tcm(256)
+    assert pa.kind == "pe_resource"
+    assert pa.unit_type == UnitType.PE
+    assert pa.pe_id == 5
+    assert pa.sub_offset == 0
+
+
+def test_allocator_tcm_respects_reserved():
+    a = _make_alloc()
+    # allocatable = 12 MB, should succeed
+    a.alloc_tcm(12 * _MB)
+    assert a.tcm_used == 12 * _MB
+    assert a.tcm_total == 12 * _MB
+
+
+def test_allocator_tcm_overflow():
+    a = _make_alloc()
+    a.alloc_tcm(12 * _MB)
+    with pytest.raises(AllocationError, match="TCM"):
+        a.alloc_tcm(1)
+
+
+# ── PEMemAllocator: stats & determinism ─────────────────────────────
+
+
+def test_allocator_stats():
+    a = _make_alloc()
+    a.alloc_hbm(1000)
+    a.alloc_tcm(500)
+    assert a.hbm_used == 1000
+    assert a.hbm_total == 6 * _GB
+    assert a.tcm_used == 500
+    assert a.tcm_total == 12 * _MB
+
+
+def test_allocator_deterministic():
+    a1 = _make_alloc(pe_id=2)
+    a2 = _make_alloc(pe_id=2)
+    assert a1.alloc_hbm(4096) == a2.alloc_hbm(4096)
+    assert a1.alloc_tcm(256) == a2.alloc_tcm(256)
diff --git a/tests/test_probe.py b/tests/test_probe.py
new file mode 100644
index 0000000..eb70e4b
--- /dev/null
+++ b/tests/test_probe.py
@@ -0,0 +1,221 @@
+"""Tests for H2D writes and PE DMA probe latency invariants.
+
+H2D tests use MemoryWriteMsg (pcie_ep → io_cpu → m_cpu → hbm_ctrl → response).
+PE DMA tests use PeDmaMsg (direct pe_dma → xbar → hbm_ctrl injection).
+"""
+from pathlib import Path
+
+from kernbench.policy.address.phyaddr import PhysAddr
+from kernbench.policy.routing.router import AddressResolver, PathRouter
+from kernbench.runtime_api.kernel import MemoryWriteMsg, PeDmaMsg
+from kernbench.sim_engine.engine import GraphEngine
+from kernbench.topology.builder import load_topology
+
+TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
+
+
+def _engine():
+    return GraphEngine(load_topology(TOPOLOGY_PATH))
+
+
+def _hbm_pa(sip: int = 0, cube: int = 0, pe_id: int = 0) -> int:
+    slice_bytes = 48 * (1 << 30) // 8
+    pa = PhysAddr.pe_hbm_addr(
+        rack_id=0, sip_id=sip, cube_id=cube, pe_id=pe_id,
+        pe_local_hbm_offset=0x1000, slice_size_bytes=slice_bytes,
+    )
+    return pa.encode()
+
+
+def _h2d_latency(dst_cube: int, dst_pe: int = 0) -> float:
+    engine = _engine()
+    msg = MemoryWriteMsg(
+        correlation_id="probe", request_id=f"h2d-c{dst_cube}-p{dst_pe}",
+        dst_sip=0, dst_cube=dst_cube, dst_pe=dst_pe,
+        dst_pa=_hbm_pa(sip=0, cube=dst_cube, pe_id=dst_pe), nbytes=4096,
+        pattern="zero", target_pe=dst_pe,
+    )
+    h = engine.submit(msg)
+    engine.wait(h)
+    _, trace = engine.get_completion(h)
+    return trace["total_ns"]
+
+
+# ── 1. Single-PE write completes ──────────────────────────────────
+
+
+def test_single_pe_write_completes():
+    """MemoryWriteMsg(target_pe=0) must complete with ok=True, latency > 0."""
+    engine = _engine()
+    msg = MemoryWriteMsg(
+        correlation_id="probe", request_id="pe-local",
+        dst_sip=0, dst_cube=0, dst_pe=0,
+        dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096,
+        pattern="zero", target_pe=0,
+    )
+    h = engine.submit(msg)
+    engine.wait(h)
+    comp, trace = engine.get_completion(h)
+    assert comp.ok is True
+    assert trace["total_ns"] > 0
+
+
+# ── 2. Cross-cube write positive latency ─────────────────────────
+
+
+def test_cross_cube_write_positive():
+    """Cross-cube MemoryWriteMsg(target_pe=0) must complete with latency > 0."""
+    lat = _h2d_latency(dst_cube=1, dst_pe=0)
+    assert lat > 0
+
+
+# ── 3. H2D latency monotonicity ──────────────────────────────────
+
+
+def test_h2d_latency_monotonic():
+    """H2D latency to cubes 0, 4, 8, 12 must strictly increase (1hop < ... < 4hop)."""
+    cubes = [0, 4, 8, 12]
+    latencies: list[tuple[int, float]] = [
+        (cube, _h2d_latency(dst_cube=cube, dst_pe=0)) for cube in cubes
+    ]
+
+    # Compare every probe with its successor, in the order the cubes were probed.
+    for (near_cube, near_lat), (far_cube, far_lat) in zip(latencies, latencies[1:]):
+        assert near_lat < far_lat, (
+            f"cube{near_cube}({near_lat:.2f}) "
+            f"must < cube{far_cube}({far_lat:.2f})"
+        )
+
+
+# ── 4. Single-PE write deterministic ─────────────────────────────
+
+
+def test_single_pe_write_deterministic():
+    """Same MemoryWriteMsg on two engines must produce identical latency."""
+    msg = MemoryWriteMsg(
+        correlation_id="probe", request_id="det",
+        dst_sip=0, dst_cube=0, dst_pe=0,
+        dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096,
+        pattern="zero", target_pe=0,
+    )
+    e1 = _engine()
+    h1 = e1.submit(msg)
+    e1.wait(h1)
+    _, t1 = e1.get_completion(h1)
+
+    e2 = _engine()
+    h2 = e2.submit(msg)
+    e2.wait(h2)
+    _, t2 = e2.get_completion(h2)
+
+    assert t1["total_ns"] == t2["total_ns"]
+
+
+# ── 5. Cut-through (wormhole) wire model invariants ──────────────
+
+
+def test_h2d_local_cube_cut_through():
+    """H2D to local cube with cut-through should be < 65ns for 4096B.
+
+    Full command path: pcie_ep → io_cpu → ucie → noc → m_cpu
+    DMA: m_cpu → noc → xbar → hbm_ctrl (drain once at terminal)
+    Plus response path back.
+    With store-and-forward each hop would serialize; cut-through keeps it low.
+    """
+    lat = _h2d_latency(dst_cube=0, dst_pe=0)
+    assert lat < 65.0, f"Local H2D {lat:.2f}ns; cut-through expects < 65ns"
+
+
+def test_h2d_remote_cube_cut_through():
+    """H2D to 1-hop remote cube: cut-through drain dominates, not per-hop serialization.
+
+    With store-and-forward, each hop would serialize 4096B, total >> 100ns.
+    With cut-through, drain happens once at bottleneck.
+    """
+    lat = _h2d_latency(dst_cube=4, dst_pe=0)
+    assert lat < 80.0, f"Remote H2D {lat:.2f}ns; cut-through expects < 80ns"
+
+
+# ── 6. PE DMA: direct injection tests ─────────────────────────
+
+
+def _graph():
+    return load_topology(TOPOLOGY_PATH)
+
+
+def _pe_dma_latency(src_cube: int, src_pe: int, dst_pe: int) -> float:
+    engine = _engine()
+    msg = PeDmaMsg(
+        correlation_id="probe", request_id=f"dma-c{src_cube}-p{src_pe}-s{dst_pe}",
+        src_sip=0, src_cube=src_cube, src_pe=src_pe,
+        dst_pa=_hbm_pa(sip=0, cube=src_cube, pe_id=dst_pe), nbytes=4096,
+    )
+    h = engine.submit(msg)
+    engine.wait(h)
+    _, trace = engine.get_completion(h)
+    return trace["total_ns"]
+
+
+def _pe_dma_bottleneck(src_cube: int, src_pe: int, dst_pe: int) -> float | None:
+    """Minimum edge bw_gbs along the routed PE-to-HBM path (None if none set)."""
+    graph = _graph()
+    edges_by_endpoints = {(e.src, e.dst): e for e in graph.edges}
+    resolver, router = AddressResolver(graph), PathRouter(graph)
+    # Resolve the encoded HBM address to the destination node, then route to it.
+    encoded = _hbm_pa(sip=0, cube=src_cube, pe_id=dst_pe)
+    target = resolver.resolve(PhysAddr.decode(encoded))
+    route = router.find_path(f"sip0.cube{src_cube}.pe{src_pe}", target)
+
+    # Walk consecutive hops; skip pairs with no edge or a falsy bandwidth.
+    hops = (edges_by_endpoints.get(pair) for pair in zip(route, route[1:]))
+    bandwidths: list[float] = [
+        edge.bw_gbs for edge in hops if edge and edge.bw_gbs
+    ]
+    return min(bandwidths) if bandwidths else None
+
+
+def test_pe_dma_local_completes():
+    """PeDmaMsg to local slice0 must complete with ok=True, latency > 0."""
+    engine = _engine()
+    msg = PeDmaMsg(
+        correlation_id="probe", request_id="dma-local",
+        src_sip=0, src_cube=0, src_pe=0,
+        dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096,
+    )
+    h = engine.submit(msg)
+    engine.wait(h)
+    comp, trace = engine.get_completion(h)
+    assert comp.ok is True
+    assert trace["total_ns"] > 0
+
+
+def test_pe_dma_local_bottleneck_256():
+    """PE DMA pe0→slice0 (local): bottleneck = 256 GB/s (direct xbar→hbm)."""
+    bn = _pe_dma_bottleneck(src_cube=0, src_pe=0, dst_pe=0)
+    assert bn == 256.0, f"Local PE DMA bottleneck {bn}, expected 256.0"
+
+
+def test_pe_dma_chain_bottleneck_128():
+    """PE DMA pe0→slice1 (xbar chain): bottleneck = 128 GB/s."""
+    bn = _pe_dma_bottleneck(src_cube=0, src_pe=0, dst_pe=1)
+    assert bn == 128.0, f"Chain PE DMA bottleneck {bn}, expected 128.0"
+
+
+def test_pe_dma_deterministic():
+    """Same PeDmaMsg on two engines must produce identical latency."""
+    msg = PeDmaMsg(
+        correlation_id="probe", request_id="det",
+        src_sip=0, src_cube=0, src_pe=0,
+        dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096,
+    )
+    e1 = _engine()
+    h1 = e1.submit(msg)
+    e1.wait(h1)
+    _, t1 = e1.get_completion(h1)
+
+    e2 = _engine()
+    h2 = e2.submit(msg)
+    e2.wait(h2)
+    _, t2 = e2.get_completion(h2)
+
+    assert t1["total_ns"] == t2["total_ns"]
diff --git a/tests/test_routing.py b/tests/test_routing.py
new file mode 100644
index 0000000..b44c147
--- /dev/null
+++ b/tests/test_routing.py
@@ -0,0 +1,226 @@
+import pytest
+
+from pathlib import Path
+
+from kernbench.policy.address.phyaddr import PhysAddr, UnitType
+from kernbench.policy.routing.router import AddressResolver, PathRouter, RoutingError
+from kernbench.topology.builder import load_topology
+
+TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
+
+
+def _graph():
+    return load_topology(TOPOLOGY_PATH)
+
+
+# ── AddressResolver ──────────────────────────────────────────────────
+
+
+def test_resolve_hbm_addr():
+    """HBM address -> sip{S}.cube{C}.hbm_ctrl.slice{P}"""
+    g = _graph()
+    resolver = AddressResolver(g)
+    # hbm_offset=0x1000, slice_size=6GB -> slice 0
+    pa = PhysAddr.hbm_addr(rack_id=0, sip_id=0, cube_id=3, hbm_offset=0x1000)
+    assert resolver.resolve(pa) == "sip0.cube3.hbm_ctrl.slice0"
+
+
+def test_resolve_hbm_addr_slice4():
+    """HBM address in PE4's slice range -> slice4."""
+    g = _graph()
+    resolver = AddressResolver(g)
+    # slice_size = 6GB; PE4 offset starts at 4*6GB = 24GB = 0x600000000
+    pa = PhysAddr.hbm_addr(rack_id=0, sip_id=0, cube_id=0, hbm_offset=0x600000000)
+    assert resolver.resolve(pa) == "sip0.cube0.hbm_ctrl.slice4"
+
+
+def test_resolve_pe_tcm_addr():
+    """PE TCM address → sip{S}.cube{C}.pe{P}.pe_tcm"""
+    g = _graph()
+    resolver = AddressResolver(g)
+    pa = PhysAddr.pe_tcm_addr(rack_id=0, sip_id=1, cube_id=5, pe_id=7, tcm_offset=0x400)
+    assert resolver.resolve(pa) == "sip1.cube5.pe7.pe_tcm"
+
+
+def test_resolve_sram_addr():
+    """SRAM address → sip{S}.cube{C}.sram"""
+    g = _graph()
+    resolver = AddressResolver(g)
+    pa = PhysAddr.cube_sram_addr(rack_id=0, sip_id=0, cube_id=10, sram_offset=0x800)
+    assert resolver.resolve(pa) == "sip0.cube10.sram"
+
+
+def test_resolve_mcpu_addr():
+    """MCPU pe_resource address → sip{S}.cube{C}.m_cpu"""
+    g = _graph()
+    resolver = AddressResolver(g)
+    pa = PhysAddr(
+        rack_id=0, sip_id=0, sip_seg=2, local_offset=(UnitType.MCPU << 34),
+        kind="pe_resource", cube_id=2, unit_type=UnitType.MCPU,
+    )
+    assert resolver.resolve(pa) == "sip0.cube2.m_cpu"
+
+
+def test_resolve_nonexistent_node():
+    """Address pointing to a node outside the compiled topology raises RoutingError."""
+    g = _graph()
+    resolver = AddressResolver(g)
+    # sip_id=15 doesn't exist in the 2-SIP topology
+    pa = PhysAddr.hbm_addr(rack_id=0, sip_id=15, cube_id=0, hbm_offset=0)
+    with pytest.raises(RoutingError):
+        resolver.resolve(pa)
+
+
+# ── PathRouter: local HBM (same xbar half) ──────────────────────────
+
+
+def test_path_local_hbm_same_half():
+    """PE0 -> slice0 (local): pe_dma -> xbar.pe0 -> hbm_ctrl.slice0 (no chain hops)."""
+    g = _graph()
+    router = PathRouter(g)
+    path = router.find_path("sip0.cube0.pe0", "sip0.cube0.hbm_ctrl.slice0")
+    assert path[0] == "sip0.cube0.pe0.pe_dma"
+    assert "sip0.cube0.xbar.pe0" in path
+    assert path[-1] == "sip0.cube0.hbm_ctrl.slice0"
+    # local access: no bridge and no chain traversal (shortest path = 3 nodes)
+    assert not any("bridge" in n for n in path)
+    assert len(path) == 3  # pe_dma → xbar.pe0 → slice0
+
+
+# ── PathRouter: same-half remote HBM ────────────────────────────────
+
+
+def test_path_same_half_remote_hbm():
+    """PE0 -> slice1: same-half chain traversal pe0→pe1, no bridge."""
+    g = _graph()
+    router = PathRouter(g)
+    path = router.find_path("sip0.cube0.pe0", "sip0.cube0.hbm_ctrl.slice1")
+    assert path[0] == "sip0.cube0.pe0.pe_dma"
+    assert "sip0.cube0.xbar.pe0" in path   # enter at pe0
+    assert "sip0.cube0.xbar.pe1" in path   # chain hop to pe1
+    assert path[-1] == "sip0.cube0.hbm_ctrl.slice1"
+    assert not any("bridge" in n for n in path)
+    assert len(path) == 4  # pe_dma → xbar.pe0 → xbar.pe1 → slice1
+
+
+# ── PathRouter: cross-half HBM ──────────────────────────────────────
+
+
+def test_path_cross_half_hbm():
+    """PE0 -> slice4 (cross-half): pe_dma → xbar.pe0 → bridge.left → xbar.pe4 → slice4."""
+    g = _graph()
+    router = PathRouter(g)
+    path = router.find_path("sip0.cube0.pe0", "sip0.cube0.hbm_ctrl.slice4")
+    assert path[0] == "sip0.cube0.pe0.pe_dma"
+    assert "sip0.cube0.xbar.pe0" in path
+    assert any("bridge" in n for n in path), "cross-half HBM must traverse bridge"
+    assert "sip0.cube0.xbar.pe4" in path
+    assert path[-1] == "sip0.cube0.hbm_ctrl.slice4"
+    # Shortest cross-half path: pe_dma → xbar.pe0 → bridge.left → xbar.pe4 → slice4
+    assert len(path) == 5
+
+
+def test_path_cross_half_requires_bridge():
+    """PE4 (bottom) -> slice2 (top) requires bridge traversal."""
+    g = _graph()
+    router = PathRouter(g)
+    path = router.find_path("sip0.cube0.pe4", "sip0.cube0.hbm_ctrl.slice2")
+    assert any("bridge" in n for n in path), "cross-half HBM must traverse bridge"
+    assert any("xbar.pe" in n for n in path)
+    assert path[-1] == "sip0.cube0.hbm_ctrl.slice2"
+
+
+def test_cross_half_distance_greater():
+    """Cross-half HBM access must have greater distance than local-half."""
+    g = _graph()
+    router = PathRouter(g)
+    _, dist_local = router.find_path_with_distance(
+        "sip0.cube0.pe0", "sip0.cube0.hbm_ctrl.slice0")
+    _, dist_cross = router.find_path_with_distance(
+        "sip0.cube0.pe0", "sip0.cube0.hbm_ctrl.slice4")
+    assert dist_cross > dist_local
+
+
+def test_path_same_half_remote_longer():
+    """Same-half remote HBM (PE0->slice3) has greater distance than local (PE0->slice0)."""
+    g = _graph()
+    router = PathRouter(g)
+    _, dist_local = router.find_path_with_distance(
+        "sip0.cube0.pe0", "sip0.cube0.hbm_ctrl.slice0")
+    _, dist_remote = router.find_path_with_distance(
+        "sip0.cube0.pe0", "sip0.cube0.hbm_ctrl.slice3")
+    assert dist_remote > dist_local, (
+        f"same-half remote ({dist_remote:.2f}mm) must > local ({dist_local:.2f}mm)"
+    )
+
+
+def test_path_remote_cube_hbm():
+    """PE0 in cube0 can reach HBM in cube1 via UCIe (ADR-0004 D4)."""
+    g = _graph()
+    router = PathRouter(g)
+    path = router.find_path("sip0.cube0.pe0", "sip0.cube1.hbm_ctrl.slice0")
+    assert path[0] == "sip0.cube0.pe0.pe_dma"
+    assert path[-1] == "sip0.cube1.hbm_ctrl.slice0"
+    # inter-cube path must cross a UCIe link
+    assert any("ucie" in n for n in path), "remote cube path must traverse UCIe"
+    # must not be trivially short (needs noc + ucie + remote noc + xbar)
+    assert len(path) >= 5
+
+
+# ── PathRouter: SRAM via NOC ────────────────────────────────────────
+
+
+def test_path_sram_via_noc():
+    """PE → SRAM must go through NOC (non-HBM data path)."""
+    g = _graph()
+    router = PathRouter(g)
+    path = router.find_path("sip0.cube0.pe0", "sip0.cube0.sram")
+    assert path[0] == "sip0.cube0.pe0.pe_dma"
+    assert "sip0.cube0.noc" in path
+    assert path[-1] == "sip0.cube0.sram"
+    # should NOT go through xbar (SRAM is non-HBM path)
+    assert not any("xbar" in n for n in path)
+
+
+# ── PathRouter: PE TCM (local) ──────────────────────────────────────
+
+
+def test_path_local_tcm():
+    """PE0 → own TCM is PE-internal, not via xbar or noc."""
+    g = _graph()
+    router = PathRouter(g)
+    path = router.find_path("sip0.cube0.pe0", "sip0.cube0.pe0.pe_tcm")
+    assert path[0] == "sip0.cube0.pe0.pe_dma"
+    assert path[-1] == "sip0.cube0.pe0.pe_tcm"
+    # PE-internal path, no fabric
+    assert not any("xbar" in n or "noc" in n for n in path)
+
+
+# ── PathRouter: distance & determinism ──────────────────────────────
+
+
+def test_path_distance_positive():
+    """All routed paths must have accumulated distance > 0 (ADR-0002 D4)."""
+    g = _graph()
+    router = PathRouter(g)
+    _, dist = router.find_path_with_distance("sip0.cube0.pe0", "sip0.cube0.hbm_ctrl.slice0")
+    assert dist > 0
+
+
+def test_path_deterministic():
+    """Same (src, dst) must always produce the same path."""
+    g = _graph()
+    r1 = PathRouter(g)
+    r2 = PathRouter(g)
+    p1 = r1.find_path("sip0.cube0.pe3", "sip0.cube0.hbm_ctrl.slice3")
+    p2 = r2.find_path("sip0.cube0.pe3", "sip0.cube0.hbm_ctrl.slice3")
+    assert p1 == p2
+
+
+def test_remote_cube_path_no_routing_error():
+    """Routing to remote cube HBM must not raise RoutingError (ADR-0004 D4)."""
+    g = _graph()
+    router = PathRouter(g)
+    # cube0.PE0 -> cube1.slice0 (adjacent cube, E direction)
+    path = router.find_path("sip0.cube0.pe0", "sip0.cube1.hbm_ctrl.slice0")
+    assert len(path) >= 1  # succeeds without exception
diff --git a/tests/test_tensor.py b/tests/test_tensor.py
new file mode 100644
index 0000000..a89109f
--- /dev/null
+++ b/tests/test_tensor.py
@@ -0,0 +1,282 @@
+import pytest
+
+from kernbench.policy.address.allocator import AddressConfig, AllocationError, PEMemAllocator
+from kernbench.policy.placement.dp import (
+    ShardSpec,
+    column_wise,
+    tiled_column_major,
+    replicate,
+    row_wise,
+    tiled_row_major,
+)
+from kernbench.runtime_api.kernel import (
+    KernelLaunchMsg,
+    KernelRef,
+    MemoryReadMsg,
+    MemoryWriteMsg,
+    ScalarArg,
+    TensorArg,
+    TensorArgShard,
+)
+from kernbench.runtime_api.tensor import (
+    TensorHandle,
+    TensorShard,
+    deploy_tensor,
+    dtype_itemsize,
+)
+
+_MB = 1 << 20
+_GB = 1 << 30
+
+_CFG = AddressConfig(
+    sip_count=2,
+    cubes_per_sip=16,
+    pes_per_cube=8,
+    hbm_bytes_per_cube=48 * _GB,
+    hbm_slices_per_cube=8,
+    tcm_bytes_per_pe=16 * _MB,
+    tcm_scheduler_reserved_bytes=4 * _MB,
+    sram_bytes_per_cube=32 * _MB,
+)
+
+
+def _make_allocators(num_pe: int = 8) -> dict[int, PEMemAllocator]:
+    return {
+        i: PEMemAllocator(rack_id=0, sip_id=0, cube_id=0, pe_id=i, cfg=_CFG)
+        for i in range(num_pe)
+    }
+
+
+# ── Tensor types ─────────────────────────────────────────────────────
+
+
+def test_tensor_shard_immutable():
+    """TensorShard rejects attribute assignment and is usable as a set member."""
+    ts = TensorShard(sip=0, cube=0, pe=0, pa=0x1000, nbytes=4096, offset_bytes=0)
+    with pytest.raises(AttributeError):
+        ts.pa = 0x2000  # type: ignore[misc]
+    assert ts in {ts}  # set construction raises TypeError if unhashable
+
+
+def test_tensor_handle_nbytes():
+    th = TensorHandle(
+        name="A",
+        shape=(1024, 512),
+        dtype="fp16",
+        itemsize=2,
+        shards=(),
+    )
+    assert th.nbytes == 1024 * 512 * 2  # 1 MB
+
+
+# ── Message types (ADR-0012) ─────────────────────────────────────────
+
+
+def test_memory_write_msg_fields():
+    msg = MemoryWriteMsg(
+        correlation_id="c0",
+        request_id="r0",
+        dst_sip=0,
+        dst_cube=3,
+        dst_pe=5,
+        dst_pa=0xDEAD,
+        nbytes=4096,
+        pattern="zero",
+    )
+    assert msg.msg_type == "memory_write"
+    assert msg.src_kind == "pattern"
+    assert msg.dst_pa == 0xDEAD
+    assert msg.pattern == "zero"
+    with pytest.raises(AttributeError):
+        msg.nbytes = 0  # type: ignore[misc]
+
+
+def test_memory_read_msg_fields():
+    msg = MemoryReadMsg(
+        correlation_id="c0",
+        request_id="r1",
+        src_sip=1,
+        src_cube=2,
+        src_pe=7,
+        src_pa=0xBEEF,
+        nbytes=2048,
+    )
+    assert msg.msg_type == "memory_read"
+    assert msg.src_pa == 0xBEEF
+    assert msg.nbytes == 2048
+
+
+def test_kernel_launch_msg_fields():
+    shard = TensorArgShard(sip=0, cube=0, pe=0, pa=0x100, nbytes=1024, offset_bytes=0)
+    targ = TensorArg(shards=(shard,))
+    sarg = ScalarArg(dtype="fp32", value=1.0)
+    kref = KernelRef(name="gemm", kind="builtin")
+    msg = KernelLaunchMsg(
+        correlation_id="c0",
+        request_id="r2",
+        kernel_ref=kref,
+        args=(targ, sarg),
+    )
+    assert msg.msg_type == "kernel_launch"
+    assert msg.kernel_ref.name == "gemm"
+    assert len(msg.args) == 2
+    assert msg.args[0].arg_kind == "tensor"
+    assert msg.args[1].arg_kind == "scalar"
+
+
+# ── Placement: column_wise ───────────────────────────────────────────
+
+
+def test_column_wise_placement():
+    """(1024, 512) fp16 across 8 PEs → K axis split → 8 shards, each (1024, 64) = 128KB"""
+    shards = column_wise(shape=(1024, 512), itemsize=2, num_pe=8)
+    assert len(shards) == 8
+    expected_nbytes = 1024 * 64 * 2  # 128 KB
+    for i, s in enumerate(shards):
+        assert s.pe_index == i
+        assert s.nbytes == expected_nbytes
+    # offsets are contiguous
+    assert shards[0].offset_bytes == 0
+    assert shards[1].offset_bytes == expected_nbytes
+    # total coverage
+    assert sum(s.nbytes for s in shards) == 1024 * 512 * 2
+
+
+# ── Placement: row_wise ──────────────────────────────────────────────
+
+
+def test_row_wise_placement():
+    """(1024, 512) fp16 across 8 PEs → M axis split → 8 shards, each (128, 512) = 128KB"""
+    shards = row_wise(shape=(1024, 512), itemsize=2, num_pe=8)
+    assert len(shards) == 8
+    expected_nbytes = 128 * 512 * 2  # 128 KB
+    for i, s in enumerate(shards):
+        assert s.pe_index == i
+        assert s.nbytes == expected_nbytes
+    assert shards[0].offset_bytes == 0
+    assert sum(s.nbytes for s in shards) == 1024 * 512 * 2
+
+
+# ── Placement: replicate ─────────────────────────────────────────────
+
+
+def test_replicate_placement():
+    """(1024, 512) fp16 across 8 PEs → each PE gets full copy = 1MB"""
+    shards = replicate(shape=(1024, 512), itemsize=2, num_pe=8)
+    assert len(shards) == 8
+    full_nbytes = 1024 * 512 * 2  # 1 MB
+    for i, s in enumerate(shards):
+        assert s.pe_index == i
+        assert s.nbytes == full_nbytes
+        assert s.offset_bytes == 0  # each is a full copy
+
+
+# ── Placement: tiled_column_major ─────────────────────────────────────
+
+
+def test_tiled_column_major():
+    """(1024, 512) tile=(256, 128) → 4×4=16 tiles, column-major → round-robin 8 PEs"""
+    shards = tiled_column_major(
+        shape=(1024, 512), itemsize=2, num_pe=8, tile_m=256, tile_k=128,
+    )
+    # 4 tiles along M, 4 tiles along K → 16 tiles total
+    assert len(shards) == 16
+    tile_bytes = 256 * 128 * 2  # 64 KB per tile
+    for s in shards:
+        assert s.nbytes == tile_bytes
+    # column-major: iterate K first, then M
+    # tile (m=0,k=0) → PE0, tile (m=0,k=1) → PE1, ..., (m=0,k=3) → PE3
+    # tile (m=1,k=0) → PE4, tile (m=1,k=1) → PE5, ..., (m=1,k=3) → PE7
+    # tile (m=2,k=0) → PE0, ...
+    assert shards[0].pe_index == 0
+    assert shards[1].pe_index == 1
+    assert shards[7].pe_index == 7
+    assert shards[8].pe_index == 0  # wraps around
+    # total coverage
+    assert sum(s.nbytes for s in shards) == 1024 * 512 * 2
+
+
+# ── Placement: tiled_row_major ────────────────────────────────────────
+
+
+def test_tiled_row_major():
+    """(1024, 512) tile=(256, 128) → 4×4=16 tiles, row-major → round-robin 8 PEs"""
+    shards = tiled_row_major(
+        shape=(1024, 512), itemsize=2, num_pe=8, tile_m=256, tile_k=128,
+    )
+    assert len(shards) == 16
+    tile_bytes = 256 * 128 * 2
+    for s in shards:
+        assert s.nbytes == tile_bytes
+    # row-major: iterate M first, then K
+    # tile (m=0,k=0) → PE0, tile (m=1,k=0) → PE1, ..., (m=3,k=0) → PE3
+    # tile (m=0,k=1) → PE4, tile (m=1,k=1) → PE5, ..., (m=3,k=1) → PE7
+    # tile (m=0,k=2) → PE0, ...
+    assert shards[0].pe_index == 0
+    assert shards[1].pe_index == 1
+    assert shards[7].pe_index == 7
+    assert shards[8].pe_index == 0  # wraps around
+    # total coverage
+    assert sum(s.nbytes for s in shards) == 1024 * 512 * 2
+
+
+# ── deploy_tensor ────────────────────────────────────────────────────
+
+
+def test_deploy_tensor_hbm():
+    """Deploy with column_wise placement → TensorHandle with valid PA shards."""
+    allocs = _make_allocators()
+    placement = column_wise(shape=(1024, 512), itemsize=2, num_pe=8)
+    th = deploy_tensor(
+        name="W",
+        shape=(1024, 512),
+        dtype="fp16",
+        placement=placement,
+        allocators=allocs,
+        mem_kind="hbm",
+    )
+    assert th.name == "W"
+    assert th.shape == (1024, 512)
+    assert th.dtype == "fp16"
+    assert th.itemsize == 2
+    assert len(th.shards) == 8
+    # each shard has a distinct PA
+    pas = [s.pa for s in th.shards]
+    assert len(set(pas)) == 8
+    # each shard placed on correct PE
+    for i, s in enumerate(th.shards):
+        assert s.pe == i
+        assert s.sip == 0
+        assert s.cube == 0
+
+
+def test_deploy_tensor_tcm():
+    """Deploy with TCM → uses pe_tcm_addr allocation."""
+    allocs = _make_allocators()
+    placement = [ShardSpec(pe_index=0, offset_bytes=0, nbytes=256)]
+    th = deploy_tensor(
+        name="small",
+        shape=(128,),
+        dtype="fp16",
+        placement=placement,
+        allocators=allocs,
+        mem_kind="tcm",
+    )
+    assert len(th.shards) == 1
+    assert th.shards[0].pe == 0
+    assert th.shards[0].nbytes == 256
+
+
+def test_deploy_tensor_overflow():
+    """Allocation exceeding PE HBM capacity raises AllocationError."""
+    allocs = _make_allocators()
+    # 6 GB per PE slice, try to allocate 7 GB
+    big_shard = ShardSpec(pe_index=0, offset_bytes=0, nbytes=7 * _GB)
+    with pytest.raises(AllocationError):
+        deploy_tensor(
+            name="toobig",
+            shape=(1,),
+            dtype="int8",
+            placement=[big_shard],
+            allocators=allocs,
+        )
diff --git a/tests/test_topology_compile.py b/tests/test_topology_compile.py
new file mode 100644
index 0000000..e8c4359
--- /dev/null
+++ b/tests/test_topology_compile.py
@@ -0,0 +1,409 @@
+from pathlib import Path
+
+from kernbench.topology.builder import load_topology
+
+TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
+
+
+def _graph():
+    return load_topology(TOPOLOGY_PATH)
+
+
+# ── Full graph: node counts ──────────────────────────────────────────
+
+
+def test_full_graph_node_count():
+    g = _graph()
+    # 1 switch
+    # + 2 SIPs × (1 IO × 2 comps + 16 cubes × (cube_comps + 8 PEs × 6 pe_comps))
+    #   cube_comps: 9 (noc, m_cpu, sram, 2 bridge, 4 ucie)
+    #             + 8 xbar.pe{0..7}  [replaced xbar.top/xbar.bottom]
+    #             + 8 hbm_slices = 25
+    # = 1 + 2*(2 + 16*(25+48)) = 1 + 2*(2+1168) = 1 + 2340 = 2341
+    assert len(g.nodes) == 2341
+
+
+def test_full_graph_edge_count():
+    g = _graph()
+    # Per cube: 144 (88 cube-fabric + 56 PE-internal)
+    #   cube-fabric: 8 pe→xbar.pe + 8 pe→noc + 8 noc→pe_cpu
+    #     + 8 xbar.pe→slice + 8 slice→xbar.pe (bidirectional for response)
+    #     + 12 xbar chain (3 pairs × 2 dir × 2 halves)
+    #     + 8 xbar.pe↔bridge (pe0↔bL, pe4↔bL, pe3↔bR, pe7↔bR, ×2 dir each)
+    #     + 4 noc→ucie + 4 ucie→noc (bidirectional)
+    #     + 8 noc→xbar.pe + 8 xbar.pe→noc (bidirectional for response)
+    #     + 1 m_cpu→noc + 1 noc→m_cpu + 1 noc→sram + 1 sram→noc = 88
+    # Per SIP: 16*144 + 48 inter-cube(bidirectional) + 8 io↔cube(bidirectional)
+    #          + 1 io_internal + 1 switch→io = 2362
+    # Total: 2 * 2362 = 4724
+    assert len(g.edges) == 4724
+
+
+# ── Full graph: specific nodes exist ─────────────────────────────────
+
+
+def test_system_switch_exists():
+    g = _graph()
+    assert "fabric.switch0" in g.nodes
+    assert g.nodes["fabric.switch0"].kind == "switch"
+    assert g.nodes["fabric.switch0"].pos_mm is None  # abstract
+
+
+def test_io_chiplet_nodes_exist():
+    g = _graph()
+    for s in range(2):
+        assert f"sip{s}.io0.pcie_ep" in g.nodes
+        assert f"sip{s}.io0.io_cpu" in g.nodes
+
+
+def test_cube_component_nodes_exist():
+    g = _graph()
+    cp = "sip0.cube0"
+    for name in ("noc", "m_cpu",
+                  "bridge.left", "bridge.right",
+                  "ucie-N", "ucie-S", "ucie-E", "ucie-W",
+                  "sram"):
+        assert f"{cp}.{name}" in g.nodes
+    # xbar.top/xbar.bottom replaced by per-PE xbar entry nodes
+    assert "sip0.cube0.xbar.top" not in g.nodes
+    assert "sip0.cube0.xbar.bottom" not in g.nodes
+    for pe in range(8):
+        node_id = f"{cp}.xbar.pe{pe}"
+        assert node_id in g.nodes, f"{node_id} missing"
+        assert g.nodes[node_id].kind == "xbar"
+    # HBM slices (one per PE)
+    for s in range(8):
+        assert f"{cp}.hbm_ctrl.slice{s}" in g.nodes
+        assert g.nodes[f"{cp}.hbm_ctrl.slice{s}"].kind == "hbm_ctrl"
+
+
+def test_pe_component_nodes_exist():
+    g = _graph()
+    for comp in ("pe_cpu", "pe_scheduler", "pe_dma", "pe_gemm", "pe_math", "pe_tcm"):
+        assert f"sip0.cube0.pe0.{comp}" in g.nodes
+        assert f"sip1.cube15.pe7.{comp}" in g.nodes
+
+
+# ── Full graph: positions ────────────────────────────────────────────
+
+
+def test_hbm_ctrl_slices_at_cube_center():
+    g = _graph()
+    # cube0 origin = (0, 0), cx=8.5, cy=7.0, hbm_ctrl at (cx-2, cy)
+    # all slices share the same physical position
+    for s in range(8):
+        node = g.nodes[f"sip0.cube0.hbm_ctrl.slice{s}"]
+        assert node.pos_mm == (6.5, 7.0)
+
+
+def test_hbm_ctrl_slices_cube5_position():
+    g = _graph()
+    # cube5 = col=1, row=1 -> origin = (1*18, 1*15) = (18, 15)
+    # hbm_ctrl = (18 + 6.5, 15 + 7.0) = (24.5, 22.0)
+    node = g.nodes["sip0.cube5.hbm_ctrl.slice0"]
+    assert node.pos_mm == (24.5, 22.0)
+
+
+def test_ucie_ports_at_cube_edges():
+    g = _graph()
+    # cube0 origin = (0, 0), cube_w=17, cube_h=14
+    # UCIe nodes inset by half-size so edges touch boundary
+    assert g.nodes["sip0.cube0.ucie-N"].pos_mm == (8.5, 0.6)
+    assert g.nodes["sip0.cube0.ucie-S"].pos_mm == (8.5, 13.4)
+    assert g.nodes["sip0.cube0.ucie-W"].pos_mm == (1.0, 7.0)
+    assert g.nodes["sip0.cube0.ucie-E"].pos_mm == (16.0, 7.0)
+
+
+# ── Full graph: edges ────────────────────────────────────────────────
+
+
+def _edge_set(g):
+    return {(e.src, e.dst) for e in g.edges}
+
+
+def test_inter_cube_ucie_edges():
+    es = _edge_set(_graph())
+    # cube0 (0,0) E → cube1 (1,0) W
+    assert ("sip0.cube0.ucie-E", "sip0.cube1.ucie-W") in es
+    # cube0 (0,0) S → cube4 (0,1) N
+    assert ("sip0.cube0.ucie-S", "sip0.cube4.ucie-N") in es
+
+
+def test_io_to_cube_edges():
+    es = _edge_set(_graph())
+    # io0 connects to cubes (0,0)..(3,0) on N side
+    assert ("sip0.io0.io_cpu", "sip0.cube0.ucie-N") in es
+    assert ("sip0.io0.io_cpu", "sip0.cube3.ucie-N") in es
+
+
+def test_switch_to_io_edges():
+    es = _edge_set(_graph())
+    assert ("fabric.switch0", "sip0.io0.pcie_ep") in es
+    assert ("fabric.switch0", "sip1.io0.pcie_ep") in es
+
+
+def test_pe_to_xbar_edges():
+    es = _edge_set(_graph())
+    cp = "sip0.cube0"
+    # Each PE connects to its own xbar entry (per-PE chain model)
+    for pe in range(8):
+        assert (f"{cp}.pe{pe}.pe_dma", f"{cp}.xbar.pe{pe}") in es
+    # Old shared xbar.top/bottom edges must NOT exist
+    assert (f"{cp}.pe0.pe_dma", f"{cp}.xbar.top") not in es
+    assert (f"{cp}.pe4.pe_dma", f"{cp}.xbar.bottom") not in es
+
+
+def test_command_path_m_cpu_noc_pe_cpu():
+    es = _edge_set(_graph())
+    cp = "sip0.cube0"
+    # m_cpu ↔ noc (bidirectional)
+    assert (f"{cp}.m_cpu", f"{cp}.noc") in es
+    assert (f"{cp}.noc", f"{cp}.m_cpu") in es
+    # noc → pe_cpu for each PE
+    assert (f"{cp}.noc", f"{cp}.pe0.pe_cpu") in es
+    assert (f"{cp}.noc", f"{cp}.pe7.pe_cpu") in es
+
+
+def test_pe_internal_edges():
+    es = _edge_set(_graph())
+    pp = "sip0.cube0.pe0"
+    assert (f"{pp}.pe_cpu", f"{pp}.pe_scheduler") in es
+    assert (f"{pp}.pe_scheduler", f"{pp}.pe_dma") in es
+    assert (f"{pp}.pe_scheduler", f"{pp}.pe_gemm") in es
+    assert (f"{pp}.pe_scheduler", f"{pp}.pe_math") in es
+    assert (f"{pp}.pe_dma", f"{pp}.pe_tcm") in es
+    assert (f"{pp}.pe_gemm", f"{pp}.pe_tcm") in es
+    assert (f"{pp}.pe_math", f"{pp}.pe_tcm") in es
+
+
+def test_xbar_to_hbm_slice_edges():
+    """Each xbar.pe{i} connects only to its own (local) HBM slice."""
+    es = _edge_set(_graph())
+    cp = "sip0.cube0"
+    # xbar.pe_i -> slice_i only (local Y-direction access)
+    for pe in range(8):
+        assert (f"{cp}.xbar.pe{pe}", f"{cp}.hbm_ctrl.slice{pe}") in es
+    # Negative: xbar.pe_i must NOT directly connect to a different slice
+    assert (f"{cp}.xbar.pe0", f"{cp}.hbm_ctrl.slice1") not in es
+    assert (f"{cp}.xbar.pe0", f"{cp}.hbm_ctrl.slice4") not in es
+    assert (f"{cp}.xbar.pe4", f"{cp}.hbm_ctrl.slice0") not in es
+
+
+# ── Views: system ────────────────────────────────────────────────────
+
+
+def test_system_view_nodes():
+    v = _graph().system_view
+    assert "fabric.switch0" in v.nodes
+    assert "sip0" in v.nodes
+    assert "sip1" in v.nodes
+    assert "sip0.io0" in v.nodes
+    assert "sip1.io0" in v.nodes
+
+
+# ── Views: SIP ───────────────────────────────────────────────────────
+
+
+def test_sip_view_cube_count():
+    v = _graph().sip_view
+    cube_nodes = [n for n in v.nodes if n.startswith("cube")]
+    assert len(cube_nodes) == 16
+
+
+def test_sip_view_io_chiplets():
+    v = _graph().sip_view
+    assert "io0" in v.nodes
+
+
+def test_sip_view_cube_positions():
+    v = _graph().sip_view
+    # cube0 (0,0): center = (8.5, 6+7.0) = (8.5, 13.0)  [io_margin=6]
+    x, y = v.nodes["cube0"].pos_mm
+    assert x == 8.5
+    assert y == 13.0
+    # cube1 (1,0): center = (18+8.5, 13.0) = (26.5, 13.0)
+    x1, y1 = v.nodes["cube1"].pos_mm
+    assert x1 == 26.5
+    assert y1 == 13.0
+
+
+# ── Views: cube ──────────────────────────────────────────────────────
+
+
+def test_cube_view_has_all_components():
+    v = _graph().cube_view
+    expected = {"ucie-N", "ucie-S", "ucie-W", "ucie-E",
+                "m_cpu", "hbm_ctrl",
+                "bridge.left", "bridge.right", "noc", "sram",
+                "xbar.pe0", "xbar.pe1", "xbar.pe2", "xbar.pe3",
+                "xbar.pe4", "xbar.pe5", "xbar.pe6", "xbar.pe7",
+                "pe0", "pe1", "pe2", "pe3", "pe4", "pe5", "pe6", "pe7"}
+    assert set(v.nodes.keys()) == expected
+
+
+def test_cube_view_hbm_at_center():
+    v = _graph().cube_view
+    assert v.nodes["hbm_ctrl"].pos_mm == (6.5, 7.0)
+    assert v.nodes["noc"].pos_mm == (10.5, 7.0)
+    assert v.width_mm == 17.0
+    assert v.height_mm == 14.0
+
+
+def test_cube_view_pe_corner_mapping():
+    v = _graph().cube_view
+    ves = {(e.src, e.dst) for e in v.edges}
+    # Each PE connects to its own xbar entry (chain model)
+    for i in range(8):
+        assert (f"pe{i}", f"xbar.pe{i}") in ves
+    # Old shared xbar.top/bottom mapping must not exist
+    assert ("pe0", "xbar.top") not in ves
+    assert ("pe4", "xbar.bottom") not in ves
+
+
+# ── Views: PE ────────────────────────────────────────────────────────
+
+
+def test_pe_view_has_all_components():
+    v = _graph().pe_view
+    assert set(v.nodes.keys()) == {
+        "pe_cpu", "pe_scheduler", "pe_dma", "pe_gemm", "pe_math", "pe_tcm"
+    }
+
+
+def test_pe_view_edges():
+    v = _graph().pe_view
+    ves = {(e.src, e.dst) for e in v.edges}
+    assert ("pe_cpu", "pe_scheduler") in ves
+    assert ("pe_scheduler", "pe_dma") in ves
+    assert ("pe_scheduler", "pe_gemm") in ves
+    assert ("pe_scheduler", "pe_math") in ves
+    assert ("pe_dma", "pe_tcm") in ves
+    assert ("pe_gemm", "pe_tcm") in ves
+    assert ("pe_math", "pe_tcm") in ves
+
+
+# ── SRAM ────────────────────────────────────────────────────────────
+
+
+def test_sram_node_exists():
+    g = _graph()
+    assert "sip0.cube0.sram" in g.nodes
+    assert g.nodes["sip0.cube0.sram"].kind == "sram"
+
+
+def test_noc_to_sram_edges():
+    es = _edge_set(_graph())
+    cp = "sip0.cube0"
+    assert (f"{cp}.noc", f"{cp}.sram") in es
+    assert (f"{cp}.sram", f"{cp}.noc") in es
+
+
+# ── PE_DMA → NOC (non-HBM data path) ───────────────────────────────
+
+
+def test_pe_dma_to_noc_edges():
+    es = _edge_set(_graph())
+    cp = "sip0.cube0"
+    for i in range(8):
+        assert (f"{cp}.pe{i}.pe_dma", f"{cp}.noc") in es
+
+
+# ── Bridge connects XBAR halves (not NOC) ──────────────────────────
+
+
+def test_bridge_connects_xbar_halves():
+    """bridge.left connects leftmost PE nodes (pe0 top, pe4 bottom).
+    bridge.right connects rightmost PE nodes (pe3 top, pe7 bottom)."""
+    es = _edge_set(_graph())
+    cp = "sip0.cube0"
+    # bridge.left ↔ pe0 (top-left) and pe4 (bottom-left)
+    assert (f"{cp}.xbar.pe0", f"{cp}.bridge.left") in es
+    assert (f"{cp}.bridge.left", f"{cp}.xbar.pe0") in es
+    assert (f"{cp}.xbar.pe4", f"{cp}.bridge.left") in es
+    assert (f"{cp}.bridge.left", f"{cp}.xbar.pe4") in es
+    # bridge.right ↔ pe3 (top-right) and pe7 (bottom-right)
+    assert (f"{cp}.xbar.pe3", f"{cp}.bridge.right") in es
+    assert (f"{cp}.bridge.right", f"{cp}.xbar.pe3") in es
+    assert (f"{cp}.xbar.pe7", f"{cp}.bridge.right") in es
+    assert (f"{cp}.bridge.right", f"{cp}.xbar.pe7") in es
+    # Old xbar.top/bottom ↔ bridge edges must NOT exist
+    assert (f"{cp}.xbar.top", f"{cp}.bridge.left") not in es
+    assert (f"{cp}.xbar.bottom", f"{cp}.bridge.left") not in es
+
+
+def test_no_bridge_to_noc_edges():
+    es = _edge_set(_graph())
+    cp = "sip0.cube0"
+    assert (f"{cp}.bridge.left", f"{cp}.noc") not in es
+    assert (f"{cp}.bridge.right", f"{cp}.noc") not in es
+
+
+# ── Cube view: new edges ────────────────────────────────────────────
+
+
+def test_cube_view_pe_to_noc():
+    v = _graph().cube_view
+    ves = {(e.src, e.dst) for e in v.edges}
+    for i in range(8):
+        assert (f"pe{i}", "noc") in ves
+
+
+def test_cube_view_sram():
+    v = _graph().cube_view
+    assert "sram" in v.nodes
+    ves = {(e.src, e.dst) for e in v.edges}
+    assert ("noc", "sram") in ves
+    assert ("sram", "noc") in ves
+
+
+def test_cube_view_bridge_xbar():
+    v = _graph().cube_view
+    ves = {(e.src, e.dst) for e in v.edges}
+    # bridge.left connects pe0 (top-left) ↔ pe4 (bottom-left)
+    assert ("xbar.pe0", "bridge.left") in ves
+    assert ("bridge.left", "xbar.pe0") in ves
+    assert ("xbar.pe4", "bridge.left") in ves
+    assert ("bridge.left", "xbar.pe4") in ves
+    # bridge.right connects pe3 (top-right) ↔ pe7 (bottom-right)
+    assert ("xbar.pe3", "bridge.right") in ves
+    assert ("bridge.right", "xbar.pe3") in ves
+    assert ("xbar.pe7", "bridge.right") in ves
+    assert ("bridge.right", "xbar.pe7") in ves
+
+
+# ── Chain xbar: new topology edges ──────────────────────────────────
+
+
+def test_xbar_chain_edges():
+    """Adjacent xbar.pe nodes within each half are bidirectionally connected."""
+    es = _edge_set(_graph())
+    cp = "sip0.cube0"
+    # Top chain: pe0 ↔ pe1 ↔ pe2 ↔ pe3 (NW→NE direction)
+    for a, b in [(0, 1), (1, 2), (2, 3)]:
+        assert (f"{cp}.xbar.pe{a}", f"{cp}.xbar.pe{b}") in es, f"missing pe{a}→pe{b}"
+        assert (f"{cp}.xbar.pe{b}", f"{cp}.xbar.pe{a}") in es, f"missing pe{b}→pe{a}"
+    # Bottom chain: pe4 ↔ pe5 ↔ pe6 ↔ pe7
+    for a, b in [(4, 5), (5, 6), (6, 7)]:
+        assert (f"{cp}.xbar.pe{a}", f"{cp}.xbar.pe{b}") in es, f"missing pe{a}→pe{b}"
+        assert (f"{cp}.xbar.pe{b}", f"{cp}.xbar.pe{a}") in es, f"missing pe{b}→pe{a}"
+    # Negative: no direct edge between non-adjacent (pe0/pe2) or cross-chain (pe0/pe4) entries
+    assert (f"{cp}.xbar.pe0", f"{cp}.xbar.pe2") not in es
+    assert (f"{cp}.xbar.pe0", f"{cp}.xbar.pe4") not in es
+
+
+def test_ucie_noc_reverse_edges():
+    """UCIe ports must have reverse edges back to NOC (bidirectional)."""
+    es = _edge_set(_graph())
+    cp = "sip0.cube5"  # interior cube (1,1): none of its ports face io0
+    for port in ("N", "S", "E", "W"):
+        assert (f"{cp}.ucie-{port}", f"{cp}.noc") in es, \
+            f"missing ucie-{port}->noc reverse edge"
+
+
+def test_noc_to_xbar_pe_edges():
+    """NOC connects to all xbar.pe nodes (for remote cube HBM access)."""
+    es = _edge_set(_graph())
+    cp = "sip0.cube0"
+    for pe in range(8):
+        assert (f"{cp}.noc", f"{cp}.xbar.pe{pe}") in es, \
+            f"missing noc->xbar.pe{pe}"
diff --git a/tests/test_topology_load.py b/tests/test_topology_load.py
new file mode 100644
index 0000000..a30edf5
--- /dev/null
+++ b/tests/test_topology_load.py
@@ -0,0 +1,60 @@
+from pathlib import Path
+
+from kernbench.topology.builder import _read_spec, resolve_topology
+
+TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
+
+
+def test_topology_yaml_loads_without_error():
+    # Smoke test: load must not raise. NOTE(review): "_compile_graph is still stubbed" looks stale — confirm
+    resolve_topology(str(TOPOLOGY_PATH))
+
+
+def test_pe_layout_structure():
+    spec = _read_spec(TOPOLOGY_PATH)
+    pe_layout = spec["cube"]["pe_layout"]
+    assert set(pe_layout["corners"]) == {"NW", "NE", "SW", "SE"}
+    assert pe_layout["pe_per_corner"] == 2
+    # derived total must equal original pe_per_cube: 8
+    assert pe_layout["pe_per_corner"] * len(pe_layout["corners"]) == 8
+
+
+def test_pe_template_components():
+    spec = _read_spec(TOPOLOGY_PATH)
+    comps = spec["cube"]["pe_template"]["components"]
+    assert set(comps.keys()) == {
+        "pe_cpu", "pe_scheduler", "pe_dma", "pe_gemm", "pe_math", "pe_tcm"
+    }
+
+
+def test_pe_template_links_present():
+    spec = _read_spec(TOPOLOGY_PATH)
+    links = spec["cube"]["pe_template"]["links"]
+    required = {
+        "pe_cpu_to_scheduler_mm",
+        "scheduler_to_dma_mm",
+        "scheduler_to_gemm_mm",
+        "scheduler_to_math_mm",
+        "dma_to_tcm_bw_gbs", "dma_to_tcm_mm",
+        "gemm_to_tcm_bw_gbs", "gemm_to_tcm_mm",
+        "math_to_tcm_bw_gbs", "math_to_tcm_mm",
+    }
+    assert required.issubset(set(links.keys()))
+
+
+def test_pe_dma_not_in_cube_components():
+    spec = _read_spec(TOPOLOGY_PATH)
+    assert "pe_dma" not in spec["cube"]["components"]
+
+
+def test_pe_per_cube_removed():
+    spec = _read_spec(TOPOLOGY_PATH)
+    assert "pe_per_cube" not in spec["cube"].get("device", {})
+
+
+def test_shared_resource_accel_slot():
+    # ADR-0014 D4: PE_GEMM and PE_MATH share PE_ACCEL capacity = 1
+    spec = _read_spec(TOPOLOGY_PATH)
+    comps = spec["cube"]["pe_template"]["components"]
+    assert comps["pe_gemm"]["attrs"]["shared_resource"] == "accel_slot"
+    assert comps["pe_math"]["attrs"]["shared_resource"] == "accel_slot"
diff --git a/tests/test_topology_visualize.py b/tests/test_topology_visualize.py
new file mode 100644
index 0000000..848c42d
--- /dev/null
+++ b/tests/test_topology_visualize.py
@@ -0,0 +1,81 @@
+from pathlib import Path
+
+from kernbench.topology.builder import load_topology
+from kernbench.topology.visualizer import emit_diagrams
+
+TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
+
+VIEW_FILES = ["system_view.svg", "sip_view.svg", "cube_view.svg", "pe_view.svg"]
+
+
+def _emit(tmp_path: Path) -> list[Path]:
+    graph = load_topology(TOPOLOGY_PATH)
+    return emit_diagrams(graph, tmp_path)
+
+
+def test_emit_creates_all_svg_files(tmp_path):
+    created = _emit(tmp_path)
+    assert len(created) == 4
+    for name in VIEW_FILES:
+        assert (tmp_path / name).exists()
+        assert (tmp_path / name).stat().st_size > 0
+
+
+def test_svg_output_is_deterministic(tmp_path):
+    graph = load_topology(TOPOLOGY_PATH)
+    emit_diagrams(graph, tmp_path)
+    first = {name: (tmp_path / name).read_text() for name in VIEW_FILES}
+    emit_diagrams(graph, tmp_path)
+    second = {name: (tmp_path / name).read_text() for name in VIEW_FILES}
+    for name in VIEW_FILES:
+        assert first[name] == second[name], f"{name} is not deterministic"
+
+
+def test_cube_svg_contains_hbm_ctrl(tmp_path):
+    _emit(tmp_path)
+    svg = (tmp_path / "cube_view.svg").read_text()
+    assert "HBM CTRL" in svg
+
+
+def test_cube_svg_contains_ucie_ports(tmp_path):
+    _emit(tmp_path)
+    svg = (tmp_path / "cube_view.svg").read_text()
+    for port in ("UCIe-N", "UCIe-S", "UCIe-W", "UCIe-E"):
+        assert port in svg
+
+
+def test_cube_svg_contains_pe_nodes(tmp_path):
+    _emit(tmp_path)
+    svg = (tmp_path / "cube_view.svg").read_text()
+    for i in range(8):
+        assert f"PE{i}" in svg
+
+
+def test_pe_svg_contains_all_components(tmp_path):
+    _emit(tmp_path)
+    svg = (tmp_path / "pe_view.svg").read_text()
+    for comp in ("PE CPU", "PE SCHEDULER", "PE DMA", "PE GEMM", "PE MATH", "PE TCM"):
+        assert comp in svg
+
+
+def test_sip_svg_contains_cubes(tmp_path):
+    _emit(tmp_path)
+    svg = (tmp_path / "sip_view.svg").read_text()
+    assert "CUBE (0,0)" in svg
+    assert "CUBE (3,3)" in svg
+
+
+def test_system_svg_contains_switch_and_sips(tmp_path):
+    _emit(tmp_path)
+    svg = (tmp_path / "system_view.svg").read_text()
+    assert "Fabric Switch" in svg
+    assert "SIP 0" in svg
+    assert "SIP 1" in svg
+
+
+def test_svg_is_valid_xml(tmp_path):
+    _emit(tmp_path)
+    for name in VIEW_FILES:
+        svg = (tmp_path / name).read_text()
+        assert svg.startswith("<svg")
+        assert svg.strip().endswith("</svg>")
diff --git a/tests/test_triton_emu.py b/tests/test_triton_emu.py
new file mode 100644
index 0000000..036fc06
--- /dev/null
+++ b/tests/test_triton_emu.py
@@ -0,0 +1,349 @@
+"""Tests for Triton emulator: TLContext, command generation, kernel registry."""
+from kernbench.common.pe_commands import (
+    CompletionHandle,
+    CompositeCmd,
+    DmaReadCmd,
+    DmaWriteCmd,
+    GemmCmd,
+    MathCmd,
+    PeCpuOverheadCmd,
+    TensorHandle,
+    WaitCmd,
+)
+from kernbench.triton_emu.registry import clear_registry, get_kernel, register_kernel
+from kernbench.triton_emu.tl_context import TLContext, run_kernel
+
+
+def _ctx(**kwargs) -> TLContext:
+    return TLContext(dispatch_cycles=0, **kwargs)
+
+
+def _ctx_with_overhead(**kwargs) -> TLContext:
+    return TLContext(dispatch_cycles=1, **kwargs)
+
+
+# ── 1. tl.load → DmaReadCmd ──────────────────────────────────────
+
+
+def test_tl_load_generates_dma_read():
+    tl = _ctx()
+    h = tl.load(0x1000, shape=(32, 64), dtype="f16")
+    assert isinstance(h, TensorHandle)
+    assert h.shape == (32, 64)
+    assert h.nbytes == 32 * 64 * 2
+    cmds = tl.commands
+    assert len(cmds) == 1
+    assert isinstance(cmds[0], DmaReadCmd)
+    assert cmds[0].src_pa == 0x1000
+    assert cmds[0].nbytes == 32 * 64 * 2
+
+
+# ── 2. tl.store → DmaWriteCmd ────────────────────────────────────
+
+
+def test_tl_store_generates_dma_write():
+    tl = _ctx()
+    h = tl.load(0x1000, shape=(16, 16), dtype="f32")
+    tl.store(0x2000, h)
+    cmds = [c for c in tl.commands if isinstance(c, DmaWriteCmd)]
+    assert len(cmds) == 1
+    assert cmds[0].dst_pa == 0x2000
+    assert cmds[0].nbytes == 16 * 16 * 4
+
+
+# ── 3. tl.dot → GemmCmd ──────────────────────────────────────────
+
+
+def test_tl_dot_generates_gemm_cmd():
+    tl = _ctx()
+    a = tl.load(0x1000, shape=(32, 64), dtype="f16")
+    b = tl.load(0x2000, shape=(64, 16), dtype="f16")
+    out = tl.dot(a, b)
+    assert out.shape == (32, 16)
+    cmds = [c for c in tl.commands if isinstance(c, GemmCmd)]
+    assert len(cmds) == 1
+    assert cmds[0].m == 32
+    assert cmds[0].k == 64
+    assert cmds[0].n == 16
+
+
+# ── 4. tl.exp, tl.sqrt etc. → MathCmd ────────────────────────────
+
+
+def test_tl_math_unary_ops():
+    tl = _ctx()
+    x = tl.load(0x1000, shape=(8, 8), dtype="f16")
+    for op_name, op_fn in [
+        ("exp", tl.exp), ("log", tl.log), ("sqrt", tl.sqrt),
+        ("abs", tl.abs), ("sigmoid", tl.sigmoid),
+        ("cos", tl.cos), ("sin", tl.sin),
+    ]:
+        result = op_fn(x)
+        assert isinstance(result, TensorHandle)
+        assert result.shape == x.shape
+
+    math_cmds = [c for c in tl.commands if isinstance(c, MathCmd)]
+    ops = [c.op for c in math_cmds]
+    assert ops == ["exp", "log", "sqrt", "abs", "sigmoid", "cos", "sin"]
+
+
+# ── 5. a + b, a * b → MathCmd ────────────────────────────────────
+
+
+def test_tl_math_binary_ops():
+    """Binary operators on tensor handles emit one MathCmd each, in order.
+
+    The ``+``, ``-``, ``*`` and ``/`` operators need an active context, so
+    they are evaluated inside a kernel that ``run_kernel`` executes against
+    the TLContext under test. Expected op names: ``add``, ``sub``, ``mul``
+    and ``div``.
+    """
+
+    def binary_kernel(tl):
+        a = tl.load(0x1000, shape=(4, 4), dtype="f16")
+        b = tl.load(0x2000, shape=(4, 4), dtype="f16")
+        # Results are discarded on purpose: only the recorded commands
+        # matter for this test.
+        _ = a + b
+        _ = a - b
+        _ = a * b
+        _ = a / b
+
+    # A single context is enough; every command the kernel issues is
+    # recorded on it.
+    tl = _ctx()
+    run_kernel(binary_kernel, tl)
+
+    # The two loads also record DmaReadCmd entries, so keep only MathCmd
+    # before comparing the operator names.
+    math_cmds = [c for c in tl.commands if isinstance(c, MathCmd)]
+    ops = [c.op for c in math_cmds]
+    assert ops == ["add", "sub", "mul", "div"]
+
+
+# ── 6. tl.sum, tl.max → MathCmd with axis ────────────────────────
+
+
+def test_tl_reduction_ops():
+    tl = _ctx()
+    x = tl.load(0x1000, shape=(32, 64), dtype="f16")
+    s = tl.sum(x, axis=1)
+    m = tl.max(x, axis=0)
+    assert s.shape == (32, 1)
+    assert m.shape == (1, 64)
+    math_cmds = [c for c in tl.commands if isinstance(c, MathCmd)]
+    assert math_cmds[0].op == "sum" and math_cmds[0].axis == 1
+    assert math_cmds[1].op == "max" and math_cmds[1].axis == 0
+
+
+# ── 7. tl.composite → CompositeCmd + CompletionHandle ────────────
+
+
+def test_tl_composite_nonblocking():
+    tl = _ctx()
+    a = tl.load(0x1000, shape=(32, 64), dtype="f16")
+    b = tl.load(0x2000, shape=(64, 32), dtype="f16")
+    h = tl.composite(op="gemm", a=a, b=b, out_ptr=0x3000)
+    assert isinstance(h, CompletionHandle)
+    comp_cmds = [c for c in tl.commands if isinstance(c, CompositeCmd)]
+    assert len(comp_cmds) == 1
+    assert comp_cmds[0].op == "gemm"
+    assert comp_cmds[0].out_pa == 0x3000
+    assert comp_cmds[0].out_nbytes == 32 * 32 * 2  # M×N×dtype_bytes
+
+
+# ── 8. tl.wait(handle) → WaitCmd ─────────────────────────────────
+
+
+def test_tl_wait_specific():
+    tl = _ctx()
+    a = tl.load(0x1000, shape=(4, 4), dtype="f16")
+    h = tl.composite(op="gemm", a=a, b=a, out_ptr=0x2000)
+    tl.wait(h)
+    wait_cmds = [c for c in tl.commands if isinstance(c, WaitCmd)]
+    assert len(wait_cmds) == 1
+    assert wait_cmds[0].handle == h
+
+
+# ── 9. tl.wait() → WaitCmd(handle=None) ──────────────────────────
+
+
+def test_tl_wait_all():
+    tl = _ctx()
+    tl.wait()
+    wait_cmds = [c for c in tl.commands if isinstance(c, WaitCmd)]
+    assert len(wait_cmds) == 1
+    assert wait_cmds[0].handle is None
+
+
+# ── 10. tl.cycles → PeCpuOverheadCmd ─────────────────────────────
+
+
+def test_tl_cycles():
+    tl = _ctx()
+    tl.cycles(10)
+    assert len(tl.commands) == 1
+    assert isinstance(tl.commands[0], PeCpuOverheadCmd)
+    assert tl.commands[0].cycles == 10
+
+
+# ── 11. tl.program_id ────────────────────────────────────────────
+
+
+def test_tl_program_id():
+    tl = TLContext(pe_id=5, num_programs=8)
+    assert tl.program_id(0) == 5
+    assert tl.num_programs(0) == 8
+
+
+# ── 12. tl.arange, tl.zeros, tl.full ─────────────────────────────
+
+
+def test_tl_arange_zeros_full():
+    tl = _ctx()
+    r = tl.arange(0, 16, dtype="i32")
+    assert r.shape == (16,)
+    assert r.dtype == "i32"
+
+    z = tl.zeros((4, 8), dtype="f16")
+    assert z.shape == (4, 8)
+    assert z.nbytes == 4 * 8 * 2
+
+    f = tl.full((2, 3), value=1.0, dtype="f32")
+    assert f.shape == (2, 3)
+    assert f.nbytes == 2 * 3 * 4
+
+
+# ── 13. tl.trans → shape change, no command ───────────────────────
+
+
+def test_tl_trans_shape():
+    tl = _ctx()
+    h = tl.load(0x1000, shape=(32, 64), dtype="f16")
+    t = tl.trans(h)
+    assert t.shape == (64, 32)
+    assert t.id == h.id  # same underlying data
+    # Only DmaReadCmd from load, no command from trans
+    assert len(tl.commands) == 1
+    assert isinstance(tl.commands[0], DmaReadCmd)
+
+
+# ── 14. Kernel registry ──────────────────────────────────────────
+
+
+def test_kernel_registry():
+    clear_registry()
+
+    def my_kernel(tl):
+        pass
+
+    register_kernel("test_kern", my_kernel)
+    assert get_kernel("test_kern") is my_kernel
+    clear_registry()
+
+
+def test_kernel_registry_missing():
+    clear_registry()
+    import pytest
+    with pytest.raises(KeyError):
+        get_kernel("nonexistent")
+
+
+def test_kernel_registry_duplicate():
+    clear_registry()
+    register_kernel("dup", lambda tl: None)
+    import pytest
+    with pytest.raises(ValueError):
+        register_kernel("dup", lambda tl: None)
+    clear_registry()
+
+
+# ── 15. GEMM kernel → correct command sequence ───────────────────
+
+
+def test_gemm_kernel_command_sequence():
+    """32×64 × 64×32 GEMM kernel produces [DmaRead, DmaRead, Composite]."""
+    def gemm_kernel(a_ptr, b_ptr, out_ptr, tl):
+        pid = tl.program_id(0)
+        a = tl.load(a_ptr, shape=(32, 64), dtype="f16")
+        b = tl.load(b_ptr + pid * 64 * 32 * 2, shape=(64, 32), dtype="f16")
+        tl.composite(op="gemm", a=a, b=b, out_ptr=out_ptr + pid * 32 * 32 * 2)
+
+    tl = _ctx(pe_id=3)
+    run_kernel(gemm_kernel, tl, a_ptr=0x1000, b_ptr=0x2000, out_ptr=0x3000)
+    types = [type(c).__name__ for c in tl.commands]
+    assert types == ["DmaReadCmd", "DmaReadCmd", "CompositeCmd"]
+
+
+# ── 16. Attention kernel → correct command sequence ───────────────
+
+
+def test_attention_kernel_command_sequence():
+    """Attention kernel: load→dot→math ops→dot→store."""
+    def attention_kernel(q_ptr, k_ptr, v_ptr, out_ptr, tl,
+                         seq_len=16, head_dim=8):
+        pid = tl.program_id(0)
+        q = tl.load(q_ptr, shape=(seq_len, head_dim), dtype="f16")
+        k = tl.load(k_ptr, shape=(head_dim, seq_len), dtype="f16")
+        scores = tl.dot(q, k)
+        row_max = tl.max(scores, axis=1)
+        scores = scores - row_max
+        scores = tl.exp(scores)
+        row_sum = tl.sum(scores, axis=1)
+        scores = scores / row_sum
+        v = tl.load(v_ptr, shape=(seq_len, head_dim), dtype="f16")
+        out = tl.dot(scores, v)
+        tl.store(out_ptr, out)
+
+    tl = _ctx(pe_id=0)
+    run_kernel(
+        attention_kernel, tl,
+        q_ptr=0x1000, k_ptr=0x2000, v_ptr=0x3000, out_ptr=0x4000,
+    )
+    types = [type(c).__name__ for c in tl.commands]
+    # load, load, dot, max, sub, exp, sum, div, load, dot, store
+    assert types == [
+        "DmaReadCmd", "DmaReadCmd",           # load Q, K
+        "GemmCmd",                            # Q @ K
+        "MathCmd", "MathCmd", "MathCmd",      # max, sub, exp
+        "MathCmd", "MathCmd",                 # sum, div
+        "DmaReadCmd",                         # load V
+        "GemmCmd",                            # scores @ V
+        "DmaWriteCmd",                        # store output
+    ]
+    # Verify math ops
+    math_cmds = [c for c in tl.commands if isinstance(c, MathCmd)]
+    math_ops = [c.op for c in math_cmds]
+    assert math_ops == ["max", "sub", "exp", "sum", "div"]
+
+
+# ── 17. Dispatch overhead auto-inserted ───────────────────────────
+
+
+def test_dispatch_overhead_inserted():
+    """Each tl API call auto-inserts PeCpuOverheadCmd when dispatch_cycles > 0."""
+    tl = _ctx_with_overhead()
+    a = tl.load(0x1000, shape=(4, 4), dtype="f16")
+    tl.store(0x2000, a)
+    types = [type(c).__name__ for c in tl.commands]
+    # overhead, load, overhead, store
+    assert types == [
+        "PeCpuOverheadCmd", "DmaReadCmd",
+        "PeCpuOverheadCmd", "DmaWriteCmd",
+    ]
+
+
+# ── 18. where operation ──────────────────────────────────────────
+
+
+def test_tl_where():
+    tl = _ctx()
+    cond = tl.load(0x1000, shape=(4, 4), dtype="i32")
+    a = tl.load(0x2000, shape=(4, 4), dtype="f16")
+    b = tl.load(0x3000, shape=(4, 4), dtype="f16")
+    out = tl.where(cond, a, b)
+    assert isinstance(out, TensorHandle)
+    math_cmds = [c for c in tl.commands if isinstance(c, MathCmd)]
+    assert len(math_cmds) == 1
+    assert math_cmds[0].op == "where"
+    assert len(math_cmds[0].inputs) == 3
diff --git a/topology.yaml b/topology.yaml
new file mode 100644
index 0000000..62c9fe8
--- /dev/null
+++ b/topology.yaml
@@ -0,0 +1,126 @@
+
+system:
+  ns_per_mm: 0.01   # wire propagation delay: 10 ps/mm (on-chip silicon)
+
+  sips:
+    count: 2
+
+  components:
+    switch: { kind: switch, impl: switch_v1, attrs: { overhead_ns: 5.0 } }
+
+  links:
+    io_ep_to_switch:
+      kind: pcie
+      bw_gbs_per_ep: 768.0
+      distance_mm: 20.0
+
+sip:
+  cube_mesh: { w: 4, h: 4 }
+
+  iochiplet:
+    components:
+      pcie_ep: { kind: pcie_ep, impl: pcie_ep_v1, attrs: { overhead_ns: 5.0 } }
+      io_cpu:  { kind: io_cpu,  impl: io_cpu_v1,  attrs: { overhead_ns: 10.0 } }
+    links:
+      pcie_ep_to_io_cpu_bw_gbs: 256.0   # NOTE(review): system.links.io_ep_to_switch.bw_gbs_per_ep is 768.0, not 256.0 — confirm which value is intended
+      pcie_ep_to_io_cpu_mm: 1.0
+      io_cpu_to_ucie_bw_gbs: 512.0      # matches ucie.phy_bw_gbs per PHY
+      io_cpu_to_ucie_mm: 1.5
+    instances:
+      - id: io0
+        place: { side: N, offset_norm: 0.5 }
+        ucie: { phy_bw_gbs: 512.0, phys: [P0, P1, P2, P3] }
+        cube_ports:
+          - { cube: {xy: [0,0]}, cube_side: N, phy: P0, distance_mm: 2.0 }
+          - { cube: {xy: [1,0]}, cube_side: N, phy: P1, distance_mm: 2.0 }
+          - { cube: {xy: [2,0]}, cube_side: N, phy: P2, distance_mm: 2.0 }
+          - { cube: {xy: [3,0]}, cube_side: N, phy: P3, distance_mm: 2.0 }
+
+  links:
+    inter_cube_mesh:
+      bw_gbs_per_ucie_phy: 512.0
+      distance_mm_across_seam: 1.0
+      routing: { algo: xy }
+
+cube:
+  geometry:
+    cube_mm: { w: 17.0, h: 14.0 }
+    hbm_mm: { w: 9.0, h: 5.0 }
+    ucie_mm: { size: 2.0 }
+
+  pe_layout:
+    corners: [NW, NE, SW, SE]   # N corners → xbar top row; S corners → xbar bottom row
+    pe_per_corner: 2             # total PEs per cube: 4 * 2 = 8
+
+  pe_template:
+    components:
+      pe_cpu:       { kind: pe_cpu,       impl: pe_cpu_v1,       attrs: { overhead_ns: 2.0 } }
+      pe_scheduler: { kind: pe_scheduler, impl: pe_scheduler_v1, attrs: { overhead_ns: 1.0 } }
+      pe_dma:       { kind: pe_dma,       impl: pe_dma_v1,       attrs: { rd_engines: 1, wr_engines: 1 } }
+      pe_gemm:      { kind: pe_gemm,      impl: pe_gemm_v1,      attrs: { overhead_ns: 0.0, shared_resource: accel_slot, peak_tflops_f16: 8.0 } }
+      pe_math:      { kind: pe_math,      impl: pe_math_v1,      attrs: { overhead_ns: 0.0, shared_resource: accel_slot } }
+      pe_tcm:       { kind: pe_tcm,       impl: pe_tcm_v1,       attrs:
+                        { size_mb: 16 } }   # continuation line is indented deeper than the pe_tcm key
+    links:
+      pe_cpu_to_scheduler_mm:  0.5
+      scheduler_to_dma_mm:     0.5
+      scheduler_to_gemm_mm:    0.5
+      scheduler_to_math_mm:    0.5
+      dma_to_tcm_bw_gbs:       512.0
+      dma_to_tcm_mm:           0.5
+      gemm_to_tcm_bw_gbs:      512.0    # GEMM reads inputs from TCM (ADR-0014 D5)
+      gemm_to_tcm_mm:          0.5
+      math_to_tcm_bw_gbs:      512.0
+      math_to_tcm_mm:          0.5
+
+  memory_map:
+    hbm_total_gb_per_cube: 48
+    hbm_slices_per_cube: 8
+    hbm_total_bw_gbs: 1024.0
+
+  components:
+    noc:     { kind: noc,     impl: noc_2d_mesh_v1, attrs: { overhead_ns: 0.0 } }
+    m_cpu:   { kind: m_cpu,   impl: m_cpu_v1,   attrs: { overhead_ns: 5.0 } }
+    xbar:
+      pe:     { kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 2.0 } }
+      bridges:
+        - { id: left,  kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 1.0 } }
+        - { id: right, kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 1.0 } }
+    hbm_ctrl: { kind: hbm_ctrl, impl: hbm_ctrl_v1, attrs: { capacity: 1 } }
+    sram:     { kind: sram,     impl: sram_v1,     attrs: { size_mb: 32, overhead_ns: 2.0 } }
+
+  ucie:
+    decompose: true
+    ports: [N, S, E, W]
+    overhead_ns: 1.0
+
+  links:
+    pe_to_xbar_bw_gbs: 256.0         # per-PE effective (2048 / 8 PEs)
+    xbar_to_hbm_bw_gbs: 256.0        # per-PE effective (2048 / 8 PEs)
+    xbar_to_bridge_bw_gbs: 128.0     # bridge BW (same as xbar chain BW)
+    xbar_x_bw_gbs: 128.0             # X-direction BW for xbar chain traversal
+    xbar_chain_intra_corner_mm: 2.0  # xbar wire distance within same corner PE pair
+    xbar_chain_inter_corner_mm: 10.0 # xbar wire distance between corner pairs (NW↔NE, SW↔SE)
+    xbar_row_n_to_bridge_mm: 3.0
+    xbar_row_s_to_bridge_mm: 3.0
+    xbar_to_hbm_mm: 2.5
+    pe_to_xbar_row_n_mm: 6.0
+    pe_to_xbar_row_s_mm: 6.0
+    pe_dma_to_noc_mm: 0.0        # noc is distributed; distance modeled as 0
+    pe_dma_to_noc_bw_gbs: 512.0  # PE non-HBM data path BW
+    noc_to_xbar:
+      per_connection_bw_gbs: 128.0   # BW per NOC connection
+    noc_to_sram_mm: 0.0          # noc is distributed; distance modeled as 0
+    noc_to_sram:
+      per_connection_bw_gbs: 128.0   # BW per NOC connection
+      n_connections: 4               # 4 × 128 = 512 GB/s aggregate
+    noc_to_ucie:
+      per_connection_bw_gbs: 128.0   # BW per NOC connection
+      n_connections: 4               # 4 × 128 = 512 GB/s = UCIe PHY BW
+    m_cpu_to_noc_mm: 0.0         # noc is distributed; distance modeled as 0
+    noc_to_pe_cpu_mm: 0.0        # noc is distributed; distance modeled as 0
+
+visualization:
+  emit_views: [system, sip, cube]
+  sip_ids: [0]
+  cubes: [0, 9, 15]