From 6f43807900443cd0ab2abd163ed42447f8140d56 Mon Sep 17 00:00:00 2001 From: Yangwook Date: Wed, 18 Mar 2026 11:47:48 -0700 Subject: [PATCH] commit - release 1 --- .claude/settings.json | 15 + .claude/settings.local.json | 36 + .editorconfig | 12 + .gitignore | 28 + .pylintrc | 3 + .vscode/extensions.json | 16 + .vscode/launch.json | 55 + .vscode/settings.json | 39 + .vscode/tasks.json | 127 ++ CLAUDE.md | 196 +++ README.md | 13 + SPEC.md | 327 +++++ benches/__init__.py | 0 benches/ipcq_allreduce.py | 2 + benches/loader.py | 37 + benches/qkv_gemm.py | 39 + benches/qkv_gemm_multi_pe.py | 39 + docs/adr/ADR-0001-physaddr-layout.md | 108 ++ docs/adr/ADR-0002-routing-distance.md | 103 ++ docs/adr/ADR-0003-target-system-hierarchy.md | 64 + .../ADR-0004-memory-semantics-local-hbm.md | 64 + .../ADR-0005-diagram-views-distance-layout.md | 186 +++ ...6-topology-compilation-distance-diagram.md | 130 ++ docs/adr/ADR-0007-runtime-api-boundaries.md | 89 ++ .../ADR-0008-tensor-deploy-and-allocation.md | 100 ++ .../ADR-0009-kernel-execution-messaging.md | 74 ++ docs/adr/ADR-0010-cli-device-selection.md | 62 + ...R-0011-memory-addressing-simplification.md | 65 + docs/adr/ADR-0012-host-io-message-schema.md | 232 ++++ docs/adr/ADR-0013-verification_strategy.md | 139 ++ .../ADR-0014-pe-internal-execution-model.md | 364 +++++ .../adr/ADR-0015-component-port-wire-model.md | 178 +++ docs/di-presentation.md | 363 +++++ docs/diagrams/README.md | 26 + docs/diagrams/cube_view.svg | 156 +++ docs/diagrams/pe_view.svg | 31 + docs/diagrams/placement_column_wise.svg | 72 + docs/diagrams/placement_replicate.svg | 47 + docs/diagrams/placement_row_wise.svg | 72 + .../diagrams/placement_tiled_column_major.svg | 116 ++ docs/diagrams/placement_tiled_row_major.svg | 116 ++ docs/diagrams/sip_view.svg | 95 ++ docs/diagrams/system_view.svg | 19 + docs/latency-model.md | 381 ++++++ pyproject.toml | 30 + scripts/gen_placement_diagrams.py | 393 ++++++ src/kernbench/__init__.py | 0 src/kernbench/cli/main.py | 
64 + src/kernbench/cli/probe.py | 248 ++++ src/kernbench/cli/report.py | 175 +++ src/kernbench/common/__init__.py | 0 src/kernbench/common/pe_commands.py | 150 +++ src/kernbench/common/types.py | 29 + src/kernbench/components/__init__.py | 4 + src/kernbench/components/base.py | 167 +++ src/kernbench/components/context.py | 52 + src/kernbench/components/impls/__init__.py | 54 + src/kernbench/components/impls/forwarding.py | 27 + src/kernbench/components/impls/hbm_ctrl.py | 101 ++ src/kernbench/components/impls/io_cpu.py | 145 ++ src/kernbench/components/impls/m_cpu.py | 269 ++++ src/kernbench/components/impls/noc.py | 187 +++ src/kernbench/components/impls/pcie_ep.py | 27 + src/kernbench/components/impls/pe_cpu.py | 154 +++ src/kernbench/components/impls/pe_dma.py | 116 ++ src/kernbench/components/impls/pe_gemm.py | 90 ++ src/kernbench/components/impls/pe_math.py | 54 + .../components/impls/pe_scheduler.py | 245 ++++ src/kernbench/components/impls/pe_tcm.py | 25 + src/kernbench/components/impls/sram.py | 59 + src/kernbench/di/registry.py | 0 src/kernbench/policy/address/allocator.py | 85 ++ src/kernbench/policy/address/phyaddr.py | 184 +++ src/kernbench/policy/placement/dp.py | 174 +++ src/kernbench/policy/routing/router.py | 184 +++ src/kernbench/runtime_api/__init__.py | 0 src/kernbench/runtime_api/bench_runner.py | 96 ++ src/kernbench/runtime_api/context.py | 282 ++++ src/kernbench/runtime_api/kernel.py | 123 ++ src/kernbench/runtime_api/tensor.py | 166 +++ src/kernbench/runtime_api/types.py | 71 + src/kernbench/sim_engine/dummy.py | 31 + src/kernbench/sim_engine/engine.py | 298 +++++ src/kernbench/sim_engine/transaction.py | 49 + src/kernbench/topology/__init__.py | 0 src/kernbench/topology/builder.py | 965 ++++++++++++++ src/kernbench/topology/graph.py | 0 .../topology/projections/cube_view.py | 0 src/kernbench/topology/projections/pe_view.py | 0 .../topology/projections/sip_view.py | 0 src/kernbench/topology/types.py | 56 + src/kernbench/topology/visualizer.py 
| 367 +++++ src/kernbench/triton_emu/__init__.py | 11 + src/kernbench/triton_emu/registry.py | 30 + src/kernbench/triton_emu/tl_context.py | 356 +++++ tests/test_cli.py | 22 + tests/test_component_registry.py | 187 +++ tests/test_engine.py | 405 ++++++ tests/test_pe_components.py | 1175 +++++++++++++++++ tests/test_phase_a_components.py | 269 ++++ tests/test_phyaddr.py | 268 ++++ tests/test_probe.py | 221 ++++ tests/test_routing.py | 226 ++++ tests/test_tensor.py | 282 ++++ tests/test_topology_compile.py | 409 ++++++ tests/test_topology_load.py | 60 + tests/test_topology_visualize.py | 81 ++ tests/test_triton_emu.py | 349 +++++ topology.yaml | 126 ++ 109 files changed, 14909 insertions(+) create mode 100644 .claude/settings.json create mode 100644 .claude/settings.local.json create mode 100644 .editorconfig create mode 100644 .gitignore create mode 100644 .pylintrc create mode 100644 .vscode/extensions.json create mode 100644 .vscode/launch.json create mode 100644 .vscode/settings.json create mode 100644 .vscode/tasks.json create mode 100644 CLAUDE.md create mode 100644 README.md create mode 100644 SPEC.md create mode 100644 benches/__init__.py create mode 100644 benches/ipcq_allreduce.py create mode 100644 benches/loader.py create mode 100644 benches/qkv_gemm.py create mode 100644 benches/qkv_gemm_multi_pe.py create mode 100644 docs/adr/ADR-0001-physaddr-layout.md create mode 100644 docs/adr/ADR-0002-routing-distance.md create mode 100644 docs/adr/ADR-0003-target-system-hierarchy.md create mode 100644 docs/adr/ADR-0004-memory-semantics-local-hbm.md create mode 100644 docs/adr/ADR-0005-diagram-views-distance-layout.md create mode 100644 docs/adr/ADR-0006-topology-compilation-distance-diagram.md create mode 100644 docs/adr/ADR-0007-runtime-api-boundaries.md create mode 100644 docs/adr/ADR-0008-tensor-deploy-and-allocation.md create mode 100644 docs/adr/ADR-0009-kernel-execution-messaging.md create mode 100644 docs/adr/ADR-0010-cli-device-selection.md create mode 
100644 docs/adr/ADR-0011-memory-addressing-simplification.md create mode 100644 docs/adr/ADR-0012-host-io-message-schema.md create mode 100644 docs/adr/ADR-0013-verification_strategy.md create mode 100644 docs/adr/ADR-0014-pe-internal-execution-model.md create mode 100644 docs/adr/ADR-0015-component-port-wire-model.md create mode 100644 docs/di-presentation.md create mode 100644 docs/diagrams/README.md create mode 100644 docs/diagrams/cube_view.svg create mode 100644 docs/diagrams/pe_view.svg create mode 100644 docs/diagrams/placement_column_wise.svg create mode 100644 docs/diagrams/placement_replicate.svg create mode 100644 docs/diagrams/placement_row_wise.svg create mode 100644 docs/diagrams/placement_tiled_column_major.svg create mode 100644 docs/diagrams/placement_tiled_row_major.svg create mode 100644 docs/diagrams/sip_view.svg create mode 100644 docs/diagrams/system_view.svg create mode 100644 docs/latency-model.md create mode 100644 pyproject.toml create mode 100644 scripts/gen_placement_diagrams.py create mode 100644 src/kernbench/__init__.py create mode 100644 src/kernbench/cli/main.py create mode 100644 src/kernbench/cli/probe.py create mode 100644 src/kernbench/cli/report.py create mode 100644 src/kernbench/common/__init__.py create mode 100644 src/kernbench/common/pe_commands.py create mode 100644 src/kernbench/common/types.py create mode 100644 src/kernbench/components/__init__.py create mode 100644 src/kernbench/components/base.py create mode 100644 src/kernbench/components/context.py create mode 100644 src/kernbench/components/impls/__init__.py create mode 100644 src/kernbench/components/impls/forwarding.py create mode 100644 src/kernbench/components/impls/hbm_ctrl.py create mode 100644 src/kernbench/components/impls/io_cpu.py create mode 100644 src/kernbench/components/impls/m_cpu.py create mode 100644 src/kernbench/components/impls/noc.py create mode 100644 src/kernbench/components/impls/pcie_ep.py create mode 100644 
src/kernbench/components/impls/pe_cpu.py create mode 100644 src/kernbench/components/impls/pe_dma.py create mode 100644 src/kernbench/components/impls/pe_gemm.py create mode 100644 src/kernbench/components/impls/pe_math.py create mode 100644 src/kernbench/components/impls/pe_scheduler.py create mode 100644 src/kernbench/components/impls/pe_tcm.py create mode 100644 src/kernbench/components/impls/sram.py create mode 100644 src/kernbench/di/registry.py create mode 100644 src/kernbench/policy/address/allocator.py create mode 100644 src/kernbench/policy/address/phyaddr.py create mode 100644 src/kernbench/policy/placement/dp.py create mode 100644 src/kernbench/policy/routing/router.py create mode 100644 src/kernbench/runtime_api/__init__.py create mode 100644 src/kernbench/runtime_api/bench_runner.py create mode 100644 src/kernbench/runtime_api/context.py create mode 100644 src/kernbench/runtime_api/kernel.py create mode 100644 src/kernbench/runtime_api/tensor.py create mode 100644 src/kernbench/runtime_api/types.py create mode 100644 src/kernbench/sim_engine/dummy.py create mode 100644 src/kernbench/sim_engine/engine.py create mode 100644 src/kernbench/sim_engine/transaction.py create mode 100644 src/kernbench/topology/__init__.py create mode 100644 src/kernbench/topology/builder.py create mode 100644 src/kernbench/topology/graph.py create mode 100644 src/kernbench/topology/projections/cube_view.py create mode 100644 src/kernbench/topology/projections/pe_view.py create mode 100644 src/kernbench/topology/projections/sip_view.py create mode 100644 src/kernbench/topology/types.py create mode 100644 src/kernbench/topology/visualizer.py create mode 100644 src/kernbench/triton_emu/__init__.py create mode 100644 src/kernbench/triton_emu/registry.py create mode 100644 src/kernbench/triton_emu/tl_context.py create mode 100644 tests/test_cli.py create mode 100644 tests/test_component_registry.py create mode 100644 tests/test_engine.py create mode 100644 
tests/test_pe_components.py create mode 100644 tests/test_phase_a_components.py create mode 100644 tests/test_phyaddr.py create mode 100644 tests/test_probe.py create mode 100644 tests/test_routing.py create mode 100644 tests/test_tensor.py create mode 100644 tests/test_topology_compile.py create mode 100644 tests/test_topology_load.py create mode 100644 tests/test_topology_visualize.py create mode 100644 tests/test_triton_emu.py create mode 100644 topology.yaml diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 0000000..bb31422 --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,15 @@ +{ + "permissions": { + "allow": [ + "Bash(python -c \":*)", + "Bash(xargs ls -la)", + "Bash(wc -l /c/Users/ywkang/kernbench/src/kernbench/policy/address/phyaddr.py /c/Users/ywkang/kernbench/src/kernbench/runtime_api/*.py)", + "Bash(python scripts/gen_placement_diagrams.py)", + "Bash(python -c:*)", + "Bash(python -m kernbench.cli.main probe --topology topology.yaml)", + "Bash(xargs grep -l \"class.*ComponentBase\\\\|class.*DefaultComponent\")", + "Bash(python -m pytest tests/test_probe.py -v)", + "Bash(python -m pytest tests/test_probe.py tests/test_component_registry.py -v)" + ] + } +} diff --git a/.claude/settings.local.json b/.claude/settings.local.json new file mode 100644 index 0000000..fbd7d37 --- /dev/null +++ b/.claude/settings.local.json @@ -0,0 +1,36 @@ +{ + "permissions": { + "allow": [ + "Bash(python -m ruff check --select I --fix src/kernbench/runtime_api/context.py src/kernbench/runtime_api/bench_runner.py src/kernbench/cli/main.py)", + "Bash(python -m pytest tests/ -q)", + "Bash(python -m pytest tests/ -v)", + "Bash(python -m pytest tests/test_topology_load.py tests/test_cli.py -v)", + "Bash(python -c \":*)", + "Bash(python -m pytest tests/ -q --tb=no)", + "Bash(python -m pytest tests/ -v --tb=short)", + "Bash(python -m pytest tests/test_component_registry.py -v --tb=line)", + "Bash(python -m pytest tests/ 
--ignore=tests/test_component_registry.py -q --tb=no)", + "Bash(python -m pytest tests/ -q --tb=short)", + "Bash(python -m pytest --tb=short -q)", + "Bash(python -m pytest tests/test_phase_a_components.py -v)", + "Bash(python -m pytest --tb=short)", + "Bash(python -m pytest tests/ -x -q)", + "Bash(python -m pytest tests/test_probe.py::test_h2d_latency_monotonic -x -q)", + "Bash(python -m pytest tests/ -x --tb=short)", + "Bash(python -m pytest tests/ --tb=line)", + "Bash(python -m pytest tests/ --tb=short)", + "Bash(python -m kernbench probe --topology topology.yaml)", + "Bash(python -m pytest tests/ --tb=short -q)", + "Bash(python -m pytest tests/test_probe.py tests/test_component_registry.py tests/test_topology_compile.py -v)", + "Bash(python -m pytest tests/test_tensor.py tests/test_engine.py tests/test_probe.py tests/test_component_registry.py -v)", + "Bash(grep -l \"class.*CPU\" \"/c/Users/ywkang/kernbench/src/kernbench/components/impls\"/*.py)", + "Bash(grep -n \"^class \" \"/c/Users/ywkang/kernbench/src/kernbench/components/impls\"/*.py)", + "Bash(python -m pytest tests/test_engine.py tests/test_probe.py tests/test_component_registry.py -v)", + "Bash(grep -E \"\\\\.\\(py|md\\)$\")", + "Bash(python -m pytest tests/test_pe_components.py -v)", + "Bash(python -m pytest tests/test_triton_emu.py -v)", + "Bash(python -m pytest tests/test_pe_components.py tests/test_triton_emu.py -v)", + "Bash(python -m pytest tests/test_pe_components.py::test_mcpu_multi_pe_kernel_launch tests/test_pe_components.py::test_qkv_gemm_bench_multi_pe_completes -v)" + ] + } +} diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..6646a90 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,12 @@ +root = true + +[*] +charset = utf-8 +end_of_line = lf +insert_final_newline = true +trim_trailing_whitespace = true +indent_style = space +indent_size = 4 + +[*.md] +trim_trailing_whitespace = false diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8599830 
--- /dev/null +++ b/.gitignore @@ -0,0 +1,28 @@ +# OS / Editor +.DS_Store +.vscode/.history/ +*.swp + +# Python +__pycache__/ +*.py[cod] +*.pyd +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ + +# Virtualenv +.venv/ + +# Packaging +dist/ +build/ +*.egg-info/ + +# Env +.env +.env.* +!.env.example + +# Logs +*.log diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000..b936b99 --- /dev/null +++ b/.pylintrc @@ -0,0 +1,3 @@ +[MESSAGES CONTROL] +disable=missing-function-docstring +disable=missing-module-docstring diff --git a/.vscode/extensions.json b/.vscode/extensions.json new file mode 100644 index 0000000..e738d79 --- /dev/null +++ b/.vscode/extensions.json @@ -0,0 +1,16 @@ +{ + "recommendations": [ + "ms-python.python", + "ms-python.vscode-pylance", + "charliermarsh.ruff", + "ms-python.debugpy", + "eamodio.gitlens", + "usernamehw.errorlens", + "wayou.vscode-todo-highlight", + "christian-kohler.path-intellisense", + "editorconfig.editorconfig", + "ms-azuretools.vscode-docker", + "humao.rest-client", + "shd101wyy.markdown-preview-enhanced" + ] +} \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..3147e32 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,55 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "name": "Python: Current File", + "type": "python", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal", + "justMyCode": true + }, + { + "name": "Run kernbench CLI", + "type": "python", + "request": "launch", + "module": "kernbench.cli.main", + "console": "integratedTerminal", + "justMyCode": true, + "args": [ + "run", + "--topology", "topology.yaml", + "--bench", "qkv_gemm" + ], + "env": { + "PYTHONPATH": "${workspaceFolder}/src" + } + }, + { + "name": "Run KernBench Probe", + "type": "python", + "request": "launch", + "module": "kernbench.cli.main", + "console": "integratedTerminal", + "justMyCode": true, + "args": [ + "probe", + "--topology", 
"topology.yaml", + ], + "env": { + "PYTHONPATH": "${workspaceFolder}/src" + } + }, + { + "name": "Pytest: Run All (terminal)", + "type": "python", + "request": "launch", + "module": "pytest", + "args": [ + "-q" + ], + "console": "integratedTerminal", + "justMyCode": true + } + ] +} diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..70ca63f --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,39 @@ +{ + "editor.formatOnSave": true, + "editor.formatOnSaveMode": "file", + "editor.tabSize": 4, + "editor.insertSpaces": true, + "editor.rulers": [ + 100 + ], + "editor.minimap.enabled": false, + "files.eol": "\n", + "files.trimTrailingWhitespace": true, + "files.insertFinalNewline": true, + "search.useIgnoreFiles": true, + "git.autofetch": true, + "python.venvFolders": [".venv"], + "python.testing.pytestEnabled": true, + "python.testing.unittestEnabled": false, + "python.testing.autoTestDiscoverOnSaveEnabled": true, + "python.testing.pytestArgs": [ + "tests", "-q", "-vv", "-s", "--tb=short", "--no-header", + ], + "python.analysis.typeCheckingMode": "basic", + "python.analysis.autoImportCompletions": true, + "python.terminal.activateEnvironment": true, + "ruff.lint.enable": true, + "ruff.format.enable": true, + "editor.defaultFormatter": "charliermarsh.ruff", + "[python]": { + "editor.defaultFormatter": "charliermarsh.ruff" + }, + "editor.codeActionsOnSave": { + "source.fixAll": "explicit", + "source.organizeImports": "explicit" + }, + "python.analysis.extraPaths": [ + "${workspaceFolder}/src" + ] + +} diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 0000000..9f3a1e2 --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,127 @@ +{ + "version": "2.0.0", + "tasks": [ + + { + "label": "venv: create", + "type": "shell", + "command": "python -m venv .venv", + "problemMatcher": [] + }, + + { + "label": "deps: install", + "type": "shell", + "command": "${command:python.interpreterPath}", + "args": [ + "-m", + 
"pip", + "install", + "-U", + "pip", + "&&", + "${command:python.interpreterPath}", + "-m", + "pip", + "install", + "-e", + ".[dev]" + ], + "problemMatcher": [], + "dependsOn": "venv: create" + }, + + { + "label": "Run KernBench CLI", + "type": "shell", + "command": "${command:python.interpreterPath}", + "args": [ + "-m", + "kernbench.cli.main", + "run", + "--topology", + "topology.yaml", + "--bench", + "qkv_gemm" + ], + "options": { "cwd": "${workspaceFolder}" }, + "problemMatcher": [], + "presentation": { + "reveal": "silent", + "panel": "shared", + "clear": true + } + }, + + { + "label": "Run KernBench Probe", + "type": "shell", + "command": "${command:python.interpreterPath}", + "args": [ + "-m", + "kernbench.cli.main", + "probe", + "--topology", + "topology.yaml" + ], + "options": { "cwd": "${workspaceFolder}" }, + "problemMatcher": [], + "presentation": { + "reveal": "silent", + "panel": "shared", + "clear": true + } + }, + { + "label": "Pytest: Run All (terminal)", + "type": "shell", + "command": "${command:python.interpreterPath}", + "args": [ + "-m", + "pytest", + "-q", + "-s" + ], + "problemMatcher": [], + "group": "test" + }, + + { + "label": "lint", + "type": "shell", + "command": "${command:python.interpreterPath}", + "args": [ + "-m", + "ruff", + "check", + "." + ], + "problemMatcher": [] + }, + + { + "label": "format", + "type": "shell", + "command": "${command:python.interpreterPath}", + "args": [ + "-m", + "ruff", + "format", + "." + ], + "problemMatcher": [] + }, + + { + "label": "which python", + "type": "shell", + "command": "${command:python.interpreterPath}", + "args": [ + "-c", + "import sys;print(sys.executable)" + ], + "problemMatcher": [] + } + + ] +} diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..e4787e5 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,196 @@ +# Claude Code Instructions (Repo) + +This repository uses Claude Code with strict architectural and verification rules. +SPEC.md and ADRs are the source of truth. 
+ +--- + +## Terminology + +- runtime API: + Host-facing public API used by benchmarks and user code (e.g., tensor deployment, kernel launch). +- simulation engine (sim_engine): + Discrete-event engine responsible for request injection, scheduling, and completion tracking. +- components: + Device-side nodes modeling hardware behavior (IO_CPU, M_CPU, PE_CPU, routers, engines, etc.). + +## Authority & Scope + +- SPEC.md defines the architectural contract. +- ADRs (docs/adr/ADR-*.md) define non-trivial architectural decisions. +- If a change conflicts with SPEC.md or an ADR: + - STOP. + - Explain the conflict. + - Propose options (keep spec, update ADR, or narrow scope). +- Do NOT silently change architecture. +- The repository structure reflects architectural intent; Claude Code MUST respect existing module boundaries and file locations. + +--- + +## Design Questions + +- Design / architecture questions are ALWAYS allowed. +- Design questions MUST NOT modify: + - production code + - test code + - SPEC.md + - ADRs +- If a design question implies a change, default to Phase 1. + +--- + +## Change & Test Protocol (Mandatory) + +All non-trivial changes MUST follow a two-phase process. +Design discussion is always allowed; code changes are not. + +--- + +### Phase 1 — Proposal + Verification + +(No Production Code Changes) + +#### Purpose + +- Decide *what* to change and *how it will be validated* +- Establish verification coverage BEFORE touching production code + +#### Phase 1 MUST include + +1) **Design Proposal** + +- Explain the design change. +- Explain why the change is needed. +- Explain consistency with SPEC.md and relevant ADRs. + +2) **Verification Plan** + +- SPEC requirement(s) / ADR(s) affected (e.g., R1/R2/R5, ADR-0002). +- Tests that validate the change: + - existing tests to run, and/or + - new tests to add. +- Concrete input cases used by the tests: + - topology (SIP / CUBE / PE layout) + - request parameters (src, dst, size_bytes).
+- Expected observable assertions, such as: + - hop trace contains key waypoints, + - latency invariants (e.g., > 0, monotonic increase), + - deterministic route selection. + - **expected changes (or no changes) in generated diagrams**, if applicable. + +If the Verification Plan is missing or vague, STOP. + +#### Allowed in Phase 1 + +- Creating or modifying **test code only** +- Running tests and reporting results + +#### Forbidden in Phase 1 + +- Any production code changes +- Any SPEC.md or ADR modifications +- Any production diff output + +#### Phase 1 Output + +- Proposal + Verification Plan +- Tests added/modified (if any) +- Test execution results (PASS / FAIL) +- Clear recommendation: + - "No Phase 2 needed" OR + - "Await approval for Phase 2" + +--- + +### Phase 2 — Apply + Verify + Rollback + +#### Trigger + +Phase 2 is triggered ONLY by the exact user approval phrase: + +**"ok"** + +#### Phase 2 Rules + +- Output **minimal unified diffs only** +- Modify ONLY production files declared in Phase 1 +- Do NOT include explanations, comments, or unchanged code +- Automatically apply the diff to the working tree + +#### Mandatory Verification + +- Run the tests defined in the Phase 1 Verification Plan + +#### Success Path + +If ALL tests PASS: + +- Keep the applied changes +- Ensure generated diagrams (if affected) are consistent +- Report success concisely + +#### Failure Path (Mandatory) + +If ANY test FAILS: + +- Immediately rollback ALL Phase 2 changes +- Do NOT keep partial changes +- Report: + - failing test names + - error messages / assertions + - brief hypothesis of the root cause +- Return to Phase 1 state + +Tests must NEVER be weakened, removed, or altered to force Phase 2 to pass. 
+ +--- + +## What Counts as "Non-Trivial" + +(Protocol Required) + +Any of the following: + +- routing policy or ordering changes +- topology builder changes (nodes, links, parameters) +- address decoding / PhysAddr behavior +- latency composition rules +- changes affecting determinism or connectivity +- changes touching two or more production files + +--- + +## Allowed Exceptions + +(Protocol Still Required) + +- comments or docstrings +- formatting-only changes +- type annotation changes with no runtime behavior change + +In exceptions, Phase 1 MUST explicitly state: +**"No behavior change; tests unchanged."** + +--- + +## CLI Semantics + +- `kernbench run --device DEVICE_ID` runs the benchmark on a single device. +- Omitting `--device` runs the benchmark on all devices discovered in the topology (logically parallel). +- Device enumeration is handled by the CLI only; benchmarks MUST remain single-device. + +## Derived Artifacts (Clarification) + +- Generated diagrams under `docs/diagrams/` are **derived artifacts**, not production code. +- Creating or updating files in `docs/diagrams/`: + - does NOT count as a production code change, + - does NOT require Phase 2 approval, + - MUST be consistent with SPEC.md and ADRs. + +## Enforcement Defaults + +- If unsure whether a change is non-trivial → treat it as non-trivial. +- If unsure whether Phase 2 is allowed → STOP and ask. +- SPEC.md and ADRs are the final authority. +- runtime API MUST NOT hardcode topology/routing or internal hop sequences. +- sim_engine MUST remain independent of runtime API semantics (no tensor/kernel policy logic). diff --git a/README.md b/README.md new file mode 100644 index 0000000..b276a9f --- /dev/null +++ b/README.md @@ -0,0 +1,13 @@ +# Python Project (VS Code Template) + +## Quick start +1. Create venv + install dev deps (editable): + - VS Code: Run Task → `deps: install` +2. Run tests: + - VS Code: Run Task → `Pytest: Run All (terminal)` +3.
Lint / format: + - `lint`, `format` tasks + +## Structure +- `src/` app code +- `tests/` pytest diff --git a/SPEC.md b/SPEC.md new file mode 100644 index 0000000..e881bbe --- /dev/null +++ b/SPEC.md @@ -0,0 +1,327 @@ +# KernBench System-Level Simulator — SPEC + +This document defines the architectural contract for the KernBench +system-level discrete-event simulator for our AI Accelerator SIP-based systems. +All implementations, tests, and changes MUST conform to this SPEC. + +--- + +## 0. Goal + +Build a **system-level, discrete-event simulator** to evaluate the performance of +**LLM kernels running on our AI Accelerator SIP-based systems**, under varying +**SIP architectures, topologies, and interconnect configurations**. + +The simulator models **data-movement and control paths across the full hardware +hierarchy** and computes **end-to-end execution latency** for kernel executions +dispatched to Processing Elements (PEs). + +Primary objectives: + +- compare LLM kernel execution latency under different system configurations +- model PE↔HBM, PE↔PE, CUBE↔CUBE, and SIP↔SIP communication and control paths +- guarantee deterministic, verifiable behavior with strong debuggability +- support visual inspection of the modeled system at multiple abstraction levels + +--- + +## 0.1 Golden Invariants (Must NOT be violated) + +- End-to-end latency is computed **strictly by explicit traversal** over modeled + components and links. +- Every routed request MUST incur **latency > 0**. +- Routing decisions MUST be **deterministic** given + (topology + routing policy + request). +- All valid request flows MUST have explicit connectivity in the model. +- No hidden shortcuts, implicit bypasses, or magic paths are allowed. +- Architectural decisions documented in ADRs override local optimizations. + +--- + +## 0.2 Architectural References (ADRs) + +Major architectural decisions are documented in ADRs and referenced by number. 
+ +- ADR-0001: PhysAddr layout & address decoding contract +- ADR-0002: Routing distance, ordering, and bypass rules +- ADR-0003: Target system hierarchy & modeling scope (Tray / SIP / CUBE / PE / IO chiplet) +- ADR-0004: Memory semantics & local-HBM bandwidth guarantee contract +- ADR-0005: Diagram views (SIP / CUBE / PE) and distance-aware layout rules +- ADR-0006: Topology compilation, distance extraction, and automatic diagram generation +- ADR-0007: runtime_api vs sim_engine responsibility boundaries +- ADR-0008: Tensor deployment and allocation (Host allocator, PA-first) +- ADR-0009: Kernel execution fan-out and completion semantics +- ADR-0010: CLI device selection and multi-device execution semantics +- ADR-0011: Memory addressing simplification (PA-first) +- ADR-0012: Host ↔ IO_CPU message schema (PA-first, PE-tagged shards) +- ADR-0013: Verification strategy and Phase 1 test plan + +SPEC MUST remain consistent with accepted ADRs. + +--- + +## 1. Core Requirements + +### R1. Correct Routing and Control Path + +- A request MUST traverse the correct sequence of components based on: + - source location, + - destination address or placement tags, + - routing policy and available topology connectivity. +- Local vs remote traffic MUST be distinguishable: + - same SIP vs different SIP, + - same CUBE vs different CUBE, + - (optional) same PE-group vs cross PE-group. +- Routing behavior MUST be reproducible and deterministic. + +--- + +### R2. Latency is Computed by Traversal + +End-to-end latency is the sum of: + +- per-node fixed latency (processing / router delay), +- per-link latency (fixed and/or size-aware serialization: bytes / BW), +- per-service latency (e.g., memory controller service time). + +The simulator MUST: + +- support both fixed and size-aware latency, +- emit hop-by-hop traces with timestamps and component identifiers. + +--- + +### R3. Topology is Configurable and Variable + +Topology MUST NOT be hardcoded. 
+ +The simulator MUST accept multiple topologies (YAML / JSON / dict), varying: + +- SIP count, +- CUBE count per SIP, +- PE count per CUBE, +- on-chip fabric structure (e.g., mesh / NoC / XBAR), +- IO chiplets and interconnects, +- link bandwidth, latency, and capacity parameters. + +Given a topology: + +- all required request flows MUST have valid connectivity, +- missing links are a topology construction error, not a routing error. + +--- + +### R4. DI-First Component Design (Swappable Implementations) + +All components MUST be replaceable behind stable interfaces, including: + +- routers and fabrics (NoC, bridges, switches), +- XBAR-like selectors, +- DMA engines and queues, +- memory controllers and services (HBM, TCM, queues), +- management and control processors (modeled components). + +The simulator MUST: + +- use dependency injection (DI) to bind node specifications to implementation classes, +- allow component swapping without changing test logic, +- avoid leaking routing or policy logic into unrelated components. + +--- + +### R5. Multi-Domain Communication Modeling + +The simulator MUST model communication across hierarchical domains, including: + +- PE ↔ local HBM +- PE ↔ remote HBM in the same CUBE +- PE ↔ remote HBM in other CUBEs within the same SIP +- PE ↔ remote HBM in other SIPs +- PE ↔ PE messaging (e.g., IPCQ) +- PE ↔ IO chiplets +- CUBE ↔ CUBE (e.g., via UCIe) +- SIP ↔ SIP (e.g., via PCIe or UAL) + +Policy-based bypass is allowed ONLY if: + +- the bypass path is explicitly represented in the model, +- the bypass incurs non-zero latency, +- the bypass is visible in traces and diagrams. + +--- + +### R6. Verification-Driven Development + +Development MUST follow a verification-driven workflow: + +- behavior is validated by tests with meaningful input cases, +- tests encode SPEC-defined invariants, not incidental implementation details, +- changes without clear verification coverage are not allowed. + +--- + +### R7.
Runtime API + +The simulator MUST provide a host-facing runtime API that: + +- exposes tensor deployment and kernel execution operations, +- submits requests only to endpoint components (e.g., IO_CPU), +- owns host-side tensor handles and allocation metadata as PA shard maps, +- remains topology-agnostic and does not perform routing or fan-out. + +Tensor deployment in Phase 0 produces **device physical-address (PA) shard mappings**. +Each shard explicitly identifies its target `(sip, cube, pe)` and PA range. +No separate host-visible allocation RPC (e.g., AllocateTensorMeta) exists. + +--- + +### R8. Simulation Engine + +The simulator MUST include a discrete-event simulation engine that: + +- injects requests into the system graph, +- schedules events deterministically, +- tracks completion via correlation identifiers, +- decomposes runtime API operations into explicit graph requests + (e.g., MemoryWrite, MemoryRead, KernelLaunch). + +--- + +### R9. CLI Execution Semantics + +The CLI MUST support executing benchmarks: + +- on a specified device. + +Benchmarks are executed once per invocation within a single simulation instance. +If multiple devices are present in the topology, a benchmark MAY interact with +multiple devices internally, but the CLI does not launch multiple independent +benchmark instances by default. + +--- + +### R10. Memory Addressing (Phase 0) + +In Phase 0, the simulator uses a **PA-first memory model**: + +- All memory operations use device physical addresses (PA) only. +- Virtual addressing, MMU/IOMMU, and address translation latency are out of scope. +- Tensor placement is represented as a list of PA shards, each explicitly tagged + with `(sip, cube, pe)`. + +All memory access latency MUST be modeled explicitly via graph traversal. +No implicit translation or hidden latency is allowed. + +--- + +## 2.
Model Concepts + +### 2.1 Graph Execution Model + +- Nodes represent modeled components (PE blocks, XBAR, NoC, bridges, + HBM controllers, IO components, etc.). +- Directed edges represent interconnect links with latency and bandwidth attributes. +- Execution model: + - a node receives a request, + - incurs node or service latency, + - emits the request to the next hop via a link, + - repeats until the destination service completes. + +--- + +### 2.2 Routing + +Routing MAY be implemented as: + +- policy-based routing (code-driven), +- routing tables (config-driven), +- topology-driven routing (e.g., mesh XY), +- or a hybrid approach. + +Routing MUST: + +- consume decoded address domains or explicit placement tags, +- operate only on explicit topology connectivity, +- remain deterministic. + +Kernel execution requests reference tensors via PA shard mappings. +Each shard explicitly identifies its target PE, allowing IO_CPU to +deterministically fan-out execution without relying on PA decoding. + +--- + +## 3. Inputs and Identity + +### 3.1 Node Identity Scheme + +Nodes MUST have stable, parsable identifiers sufficient for domain inference +and trace-based debugging. + +Example patterns: + +- `tray.host_cpu` +- `sip{S}.io{I}.pcie_ep` +- `sip{S}.cube{C}.fabric` +- `sip{S}.cube{C}.pe{P}` +- `sip{S}.cube{C}.hbm_ctrl` + +--- + +### 3.2 Link Specifications + +A link MAY include: + +- fixed latency (ns), +- bandwidth (GB/s) for serialization latency, +- optional capacity for contention modeling. + +Topology builders MUST ensure: + +- required links exist, +- link parameters are consistent with topology intent. + +--- + +## 4. Output, Debuggability, and Diagrams + +The simulator MUST provide: + +- per-request hop-by-hop traces with timestamps, +- clear error messages for missing connectivity + (e.g., "no link for A → B"), +- reproducible, inspectable representations of the modeled system. 
+ +Diagrams are **derived artifacts** of the simulator model: + +- They MUST be generatable from the **compiled topology** and **distance metadata** + used by execution and routing. +- Generation MAY be performed lazily or cached by the implementation, + as long as outputs remain consistent with the compiled topology. + +Diagram abstraction levels and distance-aware layout rules are defined in ADR-0005. +Automatic diagram generation and output conventions are defined in ADR-0006. + +By default, generated diagrams are written under: + +- `docs/diagrams/` + +--- + +## 5. Non-Goals (for now) + +The following are explicitly out of scope: + +- cycle-accurate microarchitecture modeling, +- detailed cache coherence protocols, +- full PCIe / CXL protocol correctness. + +These MAY be layered later via additional components and policies. + +--- + +## 6. Decision Boundaries + +- SPEC.md defines architectural intent and invariants. +- Code implements SPEC and MUST NOT introduce hidden invariants. +- Tests validate SPEC-defined behavior and MUST NOT encode fixed topology assumptions. +- ADRs record non-trivial architectural decisions and MUST be referenced when relevant. diff --git a/benches/__init__.py b/benches/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/benches/ipcq_allreduce.py b/benches/ipcq_allreduce.py new file mode 100644 index 0000000..99e5217 --- /dev/null +++ b/benches/ipcq_allreduce.py @@ -0,0 +1,2 @@ +def run(ctx): + print("IPCQ all reduce kernel bench") diff --git a/benches/loader.py b/benches/loader.py new file mode 100644 index 0000000..e78e1a2 --- /dev/null +++ b/benches/loader.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +import importlib +from collections.abc import Callable +from typing import Any + +from kernbench.runtime_api.context import RuntimeContext + +BenchFn = Callable[[RuntimeContext], Any] + + +def resolve_bench(bench_id: str) -> BenchFn: + """ + Resolve a bench id into a callable bench function. 
+ + Expected layout (repo root): + benches/<bench_id>.py + def run(ctx: RuntimeContext) -> Any + """ + bench_id = bench_id.strip() + if not bench_id: + raise ValueError("Bench id is empty.") + + module_path = f"benches.{bench_id}" + + try: + mod = importlib.import_module(module_path) + except ModuleNotFoundError as e: + raise ValueError(f"Unknown bench '{bench_id}'. Expected module {module_path}.py") from e + + run_fn = getattr(mod, "run", None) + if run_fn is None: + raise ValueError(f"Bench module {module_path} must define a 'run(ctx)' function.") + if not callable(run_fn): + raise ValueError(f"'run' in {module_path} is not callable.") + + return run_fn diff --git a/benches/qkv_gemm.py b/benches/qkv_gemm.py new file mode 100644 index 0000000..7c92569 --- /dev/null +++ b/benches/qkv_gemm.py @@ -0,0 +1,39 @@ +"""QKV GEMM benchmark: Q*K^T projection on a single PE. + +Demonstrates the full host-to-PE kernel launch pipeline: + Host → PCIE_EP → IO_CPU → M_CPU → NOC → PE_CPU → PE_SCHEDULER → engines + +Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait() + - Tensor a is loaded into TCM via DMA + - Tensor b stays in HBM; PE_SCHEDULER streams it per-tile (32x64x32) +""" +from kernbench.policy.placement.dp import DPPolicy + +# GEMM dimensions: (M, K) x (K, N) → (M, N) +M, K, N = 128, 256, 128 +DTYPE = "f16" + + +def _gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"): + """QKV GEMM kernel: out = a @ b. + + a is loaded into TCM (DMA_READ). + b is referenced in HBM (tl.ref, no DMA — scheduler streams per-tile).
+ """ + a = tl.load(a_ptr, shape=(M, K), dtype=DTYPE) + b = tl.ref(b_ptr, shape=(K, N), dtype=DTYPE) + handle = tl.composite(op="gemm", a=a, b=b, out_ptr=out_ptr) + tl.wait(handle) + + +def run(ctx): + """Run the QKV GEMM benchmark.""" + # DP placement: a=replicate (cube-level), b/out=column_wise (N-axis, single PE) + a = ctx.zeros((M, K), dtype=DTYPE, dp=DPPolicy(cube="replicate", pe="replicate"), name="a") + b = ctx.zeros((K, N), dtype=DTYPE, dp=DPPolicy(cube="replicate", pe="column_wise"), name="b") + out = ctx.empty( + (M, N), dtype=DTYPE, dp=DPPolicy(cube="replicate", pe="column_wise"), name="out", + ) + + # Launch GEMM kernel + ctx.launch("qkv_gemm", _gemm_kernel, a, b, out, M, K, N) diff --git a/benches/qkv_gemm_multi_pe.py b/benches/qkv_gemm_multi_pe.py new file mode 100644 index 0000000..2b7bd87 --- /dev/null +++ b/benches/qkv_gemm_multi_pe.py @@ -0,0 +1,39 @@ +"""QKV GEMM benchmark: Q*K^T projection on all PEs in a cube (multi-PE). + +Column-parallel GEMM: a is replicated (cube-level), b/out are column-sharded. +M_CPU fans out KernelLaunchMsg to all 8 PE_CPUs (ADR-0009 D3). + +Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait() + - Tensor a is loaded into TCM via DMA + - Tensor b stays in HBM; PE_SCHEDULER streams it per-tile (32x64x32) +""" +from kernbench.policy.placement.dp import DPPolicy + +# GEMM dimensions: (M, K) x (K, N) -> (M, N) +M, K, N = 128, 256, 128 +DTYPE = "f16" + + +def _gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"): + """QKV GEMM kernel: out = a @ b. + + a is loaded into TCM (DMA_READ). + b is referenced in HBM (tl.ref, no DMA -- scheduler streams per-tile). 
+ """ + a = tl.load(a_ptr, shape=(M, K), dtype=DTYPE) + b = tl.ref(b_ptr, shape=(K, N), dtype=DTYPE) + handle = tl.composite(op="gemm", a=a, b=b, out_ptr=out_ptr) + tl.wait(handle) + + +def run(ctx): + """Run the multi-PE QKV GEMM benchmark.""" + # DP placement: a=replicate (cube-level), b/out=column_wise (N-axis split) + a = ctx.zeros((M, K), dtype=DTYPE, dp=DPPolicy(cube="replicate", pe="replicate"), name="a") + b = ctx.zeros((K, N), dtype=DTYPE, dp=DPPolicy(cube="replicate", pe="column_wise"), name="b") + out = ctx.empty( + (M, N), dtype=DTYPE, dp=DPPolicy(cube="replicate", pe="column_wise"), name="out", + ) + + # Launch GEMM kernel on all PEs + ctx.launch("qkv_gemm_multi", _gemm_kernel, a, b, out, M, K, N) diff --git a/docs/adr/ADR-0001-physaddr-layout.md b/docs/adr/ADR-0001-physaddr-layout.md new file mode 100644 index 0000000..9158f7a --- /dev/null +++ b/docs/adr/ADR-0001-physaddr-layout.md @@ -0,0 +1,108 @@ +# ADR-0001: PhysAddr Layout & Address Decoding Contract + +## Status + +Accepted + +## Date + +2026-02-27 + +## Context + +KernBench Graph Latency Simulator must route requests deterministically and compute end-to-end latency strictly by graph traversal. +To model local vs remote traffic (same/different SIP, same/different CUBE, optional PE-group), requests need a stable, parsable address/location scheme that: + +- can be decoded into routing domains (SIP/CUBE/HBM/PE-resource, etc.) +- remains topology-agnostic (no hardcoded counts) +- supports swappable policy and DI-first components without leaking topology assumptions into node implementations + +## Decision + +We define a **PhysAddr value object** and an **address decoding contract** that converts an integer address into routing domains. + +### D1. PhysAddr is an immutable value object + +- PhysAddr is immutable and comparable as a pure value. +- Any allocator returns a **fully specified PhysAddr** (not partial metadata). +- No global state may be required to interpret a PhysAddr. + +### D2. 
PhysAddr fields (logical contract) + +PhysAddr must be able to represent at least: + +- `rack_id` (optional but reserved for scale-out) +- `sip_id` (device / SIP domain) +- `sip_seg` (SIP-level segment/window selection, e.g., cube window) +- `local_offset` (offset within the chosen segment/window) + +Decoded/derived fields may include (optional): + +- `cube_id` +- `kind` (e.g., HBM vs PE-resource vs raw) +- `unit_type` / `pe_id` (if PE-level addressing is modeled) + +**Important:** The exact bit allocation may evolve, but the *semantic fields above* must remain decodable without hidden assumptions. + +### D3. Decoding is deterministic and policy-compatible + +- Decoding must deterministically map an integer address to: + - destination SIP domain (`sip_id`) + - destination sub-domain (`cube_id` if applicable) + - destination target kind (HBM/PE-resource/other) +- Decoding must not depend on runtime topology sizes; it may depend on **explicit topology parameters** provided through configuration (e.g., segment size, slice size), and those parameters must live in the topology/config layer (not in random components). + +### D4. Topology-derived constants live in the topology layer + +Constants such as segment sizes (e.g., HBM slice size / window size) are derived from topology configuration (YAML/JSON/dict) and are provided to the decoder via DI/config. +They must not be hardcoded in node implementations. + +### D5. Routing consumes decoded domains, not raw bits + +Routing policy uses decoded domains: + +- `src` location (sip/cube/pe or node_id) +- `dst` domains derived from PhysAddr decoding +- `size_bytes` for size-aware link latency +Routing must not inspect raw bit-fields directly except inside the decoding module. + +## Alternatives Considered + +1) **Use raw integers everywhere, decode ad-hoc in routing** + +- Rejected: leads to duplicated logic, inconsistent routing, and hidden assumptions embedded in multiple components. 
+ +2) **Hardcode topology sizes (SIP/CUBE/PE counts) into decoding** + +- Rejected: violates SPEC (R3) and breaks swappability and configuration-driven topologies. + +3) **Put decoding inside memory controllers or routers** + +- Rejected: leaks policy into components and undermines DI-first, swappable implementations (SPEC R4). + +## Consequences + +### Positive + +- Deterministic routing domains enable clear test invariants for local vs remote paths (SPEC R1, R5). +- Keeps topology variability (SPEC R3) while preserving consistent semantics. +- DI-first: decoder can be swapped or extended without changing components or tests (SPEC R4). + +### Tradeoffs / Costs + +- Requires explicit configuration for any topology-derived sizes. +- Introduces a single “blessed” decoding module that must remain stable and well-tested. + +## Implementation Notes (Non-normative) + +- Recommended module boundary: + - `src/kernbench/policy/address/phyaddr.py` + +- Tests should cover: + - deterministic decoding + - local vs remote classification from decoded fields + - invariants: “allocator returns full PhysAddr”, “decoding requires no global state” + +## Links + +- SPEC.md: R1 (routing), R3 (configurable topology), R4 (DI-first), R5 (multi-domain comm) diff --git a/docs/adr/ADR-0002-routing-distance.md b/docs/adr/ADR-0002-routing-distance.md new file mode 100644 index 0000000..2c28f41 --- /dev/null +++ b/docs/adr/ADR-0002-routing-distance.md @@ -0,0 +1,103 @@ +# ADR-0002: Routing Distance, Ordering & Bypass Rules + +## Status +Accepted + +## Date +2026-02-27 + +## Context +The KernBench Graph Latency Simulator must compare kernel execution time +across different architectures and topologies by computing end-to-end +latency from graph traversal.
+ +To support meaningful comparison: +- routing must be deterministic +- latency must reflect actual interconnect structure +- local vs remote traffic must be distinguishable +- “bypass” optimizations must not undermine debuggability or correctness + +The simulator also aims to avoid software-managed metadata and hidden +shortcuts that obscure control paths. + +## Decision + +### D1. Distance is accumulated latency, not hop count +- Routing “distance” is defined as the **sum of per-node and per-link latency**. +- Hop count alone must not be used for ordering or path selection. +- Size-aware serialization latency (bytes / BW) contributes to distance. + +### D2. Routing order is derived from graph traversal +- The chosen route is the path with minimum accumulated latency + given the constructed graph and routing policy. +- Deterministic ordering must be guaranteed for identical inputs + (topology + policy + request). + +### D3. Bypass is explicit and graph-represented +- Any bypass (e.g., local cube HBM access via XBAR instead of NOC) must be: + - explicitly represented as a graph path, and + - subject to latency accumulation like any other path. +- Example: PE_DMA has dual egress — one to XBAR (HBM path) and one to NOC (non-HBM path). + Both are explicit graph edges; neither is a “bypass” — they are distinct data paths + serving different memory domains. +- Implicit or “magic” bypass paths are disallowed. + +### D4. No zero-latency end-to-end paths + +- Every routed request must incur **end-to-end** latency > 0. +- Individual fabric segments (e.g., NOC hops) MAY have distance_mm = 0 + when the fabric is distributed and distance is not meaningful at that granularity. + This is allowed because other components on the same path (e.g., PE_DMA, SRAM, + UCIe endpoints) contribute non-zero latency, ensuring the end-to-end invariant holds. +- Fully zero-latency end-to-end paths are disallowed, except for explicit + test-only stubs clearly marked as such. + +### D5. 
Policy vs topology responsibility split +- Topology builder: + - defines nodes and links and their latency/BW parameters +- Routing policy: + - selects among available graph paths based on decoded domains +- Routing policy must not assume missing links; missing connectivity + is a topology construction error. + +### D6. No software-managed routing metadata +- Routing decisions must not rely on per-request software-managed metadata + that tracks distance, hop count, or ordering outside the graph model. +- All distance/order computation is derived from traversal itself. + +## Alternatives Considered + +1) **Hop-count based routing** +- Rejected: ignores heterogeneous latency/BW and misrepresents + architectural differences. + +2) **Implicit local shortcuts** +- Rejected: breaks debuggability and violates traversal-based latency. + +3) **Software-managed distance metadata** +- Rejected: increases control overhead and obscures routing semantics. + +## Consequences + +### Positive +- Clear, debuggable hop-by-hop traces (SPEC R2, R4). +- Architecture comparisons reflect real interconnect structure. +- Routing behavior is reproducible and deterministic. + +### Tradeoffs / Costs +- Graph construction must be correct and complete. +- Bypass modeling requires explicit graph representation, + which slightly increases topology description complexity. + +## Implementation Notes (Non-normative) +- Recommended responsibilities: + - Graph builder: ensure all required paths exist. + - Router: select next hop based on decoded domains and policy. 
+- Tests should assert: + - non-zero end-to-end latency + - deterministic routing for identical inputs + - bypass paths appear explicitly in emitted traces + +## Links +- SPEC.md: R1 (routing), R2 (latency), R3 (topology), R5 (multi-domain comm) +- ADR-0001: PhysAddr layout & decoding contract diff --git a/docs/adr/ADR-0003-target-system-hierarchy.md b/docs/adr/ADR-0003-target-system-hierarchy.md new file mode 100644 index 0000000..4a685d8 --- /dev/null +++ b/docs/adr/ADR-0003-target-system-hierarchy.md @@ -0,0 +1,64 @@ +# ADR-0003: Target System Hierarchy & Modeling Scope + +## Status + +Accepted + +## Context + +We need a system-level simulator to evaluate LLM kernel performance on our AI Accelerator platform. +The platform is organized as a compute tray containing multiple identical SIPs connected via PCIe or UAL +through switching fabrics, with a host CPU issuing commands/kernels. + +## Decision + +We model the system hierarchy explicitly: + +### D1. Tray-level + +- A compute tray contains: + - Host CPU (issues requests / coordinates runtime & data placement) + - Multiple identical SIPs (accelerators) + - Interconnect fabric between SIPs (PCIe and/or UAL via switches) + +### D2. SIP-level + +- A SIP is a multi-die package composed of: + - Multiple CUBEs (HBM die + compute PEs + UCIe) + - One or more IO chiplets (host/SIP interfaces) +- IO chiplets: + - provide interfaces: PCIe-EP, IO_CPU, optionally UAL-EP + - can be multiple per SIP + - placement constrained to SIP shoreline (top/bottom/left/right); each shoreline may host 1–2 IO chiplets + +### D3. 
CUBE-level + +- A CUBE contains: + - HBM + memory controller (HBM_CTRL) + - XBAR (top/bottom): HBM pseudo-channel crossbar, PE's dedicated path to HBM + - Bridge (left/right): connects XBAR.top ↔ XBAR.bottom for cross-half HBM access + - NOC: distributed on-die fabric spanning the entire cube (distance modeled as 0); + carries non-HBM traffic including inter-cube (UCIe), command (M_CPU↔PE_CPU), and shared SRAM access + - Shared SRAM: cube-level shared memory accessible by all PEs via NOC + - management/control CPU (M_CPU) coordinating PE command distribution and completion aggregation + - multiple PEs + - up to 4 UCIe endpoints (N/E/W/S) for CUBE↔CUBE and CUBE↔IO connectivity + +### D4. PE-level + +- A PE can execute one kernel instance +- PE contains internal control + accelerators (modeled at PE view granularity): + - PE_CPU, command handler, PE_TCM, DMA/GEMM/MATH engines, internal queues + +## Consequences + +- The simulator supports abstraction by “views”: + - SIP view hides PE internals + - CUBE view treats each PE as a single block + - PE view expands PE internals +- Topology remains parameterized; sizes/counts/links come from configuration. + +## Links + +- SPEC R3/R5 +- ADR-0005 (diagram views) diff --git a/docs/adr/ADR-0004-memory-semantics-local-hbm.md b/docs/adr/ADR-0004-memory-semantics-local-hbm.md new file mode 100644 index 0000000..ed91e7d --- /dev/null +++ b/docs/adr/ADR-0004-memory-semantics-local-hbm.md @@ -0,0 +1,64 @@ +# ADR-0004: Memory Semantics & Local-HBM Bandwidth Guarantee + +## Status + +Accepted + +## Context + +Accurately modeling PE↔HBM behavior is essential for kernel latency estimation. +Each PE has a notion of “local HBM” that must guarantee full HBM bandwidth, independent of intervening on-die fabric bandwidth. + +## Decision + +### D1. Local HBM definition + +- Each PE is assigned a logically defined “local HBM” region. 
+- Local HBM corresponds to the pseudo-channel subset directly attached to that PE’s DMA path + via the XBAR (top or bottom, depending on PE corner placement). +- The path is: PE_DMA → XBAR.top/bottom → HBM_CTRL. +- The mapping (HBM pseudo-channels → PE local regions) is derived from topology configuration. + +### D2. Local HBM bandwidth guarantee contract + +- Accesses from a PE to its local HBM MUST guarantee full HBM read/write bandwidth + independent of intervening fabric bandwidth limits. +- This guarantee is modeled by: + - a dedicated logical path and/or service model that enforces HBM BW at the PE-local-HBM interaction point, + - while still incurring non-zero latency along explicitly modeled components. + +### D3. Cross-half HBM semantics + +- A PE connected to XBAR.bottom that accesses HBM pseudo-channels on the XBAR.top half + (or vice versa) traverses a bridge: + - PE_DMA → XBAR.bottom → bridge → XBAR.top → HBM_CTRL +- Bridge bandwidth may limit cross-half HBM access relative to local-half access. + +### D4. Non-local HBM semantics (inter-cube / inter-SIP) + +- Accesses from a PE to HBM in a different cube or SIP MAY be limited by: + - NOC bandwidth within the cube, + - inter-cube UCIe links, + - inter-SIP fabric (PCIe/UAL). +- These paths MUST be explicit and traceable. + +### D5. Shared SRAM semantics + +- Each CUBE contains a shared SRAM accessible by all PEs in that CUBE. +- Access path: PE_DMA → NOC → shared SRAM. +- Shared SRAM bandwidth is limited by the NOC↔SRAM link bandwidth. +- Shared SRAM is not part of the HBM address space; it is a separate memory domain. 
+ +## Verification Notes + +Tests should cover: + +- local-HBM case: BW matches HBM BW regardless of fabric BW parameter +- cross-half HBM case: latency includes bridge traversal +- non-local cases (inter-cube/inter-SIP): BW/latency respond to fabric/link parameters +- shared SRAM case: access via NOC with correct BW + +## Links + +- SPEC R2/R5 +- ADR-0002 (distance/order & explicit bypass) diff --git a/docs/adr/ADR-0005-diagram-views-distance-layout.md b/docs/adr/ADR-0005-diagram-views-distance-layout.md new file mode 100644 index 0000000..918afbe --- /dev/null +++ b/docs/adr/ADR-0005-diagram-views-distance-layout.md @@ -0,0 +1,186 @@ +# ADR-0005: Diagram Views & Distance-Aware Layout Rules + +## Status + +Accepted + +## Context + +We require verifiable and inspectable system modeling for a large-scale, +parameterized AI Accelerator system. + +Humans must be able to: + +- visually inspect the modeled topology, +- reason about communication structure and relative distance, +- do so at multiple abstraction levels without being overwhelmed by detail. + +The simulator models distance (accumulated latency) as a first-class concept. +Diagrams must reflect this distance by default. + +--- + +## Global Defaults + +- All diagrams MUST be **distance-aware by default**. +- All diagrams MUST render **representative views** of the architecture. +- Instance indices (e.g., sip0, cube2, pe3) MUST NOT be required for diagram generation. +- Instance indices MAY be used ONLY: + - to define a distance anchor in asymmetric or debugging scenarios, or + - when explicitly requested. + +--- + +## Representative Rendering Rule + +- All CUBEs share the same internal structure. +- All PEs share the same internal structure. + +Therefore: + +- SIP-level diagrams render representative CUBEs and IO chiplets. +- CUBE-level diagrams render representative PEs as opaque blocks. +- PE-level diagrams render a representative PE with fully expanded internals. 
+ +Diagrams MUST NOT depend on specific SIP, CUBE, or PE indices +unless explicitly requested. + +--- + +## Diagram Views + +### View A — SIP-Level Diagram + +**Purpose** +Explain system-scale structure and connectivity. + +**Visible elements** + +- SIP boundaries (optional) +- CUBEs (opaque blocks) +- IO chiplets (opaque blocks) +- Optional UCIe stubs only if needed to clarify connectivity + +**Hidden elements** + +- PE internals +- CUBE internal fabric +- IO chiplet internals + +**Visible links** + +- Host ↔ IO chiplets (PCIe) +- SIP ↔ SIP (PCIe / UAL via switches) +- IO ↔ CUBE (on-package links) + +--- + +### View B — CUBE-Level Diagram + +**Purpose** +Explain cube-internal structure and data/control flow. + +**Visible elements** + +- XBAR (top/bottom): HBM pseudo-channel crossbar +- Bridge (left/right): cross-half HBM connectors between XBAR.top and XBAR.bottom +- NOC: distributed on-die fabric for non-HBM traffic +- HBM subsystem (HBM_CTRL) +- Shared SRAM: cube-level shared memory +- Management CPU (M_CPU) +- PEs as opaque blocks (PE[0..N−1]) +- UCIe endpoints (N/E/W/S) as ports + +**Hidden elements** + +- PE internals + +**Visible links** + +- PE → XBAR (HBM data path, top or bottom by corner placement) +- PE → NOC (non-HBM data path) +- XBAR ↔ bridge ↔ XBAR (cross-half HBM access) +- XBAR → HBM_CTRL +- NOC ↔ UCIe endpoints +- NOC ↔ shared SRAM +- M_CPU ↔ NOC (command path) +- NOC → PE_CPU (command delivery, collapsed into PE block) + +--- + +### View C — PE-Level Diagram + +**Purpose** +Explain internal PE behavior and execution structure. + +**Visible elements** + +- PE_CPU +- Command handler / scheduler +- PE_TCM (local SRAM) +- HW accelerators (DMA, GEMM, MATH, etc.) 
+- Local HBM interface +- Optional IPCQ / messaging endpoints + +**Visible links** + +- Control paths (CPU → scheduler → engines) +- Data paths (engines ↔ TCM, DMA ↔ local HBM) +- External fabric ports as abstract ports only + +--- + +## Distance-Aware Layout (Default) + +### Distance definition + +- Distance is defined as **accumulated latency**, consistent with ADR-0002. +- Distance is computed from a single anchor node. + +### Default anchor selection + +- SIP view: IO chiplet (or Host CPU if present) +- CUBE view: a representative PE +- PE view: PE_CPU or Command Handler + +Anchors are **implicit defaults** and MUST NOT be required to be specified. + +### Layout rules + +- Diagrams MUST be laid out in layers based on distance buckets. +- Layout direction MUST be consistent within a view type + (preferred: left-to-right). +- Nodes with equal distance MUST have stable ordering + (by role or identifier, deterministically). + +Cycles MAY be rendered using dashed or curved edges for readability, +without affecting distance semantics. + +--- + +## Generation Contract (for Tools / Claude Code) + +When generating diagrams: + +- Assume distance-aware layout by default. +- Assume representative rendering by default. +- Do NOT ask for SIP/CUBE/PE indices unless required. +- Do NOT expand hidden abstraction levels. +- Prefer architectural clarity over micro-hop fidelity. + +--- + +## Consequences + +- Diagrams are stable across topology scaling. +- Changes in distance or routing policy are reflected visually. +- Diagrams serve as verifiable artifacts derived from the simulator model, + not as hand-maintained documentation. 
+ +--- + +## Links + +- SPEC Section 4 (Output, Debuggability, and Diagrams) +- ADR-0002 (Routing distance semantics) +- ADR-0006 (Topology compilation & automatic diagram generation) diff --git a/docs/adr/ADR-0006-topology-compilation-distance-diagram.md b/docs/adr/ADR-0006-topology-compilation-distance-diagram.md new file mode 100644 index 0000000..b9c8fe1 --- /dev/null +++ b/docs/adr/ADR-0006-topology-compilation-distance-diagram.md @@ -0,0 +1,130 @@ +# ADR-0006: Topology Compilation, Distance Extraction, and Automatic Diagram Generation + +## Status + +Accepted + +## Context + +The simulator compiles topology configuration (e.g., topology.yaml) into an explicit model graph, +and computes routing and accumulated latency (distance). +Diagrams should be generated from these authoritative artifacts to ensure consistency and avoid +hand-maintained topology drawings. + +Additionally, for usability, diagrams should be emitted automatically into a stable location +so that developers can preview them immediately in the repository. + +--- + +## Decision + +### D1. Topology compilation is the single source of truth + +- topology.yaml (or equivalent config) is compiled into: + - an explicit system graph, + - node/link attributes, + - routing policies. +This compiled graph is the authoritative representation of the system. + +### D2. Distance extraction during compilation + +- During or immediately after topology compilation, the simulator MUST compute distance metadata + (accumulated latency) consistent with ADR-0002. +- Distance metadata MUST be sufficient to support distance-aware diagram layout as defined in ADR-0005. +- Distributed fabric segments (e.g., NOC) MAY have distance_mm = 0 per ADR-0002 D4; + layout placement for such nodes uses explicit position metadata rather than distance buckets. + +### D3. 
Diagram generation is a derived artifact + +- Diagrams MUST be generated from: + - the compiled topology graph, + - extracted distance metadata, + - view/layout rules defined in ADR-0005. +- Diagram generation MUST NOT require additional hand-written topology descriptions. + +### D4. Automatic diagram emission to the repository + +- As part of topology compilation, the implementation MUST produce the following diagrams by default: + - SIP-level diagram (representative, distance-aware) + - CUBE-level diagram (representative, distance-aware) + - PE-level diagram (representative, distance-aware) +- The default output directory is: + - `docs/diagrams/` +- The generator MUST overwrite/update only when the compiled topology (or diagram rules) changes. + +### D5. View-specific projection and layout + +For each view (SIP / CUBE / PE): + +- The generator MUST project the compiled graph into a reduced view graph: + - hide/collapse nodes according to ADR-0005, + - preserve connectivity semantics relevant to that view, + - compute distance buckets and assign layout layers deterministically. +- CUBE-level projection MUST include: + - XBAR (top/bottom), bridge (left/right), NOC, HBM_CTRL, shared SRAM, M_CPU, UCIe ports, + and PEs as opaque blocks. + - Distinct edge kinds for HBM path (PE→XBAR) vs non-HBM path (PE→NOC). +- Default anchors are implicit (ADR-0005) and MUST NOT require instance indices. + +### D6. Output formats and determinism + +- The generator MUST output at least one of: + - Mermaid (Markdown-native) + - Graphviz DOT (rank-based control) + - SVG (mm-accurate layout, no external dependencies) +- SVG is preferred when mm-accurate position metadata is available from the compiled topology. +- Output MUST be deterministic: + - same topology + same rules → identical diagram text +- File naming MUST be deterministic and stable (see "Output Conventions"). + +### D7. 
Performance and caching + +- Diagram generation MAY be lazy and/or cached, as long as the outputs in `docs/diagrams/` + remain consistent with the compiled topology. +- The implementation SHOULD use a cache key based on: + - topology content hash, + - routing policy version, + - diagram rules version, + - view type (SIP/CUBE/PE). + +--- + +## Output Conventions + +### Directory + +- `docs/diagrams/` is the canonical output directory for generated diagrams. + +### File names (recommended, deterministic) + +- `system_view.svg` / `system_view.mmd` / `system_view.dot` +- `sip_view.svg` / `sip_view.mmd` / `sip_view.dot` +- `cube_view.svg` / `cube_view.mmd` / `cube_view.dot` +- `pe_view.svg` / `pe_view.mmd` / `pe_view.dot` + +Optionally, for multi-topology workflows: + +- `sip_view__{topology_id}.svg` +- `cube_view__{topology_id}.svg` +- `pe_view__{topology_id}.svg` + +### Repository policy + +- Generated diagram files MAY be committed to the repository to enable diff-based review. +- If committed, they MUST be reproducible from topology compilation. + +--- + +## Consequences + +- Diagrams are always consistent with simulator behavior. +- Architectural changes automatically propagate to visualizations. +- Diagram diffs become meaningful indicators of architectural change. 
+ +--- + +## Links + +- SPEC Section 4 (Output, Debuggability, and Diagrams) +- ADR-0002 (Distance semantics) +- ADR-0005 (Diagram views and layout rules) diff --git a/docs/adr/ADR-0007-runtime-api-boundaries.md b/docs/adr/ADR-0007-runtime-api-boundaries.md new file mode 100644 index 0000000..51975be --- /dev/null +++ b/docs/adr/ADR-0007-runtime-api-boundaries.md @@ -0,0 +1,89 @@ +# ADR-0007: Runtime API and Simulation Engine Boundaries + +## Status + +Accepted + +## Context + +The simulator consists of multiple layers with distinct responsibilities: + +- a host-facing API layer used by benchmarks and user code, +- a discrete-event simulation engine that executes requests, +- device components that model hardware behavior. + +Without strict boundaries, orchestration logic can leak into components, +or simulation internals can become entangled with user-facing APIs. + +This ADR defines clear responsibility boundaries between: + +- runtime API, +- simulation engine (sim_engine), +- hardware components. + +--- + +## Decision + +### D1. Runtime API is host-facing orchestration only + +The runtime API represents host/driver-level behavior and MUST: + +- expose high-level operations (tensor deployment, kernel launch), +- submit requests only to endpoint components (e.g., IO_CPU), +- await completion via futures/handles, +- own and persist host-side metadata (tensor allocation maps, kernel bindings). + +The runtime API MUST NOT: + +- hardcode hop-by-hop routing or fan-out, +- directly invoke internal components (M_CPU, PE_CPU, engines), +- embed topology- or routing-specific assumptions. + +--- + +### D2. Simulation engine executes and schedules requests + +The simulation engine (sim_engine) MUST: + +- inject requests into the compiled topology graph, +- schedule and execute events using a discrete-event model, +- manage correlation ids and completion tracking, +- decompose operations into low-level requests when required + (e.g., MemoryWrite events). 
+ +The simulation engine MUST NOT: + +- define tensor semantics, +- define kernel execution policies, +- expose internal graph details to the runtime API. + +--- + +### D3. Components own fan-out and aggregation + +Device-side components MUST: + +- fan-out requests to downstream domains + (IO_CPU → M_CPU → PE_CPU → schedulers/engines), +- aggregate completion and failure signals, +- propagate results deterministically upstream. + +Neither the runtime API nor the simulation engine may orchestrate +component-level fan-out explicitly. + +--- + +## Consequences + +- Runtime APIs remain stable as topology and routing evolve. +- Simulation internals can change without affecting user-facing code. +- Component implementations remain swappable via DI. + +--- + +## Links + +- SPEC R4, R7, R8 +- ADR-0008 (Tensor deployment) +- ADR-0009 (Kernel execution) diff --git a/docs/adr/ADR-0008-tensor-deploy-and-allocation.md b/docs/adr/ADR-0008-tensor-deploy-and-allocation.md new file mode 100644 index 0000000..2ef5e82 --- /dev/null +++ b/docs/adr/ADR-0008-tensor-deploy-and-allocation.md @@ -0,0 +1,100 @@ +# ADR-0008: Tensor Deployment and Allocation (Host Allocator, PA-first) + +## Status + +Accepted + +## Context + +Benchmarks require PyTorch-like tensor semantics: + +- tensor creation (empty, fill), +- deployment to accelerator devices (tensor.to()). + +In the realistic system, host software manages allocation/mapping and installs +mappings for DMA/MMU. For Phase 0 we simplify (ADR-0011): + +- device memory operations use PA only, +- VA/MMU/IOMMU is not modeled. + +To keep the host↔device interface minimal, we avoid a separate +AllocateTensorMeta message. Instead, host allocation produces a PA shard map +that is used directly by MemoryWrite/Read and KernelLaunch. + +--- + +## Decision + +### D1. 
Tensor is a host-owned handle with PA shard mapping + +A Tensor object is a host-owned handle that encapsulates: + +- shape and dtype, +- initialization intent, +- device placement and allocation metadata as a PA shard map. + +After deployment, the Tensor handle MUST contain: + +- a list of shards, each with (sip,cube,pe,pa,nbytes,offset_bytes). + +This PA shard mapping is the single source of truth for kernel argument binding. + +--- + +### D2. Deployment uses a host allocator (Phase 0) + +In Phase 0, tensor deployment produces PA shard mappings via a host allocator: + +- placement (split/replicate/hybrid) is decided by a DP policy, +- allocation assigns PA ranges at the PE level and returns shard mappings, +- the Tensor handle stores the resulting shard list deterministically. + +No separate host-visible device allocation RPC is required in Phase 0. + +--- + +### D3. Data initialization and transfer uses MemoryWrite/Read only + +Any data initialization or transfer implied by a tensor (e.g., fill, copy) +MUST be represented using Host ↔ IO_CPU messages only: + +- MemoryWrite +- MemoryRead + +Rules: + +- MemoryWrite/Read MUST reference PA + (sip,cube,pe) tags (ADR-0012). +- Allocation metadata MUST NOT be embedded as a separate allocation message. +- Bulk tensor data MUST NOT be embedded in Phase 0 messages. + +The simulation engine schedules MemoryWrite/Read through the graph so that +latency is computed by explicit traversal. + +--- + +### D4. Extension path (non-breaking) + +Future ADRs MAY introduce optional VA/MMU/IOMMU modeling by adding: + +- virtual addressing in tensor handles, +- mapping install steps, +- translation latency/page granularity. + +The Phase 0 PA shard map remains a valid fast-path configuration. + +--- + +## Consequences + +- Host↔IO_CPU contract remains minimal (MemoryRead/Write + KernelLaunch). +- KernelLaunch can pass per-PE data placement explicitly via shard tags. +- Early implementation stays simple and testable. 
+ +--- + +## Links + +- ADR-0011 (PA-first) +- ADR-0012 (Host↔IO_CPU schema) +- ADR-0007 (runtime_api vs sim_engine boundaries) +- ADR-0009 (Kernel execution) diff --git a/docs/adr/ADR-0009-kernel-execution-messaging.md b/docs/adr/ADR-0009-kernel-execution-messaging.md new file mode 100644 index 0000000..91ca443 --- /dev/null +++ b/docs/adr/ADR-0009-kernel-execution-messaging.md @@ -0,0 +1,74 @@ +# ADR-0009: Kernel Execution Messaging and Completion Semantics + +## Status + +Accepted + +## Context + +Kernel execution is initiated by the host and proceeds through +device control components: + +Host → IO_CPU → M_CPU → PE_CPU → schedulers → engines + +Completion propagates in reverse order. + +To keep benchmarks simple and topology-agnostic, +kernel execution must be endpoint-driven with deterministic aggregation. + +--- + +## Decision + +### D1. Kernel launch is an endpoint request + +A kernel launch is initiated by submitting a single KernelLaunch request +to the IO_CPU endpoint. + +The runtime API MUST: + +- construct the kernel launch request, +- submit it to IO_CPU, +- await a single completion result. + +The runtime API MUST NOT orchestrate internal fan-out. + +--- + +### D2. Tensor arguments are passed by metadata + +KernelLaunch requests MUST reference tensor arguments via: + +- host-owned tensor handles, or +- resolved device address maps derived from those handles. + +Bulk tensor data MUST NOT be embedded in kernel launch messages. + +--- + +### D3. Fan-out and aggregation are component responsibilities + +- IO_CPU fans out work to M_CPUs. +- M_CPU fans out work to PE_CPUs. +- PE_CPU manages kernel execution and engine dispatch. + +Completion semantics: + +- M_CPU completes when all targeted PEs complete or a failure policy triggers. +- IO_CPU completes when all targeted CUBEs complete or a failure policy triggers. + +--- + +### D4. Completion and failure propagation + +- All messages MUST carry correlation identifiers. 
+- Completion and failure MUST propagate deterministically to the host. +- The simulation engine provides futures/handles to observe completion. + +--- + +## Links + +- SPEC R1, R2, R7, R8 +- ADR-0007 (Runtime API boundaries) +- ADR-0008 (Tensor deployment) diff --git a/docs/adr/ADR-0010-cli-device-selection.md b/docs/adr/ADR-0010-cli-device-selection.md new file mode 100644 index 0000000..bed601b --- /dev/null +++ b/docs/adr/ADR-0010-cli-device-selection.md @@ -0,0 +1,62 @@ +# ADR-0010: CLI Device Selection and Multi-Device Execution Semantics + +## Status + +Accepted + +## Context + +Benchmarks represent device-agnostic workloads that operate on a single device. +Users may want to run a benchmark: + +- on a specific device, or +- across all devices in the system. + +Device enumeration must not leak into benchmarks or runtime APIs. + +--- + +## Decision + +### D1. Benchmarks are single-device by design + +- A benchmark MUST define behavior for a single device only. +- A benchmark MUST accept a device identifier as input. +- Benchmarks MUST NOT enumerate or loop over multiple devices. + +--- + +### D2. CLI controls device selection + +The `kernbench run` command supports an optional `--device` argument: + +- If `--device <device_id>` is specified: + - the benchmark executes once for the specified device. + +- If `--device` is omitted: + - the benchmark executes once using all the SIPs discovered in the topology. + +--- + +### D3. Multi-device execution is logically parallel + +When running on multiple devices: + +- benchmark executions are submitted to a single simulation engine instance, +- executions are logically parallel in simulation time, +- inter-device contention is naturally modeled. + +--- + +### D4. Runtime API and simulation engine remain device-scoped + +- Runtime API calls operate on one device per invocation. +- The simulation engine schedules all requests deterministically. +- Neither layer enumerates devices. 
+ +--- + +## Links + +- SPEC R7, R8 +- ADR-0007 (Runtime API boundaries) diff --git a/docs/adr/ADR-0011-memory-addressing-simplification.md b/docs/adr/ADR-0011-memory-addressing-simplification.md new file mode 100644 index 0000000..3fa7003 --- /dev/null +++ b/docs/adr/ADR-0011-memory-addressing-simplification.md @@ -0,0 +1,65 @@ +# ADR-0011: Memory Addressing Simplification (PA-first) + +## Status + +Accepted + +## Context + +A realistic system uses host-side virtual addressing and an MMU/IOMMU-style +translation path for DMA: host allocates physical memory at PE level, maps it +into a virtual address space, installs mappings, and DMA requests use virtual +addresses that are translated to physical addresses. + +For early development, we want a minimal, deterministic model that enables: + +- correct routing and latency accounting through the graph, +- stable tensor deployment and kernel execution semantics, +- future extension toward VA/MMU without rewriting workflows. + +--- + +## Decision + +### D1. Phase 0 model is PA-only + +The simulator uses a PA-first model: + +- All device memory accesses (MemoryRead/MemoryWrite) operate on device physical + addresses (PA) plus size. +- Tensor handles store PA-based shard mappings after deployment. +- KernelLaunch passes tensor arguments as PA-based mappings (or references to them). +- MMU/IOMMU concepts (virtual address spaces, page tables, translation latency) + are NOT modeled in Phase 0. + +### D2. Allocation produces PA mappings + +Device allocation selects PE-local memory regions and returns PA mappings +sufficient to execute kernels and issue DMA requests. + +### D3. Extension path (non-breaking) + +A future ADR MAY introduce an optional VA/MMU layer by: + +- introducing virtual addresses in tensor handles, +- adding a mapping-install step, +- modeling translation latency and page granularity. + +The Phase 0 PA model remains a valid fast-path configuration. 
+ +--- + +## Consequences + +- Early implementation stays simple and testable. +- All latency remains explicit via graph traversal, not hidden translation. +- Future VA/MMU modeling can be added without breaking existing benchmarks. + +--- + +## Links + +- ADR-0007 (runtime_api vs sim_engine boundaries) +- ADR-0008 (tensor deployment) +- ADR-0009 (kernel execution) +- SPEC R2 (latency by traversal) diff --git a/docs/adr/ADR-0012-host-io-message-schema.md b/docs/adr/ADR-0012-host-io-message-schema.md new file mode 100644 index 0000000..b3f4c6f --- /dev/null +++ b/docs/adr/ADR-0012-host-io-message-schema.md @@ -0,0 +1,232 @@ +# ADR-0012: Host ↔ IO_CPU Message Schema (PA-first, PE-tagged) + +## Status + +Accepted + +## Context + +Phase 0 uses a PA-first memory model (ADR-0011): + +- memory operations use device physical addresses (PA) only, +- VA/MMU/IOMMU is not modeled. + +The host-facing runtime API interacts with the device via the IO_CPU endpoint. +We define stable, minimal message schemas for Host ↔ IO_CPU so that: + +- benchmarks remain stable, +- IO_CPU-internal fan-out/aggregation can evolve independently, +- completion and failure propagation is deterministic. + +We also require PE-tagging (A 방식): each shard explicitly carries (sip,cube,pe) +so IO_CPU can deterministically route/fan-out without relying on PA decoding. + +--- + +## Decision + +### D1. Contract scope + +This schema is the stable contract ONLY for Host ↔ IO_CPU. + +Messages beyond IO_CPU (to M_CPU, PE_CPU, schedulers, engines) are component-internal +and are NOT part of this host contract in Phase 0. + +--- + +### D2. Required message set + +The runtime API MUST use only these message types for Host ↔ IO_CPU: + +- MemoryWrite +- MemoryRead +- KernelLaunch + +All operations required by benchmarks (tensor init/copy, kernel run) MUST be expressible +with these messages. + +--- + +### D3. 
Common envelope (mandatory for all requests) + +All Host ↔ IO_CPU requests MUST include: + +- `msg_type: str` +- `correlation_id: str` + - generated by the host + - used to match responses deterministically +- `request_id: str` + - unique within a correlation_id +- `target_device: str` + - device identifier (e.g., "sip:0") +- `timestamp_tag: str | None` (optional) + - debug tag only; MUST NOT affect determinism + +All Host ↔ IO_CPU responses MUST include: + +- `correlation_id: str` +- `request_id: str` +- `completion: Completion` + +--- + +### D4. Completion schema (mandatory) + +`Completion` MUST have: + +- `ok: bool` +- `error_code: str | None` +- `error_message: str | None` + +Rules: + +- If `ok == true` then `error_code` and `error_message` MUST be null. +- If `ok == false` then `error_code` MUST be non-null. +- Completion semantics MUST be deterministic. + +--- + +### D5. MemoryWrite schema (PA-first, PE-tagged) + +`MemoryWrite` represents a host-initiated write/initialize operation to device memory. + +Mandatory fields: + +- common envelope fields (D3) +- destination placement tags (A 방식): + - `dst_sip: int` + - `dst_cube: int` + - `dst_pe: int` +- `dst_pa: int` + - destination physical address in the destination PE's address space +- `nbytes: int` +- `src_kind: "pattern" | "host_buffer_ref"` + - Phase 0 MUST support "pattern" +- `pattern: Pattern | None` + - required if `src_kind == "pattern"` + +`Pattern` (Phase 0 mandatory support): + +- `pattern_kind: "zero" | "fill_u8" | "fill_u16" | "fill_u32" | "fill_fp16" | "fill_fp32"` +- `value: number | None` + - required for fill_*; ignored for zero + +Optional fields: + +- `dst_mem_kind: "HBM" | "TCM" | "AUTO"` (default "AUTO") +- `debug_label: str | None` + +Notes: + +- This message MUST NOT embed bulk tensor data in Phase 0. +- All latency MUST come from explicit graph traversal and modeled components. + +--- + +### D6. 
MemoryRead schema (PA-first, PE-tagged) + +`MemoryRead` represents a host-initiated read from device memory. + +Mandatory fields: + +- common envelope fields (D3) +- source placement tags (A 방식): + - `src_sip: int` + - `src_cube: int` + - `src_pe: int` +- `src_pa: int` +- `nbytes: int` + +Optional fields: + +- `dst_kind: "host_sink" | "discard"` (default "host_sink") +- `debug_label: str | None` + +Response payload: + +- actual bytes are NOT required in Phase 0 (latency/traces focus) +- implementations MAY return lightweight stats or hashes later via a new ADR + +--- + +### D7. KernelLaunch schema (PA-first, PE-tagged shards) + +`KernelLaunch` represents launching a kernel on a target device via IO_CPU. + +Mandatory fields: + +- common envelope fields (D3) +- `kernel_ref: KernelRef` +- `args: list[KernelArg]` + +`KernelRef` MUST have: + +- `name: str` +- `kind: "deployed" | "builtin"` +- `deploy_pa: int | None` — PA where kernel binary was deployed (required for "deployed") +- `deploy_sip: int` — SIP where binary resides +- `deploy_cube: int` — cube where binary resides +- `deploy_pe: int` — PE where binary resides +- `nbytes_code: int` — kernel binary size (for BW modeling) + +Kernel binaries MUST be pre-deployed to device memory via MemoryWrite. +KernelLaunch MUST NOT embed kernel source code or IR in the launch message. + +`KernelArg` supports tensor args by PA mapping and scalars by value. 
+ +Tensor arg (mandatory): + +- `arg_kind: "tensor"` +- `tensor_pa_map: TensorPAMap` + +`TensorPAMap` MUST have: + +- `shards: list[TensorShard]` + +`TensorShard` MUST have (A 방식 강제): + +- `sip: int` +- `cube: int` +- `pe: int` +- `pa: int` +- `nbytes: int` +- `offset_bytes: int` + +Scalar arg (mandatory): + +- `arg_kind: "scalar"` +- `dtype: "i32" | "i64" | "fp16" | "fp32" | "bool"` +- `value: number | bool` + +Optional KernelLaunch fields: + +- `grid: dict | None` +- `meta: dict | None` +- `failure_policy: "fail_fast" | "collect_all"` (default "fail_fast") +- `debug_label: str | None` + +Notes: + +- KernelLaunch MUST NOT embed bulk tensor data. +- KernelLaunch MUST be submitted only to the IO_CPU endpoint. +- IO_CPU MUST fan-out work internally using the shard (sip,cube,pe) tags. + +--- + +## Verification Notes + +Tests SHOULD validate: + +- schema validation rejects missing mandatory fields, +- deterministic correlation/response matching, +- MemoryWrite/Read/KernelLaunch produce explicit hop traces, +- all routed requests incur latency > 0. + +--- + +## Links + +- ADR-0011 (PA-first memory addressing) +- ADR-0007 (runtime_api vs sim_engine boundaries) +- ADR-0009 (kernel execution fan-out/aggregation) +- SPEC R2, R7, R8 diff --git a/docs/adr/ADR-0013-verification_strategy.md b/docs/adr/ADR-0013-verification_strategy.md new file mode 100644 index 0000000..3f83712 --- /dev/null +++ b/docs/adr/ADR-0013-verification_strategy.md @@ -0,0 +1,139 @@ +# ADR-0013: Verification Strategy and Phase 1 Test Plan + +## Status + +Accepted + +## Context + +KernBench is a system-level simulator whose correctness is defined by: + +- adherence to SPEC-defined invariants, +- determinism and debuggability, +- explicit modeling of routing and latency. + +Given the evolving implementation, we need a stable verification strategy +that prevents architectural drift while allowing incremental development. 
+ +This ADR defines the Phase 1 verification plan and what constitutes +"correct behavior" for early implementations. + +--- + +## Decision + +### D1. Verification is contract-based + +Verification MUST be derived from: + +- SPEC requirements, +- accepted ADRs. + +Tests MUST validate architectural contracts, not incidental implementation details. + +--- + +### D2. Phase 1 verification scope + +Phase 1 verification focuses on: + +- message contract validity (ADR-0012), +- routing and fan-out semantics at the IO_CPU boundary (ADR-0009), +- PA-first memory addressing and shard tagging (ADR-0011), +- core latency and trace invariants (SPEC 0.1, R2). + +Microarchitectural accuracy, bandwidth contention, and cycle-level behavior +are explicitly out of scope in Phase 1. + +--- + +### D3. Required Phase 1 verification cases + +The following verification cases MUST be supported by the implementation: + +#### V1. Message schema validation + +- KernelLaunch requests missing `(sip, cube, pe)` in any tensor shard MUST be rejected. +- MemoryWrite/MemoryRead requests missing destination/source placement tags MUST be rejected. +- Completion results MUST follow the `ok / error_code / error_message` contract. + +#### V2. IO_CPU fan-out and aggregation + +Given: + +- a topology with one SIP, one CUBE, and two PEs, +- a KernelLaunch request containing two tensor shards targeting different PEs, + +The system MUST: + +- submit a single KernelLaunch to IO_CPU, +- fan-out work internally to both PEs, +- aggregate completion and return a single deterministic completion to the host. + +#### V3. Latency and trace invariants + +For any valid request: + +- the hop-by-hop trace MUST be non-empty, +- total latency MUST be greater than zero, +- repeated runs with identical inputs MUST produce identical traces. + +#### V4. 
Topology independence and cross-domain coverage + +Verification cases MUST pass for multiple topology shapes, including: + +- minimal: (1 SIP, 1 CUBE, 1 PE) +- multi-PE: (1 SIP, 1 CUBE, N PEs) +- multi-CUBE within a SIP: (1 SIP, M CUBEs, ≥1 PE per CUBE) +- multi-SIP tray: (K SIPs, ≥1 CUBE per SIP, ≥1 PE per CUBE) + +For multi-CUBE and multi-SIP topologies, Phase 1 verification focuses on: + +- explicit connectivity (required links exist), +- deterministic routing and control-path traversal, +- non-empty traces and latency > 0 for representative cross-domain requests + (inter-CUBE and inter-SIP paths). + +Tests MUST NOT hardcode topology sizes, node ids, or link counts. Instead, tests MUST derive expectations from the compiled topology metadata. + +--- + +### D4. Phase 1 artifacts + +Phase 1 MAY include: + +- verification-only test code, +- topology fixtures, +- trace inspection utilities. + +Phase 1 MUST NOT require: + +- production code changes solely to satisfy tests, +- weakening or removing tests to allow progress. + +--- + +### D5. Phase 2 enforcement + +Phase 2 (Apply) MUST: + +- run the Phase 1 verification cases, +- roll back all changes if any verification fails, +- preserve tests as authoritative contracts. + +--- + +## Consequences + +- Architectural correctness is enforced early. +- Tests serve as executable documentation of system behavior. +- Implementation remains flexible without losing rigor. 
+ +--- + +## Links + +- SPEC 0.1, R2, R6 +- ADR-0011 (PA-first memory addressing) +- ADR-0012 (Host ↔ IO_CPU message schema) +- ADR-0009 (Kernel execution semantics) diff --git a/docs/adr/ADR-0014-pe-internal-execution-model.md b/docs/adr/ADR-0014-pe-internal-execution-model.md new file mode 100644 index 0000000..99023a0 --- /dev/null +++ b/docs/adr/ADR-0014-pe-internal-execution-model.md @@ -0,0 +1,364 @@ +# ADR-0014: PE Internal Execution Model (PE_CPU, PE_SCHEDULER, and Composite Commands) + +## Status + +Proposed + +## Context + +ADR-0003 (system hierarchy) and ADR-0009 (kernel execution semantics) reference PE internals but do not define: + +- the dispatch model inside a PE, +- the responsibilities of PE_SCHEDULER, +- the PE_TCM-centric dataflow contract used by accelerator engines. + +We need a deterministic and debuggable PE-internal execution contract that supports: + +- simple single-engine commands +- composite commands that build a tiled pipeline across DMA and accelerator engines + +The simulator must produce deterministic traces and allow modeling of PE-internal pipelining without introducing nondeterministic engine scheduling. + +## Decision + +### D1. PE internal component roles + +Each PE contains the following logical components. + +**PE_CPU** + +- Executes kernel instruction stream or kernel control logic. +- Generates PE commands. +- Submits commands to PE_SCHEDULER. +- PE_CPU does NOT enqueue work directly into engine queues. + +**PE_SCHEDULER** + +- The sole dispatcher inside a PE. +- Receives commands from PE_CPU. +- Expands composite commands into sub-commands. +- Tracks dependencies and command state. +- Dispatches work to engine queues. +- Manages tile scheduling for composite commands. + +**PE_DMA** + +- Handles memory transfers between PE_TCM and external memory domains. 
+- PE_DMA has **dual egress** at the CUBE level: + - **→ XBAR**: dedicated path to HBM (local and cross-half via bridge) + - **→ NOC**: path to non-HBM destinations (shared SRAM, inter-cube UCIe, etc.) +- Supported directions include: + - HBM → PE_TCM (via XBAR) + - PE_TCM → HBM (via XBAR) + - PE_TCM → shared SRAM (via NOC) + - PE_TCM → other memory domains (via NOC, if supported by topology) + +**PE_GEMM** + +- Matrix multiplication engine. +- Reads activations from PE_TCM. +- May stream weights directly from HBM. + +**PE_MATH** + +- Element-wise computation engine. +- Reads and writes PE_TCM. + +**PE_TCM** + +- Local SRAM used as the staging memory for accelerator operations. + +--- + +### D2. Command lifecycle and queues + +PE_SCHEDULER maintains three logical structures. + +**SubmissionQueue** + +- Written by PE_CPU. +- Contains incoming PE commands waiting to be processed. + +**InflightTable** + +- Owned and mutated only by PE_SCHEDULER. +- Tracks: + - expanded sub-commands + - dependency state + - engine assignment + - completion status + +**CompletionQueue** + +- Written by PE_SCHEDULER. +- Contains final completion records for commands. + +**Single-writer rule** + +- Only PE_SCHEDULER is allowed to mutate command completion state. +- Engine components must report completion via explicit completion events/messages. + +**Command completion** + +A command becomes DONE when: + +- all sub-commands complete +- PE_SCHEDULER publishes a completion record to CompletionQueue. + +--- + +### D3. Dispatch modes + +PE commands are divided into two categories. + +#### D3.1 Simple command + +A simple command expands to exactly one engine sub-command. 
+ +Examples include: + +- DMA transfer +- GEMM compute +- MATH compute + +Execution flow: + +``` +PE_CPU → SubmissionQueue → PE_SCHEDULER → engine queue → engine execution → completion event → PE_SCHEDULER → CompletionQueue +``` + +#### D3.2 Composite command (tiled pipeline) + +Composite commands implement tiled pipelined execution across engines. + +Each tile executes the following pipeline: + +``` +Input DMA (READ) +→ Compute (GEMM or MATH) +→ Output DMA (WRITE) +``` + +**Tiling rule** + +If the DMA payload exceeds hardware tile size, PE_SCHEDULER splits the transfer into tiles. +Each tile is assigned a monotonically increasing `tile_id`. + +**Tile dependency rules** + +For tile `t`: + +- Compute must wait for input DMA: `DMA_READ(t) → COMPUTE(t)` +- Output DMA must wait for compute: `COMPUTE(t) → DMA_WRITE(t)` +- All dependencies are enforced by PE_SCHEDULER. + +**Overlap policy (Phase 0 default)** + +Operations for different tiles may overlap when engine resources permit. + +Allowed overlaps: + +``` +DMA_READ(t+1) ∥ COMPUTE(t) +DMA_WRITE(t−1) ∥ COMPUTE(t) +DMA_READ(t+1) ∥ DMA_WRITE(t) +``` + +Disallowed overlaps: + +``` +GEMM(t) ∥ GEMM(t′) +MATH(t) ∥ MATH(t′) +GEMM(t) ∥ MATH(t′) +``` + +--- + +### D4. Engine execution model (Phase 0 default) + +Each engine behaves as a deterministic service resource. + +**DMA engine** + +PE_DMA contains two independent channels. + +``` +DMA_READ capacity = 1 +DMA_WRITE capacity = 1 +``` + +Rules: + +- DMA_READ and DMA_WRITE may execute concurrently. +- Multiple READs cannot overlap. +- Multiple WRITEs cannot overlap. + +Example allowed: + +``` +DMA_READ(t+1) ∥ DMA_WRITE(t) +``` + +Example not allowed: + +``` +DMA_READ(t) ∥ DMA_READ(t+1) +DMA_WRITE(t) ∥ DMA_WRITE(t+1) +``` + +**Compute engine** + +Compute operations share a single compute resource. + +``` +PE_ACCEL capacity = 1 +``` + +Both GEMM and MATH require this shared compute slot. 
+ +Consequences: + +- GEMM ∥ GEMM not allowed +- MATH ∥ MATH not allowed +- GEMM ∥ MATH not allowed + +Only one compute operation can run in a PE at a time. + +**Compute opcode restriction** + +Composite commands contain one compute opcode only. + +Examples: + +``` +COMPOSITE_GEMM +COMPOSITE_MATH +``` + +Mixed compute pipelines such as `GEMM → MATH` are not supported in Phase 0. + +**Engine completion signaling** + +Every engine emits a completion event when a sub-command finishes. +Completion events are delivered to PE_SCHEDULER. + +--- + +### D5. Dataflow model + +Compute operations use a TCM-centric dataflow model. + +**Input path (HBM)** + +``` +HBM → XBAR → PE_DMA (DMA_READ) → PE_TCM +``` + +**Input path (shared SRAM)** + +``` +Shared SRAM → NOC → PE_DMA (DMA_READ) → PE_TCM +``` + +**Compute stage** + +Compute engines read input tensors from PE_TCM. + +``` +PE_TCM → GEMM / MATH +``` + +Weights for GEMM may optionally stream directly from HBM (via XBAR). + +**Output path (HBM)** + +Compute results are written to PE_TCM, then DMA writes to HBM. + +``` +PE_TCM → PE_DMA (DMA_WRITE) → XBAR → HBM +``` + +**Output path (shared SRAM)** + +``` +PE_TCM → PE_DMA (DMA_WRITE) → NOC → Shared SRAM +``` + +#### D5.1 PE_TCM partitioning and ownership boundary + +The PE_TCM address space is partitioned into two logical regions. + +**SchedulerReservedTCM** + +- A staging region owned exclusively by PE_SCHEDULER. +- This region is used for composite command tile buffers. +- PE_SCHEDULER: + - partitions this region into tile buffers + - assigns buffers for DMA_READ, COMPUTE, and DMA_WRITE stages + - guarantees input/output buffer separation + - manages tile buffer lifetime + +**AllocatableTCM** + +- General-purpose region managed by PEMemAllocator. +- Used by host or DP-visible allocations. + +**Visibility rule (hard isolation)** + +- PEMemAllocator must not see or allocate memory inside SchedulerReservedTCM. 
+- SchedulerReservedTCM is excluded from allocator-managed ranges by construction. +- This prevents DP or host allocations from interfering with scheduler staging buffers. + +**Tile buffer rules** + +Within SchedulerReservedTCM: + +- input buffers and output buffers must not overlap +- PE_SCHEDULER assigns tile buffers for DMA and compute stages +- tile buffers remain valid until the corresponding DMA_WRITE completes +- Buffer reuse is allowed only after the tile lifetime finishes. + +--- + +### D6. Observability and trace contract + +The simulator must emit deterministic trace events. + +Required events include: + +- `command_submitted` +- `sub_command_dispatched` +- `engine_start` +- `engine_complete` +- `tile_ready` +- `command_complete` + +Trace ordering must be deterministic for identical inputs. + +--- + +### D7. Topology representation + +PE internal components are declared in `cube.pe_template`. + +The template is instantiated once per PE. + +PE instances are derived from `cube.pe_layout`. + +External connectivity such as: + +- PE_DMA → XBAR (HBM data path) +- PE_DMA → NOC (non-HBM data path: shared SRAM, inter-cube UCIe) +- NOC → PE_CPU (command path from M_CPU) + +is modeled at the CUBE level (see ADR-0003 D3). + +--- + +## Links + +- SPEC R3, R4 +- ADR-0003 D4 (PE-level system hierarchy) +- ADR-0005 View C (PE-level diagram) +- ADR-0008 D2 (PA-level allocation at PE scope; PEMemAllocator is the per-PE allocator instance) +- ADR-0009 D3 (kernel execution fan-out and PE_CPU dispatch) diff --git a/docs/adr/ADR-0015-component-port-wire-model.md b/docs/adr/ADR-0015-component-port-wire-model.md new file mode 100644 index 0000000..a9c4a94 --- /dev/null +++ b/docs/adr/ADR-0015-component-port-wire-model.md @@ -0,0 +1,178 @@ +# ADR-0015: Component Port/Wire Model and Fabric Routing + +## Status + +Proposed + +## Context + +ADR-0007 D2 assigns path-walking and low-level request decomposition to the simulation engine. 
+In practice, the engine iterates the topology path and calls `run()` on each component +sequentially — conflating routing policy with component behavior and preventing realistic +hardware modeling (queues, contention, fan-out). + +ADR-0007 D3 already states that components own fan-out and aggregation, but the current +implementation does not enforce this for fabric traversal. + +This ADR defines: + +- how components communicate via typed port queues, +- how propagation delay is modeled (wire processes), +- the fabric path for Memory R/W through M_CPU.DMA, +- the reduced role of the simulation engine, +- M_CPU.DMA as an internal subcomponent of M_CPU. + +--- + +## Decision + +### D1. Component port model + +Each component has typed input/output ports modeled as SimPy Stores: + +``` +in_ports: dict[str, simpy.Store] # keyed by source node_id +out_ports: dict[str, simpy.Store] # keyed by destination node_id +``` + +Ports are created at engine initialization based on graph edges. +Each directed edge (src → dst) results in: + +- `src.out_ports[dst]` — the sending end +- `dst.in_ports[src]` — the receiving end + +--- + +### D2. Wire process (propagation delay) + +For each directed edge (src, dst) in the topology graph, a SimPy wire process +models propagation delay: + +```python +def wire_process(env, out_port, in_port, delay_ns): + while True: + cmd = yield out_port.get() + yield env.timeout(delay_ns) + yield in_port.put(cmd) +``` + +Wire processes are started at engine initialization. +BW constraints are enforced by the sending component's out_port capacity or token model, +not by the wire process itself. + +--- + +### D3. Engine role (reduced) + +The simulation engine MUST: + +- wire components at initialization (create port Stores, start wire processes), +- identify the entry component for each request type (PCIE_EP), +- put the request into the entry component's in_port, +- wait for a completion event. 
+ +The simulation engine MUST NOT: + +- walk the topology path during request execution, +- call component `run()` methods directly, +- track per-hop latency or decompose fan-out. + +This supersedes ADR-0007 D2's "decompose operations into low-level requests" clause. +ADR-0007 D2 must be amended accordingly. + +--- + +### D4. Unified fabric path for Memory R/W and Kernel Launch + +Both Memory R/W and Kernel Launch use the same fabric path to reach the target cube's M_CPU. +The difference is what M_CPU does upon receiving the request. + +**Forward path (IO_CPU → target M_CPU):** + +``` +IO_CPU + → [transit cubes: ucie_out → wire → ucie_in → noc → ucie_out] (zero or more) + → target cube: ucie_in → noc → M_CPU +``` + +**At M_CPU (diverges by operation type):** + +``` +Memory R/W: M_CPU → M_CPU.DMA → noc → hbm_ctrl +Kernel Launch: M_CPU → PE[0..n] (parallel fan-out) +``` + +**Completion path (reverse, same fabric):** + +``` +Memory R/W: hbm_ctrl → noc → M_CPU.DMA → M_CPU +Kernel Launch: PE[0..n] all complete → M_CPU (aggregation) + +M_CPU → [transit cubes: ucie → noc → ucie] → IO_CPU → runtime_api +``` + +--- + +### D5. M_CPU.DMA is an internal subcomponent of M_CPU + +M_CPU.DMA is NOT a separate topology node. +It is an internal subcomponent owned by the M_CPU component implementation. + +M_CPU.DMA: + +- owns the DMA READ and DMA WRITE queues (capacity=1 each, per ADR-0014 D4), +- issues memory requests over the NOC to hbm_ctrl, +- receives completion from hbm_ctrl via the NOC, +- reports completion to M_CPU, +- is created and managed inside M_CPU's `__init__` and `run()`. + +M_CPU.DMA does not appear as a node in the compiled topology graph. + +--- + +### D6. Transit cube forwarding + +A cube that is not the target of a memory or kernel request acts as a transit node. +Transit cubes forward requests without consuming them: + +``` +ucie_in (from upstream) → noc → ucie_out (to downstream) +``` + +Transit forwarding is implemented entirely within the ucie_in component. 
+The noc and ucie_out components in a transit cube forward the packet without modification. + +--- + +### D7. _formula_latency is preserved as a lower-bound cross-check + +The path-based formula latency function (`_formula_latency`) is preserved in the engine +as a lower bound for correctness verification. + +Invariant: + +- Phase 0: `_formula_latency == component model total_ns` +- Phase 1+: `_formula_latency <= component model total_ns` (contention adds queueing) + +This function is independent of the port/wire model and requires only the topology graph. +It is used for shard comparison in `_route_kernel` and as a regression guard. + +--- + +## Consequences + +- Components model realistic hardware behavior (queues, contention, fan-out). +- Propagation delay is modeled accurately per edge. +- Engine is decoupled from routing policy. +- Component implementations remain swappable via DI (ADR-0007 D3). +- ADR-0007 D2 must be amended to remove path-walking from engine responsibilities. +- ADR-0009 D3 should be updated to reference the unified fabric path (D4 above). + +--- + +## Links + +- ADR-0007 D2 (to be amended: engine path-walking clause) +- ADR-0009 D3 (kernel execution fan-out; fabric path to be referenced) +- ADR-0014 D4 (DMA engine capacity=1) +- ADR-0012 D1 (host ↔ IO_CPU message schema; M_CPU.DMA is component-internal) diff --git a/docs/di-presentation.md b/docs/di-presentation.md new file mode 100644 index 0000000..5f64572 --- /dev/null +++ b/docs/di-presentation.md @@ -0,0 +1,363 @@ +# 실무 DI 패턴: kernbench 구현으로 배우는 Dependency Injection + +--- + +## 슬라이드 1 — 오늘 이야기할 것 + +**질문:** 코드를 어떻게 설계해야 테스트하기 쉽고, 갈아끼우기 쉬울까? + +**답:** Dependency Injection (DI) + +오늘은 이론이 아니라 **실제로 돌아가는 시뮬레이터 코드**를 보면서 배웁니다. + +``` +kernbench +└── AI 가속기 하드웨어를 Python으로 시뮬레이션하는 프레임워크 + - 수십 개의 하드웨어 컴포넌트 (NOC, HBM, PE, CPU...) 
+ - 각 컴포넌트는 런타임에 교체 가능 + - 테스트에서 Mock 컴포넌트로 즉시 대체 가능 +``` + +--- + +## 슬라이드 2 — DI가 없으면 어떤 일이 생기나 + +```python +# ❌ DI 없는 코드 +class IoCpuComponent: + def run(self, env, nbytes): + router = PathRouter() # 직접 생성 — 교체 불가 + hbm = HbmCtrlComponent() # 직접 생성 — 교체 불가 + yield env.timeout(10.0) +``` + +**문제:** +- 테스트할 때 실제 `PathRouter`와 `HbmCtrl`이 항상 따라온다 +- 컴포넌트를 Mock으로 바꾸려면 **소스 코드를 수정**해야 한다 +- 다른 topology(다른 라우팅 전략)를 쓰고 싶으면 **또 수정** + +> 클래스가 자기 의존성을 스스로 만들면, 그 클래스는 의존성과 결합된다 + +--- + +## 슬라이드 3 — DI의 핵심 원칙 + +**의존성은 밖에서 만들어서 안으로 넣어준다** + +``` +┌────────────────────────────┐ +│ 조립자 (Assembler) │ ← 누가 무엇을 쓸지 결정 +│ GraphEngine.__init__ │ +└────────────┬───────────────┘ + │ ctx 주입 + ▼ +┌────────────────────────────┐ +│ 컴포넌트 (Component) │ ← 어떻게 동작하는지만 알면 됨 +│ IoCpuComponent │ +│ self.ctx.router.find_path(...) ← 그냥 사용 +└────────────────────────────┘ +``` + +**세 가지 역할 분리:** +1. **Interface** — 무엇을 할 수 있는가 (`ComponentBase`) +2. **Implementation** — 어떻게 하는가 (`IoCpuComponent`, `HbmCtrlComponent`, ...) +3. **Assembler** — 무엇을 연결할 것인가 (`GraphEngine`) + +--- + +## 슬라이드 4 — 패턴 1: Constructor Injection + +> 생성자로 의존성을 받는다 + +```python +# kernbench/components/base.py + +class ComponentBase(ABC): + def __init__(self, node: Node, ctx: ComponentContext | None = None): + self.node = node + self.ctx = ctx # 외부에서 주입받은 의존성 + self.in_ports: dict[str, simpy.Store] = {} + self.out_ports: dict[str, simpy.Store] = {} +``` + +```python +# 사용 측 — ctx를 직접 만들지 않는다 +class IoCpuComponent(ComponentBase): + def _dispatch(self, env, txn): + path = self.ctx.router.find_node_path(...) # ctx는 이미 들어와 있음 + yield self.out_ports[next_hop].put(...) +``` + +**언제 쓰나:** +- 컴포넌트가 살아있는 동안 의존성이 바뀌지 않을 때 +- 의존성 없이는 컴포넌트가 동작하지 않을 때 (필수 의존성) + +--- + +## 슬라이드 5 — Context Object 패턴 + +> 의존성이 많아지면 묶어서 하나로 + +```python +# kernbench/components/context.py + +@dataclass +class ComponentContext: + router: PathRouter # 라우팅 정책 + resolver: AddressResolver # 주소 해석 + positions: dict[str, ...] 
# 물리적 위치 정보 + ns_per_mm: float # 전파 지연 상수 + edge_map: dict[...] # 엣지 정보 + spec: dict # 토폴로지 스펙 +``` + +**왜 Context로 묶나?** +- 생성자 인자가 6개면 → 컴포넌트 추가할 때마다 시그니처 변경 +- Context 하나면 → 새 필드 추가해도 기존 컴포넌트 무영향 +- 컴포넌트는 **필요한 것만 꺼내 쓴다** + +```python +class TwoDMeshNocComponent(ComponentBase): + def _route(self, env, txn): + src_pos = self.ctx.positions.get(prev_hop) # 위치만 사용 + ns_per_mm = self.ctx.ns_per_mm # 상수만 사용 + # router, resolver 등은 건드리지 않음 +``` + +--- + +## 슬라이드 6 — 패턴 2: Registry + Factory + +> 문자열 키 → 클래스 매핑으로 런타임 교체 + +```python +# kernbench/components/base.py + +class ComponentRegistry: + _registry: dict[str, type[ComponentBase]] = {} + + @classmethod + def register(cls, impl: str, component_cls: type[ComponentBase]): + cls._registry[impl] = component_cls + + @classmethod + def create(cls, node, overrides=None, ctx=None) -> ComponentBase: + if overrides and node.impl in overrides: + return overrides[node.impl](node, ctx) # 1순위: 호출자 override + if node.impl in cls._registry: + return cls._registry[node.impl](node, ctx) # 2순위: 등록된 구현 + return DefaultComponent(node, ctx) # 3순위: 기본값 fallback +``` + +**Resolution 우선순위:** +``` +overrides[impl] ← 테스트/실험용 주입 + ↓ (없으면) +_registry[impl] ← 프로덕션 구현 + ↓ (없으면) +DefaultComponent ← 안전한 fallback +``` + +--- + +## 슬라이드 7 — Registry 등록 방식 + +```python +# kernbench/components/impls/__init__.py + +from kernbench.components.base import ComponentRegistry +from kernbench.components.impls.noc import TwoDMeshNocComponent +from kernbench.components.impls.io_cpu import IoCpuComponent +# ... + +ComponentRegistry.register("noc_2d_mesh_v1", TwoDMeshNocComponent) +ComponentRegistry.register("io_cpu_v1", IoCpuComponent) +ComponentRegistry.register("hbm_ctrl_v1", HbmCtrlComponent) +# ... +``` + +**topology.yaml (설정 파일)** +```yaml +nodes: + - id: sip0.cube0.noc + impl: noc_2d_mesh_v1 # ← 이 문자열이 Registry 키 +``` + +**흐름:** +``` +YAML → impl 문자열 → Registry.create() → 실제 컴포넌트 인스턴스 +``` + +impl 문자열만 바꾸면 동작이 바뀐다. 코드 수정 없음. 
+ +--- + +## 슬라이드 8 — 패턴 3: Override Injection (테스트용) + +> 호출자가 특정 impl만 갈아끼운다 + +```python +# tests/test_component_registry.py + +class SpyXbar(ComponentBase): + calls = 0 + + def run(self, env, nbytes): + SpyXbar.calls += 1 + yield env.timeout(0) + + +# 테스트에서 xbar_v1만 SpyXbar로 교체 +engine = GraphEngine( + graph, + component_overrides={"xbar_v1": SpyXbar} # ← 이것만 추가 +) + +result = engine.run(msg) +assert SpyXbar.calls > 0 # Xbar가 실제로 호출됐는지 검증 +``` + +**핵심:** 테스트 코드가 프로덕션 코드를 **수정하지 않는다** + +--- + +## 슬라이드 9 — 조립자: GraphEngine + +> 컴포넌트를 생성하고 연결하는 유일한 곳 + +```python +# kernbench/sim_engine/engine.py + +class GraphEngine: + def __init__(self, graph, component_overrides=None): + + # 1. 공유 의존성 생성 + ctx = ComponentContext( + router=PathRouter(graph), + resolver=AddressResolver(graph), + positions={nid: n.pos_mm for nid, n in graph.nodes.items()}, + ns_per_mm=..., + ) + + # 2. 컴포넌트 생성 (DI: ctx 주입) + self._components = { + node_id: ComponentRegistry.create(node, overrides, ctx) + for node_id, node in graph.nodes.items() + } + + # 3. 포트 연결 (배선) + for e in graph.edges: + store = simpy.Store(self._env) + self._components[e.src].out_ports[e.dst] = store + self._components[e.dst].in_ports[e.src] = store +``` + +**생성 → 주입 → 연결** — 이 세 단계가 한 곳에서만 일어난다 + +--- + +## 슬라이드 10 — 전체 구조 한눈에 보기 + +``` +topology.yaml + │ impl: "noc_2d_mesh_v1" + ▼ +GraphEngine.__init__() ← 조립자 + │ + ├── ComponentContext 생성 ← 공유 의존성 묶음 + │ ├── PathRouter + │ ├── AddressResolver + │ └── positions, ns_per_mm, ... + │ + ├── ComponentRegistry.create(node, overrides, ctx) + │ ├── overrides["noc_2d_mesh_v1"]? → SpyNoc (테스트) + │ ├── registry["noc_2d_mesh_v1"]? 
→ TwoDMeshNocComponent (프로덕션) + │ └── fallback → DefaultComponent + │ + └── 포트 배선: out_ports / in_ports 연결 + +Component (TwoDMeshNocComponent) + └── self.ctx.positions, self.ctx.ns_per_mm 사용 + (라우터, 리졸버는 건드리지 않음 — 필요한 것만) +``` + +--- + +## 슬라이드 11 — 무엇을 얻었나 + +| 상황 | DI 없이 | DI 있이 | +|------|---------|---------| +| NOC 알고리즘 교체 | 소스 코드 수정 | YAML에서 impl 문자열 변경 | +| Xbar 동작 검증 | 실제 HW 전부 구동 | `overrides={"xbar_v1": SpyXbar}` | +| 새 컴포넌트 추가 | 기존 코드 수정 | `register("new_v1", NewComp)` | +| 컨텍스트 필드 추가 | 모든 생성자 수정 | `ComponentContext`에 필드 추가 | +| 테스트 격리 | 불가능 | 필요한 것만 override | + +--- + +## 슬라이드 12 — 실무 적용 체크리스트 + +**설계할 때 물어볼 것:** + +1. **이 클래스가 직접 `new`(생성)하는 것은 무엇인가?** + → 생성하는 것 = 교체할 수 없는 것. 생성자로 받을 수 없는지 검토. + +2. **의존성이 3개 이상이면?** + → Context Object로 묶어라. + +3. **테스트에서 이 클래스를 단독으로 실행할 수 있는가?** + → 없다면 DI가 필요하다는 신호. + +4. **설정(YAML/config)으로 동작을 바꾸고 싶은가?** + → Registry + 문자열 키 패턴. + +5. **누가 조립하는가?** + → 조립자는 하나여야 한다. 컴포넌트 안에 조립 로직이 있으면 안 된다. + +--- + +## 슬라이드 13 — 안티패턴: 이것은 하지 말자 + +```python +# ❌ 서비스 로케이터 (컴포넌트 안에서 registry 호출) +class BadComponent(ComponentBase): + def run(self, env, nbytes): + router = ComponentRegistry.get("router") # 컴포넌트가 직접 찾는다 + ... + +# ❌ 전역 싱글톤 직접 참조 +class BadComponent(ComponentBase): + def run(self, env, nbytes): + router = GlobalRouter.instance() # 교체 불가 + ... + +# ❌ 생성자 안에서 의존성 생성 +class BadComponent(ComponentBase): + def __init__(self, node): + self.router = PathRouter(node.graph) # 테스트에서 격리 불가 +``` + +**공통 문제:** 컴포넌트가 자기 의존성을 스스로 해결한다 → 결합도 증가 + +--- + +## 슬라이드 14 — 요약 + +> **DI = 의존성의 생성과 사용을 분리하는 것** + +``` +생성 → Registry / Assembler (GraphEngine) +사용 → Component (IoCpuComponent, TwoDMeshNocComponent, ...) +``` + +**kernbench에서 배운 패턴 3가지:** + +1. **Constructor Injection** — 필수 의존성은 생성자로 +2. **Context Object** — 의존성 묶음을 하나의 dataclass로 +3. 
**Registry + Override** — 문자열 키로 구현체 선택, 테스트에서 교체 + +**결과:** 141개 테스트, YAML 한 줄로 컴포넌트 교체, 프로덕션 코드 수정 없이 Mock 주입 + +--- + +*참고 코드: kernbench/src/kernbench/components/* diff --git a/docs/diagrams/README.md b/docs/diagrams/README.md new file mode 100644 index 0000000..d151243 --- /dev/null +++ b/docs/diagrams/README.md @@ -0,0 +1,26 @@ +# Generated Diagrams + +This directory contains diagrams generated from topology compilation. + +## What these files are +- Derived artifacts generated from: + - compiled topology graph + - distance (accumulated latency) metadata + - view/layout rules (ADR-0005) + +These files are meant for quick visual inspection and review. + +## Default outputs +- SIP view: `sip_view.mmd` (and/or `sip_view.dot`) +- CUBE view: `cube_view.mmd` (and/or `cube_view.dot`) +- PE view: `pe_view.mmd` (and/or `pe_view.dot`) + +## How to preview +- In VS Code: + - open `.mmd` or `.md` containing Mermaid blocks and use Markdown Preview + - for `.dot`, use a Graphviz preview extension or `dot -Tpng` + +## Notes +- Diagrams are representative and distance-aware by default. +- Instance indices are not required unless debugging asymmetry. +- Outputs should be deterministic for the same topology and rules. 
diff --git a/docs/diagrams/cube_view.svg b/docs/diagrams/cube_view.svg new file mode 100644 index 0000000..ebf8c05 --- /dev/null +++ b/docs/diagrams/cube_view.svg @@ -0,0 +1,156 @@ + + cube + + CUBE VIEW + + + HBM + + 6.0mm 256GB/s + + + + 6.0mm 256GB/s + + + + 6.0mm 256GB/s + + + + 6.0mm 256GB/s + + + + 6.0mm 256GB/s + + + + 6.0mm 256GB/s + + + + 6.0mm 256GB/s + + + + 6.0mm 256GB/s + + + + 2.5mm 256GB/s + + 2.5mm 256GB/s + + 2.5mm 256GB/s + + 2.5mm 256GB/s + + 2.5mm 256GB/s + + 2.5mm 256GB/s + + 2.5mm 256GB/s + + 2.5mm 256GB/s + + 2.0mm 128GB/s + + 2.0mm 128GB/s + + 10.0mm 128GB/s + + 10.0mm 128GB/s + + 2.0mm 128GB/s + + 2.0mm 128GB/s + + 2.0mm 128GB/s + + 2.0mm 128GB/s + + 10.0mm 128GB/s + + 10.0mm 128GB/s + + 2.0mm 128GB/s + + 2.0mm 128GB/s + + 3.0mm 512GB/s + + 3.0mm 512GB/s + + 3.0mm 512GB/s + + 3.0mm 512GB/s + + 3.0mm 512GB/s + + 3.0mm 512GB/s + + 3.0mm 512GB/s + + 3.0mm 512GB/s + + + + + + + + + + UCIe-N + + UCIe-S + + UCIe-E + + UCIe-W + + NOC + + M CPU + + HBM CTRL + + SRAM + + Bridge LEFT + + Bridge RIGHT + + PE0 + + XBAR PE0 + + PE1 + + XBAR PE1 + + PE2 + + XBAR PE2 + + PE3 + + XBAR PE3 + + PE4 + + XBAR PE4 + + PE5 + + XBAR PE5 + + PE6 + + XBAR PE6 + + PE7 + + XBAR PE7 + \ No newline at end of file diff --git a/docs/diagrams/pe_view.svg b/docs/diagrams/pe_view.svg new file mode 100644 index 0000000..6142e2f --- /dev/null +++ b/docs/diagrams/pe_view.svg @@ -0,0 +1,31 @@ + + pe + + PE VIEW + + 0.5mm + + 0.5mm + + 0.5mm + + 0.5mm + + 0.5mm 512GB/s + + 0.5mm 512GB/s + + 0.5mm 512GB/s + + PE CPU + + PE SCHEDULER + + PE DMA + + PE GEMM + + PE MATH + + PE TCM + \ No newline at end of file diff --git a/docs/diagrams/placement_column_wise.svg b/docs/diagrams/placement_column_wise.svg new file mode 100644 index 0000000..ffae2ae --- /dev/null +++ b/docs/diagrams/placement_column_wise.svg @@ -0,0 +1,72 @@ + + +Placement: column_wise +Tensor (1024×512) fp16 → K axis split into 8 parts +← K=512 → +↑ M=1024 ↓ + +PE0 +(1024×64) + +PE1 +(1024×64) + +PE2 +(1024×64) + +PE3 
+(1024×64) + +PE4 +(1024×64) + +PE5 +(1024×64) + +PE6 +(1024×64) + +PE7 +(1024×64) + +off=0 B +128 KB +off=128 KB +128 KB +off=256 KB +128 KB +off=384 KB +128 KB +off=512 KB +128 KB +off=640 KB +128 KB +off=768 KB +128 KB +off=896 KB +128 KB +PE Legend + +PE0 + +PE1 + +PE2 + +PE3 + +PE4 + +PE5 + +PE6 + +PE7 + +Strategy: column_wise +Split axis: K +Shards: 8 +Each: (1024, 64) +Each: 128 KB +Total: 1 MB + diff --git a/docs/diagrams/placement_replicate.svg b/docs/diagrams/placement_replicate.svg new file mode 100644 index 0000000..09115f5 --- /dev/null +++ b/docs/diagrams/placement_replicate.svg @@ -0,0 +1,47 @@ + + +Placement: replicate +Tensor (1024×512) fp16 → full copy to each PE + +PE0 +(1024×512) +1 MB +offset=0 + +PE1 +(1024×512) +1 MB +offset=0 + +PE2 +(1024×512) +1 MB +offset=0 + +PE3 +(1024×512) +1 MB +offset=0 + +PE4 +(1024×512) +1 MB +offset=0 + +PE5 +(1024×512) +1 MB +offset=0 + +PE6 +(1024×512) +1 MB +offset=0 + +PE7 +(1024×512) +1 MB +offset=0 + +Strategy: replicate | Shards: 8 | Each: 1 MB | Total mem: 8 MB + diff --git a/docs/diagrams/placement_row_wise.svg b/docs/diagrams/placement_row_wise.svg new file mode 100644 index 0000000..2750eb8 --- /dev/null +++ b/docs/diagrams/placement_row_wise.svg @@ -0,0 +1,72 @@ + + +Placement: row_wise +Tensor (1024×512) fp16 → M axis split into 8 parts +← K=512 → +↑ M=1024 ↓ + +PE0 +(128×512) + +PE1 +(128×512) + +PE2 +(128×512) + +PE3 +(128×512) + +PE4 +(128×512) + +PE5 +(128×512) + +PE6 +(128×512) + +PE7 +(128×512) + +off=0 B +128 KB +off=128 KB +128 KB +off=256 KB +128 KB +off=384 KB +128 KB +off=512 KB +128 KB +off=640 KB +128 KB +off=768 KB +128 KB +off=896 KB +128 KB +PE Legend + +PE0 + +PE1 + +PE2 + +PE3 + +PE4 + +PE5 + +PE6 + +PE7 + +Strategy: row_wise +Split axis: M +Shards: 8 +Each: (128, 512) +Each: 128 KB +Total: 1 MB + diff --git a/docs/diagrams/placement_tiled_column_major.svg b/docs/diagrams/placement_tiled_column_major.svg new file mode 100644 index 0000000..f94b6b9 --- /dev/null +++ 
b/docs/diagrams/placement_tiled_column_major.svg @@ -0,0 +1,116 @@ + + +Placement: tiled_column_major +Tensor (1024×512) fp16, tile=(256×128) → 4×4=16 tiles, column-major (K first) +← K=512 → +↑ M=1024 ↓ + +PE0 +t0 + +PE1 +t1 + +PE2 +t2 + +PE3 +t3 + +PE4 +t4 + +PE5 +t5 + +PE6 +t6 + +PE7 +t7 + +PE0 +t8 + +PE1 +t9 + +PE2 +t10 + +PE3 +t11 + +PE4 +t12 + +PE5 +t13 + +PE6 +t14 + +PE7 +t15 + +k=0..127 +k=128..255 +k=256..383 +k=384..511 +m=0..255 +m=256..511 +m=512..767 +m=768..1023 +PE Legend + +PE0 + +PE1 + +PE2 + +PE3 + +PE4 + +PE5 + +PE6 + +PE7 +Tile Assignment Order + +t 0 → PE0 (0,0) off=0 B + +t 1 → PE1 (0,1) off=256 B + +t 2 → PE2 (0,2) off=512 B + +t 3 → PE3 (0,3) off=768 B + +t 4 → PE4 (1,0) off=256 KB + +t 5 → PE5 (1,1) off=256 KB + +t 6 → PE6 (1,2) off=256 KB + +t 7 → PE7 (1,3) off=256 KB + +t 8 → PE0 (2,0) off=512 KB + +t 9 → PE1 (2,1) off=512 KB + +t10 → PE2 (2,2) off=512 KB + +t11 → PE3 (2,3) off=512 KB + +t12 → PE4 (3,0) off=768 KB + +t13 → PE5 (3,1) off=768 KB + +t14 → PE6 (3,2) off=768 KB + +t15 → PE7 (3,3) off=768 KB + +Strategy: tiled_column_major | Tile: (256×128)=64 KB | Tiles: 16 | Total: 1 MB + diff --git a/docs/diagrams/placement_tiled_row_major.svg b/docs/diagrams/placement_tiled_row_major.svg new file mode 100644 index 0000000..346df72 --- /dev/null +++ b/docs/diagrams/placement_tiled_row_major.svg @@ -0,0 +1,116 @@ + + +Placement: tiled_row_major +Tensor (1024×512) fp16, tile=(256×128) → 4×4=16 tiles, row-major (M first) +← K=512 → +↑ M=1024 ↓ + +PE0 +t0 + +PE1 +t1 + +PE2 +t2 + +PE3 +t3 + +PE4 +t4 + +PE5 +t5 + +PE6 +t6 + +PE7 +t7 + +PE0 +t8 + +PE1 +t9 + +PE2 +t10 + +PE3 +t11 + +PE4 +t12 + +PE5 +t13 + +PE6 +t14 + +PE7 +t15 + +k=0..127 +k=128..255 +k=256..383 +k=384..511 +m=0..255 +m=256..511 +m=512..767 +m=768..1023 +PE Legend + +PE0 + +PE1 + +PE2 + +PE3 + +PE4 + +PE5 + +PE6 + +PE7 +Tile Assignment Order + +t 0 → PE0 (0,0) off=0 B + +t 1 → PE1 (1,0) off=256 KB + +t 2 → PE2 (2,0) off=512 KB + +t 3 → PE3 (3,0) off=768 KB + +t 4 → PE4 (0,1) off=256 
B + +t 5 → PE5 (1,1) off=256 KB + +t 6 → PE6 (2,1) off=512 KB + +t 7 → PE7 (3,1) off=768 KB + +t 8 → PE0 (0,2) off=512 B + +t 9 → PE1 (1,2) off=256 KB + +t10 → PE2 (2,2) off=512 KB + +t11 → PE3 (3,2) off=768 KB + +t12 → PE4 (0,3) off=768 B + +t13 → PE5 (1,3) off=256 KB + +t14 → PE6 (2,3) off=512 KB + +t15 → PE7 (3,3) off=768 KB + +Strategy: tiled_row_major | Tile: (256×128)=64 KB | Tiles: 16 | Total: 1 MB + diff --git a/docs/diagrams/sip_view.svg b/docs/diagrams/sip_view.svg new file mode 100644 index 0000000..c1faf21 --- /dev/null +++ b/docs/diagrams/sip_view.svg @@ -0,0 +1,95 @@ + + sip + + SIP VIEW + + 1.0mm 512GB/s + + 1.0mm 512GB/s + + 1.0mm 512GB/s + + 1.0mm 512GB/s + + 1.0mm 512GB/s + + 1.0mm 512GB/s + + 1.0mm 512GB/s + + 1.0mm 512GB/s + + 1.0mm 512GB/s + + 1.0mm 512GB/s + + 1.0mm 512GB/s + + 1.0mm 512GB/s + + 1.0mm 512GB/s + + 1.0mm 512GB/s + + 1.0mm 512GB/s + + 1.0mm 512GB/s + + 1.0mm 512GB/s + + 1.0mm 512GB/s + + 1.0mm 512GB/s + + 1.0mm 512GB/s + + 1.0mm 512GB/s + + 1.0mm 512GB/s + + 1.0mm 512GB/s + + 1.0mm 512GB/s + + 3.5mm 512GB/s + + 3.5mm 512GB/s + + 3.5mm 512GB/s + + 3.5mm 512GB/s + + CUBE (0,0) + + CUBE (1,0) + + CUBE (2,0) + + CUBE (3,0) + + CUBE (0,1) + + CUBE (1,1) + + CUBE (2,1) + + CUBE (3,1) + + CUBE (0,2) + + CUBE (1,2) + + CUBE (2,2) + + CUBE (3,2) + + CUBE (0,3) + + CUBE (1,3) + + CUBE (2,3) + + CUBE (3,3) + + IO io0 + \ No newline at end of file diff --git a/docs/diagrams/system_view.svg b/docs/diagrams/system_view.svg new file mode 100644 index 0000000..fa7102d --- /dev/null +++ b/docs/diagrams/system_view.svg @@ -0,0 +1,19 @@ + + system + + SYSTEM VIEW + + 20.0mm 256GB/s + + 20.0mm 256GB/s + + Fabric Switch + + SIP 0 + + IO io0 + + SIP 1 + + IO io0 + \ No newline at end of file diff --git a/docs/latency-model.md b/docs/latency-model.md new file mode 100644 index 0000000..3f3cb07 --- /dev/null +++ b/docs/latency-model.md @@ -0,0 +1,381 @@ +# Latency Model + +## Overview + +kernbench uses a discrete-event simulation (SimPy) to compute 
end-to-end latency. +Every request flows through a graph of **components** connected by **wires**. +The total latency reported is the **actual SimPy simulated time** (`env.now` delta), +not a static formula—so contention and queueing are captured automatically. + +``` +total_ns (actual) = wire_prop + component_overhead + drain + queueing + ├── deterministic ──────────────────┘ │ + └── contention-dependent ────────────────────┘ +``` + +## Three Deterministic Cost Components + +### 1. Wire Propagation + +``` +wire_ns = distance_mm × ns_per_mm (global: 0.01 = 10 ps/mm) +``` + +Every edge in the topology graph has a `distance_mm`. A SimPy wire process +delays each message by `wire_ns` before delivering it to the next component. +For on-chip silicon this is ~10 ps/mm; the same constant applies everywhere +since all links are on-die or interposer. Wire propagation is typically <1 ns +and negligible compared to other costs. + +### 2. Component Overhead (`overhead_ns`) + +``` +component_ns = node.attrs["overhead_ns"] +``` + +Each component on the path adds a fixed processing delay via `yield env.timeout(overhead_ns)`. +This models arbitration, protocol processing, pipeline stages, etc. + +| Component | overhead_ns | Meaning | +|-----------|-------------|---------| +| pcie_ep | 5.0 | PCIe protocol processing | +| io_cpu | 10.0 | Command decode / dispatch | +| m_cpu | 5.0 | DMA scheduling | +| fabric switch | 5.0 | Packet arbitration | +| xbar | 2.0 | Crossbar arbitration | +| xbar bridge | 1.0 | Bridge traversal between xbar halves | +| ucie | 1.0 | UCIe protocol overhead per port | +| noc (2D mesh) | 0.0 | Hop delay modeled internally via manhattan distance | +| hbm_ctrl | 0.0 | Access time captured in drain_ns | +| pe_cpu | 2.0 | Command dispatch | +| pe_scheduler | 1.0 | PE-internal scheduling | +| pe_gemm/math | 0.0 | Placeholder; will use flops-based model | + +### 3.
Drain (Serialization Delay) + +``` +drain_ns = nbytes / bottleneck_bw_gbs +``` + +**Wormhole (cut-through) model**: data flows through intermediate nodes as a +pipeline. Serialization cost is paid **once** at the terminal node, not at +every hop. The bottleneck is the minimum `bw_gbs` across all edges in the path. + +Example: 4096 bytes through a path with bottleneck 128 GB/s → `4096 / 128 = 32.0 ns`. + +### Formula (Theoretical Lower Bound) + +``` +formula_ns = Σ(wire_prop) + Σ(overhead_ns) + drain_ns +``` + +This is the latency with **zero contention**—no other request competing for +any resource. The engine provides `_formula_latency()` for verification. +With no contention: `actual == formula`. With contention: `actual > formula`. + +### Diagram: PE DMA Read (pe0 → local slice0, 4096 bytes) + +```mermaid +sequenceDiagram + participant D as pe_dma + participant X as xbar.pe0 + participant H as hbm_ctrl.slice0 + + D->>X: txn (4096B) + Note over X: overhead 2.0 ns + X->>H: txn (wire 0.025 ns) + Note over H: acquire Resource + Note over H: overhead 0 ns + Note over H: drain 4096/256 = 16.0 ns + Note over H: release Resource + H-->>D: done.succeed() + + Note over D,H: total_ns = 18.09 ns
formula = wire(0.025) + ovhd(2.0) + drain(16.0) = 18.025 ns
actual ≈ formula (no contention) +``` + +### Diagram: Two Requests — No Contention vs HOL Blocking + +#### Case 1: Different slices (parallel, no contention) + +```mermaid +sequenceDiagram + participant A as Request A + participant S0 as hbm_ctrl.slice0
Resource(cap=1) + participant S1 as hbm_ctrl.slice1
Resource(cap=1) + + Note over A,S1: t=2 ns — both requests arrive at their own slice + A->>S0: A (4KB) + A->>S1: B (4KB) + Note over S0: acquire (immediate) + Note over S1: acquire (immediate) + Note over S0: drain 16.0 ns + Note over S1: drain 16.0 ns + Note over S0: t=18 release + Note over S1: t=18 release + + Note over A,S1: A actual = 18 ns, B actual = 18 ns
No waiting — separate Resources +``` + +#### Case 2: Same slice (HOL blocking) + +```mermaid +sequenceDiagram + participant A as Request A (4KB) + participant Q as hbm_ctrl.slice0
Resource(cap=1) + participant B as Request B (64B) + + Note over A,B: t=0 — A arrives first + A->>Q: acquire (immediate) + Note over Q: drain A = 16.0 ns + + Note over B,Q: t=5 — B arrives, yield req → BLOCKED + B--xQ: waiting... + + Note over Q: t=16 — A drain done, release + Q->>B: B acquires resource + Note over Q: drain B = 0.25 ns + Note over Q: t=16.25 — B done, release + + Note over A,B: A actual = 16.0 ns (== formula)
B actual = 11.25 ns (formula 0.25 + queueing 11.0)
HOL blocking: short request waits behind long drain +``` + +--- + +## How SimPy Tracks Latency + +### Measurement + +```python +start_ns = env.now +yield txn_done # wait for the transaction to complete +total_ns = env.now - start_ns # ← this is what probe reports +``` + +`env.now` is SimPy's simulation clock. It only advances when a process `yield`s +a timeout or waits on a resource/store. The delta between start and done captures +**everything**: wire delays, component overheads, drain, and any queueing. + +### Component Pipeline + +Each component is a SimPy process: + +``` +_fan_in (per in_port) → _inbox (Store) → _worker → out_ports +``` + +1. **`_fan_in`**: relays messages from each `in_port` into a shared `_inbox` Store. +2. **`_worker`**: pulls from `_inbox`, spawns `_forward_txn` per message. +3. **`_forward_txn`**: calls `run()` (overhead), then puts to `out_ports[next_hop]`. + +The worker uses `env.process()` (pipeline model), so multiple messages can be +in-flight through the same component concurrently. Contention happens when +they compete for shared resources (e.g., `simpy.Resource` in hbm_ctrl). + +### Wire Process + +```python +while True: + msg = yield out_port.get() # wait for sender + yield env.timeout(prop_ns) # propagation delay + yield in_port.put(msg) # deliver to receiver +``` + +Each directed edge has its own wire process. Messages are delayed by exactly +`distance_mm × ns_per_mm`. + +--- + +## Contention and Queueing + +Queueing delay is **not a separate formula term**—it emerges from SimPy's +event scheduling when multiple requests compete for the same resource. 
+ +### Where Contention Occurs + +| Resource | SimPy Type | Capacity | Effect | +|----------|-----------|----------|--------| +| hbm_ctrl | `simpy.Resource` | 1 | Serializes HBM access | +| m_cpu DMA read engine | `simpy.Resource` | 1 | Serializes DMA reads | +| m_cpu DMA write engine | `simpy.Resource` | 1 | Serializes DMA writes | +| pe_dma channels | `simpy.Resource` | configurable | Serializes PE DMA ops | +| component inbox | `simpy.Store` | unbounded | No backpressure (FIFO) | + +### How Queueing Works + +```python +# hbm_ctrl._worker +with self._resource.request() as req: + yield req # ← BLOCKS if resource is occupied + yield from self.run(env, txn.nbytes) + yield env.timeout(drain_ns) +``` + +If request A holds the resource and request B arrives: +- B's `yield req` blocks until A releases the resource +- The shared simulation clock (`env.now`) advances by A's remaining service time before B resumes +- This "extra" time shows up in B's `total_ns` automatically + +``` +No contention: actual_ns == formula_ns +Contention: actual_ns > formula_ns + queueing_delay = actual_ns - formula_ns +``` + +### Head-of-Line (HOL) Blocking at hbm_ctrl + +The `simpy.Resource` is held for the **entire** `with` block—both overhead and +drain.
The resource is NOT released between overhead and drain: + +```python +with self._resource.request() as req: + yield req # acquire (or wait) + yield from self.run(env, txn.nbytes) # overhead_ns ─┐ + yield env.timeout(drain_ns) # drain_ns │ resource held +# ← resource released here ───────────────────────────────┘ +``` + +This means a short request arriving during a long request's drain must wait +for the full remaining drain time—classic head-of-line blocking: + +``` +Request A: 4 KB, drain = 16.0 ns (arrives at t=0) +Request B: 64 B, drain = 0.25 ns (arrives at t=5) + +Timeline: + t=0.00 A acquires resource + t=0.00 A: overhead (0 ns) + t=0.00 A: drain starts (16.0 ns) + t=5.00 B arrives → yield req → BLOCKED (A holds resource) + t=16.00 A: drain done → resource released + t=16.00 B acquires resource + t=16.00 B: overhead (0 ns) + t=16.25 B: drain done → resource released + + B actual = 11.25 ns (waited 11.0 + own 0.25) + B formula = 0.25 ns + B queueing = 11.0 ns ← HOL blocking penalty +``` + +**Why this is physically realistic**: An HBM channel processes one burst at a +time. While data is being serialized onto the channel (drain), no other request +can use that channel. The FIFO ordering (`simpy.Resource` default) reflects +the simplest controller scheduling policy. + +**Alternative: priority scheduling**: If needed, `simpy.PriorityResource` can +prioritize shorter requests (Shortest Job First), but this is not currently +used since FIFO matches typical HBM controller behavior. + +--- + +## Worked Example: Two Concurrent PE DMA Reads + +Setup: PE0 and PE1 in cube0 both read 4096 bytes from their local HBM slices +(slice0 and slice1), submitted to the **same engine** at the same time. 
+ +### Paths + +``` +DMA A: pe0.pe_dma → xbar.pe0 → hbm_ctrl.slice0 +DMA B: pe1.pe_dma → xbar.pe1 → hbm_ctrl.slice1 +``` + +### No Contention (different HBM slices) + +Since slice0 and slice1 are **separate** hbm_ctrl instances, each with its own +`simpy.Resource(capacity=1)`, there is no resource competition. + +``` +DMA A timeline: + t=0.00 pe_dma dequeues txn + t=0.00 xbar.pe0: overhead_ns=2.0 → t=2.00 + t=2.025 wire prop (2.5mm × 0.01) → t=2.025 + t=2.025 hbm_ctrl.slice0: yield req → immediate (no contention) + t=2.025 hbm_ctrl.slice0: overhead_ns=0 → t=2.025 + t=18.025 drain_ns = 4096/256 = 16.0 → t=18.025 + t=18.025 done + +DMA B timeline: (identical, on its own slice) + t=0.00 → ... → t=18.09 done +``` + +Both complete at ~18.09 ns. `actual == formula` for both. + +### With Contention (same HBM slice) + +Now suppose both PE0 and PE1 read from **slice0**: + +``` +DMA A: pe0.pe_dma → xbar.pe0 → hbm_ctrl.slice0 +DMA B: pe1.pe_dma → xbar.pe1 → xbar.pe0 → hbm_ctrl.slice0 + (chain traversal to reach slice0) +``` + +``` +DMA A timeline: + t=0.00 xbar.pe0(2.0) → wire → hbm_ctrl.slice0 + t=2.025 yield req → immediate (first to arrive) + t=18.025 drain 16.0 → release resource → done + actual_A = 18.025 ns (== formula) + +DMA B timeline: + t=0.00 xbar.pe1(2.0) → xbar.pe0(2.0) → wire → hbm_ctrl.slice0 + t=4.035 yield req → BLOCKED (A holds resource until t=18.025) + t=18.025 acquire resource + t=50.025 drain 32.0 → release → done + actual_B = 50.025 ns + + formula_B = wire(0.035) + overhead(4.0) + drain(32.0) = 36.035 ns + B's drain is 32.0 ns, not 16.0 ns, because drain uses the bottleneck BW of B's path (128 GB/s) + while A's path has BW 256 GB/s: + + B's bottleneck: xbar_x_bw = 128 GB/s → drain = 4096/128 = 32.0 ns + formula_B = 0.035 + 4.0 + 32.0 = 36.035 ns + actual_B = 36.035 + queueing = 50.025 ns + queueing = 13.99 ns (t=4.035 → t=18.025, waiting for A to release hbm_ctrl) +``` + +The key insight: **queueing delay is not in the formula**.
It only appears in +the actual SimPy simulation when resources are contested. The probe reports +`actual_ns`, which includes all queueing. To see pure queueing overhead, +compare `actual_ns` vs `formula_ns` (available in PE DMA traces). + +--- + +## Probe Output Explained + +``` +=== PE DMA Latency === +Case Target Actual Ovhd Drain Wire Ovhd% Drain% Eff.BW BN.BW Util% +pe-local-hbm c0.pe0->c0.slice0 18.09 2.0 16.0 0.08 11.1% 88.5% 226.49 256.0 88.5% +pe-cross-half-hbm c0.pe0->c0.slice4 37.14 5.0 32.0 0.14 13.5% 86.1% 110.27 128.0 86.1% +``` + +| Column | Meaning | +|--------|---------| +| **Actual** | SimPy measured `env.now` delta (includes contention if any) | +| **Ovhd** | Sum of `overhead_ns` for all components on the forward path | +| **Drain** | `nbytes / bottleneck_bw` — serialization at terminal | +| **Wire** | Sum of `distance_mm × ns_per_mm` for all edges | +| **Ovhd%** | `Ovhd / Actual × 100` — fraction of time spent in component processing | +| **Drain%** | `Drain / Actual × 100` — fraction of time spent in data transfer | +| **Eff.BW** | `nbytes / Actual` — achieved bandwidth | +| **BN.BW** | Bottleneck bandwidth (min `bw_gbs` on path) | +| **Util%** | `Eff.BW / BN.BW × 100` — how close to theoretical max BW | + +### Why Util% < 100% + +`Util% = Drain% = drain_ns / actual_ns`. The gap from 100% is the overhead +fraction. For small transfers (4KB), overhead is significant relative to drain. +For large transfers, drain dominates and utilization approaches 100%. + +``` + 4 KB: Ovhd=2.0, Drain=16.0 → Util=88.5% (overhead is 11% of time) + 64 KB: Ovhd=2.0, Drain=256.0 → Util=99.2% (overhead is <1% of time) +``` + +### H2D Path: Why Ovhd% is ~40% + +H2D traverses many components (pcie_ep → io_cpu → ucie → noc → m_cpu → noc → +xbar → hbm_ctrl + response path). Total forward overhead is ~23 ns vs drain +of 32 ns for 4KB, so overhead is comparable to data transfer time—resulting +in ~55% utilization. This is expected for small command-path transfers. 
diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..579aa33 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,30 @@ +[build-system] +requires = ["setuptools>=68", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "kernbench" +version = "0.1.0" +requires-python = ">=3.10" +dependencies = ["pytest", "simpy", "pyyaml"] + +[project.scripts] +kernbench = "kernbench.cli.main:main" + +[project.optional-dependencies] +dev = [ + "pytest>=7", + "ruff>=0.4.0", +] + +[tool.ruff] +line-length = 100 +target-version = "py310" +fix = false + +[tool.ruff.lint] +select = ["E", "F", "I", "B", "UP"] +ignore = ["F401"] + +[tool.pytest.ini_options] +addopts = ["--disable-warnings"] diff --git a/scripts/gen_placement_diagrams.py b/scripts/gen_placement_diagrams.py new file mode 100644 index 0000000..81036bc --- /dev/null +++ b/scripts/gen_placement_diagrams.py @@ -0,0 +1,393 @@ +#!/usr/bin/env python3 +"""Generate SVG diagrams illustrating each placement strategy. + +Example tensor: (M=1024, K=512) fp16 (itemsize=2), 8 PEs. +Tiled variants use tile_m=256, tile_k=128. 
# ── Diagram parameters ──────────────────────────────────────────────
# Example tensor and tiling shared by every diagram in this script.
M, K = 1024, 512
ITEMSIZE = 2  # bytes per element (fp16)
NUM_PE = 8
TILE_M, TILE_K = 256, 128

# Fill colour per PE, indexed by PE id.
PE_COLORS = [
    "#3b82f6",  # PE0 blue
    "#10b981",  # PE1 emerald
    "#f59e0b",  # PE2 amber
    "#ef4444",  # PE3 red
    "#8b5cf6",  # PE4 violet
    "#ec4899",  # PE5 pink
    "#06b6d4",  # PE6 cyan
    "#f97316",  # PE7 orange
]
# Label colour drawn on top of the matching PE_COLORS entry.
PE_TEXT_COLORS = [
    "#fff", "#fff", "#000", "#fff",
    "#fff", "#fff", "#000", "#fff",
]

OUT_DIR = Path(__file__).parent.parent / "docs" / "diagrams"

# ── SVG helpers ─────────────────────────────────────────────────────
# NOTE(review): the element markup in these helpers was missing (each helper
# returned only newlines and ignored its arguments). The tags below restore a
# working SVG; exact styling attributes are a best guess — regenerate and diff
# against the checked-in docs/diagrams/placement_*.svg before committing.


def _svg_header(w: int, h: int, title: str) -> str:
    """Return the opening ``<svg>`` tag, a background rect and a title line."""
    return (
        f'<svg xmlns="http://www.w3.org/2000/svg" width="{w}" height="{h}" '
        f'viewBox="0 0 {w} {h}" font-family="sans-serif">\n'
        f'<rect x="0" y="0" width="{w}" height="{h}" fill="#f8fafc"/>\n'
        f'<text x="{w / 2}" y="30" font-size="16" text-anchor="middle" '
        f'font-weight="bold" fill="#0f172a">{title}</text>\n'
    )


def _svg_footer() -> str:
    """Return the closing ``</svg>`` tag."""
    return "</svg>\n"


def _rect(x: float, y: float, w: float, h: float, fill: str,
          stroke: str = "#334155", sw: float = 1.0, opacity: float = 1.0) -> str:
    """Return one ``<rect>`` element with the given geometry and paint."""
    return (
        f'<rect x="{x}" y="{y}" width="{w}" height="{h}" fill="{fill}" '
        f'stroke="{stroke}" stroke-width="{sw}" opacity="{opacity}"/>\n'
    )


def _text(x: float, y: float, txt: str, size: int = 11,
          anchor: str = "middle", fill: str = "#1e293b",
          weight: str = "normal") -> str:
    """Return one ``<text>`` element; *txt* is inserted verbatim (not escaped)."""
    return (
        f'<text x="{x}" y="{y}" font-size="{size}" text-anchor="{anchor}" '
        f'fill="{fill}" font-weight="{weight}">{txt}</text>\n'
    )


def _line(x1: float, y1: float, x2: float, y2: float,
          stroke: str = "#94a3b8", sw: float = 1) -> str:
    """Return one ``<line>`` element from (x1, y1) to (x2, y2)."""
    return (
        f'<line x1="{x1}" y1="{y1}" x2="{x2}" y2="{y2}" '
        f'stroke="{stroke}" stroke-width="{sw}"/>\n'
    )


def _format_bytes(n: int) -> str:
    """Format a byte count as ``B``, ``KB`` or ``MB`` (binary units).

    Whole multiples print as integers ("128 KB"). Other values print with up
    to two decimals ("256.25 KB") instead of being truncated, so distinct
    offsets inside the same KB/MB no longer collapse to the same label.
    """
    for shift, unit in ((20, "MB"), (10, "KB")):
        scale = 1 << shift
        if n >= scale:
            whole, rem = divmod(n, scale)
            if rem == 0:
                return f"{whole} {unit}"
            value = f"{n / scale:.2f}".rstrip("0").rstrip(".")
            return f"{value} {unit}"
    return f"{n} B"


def _legend(x: float, y0: float, num_pe: int = NUM_PE) -> str:
    """Return a vertical colour legend with one swatch + label per PE."""
    parts = [_text(x + 50, y0, "PE Legend", size=12, weight="bold")]
    for i in range(num_pe):
        ly = y0 + 18 + i * 22
        parts.append(_rect(x, ly - 12, 16, 16, PE_COLORS[i]))
        parts.append(_text(x + 22, ly, f"PE{i}", size=11, anchor="start"))
    return "".join(parts)


def _axes(gx: float, gy: float, gw: float, gh: float,
          m_label: str = "M=1024", k_label: str = "K=512") -> str:
    """Return the K (horizontal) and M (vertical) axis captions for a grid."""
    # K axis caption, centred above the grid.
    s = _text(gx + gw / 2, gy - 8, f"← {k_label} →", size=11, fill="#475569")
    # M axis caption, left of the grid, rotated to read along the vertical edge.
    mx = gx - 12
    my = gy + gh / 2
    s += (
        f'<text x="{mx}" y="{my}" font-size="11" text-anchor="middle" '
        f'fill="#475569" transform="rotate(-90 {mx} {my})">'
        f'↑ {m_label} ↓</text>\n'
    )
    return s


def _info_box(x: float, y: float, lines: list[str]) -> str:
    """Return a filled box sized to *lines* with one text row per entry.

    Width is estimated at 7 px per character of the longest line. An empty
    *lines* list yields an empty string instead of raising.
    """
    if not lines:
        return ""
    bw = max(len(line) for line in lines) * 7 + 20
    bh = len(lines) * 18 + 12
    parts = [_rect(x, y, bw, bh, "#e2e8f0", stroke="#94a3b8", sw=1)]
    for i, line in enumerate(lines):
        parts.append(
            _text(x + 10, y + 18 + i * 18, line, size=10, anchor="start", fill="#334155")
        )
    return "".join(parts)


# ── Grid drawing ────────────────────────────────────────────────────

def _draw_grid(
    gx: float, gy: float, gw: float, gh: float,
    cells: list[dict],  # [{row, col, rspan, cspan, pe, label?, offset?}]
    rows: int, cols: int,
    cell_labels: bool = True,
) -> str:
    """Draw a grid of coloured cells representing shard placement.

    Each cell dict needs ``row``, ``col`` and ``pe``; ``rspan``/``cspan``
    default to 1. ``label`` is drawn under the PE name when *cell_labels*
    is true. A heavier outline around the whole grid is drawn last.
    """
    cw = gw / cols
    ch = gh / rows
    parts: list[str] = []
    for cell in cells:
        cx = gx + cell["col"] * cw
        cy = gy + cell["row"] * ch
        w = cell.get("cspan", 1) * cw
        h = cell.get("rspan", 1) * ch
        pe = cell["pe"]
        parts.append(_rect(cx, cy, w, h, PE_COLORS[pe], stroke="#334155", sw=1.5))
        # PE name centred in the cell, optional label just below it.
        lx = cx + w / 2
        ly = cy + h / 2
        parts.append(_text(lx, ly - 4, f"PE{pe}", size=12,
                           fill=PE_TEXT_COLORS[pe], weight="bold"))
        if cell_labels and "label" in cell:
            parts.append(_text(lx, ly + 12, cell["label"], size=9,
                               fill=PE_TEXT_COLORS[pe]))
    # Grid border
    parts.append(_rect(gx, gy, gw, gh, "none", stroke="#1e293b", sw=2))
    return "".join(parts)
def gen_column_wise() -> str:
    """Return the SVG for the ``column_wise`` placement diagram.

    The example tensor is split along K into ``NUM_PE`` equal column strips,
    one per PE. Each strip is captioned below the grid with ``off=`` (strip
    index times strip size in bytes) and the strip size.
    """
    W, H = 820, 500
    gx, gy, gw, gh = 80, 90, 480, 320
    chunk_k = K // NUM_PE  # 64
    chunk_bytes = M * chunk_k * ITEMSIZE

    s = _svg_header(W, H, "Placement: column_wise")
    s += _text(W // 2, 54, f"Tensor ({M}×{K}) fp16 → K axis split into {NUM_PE} parts",
               size=12, fill="#475569")
    s += _axes(gx, gy, gw, gh)

    cells = [
        {
            "row": 0, "col": i, "rspan": 1, "cspan": 1,
            "pe": i,
            "label": f"({M}×{chunk_k})",
        }
        for i in range(NUM_PE)
    ]
    s += _draw_grid(gx, gy, gw, gh, cells, rows=1, cols=NUM_PE)

    # Per-column captions under the grid.
    cw = gw / NUM_PE
    size_label = _format_bytes(chunk_bytes)
    for i in range(NUM_PE):
        cx = gx + i * cw + cw / 2
        off = i * chunk_bytes
        s += _text(cx, gy + gh + 16, f"off={_format_bytes(off)}", size=9, fill="#475569")
        s += _text(cx, gy + gh + 30, size_label, size=9, fill="#64748b")

    s += _legend(620, 100)
    s += _info_box(620, 320, [
        "Strategy: column_wise",
        "Split axis: K",
        f"Shards: {NUM_PE}",
        f"Each: ({M}, {chunk_k})",
        f"Each: {size_label}",
        f"Total: {_format_bytes(M * K * ITEMSIZE)}",
    ])
    s += _svg_footer()
    return s


def gen_row_wise() -> str:
    """Return the SVG for the ``row_wise`` placement diagram.

    The example tensor is split along M into ``NUM_PE`` equal row bands,
    one per PE. Each band is captioned to the right of the grid with
    ``off=`` (band index times band size in bytes) and the band size.
    """
    W, H = 820, 560
    gx, gy, gw, gh = 80, 90, 320, 400
    chunk_m = M // NUM_PE  # 128
    chunk_bytes = chunk_m * K * ITEMSIZE

    s = _svg_header(W, H, "Placement: row_wise")
    s += _text(W // 2, 54, f"Tensor ({M}×{K}) fp16 → M axis split into {NUM_PE} parts",
               size=12, fill="#475569")
    s += _axes(gx, gy, gw, gh)

    cells = [
        {
            "row": i, "col": 0, "rspan": 1, "cspan": 1,
            "pe": i,
            "label": f"({chunk_m}×{K})",
        }
        for i in range(NUM_PE)
    ]
    s += _draw_grid(gx, gy, gw, gh, cells, rows=NUM_PE, cols=1)

    # Per-row captions to the right of the grid.
    ch = gh / NUM_PE
    size_label = _format_bytes(chunk_bytes)
    for i in range(NUM_PE):
        cy = gy + i * ch + ch / 2
        off = i * chunk_bytes
        s += _text(gx + gw + 10, cy - 4, f"off={_format_bytes(off)}",
                   size=9, anchor="start", fill="#475569")
        s += _text(gx + gw + 10, cy + 10, size_label,
                   size=9, anchor="start", fill="#64748b")

    s += _legend(580, 100)
    s += _info_box(580, 320, [
        "Strategy: row_wise",
        "Split axis: M",
        f"Shards: {NUM_PE}",
        f"Each: ({chunk_m}, {K})",
        f"Each: {size_label}",
        f"Total: {_format_bytes(M * K * ITEMSIZE)}",
    ])
    s += _svg_footer()
    return s
def gen_replicate() -> str:
    """Return the SVG for the ``replicate`` placement diagram.

    Draws one box per PE in a 2-row by 4-column arrangement; every box shows
    the full tensor shape, its byte size and ``offset=0``.
    """
    W, H = 820, 500
    full_bytes = M * K * ITEMSIZE
    size_label = _format_bytes(full_bytes)

    # Layout: 8 copies in 2 rows × 4 cols, separated by a fixed gap.
    cols, rows = 4, 2
    margin_x, margin_y = 60, 90
    gap = 16
    bw = (700 - (cols - 1) * gap) / cols
    bh = (340 - (rows - 1) * gap) / rows

    pieces = [
        _svg_header(W, H, "Placement: replicate"),
        _text(W // 2, 54, f"Tensor ({M}×{K}) fp16 → full copy to each PE",
              size=12, fill="#475569"),
    ]
    for pe in range(NUM_PE):
        r, c = divmod(pe, cols)
        bx = margin_x + c * (bw + gap)
        by = margin_y + r * (bh + gap)
        mid_x = bx + bw / 2
        ink = PE_TEXT_COLORS[pe]
        pieces.append(_rect(bx, by, bw, bh, PE_COLORS[pe], stroke="#334155", sw=1.5))
        pieces.append(_text(mid_x, by + bh / 2 - 14, f"PE{pe}",
                            size=14, fill=ink, weight="bold"))
        pieces.append(_text(mid_x, by + bh / 2 + 6, f"({M}×{K})", size=11, fill=ink))
        pieces.append(_text(mid_x, by + bh / 2 + 22, f"{size_label}", size=10, fill=ink))
        pieces.append(_text(mid_x, by + bh / 2 + 36, "offset=0", size=9, fill=ink))

    pieces.append(_info_box(60, 450, [
        f"Strategy: replicate | Shards: {NUM_PE} | Each: {size_label}"
        f" | Total mem: {_format_bytes(full_bytes * NUM_PE)}",
    ]))
    pieces.append(_svg_footer())
    return "".join(pieces)
def gen_tiled(column_major: bool) -> str:
    """Return the SVG for a 2D tiled placement diagram.

    ``column_major=True`` renders ``tiled_column_major``; ``False`` renders
    ``tiled_row_major``. Tiles are numbered in traversal order and assigned
    to PEs round-robin (``idx % NUM_PE``). Each tile's ``offset`` is the byte
    offset of its top-left element in a row-contiguous (M, K) tensor.
    """
    name = "tiled_column_major" if column_major else "tiled_row_major"
    order = "column-major (K first)" if column_major else "row-major (M first)"

    tiles_m = M // TILE_M  # 4
    tiles_k = K // TILE_K  # 4
    total_tiles = tiles_m * tiles_k  # 16
    tile_bytes = TILE_M * TILE_K * ITEMSIZE
    row_bytes = K * ITEMSIZE

    W, H = 820, 620
    s = _svg_header(W, H, f"Placement: {name}")
    s += _text(W // 2, 54,
               f"Tensor ({M}×{K}) fp16, tile=({TILE_M}×{TILE_K}) → "
               f"{tiles_m}×{tiles_k}={total_tiles} tiles, {order}",
               size=11, fill="#475569")

    gx, gy, gw, gh = 80, 90, 400, 400
    s += _axes(gx, gy, gw, gh)

    # Tile traversal order. NOTE(review): "column_major" walks K fastest within
    # each M row, and "row_major" walks M fastest within each K column; the
    # original comment says this mirrors the placement implementation —
    # TODO confirm against that code, the naming is easy to read the other way.
    if column_major:
        traversal = [(mi, ki) for mi in range(tiles_m) for ki in range(tiles_k)]
    else:
        traversal = [(mi, ki) for ki in range(tiles_k) for mi in range(tiles_m)]

    # Build tile → PE mapping (cells end up ordered by idx, so no sort is needed).
    cells = [
        {
            "row": mi, "col": ki, "rspan": 1, "cspan": 1,
            "pe": idx % NUM_PE,
            "label": f"t{idx}",
            "offset": (mi * TILE_M * row_bytes) + (ki * TILE_K * ITEMSIZE),
            "idx": idx,
        }
        for idx, (mi, ki) in enumerate(traversal)
    ]

    s += _draw_grid(gx, gy, gw, gh, cells, rows=tiles_m, cols=tiles_k)

    # K-range captions below the grid.
    cw = gw / tiles_k
    for ki in range(tiles_k):
        cx = gx + ki * cw + cw / 2
        s += _text(cx, gy + gh + 16, f"k={ki * TILE_K}..{(ki + 1) * TILE_K - 1}",
                   size=9, fill="#475569")

    # M-range captions left of the grid.
    ch = gh / tiles_m
    for mi in range(tiles_m):
        cy = gy + mi * ch + ch / 2
        s += _text(gx - 16, cy, f"m={mi * TILE_M}..{(mi + 1) * TILE_M - 1}",
                   size=9, anchor="end", fill="#475569")

    s += _legend(540, 90)

    # Assignment table, one row per tile in traversal order.
    table_y = 310
    s += _text(540, table_y, "Tile Assignment Order", size=12, weight="bold")
    for i, c in enumerate(cells):
        ty = table_y + 18 + i * 16
        if ty > H - 20:
            break  # stop before running off the bottom of the canvas
        pe = c["pe"]
        s += _rect(540, ty - 10, 12, 12, PE_COLORS[pe])
        s += _text(558, ty,
                   f"t{c['idx']:>2d} → PE{pe} ({c['row']},{c['col']})"
                   f" off={_format_bytes(c['offset'])}",
                   size=9, anchor="start", fill="#334155")

    s += _info_box(80, H - 60, [
        f"Strategy: {name} | Tile: ({TILE_M}×{TILE_K})={_format_bytes(tile_bytes)}"
        f" | Tiles: {total_tiles} | Total: {_format_bytes(M * K * ITEMSIZE)}",
    ])
    s += _svg_footer()
    return s


# ── Main ────────────────────────────────────────────────────────────

def main() -> None:
    """Generate every placement diagram and write it under ``OUT_DIR``."""
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    diagrams = {
        "placement_column_wise.svg": gen_column_wise(),
        "placement_row_wise.svg": gen_row_wise(),
        "placement_replicate.svg": gen_replicate(),
        "placement_tiled_column_major.svg": gen_tiled(column_major=True),
        "placement_tiled_row_major.svg": gen_tiled(column_major=False),
    }

    for name, svg in diagrams.items():
        path = OUT_DIR / name
        path.write_text(svg, encoding="utf-8")
        print(f" wrote {path}")

    print(f"\nGenerated {len(diagrams)} placement diagrams.")


if __name__ == "__main__":
    main()
def build_parser() -> argparse.ArgumentParser:
    """Build the ``kernbench`` CLI parser with ``run`` and ``probe`` subcommands.

    Each subcommand stores its handler in ``_handler`` so ``main`` can
    dispatch without inspecting the subcommand name.
    """
    parser = argparse.ArgumentParser(prog="kernbench")
    commands = parser.add_subparsers(dest="cmd", required=True)

    run_cmd = commands.add_parser("run", help="Run a benchmark")
    run_cmd.add_argument("--topology", required=True)
    run_cmd.add_argument("--bench", required=True)
    # NOTE(review): the help text previously ended at 'sip:' with the id
    # placeholder missing; '<N>' is assumed — confirm against resolve_device.
    run_cmd.add_argument(
        "--device", default=None, help="Target device: 'all' or 'sip:<N>' (default: all)"
    )
    run_cmd.set_defaults(_handler=cmd_run)

    probe_cmd = commands.add_parser(
        "probe", help="Probe latency and BW for predefined traffic patterns"
    )
    probe_cmd.add_argument("--topology", required=True)
    probe_cmd.add_argument("--case", default="all", help="Case name or 'all' (default: all)")
    probe_cmd.set_defaults(_handler=cmd_probe)

    return parser


def engine_factory(topology: object, device: DeviceSelector) -> SimEngine:
    """Create a ``GraphEngine`` for *topology*.

    If *topology* exposes a ``topology_obj`` attribute that object is used,
    otherwise *topology* itself is passed through. *device* is accepted to
    satisfy the factory signature expected by ``run_bench`` but is not used.
    """
    topo_obj = getattr(topology, "topology_obj", topology)
    return GraphEngine(topo_obj)


def cmd_run(args) -> int:
    """Handle ``kernbench run``: resolve inputs, run the bench, print the report.

    Returns 0 when ``result.completion.ok`` is true, otherwise 1.
    """
    print("> Running benchmark with:", args)

    topo = resolve_topology(args.topology)
    bench = resolve_bench(args.bench)
    device = resolve_device(args.device)

    result = run_bench(topology=topo, bench_fn=bench, device=device, engine_factory=engine_factory)

    # The spec (if the topology carries one) lets the report show peak numbers.
    topo_obj = getattr(topo, "topology_obj", topo)
    spec = getattr(topo_obj, "spec", None)
    if result.traces:
        print(format_report(result.traces, title=args.bench, spec=spec))
    print(result.summary_text())

    return 0 if result.completion.ok else 1


def main(argv=None) -> int:
    """CLI entry point: parse *argv* and dispatch to the selected handler."""
    args = build_parser().parse_args(argv)
    return int(args._handler(args))


if __name__ == "__main__":
    sys.exit(main())
# -- Helpers ----------------------------------------------------------


def _hbm_pa(sip: int, cube: int, pe_id: int, spec: dict) -> int:
    """Return the encoded physical address of a probe target in a PE's HBM slice.

    The slice size is the cube's total HBM (GiB, from the spec's memory map)
    divided evenly across its slices; the target sits at offset 0x1000 within
    the slice, on rack 0.
    """
    memory_map = spec["cube"]["memory_map"]
    total_bytes = memory_map["hbm_total_gb_per_cube"] * (1 << 30)
    slice_bytes = total_bytes // memory_map["hbm_slices_per_cube"]
    return PhysAddr.pe_hbm_addr(
        rack_id=0, sip_id=sip, cube_id=cube, pe_id=pe_id,
        pe_local_hbm_offset=0x1000, slice_size_bytes=slice_bytes,
    ).encode()


def _build_edge_map(graph: TopologyGraph) -> dict[tuple[str, str], object]:
    """Index the graph's edges by ``(src, dst)``; a later duplicate pair wins."""
    by_endpoints: dict[tuple[str, str], object] = {}
    for edge in graph.edges:
        by_endpoints[(edge.src, edge.dst)] = edge
    return by_endpoints
def _formula_breakdown(
    path: list[str], nbytes: int, edge_map: dict, graph: TopologyGraph,
) -> tuple[float, float, float, float]:
    """Return (wire_ns, overhead_ns, drain_ns, formula_ns) for a path.

    wire_ns     — sum of ``distance_mm * ns_per_mm`` over the path's known edges
    overhead_ns — sum of each known node's ``overhead_ns`` attribute
    drain_ns    — ``nbytes`` divided by the smallest non-zero edge bandwidth
    formula_ns  — the three terms added together
    Hops or nodes missing from the maps are skipped.
    """
    ns_per_mm = graph.spec.get("system", {}).get("ns_per_mm", 0.01)
    hops = [edge_map.get(pair) for pair in zip(path, path[1:])]

    # Accumulate in path order so the float result matches hop-by-hop addition.
    wire_ns = 0.0
    for edge in hops:
        if edge:
            wire_ns += edge.distance_mm * ns_per_mm

    overhead_ns = 0.0
    for node_id in path:
        node = graph.nodes.get(node_id)
        if node:
            overhead_ns += float(node.attrs.get("overhead_ns", 0.0))

    bandwidths = [edge.bw_gbs for edge in hops if edge and edge.bw_gbs]
    drain_ns = nbytes / min(bandwidths) if bandwidths else 0.0
    return wire_ns, overhead_ns, drain_ns, wire_ns + overhead_ns + drain_ns


def _bottleneck_bw(path: list[str], edge_map: dict) -> float | None:
    """Per-request bottleneck: the smallest non-zero edge bandwidth on *path*.

    Returns ``None`` when no hop has a usable bandwidth.
    """
    bandwidths = [
        edge.bw_gbs
        for edge in (edge_map.get(pair) for pair in zip(path, path[1:]))
        if edge and edge.bw_gbs
    ]
    if not bandwidths:
        return None
    return min(bandwidths)


def _fmt_bw(bw: float | None) -> str:
    """Format a bandwidth with one decimal, or ``-`` when unknown."""
    if bw is None:
        return "-"
    return f"{bw:.1f}"


def _fmt_util(eff: float, bn: float | None) -> str:
    """Format ``eff / bn`` as a percentage, or ``-`` when *bn* is missing or ≤ 0."""
    if bn is not None and bn > 0:
        return f"{eff / bn * 100:.1f}%"
    return "-"


def _short_name(node_id: str) -> str:
    """Shorten node id: keep last 2 segments to avoid ambiguity (xbar.pe0 vs pe0)."""
    segments = node_id.split(".")
    if len(segments) < 2:
        return node_id
    return ".".join(segments[-2:])


def _short_path(path: list[str]) -> str:
    """Join the shortened node names of *path* with `` -> ``."""
    return " -> ".join(map(_short_name, path))
# -- Probe runner -----------------------------------------------------


def run_probe(topology_path: str, case_filter: str | None = None) -> int:
    """Run the predefined probe cases on a topology and print result tables.

    Two groups of cases are run, each on a fresh ``GraphEngine`` per case:
    H2D writes (``MemoryWriteMsg``) to PE0 of cubes 0/4/8/12, and PE DMA
    transfers (``PeDmaMsg``) from cube0.pe0 to several HBM slices. Every case
    moves 4096 bytes. For each case the measured ``total_ns`` from the engine
    trace is printed next to the formula breakdown of the routed path.

    Args:
        topology_path: Path to the topology file; ``~`` is expanded.
        case_filter: A single case name to run, or ``None``/``"all"`` for all.

    Returns:
        Always 0. NOTE(review): a failed monotonicity check and a
        ``case_filter`` that matches no case both still return 0 — confirm
        this report-only behaviour is intended.
    """
    path = Path(topology_path).expanduser().resolve()
    graph = load_topology(path)
    edge_map = _build_edge_map(graph)
    spec = graph.spec
    resolver = AddressResolver(graph)
    router = PathRouter(graph)

    nbytes = 4096
    show_all = case_filter is None or case_filter == "all"

    # === H2D Write ===
    # (name, destination cube, hop count shown in the table)
    h2d_cases = [
        ("h2d-1hop", 0, 1),
        ("h2d-2hop", 4, 2),
        ("h2d-3hop", 8, 3),
        ("h2d-4hop", 12, 4),
    ]
    # (name, hops, total_ns, eff_bw, bn_bw)
    h2d_results: list[tuple[str, int, float, float, float | None]] = []
    # (name, leg1, leg2, leg3)
    h2d_paths: list[tuple[str, list[str], list[str], list[str]]] = []

    print()
    print("=== H2D Write Latency (IO->HBM, varying hop count) ===")
    print(f" {'Case':<14} {'Target':<16} {'Hops':>4} {'Actual':>8}"
          f" {'Ovhd':>6} {'Drain':>6} {'Wire':>5} {'Ovhd%':>6} {'Drain%':>7}"
          f" {'Eff.BW':>8} {'BN.BW':>8} {'Util%':>6}")
    print(" " + "-" * 115)

    for name, cube, hops in h2d_cases:
        if not show_all and case_filter != name:
            continue
        # Fresh engine per case so cases do not share simulation state.
        engine = GraphEngine(graph)
        pa = _hbm_pa(sip=0, cube=cube, pe_id=0, spec=spec)
        msg = MemoryWriteMsg(
            correlation_id="probe", request_id=name,
            dst_sip=0, dst_cube=cube, dst_pe=0,
            dst_pa=pa, nbytes=nbytes, pattern="zero",
        )
        h = engine.submit(msg)
        engine.wait(h)
        _, trace = engine.get_completion(h)
        total_ns = trace["total_ns"]
        eff_bw = nbytes / total_ns if total_ns > 0 else 0.0

        pa_obj = PhysAddr.decode(pa)
        dst_node = resolver.resolve(pa_obj)

        # Rebuild the route in three legs: pcie_ep → io_cpu → m_cpu → destination.
        pcie_ep = resolver.find_pcie_ep(0)
        io_cpu = resolver.find_io_cpu(0)
        m_cpu = resolver.find_m_cpu(0, cube)
        leg1 = router.find_node_path(pcie_ep, io_cpu)
        leg2 = router.find_node_path(io_cpu, m_cpu)
        leg3 = router.find_mcpu_dma_path(m_cpu, dst_node)
        # Drop the first node of legs 2 and 3: it repeats the previous leg's last node.
        full_path = leg1 + leg2[1:] + leg3[1:]
        bn_bw = _bottleneck_bw(full_path, edge_map)

        # Forward path breakdown only (response path is implicit in actual_ns)
        # NOTE(review): fwd_path is built from the same expression as full_path,
        # and ``formula`` below is never read.
        fwd_path = leg1 + leg2[1:] + leg3[1:]
        wire, ovhd, drain, formula = _formula_breakdown(fwd_path, nbytes, edge_map, graph)

        ovhd_pct = ovhd / total_ns * 100 if total_ns > 0 else 0
        drain_pct = drain / total_ns * 100 if total_ns > 0 else 0

        h2d_results.append((name, hops, total_ns, eff_bw, bn_bw))
        h2d_paths.append((name, leg1, leg2, leg3))
        print(f" {name:<14} cube{cube}.pe0{'':<8} {hops:>4} {total_ns:>8.2f}"
              f" {ovhd:>6.1f} {drain:>6.1f} {wire:>5.2f} {ovhd_pct:>5.1f}% {drain_pct:>5.1f}%"
              f" {eff_bw:>8.2f} {_fmt_bw(bn_bw):>8} {_fmt_util(eff_bw, bn_bw):>6}")

    # Invariant: latency must strictly increase from one listed case to the next.
    if len(h2d_results) >= 2:
        lats = [r[2] for r in h2d_results]
        mono = all(lats[i] < lats[i + 1] for i in range(len(lats) - 1))
        sym = "[v]" if mono else "[x]"
        print(f" {sym} Monotonic increase: {'PASS' if mono else 'FAIL'}")

    if h2d_paths:
        print()
        print(" Route Details:")
        print(f" {'Case':<14} {'Leg':>4} Path")
        print(" " + "-" * 80)
        for name, leg1, leg2, leg3 in h2d_paths:
            print(f" {name:<14} {'L1':>4} {_short_path(leg1)}")
            print(f" {'':<14} {'L2':>4} {_short_path(leg2)}")
            print(f" {'':<14} {'L3':>4} {_short_path(leg3)}")

    # === PE DMA → HBM (direct PE-level injection) ===
    # (name, sip, src_cube, src_pe, dst_cube, dst_pe)
    pe_cases = [
        ("pe-local-hbm", 0, 0, 0, 0, 0),  # pe0 → slice0 (local, 256 GB/s)
        ("pe-same-half-hbm", 0, 0, 0, 0, 1),  # pe0 → slice1 (xbar chain, 128 GB/s)
        ("pe-cross-half-hbm", 0, 0, 0, 0, 4),  # pe0 → slice4 (xbar chain, 128 GB/s)
        ("pe-cross-cube-hbm", 0, 0, 0, 1, 0),  # cube0.pe0 → cube1.slice0 (NOC, 128 GB/s)
    ]
    # (name, total_ns, eff_bw, bn_bw)
    pe_results: list[tuple[str, float, float, float | None]] = []
    # (name, dma_path)
    pe_paths: list[tuple[str, list[str]]] = []

    print()
    print("=== PE DMA Latency (pe_dma -> xbar -> HBM, direct injection) ===")
    print(f" {'Case':<22} {'Target':<28} {'Actual':>8}"
          f" {'Ovhd':>6} {'Drain':>6} {'Wire':>5} {'Ovhd%':>6} {'Drain%':>7}"
          f" {'Eff.BW':>8} {'BN.BW':>8} {'Util%':>6}")
    print(" " + "-" * 120)

    for name, sip, src_cube, src_pe, dst_cube, dst_pe in pe_cases:
        if not show_all and case_filter != name:
            continue
        engine = GraphEngine(graph)
        # The destination slice is addressed through the PE id that owns it.
        dst_pa = _hbm_pa(sip=sip, cube=dst_cube, pe_id=dst_pe, spec=spec)
        msg = PeDmaMsg(
            correlation_id="probe", request_id=name,
            src_sip=sip, src_cube=src_cube, src_pe=src_pe,
            dst_pa=dst_pa, nbytes=nbytes,
        )
        h = engine.submit(msg)
        engine.wait(h)
        _, trace = engine.get_completion(h)
        total_ns = trace["total_ns"]
        eff_bw = nbytes / total_ns if total_ns > 0 else 0.0

        pe_ref = f"sip{sip}.cube{src_cube}.pe{src_pe}"
        pa_obj = PhysAddr.decode(dst_pa)
        dst_node = resolver.resolve(pa_obj)
        dma_path = router.find_path(pe_ref, dst_node)
        bn_bw = _bottleneck_bw(dma_path, edge_map)

        # NOTE(review): ``formula`` is unpacked but never read here either.
        wire, ovhd, drain, formula = _formula_breakdown(dma_path, nbytes, edge_map, graph)

        ovhd_pct = ovhd / total_ns * 100 if total_ns > 0 else 0
        drain_pct = drain / total_ns * 100 if total_ns > 0 else 0

        target_str = f"c{src_cube}.pe{src_pe}->c{dst_cube}.slice{dst_pe}"
        pe_results.append((name, total_ns, eff_bw, bn_bw))
        pe_paths.append((name, dma_path))
        print(f" {name:<22} {target_str:<28} {total_ns:>8.2f}"
              f" {ovhd:>6.1f} {drain:>6.1f} {wire:>5.2f} {ovhd_pct:>5.1f}% {drain_pct:>5.1f}%"
              f" {eff_bw:>8.2f} {_fmt_bw(bn_bw):>8} {_fmt_util(eff_bw, bn_bw):>6}")

    # Summarise bottleneck BW of the first "local" case vs the first other case.
    if len(pe_results) >= 2:
        local = [r for r in pe_results if "local" in r[0]]
        chain = [r for r in pe_results if "local" not in r[0]]
        if local and chain:
            print(f" * Local BN: {_fmt_bw(local[0][3])} GB/s, "
                  f"Chain/NOC BN: {_fmt_bw(chain[0][3])} GB/s")

    if pe_paths:
        print()
        print(" Route Details:")
        print(f" {'Case':<22} Path")
        print(" " + "-" * 80)
        for name, dma_path in pe_paths:
            print(f" {name:<22} {_short_path(dma_path)}")

    print()
    return 0


def cmd_probe(args) -> int:
    """CLI handler for ``kernbench probe``; ``case`` defaults to ``"all"`` if absent."""
    return run_probe(args.topology, getattr(args, "case", "all"))
def format_report(
    traces: list[dict],
    title: str = "Benchmark",
    spec: dict | None = None,
) -> str:
    """Render collected trace dicts as a plain-text performance report.

    Traces whose ``phase`` is ``"kernel"`` go to the kernel table; all others
    go to the command table. A per-PE summary is added when deploy traces
    cover more than one PE. *spec* is the topology spec used to look up peak
    TFLOPS and HBM bandwidth; without it the peaks are reported as 0.
    """
    peak_tflops_f16, peak_hbm_bw_gbs = _extract_peaks(spec)
    num_pes = _count_pes(spec)

    # Partition traces, keeping their original order within each group.
    deploy_entries: list[dict] = []
    kernel_entries: list[dict] = []
    for trace in traces:
        bucket = kernel_entries if trace.get("phase") == "kernel" else deploy_entries
        bucket.append(trace)

    # ── Title ── padded with dashes to the width of the command-table header.
    cmd_hdr = (f"{'Cmd':<10} {'Name':<12} {'SIP':>4} {'Cube':>5} {'PE':>4} {'Bytes':>10} "
               f"{'Lat(ns)':>10} {'Xfer(ns)':>10} {'Proc(ns)':>10} "
               f"{'BW(GB/s)':>10} {'MinBW':>10} {'Util%':>7}")
    banner = f"-- {title} Performance Report "
    out: list[str] = [banner + "-" * max(0, len(cmd_hdr) - len(banner))]

    # ── Command summary ──
    if deploy_entries:
        out += ["", cmd_hdr, "-" * len(cmd_hdr)]
        for entry in deploy_entries:
            lat = entry.get("total_ns", 0.0)
            nb = entry.get("nbytes", 0)
            sip = entry.get("sip", "-")
            pe = entry.get("pe", "-")
            cube = entry.get("cube", "-")
            cmd = entry.get("phase", "deploy")
            xfer_ns = entry.get("xfer_ns", 0.0)
            has_xfer = xfer_ns > 0
            proc_ns = lat - xfer_ns if has_xfer else 0.0
            bw = nb / lat if lat > 0 else 0.0
            min_bw = nb / xfer_ns if has_xfer else 0.0
            util = (xfer_ns / lat * 100) if lat > 0 and has_xfer else 0.0
            out.append(
                f"{cmd:<10} {entry.get('name', '?'):<12} {str(sip):>4} {str(cube):>5} {str(pe):>4} {nb:>10} "
                f"{lat:>10.1f} {xfer_ns:>10.1f} {proc_ns:>10.1f} "
                f"{bw:>10.1f} {min_bw:>10.1f} {util:>6.1f}%"
            )

    # ── Kernel summary ──
    if kernel_entries:
        k_hdr = (f"{'Phase':<10} {'Name':<12} {'PE':>4} {'E2E(ns)':>10} "
                 f"{'PE(ns)':>10} {'DMA(ns)':>10} {'Comp(ns)':>10} "
                 f"{'Bound':<8} {'TFLOPS':>8} {'Peak':>8} {'Util%':>7}")
        out += ["", k_hdr, "-" * len(k_hdr)]
        for entry in kernel_entries:
            e2e_ns = entry.get("total_ns", 0.0)
            pe_ns = entry.get("pe_exec_ns", e2e_ns)
            dma_ns = entry.get("dma_ns", 0.0)
            compute_ns = entry.get("compute_ns", 0.0)
            target_pe = entry.get("target_pe", "-")
            scalars = entry.get("scalars", [])
            targets_all = target_pe == "all"
            pe_str = "all" if targets_all else str(target_pe)
            n_active = num_pes if targets_all else 1

            # Bound indicator based on measured DMA vs compute time
            if not (dma_ns > 0 or compute_ns > 0):
                bound = "-"
            elif dma_ns >= compute_ns:
                bound = "memory"
            else:
                bound = "compute"

            achieved = _calc_tflops(scalars, pe_ns)
            peak_total = peak_tflops_f16 * n_active
            util = (achieved / peak_total * 100) if peak_total > 0 else 0.0
            out.append(
                f"{'kernel':<10} {entry.get('name', '?'):<12} {pe_str:>4} {e2e_ns:>10.1f} "
                f"{pe_ns:>10.1f} {dma_ns:>10.1f} {compute_ns:>10.1f} "
                f"{bound:<8} {achieved:>8.3f} {peak_total:>8.1f} {util:>6.1f}%"
            )

    # ── Per-PE summary ── only when deploys touched more than one PE.
    pe_deploy = _per_pe_deploy(deploy_entries)
    n_active = len(pe_deploy)
    if n_active > 1:
        pe_title = (f"-- Per-PE Summary (peak: {peak_tflops_f16:.1f} TFLOPS/PE, "
                    f"{peak_hbm_bw_gbs:.0f} GB/s HBM BW) ")
        pe_hdr = (f"{'PE':>4} {'Deploy(ns)':>10} {'BW(GB/s)':>10} {'BW Util':>8} "
                  f"{'Kernel(ns)':>10} {'TFLOPS':>8} {'Util':>7}")
        pe_width = max(len(pe_title), len(pe_hdr))
        out += [
            "",
            pe_title + "-" * max(0, pe_width - len(pe_title)),
            pe_hdr,
            "-" * pe_width,
        ]

        # Kernel time is summed over all kernel traces; the FLOP count uses the
        # scalars of the first kernel trace only, then is split evenly per PE.
        k_ns = sum(e.get("pe_exec_ns", e.get("total_ns", 0.0)) for e in kernel_entries)
        k_scalars = kernel_entries[0].get("scalars", []) if kernel_entries else []
        per_pe_tflops = _calc_tflops(k_scalars, k_ns) / n_active
        pe_util = (per_pe_tflops / peak_tflops_f16 * 100) if peak_tflops_f16 > 0 else 0.0

        for pe_id in sorted(pe_deploy):
            d_ns, d_bytes = pe_deploy[pe_id]
            d_bw = d_bytes / d_ns if d_ns > 0 else 0.0
            d_util = (d_bw / peak_hbm_bw_gbs * 100) if peak_hbm_bw_gbs > 0 else 0.0
            out.append(
                f"{pe_id:>4} {d_ns:>10.1f} {d_bw:>10.1f} {d_util:>7.1f}% "
                f"{k_ns:>10.1f} {per_pe_tflops:>8.3f} {pe_util:>6.1f}%"
            )

    # Trailing empty entry gives the joined report a final newline.
    out.append("")
    return "\n".join(out)


def _extract_peaks(spec: dict | None) -> tuple[float, float]:
    """Extract peak TFLOPS (f16) and HBM BW (GB/s) from spec.

    Reads ``cube.pe_template.components.pe_gemm.attrs.peak_tflops_f16`` and
    ``cube.links.xbar_to_hbm_bw_gbs``; any missing key yields 0.0.
    """
    if spec is None:
        return 0.0, 0.0
    cube = spec.get("cube", {})
    gemm_attrs = (
        cube.get("pe_template", {})
        .get("components", {})
        .get("pe_gemm", {})
        .get("attrs", {})
    )
    peak_tflops = float(gemm_attrs.get("peak_tflops_f16", 0.0))
    hbm_bw = float(cube.get("links", {}).get("xbar_to_hbm_bw_gbs", 0.0))
    return peak_tflops, hbm_bw


def _count_pes(spec: dict | None) -> int:
    """Return PEs per cube: ``pe_per_corner`` × number of corners (8 without a spec)."""
    if spec is None:
        return 8
    layout = spec.get("cube", {}).get("pe_layout", {})
    corners = layout.get("corners", ["NW", "NE", "SW", "SE"])
    return layout.get("pe_per_corner", 2) * len(corners)


def _calc_tflops(scalars: list, latency_ns: float) -> float:
    """Calculate achieved TFLOPS from scalar args [M, K, N] and latency.

    Uses 2·M·K·N FLOPs. Returns 0.0 when fewer than three scalars are given
    or the latency is not positive.
    """
    if len(scalars) < 3 or latency_ns <= 0:
        return 0.0
    m, k, n = scalars[:3]
    flops = 2.0 * m * k * n
    return flops / (latency_ns * 1e-9) / 1e12


def _per_pe_deploy(deploy_entries: list[dict]) -> dict[int, tuple[float, int]]:
    """Aggregate deploy latency and bytes per PE.

    Entries without a ``pe`` key are counted under PE 0.
    """
    totals: dict[int, tuple[float, int]] = {}
    for entry in deploy_entries:
        pe = entry.get("pe", 0)
        prev_ns, prev_bytes = totals.get(pe, (0, 0))
        totals[pe] = (
            prev_ns + entry.get("total_ns", 0.0),
            prev_bytes + entry.get("nbytes", 0),
        )
    return totals
# ── Handles ───────────────────────────────────────────────────────


@dataclass(frozen=True)
class TensorHandle:
    """Opaque reference to a tensor residing in PE_TCM.

    Returned by tl.load, tl.dot, tl.exp, etc.
    Carries metadata for command generation; data field is reserved
    for future validate mode (numpy array).
    """

    id: str
    pa: int  # physical address in HBM/TCM
    shape: tuple[int, ...]
    dtype: str
    nbytes: int  # total byte size
    # NOTE(review): this frozen dataclass derives __eq__/__hash__ from every
    # field, including ``data`` — confirm that is acceptable once validate
    # mode stores arrays here.
    data: object = None  # reserved for validate mode


@dataclass(frozen=True)
class CompletionHandle:
    """Opaque handle for a non-blocking composite command.

    Returned by tl.composite, consumed by tl.wait.
    """

    id: str


# ── PE Commands ───────────────────────────────────────────────────


@dataclass(frozen=True)
class DmaReadCmd:
    """DMA READ: HBM → PE_TCM."""

    handle: TensorHandle  # tensor the read is associated with
    src_pa: int  # source physical address
    nbytes: int  # number of bytes to transfer


@dataclass(frozen=True)
class DmaWriteCmd:
    """DMA WRITE: PE_TCM → HBM."""

    handle: TensorHandle  # tensor the write is associated with
    dst_pa: int  # destination physical address
    nbytes: int  # number of bytes to transfer


@dataclass(frozen=True)
class GemmCmd:
    """GEMM engine command: matrix multiply on TCM data.

    out = a @ b, all operands in TCM.
    """

    a: TensorHandle
    b: TensorHandle
    out: TensorHandle
    # GEMM dimensions — presumably a is (m, k) and b is (k, n); verify against
    # the code that builds this command.
    m: int
    k: int
    n: int


@dataclass(frozen=True)
class MathCmd:
    """MATH engine command: unary/binary/reduction on TCM data.

    op: "exp", "log", "sqrt", "abs", "sigmoid", "cos", "sin",
        "add", "sub", "mul", "div", "where",
        "sum", "max", "min"
    """

    op: str  # one of the operation names listed in the class docstring
    inputs: tuple[TensorHandle, ...]
    out: TensorHandle
    axis: int | None = None  # for reductions


@dataclass(frozen=True)
class CompositeCmd:
    """Composite command: tiled pipeline of DMA_READ + COMPUTE + DMA_WRITE.

    Non-blocking — submitted to PE_SCHEDULER which manages tile splitting
    and pipeline overlaps (ADR-0014 D3.2).
    """

    completion: CompletionHandle  # handle a later WaitCmd can refer to
    op: Literal["gemm", "math"]
    a: TensorHandle
    b: TensorHandle | None  # second operand; may be absent
    out_pa: int  # physical address the result is written to
    out_nbytes: int  # size of the result in bytes
    math_op: str | None = None  # for op="math": which math operation


@dataclass(frozen=True)
class WaitCmd:
    """Wait for a specific composite or all pending composites."""

    handle: CompletionHandle | None = None  # None = wait all


@dataclass(frozen=True)
class PeCpuOverheadCmd:
    """PE_CPU scalar execution overhead (cycles)."""

    cycles: int


# Union type for all PE commands
PeCommand = (
    DmaReadCmd | DmaWriteCmd | GemmCmd | MathCmd
    | CompositeCmd | WaitCmd | PeCpuOverheadCmd
)


@dataclass
class PeInternalTxn:
    """PE-internal message flowing PE_CPU → PE_SCHEDULER → engines.

    Carries a single PeCommand and a completion event. PE_CPU creates one
    PeInternalTxn per command during the replay phase and sends it to
    PE_SCHEDULER, which routes it to the appropriate engine (PE_DMA,
    PE_GEMM, PE_MATH). The engine signals ``done`` on completion.

    Unlike the command dataclasses this one is mutable (not frozen).
    """

    command: PeCommand
    done: simpy.Event  # succeeded when the engine completes this command
    pe_prefix: str = ""  # e.g. "sip0.cube0.pe0" — needed by PE_DMA for path resolution
    # Fresh dict per instance (default_factory), so transactions never share it.
    result_data: dict[str, Any] = field(default_factory=dict)
"sip0.cube0.pe0" — needed by PE_DMA for path resolution + result_data: dict[str, Any] = field(default_factory=dict) diff --git a/src/kernbench/common/types.py b/src/kernbench/common/types.py new file mode 100644 index 0000000..c6ff2bb --- /dev/null +++ b/src/kernbench/common/types.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any, NewType, Protocol, TypeAlias + +RequestHandle = NewType("RequestHandle", str) + +Trace: TypeAlias = Any + + +@dataclass(frozen=True) +class Completion: + ok: bool + error_code: str | None = None + error_message: str | None = None + + +class SimEngine(Protocol): + """ + Backend simulation/runner engine contract. + + Engine must be able to: + - accept requests created by RuntimeContext (submit/dispatch) + - report completion and optional trace for a given handle + """ + + def get_completion(self, handle: RequestHandle) -> tuple[Completion, Trace | None]: ... + def submit(self, request: Any) -> RequestHandle: ... + def wait(self, handle: RequestHandle) -> None: ... 
diff --git a/src/kernbench/components/__init__.py b/src/kernbench/components/__init__.py new file mode 100644 index 0000000..b4f2653 --- /dev/null +++ b/src/kernbench/components/__init__.py @@ -0,0 +1,4 @@ +from kernbench.components.base import ComponentBase, ComponentRegistry +from kernbench.components.context import ComponentContext + +__all__ = ["ComponentBase", "ComponentRegistry", "ComponentContext"] diff --git a/src/kernbench/components/base.py b/src/kernbench/components/base.py new file mode 100644 index 0000000..5d633d8 --- /dev/null +++ b/src/kernbench/components/base.py @@ -0,0 +1,167 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from collections.abc import Generator +from typing import TYPE_CHECKING, Any + +import simpy + +if TYPE_CHECKING: + from kernbench.components.context import ComponentContext + from kernbench.topology.types import Node + + +class ComponentBase(ABC): + """Base class for all SimPy component implementations (ADR-0007 D3, ADR-0015). + + Each component corresponds to one node in the compiled topology graph. + It models the processing overhead at that node as a SimPy generator, + allowing future implementations to add queueing and contention. + + Port model (ADR-0015 D1): + in_ports[src_node_id] — SimPy Store for incoming messages from src + out_ports[dst_node_id] — SimPy Store for outgoing messages to dst + Ports are wired by GraphEngine at initialization; wire processes model + propagation delay between connected ports (ADR-0015 D2). + + Context (ADR-0015 D4): + ctx — ComponentContext with router and resolver. + """ + + def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None: + self.node = node + self.ctx = ctx + self.in_ports: dict[str, simpy.Store] = {} + self.out_ports: dict[str, simpy.Store] = {} + + def start(self, env: simpy.Environment) -> None: + """Called once after all ports are wired. + + Default: starts a fan-in collector and a generic forwarding worker. 
+ The worker calls self.run() for per-component latency, then routes the + Transaction to the next hop or signals done (duck-typed; no direct + Transaction import to avoid circular dependencies). + + Override in components that need custom fan-out / aggregation logic + (e.g. MCpuComponent, IoCpuComponent for kernel launch). + """ + if not self.in_ports: + return + self._inbox: simpy.Store = simpy.Store(env) + for port in self.in_ports.values(): + env.process(self._fan_in(port)) + env.process(self._worker(env)) + + def _fan_in(self, port: simpy.Store) -> Generator: + """Relay messages from one in_port into the shared inbox.""" + while True: + msg = yield port.get() + yield self._inbox.put(msg) + + def _worker(self, env: simpy.Environment) -> Generator: + """Generic forwarding worker: spawns _forward_txn per message (pipeline).""" + while True: + txn: Any = yield self._inbox.get() + env.process(self._forward_txn(env, txn)) + + def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator: + """Apply run() latency, then forward to next hop or drain at terminal.""" + yield from self.run(env, txn.nbytes) + next_hop = txn.next_hop # duck-typed: Transaction.next_hop + if next_hop: + yield self.out_ports[next_hop].put(txn.advance()) + else: + drain = getattr(txn, "drain_ns", 0.0) + if drain > 0: + yield env.timeout(drain) + txn.done.succeed() + + @abstractmethod + def run(self, env: simpy.Environment, nbytes: int) -> Generator: + """SimPy process: yield one or more events for this node's processing. + + Subclasses yield env.timeout(overhead_ns) or compute latency dynamically. + Called by _forward_txn and subclass-specific handlers. + """ + ... + + +class PeEngineBase(ComponentBase): + """Base class for PE-internal engines (PE_DMA, PE_GEMM, PE_MATH). + + Provides: + - ``_pe_prefix``: extracted from node.id (e.g. "sip0.cube0.pe0") + - Dual-message ``_worker``: dispatches PeInternalTxn to + ``handle_command()`` and Transaction to inherited ``_forward_txn()``. 
+ - ``init_resources(env)``: hook for subclass resource initialization, + called by ``start()`` before the worker is spawned. + + Subclass contract: + 1. Override ``handle_command(env, pe_txn)`` — process a PeInternalTxn. + 2. Override ``run(env, nbytes)`` — yield component latency. + 3. Optionally override ``init_resources(env)`` for DMA channels, etc. + """ + + def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None: + super().__init__(node, ctx) + self._pe_prefix: str = node.id.rsplit(".", 1)[0] + + def start(self, env: simpy.Environment) -> None: + self.init_resources(env) + super().start(env) + + def init_resources(self, env: simpy.Environment) -> None: + """Hook for subclass resource initialization. Called before worker spawn.""" + + def _worker(self, env: simpy.Environment) -> Generator: + """Dual-message dispatch: PeInternalTxn → handle_command, Transaction → _forward_txn.""" + from kernbench.common.pe_commands import PeInternalTxn + + while True: + msg: Any = yield self._inbox.get() + if isinstance(msg, PeInternalTxn): + env.process(self.handle_command(env, msg)) + else: + env.process(self._forward_txn(env, msg)) + + @abstractmethod + def handle_command(self, env: simpy.Environment, pe_txn: Any) -> Generator: + """Process a PE-internal command (PeInternalTxn). + + Subclass must: + - Perform engine-specific work (acquire resources, compute, etc.) + - Call ``pe_txn.done.succeed()`` on completion. + """ + ... + + +class ComponentRegistry: + """DI registry: maps node.impl strings to ComponentBase subclasses. + + Resolution order for ComponentRegistry.create(node, overrides, ctx): + 1. overrides[node.impl] — caller-injected override + 2. _registry[node.impl] — globally registered impl + 3. 
Error — no fallback; every node must have an impl + """ + + _registry: dict[str, type[ComponentBase]] = {} + + @classmethod + def register(cls, impl: str, component_cls: type[ComponentBase]) -> None: + cls._registry[impl] = component_cls + + @classmethod + def create( + cls, + node: Node, + overrides: dict[str, type[ComponentBase]] | None = None, + ctx: ComponentContext | None = None, + ) -> ComponentBase: + if overrides and node.impl in overrides: + return overrides[node.impl](node, ctx) + if node.impl in cls._registry: + return cls._registry[node.impl](node, ctx) + raise ValueError( + f"No component registered for impl '{node.impl}' (node: {node.id}). " + f"Register it in kernbench.components.impls.__init__." + ) diff --git a/src/kernbench/components/context.py b/src/kernbench/components/context.py new file mode 100644 index 0000000..98a6f93 --- /dev/null +++ b/src/kernbench/components/context.py @@ -0,0 +1,52 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any + +import simpy + +from kernbench.policy.routing.router import AddressResolver, PathRouter + + +@dataclass +class ComponentContext: + """Topology services injected into every component implementation. + + Required by components that need routing or address resolution + (IoCpuComponent, MCpuComponent, …). TransitComponent ignores ctx. + + Passed via ComponentRegistry.create(node, overrides, ctx=ctx). + """ + + router: PathRouter + resolver: AddressResolver + positions: dict[str, tuple[float, float] | None] # node_id → pos_mm + ns_per_mm: float # wire propagation constant (from topology spec) + edge_map: dict[tuple[str, str], Any] = field(default_factory=dict) + spec: dict = field(default_factory=dict) # topology spec (cube layout, PE count, etc.) + + def get_shared_resource( + self, env: simpy.Environment, key: str, capacity: int = 1, + ) -> simpy.Resource: + """Return a shared SimPy Resource, creating it on first access. 
+ + Used by PE components that share a resource across engines within + the same PE (e.g. accel_slot shared by PE_GEMM and PE_MATH). + Key should be scoped per PE: e.g. "sip0.cube0.pe0.accel_slot". + """ + if not hasattr(self, "_shared_resources"): + self._shared_resources: dict[str, simpy.Resource] = {} + if key not in self._shared_resources: + self._shared_resources[key] = simpy.Resource(env, capacity=capacity) + return self._shared_resources[key] + + def compute_drain_ns(self, path: list[str], nbytes: int) -> float: + """Wormhole drain time: nbytes / bottleneck_bw along path.""" + min_bw = float("inf") + for i in range(len(path) - 1): + edge = self.edge_map.get((path[i], path[i + 1])) + if edge and getattr(edge, "bw_gbs", None): + min_bw = min(min_bw, edge.bw_gbs) + if min_bw == float("inf"): + return 0.0 + return nbytes / min_bw diff --git a/src/kernbench/components/impls/__init__.py b/src/kernbench/components/impls/__init__.py new file mode 100644 index 0000000..f4edf9c --- /dev/null +++ b/src/kernbench/components/impls/__init__.py @@ -0,0 +1,54 @@ +"""Concrete component implementations. + +Each module registers its component(s) with ComponentRegistry on import. +Import this package to activate all built-in implementations. 
+""" + +from kernbench.components.base import ComponentRegistry +from kernbench.components.impls.forwarding import TransitComponent +from kernbench.components.impls.hbm_ctrl import HbmCtrlComponent +from kernbench.components.impls.io_cpu import IoCpuComponent +from kernbench.components.impls.m_cpu import MCpuComponent +from kernbench.components.impls.noc import TwoDMeshNocComponent +from kernbench.components.impls.pcie_ep import PcieEpComponent +from kernbench.components.impls.pe_cpu import PeCpuComponent +from kernbench.components.impls.pe_dma import PeDmaComponent +from kernbench.components.impls.pe_gemm import PeGemmComponent +from kernbench.components.impls.pe_math import PeMathComponent +from kernbench.components.impls.pe_scheduler import PeSchedulerComponent +from kernbench.components.impls.pe_tcm import PeTcmComponent +from kernbench.components.impls.sram import SramComponent + +ComponentRegistry.register("forwarding_v1", TransitComponent) +ComponentRegistry.register("switch_v1", TransitComponent) +ComponentRegistry.register("noc_v1", TransitComponent) +ComponentRegistry.register("noc_2d_mesh_v1", TwoDMeshNocComponent) +ComponentRegistry.register("ucie_v1", TransitComponent) +ComponentRegistry.register("xbar_v1", TransitComponent) +ComponentRegistry.register("pcie_ep_v1", PcieEpComponent) +ComponentRegistry.register("io_cpu_v1", IoCpuComponent) +ComponentRegistry.register("m_cpu_v1", MCpuComponent) +ComponentRegistry.register("hbm_ctrl_v1", HbmCtrlComponent) +ComponentRegistry.register("sram_v1", SramComponent) +ComponentRegistry.register("pe_cpu_v1", PeCpuComponent) +ComponentRegistry.register("pe_scheduler_v1", PeSchedulerComponent) +ComponentRegistry.register("pe_dma_v1", PeDmaComponent) +ComponentRegistry.register("pe_gemm_v1", PeGemmComponent) +ComponentRegistry.register("pe_math_v1", PeMathComponent) +ComponentRegistry.register("pe_tcm_v1", PeTcmComponent) + +__all__ = [ + "HbmCtrlComponent", + "IoCpuComponent", + "MCpuComponent", + "PcieEpComponent", 
+ "PeCpuComponent", + "PeDmaComponent", + "PeGemmComponent", + "PeMathComponent", + "PeSchedulerComponent", + "PeTcmComponent", + "TransitComponent", + "TwoDMeshNocComponent", + "SramComponent", +] diff --git a/src/kernbench/components/impls/forwarding.py b/src/kernbench/components/impls/forwarding.py new file mode 100644 index 0000000..1fa8eee --- /dev/null +++ b/src/kernbench/components/impls/forwarding.py @@ -0,0 +1,27 @@ +from __future__ import annotations + +from collections.abc import Generator +from typing import TYPE_CHECKING + +import simpy + +from kernbench.components.base import ComponentBase + +if TYPE_CHECKING: + from kernbench.components.context import ComponentContext + from kernbench.topology.types import Node + + +class TransitComponent(ComponentBase): + """Transit component for NOC, UCIe, XBAR nodes. + + Applies overhead_ns processing delay (from node.attrs) then forwards the + Transaction to the next hop via inherited _forward_txn(). + """ + + def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None: + super().__init__(node, ctx) + + def run(self, env: simpy.Environment, nbytes: int) -> Generator: + overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0)) + yield env.timeout(overhead_ns) diff --git a/src/kernbench/components/impls/hbm_ctrl.py b/src/kernbench/components/impls/hbm_ctrl.py new file mode 100644 index 0000000..3fa21cb --- /dev/null +++ b/src/kernbench/components/impls/hbm_ctrl.py @@ -0,0 +1,101 @@ +from __future__ import annotations + +from collections.abc import Generator +from typing import TYPE_CHECKING, Any + +import simpy + +from kernbench.components.base import ComponentBase +from kernbench.sim_engine.transaction import Transaction + +if TYPE_CHECKING: + from kernbench.components.context import ComponentContext + from kernbench.topology.types import Node + + +class HbmCtrlComponent(ComponentBase): + """HBM controller: terminal component that models HBM access latency. 
+ + Dual-channel model: separate read and write resources (each capacity=1) + allowing concurrent read/write like PE_DMA. Multiple reads or multiple + writes still serialize within their respective channel. + + On completion, creates a ResponseMsg and sends it back on the reverse path + so that response latency is modeled through the fabric. + """ + + def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None: + super().__init__(node, ctx) + self._read: simpy.Resource | None = None + self._write: simpy.Resource | None = None + + def start(self, env: simpy.Environment) -> None: + capacity = int(self.node.attrs.get("capacity", 1)) + self._read = simpy.Resource(env, capacity=capacity) + self._write = simpy.Resource(env, capacity=capacity) + super().start(env) + + def run(self, env: simpy.Environment, nbytes: int) -> Generator: + overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0)) + yield env.timeout(overhead_ns) + + def _select_channel(self, txn: Any) -> simpy.Resource: + """Select channel based on request type: write requests → write, else → read.""" + from kernbench.runtime_api.kernel import MemoryWriteMsg, PeDmaMsg + + assert self._read is not None and self._write is not None + req = txn.request + if isinstance(req, MemoryWriteMsg): + return self._write + if isinstance(req, PeDmaMsg) and req.is_write: + return self._write + return self._read + + def _worker(self, env: simpy.Environment) -> Generator: + """Dispatch each incoming txn to a concurrent process for channel-level parallelism.""" + while True: + txn: Any = yield self._inbox.get() + env.process(self._handle_txn(env, txn)) + + def _handle_txn(self, env: simpy.Environment, txn: Any) -> Generator: + """Acquire channel, run, apply drain, send response.""" + channel = self._select_channel(txn) + with channel.request() as req: + yield req + yield from self.run(env, txn.nbytes) + drain = getattr(txn, "drain_ns", 0.0) + if drain > 0: + yield env.timeout(drain) + yield from 
self._send_response(env, txn) + + def _send_response(self, env: simpy.Environment, txn: Any) -> Generator: + """Create ResponseMsg and send on reverse path back to originator. + + PeDmaMsg is a direct probe with no IO_CPU/M_CPU aggregation in the path, + so we succeed txn.done directly instead of sending a response Transaction. + """ + from kernbench.runtime_api.kernel import PeDmaMsg + + if isinstance(txn.request, PeDmaMsg): + txn.done.succeed() + return + + reverse_path = list(reversed(txn.path)) + if len(reverse_path) >= 2 and self.ctx: + from kernbench.runtime_api.kernel import ResponseMsg + + parts = self.node.id.split(".") + cube_id = int(parts[1].replace("cube", "")) + pe_id = int(parts[3].replace("slice", "")) + resp_msg = ResponseMsg( + correlation_id=txn.request.correlation_id, + request_id=txn.request.request_id, + src_cube=cube_id, src_pe=pe_id, success=True, + ) + resp_txn = Transaction( + request=resp_msg, path=reverse_path, step=0, + nbytes=0, done=env.event(), is_response=True, + ) + yield self.out_ports[reverse_path[1]].put(resp_txn.advance()) + else: + txn.done.succeed() diff --git a/src/kernbench/components/impls/io_cpu.py b/src/kernbench/components/impls/io_cpu.py new file mode 100644 index 0000000..ad123a6 --- /dev/null +++ b/src/kernbench/components/impls/io_cpu.py @@ -0,0 +1,145 @@ +from __future__ import annotations + +from collections.abc import Generator +from typing import TYPE_CHECKING, Any + +import simpy + +from kernbench.components.base import ComponentBase +from kernbench.sim_engine.transaction import Transaction + +if TYPE_CHECKING: + from kernbench.components.context import ComponentContext + from kernbench.topology.types import Node + + +class IoCpuComponent(ComponentBase): + """IO_CPU component: multi-cube fan-out with response aggregation. + + Forward path: + 1. Applies overhead_ns processing overhead. + 2. Resolves target cube(s) from request.target_cubes. + 3. Fans out sub-Transactions to each target cube's M_CPU. 
+ + Response path: + Collects ResponseMsg from each M_CPU. When all cube responses are + received, succeeds the parent txn.done. + """ + + def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None: + super().__init__(node, ctx) + # Pending fan-out tracking: request_id → (expected, received, parent_txn_done) + self._pending: dict[str, tuple[int, int, simpy.Event]] = {} + + def run(self, env: simpy.Environment, nbytes: int) -> Generator: + overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0)) + yield env.timeout(overhead_ns) + + def _worker(self, env: simpy.Environment) -> Generator: + while True: + txn: Any = yield self._inbox.get() + if getattr(txn, "is_response", False): + self._collect_response(txn) + else: + yield from self.run(env, txn.nbytes) + env.process(self._dispatch_to_m_cpus(env, txn)) + + def _collect_response(self, resp_txn: Any) -> None: + """Receive a cube response and increment the aggregation counter.""" + key = resp_txn.request.request_id + if key not in self._pending: + return + expected, received, parent_done = self._pending[key] + received += 1 + if received >= expected: + parent_done.succeed() + del self._pending[key] + else: + self._pending[key] = (expected, received, parent_done) + + def _dispatch_to_m_cpus(self, env: simpy.Environment, txn: Any) -> Generator: + """Fan out sub-Transactions to target cube M_CPUs, wait for responses.""" + from kernbench.runtime_api.kernel import KernelLaunchMsg, MemoryReadMsg, MemoryWriteMsg + + request = txn.request + try: + cube_targets = self._resolve_cube_targets(request) + except Exception: + txn.done.succeed() + return + + if not cube_targets: + txn.done.succeed() + return + + # Setup aggregation + self._pending[request.request_id] = (len(cube_targets), 0, txn.done) + + # Fan out to each target cube's M_CPU + for sip, cube in cube_targets: + try: + m_cpu_id = self.ctx.resolver.find_m_cpu(sip, cube) + path = self.ctx.router.find_node_path(self.node.id, m_cpu_id) + except 
Exception: + continue + if len(path) < 2: + continue + sub_txn = Transaction( + request=request, path=path, step=0, + nbytes=txn.nbytes, done=env.event(), + result_data=txn.result_data, + ) + yield self.out_ports[path[1]].put(sub_txn.advance()) + + def _resolve_cube_targets(self, request: Any) -> list[tuple[int, int]]: + """Return list of (sip, cube) pairs to fan out to.""" + from kernbench.runtime_api.kernel import KernelLaunchMsg, MemoryReadMsg, MemoryWriteMsg + + target_cubes = getattr(request, "target_cubes", "all") + + if isinstance(request, MemoryWriteMsg): + sip = request.dst_sip + if target_cubes == "all": + cube = self._cube_from_pa(request.dst_pa, fallback=request.dst_cube) + return [(sip, cube)] + return [(sip, c) for c in target_cubes] + + if isinstance(request, MemoryReadMsg): + sip = request.src_sip + if target_cubes == "all": + cube = self._cube_from_pa(request.src_pa, fallback=request.src_cube) + return [(sip, cube)] + return [(sip, c) for c in target_cubes] + + if isinstance(request, KernelLaunchMsg): + my_sip = self._my_sip() + if target_cubes != "all": + return [(my_sip, c) for c in target_cubes] + # "all": derive from tensor shards, filtered to this SIP + seen: set[tuple[int, int]] = set() + targets: list[tuple[int, int]] = [] + for arg in request.args: + if arg.arg_kind != "tensor": + continue + for shard in arg.shards: + if shard.sip != my_sip: + continue + key = (shard.sip, shard.cube) + if key not in seen: + seen.add(key) + targets.append(key) + return targets + + return [] + + def _cube_from_pa(self, pa_val: int, fallback: int) -> int: + """Extract cube_id from a physical address, with fallback.""" + from kernbench.policy.address.phyaddr import PhysAddr + try: + return PhysAddr.decode(pa_val).cube_id + except Exception: + return fallback + + def _my_sip(self) -> int: + """Extract this IO_CPU's SIP ID from its node ID (e.g. 
'sip0.io0.io_cpu' → 0).""" + return int(self.node.id.split(".")[0].replace("sip", "")) diff --git a/src/kernbench/components/impls/m_cpu.py b/src/kernbench/components/impls/m_cpu.py new file mode 100644 index 0000000..8bf955e --- /dev/null +++ b/src/kernbench/components/impls/m_cpu.py @@ -0,0 +1,269 @@ +from __future__ import annotations + +from collections.abc import Generator +from typing import TYPE_CHECKING, Any + +import simpy + +from kernbench.components.base import ComponentBase +from kernbench.sim_engine.transaction import Transaction + +if TYPE_CHECKING: + from kernbench.components.context import ComponentContext + from kernbench.topology.types import Node + + +class MCpuComponent(ComponentBase): + """M_CPU component: multi-PE DMA fan-out with response aggregation. + + Forward path (ADR-0015 D5): + When a forward Transaction arrives at m_cpu (terminal hop), M_CPU fans out + DMA sub-Transactions to target PEs' HBM slices. target_pe on the request + controls fan-out: int → single PE, "all" → all PEs in the cube. + + Response path: + ResponseMsg from each hbm_ctrl arrives back at m_cpu. Once all PE responses + are collected, m_cpu sends an aggregate ResponseMsg on the reverse command + path back to io_cpu. + + Transit: + When m_cpu is NOT the terminal hop (transit or response relay), the + Transaction is forwarded normally to the next hop. 
+ """ + + def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None: + super().__init__(node, ctx) + # Pending fan-out tracking: request_id → (expected, received, all_done_event) + self._pending: dict[str, tuple[int, int, simpy.Event]] = {} + # Store parent txn for response sending: request_id → parent_txn + self._parent_txns: dict[str, Any] = {} + # DMA engine resources (ADR-0015 D5, ADR-0014 D4): capacity=1 each + self._dma_write: simpy.Resource | None = None + self._dma_read: simpy.Resource | None = None + + def start(self, env: simpy.Environment) -> None: + self._dma_write = simpy.Resource(env, capacity=1) + self._dma_read = simpy.Resource(env, capacity=1) + super().start(env) + + def run(self, env: simpy.Environment, nbytes: int) -> Generator: + overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0)) + yield env.timeout(overhead_ns) + + def _worker(self, env: simpy.Environment) -> Generator: + """Dispatch forward txns, collect response txns.""" + from kernbench.runtime_api.kernel import KernelLaunchMsg + + while True: + txn: Any = yield self._inbox.get() + if getattr(txn, "is_response", False): + self._collect_response(txn) + else: + yield from self.run(env, txn.nbytes) + next_hop = txn.next_hop + if next_hop: + yield self.out_ports[next_hop].put(txn.advance()) + elif self.ctx is not None and txn.request is not None: + if isinstance(txn.request, KernelLaunchMsg): + env.process(self._kernel_launch_fanout(env, txn)) + else: + env.process(self._dma_fanout(env, txn)) + else: + txn.done.succeed() + + def _collect_response(self, resp_txn: Any) -> None: + """Receive a PE response and increment the aggregation counter.""" + key = resp_txn.request.request_id + if key not in self._pending: + return + expected, received, all_done = self._pending[key] + received += 1 + if received >= expected: + all_done.succeed() + del self._pending[key] + else: + self._pending[key] = (expected, received, all_done) + + def _dma_fanout(self, env: simpy.Environment, 
txn: Any) -> Generator: + """Fan out DMA sub-Transactions to target PE(s), wait for responses, + then send aggregate response on reverse command path. + + Each DMA transfer acquires the DMA resource (capacity=1 per ADR-0014 D4), + so multi-PE fan-out is serialized through the DMA engine. + """ + from kernbench.runtime_api.kernel import MemoryWriteMsg + + request = txn.request + target_pe = getattr(request, "target_pe", "all") + + dst_nodes = self._resolve_dma_destinations(request, target_pe) + if not dst_nodes: + txn.done.succeed() + return + + # Setup aggregation + all_done = env.event() + self._pending[request.request_id] = (len(dst_nodes), 0, all_done) + self._parent_txns[request.request_id] = txn + + # Select DMA resource based on operation type + dma_res = self._dma_write if isinstance(request, MemoryWriteMsg) else self._dma_read + + # Fan out DMA sub-txns (serialized through DMA resource) + max_drain_ns = 0.0 + for dst_node in dst_nodes: + try: + dma_path = self.ctx.router.find_mcpu_dma_path(self.node.id, dst_node) + except Exception: + continue + if len(dma_path) < 2: + continue + drain_ns = self.ctx.compute_drain_ns(dma_path, txn.nbytes) + max_drain_ns = max(max_drain_ns, drain_ns) + sub_txn = Transaction( + request=request, path=dma_path, step=0, + nbytes=txn.nbytes, done=env.event(), + drain_ns=drain_ns, + ) + with dma_res.request() as req: + yield req + yield self.out_ports[dma_path[1]].put(sub_txn.advance()) + + # Wait for all PE responses + yield all_done + txn.result_data["xfer_ns"] = max_drain_ns + del self._parent_txns[request.request_id] + + # Send aggregate response on reverse command path + reverse_path = list(reversed(txn.path)) + if len(reverse_path) >= 2: + from kernbench.runtime_api.kernel import ResponseMsg + + parts = self.node.id.split(".") + cube_id = int(parts[1].replace("cube", "")) + resp_msg = ResponseMsg( + correlation_id=request.correlation_id, + request_id=request.request_id, + src_cube=cube_id, src_pe=-1, success=True, + ) + 
resp_txn = Transaction( + request=resp_msg, path=reverse_path, step=0, + nbytes=0, done=env.event(), is_response=True, + ) + yield self.out_ports[reverse_path[1]].put(resp_txn.advance()) + else: + txn.done.succeed() + + def _kernel_launch_fanout(self, env: simpy.Environment, txn: Any) -> Generator: + """Fan out KernelLaunchMsg to target PE_CPU(s) via NOC (ADR-0009 D3). + + Routes through find_node_path (M_CPU → NOC → PE_CPU command edges). + Waits for sub_txn.done directly — no ResponseMsg needed for PE direction. + Then sends aggregate ResponseMsg back to IO_CPU on the reverse path. + """ + request = txn.request + target_pe = getattr(request, "target_pe", "all") + cube_prefix = self.node.id.rsplit(".", 1)[0] # e.g. "sip0.cube0" + pe_ids = self._resolve_pe_ids(target_pe) + + if not pe_ids: + txn.done.succeed() + return + + # Fan out to each PE_CPU and collect done events + sub_dones: list[simpy.Event] = [] + sub_txns: list[Transaction] = [] + for pe_id in pe_ids: + pe_cpu_id = f"{cube_prefix}.pe{pe_id}.pe_cpu" + try: + path = self.ctx.router.find_node_path(self.node.id, pe_cpu_id) + except Exception: + continue + if len(path) < 2: + continue + sub_done = env.event() + sub_txn = Transaction( + request=request, path=path, step=0, + nbytes=0, done=sub_done, + ) + yield self.out_ports[path[1]].put(sub_txn.advance()) + sub_dones.append(sub_done) + sub_txns.append(sub_txn) + + if not sub_dones: + txn.done.succeed() + return + + # Wait for all PE_CPUs to complete + for sd in sub_dones: + yield sd + + # Aggregate PE-internal metrics (max across PEs) + pe_exec_values = [st.result_data.get("pe_exec_ns", 0.0) for st in sub_txns] + if pe_exec_values: + txn.result_data["pe_exec_ns"] = max(pe_exec_values) + dma_values = [st.result_data.get("dma_ns", 0.0) for st in sub_txns] + if dma_values: + txn.result_data["dma_ns"] = max(dma_values) + compute_values = [st.result_data.get("compute_ns", 0.0) for st in sub_txns] + if compute_values: + txn.result_data["compute_ns"] = 
max(compute_values) + + # Send aggregate response on reverse command path back to IO_CPU + reverse_path = list(reversed(txn.path)) + if len(reverse_path) >= 2: + from kernbench.runtime_api.kernel import ResponseMsg + + parts = self.node.id.split(".") + cube_id = int(parts[1].replace("cube", "")) + resp_msg = ResponseMsg( + correlation_id=request.correlation_id, + request_id=request.request_id, + src_cube=cube_id, src_pe=-1, success=True, + ) + resp_txn = Transaction( + request=resp_msg, path=reverse_path, step=0, + nbytes=0, done=env.event(), is_response=True, + ) + yield self.out_ports[reverse_path[1]].put(resp_txn.advance()) + else: + txn.done.succeed() + + def _resolve_dma_destinations(self, request: Any, target_pe: int | str) -> list[str]: + """Return list of HBM destination node_ids for DMA fan-out. + + Uses PA-based resolution to determine the actual target cube and slice, + enabling cross-cube DMA routing when the PA points to a remote cube. + """ + cube_prefix = self.node.id.rsplit(".", 1)[0] # e.g. 
"sip0.cube0" + + if isinstance(target_pe, int): + return [f"{cube_prefix}.hbm_ctrl.slice{target_pe}"] + + # PA-based resolution: extract actual target from physical address + pa_val = getattr(request, "dst_pa", None) or getattr(request, "src_pa", None) + if pa_val is not None: + from kernbench.policy.address.phyaddr import PhysAddr + try: + pa = PhysAddr.decode(pa_val) + return [self.ctx.resolver.resolve(pa)] + except Exception: + pass + + # "all" without PA (KernelLaunch): all slices in local cube + n_slices = 8 + if self.ctx and self.ctx.spec: + mm = self.ctx.spec.get("cube", {}).get("memory_map", {}) + n_slices = mm.get("hbm_slices_per_cube", 8) + return [f"{cube_prefix}.hbm_ctrl.slice{i}" for i in range(n_slices)] + + def _resolve_pe_ids(self, target_pe: int | str) -> list[int]: + """Return list of PE IDs to fan out to (used by kernel launch fan-out).""" + if isinstance(target_pe, int): + return [target_pe] + # "all": all PEs in local cube + n_slices = 8 + if self.ctx and self.ctx.spec: + mm = self.ctx.spec.get("cube", {}).get("memory_map", {}) + n_slices = mm.get("hbm_slices_per_cube", 8) + return list(range(n_slices)) diff --git a/src/kernbench/components/impls/noc.py b/src/kernbench/components/impls/noc.py new file mode 100644 index 0000000..0c7af1f --- /dev/null +++ b/src/kernbench/components/impls/noc.py @@ -0,0 +1,187 @@ +from __future__ import annotations + +from collections.abc import Generator +from typing import TYPE_CHECKING, Any + +import simpy + +from kernbench.components.base import ComponentBase + +if TYPE_CHECKING: + from kernbench.components.context import ComponentContext + from kernbench.topology.types import Node + + +class TwoDMeshNocComponent(ComponentBase): + """2D mesh NOC modeled as a single smart node. + + Latency model: + - Traversal latency = Manhattan distance between prev_hop and next_hop + node positions, split into XY segments, traversed with pipeline. + - overhead_ns (from node.attrs) is added once per traversal. 
+ + Contention model: + - Each directed XY segment is a simpy.Resource(capacity=1). + - Pipeline: next segment's resource is requested before the current + segment's timeout completes, so a free downstream segment is acquired + immediately (wormhole-style cut-through). + - Two transactions sharing a segment (same row or column band) contend. + + Concurrency: + - _worker spawns an independent SimPy process per transaction, so the + NOC is never serialized at the node level — only at segment resources. + """ + + def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None: + super().__init__(node, ctx) + self._env: simpy.Environment | None = None + self._links: dict[tuple, simpy.Resource] = {} + self._x_grid: list[float] = [] + self._y_grid: list[float] = [] + + def start(self, env: simpy.Environment) -> None: + self._env = env + self._build_grid() + super().start(env) + + def run(self, env: simpy.Environment, nbytes: int) -> Generator: + yield env.timeout(0) + + # ── Grid construction ──────────────────────────────────────────── + + def _build_grid(self) -> None: + if not self.ctx: + return + cube_prefix = self.node.id.rsplit(".", 1)[0] + xs: set[float] = set() + ys: set[float] = set() + for node_id, pos in self.ctx.positions.items(): + if node_id.startswith(cube_prefix + ".") and pos is not None: + xs.add(round(pos[0], 2)) + ys.add(round(pos[1], 2)) + self._x_grid = sorted(xs) + self._y_grid = sorted(ys) + + def _get_link(self, key: tuple) -> simpy.Resource: + if key not in self._links: + assert self._env is not None + self._links[key] = simpy.Resource(self._env, capacity=1) + return self._links[key] + + # ── Worker ─────────────────────────────────────────────────────── + + def _worker(self, env: simpy.Environment) -> Generator: + while True: + txn: Any = yield self._inbox.get() + env.process(self._route(env, txn)) + + def _route(self, env: simpy.Environment, txn: Any) -> Generator: + prev_hop = txn.path[txn.step - 1] if txn.step > 0 else None + 
next_hop = txn.next_hop + overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0)) + + links: list[tuple[tuple, float]] = [] + if prev_hop and next_hop and self.ctx: + src_pos = self.ctx.positions.get(prev_hop) + dst_pos = self.ctx.positions.get(next_hop) + if src_pos and dst_pos: + links = self._xy_links(src_pos, dst_pos) + + if links: + yield from self._traverse(env, links, overhead_ns) + else: + yield env.timeout(overhead_ns) + + if next_hop: + yield self.out_ports[next_hop].put(txn.advance()) + else: + drain = getattr(txn, "drain_ns", 0.0) + if drain > 0: + yield env.timeout(drain) + txn.done.succeed() + + # ── XY routing and pipelined link traversal ────────────────────── + + def _traverse( + self, + env: simpy.Environment, + links: list[tuple[tuple, float]], + overhead_ns: float, + ) -> Generator: + """Pipeline: request next segment before current timeout finishes.""" + ns_per_mm = self.ctx.ns_per_mm # type: ignore[union-attr] + + # Acquire first link + first_key, _ = links[0] + current_resource = self._get_link(first_key) + current_req = current_resource.request() + yield current_req + + for i, (_, dist_mm) in enumerate(links): + # Request next link before current timeout (pipeline) + if i + 1 < len(links): + next_key, _ = links[i + 1] + next_resource = self._get_link(next_key) + next_req = next_resource.request() + + yield env.timeout(dist_mm * ns_per_mm + (overhead_ns if i == 0 else 0.0)) + current_resource.release(current_req) + + if i + 1 < len(links): + yield next_req # usually already fulfilled (pipeline) + current_resource = next_resource + current_req = next_req + + def _xy_links( + self, + src: tuple[float, float], + dst: tuple[float, float], + ) -> list[tuple[tuple, float]]: + """XY routing: horizontal segment first, then vertical. + + Returns list of (link_key, dist_mm) pairs, where link_key uniquely + identifies a directed segment shared across concurrent transactions. 
+ """ + x0, y0 = src + x1, y1 = dst + links: list[tuple[tuple, float]] = [] + + # Horizontal segment at y≈y0 + if abs(x0 - x1) > 1e-9: + y_band = self._snap(y0, self._y_grid) + for xa, xb in self._segments(x0, x1, self._x_grid): + d = abs(xb - xa) + if d > 1e-9: + lo, hi = (xa, xb) if xa < xb else (xb, xa) + dir_h = "E" if xb > xa else "W" + links.append((("H", round(y_band, 2), round(lo, 2), round(hi, 2), dir_h), d)) + + # Vertical segment at x≈x1 + if abs(y0 - y1) > 1e-9: + x_band = self._snap(x1, self._x_grid) + for ya, yb in self._segments(y0, y1, self._y_grid): + d = abs(yb - ya) + if d > 1e-9: + lo, hi = (ya, yb) if ya < yb else (yb, ya) + dir_v = "S" if yb > ya else "N" + links.append((("V", round(x_band, 2), round(lo, 2), round(hi, 2), dir_v), d)) + + return links + + @staticmethod + def _snap(val: float, grid: list[float]) -> float: + if not grid: + return val + return min(grid, key=lambda g: abs(g - val)) + + @staticmethod + def _segments(a: float, b: float, grid: list[float]) -> list[tuple[float, float]]: + """Consecutive (p_i, p_{i+1}) pairs covering range [a, b] using grid waypoints.""" + if abs(a - b) < 1e-9: + return [] + lo, hi = (a, b) if a < b else (b, a) + pts = [lo] + [g for g in grid if lo + 1e-9 < g < hi - 1e-9] + [hi] + pairs = [(pts[i], pts[i + 1]) for i in range(len(pts) - 1)] + if a > b: + pairs = [(p2, p1) for p1, p2 in reversed(pairs)] + return pairs diff --git a/src/kernbench/components/impls/pcie_ep.py b/src/kernbench/components/impls/pcie_ep.py new file mode 100644 index 0000000..53faac0 --- /dev/null +++ b/src/kernbench/components/impls/pcie_ep.py @@ -0,0 +1,27 @@ +from __future__ import annotations + +from collections.abc import Generator +from typing import TYPE_CHECKING + +import simpy + +from kernbench.components.base import ComponentBase + +if TYPE_CHECKING: + from kernbench.components.context import ComponentContext + from kernbench.topology.types import Node + + +class PcieEpComponent(ComponentBase): + """PCIe endpoint: 
protocol processing overhead before forwarding. + + Applies overhead_ns (from node.attrs) for PCIe protocol handling, + then forwards via inherited _forward_txn(). + """ + + def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None: + super().__init__(node, ctx) + + def run(self, env: simpy.Environment, nbytes: int) -> Generator: + overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0)) + yield env.timeout(overhead_ns) diff --git a/src/kernbench/components/impls/pe_cpu.py b/src/kernbench/components/impls/pe_cpu.py new file mode 100644 index 0000000..746856f --- /dev/null +++ b/src/kernbench/components/impls/pe_cpu.py @@ -0,0 +1,154 @@ +from __future__ import annotations + +from collections.abc import Generator +from typing import TYPE_CHECKING, Any + +import simpy + +from kernbench.components.base import ComponentBase + +if TYPE_CHECKING: + from kernbench.components.context import ComponentContext + from kernbench.topology.types import Node + + +class PeCpuComponent(ComponentBase): + """PE_CPU: kernel execution controller (Stage 2). + + Two-phase kernel execution (ADR-0014 D1): + Phase 1 (compile): look up kernel from registry, run it with TLContext + to generate a PeCommand list. + Phase 2 (replay): iterate commands, dispatch to PE_SCHEDULER via + PeInternalTxn, wait for blocking commands. + + Non-kernel Transactions are forwarded normally. + """ + + def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None: + super().__init__(node, ctx) + self._pe_prefix = node.id.rsplit(".", 1)[0] # e.g. 
"sip0.cube0.pe0" + try: + self._pe_idx = int(self._pe_prefix.rsplit("pe", 1)[1]) + except (IndexError, ValueError): + self._pe_idx = 0 + # Extract sip/cube index for multi-SIP/cube shard matching + parts = node.id.split(".") + try: + self._sip_idx = int(parts[0].replace("sip", "")) + except (IndexError, ValueError): + self._sip_idx = 0 + try: + self._cube_idx = int(parts[1].replace("cube", "")) + except (IndexError, ValueError): + self._cube_idx = 0 + + def _find_shard(self, shards: tuple) -> Any: + """Find shard matching this PE's (sip, cube, pe). Fallback to positional index.""" + for s in shards: + if s.sip == self._sip_idx and s.cube == self._cube_idx and s.pe == self._pe_idx: + return s + return shards[min(self._pe_idx, len(shards) - 1)] + + def run(self, env: simpy.Environment, nbytes: int) -> Generator: + overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0)) + yield env.timeout(overhead_ns) + + def _worker(self, env: simpy.Environment) -> Generator: + while True: + txn: Any = yield self._inbox.get() + from kernbench.runtime_api.kernel import KernelLaunchMsg + + if hasattr(txn, "request") and isinstance(txn.request, KernelLaunchMsg): + yield from self._execute_kernel(env, txn) + else: + yield from self._forward_txn(env, txn) + + def _execute_kernel(self, env: simpy.Environment, txn: Any) -> Generator: + """Compile kernel function and replay command trace.""" + from kernbench.common.pe_commands import ( + CompositeCmd, + PeCpuOverheadCmd, + PeInternalTxn, + WaitCmd, + ) + from kernbench.triton_emu.registry import get_kernel + from kernbench.triton_emu.tl_context import TLContext, run_kernel + + request = txn.request + + # Phase 1: Compile — apply PE_CPU setup overhead, then run kernel + yield from self.run(env, 0) + + kernel_fn = get_kernel(request.kernel_ref.name) + tl = TLContext(pe_id=self._pe_idx, dispatch_cycles=0) + + # Unpack KernelLaunchMsg.args into positional args for kernel function + # TensorArg → PA (pointer), ScalarArg → value + 
kernel_args: list = [] + for arg in request.args: + if arg.arg_kind == "tensor": + shard = self._find_shard(arg.shards) + kernel_args.append(shard.pa) + elif arg.arg_kind == "scalar": + kernel_args.append(arg.value) + + run_kernel(kernel_fn, tl, *kernel_args) + commands = tl.commands + + # Phase 2: Replay — dispatch commands to PE_SCHEDULER + pe_exec_start = env.now + scheduler_id = f"{self._pe_prefix}.pe_scheduler" + pending: dict[str, simpy.Event] = {} # completion_id → done event + composite_results: list[dict] = [] # collect result_data from CompositeCmd txns + + for cmd in commands: + if isinstance(cmd, PeCpuOverheadCmd): + yield env.timeout(cmd.cycles) + elif isinstance(cmd, WaitCmd): + if cmd.handle is not None: + evt = pending.pop(cmd.handle.id, None) + if evt: + yield evt + else: + # Wait all pending completions + for evt in pending.values(): + yield evt + pending.clear() + elif isinstance(cmd, CompositeCmd): + # Non-blocking: dispatch to scheduler, track completion + done_evt = env.event() + pe_txn = PeInternalTxn( + command=cmd, done=done_evt, + pe_prefix=self._pe_prefix, + ) + composite_results.append(pe_txn.result_data) + yield self.out_ports[scheduler_id].put(pe_txn) + pending[cmd.completion.id] = done_evt + else: + # Blocking: dispatch and wait for completion + done_evt = env.event() + pe_txn = PeInternalTxn( + command=cmd, done=done_evt, + pe_prefix=self._pe_prefix, + ) + yield self.out_ports[scheduler_id].put(pe_txn) + yield done_evt + + # Wait for any remaining pending completions + for evt in pending.values(): + yield evt + + # Record PE-internal execution time + txn.result_data["pe_exec_ns"] = env.now - pe_exec_start + + # Aggregate dma_ns / compute_ns from CompositeCmd results + total_dma_ns = 0.0 + total_compute_ns = 0.0 + for rd in composite_results: + total_dma_ns += rd.get("dma_ns", 0.0) + total_compute_ns += rd.get("compute_ns", 0.0) + txn.result_data["dma_ns"] = total_dma_ns + txn.result_data["compute_ns"] = total_compute_ns + + # Signal 
original Transaction done + txn.done.succeed() diff --git a/src/kernbench/components/impls/pe_dma.py b/src/kernbench/components/impls/pe_dma.py new file mode 100644 index 0000000..71ce8aa --- /dev/null +++ b/src/kernbench/components/impls/pe_dma.py @@ -0,0 +1,116 @@ +from __future__ import annotations + +from collections.abc import Generator +from typing import TYPE_CHECKING, Any + +import simpy + +from kernbench.components.base import PeEngineBase +from kernbench.sim_engine.transaction import Transaction + +if TYPE_CHECKING: + from kernbench.common.pe_commands import PeInternalTxn + from kernbench.components.context import ComponentContext + from kernbench.topology.types import Node + + +class PeDmaComponent(PeEngineBase): + """PE_DMA: dual-channel DMA engine with READ and WRITE resources. + + Each channel has capacity=1 (ADR-0014 D4): + - DMA_READ and DMA_WRITE may execute concurrently. + - Multiple READs cannot overlap; multiple WRITEs cannot overlap. + + Handles two message types: + - Transaction: external fabric messages (PeDmaMsg probes, M_CPU DMA) + - PeInternalTxn: PE-internal commands from PE_SCHEDULER + (DmaReadCmd → HBM read, DmaWriteCmd → HBM write) + """ + + def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None: + super().__init__(node, ctx) + self._dma_read: simpy.Resource | None = None + self._dma_write: simpy.Resource | None = None + + def init_resources(self, env: simpy.Environment) -> None: + self._dma_read = simpy.Resource(env, capacity=1) + self._dma_write = simpy.Resource(env, capacity=1) + + def run(self, env: simpy.Environment, nbytes: int) -> Generator: + yield env.timeout(0) + + def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator: + """Handle PE-internal DMA command: resolve PA → HBM path → transfer.""" + from kernbench.common.pe_commands import DmaReadCmd, DmaWriteCmd + from kernbench.policy.address.phyaddr import PhysAddr + from kernbench.runtime_api.kernel import PeDmaMsg + + cmd = 
pe_txn.command + assert self._dma_read is not None and self._dma_write is not None + + # Determine direction and target PA + if isinstance(cmd, DmaReadCmd): + dma_res = self._dma_read + target_pa = cmd.src_pa + is_write = False + elif isinstance(cmd, DmaWriteCmd): + dma_res = self._dma_write + target_pa = cmd.dst_pa + is_write = True + else: + pe_txn.done.succeed() + return + + # Resolve PA → HBM node and compute path + pa = PhysAddr.decode(target_pa) + dst_node = self.ctx.resolver.resolve(pa) + path = self.ctx.router.find_path(self._pe_prefix, dst_node) + drain_ns = self.ctx.compute_drain_ns(path, cmd.nbytes) + + # Acquire DMA channel (command issue serialization) + with dma_res.request() as req: + yield req + # Create sub-Transaction with PeDmaMsg (HbmCtrl handles it directly) + sub_done = env.event() + sub_request = PeDmaMsg( + correlation_id="pe_internal", + request_id=f"dma_{id(pe_txn)}", + src_sip=0, src_cube=0, src_pe=0, + dst_pa=target_pa, nbytes=cmd.nbytes, + is_write=is_write, + ) + sub_txn = Transaction( + request=sub_request, path=path, step=0, + nbytes=cmd.nbytes, done=sub_done, drain_ns=drain_ns, + ) + # Send to next hop (path[0] is pe_dma itself, path[1] is xbar) + if len(path) > 1: + yield self.out_ports[path[1]].put(sub_txn.advance()) + # DMA channel released after issue + + # Wait for HBM transfer completion + yield sub_done + pe_txn.done.succeed() + + def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator: + """Handle external Transaction (PeDmaMsg probe, M_CPU DMA) with channel acquisition.""" + dma_res = self._select_channel(txn) + with dma_res.request() as req: + yield req + next_hop = txn.next_hop + if next_hop: + yield self.out_ports[next_hop].put(txn.advance()) + else: + drain = getattr(txn, "drain_ns", 0.0) + if drain > 0: + yield env.timeout(drain) + txn.done.succeed() + + def _select_channel(self, txn: Any) -> simpy.Resource: + """Select DMA channel based on request type.""" + from kernbench.runtime_api.kernel import 
MemoryWriteMsg + + assert self._dma_read is not None and self._dma_write is not None + if isinstance(txn.request, MemoryWriteMsg): + return self._dma_write + return self._dma_read diff --git a/src/kernbench/components/impls/pe_gemm.py b/src/kernbench/components/impls/pe_gemm.py new file mode 100644 index 0000000..3fc74e3 --- /dev/null +++ b/src/kernbench/components/impls/pe_gemm.py @@ -0,0 +1,90 @@ +from __future__ import annotations + +from collections.abc import Generator +from typing import TYPE_CHECKING, Any + +import simpy + +from kernbench.components.base import PeEngineBase + +if TYPE_CHECKING: + from kernbench.common.pe_commands import PeInternalTxn + from kernbench.components.context import ComponentContext + from kernbench.topology.types import Node + + +# dtype → bit width (for TFLOPS scaling) +_DTYPE_BITS: dict[str, int] = { + "f16": 16, "fp16": 16, "float16": 16, "bf16": 16, + "f32": 32, "fp32": 32, "float32": 32, + "i8": 8, "int8": 8, + "i16": 16, "int16": 16, + "i32": 32, "int32": 32, +} + + +class PeGemmComponent(PeEngineBase): + """PE_GEMM: matrix multiplication engine sharing accel_slot (ADR-0014 D4). + + Uses a shared compute resource (PE_ACCEL capacity=1) that is mutually + exclusive with PE_MATH within the same PE. 
+ + Compute latency model: + FLOPs = 2 * M * K * N + effective_tflops = peak_tflops_f16 * (16 / dtype_bits) + compute_ns = FLOPs / (effective_tflops * 1e3) + """ + + def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None: + super().__init__(node, ctx) + self._accel: simpy.Resource | None = None + self._peak_tflops_f16: float = float(node.attrs.get("peak_tflops_f16", 0.0)) + + def init_resources(self, env: simpy.Environment) -> None: + resource_name = self.node.attrs.get("shared_resource") + if resource_name and self.ctx: + self._accel = self.ctx.get_shared_resource( + env, f"{self._pe_prefix}.{resource_name}" + ) + + def _compute_ns(self, m: int, k: int, n: int, dtype: str) -> float: + """Compute GEMM latency in nanoseconds.""" + if self._peak_tflops_f16 <= 0: + return float(self.node.attrs.get("overhead_ns", 0.0)) + dtype_bits = _DTYPE_BITS.get(dtype, 16) + effective_tflops = self._peak_tflops_f16 * (16.0 / dtype_bits) + flops = 2.0 * m * k * n + return flops / (effective_tflops * 1e3) + + def run(self, env: simpy.Environment, nbytes: int) -> Generator: + overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0)) + yield env.timeout(overhead_ns) + + def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator: + from kernbench.common.pe_commands import GemmCmd + + cmd = pe_txn.command + if self._accel: + with self._accel.request() as req: + yield req + if isinstance(cmd, GemmCmd): + ns = self._compute_ns(cmd.m, cmd.k, cmd.n, cmd.a.dtype) + yield env.timeout(ns) + else: + yield from self.run(env, 0) + else: + if isinstance(cmd, GemmCmd): + ns = self._compute_ns(cmd.m, cmd.k, cmd.n, cmd.a.dtype) + yield env.timeout(ns) + else: + yield from self.run(env, 0) + pe_txn.done.succeed() + + def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator: + """Transaction forwarding with accel_slot acquisition.""" + if self._accel: + with self._accel.request() as req: + yield req + yield from super()._forward_txn(env, 
txn) + else: + yield from super()._forward_txn(env, txn) diff --git a/src/kernbench/components/impls/pe_math.py b/src/kernbench/components/impls/pe_math.py new file mode 100644 index 0000000..c3c3a83 --- /dev/null +++ b/src/kernbench/components/impls/pe_math.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +from collections.abc import Generator +from typing import TYPE_CHECKING, Any + +import simpy + +from kernbench.components.base import PeEngineBase + +if TYPE_CHECKING: + from kernbench.common.pe_commands import PeInternalTxn + from kernbench.components.context import ComponentContext + from kernbench.topology.types import Node + + +class PeMathComponent(PeEngineBase): + """PE_MATH: element-wise computation engine sharing accel_slot (ADR-0014 D4). + + Uses a shared compute resource (PE_ACCEL capacity=1) that is mutually + exclusive with PE_GEMM within the same PE. + """ + + def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None: + super().__init__(node, ctx) + self._accel: simpy.Resource | None = None + + def init_resources(self, env: simpy.Environment) -> None: + resource_name = self.node.attrs.get("shared_resource") + if resource_name and self.ctx: + self._accel = self.ctx.get_shared_resource( + env, f"{self._pe_prefix}.{resource_name}" + ) + + def run(self, env: simpy.Environment, nbytes: int) -> Generator: + overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0)) + yield env.timeout(overhead_ns) + + def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator: + if self._accel: + with self._accel.request() as req: + yield req + yield from self.run(env, 0) + else: + yield from self.run(env, 0) + pe_txn.done.succeed() + + def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator: + """Transaction forwarding with accel_slot acquisition.""" + if self._accel: + with self._accel.request() as req: + yield req + yield from super()._forward_txn(env, txn) + else: + yield from 
super()._forward_txn(env, txn) diff --git a/src/kernbench/components/impls/pe_scheduler.py b/src/kernbench/components/impls/pe_scheduler.py new file mode 100644 index 0000000..d196759 --- /dev/null +++ b/src/kernbench/components/impls/pe_scheduler.py @@ -0,0 +1,245 @@ +from __future__ import annotations + +from collections.abc import Generator +from typing import TYPE_CHECKING, Any + +import simpy + +from kernbench.components.base import ComponentBase + +if TYPE_CHECKING: + from kernbench.common.pe_commands import PeInternalTxn + from kernbench.components.context import ComponentContext + from kernbench.topology.types import Node + + +class PeSchedulerComponent(ComponentBase): + """PE_SCHEDULER: sole dispatcher inside a PE (ADR-0014 D1). + + Receives PeInternalTxn from PE_CPU, routes to the appropriate engine: + - DmaReadCmd / DmaWriteCmd → PE_DMA + - GemmCmd → PE_GEMM + - MathCmd → PE_MATH + - CompositeCmd → tiled pipeline (Stage 3: ADR-0014 D3.2) + + Composite GEMM pipeline (32x64x32 tiles): + DMA_READ(b_tile_t) → COMPUTE(t) → DMA_WRITE(out_tile_t) + with overlap: READ(t+1) || COMPUTE(t) || WRITE(t-1) + + Applies scheduler overhead_ns before dispatching each command. + Non-PeInternalTxn messages are forwarded via inherited _forward_txn(). + """ + + # Scheduler tile dimensions (ADR-0014 D3.2) + TILE_M = 32 + TILE_K = 64 + TILE_N = 32 + + # Command → engine suffix dispatch table. + # New engines: add a single entry here (e.g. ConvCmd: "pe_conv"). 
+ _CMD_DISPATCH: dict[type, str] = {} + + @classmethod + def _ensure_dispatch_table(cls) -> None: + if cls._CMD_DISPATCH: + return + from kernbench.common.pe_commands import DmaReadCmd, DmaWriteCmd, GemmCmd, MathCmd + + cls._CMD_DISPATCH = { + DmaReadCmd: "pe_dma", + DmaWriteCmd: "pe_dma", + GemmCmd: "pe_gemm", + MathCmd: "pe_math", + } + + def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None: + super().__init__(node, ctx) + self._pe_prefix = node.id.rsplit(".", 1)[0] + self._ensure_dispatch_table() + + def run(self, env: simpy.Environment, nbytes: int) -> Generator: + overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0)) + yield env.timeout(overhead_ns) + + def _worker(self, env: simpy.Environment) -> Generator: + from kernbench.common.pe_commands import PeInternalTxn + + while True: + msg: Any = yield self._inbox.get() + if isinstance(msg, PeInternalTxn): + env.process(self._dispatch(env, msg)) + else: + yield from self._forward_txn(env, msg) + + def _dispatch(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator: + """Route a PeInternalTxn to the correct engine via dispatch table.""" + from kernbench.common.pe_commands import CompositeCmd + + # Scheduler overhead + yield from self.run(env, 0) + + cmd = pe_txn.command + + # Check dispatch table first + engine_suffix = self._CMD_DISPATCH.get(type(cmd)) + if engine_suffix is not None: + yield self.out_ports[f"{self._pe_prefix}.{engine_suffix}"].put(pe_txn) + return + + # CompositeCmd: tiled pipeline (not a simple forward) + if isinstance(cmd, CompositeCmd): + yield from self._dispatch_composite(env, pe_txn) + return + + # Unknown command — signal done immediately + pe_txn.done.succeed() + + def _dispatch_composite(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator: + """Composite tiled pipeline (ADR-0014 D3.2). + + GEMM: 3-stage pipeline with b-tile streaming from HBM. + MATH: sequential compute + DMA_WRITE (no tiling). 
+ """ + from kernbench.common.pe_commands import CompositeCmd + + cmd = pe_txn.command + assert isinstance(cmd, CompositeCmd) + if cmd.op == "gemm" and cmd.b is not None: + yield from self._pipeline_gemm(env, pe_txn, cmd) + else: + yield from self._pipeline_math(env, pe_txn, cmd) + + def _pipeline_gemm(self, env: simpy.Environment, pe_txn: PeInternalTxn, cmd: Any) -> Generator: + """Tiled GEMM pipeline: stream b tiles from HBM, compute, write results. + + Tensor a is in TCM (loaded via tl.load). Tensor b is in HBM (via tl.ref). + Pipeline: DMA_READ(b_tile_t) -> COMPUTE(t) -> DMA_WRITE(out_tile_t) + Overlap: READ(t+1) || COMPUTE(t) || WRITE(t-1) + """ + from kernbench.common.pe_commands import ( + DmaReadCmd, + DmaWriteCmd, + GemmCmd, + PeInternalTxn as PeTxn, + TensorHandle, + ) + + pp = self._pe_prefix + a = cmd.a # already in TCM + b = cmd.b # HBM reference (via tl.ref) + + M, K_a = a.shape[-2], a.shape[-1] + K_b, N = b.shape[-2], b.shape[-1] + dtype = a.dtype + dtype_bytes = b.nbytes // (K_b * N) if (K_b * N) > 0 else 2 + + # Tile counts + n_tiles_k = max(1, (K_a + self.TILE_K - 1) // self.TILE_K) + n_tiles_n = max(1, (N + self.TILE_N - 1) // self.TILE_N) + n_tiles = n_tiles_k * n_tiles_n + + prev_compute_done = None + prev_write_done = None + total_dma_ns = 0.0 + total_compute_ns = 0.0 + + for tile_idx in range(n_tiles): + tk = tile_idx // n_tiles_n + tn = tile_idx % n_tiles_n + + k_start = tk * self.TILE_K + n_start = tn * self.TILE_N + tile_k = min(self.TILE_K, K_a - k_start) + tile_n = min(self.TILE_N, N - n_start) + tile_nbytes = tile_k * tile_n * dtype_bytes + + # --- Stage 1: DMA_READ b_tile from HBM --- + read_done = env.event() + b_tile_pa = b.pa + (k_start * N + n_start) * dtype_bytes + b_tile_handle = TensorHandle( + id=f"b_tile_{tile_idx}", pa=b_tile_pa, + shape=(tile_k, tile_n), dtype=dtype, nbytes=tile_nbytes, + ) + read_cmd = DmaReadCmd(handle=b_tile_handle, src_pa=b_tile_pa, nbytes=tile_nbytes) + read_txn = PeTxn(command=read_cmd, done=read_done, 
pe_prefix=pp) + t0 = env.now + yield self.out_ports[f"{pp}.pe_dma"].put(read_txn) + + # Wait for previous compute before starting this tile's compute + if prev_compute_done is not None: + yield prev_compute_done + + # Wait for this tile's DMA_READ + yield read_done + total_dma_ns += env.now - t0 + + # --- Stage 2: COMPUTE (GEMM) --- + compute_done = env.event() + out_handle = TensorHandle( + id=f"out_tile_{tile_idx}", pa=0, + shape=(M, tile_n), dtype=dtype, + nbytes=M * tile_n * dtype_bytes, + ) + compute_cmd = GemmCmd(a=a, b=b_tile_handle, out=out_handle, + m=M, k=tile_k, n=tile_n) + compute_txn = PeTxn(command=compute_cmd, done=compute_done, pe_prefix=pp) + t0 = env.now + yield self.out_ports[f"{pp}.pe_gemm"].put(compute_txn) + + # Wait for previous write (DMA_WRITE serialization) + if prev_write_done is not None: + yield prev_write_done + + # Wait for compute of THIS tile + yield compute_done + total_compute_ns += env.now - t0 + prev_compute_done = compute_done + + # --- Stage 3: DMA_WRITE out_tile to HBM --- + write_done = env.event() + out_tile_pa = cmd.out_pa + n_start * dtype_bytes + write_nbytes = M * tile_n * dtype_bytes + write_cmd = DmaWriteCmd(handle=out_handle, dst_pa=out_tile_pa, nbytes=write_nbytes) + write_txn = PeTxn(command=write_cmd, done=write_done, pe_prefix=pp) + t0 = env.now + yield self.out_ports[f"{pp}.pe_dma"].put(write_txn) + prev_write_done = write_done + + # Wait for final write + if prev_write_done is not None: + t0 = env.now + yield prev_write_done + total_dma_ns += env.now - t0 + + pe_txn.result_data["dma_ns"] = total_dma_ns + pe_txn.result_data["compute_ns"] = total_compute_ns + pe_txn.done.succeed() + + def _pipeline_math(self, env: simpy.Environment, pe_txn: PeInternalTxn, cmd: Any) -> Generator: + """Non-GEMM composite: sequential compute + DMA_WRITE (no tiling).""" + from kernbench.common.pe_commands import ( + DmaWriteCmd, + MathCmd, + PeInternalTxn as PeTxn, + ) + + pp = self._pe_prefix + + # Step 1: Compute (MATH) + 
compute_done = env.event() + compute_cmd = MathCmd( + op=cmd.math_op or "identity", + inputs=(cmd.a,), out=cmd.a, + ) + compute_txn = PeTxn(command=compute_cmd, done=compute_done, pe_prefix=pp) + yield self.out_ports[f"{pp}.pe_math"].put(compute_txn) + yield compute_done + + # Step 2: DMA_WRITE result to HBM + write_done = env.event() + write_cmd = DmaWriteCmd(handle=cmd.a, dst_pa=cmd.out_pa, nbytes=cmd.out_nbytes) + write_txn = PeTxn(command=write_cmd, done=write_done, pe_prefix=pp) + yield self.out_ports[f"{pp}.pe_dma"].put(write_txn) + yield write_done + + pe_txn.done.succeed() diff --git a/src/kernbench/components/impls/pe_tcm.py b/src/kernbench/components/impls/pe_tcm.py new file mode 100644 index 0000000..6458d56 --- /dev/null +++ b/src/kernbench/components/impls/pe_tcm.py @@ -0,0 +1,25 @@ +from __future__ import annotations + +from collections.abc import Generator +from typing import TYPE_CHECKING + +from kernbench.components.base import ComponentBase + +if TYPE_CHECKING: + from kernbench.components.context import ComponentContext + from kernbench.topology.types import Node + + +class PeTcmComponent(ComponentBase): + """PE_TCM: tightly-coupled memory / local SRAM staging buffer. + + Terminal storage component for PE-internal dataflow (ADR-0014 D5). + Phase 0: applies overhead_ns and drain_ns at terminal. 
+ """ + + def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None: + super().__init__(node, ctx) + + def run(self, env, nbytes: int) -> Generator: + overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0)) + yield env.timeout(overhead_ns) diff --git a/src/kernbench/components/impls/sram.py b/src/kernbench/components/impls/sram.py new file mode 100644 index 0000000..d631ec4 --- /dev/null +++ b/src/kernbench/components/impls/sram.py @@ -0,0 +1,59 @@ +from __future__ import annotations + +from collections.abc import Generator +from typing import TYPE_CHECKING, Any + +import simpy + +from kernbench.components.base import ComponentBase +from kernbench.sim_engine.transaction import Transaction + +if TYPE_CHECKING: + from kernbench.components.context import ComponentContext + from kernbench.topology.types import Node + + +class SramComponent(ComponentBase): + """Cube SRAM: terminal component that models SRAM access latency. + + Applies overhead_ns processing overhead (from node.attrs). + On completion, sends a ResponseMsg back on the reverse path. 
+ """ + + def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None: + super().__init__(node, ctx) + + def run(self, env: simpy.Environment, nbytes: int) -> Generator: + overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0)) + yield env.timeout(overhead_ns) + + def _worker(self, env: simpy.Environment) -> Generator: + """Terminal worker: process, apply drain, send response.""" + while True: + txn: Any = yield self._inbox.get() + yield from self.run(env, txn.nbytes) + drain = getattr(txn, "drain_ns", 0.0) + if drain > 0: + yield env.timeout(drain) + yield from self._send_response(env, txn) + + def _send_response(self, env: simpy.Environment, txn: Any) -> Generator: + """Create ResponseMsg and send on reverse path.""" + reverse_path = list(reversed(txn.path)) + if len(reverse_path) >= 2 and self.ctx: + from kernbench.runtime_api.kernel import ResponseMsg + + parts = self.node.id.split(".") + cube_id = int(parts[1].replace("cube", "")) + resp_msg = ResponseMsg( + correlation_id=txn.request.correlation_id, + request_id=txn.request.request_id, + src_cube=cube_id, src_pe=-1, success=True, + ) + resp_txn = Transaction( + request=resp_msg, path=reverse_path, step=0, + nbytes=0, done=env.event(), is_response=True, + ) + yield self.out_ports[reverse_path[1]].put(resp_txn.advance()) + else: + txn.done.succeed() diff --git a/src/kernbench/di/registry.py b/src/kernbench/di/registry.py new file mode 100644 index 0000000..e69de29 diff --git a/src/kernbench/policy/address/allocator.py b/src/kernbench/policy/address/allocator.py new file mode 100644 index 0000000..5d10bb4 --- /dev/null +++ b/src/kernbench/policy/address/allocator.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +from dataclasses import dataclass + +from kernbench.policy.address.phyaddr import PhysAddr + + +class AllocationError(Exception): + pass + + +@dataclass(frozen=True) +class AddressConfig: + sip_count: int + cubes_per_sip: int + pes_per_cube: int + hbm_bytes_per_cube: int + 
hbm_slices_per_cube: int + tcm_bytes_per_pe: int + tcm_scheduler_reserved_bytes: int + sram_bytes_per_cube: int + + @property + def hbm_slice_bytes(self) -> int: + return self.hbm_bytes_per_cube // self.hbm_slices_per_cube + + @property + def tcm_allocatable_bytes(self) -> int: + return self.tcm_bytes_per_pe - self.tcm_scheduler_reserved_bytes + + +class PEMemAllocator: + def __init__( + self, rack_id: int, sip_id: int, cube_id: int, pe_id: int, cfg: AddressConfig, + ) -> None: + self._rack_id = rack_id + self._sip_id = sip_id + self._cube_id = cube_id + self._pe_id = pe_id + self._cfg = cfg + self._hbm_cursor = 0 + self._tcm_cursor = 0 + + def alloc_hbm(self, nbytes: int) -> PhysAddr: + if self._hbm_cursor + nbytes > self._cfg.hbm_slice_bytes: + raise AllocationError( + f"HBM overflow: need {nbytes}, " + f"available {self._cfg.hbm_slice_bytes - self._hbm_cursor}" + ) + pa = PhysAddr.pe_hbm_addr( + rack_id=self._rack_id, sip_id=self._sip_id, cube_id=self._cube_id, + pe_id=self._pe_id, pe_local_hbm_offset=self._hbm_cursor, + slice_size_bytes=self._cfg.hbm_slice_bytes, + ) + self._hbm_cursor += nbytes + return pa + + def alloc_tcm(self, nbytes: int) -> PhysAddr: + if self._tcm_cursor + nbytes > self._cfg.tcm_allocatable_bytes: + raise AllocationError( + f"TCM overflow: need {nbytes}, " + f"available {self._cfg.tcm_allocatable_bytes - self._tcm_cursor}" + ) + pa = PhysAddr.pe_tcm_addr( + rack_id=self._rack_id, sip_id=self._sip_id, cube_id=self._cube_id, + pe_id=self._pe_id, tcm_offset=self._tcm_cursor, + ) + self._tcm_cursor += nbytes + return pa + + @property + def hbm_used(self) -> int: + return self._hbm_cursor + + @property + def hbm_total(self) -> int: + return self._cfg.hbm_slice_bytes + + @property + def tcm_used(self) -> int: + return self._tcm_cursor + + @property + def tcm_total(self) -> int: + return self._cfg.tcm_allocatable_bytes diff --git a/src/kernbench/policy/address/phyaddr.py b/src/kernbench/policy/address/phyaddr.py new file mode 100644 index 
0000000..b19e3cc --- /dev/null +++ b/src/kernbench/policy/address/phyaddr.py @@ -0,0 +1,184 @@ +from __future__ import annotations + +from dataclasses import dataclass +from enum import IntEnum +from typing import Literal + +MAX_51 = (1 << 51) - 1 + + +class PhysAddrError(Exception): + pass + + +def _chk_range(name: str, v: int, bits: int) -> None: + if not (0 <= v < (1 << bits)): + raise PhysAddrError(f"{name} out of range for {bits} bits: {v}") + + +def _chk_max(name: str, v: int, maxv: int) -> None: + if not (0 <= v <= maxv): + raise PhysAddrError(f"{name} out of range (0..{maxv}): {v}") + + +class UnitType(IntEnum): + PE = 0 + MCPU = 1 + SRAM = 2 + + +@dataclass(frozen=True) +class PhysAddr: + """ + 51-bit physical address value object. + + Layout: + [50:47] rack_id (4) + [46:43] sip_id (4) + [42:38] sip_seg (5) # cube_id + [37:0] local_offset (38) => each segment is 256GB + + local_offset: + [37] selector: 1 = HBM window (128GB reserved), 0 = PE resource window + """ + + rack_id: int + sip_id: int + sip_seg: int + local_offset: int + + kind: Literal["hbm", "pe_resource", "raw"] = "raw" + cube_id: int = 0 + unit_type: UnitType = UnitType.PE + pe_id: int = 0 + ext: int = 0 + sub_offset: int = 0 + hbm_offset: int = 0 + + HBM_WINDOW_BYTES = 1 << 37 # 128GB + + def encode(self) -> int: + _chk_range("rack_id", self.rack_id, 4) + _chk_range("sip_id", self.sip_id, 4) + _chk_range("sip_seg", self.sip_seg, 5) + _chk_range("local_offset", self.local_offset, 38) + addr = (self.rack_id << 47) | (self.sip_id << 43) | (self.sip_seg << 38) | self.local_offset + if not (0 <= addr <= MAX_51): + raise PhysAddrError("address exceeds 51-bit space") + return addr + + @staticmethod + def decode(addr: int) -> PhysAddr: + if not (0 <= addr <= MAX_51): + raise PhysAddrError("addr must be a 51-bit value") + rack = (addr >> 47) & 0xF + sip_id = (addr >> 43) & 0xF + sip_seg = (addr >> 38) & 0x1F + off = addr & ((1 << 38) - 1) + cube_id = sip_seg + sel = (off >> 37) & 0x1 + if sel == 1: + 
hbm_offset = int(off & ((1 << 37) - 1)) + return PhysAddr( + rack_id=rack, + sip_id=sip_id, + sip_seg=sip_seg, + local_offset=off, + kind="hbm", + cube_id=cube_id, + hbm_offset=hbm_offset, + ) + # PE resource decode + raw_ut = int((off >> 34) & 0x7) + try: + unit_type = UnitType(raw_ut) + except ValueError: + raise PhysAddrError(f"unknown unit_type: {raw_ut}") from None + pe_id = int((off >> 30) & 0xF) + ext = int((off >> 29) & 0x1) + sub_offset = int(off & ((1 << 29) - 1)) + return PhysAddr( + rack_id=rack, + sip_id=sip_id, + sip_seg=sip_seg, + local_offset=off, + kind="pe_resource", + cube_id=cube_id, + unit_type=unit_type, + pe_id=pe_id, + ext=ext, + sub_offset=sub_offset, + hbm_offset=0, + ) + + @staticmethod + def hbm_addr(*, rack_id: int, sip_id: int, cube_id: int, hbm_offset: int) -> PhysAddr: + _chk_max("cube_id", cube_id, 31) + _chk_range("hbm_offset", hbm_offset, 37) + sip_seg = cube_id + local_offset = (1 << 37) | int(hbm_offset) + return PhysAddr( + rack_id=rack_id, + sip_id=sip_id, + sip_seg=sip_seg, + local_offset=local_offset, + kind="hbm", + cube_id=cube_id, + hbm_offset=int(hbm_offset), + ) + + @staticmethod + def pe_hbm_addr( + *, + rack_id: int, + sip_id: int, + cube_id: int, + pe_id: int, + pe_local_hbm_offset: int, + slice_size_bytes: int, + ) -> PhysAddr: + _chk_max("cube_id", cube_id, 31) + _chk_range("pe_id", pe_id, 4) + if not (0 <= pe_local_hbm_offset < slice_size_bytes): + raise PhysAddrError("pe_local_hbm_offset out of PE local slice range") + hbm_offset = int(pe_id) * int(slice_size_bytes) + int(pe_local_hbm_offset) + if not (0 <= hbm_offset < PhysAddr.HBM_WINDOW_BYTES): + raise PhysAddrError("HBM offset exceeds reserved 128GB window") + return PhysAddr.hbm_addr( + rack_id=rack_id, sip_id=sip_id, cube_id=cube_id, hbm_offset=hbm_offset + ) + + @staticmethod + def hbm_pe_id(hbm_offset: int, slice_size_bytes: int) -> int: + return hbm_offset // slice_size_bytes + + @staticmethod + def cube_sram_addr( + *, rack_id: int, sip_id: int, 
cube_id: int, sram_offset: int, + ) -> PhysAddr: + _chk_max("cube_id", cube_id, 31) + _chk_range("sram_offset", sram_offset, 29) + sip_seg = cube_id + local_offset = (UnitType.SRAM << 34) | sram_offset + return PhysAddr( + rack_id=rack_id, sip_id=sip_id, sip_seg=sip_seg, + local_offset=local_offset, + kind="pe_resource", cube_id=cube_id, + unit_type=UnitType.SRAM, sub_offset=sram_offset, + ) + + @staticmethod + def pe_tcm_addr( + *, rack_id: int, sip_id: int, cube_id: int, pe_id: int, tcm_offset: int, + ) -> PhysAddr: + _chk_max("cube_id", cube_id, 31) + _chk_range("pe_id", pe_id, 4) + _chk_range("tcm_offset", tcm_offset, 29) + sip_seg = cube_id + local_offset = (UnitType.PE << 34) | (pe_id << 30) | tcm_offset + return PhysAddr( + rack_id=rack_id, sip_id=sip_id, sip_seg=sip_seg, + local_offset=local_offset, + kind="pe_resource", cube_id=cube_id, + unit_type=UnitType.PE, pe_id=pe_id, sub_offset=tcm_offset, + ) diff --git a/src/kernbench/policy/placement/dp.py b/src/kernbench/policy/placement/dp.py new file mode 100644 index 0000000..8860d7f --- /dev/null +++ b/src/kernbench/policy/placement/dp.py @@ -0,0 +1,174 @@ +from __future__ import annotations + +from dataclasses import dataclass +from math import ceil +from typing import Literal + + +@dataclass(frozen=True) +class DPPolicy: + """Two-level data-parallel policy: cube-level + pe-level.""" + + cube: Literal["replicate", "shard_m", "shard_k"] = "replicate" + pe: Literal["replicate", "column_wise", "row_wise"] = "replicate" + + +def resolve_dp_policy( + policy: DPPolicy, + *, + shape: tuple[int, int], + itemsize: int, + num_pe: int, + num_cubes: int = 1, +) -> list[ShardSpec]: + """Resolve a DPPolicy into a list[ShardSpec] with two-level resolution. + + Cube-level policy distributes across cubes, pe-level distributes within + each cube. ShardSpec.pe_index uses flat indexing: cube_id * num_pe + pe_id. 
+ """ + _PE_RESOLVERS = { + "replicate": replicate, + "column_wise": column_wise, + "row_wise": row_wise, + } + resolver = _PE_RESOLVERS.get(policy.pe) + if resolver is None: + raise ValueError(f"Unknown pe-level policy: {policy.pe}") + + if num_cubes <= 1: + return resolver(shape=shape, itemsize=itemsize, num_pe=num_pe) + + # Two-level resolution: cube-level → pe-level + M, K = shape + all_shards: list[ShardSpec] = [] + + for cube_id in range(num_cubes): + # Determine per-cube shape based on cube-level policy + if policy.cube == "replicate": + cube_shape = (M, K) + cube_offset = 0 + elif policy.cube == "shard_m": + chunk_m = M // num_cubes + cube_shape = (chunk_m, K) + cube_offset = cube_id * chunk_m * K * itemsize + elif policy.cube == "shard_k": + chunk_k = K // num_cubes + cube_shape = (M, chunk_k) + cube_offset = cube_id * M * chunk_k * itemsize + else: + raise ValueError(f"Unknown cube-level policy: {policy.cube}") + + # Resolve pe-level within this cube's shape + pe_shards = resolver(shape=cube_shape, itemsize=itemsize, num_pe=num_pe) + + # Remap pe_index to flat index and adjust offset + for ps in pe_shards: + flat_idx = cube_id * num_pe + ps.pe_index + all_shards.append(ShardSpec( + pe_index=flat_idx, + offset_bytes=cube_offset + ps.offset_bytes, + nbytes=ps.nbytes, + )) + + return all_shards + + +@dataclass(frozen=True) +class ShardSpec: + pe_index: int + offset_bytes: int + nbytes: int + + +def column_wise( + *, shape: tuple[int, int], itemsize: int, num_pe: int, +) -> list[ShardSpec]: + """Split K axis into num_pe equal parts. Each PE gets (M, K/P).""" + M, K = shape + chunk_k = K // num_pe + chunk_bytes = M * chunk_k * itemsize + shards = [] + for i in range(num_pe): + shards.append(ShardSpec( + pe_index=i, + offset_bytes=i * chunk_bytes, + nbytes=chunk_bytes, + )) + return shards + + +def row_wise( + *, shape: tuple[int, int], itemsize: int, num_pe: int, +) -> list[ShardSpec]: + """Split M axis into num_pe equal parts. 
Each PE gets (M/P, K).""" + M, K = shape + chunk_m = M // num_pe + chunk_bytes = chunk_m * K * itemsize + shards = [] + for i in range(num_pe): + shards.append(ShardSpec( + pe_index=i, + offset_bytes=i * chunk_bytes, + nbytes=chunk_bytes, + )) + return shards + + +def replicate( + *, shape: tuple[int, int], itemsize: int, num_pe: int, +) -> list[ShardSpec]: + """Full copy per PE. Each PE gets (M, K).""" + M, K = shape + full_bytes = M * K * itemsize + return [ + ShardSpec(pe_index=i, offset_bytes=0, nbytes=full_bytes) + for i in range(num_pe) + ] + + +def tiled_column_major( + *, shape: tuple[int, int], itemsize: int, num_pe: int, + tile_m: int, tile_k: int, +) -> list[ShardSpec]: + """2D tiling, column-major order (K axis first), round-robin across PEs.""" + M, K = shape + tiles_m = ceil(M / tile_m) + tiles_k = ceil(K / tile_k) + tile_bytes = tile_m * tile_k * itemsize + row_bytes = K * itemsize + shards = [] + idx = 0 + for mi in range(tiles_m): + for ki in range(tiles_k): + offset = (mi * tile_m * row_bytes) + (ki * tile_k * itemsize) + shards.append(ShardSpec( + pe_index=idx % num_pe, + offset_bytes=offset, + nbytes=tile_bytes, + )) + idx += 1 + return shards + + +def tiled_row_major( + *, shape: tuple[int, int], itemsize: int, num_pe: int, + tile_m: int, tile_k: int, +) -> list[ShardSpec]: + """2D tiling, row-major order (M axis first), round-robin across PEs.""" + M, K = shape + tiles_m = ceil(M / tile_m) + tiles_k = ceil(K / tile_k) + tile_bytes = tile_m * tile_k * itemsize + row_bytes = K * itemsize + shards = [] + idx = 0 + for ki in range(tiles_k): + for mi in range(tiles_m): + offset = (mi * tile_m * row_bytes) + (ki * tile_k * itemsize) + shards.append(ShardSpec( + pe_index=idx % num_pe, + offset_bytes=offset, + nbytes=tile_bytes, + )) + idx += 1 + return shards diff --git a/src/kernbench/policy/routing/router.py b/src/kernbench/policy/routing/router.py new file mode 100644 index 0000000..5565e45 --- /dev/null +++ 
b/src/kernbench/policy/routing/router.py @@ -0,0 +1,184 @@ +from __future__ import annotations + +import heapq +from collections import defaultdict + +from kernbench.policy.address.phyaddr import PhysAddr, UnitType +from kernbench.topology.types import TopologyGraph + + +class RoutingError(Exception): + pass + + +class AddressResolver: + """Resolve a PhysAddr to the destination node_id in the compiled graph. + + Also provides named node lookups (find_m_cpu, find_pcie_ep, …) so that + component implementations never construct node_id strings directly. + Centralising the naming convention here means a single change propagates + everywhere (ADR-0015 D4). + """ + + def __init__(self, graph: TopologyGraph) -> None: + self._node_ids = set(graph.nodes) + mm = graph.spec["cube"]["memory_map"] + self._slice_size_bytes = mm["hbm_total_gb_per_cube"] * (1 << 30) // mm["hbm_slices_per_cube"] + + # ── Physical-address resolution ────────────────────────────────── + + def resolve(self, addr: PhysAddr) -> str: + s = addr.sip_id + c = addr.cube_id + if addr.kind == "hbm": + pe_slice = PhysAddr.hbm_pe_id(addr.hbm_offset, self._slice_size_bytes) + node_id = f"sip{s}.cube{c}.hbm_ctrl.slice{pe_slice}" + elif addr.kind == "pe_resource": + if addr.unit_type == UnitType.PE: + node_id = f"sip{s}.cube{c}.pe{addr.pe_id}.pe_tcm" + elif addr.unit_type == UnitType.SRAM: + node_id = f"sip{s}.cube{c}.sram" + elif addr.unit_type == UnitType.MCPU: + node_id = f"sip{s}.cube{c}.m_cpu" + else: + raise RoutingError(f"unsupported unit_type: {addr.unit_type}") + else: + raise RoutingError(f"unsupported address kind: {addr.kind}") + if node_id not in self._node_ids: + raise RoutingError(f"node {node_id} not found in topology") + return node_id + + # ── Named node lookups ─────────────────────────────────────────── + + def find_m_cpu(self, sip: int, cube: int) -> str: + node_id = f"sip{sip}.cube{cube}.m_cpu" + if node_id not in self._node_ids: + raise RoutingError(f"M_CPU not found: {node_id}") + return 
node_id + + def find_pcie_ep(self, sip: int, io_id: str = "io0") -> str: + node_id = f"sip{sip}.{io_id}.pcie_ep" + if node_id not in self._node_ids: + raise RoutingError(f"PCIE_EP not found: {node_id}") + return node_id + + def find_io_cpu(self, sip: int, io_id: str = "io0") -> str: + node_id = f"sip{sip}.{io_id}.io_cpu" + if node_id not in self._node_ids: + raise RoutingError(f"IO_CPU not found: {node_id}") + return node_id + + def find_all_pcie_eps(self) -> list[str]: + """Return all PCIE_EP node ids across all SIPs, sorted.""" + return sorted(nid for nid in self._node_ids if nid.endswith(".pcie_ep")) + + +class PathRouter: + """Find data-path from a source PE (or arbitrary node) to a destination node. + + Two adjacency graphs are maintained: + _adj — excludes command edges (used by PE DMA routing, find_path) + _adj_all — includes all edges (used by component-to-component routing, + find_node_path; required because M_CPU↔NOC links are "command") + """ + + # Edge kinds excluded from M_CPU DMA adjacency: prevents routing through + # PE-internal pipeline nodes when computing DMA paths. 
+ _MCPU_DMA_EXCLUDE = {"pe_internal", "pe_to_xbar"} + + def __init__(self, graph: TopologyGraph) -> None: + self._adj: dict[str, list[tuple[str, float]]] = defaultdict(list) + self._adj_all: dict[str, list[tuple[str, float]]] = defaultdict(list) + self._adj_mcpu_dma: dict[str, list[tuple[str, float]]] = defaultdict(list) + for e in graph.edges: + w = e.routing_weight_mm if e.routing_weight_mm is not None else e.distance_mm + self._adj_all[e.src].append((e.dst, w)) + if e.kind != "command": + self._adj[e.src].append((e.dst, w)) + if e.kind not in self._MCPU_DMA_EXCLUDE: + self._adj_mcpu_dma[e.src].append((e.dst, w)) + + def find_path(self, src_pe: str, dst_node: str) -> list[str]: + """PE DMA routing: prepends .pe_dma, excludes command edges.""" + start = f"{src_pe}.pe_dma" + return self._run_dijkstra(self._adj, start, dst_node) + + def find_path_with_distance(self, src_pe: str, dst_node: str) -> tuple[list[str], float]: + start = f"{src_pe}.pe_dma" + return self._run_dijkstra_with_dist(self._adj, start, dst_node) + + def find_mcpu_dma_path(self, m_cpu_id: str, dst_hbm_slice_id: str) -> list[str]: + """M_CPU DMA path: never routes through PE-internal nodes (ADR-0015 D5). + + Same-cube: deterministic [m_cpu, noc, xbar.pe_i, hbm_ctrl.slice_i]. + Cross-cube: Dijkstra via _adj_mcpu_dma (pe_internal/pe_to_xbar excluded) + → routes through NOC → UCIe → target cube NOC → xbar → HBM. + """ + m_cube = ".".join(m_cpu_id.split(".")[:2]) + d_cube = ".".join(dst_hbm_slice_id.split(".")[:2]) + if m_cube == d_cube: + slice_idx = int(dst_hbm_slice_id.rsplit("slice", 1)[1]) + return [ + m_cpu_id, + f"{m_cube}.noc", + f"{m_cube}.xbar.pe{slice_idx}", + dst_hbm_slice_id, + ] + return self._run_dijkstra(self._adj_mcpu_dma, m_cpu_id, dst_hbm_slice_id) + + def find_node_path(self, src: str, dst: str) -> list[str]: + """General routing between arbitrary nodes, including command edges. + + Used by components (IoCpuComponent, MCpuComponent) that route through + M_CPU↔NOC command-kind links. 
+ """ + return self._run_dijkstra(self._adj_all, src, dst) + + def _run_dijkstra( + self, + adj: dict[str, list[tuple[str, float]]], + start: str, + goal: str, + ) -> list[str]: + path, _ = self._run_dijkstra_with_dist(adj, start, goal) + return path + + def _run_dijkstra_with_dist( + self, + adj: dict[str, list[tuple[str, float]]], + start: str, + goal: str, + ) -> tuple[list[str], float]: + if start == goal: + return [start], 0.0 + best: dict[str, float] = {start: 0.0} + prev: dict[str, str] = {} + heap: list[tuple[float, str]] = [(0.0, start)] + while heap: + d, node = heapq.heappop(heap) + if node == goal: + path: list[str] = [] + cur = goal + while cur != start: + path.append(cur) + cur = prev[cur] + path.append(start) + path.reverse() + return path, d + if d > best.get(node, float("inf")): + continue + for neighbor, edge_dist in adj[node]: + new_d = d + edge_dist + if new_d < best.get(neighbor, float("inf")): + best[neighbor] = new_d + prev[neighbor] = node + heapq.heappush(heap, (new_d, neighbor)) + raise RoutingError(f"no path from {start} to {goal}") + + # ── backward-compat shims (used by existing tests) ─────────────── + + def _dijkstra(self, start: str, goal: str) -> list[str]: + return self._run_dijkstra(self._adj, start, goal) + + def _dijkstra_with_dist(self, start: str, goal: str) -> tuple[list[str], float]: + return self._run_dijkstra_with_dist(self._adj, start, goal) diff --git a/src/kernbench/runtime_api/__init__.py b/src/kernbench/runtime_api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/kernbench/runtime_api/bench_runner.py b/src/kernbench/runtime_api/bench_runner.py new file mode 100644 index 0000000..164a6a7 --- /dev/null +++ b/src/kernbench/runtime_api/bench_runner.py @@ -0,0 +1,96 @@ +from __future__ import annotations + +from collections.abc import Callable +from enum import Enum +from typing import Any + +from kernbench.common.types import Completion, SimEngine, Trace + +from .context import RuntimeContext +from 
.types import BenchResult, DeviceSelector + + +class CompletionPolicy(str, Enum): + LAST_SUBMITTED = "last_submitted" + LAST_COMPLETED = "last_completed" # requires trace/timestamps or engine support; stub for now + ALL_OK_FAIL_FAST = "all_ok_fail_fast" + + +BenchFn = Callable[[RuntimeContext], Any] +EngineFactory = Callable[[object, DeviceSelector], SimEngine] + + +def run_bench( + *, + topology: object, + bench_fn: BenchFn, + device: DeviceSelector, + engine_factory: EngineFactory, + correlation_id: str = "bench0", + completion_policy: CompletionPolicy = CompletionPolicy.LAST_SUBMITTED, +) -> BenchResult: + """ + Minimal bench runner. + + - topology: compiled topology object (opaque to runtime here) + - bench_fn: callable that receives RuntimeContext and submits requests + - device: DeviceSelector ("all" or "sip:") + - engine_factory: builds sim_engine for given topology & device + - completion_policy: how to determine overall completion/result + """ + engine = engine_factory(topology, device) + # Extract spec from TopologyHandle or TopologyGraph + topo_obj = getattr(topology, "topology_obj", topology) + spec = getattr(topo_obj, "spec", None) + ctx = RuntimeContext( + engine=engine, target_device=device, + correlation_id=correlation_id, spec=spec, + ) + + bench_fn(ctx) + + ctx.wait_all() + + collected_traces = ctx._traces or None + + handles = ctx.handles() + if not handles: + return BenchResult( + completion=Completion( + ok=False, error_code="NO_REQUESTS", error_message="Bench submitted no requests" + ), + correlation_id=correlation_id, + trace=None, + traces=collected_traces, + ) + + if completion_policy == CompletionPolicy.LAST_SUBMITTED: + last = handles[-1] + completion, trace = engine.get_completion(last) + return BenchResult( + completion=completion, correlation_id=correlation_id, + trace=trace, traces=collected_traces, + ) + + if completion_policy == CompletionPolicy.ALL_OK_FAIL_FAST: + last_trace: Trace | None = None + for h in handles: + c, t = 
engine.get_completion(h) + last_trace = t if t is not None else last_trace + if not c.ok: + return BenchResult( + completion=c, correlation_id=correlation_id, + trace=last_trace, traces=collected_traces, + ) + return BenchResult( + completion=Completion(ok=True), correlation_id=correlation_id, + trace=last_trace, traces=collected_traces, + ) + + # LAST_COMPLETED placeholder (needs engine support for timing). Fall back. + last = handles[-1] + completion, trace = engine.get_completion(last) + return BenchResult( + completion=completion, correlation_id=correlation_id, + trace=trace, traces=collected_traces, + ) diff --git a/src/kernbench/runtime_api/context.py b/src/kernbench/runtime_api/context.py new file mode 100644 index 0000000..e9cf270 --- /dev/null +++ b/src/kernbench/runtime_api/context.py @@ -0,0 +1,282 @@ +# kernbench/runtime_api/context.py +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any + +from kernbench.common.types import Completion, RequestHandle, SimEngine + +from .types import DeviceSelector + + +@dataclass +class RuntimeContext: + engine: SimEngine + target_device: DeviceSelector + correlation_id: str + spec: dict | None = None + + _handles: list[RequestHandle] = field(default_factory=list, init=False) + _completed: set[RequestHandle] = field(default_factory=set, init=False) + _allocators: dict[int, Any] = field(default_factory=dict, init=False) + _tensor_counter: int = field(default=0, init=False) + _traces: list[dict] = field(default_factory=list, init=False) + + def submit(self, request: Any) -> RequestHandle: + submit_fn = getattr(self.engine, "submit", None) + if submit_fn is None: + raise AttributeError("Engine does not implement submit(request) -> RequestHandle.") + handle: RequestHandle = submit_fn(request) # type: ignore[call-arg] + self._handles.append(handle) + return handle + + def is_completed(self, handle: RequestHandle) -> bool: + return handle in self._completed + + def wait(self, 
handle: RequestHandle, *, _meta: dict | None = None) -> Completion: + if handle in self._completed: + completion, trace = self.engine.get_completion(handle) + return completion + + wait_fn = getattr(self.engine, "wait", None) + if wait_fn is not None: + wait_fn(handle) # type: ignore[misc] + + completion, trace = self.engine.get_completion(handle) + self._completed.add(handle) + if _meta is not None and trace is not None: + entry = dict(trace) if isinstance(trace, dict) else {"raw": trace} + entry.update(_meta) + self._traces.append(entry) + return completion + + def wait_all(self) -> None: + for h in self._handles: + if h not in self._completed: + self.wait(h) + + def handles(self) -> list[RequestHandle]: + return list(self._handles) + + # ── PyTorch-like tensor API ────────────────────────────────────── + + def _ensure_allocators(self) -> dict: + """Lazily create PEMemAllocator instances from spec.""" + if self._allocators: + return self._allocators + if self.spec is None: + raise RuntimeError( + "RuntimeContext.spec is required for tensor operations. " + "Pass spec=graph.spec when creating RuntimeContext." 
+ ) + from kernbench.policy.address.allocator import AddressConfig, PEMemAllocator + + system = self.spec.get("system", {}) + cube = self.spec.get("cube", {}) + mm = cube.get("memory_map", {}) + pe_template = cube.get("pe_template", {}) + pe_comps = pe_template.get("components", {}) + tcm_cfg = pe_comps.get("pe_tcm", {}).get("attrs", {}) + + sip_count = system.get("sips", {}).get("count", 1) + cubes_per_sip = system.get("sips", {}).get("cubes_per_sip", 16) + pes_per_cube = ( + cube.get("pe_layout", {}).get("pe_per_corner", 2) + * len(cube.get("pe_layout", {}).get("corners", ["NW", "NE", "SW", "SE"])) + ) + hbm_gb = mm.get("hbm_total_gb_per_cube", 48) + hbm_slices = mm.get("hbm_slices_per_cube", 8) + tcm_mb = tcm_cfg.get("size_mb", 16) + + cfg = AddressConfig( + sip_count=sip_count, + cubes_per_sip=cubes_per_sip, + pes_per_cube=pes_per_cube, + hbm_bytes_per_cube=hbm_gb * (1 << 30), + hbm_slices_per_cube=hbm_slices, + tcm_bytes_per_pe=tcm_mb * (1 << 20), + tcm_scheduler_reserved_bytes=4 * (1 << 20), + sram_bytes_per_cube=32 * (1 << 20), + ) + # Create allocators for all SIPs × cubes × PEs + # Flat index: sip_id * cubes_per_sip * pes_per_cube + cube_id * pes_per_cube + pe_id + self._pes_per_cube = pes_per_cube + self._num_cubes = cubes_per_sip + self._num_sips = sip_count + cubes_x_pes = cubes_per_sip * pes_per_cube + for sip_id in range(sip_count): + for cube_id in range(cubes_per_sip): + for pe_id in range(pes_per_cube): + flat_idx = sip_id * cubes_x_pes + cube_id * pes_per_cube + pe_id + self._allocators[flat_idx] = PEMemAllocator( + rack_id=0, sip_id=sip_id, cube_id=cube_id, pe_id=pe_id, cfg=cfg, + ) + return self._allocators + + def _next_tensor_name(self) -> str: + self._tensor_counter += 1 + return f"t{self._tensor_counter}" + + def zeros( + self, + shape: tuple[int, ...], + dtype: str = "f16", + *, + placement: list | None = None, + dp: Any = None, + name: str | None = None, + ): + """Create a tensor and deploy to HBM with zero-fill (like torch.zeros).""" + 
return self._create_tensor(shape, dtype, placement, name, pattern="zero", dp=dp) + + def empty( + self, + shape: tuple[int, ...], + dtype: str = "f16", + *, + placement: list | None = None, + dp: Any = None, + name: str | None = None, + ): + """Allocate a tensor in HBM without initialization (like torch.empty).""" + return self._create_tensor(shape, dtype, placement, name, pattern=None, dp=dp) + + def _create_tensor( + self, + shape: tuple[int, ...], + dtype: str, + placement: list | None, + name: str | None, + pattern: str | None, + dp: Any = None, + ): + from kernbench.policy.placement.dp import DPPolicy, ShardSpec, resolve_dp_policy + from kernbench.runtime_api.kernel import MemoryWriteMsg + from kernbench.runtime_api.tensor import Tensor, deploy_tensor, dtype_itemsize + + tensor_name = name or self._next_tensor_name() + t = Tensor(shape=shape, dtype=dtype, name=tensor_name) + + dp_policy: DPPolicy | None = None + + # Resolve placement: dp= takes priority over placement= + if dp is not None and isinstance(dp, DPPolicy): + dp_policy = dp + allocators = self._ensure_allocators() + itemsize = dtype_itemsize(dtype) + shape_2d = (shape[0], shape[1]) # type: tuple[int, int] + total_cubes = self._num_sips * self._num_cubes + placement = resolve_dp_policy( + dp, shape=shape_2d, itemsize=itemsize, + num_pe=self._pes_per_cube, num_cubes=total_cubes, + ) + elif placement is None: + placement = [ShardSpec(pe_index=0, offset_bytes=0, nbytes=t.nbytes)] + + # Infer target_pe from placement: multi-PE → "all", single PE → pe_index + pe_indices = {s.pe_index for s in placement} + target_pe: int | str = "all" if len(pe_indices) > 1 else next(iter(pe_indices)) + t.to(placement=placement, target_pe=target_pe, dp_policy=dp_policy) + + # Allocate PAs via PEMemAllocator + allocators = self._ensure_allocators() + handle = deploy_tensor( + name=tensor_name, + shape=shape, + dtype=dtype, + placement=placement, + allocators=allocators, + ) + t._handle = handle + + # Submit MemoryWriteMsg 
per shard (deploy data to device) + if pattern is not None: + for shard in handle.shards: + h = self.submit(MemoryWriteMsg( + correlation_id=self.correlation_id, + request_id=f"deploy_{tensor_name}_pe{shard.pe}", + dst_sip=shard.sip, dst_cube=shard.cube, dst_pe=shard.pe, + dst_pa=shard.pa, nbytes=shard.nbytes, pattern=pattern, + target_cubes=(shard.cube,), target_pe=shard.pe, + )) + self.wait(h, _meta={ + "phase": "memory_write", "name": tensor_name, + "sip": shard.sip, "cube": shard.cube, "pe": shard.pe, + "nbytes": shard.nbytes, + }) + + return t + + def launch( + self, + kernel_name: str, + kernel_fn: Any, + *args: Any, + **kwargs: Any, + ) -> RequestHandle: + """Register and launch a kernel (like a fused torch op). + + Positional args: Tensor objects become TensorArg, int/float become ScalarArg. + Keyword args: become ScalarArg (name is discarded, order preserved). + """ + from kernbench.runtime_api.kernel import ( + KernelLaunchMsg, + KernelRef, + ScalarArg, + ) + from kernbench.runtime_api.tensor import Tensor + from kernbench.triton_emu.registry import register_kernel + + # Register kernel (idempotent) + try: + register_kernel(kernel_name, kernel_fn) + except ValueError: + pass + + # Build kernel args from positional + keyword args + kernel_args: list = [] + target_pe: int | str = 0 + + for a in args: + if isinstance(a, Tensor): + kernel_args.append(a.to_tensor_arg()) + # Infer target_pe from tensor DP metadata + if a._dp_metadata is not None: + dp_target = a._dp_metadata.target_pe + if dp_target == "all": + target_pe = "all" + elif isinstance(dp_target, int) and target_pe != "all": + target_pe = dp_target + elif isinstance(a, (int, float)): + dtype_str = "f32" if isinstance(a, float) else "i32" + kernel_args.append(ScalarArg(dtype=dtype_str, value=a)) + + for v in kwargs.values(): + if isinstance(v, (int, float)): + dtype_str = "f32" if isinstance(v, float) else "i32" + kernel_args.append(ScalarArg(dtype=dtype_str, value=v)) + + # Determine target cubes 
@dataclass(frozen=True)
class KernelRef:
    """Reference to a kernel binary or builtin timing model.

    Kernel binaries must be pre-deployed to device memory via MemoryWriteMsg.
    KernelLaunchMsg references the deployed location by PA — source code or IR
    MUST NOT be embedded in launch messages.

    - "deployed": kernel binary pre-deployed to HBM/SRAM at deploy_pa.
    - "builtin": simulator built-in timing model, identified by name.

    Raises:
        ValueError: If ``kind`` is not one of the two documented values, or
            if ``kind`` is "deployed" but no ``deploy_pa`` was supplied.
    """

    name: str
    kind: Literal["deployed", "builtin"]
    deploy_pa: int | None = None
    deploy_sip: int = 0
    deploy_cube: int = 0
    deploy_pe: int = 0
    nbytes_code: int = 0

    def __post_init__(self) -> None:
        # Literal[...] is not checked at runtime, so enforce the documented
        # contract here and fail at construction instead of deep inside the
        # simulator.
        if self.kind not in ("deployed", "builtin"):
            raise ValueError(
                f"KernelRef '{self.name}': kind must be 'deployed' or 'builtin', "
                f"got {self.kind!r}"
            )
        if self.kind == "deployed" and self.deploy_pa is None:
            raise ValueError(
                f"KernelRef '{self.name}': kind='deployed' requires deploy_pa"
            )
# Bytes per element, keyed by every dtype spelling the runtime API accepts.
_DTYPE_ITEMSIZE = {
    "fp16": 2,
    "float16": 2,
    "f16": 2,
    "fp32": 4,
    "float32": 4,
    "f32": 4,
    "bf16": 2,
    "int8": 1,
    "i8": 1,
    "int16": 2,
    "i16": 2,
    "int32": 4,
    "i32": 4,
}


def dtype_itemsize(dtype: str) -> int:
    """Return the size in bytes of one element of *dtype*.

    Raises:
        ValueError: If *dtype* is not a key of ``_DTYPE_ITEMSIZE``.
    """
    try:
        return _DTYPE_ITEMSIZE[dtype]
    except KeyError:
        raise ValueError(f"unsupported dtype: {dtype}") from None
@dataclass(frozen=True)
class DPMetadata:
    """Data-parallel placement metadata (stored as Tensor._dp_metadata).

    Built by ``Tensor.to()``. ``frozen=True`` only prevents rebinding the
    attributes: the ``placement`` list is stored by reference and stays
    mutable.
    """

    # One ShardSpec per shard of the tensor.
    placement: list[ShardSpec]
    # Policy the placement was derived from, or None when none was supplied.
    dp_policy: DPPolicy | None = None
    # NOTE(review): presumably the SIP / cube the placement refers to; both
    # default to 0 — confirm against the callers of Tensor.to().
    sip: int = 0
    cube: int = 0
    target_pe: int | str = 0  # int → single PE, "all" → all PEs
# Single-device form of the --device string: "sip:" followed by a decimal index.
_SIP_DEVICE_RE = re.compile(r"sip:(\d+)")


def _invalid_device_error(raw: str) -> ValueError:
    """Build the error used by every device-string validation site."""
    return ValueError(
        f"Invalid device '{raw}'. Expected 'all' or 'sip:<N>' (e.g., sip:0)."
    )


@dataclass(frozen=True)
class DeviceSelector:
    """
    Device selector.

    Supported:
      - "all"     : all SIPs in the tray topology
      - "sip:<N>" : a single SIP index
    """

    raw: str  # "all" or "sip:<N>"

    @property
    def is_all(self) -> bool:
        """True when the selector addresses every SIP."""
        return self.raw == "all"

    @property
    def sip_index(self) -> int:
        """Return the SIP index encoded in ``raw``.

        Raises:
            ValueError: If the selector is "all", or ``raw`` is not of the
                form ``sip:<N>`` (possible when the instance was built
                directly rather than through ``resolve_device``).
        """
        if self.is_all:
            raise ValueError("DeviceSelector is 'all'; no single sip_index.")
        match = _SIP_DEVICE_RE.fullmatch(self.raw)
        if match is None:
            raise _invalid_device_error(self.raw)
        return int(match.group(1))


def resolve_device(raw: str | None) -> DeviceSelector:
    """
    Resolve the CLI --device string into a DeviceSelector.

    Semantics:
      - if omitted/empty -> "all"
      - else accept "all" or "sip:<N>" (surrounding whitespace and letter
        case are ignored)

    Raises:
        ValueError: If the string is neither "all" nor "sip:<N>".
    """
    if raw is None:
        return DeviceSelector(raw="all")

    normalized = raw.strip().lower()
    if normalized in ("", "all"):
        return DeviceSelector(raw="all")

    if _SIP_DEVICE_RE.fullmatch(normalized) is None:
        raise _invalid_device_error(normalized)
    return DeviceSelector(raw=normalized)
| None]: + return self._store[str(handle)] + + def wait(self, handle: RequestHandle) -> None: + pass diff --git a/src/kernbench/sim_engine/engine.py b/src/kernbench/sim_engine/engine.py new file mode 100644 index 0000000..962730e --- /dev/null +++ b/src/kernbench/sim_engine/engine.py @@ -0,0 +1,298 @@ +from __future__ import annotations + +from typing import Any + +import simpy + +from kernbench.common.types import Completion, RequestHandle, Trace +import kernbench.components.impls # noqa: F401 — registers built-in implementations +from kernbench.components.base import ComponentBase, ComponentRegistry +from kernbench.components.context import ComponentContext +from kernbench.policy.address.phyaddr import PhysAddr +from kernbench.policy.routing.router import AddressResolver, PathRouter +from kernbench.runtime_api.kernel import KernelLaunchMsg, MemoryReadMsg, MemoryWriteMsg, PeDmaMsg +from kernbench.sim_engine.transaction import Transaction +from kernbench.topology.types import Edge, TopologyGraph + + +class GraphEngine: + """simpy-based discrete-event simulation engine. + + Phase B: engine injects a Transaction into the PCIE_EP host queue for + each request. Components handle their own routing: + Path 1: PCIE_EP → IO_CPU (engine-computed path, pre-loaded in Transaction) + Path 2: IO_CPU → M_CPU (IO_CPU dispatches, fire-and-forget callback) + Path 3: M_CPU.DMA → HBM (M_CPU dispatches, fire-and-forget callback) + + Component implementations are DI-injectable via component_overrides (ADR-0007 D3). 
def __init__(
    self,
    graph: TopologyGraph,
    *,
    component_overrides: dict[str, type[ComponentBase]] | None = None,
) -> None:
    """Build the simulation: components, ports, wires and host queues.

    Args:
        graph: Compiled topology; one component is created per node and one
            wire process per edge.
        component_overrides: Optional mapping forwarded to
            ``ComponentRegistry.create`` to substitute component
            implementations (ADR-0007 D3).
    """
    self._env = simpy.Environment()
    self._resolver = AddressResolver(graph)
    self._router = PathRouter(graph)
    self._nodes = graph.nodes
    # (src, dst) → Edge; a later edge with the same endpoints replaces an
    # earlier one.
    self._edge_map: dict[tuple[str, str], Edge] = {}
    for e in graph.edges:
        self._edge_map[(e.src, e.dst)] = e
    self._ns_per_mm: float = graph.spec.get("system", {}).get("ns_per_mm", 0.01)
    self._results: dict[str, tuple[Completion, Trace]] = {}
    self._events: dict[str, simpy.Event] = {}
    self._counter = 0
    overrides = component_overrides or {}
    ctx = ComponentContext(
        router=self._router,
        resolver=self._resolver,
        positions={nid: n.pos_mm for nid, n in graph.nodes.items()},
        ns_per_mm=self._ns_per_mm,
        edge_map=self._edge_map,
        spec=graph.spec,
    )
    self._components: dict[str, ComponentBase] = {
        node_id: ComponentRegistry.create(node, overrides, ctx)
        for node_id, node in graph.nodes.items()
    }

    # Wire ports (ADR-0015 D1): each directed edge gets a sender-side Store
    # (out_port) and a separate receiver-side Store (in_port), joined only by
    # the wire process below. The original installed ONE Store as both ports,
    # so the wire did get() and put() on the same Store as the receiver: a
    # message arriving while the wire was in its propagation timeout could be
    # taken by the receiver directly, skipping the delay and reordering.
    # NOTE(review): assumes out_ports / in_ports are plain mappings — confirm
    # against ComponentBase.
    for e in graph.edges:
        src_comp = self._components.get(e.src)
        dst_comp = self._components.get(e.dst)
        if src_comp is None or dst_comp is None:
            continue
        src_comp.out_ports[e.dst] = simpy.Store(self._env)
        dst_comp.in_ports[e.src] = simpy.Store(self._env)

    # Wire processes: propagation delay per edge (ADR-0015 D2)
    # Cut-through (wormhole) model: wires apply propagation only.
    # Serialization (drain) is computed per-path and applied once at the terminal.
    for e in graph.edges:
        src_comp = self._components.get(e.src)
        dst_comp = self._components.get(e.dst)
        if src_comp is None or dst_comp is None:
            continue
        prop_ns = e.distance_mm * self._ns_per_mm
        self._env.process(
            self._wire(src_comp.out_ports[e.dst], dst_comp.in_ports[e.src],
                       prop_ns)
        )

    # Attach host queues to PCIE_EP in_ports before start() (ADR-0015 D3)
    self._host_queues: dict[str, simpy.Store] = {}
    for pcie_ep_id in self._resolver.find_all_pcie_eps():
        host_q: simpy.Store = simpy.Store(self._env)
        self._components[pcie_ep_id].in_ports["host"] = host_q
        self._host_queues[pcie_ep_id] = host_q

    # Attach host queues to PE_DMA nodes for direct PE DMA injection
    self._pe_dma_queues: dict[str, simpy.Store] = {}
    for node_id, node in graph.nodes.items():
        if node.kind == "pe_dma":
            host_q = simpy.Store(self._env)
            self._components[node_id].in_ports["host"] = host_q
            self._pe_dma_queues[node_id] = host_q

    # Start components after all ports are wired (ADR-0015 D3)
    for comp in self._components.values():
        comp.start(self._env)
def _process(self, key: str, request: Any, done: simpy.Event):
    """SimPy process: run one host request to completion and record its result.

    Injects one Transaction per entry point returned by ``_entry_points``
    into the matching PCIE_EP host queue, waits for all of them, then stores
    ``(Completion, trace)`` under ``key`` in ``self._results`` and succeeds
    ``done``. ``PeDmaMsg`` requests are delegated to ``_process_pe_dma``.

    Args:
        key: Handle string under which the result is stored.
        request: Host request message.
        done: Event succeeded once the result has been stored.
    """
    if isinstance(request, PeDmaMsg):
        yield from self._process_pe_dma(key, request, done)
        return

    entries = self._entry_points(request)
    if not entries:
        # Nothing to inject: complete immediately with a zero-latency trace.
        self._results[key] = (
            Completion(ok=True),
            {"total_ns": 0.0, "nbytes": 0},
        )
        done.succeed()
        return

    start_ns = self._env.now
    total_nbytes = 0

    root_txn: Transaction | None = None
    if len(entries) == 1:
        # Single-SIP: direct inject (common path, no extra events)
        pcie_ep_id, io_cpu_id, nbytes = entries[0]
        total_nbytes = nbytes
        path = self._router.find_node_path(pcie_ep_id, io_cpu_id)
        txn_done = self._env.event()
        txn = Transaction(request=request, path=path, step=0, nbytes=nbytes, done=txn_done)
        root_txn = txn
        yield self._host_queues[pcie_ep_id].put(txn)
        yield txn_done
    else:
        # Multi-SIP: inject per SIP, aggregate completions (ADR-0007)
        sub_dones: list[simpy.Event] = []
        sub_txns: list[Transaction] = []
        for pcie_ep_id, io_cpu_id, nbytes in entries:
            # Reported nbytes is the largest per-SIP payload, not the sum.
            total_nbytes = max(total_nbytes, nbytes)
            path = self._router.find_node_path(pcie_ep_id, io_cpu_id)
            txn_done = self._env.event()
            txn = Transaction(
                request=request, path=path, step=0,
                nbytes=nbytes, done=txn_done,
            )
            yield self._host_queues[pcie_ep_id].put(txn)
            sub_dones.append(txn_done)
            sub_txns.append(txn)
        # All transactions are injected before any completion is awaited.
        for sd in sub_dones:
            yield sd
        # Aggregate pe_exec_ns from multi-SIP (max)
        pe_vals = [st.result_data.get("pe_exec_ns") for st in sub_txns]
        pe_vals = [v for v in pe_vals if v is not None]
        if pe_vals:
            if root_txn is None:
                root_txn = sub_txns[0]
            root_txn.result_data["pe_exec_ns"] = max(pe_vals)
        # NOTE(review): when no sub-transaction reports pe_exec_ns, root_txn
        # stays None and every other result_data key is dropped from the
        # trace, unlike the single-SIP path — confirm this is intended.

    total_ns = self._env.now - start_ns
    result_trace: dict[str, Any] = {"total_ns": total_ns, "nbytes": total_nbytes}
    if root_txn is not None and root_txn.result_data:
        result_trace.update(root_txn.result_data)
    self._results[key] = (
        Completion(ok=True),
        result_trace,
    )
    done.succeed()
@dataclass
class Transaction:
    """One leg of a host request moving through the device fabric (ADR-0015 D4).

    Components read a Transaction from an in_port, apply their latency and
    pass it to the next hop through an out_port; wire processes (ADR-0015 D2)
    add the propagation delay between hops.

    A flow with several legs (e.g. IO_CPU → M_CPU, then M_CPU.DMA → HBM) uses
    one Transaction per leg: the terminal component of a leg creates the next
    leg and succeeds its own ``done`` only after the next leg's ``done``.
    """

    request: Any  # original host request (MemoryReadMsg, KernelLaunchMsg, …)
    path: list[str]  # node ids visited by this leg, in order
    step: int  # position in ``path`` of the component holding the Transaction
    nbytes: int  # payload size in bytes
    done: simpy.Event  # succeeded when this leg completes
    drain_ns: float = 0.0  # wormhole drain time (nbytes / bottleneck_bw), applied once at terminal
    is_response: bool = False  # True when carrying ResponseMsg on the reverse path
    result_data: dict[str, Any] = field(default_factory=dict)  # PE-level metrics (pe_exec_ns, etc.)

    @property
    def next_hop(self) -> str | None:
        """Node id that follows the current step, or None at the terminal hop."""
        following = self.step + 1
        if following >= len(self.path):
            return None
        return self.path[following]

    def advance(self) -> Transaction:
        """Return a copy positioned one step further along the path.

        The copy shares ``path``, ``done`` and ``result_data`` with this
        Transaction; only ``step`` differs.
        """
        return Transaction(
            self.request,
            self.path,
            self.step + 1,
            self.nbytes,
            self.done,
            self.drain_ns,
            self.is_response,
            self.result_data,
        )
def _read_spec(path: Path) -> dict[str, Any]:
    """Load the topology YAML at *path* and return its top-level mapping.

    Raises:
        ValueError: If the file is not valid YAML (the message includes the
            line and column when the parser reports them), if the document
            is empty, or if its root is not a mapping.
    """
    try:
        with path.open("r", encoding="utf-8") as stream:
            loaded = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        where = getattr(exc, "problem_mark", None)
        if where is None:
            location = ""
        else:
            # Parser marks are 0-based; report 1-based positions.
            location = f" (line {where.line + 1}, column {where.column + 1})"
        raise ValueError(f"Failed to parse YAML topology: {path}{location}") from exc

    if loaded is None:
        raise ValueError(f"Topology YAML is empty: {path}")
    if isinstance(loaded, dict):
        return loaded
    raise ValueError(
        f"Topology YAML root must be a mapping/dict: {path} (got {type(loaded).__name__})"
    )
fully-instantiated flat graph + representative view projections.""" + nodes: dict[str, Node] = {} + edges: list[Edge] = [] + + system = spec["system"] + sip_spec = spec["sip"] + cube_spec = spec["cube"] + + mesh_w = sip_spec["cube_mesh"]["w"] + mesh_h = sip_spec["cube_mesh"]["h"] + cube_w = cube_spec["geometry"]["cube_mm"]["w"] + cube_h = cube_spec["geometry"]["cube_mm"]["h"] + seam = sip_spec["links"]["inter_cube_mesh"]["distance_mm_across_seam"] + stride_x = cube_w + seam + stride_y = cube_h + seam + + # System-level + _instantiate_system(nodes, system) + + # Per-SIP + for sip_id in range(system["sips"]["count"]): + sp = f"sip{sip_id}" + + # IO chiplets + _instantiate_io_chiplets( + nodes, edges, sp, sip_spec, + cube_w, cube_h, mesh_w, mesh_h, seam, + ) + + # Cubes + PEs + for row in range(mesh_h): + for col in range(mesh_w): + cid = row * mesh_w + col + cp = f"{sp}.cube{cid}" + origin = (col * stride_x, row * stride_y) + _instantiate_cube(nodes, edges, cp, cube_spec, origin) + + # Inter-cube UCIe mesh + _add_inter_cube_edges(edges, sp, mesh_w, mesh_h, sip_spec) + + # IO → cube UCIe + _add_io_to_cube_edges(edges, sp, sip_spec, mesh_w) + + # Switch → IO pcie_ep + _add_system_to_io_edges(edges, sp, sip_spec, system) + + # Build views + return TopologyGraph( + spec=spec, + nodes=nodes, + edges=edges, + system_view=_build_system_view(spec), + sip_view=_build_sip_view(spec), + cube_view=_build_cube_view(spec), + pe_view=_build_pe_view(spec), + ) + + +# ── Layout helpers ─────────────────────────────────────────────────── + + +def _cube_local_positions(cube_w: float, cube_h: float) -> dict[str, tuple[float, float]]: + """Cube-internal component positions relative to cube origin (0,0) at top-left.""" + cx, cy = cube_w / 2, cube_h / 2 + # UCIe node half-sizes (default 2.0×1.2mm) — inset so edges touch boundary + uh = 0.6 # half height + uw = 1.0 # half width + return { + "ucie-N": (cx, uh), + "ucie-S": (cx, cube_h - uh), + "ucie-W": (uw, cy), + "ucie-E": (cube_w - uw, 
cy), + "m_cpu": (cube_w - 2.5, cy - 1.5), + "xbar.top": (cx, 3.5), # Y reference for top-half xbar.pe nodes + "hbm_ctrl": (cx - 2.0, cy), + "xbar.bottom": (cx, cube_h - 3.5), # Y reference for bottom-half xbar.pe nodes + "bridge.left": (2.5, cy + 2.0), + "bridge.right": (cube_w - 2.5, cy + 2.0), + "noc": (cx + 2.0, cy), + "sram": (2.5, cy - 1.5), + } + + +def _corner_pe_positions(cube_w: float, cube_h: float) -> dict[str, list[tuple[float, float]]]: + """PE center positions per corner, relative to cube origin.""" + return { + "NW": [(1.5, 1.5), (4.5, 1.5)], + "NE": [(cube_w - 4.5, 1.5), (cube_w - 1.5, 1.5)], + "SW": [(1.5, cube_h - 1.5), (4.5, cube_h - 1.5)], + "SE": [(cube_w - 4.5, cube_h - 1.5), (cube_w - 1.5, cube_h - 1.5)], + } + + +# ── Instantiation: system ─────────────────────────────────────────── + + +def _instantiate_system(nodes: dict[str, Node], system: dict) -> None: + """Add system-level nodes (fabric switch).""" + sw = system["components"]["switch"] + sw_id = "fabric.switch0" + nodes[sw_id] = Node( + id=sw_id, kind=sw["kind"], impl=sw["impl"], + attrs=sw.get("attrs", {}), pos_mm=None, label="Switch", + ) + + +# ── Instantiation: IO chiplets ────────────────────────────────────── + + +def _instantiate_io_chiplets( + nodes: dict[str, Node], + edges: list[Edge], + sp: str, + sip_spec: dict, + cube_w: float, + cube_h: float, + mesh_w: int, + mesh_h: int, + seam: float, +) -> None: + """Add IO chiplet nodes and internal pcie_ep → io_cpu edges.""" + io_spec = sip_spec["iochiplet"] + comp = io_spec["components"] + links = io_spec["links"] + mesh_total_w = mesh_w * cube_w + (mesh_w - 1) * seam + mesh_total_h = mesh_h * cube_h + (mesh_h - 1) * seam + + for inst in io_spec["instances"]: + iid = inst["id"] + prefix = f"{sp}.{iid}" + side = inst["place"]["side"] + cx = mesh_total_w / 2 + if side == "N": + pcie_y, cpu_y = -5.0, -3.0 + else: + pcie_y, cpu_y = mesh_total_h + 5.0, mesh_total_h + 3.0 + + # pcie_ep + ep = comp["pcie_ep"] + ep_id = f"{prefix}.pcie_ep" 
+ nodes[ep_id] = Node( + id=ep_id, kind=ep["kind"], impl=ep["impl"], + attrs=ep["attrs"], pos_mm=(cx, pcie_y), label="PCIe EP", + ) + + # io_cpu + cpu = comp["io_cpu"] + cpu_id = f"{prefix}.io_cpu" + nodes[cpu_id] = Node( + id=cpu_id, kind=cpu["kind"], impl=cpu["impl"], + attrs=cpu["attrs"], pos_mm=(cx, cpu_y), label="IO CPU", + ) + + # Internal edge + edges.append(Edge( + src=ep_id, dst=cpu_id, + distance_mm=links["pcie_ep_to_io_cpu_mm"], + bw_gbs=links["pcie_ep_to_io_cpu_bw_gbs"], + kind="io_internal", + )) + + +# ── Instantiation: cube + PEs ─────────────────────────────────────── + + +def _instantiate_cube( + nodes: dict[str, Node], + edges: list[Edge], + cp: str, + cube: dict, + origin: tuple[float, float], +) -> None: + """Add all cube-internal nodes and edges, including PE instances.""" + cube_w = cube["geometry"]["cube_mm"]["w"] + cube_h = cube["geometry"]["cube_mm"]["h"] + ox, oy = origin + local_pos = _cube_local_positions(cube_w, cube_h) + clinks = cube["links"] + n_slices = cube["memory_map"]["hbm_slices_per_cube"] + + # ── UCIe ports ── + ucie_ns = cube["ucie"]["overhead_ns"] + for port in cube["ucie"]["ports"]: + pid = f"{cp}.ucie-{port}" + lx, ly = local_pos[f"ucie-{port}"] + nodes[pid] = Node( + id=pid, kind="ucie_port", impl="ucie_v1", + attrs={"overhead_ns": ucie_ns}, pos_mm=(ox + lx, oy + ly), + label=f"UCIe-{port}", + ) + + # ── Named components: noc, m_cpu, sram ── + for name in ("noc", "m_cpu", "sram"): + c = cube["components"][name] + nid = f"{cp}.{name}" + lx, ly = local_pos[name] + nodes[nid] = Node( + id=nid, kind=c["kind"], impl=c["impl"], + attrs=c["attrs"], pos_mm=(ox + lx, oy + ly), + label=name.upper().replace("_", " "), + ) + + # ── HBM controller slices (one per PE) ── + hbm_spec = cube["components"]["hbm_ctrl"] + hbm_lx, hbm_ly = local_pos["hbm_ctrl"] + for sl in range(n_slices): + sid = f"{cp}.hbm_ctrl.slice{sl}" + nodes[sid] = Node( + id=sid, kind=hbm_spec["kind"], impl=hbm_spec["impl"], + attrs=hbm_spec["attrs"], pos_mm=(ox + 
hbm_lx, oy + hbm_ly), + label=f"HBM SLICE{sl}", + ) + + # ── Bridges ── + for br in cube["components"]["xbar"]["bridges"]: + bname = br["id"] + nid = f"{cp}.bridge.{bname}" + lx, ly = local_pos[f"bridge.{bname}"] + nodes[nid] = Node( + id=nid, kind=br["kind"], impl=br["impl"], + attrs=br["attrs"], pos_mm=(ox + lx, oy + ly), + label=f"Bridge {bname.upper()}", + ) + + # ── PE instances + per-PE xbar entry nodes ── + corners = cube["pe_layout"]["corners"] + pe_per_corner = cube["pe_layout"]["pe_per_corner"] + corner_pos = _corner_pe_positions(cube_w, cube_h) + pe_tmpl = cube["pe_template"] + pe_links = pe_tmpl["links"] + + xbar_pe_spec = cube["components"]["xbar"]["pe"] + xbar_top_y = local_pos["xbar.top"][1] + xbar_bot_y = local_pos["xbar.bottom"][1] + + pe_idx = 0 + for corner in corners: + is_top = corner in ("NW", "NE") + xbar_y = xbar_top_y if is_top else xbar_bot_y + mm_key = "pe_to_xbar_row_n_mm" if is_top else "pe_to_xbar_row_s_mm" + for ci in range(pe_per_corner): + pp = f"{cp}.pe{pe_idx}" + pe_cx, pe_cy = corner_pos[corner][ci] + + # Per-PE xbar entry node + xbar_nid = f"{cp}.xbar.pe{pe_idx}" + nodes[xbar_nid] = Node( + id=xbar_nid, kind=xbar_pe_spec["kind"], impl=xbar_pe_spec["impl"], + attrs=xbar_pe_spec["attrs"], pos_mm=(ox + pe_cx, oy + xbar_y), + label=f"XBAR PE{pe_idx}", + ) + + # PE template components + for comp_name, comp_spec in pe_tmpl["components"].items(): + cid = f"{pp}.{comp_name}" + dx, dy = _PE_COMP_OFFSETS.get(comp_name, (0.0, 0.0)) + nodes[cid] = Node( + id=cid, kind=comp_spec["kind"], impl=comp_spec["impl"], + attrs=comp_spec["attrs"], + pos_mm=(ox + pe_cx + dx, oy + pe_cy + dy), + label=comp_name.upper().replace("_", " "), + ) + + # PE-internal edges + _add_pe_internal_edges(edges, pp, pe_links) + + # PE_DMA → xbar.pe_i (HBM data path) + edges.append(Edge( + src=f"{pp}.pe_dma", dst=xbar_nid, + distance_mm=clinks[mm_key], + bw_gbs=clinks["pe_to_xbar_bw_gbs"], + kind="pe_to_xbar", + )) + + # PE_DMA → noc (non-HBM data path: SRAM, 
inter-cube, etc.) + edges.append(Edge( + src=f"{pp}.pe_dma", dst=f"{cp}.noc", + distance_mm=clinks["pe_dma_to_noc_mm"], + bw_gbs=clinks["pe_dma_to_noc_bw_gbs"], + kind="pe_to_noc", + )) + + # noc → PE_CPU (command delivery) + edges.append(Edge( + src=f"{cp}.noc", dst=f"{pp}.pe_cpu", + distance_mm=clinks["noc_to_pe_cpu_mm"], + kind="command", + )) + + pe_idx += 1 + + # ── Cube fabric edges ── + + # xbar.pe_i ↔ hbm_ctrl.slice_i (local Y-path, bidirectional for response) + for i in range(n_slices): + edges.append(Edge( + src=f"{cp}.xbar.pe{i}", dst=f"{cp}.hbm_ctrl.slice{i}", + distance_mm=clinks["xbar_to_hbm_mm"], + bw_gbs=clinks["xbar_to_hbm_bw_gbs"], + kind="xbar_to_hbm", + )) + edges.append(Edge( + src=f"{cp}.hbm_ctrl.slice{i}", dst=f"{cp}.xbar.pe{i}", + distance_mm=clinks["xbar_to_hbm_mm"], + bw_gbs=clinks["xbar_to_hbm_bw_gbs"], + kind="hbm_to_xbar", + )) + + # xbar chain: pe0↔pe1↔pe2↔pe3 (top), pe4↔pe5↔pe6↔pe7 (bottom) + half = n_slices // 2 + for half_start in (0, half): + for i in range(half_start, half_start + half - 1): + intra = ((i - half_start) % pe_per_corner) != (pe_per_corner - 1) + x_dist = clinks["xbar_chain_intra_corner_mm"] if intra else clinks["xbar_chain_inter_corner_mm"] + for a, b in [(i, i + 1), (i + 1, i)]: + edges.append(Edge( + src=f"{cp}.xbar.pe{a}", dst=f"{cp}.xbar.pe{b}", + distance_mm=x_dist, + bw_gbs=clinks["xbar_x_bw_gbs"], + kind="xbar_chain", + )) + + # bridge connections: pe0↔bridge.left↔pe4, pe3↔bridge.right↔pe7 + for bname, pe_top, pe_bot in [("left", 0, half), ("right", half - 1, n_slices - 1)]: + br_node = f"{cp}.bridge.{bname}" + for pe_i, br_mm_key in [(pe_top, "xbar_row_n_to_bridge_mm"), + (pe_bot, "xbar_row_s_to_bridge_mm")]: + xbar_node = f"{cp}.xbar.pe{pe_i}" + edges.append(Edge( + src=xbar_node, dst=br_node, + distance_mm=clinks[br_mm_key], + bw_gbs=clinks["xbar_to_bridge_bw_gbs"], + kind="xbar_to_bridge", + )) + edges.append(Edge( + src=br_node, dst=xbar_node, + distance_mm=clinks[br_mm_key], + 
bw_gbs=clinks["xbar_to_bridge_bw_gbs"], + kind="bridge_to_xbar", + )) + + # ucie ↔ noc (UCIe-NOC boundary; per_connection_bw_gbs = 128 GB/s, n_connections = 4) + _noc_ucie = clinks["noc_to_ucie"] + for port in cube["ucie"]["ports"]: + edges.append(Edge( + src=f"{cp}.ucie-{port}", dst=f"{cp}.noc", + distance_mm=0.0, + bw_gbs=_noc_ucie["per_connection_bw_gbs"], + n_connections=_noc_ucie["n_connections"], + kind="ucie_to_noc", + )) + + for port in cube["ucie"]["ports"]: + edges.append(Edge( + src=f"{cp}.noc", dst=f"{cp}.ucie-{port}", + distance_mm=0.0, + bw_gbs=_noc_ucie["per_connection_bw_gbs"], + n_connections=_noc_ucie["n_connections"], + kind="noc_to_ucie", + )) + + # noc ↔ xbar.pe{i}: wire delay is 0 (NOC traversal latency computed by TwoDMeshNocComponent); + # routing_weight_mm=50.0 steers PE DMA Dijkstra away from this path (prefer direct pe_dma→xbar) + _noc_xbar = clinks.get("noc_to_xbar", {}) + _noc_xbar_bw = _noc_xbar.get("per_connection_bw_gbs") + for i in range(n_slices): + edges.append(Edge( + src=f"{cp}.noc", dst=f"{cp}.xbar.pe{i}", + distance_mm=0.0, + bw_gbs=_noc_xbar_bw, + routing_weight_mm=50.0, + kind="noc_to_xbar", + )) + edges.append(Edge( + src=f"{cp}.xbar.pe{i}", dst=f"{cp}.noc", + distance_mm=0.0, + bw_gbs=_noc_xbar_bw, + routing_weight_mm=50.0, + kind="xbar_to_noc", + )) + + # m_cpu ↔ noc (command dispatch, both directions) + edges.append(Edge( + src=f"{cp}.m_cpu", dst=f"{cp}.noc", + distance_mm=clinks["m_cpu_to_noc_mm"], + kind="command", + )) + edges.append(Edge( + src=f"{cp}.noc", dst=f"{cp}.m_cpu", + distance_mm=clinks["m_cpu_to_noc_mm"], + kind="command", + )) + + # noc ↔ sram (shared SRAM access; per_connection_bw_gbs = 128 GB/s, n_connections = 4) + _noc_sram = clinks["noc_to_sram"] + edges.append(Edge( + src=f"{cp}.noc", dst=f"{cp}.sram", + distance_mm=clinks["noc_to_sram_mm"], + bw_gbs=_noc_sram["per_connection_bw_gbs"], + n_connections=_noc_sram["n_connections"], + kind="noc_to_sram", + )) + edges.append(Edge( + src=f"{cp}.sram", 
def _add_pe_internal_edges(edges: list[Edge], pp: str, pe_links: dict) -> None:
    """Append the fixed intra-PE wiring for one PE instance to *edges*.

    Seven ``pe_internal`` edges are produced, in this order:
    ``pe_cpu -> pe_scheduler``, then ``pe_scheduler -> engine`` for each of
    the three engines, then ``engine -> pe_tcm`` for each engine.  Only the
    engine-to-TCM edges carry a bandwidth.

    Args:
        edges: Edge list, extended in place.
        pp: Prefix prepended to every component name to form node ids.
        pe_links: Mapping with the ``*_mm`` distances and ``*_bw_gbs``
            bandwidths named in the table below.
    """
    scheduler = f"{pp}.pe_scheduler"
    tcm = f"{pp}.pe_tcm"

    # (engine, scheduler->engine distance key, engine->TCM distance key,
    #  engine->TCM bandwidth key)
    engine_table = (
        ("pe_dma", "scheduler_to_dma_mm", "dma_to_tcm_mm", "dma_to_tcm_bw_gbs"),
        ("pe_gemm", "scheduler_to_gemm_mm", "gemm_to_tcm_mm", "gemm_to_tcm_bw_gbs"),
        ("pe_math", "scheduler_to_math_mm", "math_to_tcm_mm", "math_to_tcm_bw_gbs"),
    )

    # Control path: CPU hands work to the scheduler ...
    edges.append(Edge(
        src=f"{pp}.pe_cpu",
        dst=scheduler,
        distance_mm=pe_links["pe_cpu_to_scheduler_mm"],
        kind="pe_internal",
    ))

    # ... which fans out to every engine (no bandwidth on these edges).
    edges.extend(
        Edge(
            src=scheduler,
            dst=f"{pp}.{engine}",
            distance_mm=pe_links[sched_key],
            kind="pe_internal",
        )
        for engine, sched_key, _mm_key, _bw_key in engine_table
    )

    # Data path: every engine reaches the TCM over a bandwidth-limited link.
    edges.extend(
        Edge(
            src=f"{pp}.{engine}",
            dst=tcm,
            distance_mm=pe_links[mm_key],
            bw_gbs=pe_links[bw_key],
            kind="pe_internal",
        )
        for engine, _sched_key, mm_key, bw_key in engine_table
    )
def _add_io_to_cube_edges(
    edges: list[Edge], sp: str, sip_spec: dict, mesh_w: int,
) -> None:
    """Connect each IO chiplet's io_cpu to the cube UCIe ports it lists.

    Every entry of an instance's ``cube_ports`` yields two edges with the
    same distance and bandwidth: ``io_to_cube`` (io_cpu -> UCIe port) and
    ``cube_to_io`` (UCIe port -> io_cpu).

    Args:
        edges: Edge list, extended in place.
        sp: Prefix prepended to every generated node id.
        sip_spec: SIP spec; only its ``"iochiplet"`` section is read.
        mesh_w: Cube-mesh width, used to flatten a port's ``(col, row)``
            into a cube index (``row * mesh_w + col``).
    """
    chiplets = sip_spec["iochiplet"]
    links = chiplets["links"]
    base_mm = links["io_cpu_to_ucie_mm"]
    link_bw = links["io_cpu_to_ucie_bw_gbs"]

    for inst in chiplets["instances"]:
        cpu_node = f"{sp}.{inst['id']}.io_cpu"
        for port in inst["cube_ports"]:
            col, row = port["cube"]["xy"]
            ucie_node = f"{sp}.cube{row * mesh_w + col}.ucie-{port['cube_side']}"
            # On-chiplet run plus the per-port run to the cube edge.
            reach_mm = base_mm + port["distance_mm"]
            for src, dst, kind in (
                (cpu_node, ucie_node, "io_to_cube"),
                (ucie_node, cpu_node, "cube_to_io"),
            ):
                edges.append(Edge(
                    src=src, dst=dst,
                    distance_mm=reach_mm,
                    bw_gbs=link_bw,
                    kind=kind,
                ))


def _add_system_to_io_edges(
    edges: list[Edge], sp: str, sip_spec: dict, system: dict,
) -> None:
    """Add one ``pcie`` edge from the fabric switch to each IO chiplet's PCIe EP.

    Only the switch -> endpoint direction is emitted.

    Args:
        edges: Edge list, extended in place.
        sp: Prefix prepended to every generated endpoint node id.
        sip_spec: SIP spec; its ``iochiplet.instances`` list is iterated.
        system: System spec; ``links.io_ep_to_switch`` supplies the distance
            and the per-endpoint bandwidth.
    """
    uplink = system["links"]["io_ep_to_switch"]
    edges.extend(
        Edge(
            src="fabric.switch0",
            dst=f"{sp}.{inst['id']}.pcie_ep",
            distance_mm=uplink["distance_mm"],
            bw_gbs=uplink["bw_gbs_per_ep"],
            kind="pcie",
        )
        for inst in sip_spec["iochiplet"]["instances"]
    )
def _build_sip_view(spec: dict) -> ViewGraph:
    """Build the SIP-level diagram view (one representative SIP).

    Each cell of the ``cube_mesh`` grid becomes one opaque ``cube`` node and
    each IO chiplet instance becomes one ``iochiplet`` node.  Links are drawn
    once per pair: a cube is joined to its east and south neighbours, and a
    chiplet to every cube named in its ``cube_ports``.

    Args:
        spec: Topology spec; the ``"sip"`` and ``"cube"`` sections are read.

    Returns:
        A ``ViewGraph`` named ``"sip"``.  The canvas is as wide as the mesh
        and as tall as the mesh plus a 6 mm IO margin above and below.
    """
    sip = spec["sip"]
    cube = spec["cube"]
    n_cols = sip["cube_mesh"]["w"]
    n_rows = sip["cube_mesh"]["h"]
    cube_w = cube["geometry"]["cube_mm"]["w"]
    cube_h = cube["geometry"]["cube_mm"]["h"]
    seam = sip["links"]["inter_cube_mesh"]["distance_mm_across_seam"]

    # Cube pitch = cube size plus the seam between neighbouring cubes.
    stride_x = cube_w + seam
    stride_y = cube_h + seam
    mesh_total_w = n_cols * cube_w + (n_cols - 1) * seam
    mesh_total_h = n_rows * cube_h + (n_rows - 1) * seam
    io_margin = 6.0
    canvas_w = mesh_total_w
    canvas_h = mesh_total_h + 2 * io_margin

    nodes: dict[str, Node] = {}
    view_edges: list[Edge] = []

    # Row-major walk order, shared by the node pass and the edge pass.
    cells = [(row, col) for row in range(n_rows) for col in range(n_cols)]

    # Pass 1: cubes as opaque blocks, positioned at their centres.
    for row, col in cells:
        name = f"cube{row * n_cols + col}"
        centre = (
            col * stride_x + cube_w / 2,
            io_margin + row * stride_y + cube_h / 2,
        )
        nodes[name] = Node(
            id=name, kind="cube", impl="",
            attrs={"w_mm": cube_w, "h_mm": cube_h, "col": col, "row": row},
            pos_mm=centre, label=f"CUBE ({col},{row})",
        )

    # Pass 2: mesh links, east neighbour first, then south neighbour.
    mesh_link = sip["links"]["inter_cube_mesh"]
    for row, col in cells:
        here = row * n_cols + col
        neighbours = []
        if col + 1 < n_cols:
            neighbours.append(row * n_cols + (col + 1))
        if row + 1 < n_rows:
            neighbours.append((row + 1) * n_cols + col)
        for there in neighbours:
            view_edges.append(Edge(
                src=f"cube{here}", dst=f"cube{there}",
                distance_mm=mesh_link["distance_mm_across_seam"],
                bw_gbs=mesh_link["bw_gbs_per_ucie_phy"],
                kind="ucie_mesh",
            ))

    # Pass 3: IO chiplets, centred horizontally, 2 mm inside the top edge
    # for side "N" and 2 mm inside the bottom edge for any other side.
    io_links = sip["iochiplet"]["links"]
    for inst in sip["iochiplet"]["instances"]:
        chiplet = inst["id"]
        if inst["place"]["side"] == "N":
            y_mm = 2.0
        else:
            y_mm = canvas_h - 2.0
        nodes[chiplet] = Node(
            id=chiplet, kind="iochiplet", impl="",
            attrs={}, pos_mm=(mesh_total_w / 2, y_mm), label=f"IO {chiplet}",
        )
        for port in inst["cube_ports"]:
            port_col, port_row = port["cube"]["xy"]
            target = port_row * n_cols + port_col
            view_edges.append(Edge(
                src=chiplet, dst=f"cube{target}",
                distance_mm=io_links["io_cpu_to_ucie_mm"] + port["distance_mm"],
                bw_gbs=io_links["io_cpu_to_ucie_bw_gbs"],
                kind="io_to_cube",
            ))

    return ViewGraph(
        name="sip", nodes=nodes, edges=view_edges,
        width_mm=canvas_w, height_mm=canvas_h,
    )
cube["memory_map"]["hbm_slices_per_cube"] + + nodes: dict[str, Node] = {} + view_edges: list[Edge] = [] + + # UCIe ports + for port in cube["ucie"]["ports"]: + pid = f"ucie-{port}" + lx, ly = local_pos[pid] + nodes[pid] = Node( + id=pid, kind="ucie_port", impl="ucie_v1", + attrs={}, pos_mm=(lx, ly), label=f"UCIe-{port}", + ) + + # Named components (hbm_ctrl as single representative node in view) + for name in ("noc", "m_cpu", "hbm_ctrl", "sram"): + c = cube["components"][name] + lx, ly = local_pos[name] + nodes[name] = Node( + id=name, kind=c["kind"], impl=c["impl"], + attrs=c["attrs"], pos_mm=(lx, ly), + label=name.upper().replace("_", " "), + ) + + # Bridges + for br in cube["components"]["xbar"]["bridges"]: + bname = br["id"] + bid = f"bridge.{bname}" + lx, ly = local_pos[bid] + nodes[bid] = Node( + id=bid, kind=br["kind"], impl=br["impl"], + attrs=br["attrs"], pos_mm=(lx, ly), + label=f"Bridge {bname.upper()}", + ) + + # PEs as opaque blocks + per-PE xbar entry nodes + corners = cube["pe_layout"]["corners"] + pe_per_corner = cube["pe_layout"]["pe_per_corner"] + corner_pos = _corner_pe_positions(cube_w, cube_h) + xbar_pe_spec = cube["components"]["xbar"]["pe"] + xbar_top_y = local_pos["xbar.top"][1] + xbar_bot_y = local_pos["xbar.bottom"][1] + + pe_idx = 0 + for corner in corners: + is_top = corner in ("NW", "NE") + xbar_y = xbar_top_y if is_top else xbar_bot_y + mm_key = "pe_to_xbar_row_n_mm" if is_top else "pe_to_xbar_row_s_mm" + for ci in range(pe_per_corner): + pid = f"pe{pe_idx}" + xbar_id = f"xbar.pe{pe_idx}" + px, py = corner_pos[corner][ci] + + nodes[pid] = Node( + id=pid, kind="pe", impl="", + attrs={"corner": corner}, pos_mm=(px, py), + label=f"PE{pe_idx}", + ) + nodes[xbar_id] = Node( + id=xbar_id, kind=xbar_pe_spec["kind"], impl=xbar_pe_spec["impl"], + attrs=xbar_pe_spec["attrs"], pos_mm=(px, xbar_y), + label=f"XBAR PE{pe_idx}", + ) + + # PE → xbar.pe_i (HBM data path) + view_edges.append(Edge( + src=pid, dst=xbar_id, + distance_mm=clinks[mm_key], + 
bw_gbs=clinks["pe_to_xbar_bw_gbs"], + kind="pe_to_xbar", + )) + # PE → noc (non-HBM data path) + view_edges.append(Edge( + src=pid, dst="noc", + distance_mm=clinks["pe_dma_to_noc_mm"], + bw_gbs=clinks["pe_dma_to_noc_bw_gbs"], + kind="pe_to_noc", + )) + # noc → PE (command delivery) + view_edges.append(Edge( + src="noc", dst=pid, + distance_mm=clinks["noc_to_pe_cpu_mm"], + kind="command", + )) + pe_idx += 1 + + # Cube fabric edges + # xbar.pe_i → hbm_ctrl (single representative node in view) + for i in range(n_slices): + view_edges.append(Edge( + src=f"xbar.pe{i}", dst="hbm_ctrl", + distance_mm=clinks["xbar_to_hbm_mm"], + bw_gbs=clinks["xbar_to_hbm_bw_gbs"], + kind="xbar_to_hbm", + )) + + # xbar chain + half = n_slices // 2 + for half_start in (0, half): + for i in range(half_start, half_start + half - 1): + intra = ((i - half_start) % pe_per_corner) != (pe_per_corner - 1) + x_dist = clinks["xbar_chain_intra_corner_mm"] if intra else clinks["xbar_chain_inter_corner_mm"] + for a, b in [(i, i + 1), (i + 1, i)]: + view_edges.append(Edge( + src=f"xbar.pe{a}", dst=f"xbar.pe{b}", + distance_mm=x_dist, + bw_gbs=clinks["xbar_x_bw_gbs"], + kind="xbar_chain", + )) + + # bridge connections + for bname, pe_top, pe_bot in [("left", 0, half), ("right", half - 1, n_slices - 1)]: + br_id = f"bridge.{bname}" + for pe_i, br_mm_key in [(pe_top, "xbar_row_n_to_bridge_mm"), + (pe_bot, "xbar_row_s_to_bridge_mm")]: + xbar_id = f"xbar.pe{pe_i}" + view_edges.append(Edge( + src=xbar_id, dst=br_id, + distance_mm=clinks[br_mm_key], + bw_gbs=clinks["xbar_to_bridge_bw_gbs"], + kind="xbar_to_bridge", + )) + view_edges.append(Edge( + src=br_id, dst=xbar_id, + distance_mm=clinks[br_mm_key], + bw_gbs=clinks["xbar_to_bridge_bw_gbs"], + kind="bridge_to_xbar", + )) + + _noc_ucie_v = clinks["noc_to_ucie"] + for port in cube["ucie"]["ports"]: + view_edges.append(Edge( + src="noc", dst=f"ucie-{port}", + distance_mm=0.0, + bw_gbs=_noc_ucie_v["per_connection_bw_gbs"], + 
def _build_pe_view(spec: dict) -> ViewGraph:
    """Build the PE-level diagram view for one representative PE.

    Every component of ``spec["cube"]["pe_template"]`` becomes a node at a
    hand-placed position, and the fixed intra-PE wiring is added:
    ``pe_cpu -> pe_scheduler``, ``pe_scheduler -> engine`` and
    ``engine -> pe_tcm`` for the DMA, GEMM and MATH engines.

    Args:
        spec: Topology spec; only ``spec["cube"]["pe_template"]`` is read.

    Returns:
        A ``ViewGraph`` named ``"pe"`` on a fixed 12 mm x 8 mm canvas.

    Raises:
        KeyError: If the template has a component with no entry in the
            layout table below, or a link key is missing.
    """
    template = spec["cube"]["pe_template"]
    links = template["links"]

    # Left-to-right layout: CPU, scheduler, a column of engines, then TCM.
    layout = {
        "pe_cpu": (1.5, 4.0),
        "pe_scheduler": (4.0, 4.0),
        "pe_dma": (7.0, 1.5),
        "pe_gemm": (7.0, 4.0),
        "pe_math": (7.0, 6.5),
        "pe_tcm": (10.0, 4.0),
    }

    nodes: dict[str, Node] = {}
    for name, comp in template["components"].items():
        x_mm, y_mm = layout[name]
        nodes[name] = Node(
            id=name,
            kind=comp["kind"],
            impl=comp["impl"],
            attrs=comp["attrs"],
            pos_mm=(x_mm, y_mm),
            label=name.upper().replace("_", " "),
        )

    # (engine, scheduler->engine distance key, engine->TCM distance key,
    #  engine->TCM bandwidth key)
    engine_table = (
        ("pe_dma", "scheduler_to_dma_mm", "dma_to_tcm_mm", "dma_to_tcm_bw_gbs"),
        ("pe_gemm", "scheduler_to_gemm_mm", "gemm_to_tcm_mm", "gemm_to_tcm_bw_gbs"),
        ("pe_math", "scheduler_to_math_mm", "math_to_tcm_mm", "math_to_tcm_bw_gbs"),
    )

    view_edges: list[Edge] = [
        Edge(
            src="pe_cpu", dst="pe_scheduler",
            distance_mm=links["pe_cpu_to_scheduler_mm"],
            kind="pe_internal",
        ),
    ]
    # Scheduler fan-out first, then the engine-to-TCM data links.
    view_edges += [
        Edge(
            src="pe_scheduler", dst=engine,
            distance_mm=links[sched_key],
            kind="pe_internal",
        )
        for engine, sched_key, _mm_key, _bw_key in engine_table
    ]
    view_edges += [
        Edge(
            src=engine, dst="pe_tcm",
            distance_mm=links[mm_key],
            bw_gbs=links[bw_key],
            kind="pe_internal",
        )
        for engine, _sched_key, mm_key, bw_key in engine_table
    ]

    return ViewGraph(
        name="pe", nodes=nodes, edges=view_edges,
        width_mm=12.0, height_mm=8.0,
    )
request uses 1 connection + kind: str = "link" + + +@dataclass +class ViewGraph: + name: str # "system" | "sip" | "cube" | "pe" + nodes: dict[str, Node] + edges: list[Edge] + width_mm: float + height_mm: float + + +@dataclass +class TopologyGraph: + spec: dict[str, Any] + + # Full instantiated flat graph (used by sim_engine) + nodes: dict[str, Node] = field(default_factory=dict) + edges: list[Edge] = field(default_factory=list) + + # Representative view projections (used by visualizer) + system_view: ViewGraph | None = None + sip_view: ViewGraph | None = None + cube_view: ViewGraph | None = None + pe_view: ViewGraph | None = None + + +@dataclass(frozen=True) +class TopologyHandle: + path: Path + topology_obj: TopologyGraph | None # None until _compile_graph is implemented diff --git a/src/kernbench/topology/visualizer.py b/src/kernbench/topology/visualizer.py new file mode 100644 index 0000000..075b081 --- /dev/null +++ b/src/kernbench/topology/visualizer.py @@ -0,0 +1,367 @@ +# kernbench/topology/visualizer.py +""" +SVG diagram generator for TopologyGraph views. + +Produces mm-accurate, deterministic SVG files for each view level +(system, SIP, cube, PE) per ADR-0005 and ADR-0006. 
+""" +from __future__ import annotations + +from pathlib import Path + +from .types import Edge, Node, TopologyGraph, ViewGraph + +# ── Color palette by component kind ───────────────────────────────── + +_KIND_COLORS: dict[str, str] = { + "switch": "#6366f1", # indigo + "sip": "#e0e7ff", # light indigo + "iochiplet": "#0ea5e9", # sky blue + "pcie_ep": "#0ea5e9", + "io_cpu": "#0ea5e9", + "ucie_port": "#3b82f6", # blue + "noc": "#a78bfa", # purple + "m_cpu": "#f59e0b", # amber + "xbar": "#f97316", # orange + "hbm_ctrl": "#10b981", # emerald + "pe": "#94a3b8", # slate + "pe_cpu": "#ef4444", # red + "pe_scheduler": "#f59e0b", # amber + "pe_dma": "#3b82f6", # blue + "pe_gemm": "#8b5cf6", # violet + "pe_math": "#ec4899", # pink + "pe_tcm": "#10b981", # emerald + "sram": "#f59e0b", # amber + "cube": "#cbd5e1", # slate-300 +} + +_EDGE_COLORS: dict[str, str] = { + "pcie": "#6366f1", + "io_internal": "#0ea5e9", + "io_to_cube": "#0ea5e9", + "ucie_mesh": "#3b82f6", + "pe_to_xbar": "#f97316", + "xbar_to_hbm": "#10b981", + "xbar_to_bridge": "#a78bfa", + "bridge_to_xbar": "#a78bfa", + "noc_to_ucie": "#a78bfa", + "pe_to_noc": "#a78bfa", + "noc_to_sram": "#f59e0b", + "command": "#f59e0b", + "pe_internal": "#94a3b8", +} + +# ── Node sizing ────────────────────────────────────────────────────── + +_DEFAULT_NODE_W = 2.0 # mm +_DEFAULT_NODE_H = 1.2 # mm + +_KIND_SIZE: dict[str, tuple[float, float]] = { + "sip": (60.0, 50.0), + "cube": (6.0, 4.0), + "iochiplet": (4.0, 1.5), + "switch": (5.0, 1.5), +} + + +# ── Public API ─────────────────────────────────────────────────────── + + +def emit_diagrams(graph: TopologyGraph, out_dir: Path) -> list[Path]: + """Generate SVG diagrams for all views. 
def _pick_scale(view: ViewGraph) -> float:
    """Return the rendering scale, in pixels per millimetre, for *view*.

    The scale is selected by ``view.name``; a name that is not one of the
    four known views gets 10 px/mm.
    """
    px_per_mm = {
        "system": 4.0,
        "sip": 8.0,
        "cube": 28.0,
        "pe": 35.0,
    }
    name = view.name
    if name in px_per_mm:
        return px_per_mm[name]
    return 10.0
node.""" + sizes: dict[str, tuple[float, float]] = {} + for nid, node in view.nodes.items(): + w_mm, h_mm = _KIND_SIZE.get(node.kind, (_DEFAULT_NODE_W, _DEFAULT_NODE_H)) + # For cube view, use smaller PE nodes + if view.name == "cube" and node.kind == "pe": + w_mm, h_mm = 1.8, 1.0 + if view.name == "pe": + w_mm, h_mm = 2.5, 1.4 + sizes[nid] = (w_mm * scale, h_mm * scale) + return sizes + + +def _svg_header(w: int, h: int, title: str) -> str: + return ( + f'\n' + f' {title}' + ) + + +def _draw_cube_boundary( + parts: list[str], view: ViewGraph, scale: float, pad: int, +) -> None: + """Draw the cube die outline as a dashed rectangle.""" + bx = pad + by = pad + bw = view.width_mm * scale + bh = view.height_mm * scale + parts.append( + f' ' + ) + + +def _draw_hbm_block( + parts: list[str], view: ViewGraph, scale: float, pad: int, +) -> None: + """Draw HBM area as a filled rectangle in cube view.""" + # HBM area: centered at (8.5, 7.0), size 9x5 -> x=[4.0,13.0], y=[4.5,9.5] + hbm_x = 4.0 * scale + pad + hbm_y = 4.5 * scale + pad + hbm_w = 9.0 * scale + hbm_h = 5.0 * scale + parts.append( + f' ' + ) + cx = 8.5 * scale + pad + cy = 8.5 * scale + pad + parts.append( + f' ' + f'HBM' + ) + + +def _draw_node( + parts: list[str], + node: Node, + sizes: dict[str, tuple[float, float]], + scale: float, + pad: int, +) -> None: + """Draw a single node as a rounded rectangle with label.""" + if node.pos_mm is None: + return + px = node.pos_mm[0] * scale + pad + py = node.pos_mm[1] * scale + pad + w, h = sizes.get(node.id, (40, 24)) + + x = px - w / 2 + y = py - h / 2 + fill = _KIND_COLORS.get(node.kind, "#e2e8f0") + text_color = "#ffffff" if _is_dark(fill) else "#1e293b" + + parts.append( + f' ' + ) + + label = node.label or node.id + font_size = _label_font_size(w, label) + parts.append( + f' ' + f'{_escape(label)}' + ) + + +# ── Fan-out edge kinds that need offset routing ───────────────────── + +_FANOUT_KINDS = {"pe_to_xbar", "pe_to_noc", "command", "noc_to_ucie"} + + +def 
_draw_edge( + parts: list[str], + edge: Edge, + view: ViewGraph, + sizes: dict[str, tuple[float, float]], + scale: float, + pad: int, + fanout_counter: dict[str, int], +) -> None: + """Draw an edge with orthogonal (90-degree) routing for fan-out kinds.""" + nodes = view.nodes + src_node = nodes[edge.src] + dst_node = nodes[edge.dst] + if src_node.pos_mm is None or dst_node.pos_mm is None: + return + + x1 = src_node.pos_mm[0] * scale + pad + y1 = src_node.pos_mm[1] * scale + pad + x2 = dst_node.pos_mm[0] * scale + pad + y2 = dst_node.pos_mm[1] * scale + pad + + color = _EDGE_COLORS.get(edge.kind, "#94a3b8") + width = "1.5" if edge.kind == "pe_internal" else "1" + opacity = "0.6" if edge.kind in ("command", "noc_to_ucie") else "0.8" + + if edge.kind in _FANOUT_KINDS and view.name == "cube": + # Orthogonal routing: src→horizontal→vertical→dst with per-edge offset. + group_key = f"{edge.kind}:{edge.dst}" + idx = fanout_counter.get(group_key, 0) + fanout_counter[group_key] = idx + 1 + + # Route: go vertically from src to a staggered horizontal channel, + # then horizontally to dst x, then vertically to dst. 
def _is_dark(hex_color: str) -> bool:
    """Return True when *hex_color* is dark enough to need white text.

    Args:
        hex_color: Colour as six hex digits, with or without a leading ``#``.

    Returns:
        True if the weighted sum ``0.299*R + 0.587*G + 0.114*B`` of the
        0-255 channels is below 140.
    """
    digits = hex_color.lstrip("#")
    r, g, b = int(digits[0:2], 16), int(digits[2:4], 16), int(digits[4:6], 16)
    return (r * 0.299 + g * 0.587 + b * 0.114) < 140


def _label_font_size(box_width: float, label: str) -> int:
    """Choose a font size so that *label* fits in a box *box_width* px wide.

    The label is estimated at 7 px per character.  If that estimate fits
    within 90 % of the box the default size 10 is used; otherwise the size
    is scaled down in proportion, but never below 7.
    """
    char_w = len(label) * 7
    if char_w > box_width * 0.9:
        return max(7, int(box_width * 0.9 / len(label) * 1.4))
    return 10


def _escape(text: str) -> str:
    """Escape XML special characters so *text* can be embedded in SVG.

    Replaces the ampersand, less-than and greater-than characters with
    their predefined XML entity references.  The ampersand is handled
    first, so entity references produced for the other two characters are
    not escaped a second time.

    Args:
        text: Raw text, e.g. a node label.

    Returns:
        The text with the three markup characters replaced by entities.
    """
    # Local import keeps this helper self-contained; the stdlib routine
    # performs exactly the three replacements required for element content.
    from xml.sax.saxutils import escape

    return escape(text)
# Process-wide table of kernel callables, keyed by kernel name.
_kernels: dict[str, Callable[..., None]] = {}


def register_kernel(name: str, fn: Callable[..., None]) -> None:
    """Add *fn* to the registry under *name*.

    Raises:
        ValueError: If *name* is already registered; the existing entry is
            left untouched.
    """
    if name not in _kernels:
        _kernels[name] = fn
        return
    raise ValueError(f"kernel '{name}' already registered")


def get_kernel(name: str) -> Callable[..., None]:
    """Return the callable registered under *name*.

    Raises:
        KeyError: If no kernel has been registered under *name*.
    """
    if name in _kernels:
        return _kernels[name]
    raise KeyError(f"kernel '{name}' not registered")


def clear_registry() -> None:
    """Remove every registered kernel (intended for tests)."""
    _kernels.clear()
"""TLContext: fake Triton Language module for kernel performance simulation.

Passed as the `tl` parameter to kernel functions. Each API call records a
PeCommand in the internal trace. After the kernel returns, PE_CPU replays
the command list through SimPy.

Kernel code looks like standard Python — no yield, no async:

    def my_kernel(a_ptr, b_ptr, out_ptr, tl):
        pid = tl.program_id(0)
        a = tl.load(a_ptr, shape=(32, 64), dtype="f16")
        b = tl.load(b_ptr + pid * stride, shape=(64, 32), dtype="f16")
        tl.composite(op="gemm", a=a, b=b, out_ptr=out_ptr)
"""
from __future__ import annotations

import math
from typing import Literal

from kernbench.common.pe_commands import (
    CompletionHandle,
    CompositeCmd,
    DmaReadCmd,
    DmaWriteCmd,
    GemmCmd,
    MathCmd,
    PeCommand,
    PeCpuOverheadCmd,
    TensorHandle,
    WaitCmd,
)

# Element size in bytes for each supported dtype string.
_DTYPE_BYTES: dict[str, int] = {
    "f16": 2, "f32": 4, "f64": 8,
    "bf16": 2,
    "i8": 1, "i16": 2, "i32": 4, "i64": 8,
    "u8": 1, "u16": 2, "u32": 4, "u64": 8,
}


class TLContext:
    """Fake Triton Language context.

    Args:
        pe_id: program instance index (returned by program_id).
        num_programs: total number of program instances.
        dispatch_cycles: PE_CPU overhead per tl API call (auto-inserted).
    """

    def __init__(
        self,
        pe_id: int = 0,
        num_programs: int = 1,
        dispatch_cycles: int = 1,
    ) -> None:
        self._pe_id = pe_id
        self._num_programs = num_programs
        self._dispatch_cycles = dispatch_cycles
        self._commands: list[PeCommand] = []
        self._handle_counter = 0
        self._completion_counter = 0

    @property
    def commands(self) -> list[PeCommand]:
        """Return the recorded command trace (the live list, not a copy)."""
        return self._commands

    # ── helpers ────────────────────────────────────────────────────

    def _next_handle_id(self) -> str:
        """Return the next tensor handle id: "t1", "t2", ..."""
        self._handle_counter += 1
        return f"t{self._handle_counter}"

    def _next_completion_id(self) -> str:
        """Return the next completion id: "c1", "c2", ..."""
        self._completion_counter += 1
        return f"c{self._completion_counter}"

    def _dtype_bytes(self, dtype: str) -> int:
        """Return the element size of ``dtype`` in bytes.

        Dtypes missing from ``_DTYPE_BYTES`` fall back to 2 bytes.
        """
        return _DTYPE_BYTES.get(dtype, 2)

    def _nbytes(self, shape: tuple[int, ...], dtype: str) -> int:
        """Return the total byte size of a tensor of ``shape`` and ``dtype``."""
        return math.prod(shape) * self._dtype_bytes(dtype)

    def _emit_dispatch_overhead(self) -> None:
        """Record the per-call PE_CPU dispatch cost, if one is configured."""
        if self._dispatch_cycles > 0:
            self._commands.append(PeCpuOverheadCmd(cycles=self._dispatch_cycles))

    def _make_handle(
        self, pa: int, shape: tuple[int, ...], dtype: str,
    ) -> TensorHandle:
        """Create a TensorHandle with a fresh id and a computed byte size."""
        return TensorHandle(
            id=self._next_handle_id(),
            pa=pa, shape=shape, dtype=dtype,
            nbytes=self._nbytes(shape, dtype),
        )

    def _emit_math(
        self,
        op: str,
        inputs: tuple[TensorHandle, ...],
        out_shape: tuple[int, ...],
        out_dtype: str,
        **extra: int,
    ) -> TensorHandle:
        """Record one MathCmd (preceded by dispatch overhead); return its output.

        ``extra`` is forwarded to MathCmd unchanged (used for ``axis``), so
        commands that never passed ``axis`` are still built without it.
        """
        out = self._make_handle(pa=0, shape=out_shape, dtype=out_dtype)
        self._emit_dispatch_overhead()
        self._commands.append(MathCmd(op=op, inputs=inputs, out=out, **extra))
        return out

    # ── Reference (no DMA, metadata only) ────────────────────────

    def ref(
        self, ptr: int, shape: tuple[int, ...], dtype: str = "f16",
    ) -> TensorHandle:
        """Create a TensorHandle referencing HBM data without issuing DMA.

        Used when the scheduler will stream data per-tile (e.g., tensor b
        in a composite GEMM). No command is generated.
        """
        return self._make_handle(pa=ptr, shape=shape, dtype=dtype)

    # ── Data Movement (blocking, DMA engine) ──────────────────────

    def load(
        self, ptr: int, shape: tuple[int, ...], dtype: str = "f16",
    ) -> TensorHandle:
        """Load tensor from HBM to TCM. Returns TensorHandle."""
        self._emit_dispatch_overhead()
        handle = self._make_handle(pa=ptr, shape=shape, dtype=dtype)
        self._commands.append(DmaReadCmd(
            handle=handle, src_pa=ptr, nbytes=handle.nbytes,
        ))
        return handle

    def store(self, ptr: int, handle: TensorHandle) -> None:
        """Store tensor from TCM to HBM."""
        self._emit_dispatch_overhead()
        self._commands.append(DmaWriteCmd(
            handle=handle, dst_pa=ptr, nbytes=handle.nbytes,
        ))

    # ── GEMM Engine (blocking) ────────────────────────────────────

    def dot(self, a: TensorHandle, b: TensorHandle) -> TensorHandle:
        """Matrix multiply: out = a @ b. Both operands must be in TCM.

        a: (M, K), b: (K, N) → out: (M, N). Leading dimensions of ``a`` are
        carried over to the output; the output dtype is ``a.dtype``.

        Raises:
            ValueError: if an operand has fewer than 2 dimensions or the
                inner dimensions disagree.
        """
        if len(a.shape) < 2 or len(b.shape) < 2:
            raise ValueError("dot requires 2D tensors")
        m, k = a.shape[-2], a.shape[-1]
        k2, n = b.shape[-2], b.shape[-1]
        if k != k2:
            raise ValueError(f"dot shape mismatch: a.K={k} != b.K={k2}")
        out = self._make_handle(
            pa=0, shape=(*a.shape[:-2], m, n), dtype=a.dtype,
        )
        self._emit_dispatch_overhead()
        self._commands.append(GemmCmd(a=a, b=b, out=out, m=m, k=k, n=n))
        return out

    # ── MATH Engine: unary (blocking) ─────────────────────────────

    def _unary_math(self, op: str, x: TensorHandle) -> TensorHandle:
        """Record an elementwise unary op; output has x's shape and dtype."""
        return self._emit_math(op, (x,), x.shape, x.dtype)

    def exp(self, x: TensorHandle) -> TensorHandle:
        return self._unary_math("exp", x)

    def log(self, x: TensorHandle) -> TensorHandle:
        return self._unary_math("log", x)

    def sqrt(self, x: TensorHandle) -> TensorHandle:
        return self._unary_math("sqrt", x)

    def abs(self, x: TensorHandle) -> TensorHandle:
        return self._unary_math("abs", x)

    def sigmoid(self, x: TensorHandle) -> TensorHandle:
        return self._unary_math("sigmoid", x)

    def cos(self, x: TensorHandle) -> TensorHandle:
        return self._unary_math("cos", x)

    def sin(self, x: TensorHandle) -> TensorHandle:
        return self._unary_math("sin", x)

    # ── MATH Engine: reduction (blocking) ─────────────────────────

    def _reduction(
        self, op: str, x: TensorHandle, axis: int,
    ) -> TensorHandle:
        """Record a reduction along ``axis``; that axis is kept with size 1."""
        out_shape = list(x.shape)
        out_shape[axis] = 1
        return self._emit_math(op, (x,), tuple(out_shape), x.dtype, axis=axis)

    def sum(self, x: TensorHandle, axis: int) -> TensorHandle:
        return self._reduction("sum", x, axis)

    def max(self, x: TensorHandle, axis: int) -> TensorHandle:
        return self._reduction("max", x, axis)

    def min(self, x: TensorHandle, axis: int) -> TensorHandle:
        return self._reduction("min", x, axis)

    # ── MATH Engine: binary (blocking) ────────────────────────────

    def _binary_math(
        self, op: str, a: TensorHandle, b: TensorHandle,
    ) -> TensorHandle:
        """Record an elementwise binary op; output has a's shape and dtype."""
        return self._emit_math(op, (a, b), a.shape, a.dtype)

    def where(
        self, cond: TensorHandle, a: TensorHandle, b: TensorHandle,
    ) -> TensorHandle:
        """Record a select op; output has a's shape and dtype."""
        return self._emit_math("where", (cond, a, b), a.shape, a.dtype)

    # ── Index / Scalar (PE_CPU, no engine) ────────────────────────

    def program_id(self, axis: int = 0) -> int:
        """Return program instance index (``axis`` is accepted but ignored)."""
        return self._pe_id

    def num_programs(self, axis: int = 0) -> int:
        """Return total number of program instances (``axis`` is ignored)."""
        return self._num_programs

    def arange(self, start: int, end: int, dtype: str = "i32") -> TensorHandle:
        """Create index range tensor in TCM. No command is generated.

        Raises:
            ValueError: if ``end < start`` (would give a negative extent
                and a negative byte size).
        """
        n = end - start
        if n < 0:
            raise ValueError(
                f"arange requires end >= start, got start={start}, end={end}"
            )
        return self._make_handle(pa=0, shape=(n,), dtype=dtype)

    def zeros(self, shape: tuple[int, ...], dtype: str = "f16") -> TensorHandle:
        """Create zero-filled tensor in TCM. No command is generated."""
        return self._make_handle(pa=0, shape=shape, dtype=dtype)

    def full(
        self, shape: tuple[int, ...], value: float | int, dtype: str = "f16",
    ) -> TensorHandle:
        """Create constant-filled tensor in TCM (``value`` is not recorded)."""
        return self._make_handle(pa=0, shape=shape, dtype=dtype)

    # ── Metadata (no compute, no DMA) ─────────────────────────────

    def trans(self, x: TensorHandle) -> TensorHandle:
        """Transpose — shape change only, no command generated.

        The returned handle keeps x's id, pa, dtype, nbytes and data; only
        the last two dimensions are swapped.
        """
        if len(x.shape) < 2:
            raise ValueError("trans requires at least 2D tensor")
        new_shape = (*x.shape[:-2], x.shape[-1], x.shape[-2])
        return TensorHandle(
            id=x.id, pa=x.pa, shape=new_shape,
            dtype=x.dtype, nbytes=x.nbytes, data=x.data,
        )

    # ── Composite + Control ───────────────────────────────────────

    def composite(
        self,
        op: Literal["gemm", "math"],
        a: TensorHandle,
        b: TensorHandle | None = None,
        out_ptr: int = 0,
        math_op: str | None = None,
    ) -> CompletionHandle:
        """Submit a composite command (non-blocking, tiled pipeline).

        Returns CompletionHandle for use with wait().

        Raises:
            ValueError: if ``op == "gemm"`` and ``b`` is missing. Previously
                the output was silently sized like ``a``.
        """
        if op == "gemm":
            if b is None:
                raise ValueError("composite gemm requires operand b")
            # Output is (M, N) in a's dtype.
            out_nbytes = a.shape[-2] * b.shape[-1] * self._dtype_bytes(a.dtype)
        else:
            out_nbytes = a.nbytes

        completion = CompletionHandle(id=self._next_completion_id())
        self._emit_dispatch_overhead()
        self._commands.append(CompositeCmd(
            completion=completion, op=op,
            a=a, b=b, out_pa=out_ptr, out_nbytes=out_nbytes,
            math_op=math_op,
        ))
        return completion

    def wait(self, handle: CompletionHandle | None = None) -> None:
        """Wait for a specific composite or all pending composites."""
        self._commands.append(WaitCmd(handle=handle))

    def cycles(self, n: int) -> None:
        """Declare PE_CPU scalar execution overhead (cycles)."""
        self._commands.append(PeCpuOverheadCmd(cycles=n))


# ── TensorHandle arithmetic operators ─────────────────────────────
# Enables: a + b, a * b, a - b, a / b in kernel code.
# Each operator records a MathCmd on the TLContext that is currently active
# for the calling thread (set by run_kernel).


def _enable_tensor_ops() -> None:
    """Patch TensorHandle with arithmetic operators.

    Called once at module load. Operators create MathCmd entries via
    a thread-local TLContext reference set during kernel execution.
    """
    import threading

    _local = threading.local()

    def set_active_context(ctx: TLContext | None) -> None:
        _local.ctx = ctx

    def peek_active_context() -> TLContext | None:
        # Non-raising variant, so run_kernel can save and restore the context.
        return getattr(_local, "ctx", None)

    def get_active_context() -> TLContext:
        ctx = peek_active_context()
        if ctx is None:
            raise RuntimeError("TensorHandle ops require an active TLContext")
        return ctx

    def _binop(op: str):
        def method(self: TensorHandle, other: TensorHandle) -> TensorHandle:
            return get_active_context()._binary_math(op, self, other)
        return method

    # Patch TensorHandle class with operators
    for dunder, op in (
        ("__add__", "add"),
        ("__sub__", "sub"),
        ("__mul__", "mul"),
        ("__truediv__", "div"),
    ):
        setattr(TensorHandle, dunder, _binop(op))

    # Expose context management
    TLContext._set_active = staticmethod(set_active_context)  # type: ignore[attr-defined]
    TLContext._get_active = staticmethod(get_active_context)  # type: ignore[attr-defined]
    TLContext._peek_active = staticmethod(peek_active_context)  # type: ignore[attr-defined]


_enable_tensor_ops()


def run_kernel(
    kernel_fn,
    tl_ctx: TLContext,
    *args,
    **kwargs,
) -> list[PeCommand]:
    """Execute a kernel function with the given TLContext and return commands.

    Sets tl_ctx as the active context for TensorHandle operators, calls
    ``kernel_fn(*args, tl=tl_ctx, **kwargs)``, then restores whatever context
    was active before, so a nested run_kernel does not clobber its caller.
    """
    previous = TLContext._peek_active()  # type: ignore[attr-defined]
    TLContext._set_active(tl_ctx)  # type: ignore[attr-defined]
    try:
        kernel_fn(*args, tl=tl_ctx, **kwargs)
    finally:
        TLContext._set_active(previous)  # type: ignore[attr-defined]
    return tl_ctx.commands
import kernbench.cli.main as cli_main


def test_cli_main_arg_parsing(monkeypatch):
    """`run` arguments are parsed and handed to cmd_run; --device defaults to None."""

    def fake_cmd_run(args) -> int:
        assert args.cmd == "run"
        assert args.topology == "topology.yaml"
        assert args.bench == "qkv_gemm"
        # Identity check: None is a singleton (PEP 8, E711).
        assert args.device is None
        return 0

    # monkey patch the handler to test arg parsing without running the actual bench
    monkeypatch.setattr(cli_main, "cmd_run", fake_cmd_run)
    rc = cli_main.main(["run", "--topology", "topology.yaml", "--bench", "qkv_gemm"])
    assert rc == 0


def test_cli_main():
    """End-to-end `run` of the qkv_gemm bench must exit with code 0."""
    # NOTE(review): "topology.yaml" is a relative path, so this presumably
    # relies on pytest being started from the repository root — confirm.
    rc = cli_main.main(["run", "--topology", "topology.yaml", "--bench", "qkv_gemm"])
    assert rc == 0
"""Tests for the SimPy component model and DI registry (ADR-0007 D3).

Phase 1 verification: all tests FAIL until Phase 2 implements production code.

Latency invariant after refactor:
  total_ns = Σ(wire propagation) + Σ(component.run() overhead_ns) + nbytes / bottleneck_bw
  This is identical to the current formula for Phase 0 (no contention).
"""

import pytest
import simpy

from pathlib import Path

from kernbench.components.base import ComponentBase, ComponentRegistry
from kernbench.components.impls.forwarding import TransitComponent
from kernbench.policy.address.phyaddr import PhysAddr
from kernbench.runtime_api.kernel import MemoryReadMsg
from kernbench.sim_engine.engine import GraphEngine
from kernbench.topology.builder import load_topology
from kernbench.topology.types import Node

TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"


def _graph():
    """Load the repository topology graph."""
    return load_topology(TOPOLOGY_PATH)


def _hbm_pa(pe_id: int = 0) -> int:
    """Encoded HBM physical address inside the slice of PE ``pe_id`` (sip0/cube0)."""
    # 48 GB split across 8 slices.
    per_slice = 48 * (1 << 30) // 8
    return PhysAddr.pe_hbm_addr(
        rack_id=0, sip_id=0, cube_id=0, pe_id=pe_id,
        pe_local_hbm_offset=0x1000, slice_size_bytes=per_slice,
    ).encode()


def _node(impl: str, overhead_ns: float = 0.0) -> Node:
    """Build a throwaway xbar-kind node with the given impl and overhead."""
    return Node(id="test", kind="xbar", impl=impl, attrs={"overhead_ns": overhead_ns}, pos_mm=None)


def _local_read_msg() -> MemoryReadMsg:
    """4096-byte read of PE0's local HBM slice, issued from sip0/cube0/pe0."""
    return MemoryReadMsg(
        correlation_id="c", request_id="r",
        src_sip=0, src_cube=0, src_pe=0,
        src_pa=_hbm_pa(pe_id=0), nbytes=4096,
    )


def _simulate(engine, msg):
    """Submit ``msg``, wait for it, and return its trace dict."""
    handle = engine.submit(msg)
    engine.wait(handle)
    _, trace = engine.get_completion(handle)
    return trace


def _run_component(comp, nbytes):
    """Drive ``comp.run`` in a fresh environment; return (end time, finished flags)."""
    env = simpy.Environment()
    finished = []

    def proc():
        yield from comp.run(env, nbytes=nbytes)
        finished.append(True)

    env.process(proc())
    env.run()
    return env.now, finished


# ── 1. unknown impl → error ──────────────────────────────────────────


def test_registry_unknown_impl_raises_error():
    """Unregistered impl raises ValueError (no fallback)."""
    unknown = _node("totally_unknown_v99", overhead_ns=5.0)
    with pytest.raises(ValueError, match="No component registered"):
        ComponentRegistry.create(unknown)


# ── 2. TransitComponent yields exactly overhead_ns via simpy timeout ──


def test_transit_component_yields_overhead_ns():
    """TransitComponent.run() yields exactly node.attrs['overhead_ns'] ns."""
    now, _ = _run_component(TransitComponent(_node("xbar_v1", overhead_ns=3.0)), nbytes=4096)
    assert now == pytest.approx(3.0)


def test_transit_component_zero_overhead_ns():
    """TransitComponent with overhead_ns=0 still yields (no infinite loop)."""
    now, done = _run_component(TransitComponent(_node("noc_v1", overhead_ns=0.0)), nbytes=1024)
    assert done == [True]
    assert now == pytest.approx(0.0)


# ── 3. DI override: custom component is invoked by engine ────────────


def test_engine_component_override_is_called():
    """Custom component injected via component_overrides is invoked during simulation."""

    class SpyXbar(ComponentBase):
        calls = 0

        def run(self, env, nbytes):
            SpyXbar.calls += 1
            yield env.timeout(0)

    SpyXbar.calls = 0
    engine = GraphEngine(_graph(), component_overrides={"xbar_v1": SpyXbar})
    _simulate(engine, _local_read_msg())
    # PE0→slice0 path passes through xbar.pe0 (impl=xbar_v1)
    assert SpyXbar.calls > 0


# ── 4. behavior unchanged: total_ns matches existing formula ─────────


def test_engine_component_model_same_latency_as_before():
    """Phase B component model total_ns for PE0→slice0 local HBM (4096B).

    Cut-through (wormhole) wire model: wires apply propagation only.
    Serialization (drain) is computed per-path and applied once at the terminal.

    Forward path:
      Path 1: pcie_ep(5.0) + wire(1.0mm=0.01) + io_cpu(10.0)
      Path 2: wire(3.5mm=0.035) + ucie-N(1.0)
              + 2DMeshNOC(ucie-N→m_cpu: Manhattan 10.9mm=0.109) + m_cpu(5.0)
      Path 3 DMA (m_cpu→noc→xbar.pe0→hbm_ctrl.slice0):
              + 2DMeshNOC(m_cpu→xbar.pe0: Manhattan 15.0mm=0.15)
              + xbar.pe0(2.0) + wire(2.5mm=0.025) + hbm_ctrl(0.0)
              + drain_ns(4096/128 = 32.0, bottleneck = noc_to_xbar 128 GB/s)

    Response path (reverse, nbytes=0, drain=0):
      DMA response: hbm_ctrl→xbar.pe0→noc→m_cpu (propagation + xbar overhead_ns)
      Command response: m_cpu→noc→ucie-N→io_cpu (propagation + ucie overhead_ns)

    Total: ~58.648 ns
    """
    trace = _simulate(GraphEngine(_graph()), _local_read_msg())
    assert trace["total_ns"] == pytest.approx(58.648, rel=1e-4)


# ── 5. override is scoped: only targeted impl is replaced ────────────


def test_engine_override_is_scoped_to_impl():
    """xbar_v1 override (ZeroXbar, no overhead_ns) reduces total_ns by exactly 4.0 ns.

    xbar.pe0 has overhead_ns=2.0. It is traversed on both the forward DMA path
    and the reverse response path, so replacing it with a zero-latency impl
    removes 2.0 ns × 2 = 4.0 ns; all other components are unchanged.
    """

    class ZeroXbar(ComponentBase):
        def run(self, env, nbytes):
            yield env.timeout(0)

    graph = _graph()
    engine_default = GraphEngine(graph)
    engine_override = GraphEngine(graph, component_overrides={"xbar_v1": ZeroXbar})

    # The same message object is replayed on both engines.
    msg = _local_read_msg()
    t_default = _simulate(engine_default, msg)
    t_override = _simulate(engine_override, msg)

    # ZeroXbar removes overhead_ns=2.0 from xbar.pe0 on forward + response = 4.0 ns faster
    assert t_override["total_ns"] < t_default["total_ns"]
    assert t_default["total_ns"] - t_override["total_ns"] == pytest.approx(4.0, rel=1e-6)
from pathlib import Path

import pytest

from kernbench.common.types import Completion, RequestHandle
from kernbench.policy.address.phyaddr import PhysAddr
from kernbench.runtime_api.kernel import (
    KernelLaunchMsg,
    KernelRef,
    MemoryReadMsg,
    MemoryWriteMsg,
    ScalarArg,
    TensorArg,
    TensorArgShard,
)
from kernbench.sim_engine.engine import GraphEngine
from kernbench.topology.builder import load_topology

TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"


def _engine():
    """Build a fresh GraphEngine from the repository topology file."""
    graph = load_topology(TOPOLOGY_PATH)
    return GraphEngine(graph)


def _hbm_pa(sip: int = 0, cube: int = 0, pe_id: int = 0) -> int:
    """Create an HBM physical address targeting a specific PE's HBM slice."""
    # 48 GB / 8 slices = 6 GB per slice
    slice_bytes = 48 * (1 << 30) // 8
    pa = PhysAddr.pe_hbm_addr(
        rack_id=0, sip_id=sip, cube_id=cube, pe_id=pe_id,
        pe_local_hbm_offset=0x1000, slice_size_bytes=slice_bytes,
    )
    return pa.encode()


def _sram_pa(sip: int = 0, cube: int = 0) -> int:
    """Create an SRAM physical address."""
    pa = PhysAddr.cube_sram_addr(rack_id=0, sip_id=sip, cube_id=cube, sram_offset=0x800)
    return pa.encode()


def _zero_write(request_id: str, pe_id: int = 0, **extra) -> MemoryWriteMsg:
    """4096-byte zero-pattern write to the HBM slice of sip0/cube0 PE ``pe_id``.

    ``extra`` is forwarded to MemoryWriteMsg (e.g. ``target_pe``).
    """
    return MemoryWriteMsg(
        correlation_id="c0", request_id=request_id,
        dst_sip=0, dst_cube=0, dst_pe=pe_id,
        dst_pa=_hbm_pa(pe_id=pe_id), nbytes=4096, pattern="zero",
        **extra,
    )


def _read(request_id: str, src_pa: int, **extra) -> MemoryReadMsg:
    """4096-byte read of ``src_pa`` issued from sip0/cube0/pe0."""
    return MemoryReadMsg(
        correlation_id="c0", request_id=request_id,
        src_sip=0, src_cube=0, src_pe=0,
        src_pa=src_pa, nbytes=4096,
        **extra,
    )


def _complete(engine, msg):
    """Submit ``msg``, wait for it, and return ``(completion, trace)``."""
    handle = engine.submit(msg)
    engine.wait(handle)
    return engine.get_completion(handle)


# ── 1. submit returns handle ────────────────────────────────────────


def test_engine_submit_returns_handle():
    """submit() must return a RequestHandle (non-empty string)."""
    handle = _engine().submit(_zero_write("r0"))
    assert isinstance(handle, str)
    assert len(handle) > 0


# ── 2. memory write completion ──────────────────────────────────────


def test_engine_memory_write_completion():
    """MemoryWrite must complete with ok=True."""
    comp, _ = _complete(_engine(), _zero_write("r1"))
    assert comp.ok is True


# ── 3. memory read completion ───────────────────────────────────────


def test_engine_memory_read_completion():
    """MemoryRead must complete with ok=True."""
    comp, _ = _complete(_engine(), _read("r2", _hbm_pa()))
    assert comp.ok is True


# ── 4. latency positive ────────────────────────────────────────────


def test_engine_latency_positive():
    """Trace total_ns must be > 0 (ADR-0002 D4)."""
    _, trace = _complete(_engine(), _zero_write("r3"))
    assert trace["total_ns"] > 0


# ── 5. trace has total_ns and nbytes ───────────────────────────────


def test_engine_trace_has_total_ns_and_nbytes():
    """Trace must contain 'total_ns' and 'nbytes'."""
    _, trace = _complete(_engine(), _zero_write("r4"))
    assert "total_ns" in trace
    assert "nbytes" in trace
    assert trace["nbytes"] == 4096


# ── 6. latency includes node overhead_ns ────────────────────────────


def test_engine_latency_includes_node_overhead_ns():
    """Path traverses components with overhead_ns > 0, so total >= some minimum."""
    _, trace = _complete(_engine(), _zero_write("r7"))
    # pcie_ep (5.0) + io_cpu (10.0) + m_cpu (5.0) = at least 20 ns
    assert trace["total_ns"] >= 20.0


# ── 7. concurrent requests ─────────────────────────────────────────


def test_engine_concurrent_requests():
    """Two requests submitted before wait must both complete with traces."""
    engine = _engine()
    # Both submits happen before either wait, so the requests are in flight together.
    h1 = engine.submit(_zero_write("r9a", pe_id=0))
    h2 = engine.submit(_zero_write("r9b", pe_id=1))
    engine.wait(h1)
    engine.wait(h2)
    comp1, trace1 = engine.get_completion(h1)
    comp2, trace2 = engine.get_completion(h2)
    assert comp1.ok is True
    assert comp2.ok is True
    assert trace1["total_ns"] > 0
    assert trace2["total_ns"] > 0


# ── 8. kernel launch ───────────────────────────────────────────────


def test_engine_kernel_launch_simplified():
    """KernelLaunch returns latency > 0."""
    from kernbench.triton_emu.registry import clear_registry, register_kernel

    def gemm_kernel(a_ptr, tl):
        a = tl.load(a_ptr, shape=(4, 4), dtype="f16")
        tl.store(a_ptr, a)

    clear_registry()
    register_kernel("gemm", gemm_kernel)
    try:
        shard0 = TensorArgShard(
            sip=0, cube=0, pe=0,
            pa=_hbm_pa(pe_id=0), nbytes=4096, offset_bytes=0,
        )
        shard1 = TensorArgShard(
            sip=0, cube=0, pe=1,
            pa=_hbm_pa(pe_id=1), nbytes=4096, offset_bytes=4096,
        )
        msg = KernelLaunchMsg(
            correlation_id="c0", request_id="r10",
            kernel_ref=KernelRef(name="gemm", kind="builtin"),
            args=(TensorArg(shards=(shard0, shard1)),),
        )
        comp, trace = _complete(_engine(), msg)
        assert comp.ok is True
        assert trace["total_ns"] > 0
    finally:
        # Always unregister, so a failure here cannot leak "gemm" into other tests.
        clear_registry()


# ── 9. deterministic ───────────────────────────────────────────────


def test_engine_deterministic():
    """Same request on two engines must produce identical latency."""
    msg = _zero_write("r11")
    _, t1 = _complete(_engine(), msg)
    _, t2 = _complete(_engine(), msg)
    assert t1["total_ns"] == t2["total_ns"]


# ── 10. DMA capacity serializes concurrent writes ──────────────────


def test_dma_capacity_serializes_concurrent():
    """Two concurrent DMA writes to the same cube must contend at DMA capacity=1.

    When two MemoryWrite requests target the same cube's M_CPU simultaneously,
    the DMA engine (capacity=1) serializes them. The slower request must take
    longer than a single isolated request (ADR-0014 D4, ADR-0015 D5).
    """
    # Single isolated write baseline
    _, t1 = _complete(_engine(), _zero_write("single", pe_id=0, target_pe=0))
    single_ns = t1["total_ns"]

    # Two concurrent writes to same cube (different PEs) → DMA contention
    engine_conc = _engine()
    ha = engine_conc.submit(_zero_write("conc-a", pe_id=0, target_pe=0))
    hb = engine_conc.submit(_zero_write("conc-b", pe_id=1, target_pe=1))
    engine_conc.wait(ha)
    engine_conc.wait(hb)
    _, ta = engine_conc.get_completion(ha)
    _, tb = engine_conc.get_completion(hb)

    # At least one must be delayed by DMA contention
    max_ns = max(ta["total_ns"], tb["total_ns"])
    assert max_ns > single_ns, (
        f"concurrent max ({max_ns:.2f}ns) must > single ({single_ns:.2f}ns) "
        f"due to DMA capacity=1 contention"
    )


# ── 11. formula latency lower bound ────────────────────────────────


def test_formula_latency_lower_bound():
    """_formula_latency must be <= actual latency (ADR-0015 D7).

    Uses PE DMA path which is fully known at engine level.
    """
    from kernbench.policy.routing.router import AddressResolver, PathRouter

    graph = load_topology(TOPOLOGY_PATH)
    engine = GraphEngine(graph)
    resolver = AddressResolver(graph)
    router = PathRouter(graph)

    pa = _hbm_pa(sip=0, cube=0, pe_id=1)
    dst_node = resolver.resolve(PhysAddr.decode(pa))
    path = router.find_path("sip0.cube0.pe0", dst_node)
    formula = engine._formula_latency(path, 4096)

    # Run actual simulation
    _, trace = _complete(engine, _read("formula-lb", pa, target_pe=1))
    actual = trace["total_ns"]

    assert formula <= actual, (
        f"formula ({formula:.2f}) must <= actual ({actual:.2f})"
    )
    assert formula > 0, "formula must be > 0"


# ── 12. formula latency matches simulation without contention ──────


def test_formula_latency_exact_no_contention():
    """With no contention, formula must match actual for PE DMA.

    PE DMA is single-request with no fan-out or aggregation, so the two
    values must agree to within 0.01 ns.
    """
    from kernbench.policy.routing.router import AddressResolver, PathRouter
    from kernbench.runtime_api.kernel import PeDmaMsg

    graph = load_topology(TOPOLOGY_PATH)
    engine = GraphEngine(graph)
    resolver = AddressResolver(graph)
    router = PathRouter(graph)

    pa = _hbm_pa(sip=0, cube=0, pe_id=0)
    dst_node = resolver.resolve(PhysAddr.decode(pa))
    path = router.find_path("sip0.cube0.pe0", dst_node)
    formula = engine._formula_latency(path, 4096)

    msg = PeDmaMsg(
        correlation_id="c0", request_id="formula-exact",
        src_sip=0, src_cube=0, src_pe=0,
        dst_pa=pa, nbytes=4096,
    )
    _, trace = _complete(engine, msg)
    actual = trace["total_ns"]

    # No contention: formula should equal actual
    assert abs(formula - actual) < 0.01, (
        f"formula ({formula:.4f}) ≈ actual ({actual:.4f}) expected with no contention"
    )


# ── 13. remote cube access succeeds with higher latency ────────────


def test_engine_remote_cube_latency_higher():
    """Accessing a distant cube's HBM must have strictly higher latency than local.

    Uses separate engines to avoid contention effects.
    cube15 (far corner of 4x4 mesh) requires multiple UCIe + NOC hops
    from IO chiplet compared to cube0 (directly connected).
    """
    engine_local = _engine()
    engine_remote = _engine()

    _, t_local = _complete(
        engine_local, _read("r14a", _hbm_pa(sip=0, cube=0, pe_id=0)),
    )
    comp_remote, t_remote = _complete(
        engine_remote, _read("r14b", _hbm_pa(sip=0, cube=15, pe_id=0)),
    )

    assert comp_remote.ok is True
    assert t_remote is not None and t_local is not None
    assert t_remote["total_ns"] > t_local["total_ns"], (
        f"remote cube {t_remote['total_ns']:.2f} must > local {t_local['total_ns']:.2f}"
    )
+ """ + engine_local = _engine() + engine_remote = _engine() + msg_local = MemoryReadMsg( + correlation_id="c0", request_id="r14a", + src_sip=0, src_cube=0, src_pe=0, + src_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096, + ) + msg_remote = MemoryReadMsg( + correlation_id="c0", request_id="r14b", + src_sip=0, src_cube=0, src_pe=0, + src_pa=_hbm_pa(sip=0, cube=15, pe_id=0), nbytes=4096, + ) + h_local = engine_local.submit(msg_local) + engine_local.wait(h_local) + _, t_local = engine_local.get_completion(h_local) + + h_remote = engine_remote.submit(msg_remote) + engine_remote.wait(h_remote) + comp_remote, t_remote = engine_remote.get_completion(h_remote) + + assert comp_remote.ok is True + assert t_remote is not None and t_local is not None + assert t_remote["total_ns"] > t_local["total_ns"], ( + f"remote cube {t_remote['total_ns']:.2f} must > local {t_local['total_ns']:.2f}" + ) + + diff --git a/tests/test_pe_components.py b/tests/test_pe_components.py new file mode 100644 index 0000000..4d6a1f9 --- /dev/null +++ b/tests/test_pe_components.py @@ -0,0 +1,1175 @@ +"""Tests for PE internal component implementations (ADR-0014). 
+ +Validates: + - Registry resolves all 6 PE component impl strings + - PE_DMA dual-channel concurrency (READ ∥ WRITE allowed) + - PE_DMA same-channel serialization (READ ∥ READ blocked) + - PE_GEMM / PE_MATH shared accel_slot (capacity=1) + - PeDmaMsg probe regression (latency unchanged) + - Stage 2: PE_CPU kernel execution + PE_SCHEDULER dispatch +""" +from pathlib import Path + +import simpy + +from kernbench.common.pe_commands import ( + DmaReadCmd, + GemmCmd, + MathCmd, + PeInternalTxn, + TensorHandle, +) +from kernbench.components.base import ComponentRegistry +from kernbench.components.impls.pe_cpu import PeCpuComponent +from kernbench.components.impls.pe_dma import PeDmaComponent +from kernbench.components.impls.pe_gemm import PeGemmComponent +from kernbench.components.impls.pe_math import PeMathComponent +from kernbench.components.impls.pe_scheduler import PeSchedulerComponent +from kernbench.components.impls.pe_tcm import PeTcmComponent +from kernbench.policy.address.phyaddr import PhysAddr +from kernbench.runtime_api.kernel import ( + KernelLaunchMsg, + KernelRef, + MemoryReadMsg, + MemoryWriteMsg, + PeDmaMsg, + TensorArg, + TensorArgShard, +) +from kernbench.sim_engine.engine import GraphEngine +from kernbench.sim_engine.transaction import Transaction +from kernbench.topology.builder import load_topology +from kernbench.topology.types import Node +from kernbench.triton_emu.registry import clear_registry, register_kernel + +TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml" + + +def _engine(): + return GraphEngine(load_topology(TOPOLOGY_PATH)) + + +def _hbm_pa(sip: int = 0, cube: int = 0, pe_id: int = 0) -> int: + slice_bytes = 48 * (1 << 30) // 8 + pa = PhysAddr.pe_hbm_addr( + rack_id=0, sip_id=sip, cube_id=cube, pe_id=pe_id, + pe_local_hbm_offset=0x1000, slice_size_bytes=slice_bytes, + ) + return pa.encode() + + +# ── 1. 
# Registry resolves all PE impl strings ──────────────────────


def test_pe_registry_resolves_all():
    """Every PE impl string must resolve to its dedicated component class.

    A ``Node`` is built for each of the six impl strings (its ``kind`` is the
    impl string with ``_v1`` removed) and passed to
    ``ComponentRegistry.create``; the returned object must be an instance of
    the class mapped to that impl string.
    """
    impl_to_class = {
        "pe_cpu_v1": PeCpuComponent,
        "pe_scheduler_v1": PeSchedulerComponent,
        "pe_dma_v1": PeDmaComponent,
        "pe_gemm_v1": PeGemmComponent,
        "pe_math_v1": PeMathComponent,
        "pe_tcm_v1": PeTcmComponent,
    }
    for impl_name, component_cls in impl_to_class.items():
        node = Node(
            id=f"test.{impl_name}",
            kind=impl_name.replace("_v1", ""),
            impl=impl_name,
            pos_mm=None,
            attrs={},
        )
        created = ComponentRegistry.create(node)
        assert isinstance(created, component_cls), (
            f"{impl_name} resolved to {type(created)}, expected {component_cls}"
        )


# ── 2. PE_DMA dual-channel: READ ∥ WRITE allowed ──────────────────


def test_pe_dma_dual_channel_concurrent():
    """A READ and a WRITE submitted together finish at the same time (ADR-0014 D4).

    A stand-alone ``PeDmaComponent`` is wired to a sink store that completes
    every transaction it receives without delay.  One ``MemoryReadMsg`` and
    one ``MemoryWriteMsg`` are put into the component inbox; the test passes
    when both ``done`` events are observed at the same ``env.now``.
    """
    dma_id = "sip0.cube0.pe0.pe_dma"
    env = simpy.Environment()
    dma = PeDmaComponent(
        Node(
            id=dma_id, kind="pe_dma", impl="pe_dma_v1", pos_mm=None,
            attrs={"rd_engines": 1, "wr_engines": 1},
        )
    )

    # Minimal ports: the inbox comes from start(); a plain Store is the out port.
    sink = simpy.Store(env)
    dma.out_ports["next"] = sink
    dma.in_ports["src"] = simpy.Store(env)
    dma.start(env)

    results = []

    def submit_and_track(request, label):
        finished = env.event()
        txn = Transaction(
            request=request, path=[dma_id, "next"],
            step=0, nbytes=4096, done=finished,
        )
        yield dma._inbox.put(txn)
        yield finished
        results.append((label, env.now))

    def drain_sink():
        # Complete whatever reaches the sink so transactions never block.
        while True:
            forwarded = yield sink.get()
            forwarded.done.succeed()

    env.process(drain_sink())

    read_req = MemoryReadMsg(
        correlation_id="c", request_id="r1",
        src_sip=0, src_cube=0, src_pe=0,
        src_pa=_hbm_pa(), nbytes=4096,
    )
    write_req = MemoryWriteMsg(
        correlation_id="c", request_id="r2",
        dst_sip=0, dst_cube=0, dst_pe=0,
        dst_pa=_hbm_pa(), nbytes=4096, pattern="zero",
    )

    # Submission order is kept (read first, then write).
    env.process(submit_and_track(read_req, "read"))
    env.process(submit_and_track(write_req, "write"))
    env.run()

    assert len(results) == 2
    first, second = results
    # No contention between channels: identical completion timestamps.
    assert first[1] == second[1], (
        f"READ and WRITE should not block each other: "
        f"{first} vs {second}"
    )


# ── 3. PE_DMA same-channel serializes ─────────────────────────────


def test_pe_dma_same_channel_serializes():
    """Two READs sent to the same PE_DMA must complete at different times.

    Both reads go through one ``PeDmaComponent`` configured with
    ``rd_engines=1``; the second recorded completion must be strictly later
    than the first.
    """
    dma_id = "sip0.cube0.pe0.pe_dma"
    env = simpy.Environment()
    dma = PeDmaComponent(
        Node(
            id=dma_id, kind="pe_dma", impl="pe_dma_v1", pos_mm=None,
            attrs={"rd_engines": 1, "wr_engines": 1},
        )
    )

    sink = simpy.Store(env)
    dma.out_ports["next"] = sink
    dma.in_ports["src"] = simpy.Store(env)
    dma.start(env)

    completions = []

    def submit_read(req_id):
        finished = env.event()
        read_req = MemoryReadMsg(
            correlation_id="c", request_id=req_id,
            src_sip=0, src_cube=0, src_pe=0,
            src_pa=_hbm_pa(), nbytes=4096,
        )
        txn = Transaction(
            request=read_req, path=[dma_id, "next"],
            step=0, nbytes=4096, done=finished,
        )
        yield dma._inbox.put(txn)
        yield finished
        completions.append((req_id, env.now))

    def drain_sink():
        # Simulated downstream: each transaction completes 10 ns after pickup.
        # NOTE(review): this single process handles one transaction at a time,
        # so completions are spaced 10 ns apart by the sink itself even if
        # PE_DMA forwarded both READs at once. The assertion below may
        # therefore not isolate DMA serialization — confirm.
        while True:
            forwarded = yield sink.get()
            yield env.timeout(10)
            forwarded.done.succeed()

    env.process(drain_sink())
    env.process(submit_read("r1"))
    env.process(submit_read("r2"))
    env.run()

    assert len(completions) == 2
    (_, t1), (_, t2) = completions
    # Second read must complete later than the first.
    assert t2 > t1, (
        f"Second READ ({t2}) must complete after first ({t1}) "
        f"due to DMA capacity=1"
    )


# ── 4.
# PE_GEMM / PE_MATH shared accel_slot ────────────────────────


def test_pe_accel_shared_slot():
    """GEMM and MATH share one accel slot and therefore run back to back (ADR-0014 D4).

    Both components are built from nodes declaring
    ``shared_resource="accel_slot"`` and ``overhead_ns=10.0`` and share one
    ``ComponentContext``.  One transaction is submitted to each; the two
    completions must land at 10 ns and 20 ns.
    """
    from kernbench.components.context import ComponentContext
    from kernbench.policy.routing.router import AddressResolver, PathRouter

    graph = load_topology(TOPOLOGY_PATH)
    env = simpy.Environment()
    ctx = ComponentContext(
        router=PathRouter(graph),
        resolver=AddressResolver(graph),
        positions={},
        ns_per_mm=0.01,
        spec=graph.spec,
    )

    pe_prefix = "sip0.cube0.pe0"

    def accel_node(kind):
        # A fresh attrs dict per node, exactly as two separate literals would give.
        return Node(
            id=f"{pe_prefix}.{kind}", kind=kind, impl=f"{kind}_v1",
            pos_mm=None, attrs={"overhead_ns": 10.0, "shared_resource": "accel_slot"},
        )

    gemm_node = accel_node("pe_gemm")
    math_node = accel_node("pe_math")
    gemm = PeGemmComponent(gemm_node, ctx)
    math_comp = PeMathComponent(math_node, ctx)

    # Wire minimal ports, then start GEMM before MATH.
    gemm.in_ports["src"] = simpy.Store(env)
    math_comp.in_ports["src"] = simpy.Store(env)
    gemm.start(env)
    math_comp.start(env)

    completions = []

    def submit(comp_inst, label):
        finished = env.event()
        txn = Transaction(
            request=None, path=[comp_inst.node.id],
            step=0, nbytes=0, done=finished,
        )
        yield comp_inst._inbox.put(txn)
        yield finished
        completions.append((label, env.now))

    env.process(submit(gemm, "gemm"))
    env.process(submit(math_comp, "math"))
    env.run()

    assert len(completions) == 2
    (_, t1), (_, t2) = completions
    # Serialized: one finishes at 10 ns, the other at 20 ns.
    assert t1 == 10.0, f"First should complete at 10ns, got {t1}"
    assert t2 == 20.0, f"Second should complete at 20ns, got {t2}"


# ── 5.
# PeDmaMsg probe regression ──────────────────────────────────


def test_pe_dma_probe_regression():
    """A PeDmaMsg probe completes and its latency is reproducible.

    The same message is submitted to two freshly built engines.  The first
    completion must be ok with a positive ``total_ns`` and both engines must
    report the same ``total_ns``.  No stored baseline latency is compared.
    """
    msg = PeDmaMsg(
        correlation_id="probe", request_id="regression",
        src_sip=0, src_cube=0, src_pe=0,
        dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096,
    )
    e1 = _engine()
    h1 = e1.submit(msg)
    e1.wait(h1)
    comp1, trace1 = e1.get_completion(h1)
    assert comp1.ok is True
    assert trace1["total_ns"] > 0

    # Deterministic: a second, independent engine reports the same latency.
    e2 = _engine()
    h2 = e2.submit(msg)
    e2.wait(h2)
    _, trace2 = e2.get_completion(h2)
    assert trace1["total_ns"] == trace2["total_ns"]


# ── 6. PE_GEMM handles PeInternalTxn ────────────────────────────


def test_pe_gemm_handles_pe_internal_txn():
    """PE_GEMM accepts a PeInternalTxn carrying a GemmCmd and signals ``done``.

    The node declares ``overhead_ns=5.0``; once the simulation drains,
    ``env.now`` must equal 5.0.
    """
    from kernbench.components.context import ComponentContext
    from kernbench.policy.routing.router import AddressResolver, PathRouter

    graph = load_topology(TOPOLOGY_PATH)
    env = simpy.Environment()
    ctx = ComponentContext(
        router=PathRouter(graph),
        resolver=AddressResolver(graph),
        positions={}, ns_per_mm=0.01, spec=graph.spec,
    )

    pe_prefix = "sip0.cube0.pe0"
    gemm_node = Node(
        id=f"{pe_prefix}.pe_gemm", kind="pe_gemm", impl="pe_gemm_v1",
        pos_mm=None, attrs={"overhead_ns": 5.0, "shared_resource": "accel_slot"},
    )
    gemm = PeGemmComponent(gemm_node, ctx)
    gemm.in_ports["src"] = simpy.Store(env)
    gemm.start(env)

    a = TensorHandle(id="t1", pa=0, shape=(4, 8), dtype="f16", nbytes=64)
    b = TensorHandle(id="t2", pa=0, shape=(8, 4), dtype="f16", nbytes=64)
    out = TensorHandle(id="t3", pa=0, shape=(4, 4), dtype="f16", nbytes=32)
    cmd = GemmCmd(a=a, b=b, out=out, m=4, k=8, n=4)
    done = env.event()
    pe_txn = PeInternalTxn(command=cmd, done=done, pe_prefix=pe_prefix)

    def submit():
        yield gemm._inbox.put(pe_txn)
        yield done

    env.process(submit())
    env.run()
    assert env.now == 5.0  # overhead_ns from node attrs


# ── 7. PE_MATH handles PeInternalTxn ────────────────────────────


def test_pe_math_handles_pe_internal_txn():
    """PE_MATH accepts a PeInternalTxn carrying a MathCmd and signals ``done``.

    The node declares ``overhead_ns=3.0``; once the simulation drains,
    ``env.now`` must equal 3.0.
    """
    from kernbench.components.context import ComponentContext
    from kernbench.policy.routing.router import AddressResolver, PathRouter

    graph = load_topology(TOPOLOGY_PATH)
    env = simpy.Environment()
    ctx = ComponentContext(
        router=PathRouter(graph),
        resolver=AddressResolver(graph),
        positions={}, ns_per_mm=0.01, spec=graph.spec,
    )

    pe_prefix = "sip0.cube0.pe0"
    math_node = Node(
        id=f"{pe_prefix}.pe_math", kind="pe_math", impl="pe_math_v1",
        pos_mm=None, attrs={"overhead_ns": 3.0, "shared_resource": "accel_slot"},
    )
    math_comp = PeMathComponent(math_node, ctx)
    math_comp.in_ports["src"] = simpy.Store(env)
    math_comp.start(env)

    x = TensorHandle(id="t1", pa=0, shape=(4, 4), dtype="f16", nbytes=32)
    out = TensorHandle(id="t2", pa=0, shape=(4, 4), dtype="f16", nbytes=32)
    cmd = MathCmd(op="exp", inputs=(x,), out=out)
    done = env.event()
    pe_txn = PeInternalTxn(command=cmd, done=done, pe_prefix=pe_prefix)

    def submit():
        yield math_comp._inbox.put(pe_txn)
        yield done

    env.process(submit())
    env.run()
    assert env.now == 3.0  # overhead_ns from node attrs


# ── 8. PE_CPU kernel execution e2e (load-only kernel) ────────────


def test_pe_kernel_e2e_load_only():
    """A load-only kernel injected at the PE_CPU inbox takes more than 0 ns.

    Kernel: tl.load(hbm_pa, shape=(4,4), dtype='f16')
    The KernelLaunchMsg is wrapped in a Transaction and put straight into the
    PE_CPU inbox of a full engine; elapsed simulation time must be positive.
    The kernel registry is cleared before the test and again on exit, even
    when an assertion fails, so registered kernels cannot leak into other tests.
    """
    clear_registry()
    try:
        hbm_pa = _hbm_pa(sip=0, cube=0, pe_id=0)

        def load_kernel(tl):
            tl.load(hbm_pa, shape=(4, 4), dtype="f16")

        register_kernel("test_load_kernel", load_kernel)

        engine = _engine()
        pe_cpu_id = "sip0.cube0.pe0.pe_cpu"

        launch_msg = KernelLaunchMsg(
            correlation_id="test", request_id="load_e2e",
            kernel_ref=KernelRef(name="test_load_kernel", kind="builtin"),
            args=(),
        )

        # Inject the Transaction directly at the PE_CPU inbox.
        done = engine._env.event()
        txn = Transaction(
            request=launch_msg,
            path=[pe_cpu_id],
            step=0, nbytes=0, done=done,
        )

        start_ns = engine._env.now

        def inject():
            yield engine._components[pe_cpu_id]._inbox.put(txn)
            yield done

        engine._env.process(inject())
        engine._env.run()

        total_ns = engine._env.now - start_ns
        assert total_ns > 0, f"Kernel should take > 0ns, got {total_ns}"
    finally:
        clear_registry()


# ── 9. PE_CPU kernel execution e2e (load + store) ────────────────


def test_pe_kernel_e2e_load_store():
    """A load→store kernel injected at the PE_CPU inbox takes more than 0 ns.

    Only a positive latency is asserted; the latency is not compared with
    the load-only kernel.  The kernel registry is cleared on exit even when
    an assertion fails.
    """
    clear_registry()
    try:
        hbm_pa = _hbm_pa(sip=0, cube=0, pe_id=0)

        def load_store_kernel(tl):
            a = tl.load(hbm_pa, shape=(4, 4), dtype="f16")
            tl.store(hbm_pa + 0x10000, a)

        register_kernel("test_load_store", load_store_kernel)

        engine = _engine()
        pe_cpu_id = "sip0.cube0.pe0.pe_cpu"
        launch_msg = KernelLaunchMsg(
            correlation_id="test", request_id="ls_e2e",
            kernel_ref=KernelRef(name="test_load_store", kind="builtin"),
            args=(),
        )

        done = engine._env.event()
        txn = Transaction(
            request=launch_msg, path=[pe_cpu_id],
            step=0, nbytes=0, done=done,
        )

        def inject():
            yield engine._components[pe_cpu_id]._inbox.put(txn)
            yield done

        engine._env.process(inject())
        engine._env.run()

        total_ns = engine._env.now
        assert total_ns > 0, f"load+store should take > 0ns, got {total_ns}"
    finally:
        clear_registry()


# ── 10.
# PE_CPU kernel with overhead timing ───────────────────────


def test_pe_cpu_overhead_timing():
    """``tl.cycles(100)`` adds exactly 100 to the kernel's total simulated time.

    Two kernels are run on fresh engines: one that only loads, and one that
    calls ``tl.cycles(100)`` before the same load.  The second run must end
    exactly 100 time units after the first.  The kernel registry is cleared
    on exit even when the assertion fails.
    """
    hbm_pa = _hbm_pa(sip=0, cube=0, pe_id=0)
    pe_cpu_id = "sip0.cube0.pe0.pe_cpu"

    def no_overhead_kernel(tl):
        tl.load(hbm_pa, shape=(4, 4), dtype="f16")

    def overhead_kernel(tl):
        tl.cycles(100)
        tl.load(hbm_pa, shape=(4, 4), dtype="f16")

    def run_kernel(name, kernel_fn, request_id):
        """Register *kernel_fn* alone, launch it at PE_CPU, return final env.now."""
        clear_registry()
        register_kernel(name, kernel_fn)
        engine = _engine()
        done = engine._env.event()
        txn = Transaction(
            request=KernelLaunchMsg(
                correlation_id="t", request_id=request_id,
                kernel_ref=KernelRef(name=name, kind="builtin"),
                args=(),
            ),
            path=[pe_cpu_id], step=0, nbytes=0, done=done,
        )

        def inject():
            yield engine._components[pe_cpu_id]._inbox.put(txn)
            yield done

        engine._env.process(inject())
        engine._env.run()
        return engine._env.now

    try:
        base_ns = run_kernel("test_no_overhead", no_overhead_kernel, "r1")
        overhead_ns = run_kernel("test_overhead", overhead_kernel, "r2")

        # NOTE(review): the +100 treats one cycle as one time unit (ns) —
        # confirm against the PE_CPU clock model.
        assert overhead_ns == base_ns + 100, (
            f"Expected {base_ns + 100}ns with overhead, got {overhead_ns}ns"
        )
    finally:
        clear_registry()


# ── 11.
# PE_CPU kernel with GEMM (dot) ────────────────────────────


def test_pe_kernel_e2e_gemm():
    """A kernel using ``tl.dot`` injected at the PE_CPU inbox takes more than 0 ns.

    Kernel: two loads, ``tl.dot(a, b)``, then a store of the result.  Only a
    positive total simulation time is asserted.  The kernel registry is
    cleared on exit even when the assertion fails.
    """
    clear_registry()
    try:
        hbm_pa = _hbm_pa(sip=0, cube=0, pe_id=0)

        def gemm_kernel(tl):
            a = tl.load(hbm_pa, shape=(4, 8), dtype="f16")
            b = tl.load(hbm_pa + 0x10000, shape=(8, 4), dtype="f16")
            out = tl.dot(a, b)
            tl.store(hbm_pa + 0x20000, out)

        register_kernel("test_gemm", gemm_kernel)

        engine = _engine()
        pe_cpu_id = "sip0.cube0.pe0.pe_cpu"
        launch_msg = KernelLaunchMsg(
            correlation_id="test", request_id="gemm_e2e",
            kernel_ref=KernelRef(name="test_gemm", kind="builtin"),
            args=(),
        )

        done = engine._env.event()
        txn = Transaction(
            request=launch_msg, path=[pe_cpu_id],
            step=0, nbytes=0, done=done,
        )

        def inject():
            yield engine._components[pe_cpu_id]._inbox.put(txn)
            yield done

        engine._env.process(inject())
        engine._env.run()

        total_ns = engine._env.now
        assert total_ns > 0, f"GEMM kernel should take > 0ns, got {total_ns}"
    finally:
        clear_registry()


# ── 12.
# PE_CPU kernel with math ops ──────────────────────────────


def test_pe_kernel_e2e_math():
    """A kernel using ``tl.exp`` injected at the PE_CPU inbox takes more than 0 ns.

    Kernel: load, ``tl.exp``, store.  Only a positive total simulation time
    is asserted.  The kernel registry is cleared on exit even when the
    assertion fails.
    """
    clear_registry()
    try:
        hbm_pa = _hbm_pa(sip=0, cube=0, pe_id=0)

        def math_kernel(tl):
            x = tl.load(hbm_pa, shape=(4, 4), dtype="f16")
            y = tl.exp(x)
            tl.store(hbm_pa + 0x10000, y)

        register_kernel("test_math", math_kernel)

        engine = _engine()
        pe_cpu_id = "sip0.cube0.pe0.pe_cpu"
        launch_msg = KernelLaunchMsg(
            correlation_id="test", request_id="math_e2e",
            kernel_ref=KernelRef(name="test_math", kind="builtin"),
            args=(),
        )

        done = engine._env.event()
        txn = Transaction(
            request=launch_msg, path=[pe_cpu_id],
            step=0, nbytes=0, done=done,
        )

        def inject():
            yield engine._components[pe_cpu_id]._inbox.put(txn)
            yield done

        engine._env.process(inject())
        engine._env.run()

        total_ns = engine._env.now
        assert total_ns > 0, f"Math kernel should take > 0ns, got {total_ns}"
    finally:
        clear_registry()


# ── 13.
# Deterministic: same kernel → same latency ───────────────


def test_pe_kernel_deterministic():
    """The same kernel on the same PE ends at the same sim time on two fresh engines.

    The kernel registry is cleared on exit even when the assertion fails.
    """
    clear_registry()
    try:
        hbm_pa = _hbm_pa(sip=0, cube=0, pe_id=0)

        def det_kernel(tl):
            a = tl.load(hbm_pa, shape=(4, 4), dtype="f16")
            tl.store(hbm_pa + 0x10000, a)

        register_kernel("test_det", det_kernel)

        pe_cpu_id = "sip0.cube0.pe0.pe_cpu"

        def run_once():
            """Launch the kernel on a new engine and return its final env.now."""
            engine = _engine()
            done = engine._env.event()
            txn = Transaction(
                request=KernelLaunchMsg(
                    correlation_id="t", request_id="det",
                    kernel_ref=KernelRef(name="test_det", kind="builtin"),
                    args=(),
                ),
                path=[pe_cpu_id], step=0, nbytes=0, done=done,
            )

            def inject():
                yield engine._components[pe_cpu_id]._inbox.put(txn)
                yield done

            engine._env.process(inject())
            engine._env.run()
            return engine._env.now

        results = [run_once() for _ in range(2)]

        assert results[0] == results[1], (
            f"Determinism violation: {results[0]} != {results[1]}"
        )
    finally:
        clear_registry()


# ── 14. Stage 3: Composite GEMM pipeline with tiling ─────────────


def test_composite_gemm_pipeline():
    """A composite GEMM kernel using ``tl.ref(b)`` completes with latency > 0.

    Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait()
    Validates: the transaction completes and total simulation time is
    positive.  The latency is not compared against a load-only kernel.
    The kernel registry is cleared on exit even when the assertion fails.
    """
    clear_registry()
    try:
        # All three addresses come from the same _hbm_pa() call arguments.
        hbm_pa_a = _hbm_pa(sip=0, cube=0, pe_id=0)
        hbm_pa_b = _hbm_pa(sip=0, cube=0, pe_id=0)
        out_pa = _hbm_pa(sip=0, cube=0, pe_id=0)

        def composite_gemm_kernel(tl):
            a = tl.load(hbm_pa_a, shape=(32, 64), dtype="f16")
            b = tl.ref(hbm_pa_b, shape=(64, 32), dtype="f16")
            h = tl.composite(op="gemm", a=a, b=b, out_ptr=out_pa)
            tl.wait(h)

        register_kernel("test_composite_gemm", composite_gemm_kernel)

        engine = _engine()
        pe_cpu_id = "sip0.cube0.pe0.pe_cpu"
        launch_msg = KernelLaunchMsg(
            correlation_id="test", request_id="composite_gemm",
            kernel_ref=KernelRef(name="test_composite_gemm", kind="builtin"),
            args=(),
        )

        done = engine._env.event()
        txn = Transaction(
            request=launch_msg, path=[pe_cpu_id],
            step=0, nbytes=0, done=done,
        )

        def inject():
            yield engine._components[pe_cpu_id]._inbox.put(txn)
            yield done

        engine._env.process(inject())
        engine._env.run()

        total_ns = engine._env.now
        assert total_ns > 0, f"Composite GEMM should take > 0ns, got {total_ns}"
    finally:
        clear_registry()


# ── 15. Stage 3: Composite generates multiple tiles ───────────────


def test_composite_gemm_multi_tile():
    """A K=128 composite GEMM takes strictly longer than a K=64 one.

    The original author's note: K=128 > TILE_K=64 → 2 K-tiles, while K=64
    fits in a single tile.  Each variant runs on its own fresh engine.
    The kernel registry is cleared on exit even when the assertion fails.
    """
    hbm_pa_a = _hbm_pa(sip=0, cube=0, pe_id=0)
    hbm_pa_b = _hbm_pa(sip=0, cube=0, pe_id=0)
    out_pa = _hbm_pa(sip=0, cube=0, pe_id=0)
    pe_cpu_id = "sip0.cube0.pe0.pe_cpu"

    def run_composite(name, k, request_id):
        """Register a (32×k)·(k×32) composite GEMM kernel alone and time it."""

        def kernel(tl):
            a = tl.load(hbm_pa_a, shape=(32, k), dtype="f16")
            b = tl.ref(hbm_pa_b, shape=(k, 32), dtype="f16")
            h = tl.composite(op="gemm", a=a, b=b, out_ptr=out_pa)
            tl.wait(h)

        clear_registry()
        register_kernel(name, kernel)
        engine = _engine()
        done = engine._env.event()
        txn = Transaction(
            request=KernelLaunchMsg(
                correlation_id="t", request_id=request_id,
                kernel_ref=KernelRef(name=name, kind="builtin"),
                args=(),
            ),
            path=[pe_cpu_id], step=0, nbytes=0, done=done,
        )

        def inject():
            yield engine._components[pe_cpu_id]._inbox.put(txn)
            yield done

        engine._env.process(inject())
        engine._env.run()
        return engine._env.now

    try:
        # Single K-tile (K=64), then multi K-tile (K=128).
        single_ns = run_composite("test_single_tile", 64, "st")
        multi_ns = run_composite("test_multi_tile", 128, "mt")

        assert multi_ns > single_ns, (
            f"Multi-tile ({multi_ns}ns) should take longer than single-tile ({single_ns}ns)"
        )
    finally:
        clear_registry()


# ── 16.
# Stage 3: tl.ref() generates no DMA command ───────────────


def test_tl_ref_no_dma():
    """``tl.ref()`` returns a handle with the given pa/shape and records no command."""
    from kernbench.triton_emu.tl_context import TLContext

    tl = TLContext(pe_id=0, dispatch_cycles=0)
    handle = tl.ref(0x1000, shape=(4, 4), dtype="f16")
    assert handle.pa == 0x1000
    assert handle.shape == (4, 4)
    assert len(tl.commands) == 0, f"tl.ref should emit 0 commands, got {len(tl.commands)}"


# ── 17. Stage 4: M_CPU kernel launch fan-out ──────────────────────


def test_mcpu_kernel_launch_fanout():
    """A KernelLaunchMsg submitted through the engine completes ok with latency > 0.

    Full pipeline per the original author: Host → PCIE_EP → IO_CPU → M_CPU →
    NOC → PE_CPU → engines.  The message carries one tensor shard on
    sip0/cube0/pe0 and targets cube 0, PE 0.  The kernel registry is cleared
    on exit even when an assertion fails.
    """
    clear_registry()
    try:
        hbm_pa = _hbm_pa(sip=0, cube=0, pe_id=0)

        def simple_kernel(a_ptr, tl):
            tl.load(a_ptr, shape=(4, 4), dtype="f16")

        register_kernel("test_mcpu_kernel", simple_kernel)

        engine = _engine()

        shard = TensorArgShard(
            sip=0, cube=0, pe=0,
            pa=hbm_pa, nbytes=32, offset_bytes=0,
        )
        launch_msg = KernelLaunchMsg(
            correlation_id="test", request_id="mcpu_launch",
            kernel_ref=KernelRef(name="test_mcpu_kernel", kind="builtin"),
            args=(TensorArg(shards=(shard,)),),
            target_cubes=(0,), target_pe=0,
        )

        h = engine.submit(launch_msg)
        engine.wait(h)
        comp, trace = engine.get_completion(h)

        assert comp.ok is True
        assert trace["total_ns"] > 0, f"Kernel launch should take > 0ns, got {trace['total_ns']}"
    finally:
        clear_registry()


# ── 18.
# Stage 4: M_CPU kernel launch with composite GEMM ──────────


def test_mcpu_kernel_launch_composite():
    """A composite-GEMM kernel launched through the engine completes ok with latency > 0.

    The kernel receives one tensor argument and uses the same pointer for
    the loaded operand, the referenced operand and the output.  The kernel
    registry is cleared on exit even when an assertion fails.
    """
    clear_registry()
    try:
        hbm_pa = _hbm_pa(sip=0, cube=0, pe_id=0)

        def gemm_kernel(a_ptr, tl):
            a = tl.load(a_ptr, shape=(32, 64), dtype="f16")
            b = tl.ref(a_ptr, shape=(64, 32), dtype="f16")
            h = tl.composite(op="gemm", a=a, b=b, out_ptr=a_ptr)
            tl.wait(h)

        register_kernel("test_mcpu_composite", gemm_kernel)

        engine = _engine()
        shard = TensorArgShard(
            sip=0, cube=0, pe=0,
            pa=hbm_pa, nbytes=32, offset_bytes=0,
        )
        launch_msg = KernelLaunchMsg(
            correlation_id="test", request_id="mcpu_composite",
            kernel_ref=KernelRef(name="test_mcpu_composite", kind="builtin"),
            args=(TensorArg(shards=(shard,)),),
            target_cubes=(0,), target_pe=0,
        )

        h = engine.submit(launch_msg)
        engine.wait(h)
        comp, trace = engine.get_completion(h)

        assert comp.ok is True
        assert trace["total_ns"] > 0
    finally:
        clear_registry()


# ── 19.
# Stage 5: QKV GEMM benchmark completion ────────────────────


def test_qkv_gemm_bench_completes():
    """The qkv_gemm benchmark runs to completion and records the expected traces.

    Checks that every handle completed ok with positive latency, that at
    least two deploy/memory_write traces and exactly one kernel trace named
    ``qkv_gemm`` exist, and that the kernel trace carries at least three
    scalars.  The kernel registry is cleared on exit even when an assertion
    fails.
    """
    clear_registry()
    try:
        from benches.qkv_gemm import run as bench_run
        from kernbench.runtime_api.context import RuntimeContext

        graph = load_topology(TOPOLOGY_PATH)
        engine = GraphEngine(graph)
        ctx = RuntimeContext(
            engine=engine,
            target_device="sip0",
            correlation_id="bench_test",
            spec=graph.spec,
        )
        bench_run(ctx)
        ctx.wait_all()

        # All handles should have completed
        for h in ctx.handles():
            comp, trace = engine.get_completion(h)
            assert comp.ok is True
            assert trace["total_ns"] > 0

        # Trace collection: deploy + kernel phases
        assert len(ctx._traces) > 0
        deploy_traces = [t for t in ctx._traces if t["phase"] in ("deploy", "memory_write")]
        kernel_traces = [t for t in ctx._traces if t["phase"] == "kernel"]
        assert len(deploy_traces) >= 2  # at least a, b (out is empty, no deploy)
        assert len(kernel_traces) == 1
        assert kernel_traces[0]["name"] == "qkv_gemm"
        assert kernel_traces[0]["total_ns"] > 0
        # Scalars should contain M, K, N
        assert len(kernel_traces[0]["scalars"]) >= 3
    finally:
        clear_registry()


# ── 20. Stage 4: M_CPU multi-PE kernel launch fan-out ─────────────


def test_mcpu_multi_pe_kernel_launch():
    """A launch with ``target_pe='all'`` is no faster than one with ``target_pe=0``.

    Validates:
      - Both launches complete ok with latency > 0
      - Multi-PE latency >= single-PE latency (parallel but NOC contention)

    The same registered kernel and shard are used for both launches, each on
    its own fresh engine.  The kernel registry is cleared on exit even when
    an assertion fails.
    """
    clear_registry()
    try:
        hbm_pa = _hbm_pa(sip=0, cube=0, pe_id=0)

        def simple_kernel(a_ptr, tl):
            tl.load(a_ptr, shape=(4, 4), dtype="f16")

        register_kernel("test_multi_pe", simple_kernel)

        # Single PE baseline
        engine1 = _engine()
        shard = TensorArgShard(
            sip=0, cube=0, pe=0,
            pa=hbm_pa, nbytes=32, offset_bytes=0,
        )
        h1 = engine1.submit(KernelLaunchMsg(
            correlation_id="t", request_id="single",
            kernel_ref=KernelRef(name="test_multi_pe", kind="builtin"),
            args=(TensorArg(shards=(shard,)),),
            target_cubes=(0,), target_pe=0,
        ))
        engine1.wait(h1)
        comp1, trace1 = engine1.get_completion(h1)
        single_ns = trace1["total_ns"]

        # Multi PE (target_pe="all")
        engine2 = _engine()
        h2 = engine2.submit(KernelLaunchMsg(
            correlation_id="t", request_id="multi",
            kernel_ref=KernelRef(name="test_multi_pe", kind="builtin"),
            args=(TensorArg(shards=(shard,)),),
            target_cubes=(0,), target_pe="all",
        ))
        engine2.wait(h2)
        comp2, trace2 = engine2.get_completion(h2)
        multi_ns = trace2["total_ns"]

        assert comp1.ok is True
        assert comp2.ok is True
        assert single_ns > 0
        assert multi_ns > 0
        assert multi_ns >= single_ns, (
            f"Multi-PE ({multi_ns}ns) should be >= single-PE ({single_ns}ns)"
        )
    finally:
        clear_registry()


# ── 21.
# Stage 5: QKV GEMM multi-PE benchmark completion ──────────


def test_qkv_gemm_bench_multi_pe_completes():
    """The qkv_gemm_multi_pe benchmark runs to completion and records the expected traces.

    Checks that every handle completed ok with positive latency, that at
    least eight deploy/memory_write traces exist, and that there is exactly
    one kernel trace whose ``target_pe`` is ``"all"``.  The kernel registry
    is cleared on exit even when an assertion fails.
    """
    clear_registry()
    try:
        from benches.qkv_gemm_multi_pe import run as bench_run
        from kernbench.runtime_api.context import RuntimeContext

        graph = load_topology(TOPOLOGY_PATH)
        engine = GraphEngine(graph)
        ctx = RuntimeContext(
            engine=engine,
            target_device="sip0",
            correlation_id="bench_multi_pe",
            spec=graph.spec,
        )
        bench_run(ctx)
        ctx.wait_all()

        for h in ctx.handles():
            comp, trace = engine.get_completion(h)
            assert comp.ok is True
            assert trace["total_ns"] > 0

        # Multi-PE: 8 PEs, deploy traces per PE + kernel
        deploy_traces = [t for t in ctx._traces if t["phase"] in ("deploy", "memory_write")]
        kernel_traces = [t for t in ctx._traces if t["phase"] == "kernel"]
        # replicate(a)*8 + column_wise(b)*8
        # NOTE(review): that note suggests 16 deploy traces, yet only >= 8 is
        # required here — confirm the intended lower bound.
        assert len(deploy_traces) >= 8
        assert len(kernel_traces) == 1
        assert kernel_traces[0]["target_pe"] == "all"
    finally:
        clear_registry()


def test_report_format():
    """``format_report`` output contains the expected labels and computed figures.

    Three hand-written traces (two deploys, one kernel) are formatted; the
    report text must mention the title, both phases, "TFLOPS", "Util", and
    the two numbers derived in the comments below.
    """
    from kernbench.cli.report import format_report

    traces = [
        {"phase": "deploy", "name": "a", "pe": 0, "nbytes": 65536, "total_ns": 25.0},
        {"phase": "deploy", "name": "b", "pe": 0, "nbytes": 65536, "total_ns": 25.0},
        {"phase": "kernel", "name": "qkv_gemm", "target_pe": 0,
         "scalars": [128, 256, 128], "total_ns": 100.0},
    ]
    report = format_report(traces, title="qkv_gemm")
    assert "qkv_gemm" in report
    assert "deploy" in report
    assert "kernel" in report
    assert "TFLOPS" in report
    # GEMM TFLOPS: 2*128*256*128 / (100ns * 1e-9) / 1e12 = 83.886
    assert "83.886" in report
    # BW: 65536 / 25.0 = 2621.4 GB/s
    assert "2621.4" in report
    # Util% column present
    assert "Util" in report


# ── 22.
Multi-CUBE kernel launch (ADR-0013 V4) ────────────────── + + +def test_multi_cube_kernel_launch(): + """IO_CPU fans out KernelLaunchMsg to M_CPUs in two different cubes. + + Validates ADR-0013 V4 (multi-CUBE within SIP): + - Shards in cube=0 and cube=1, each targeting pe=0 + - Completion ok=True, total_ns > 0 + - Multi-cube latency >= single-cube latency (inter-cube UCIe adds cost) + - Deterministic across runs + """ + clear_registry() + hbm_pa_c0 = _hbm_pa(sip=0, cube=0, pe_id=0) + hbm_pa_c1 = _hbm_pa(sip=0, cube=1, pe_id=0) + + def simple_kernel(a_ptr, tl): + tl.load(a_ptr, shape=(4, 4), dtype="f16") + + register_kernel("test_multi_cube", simple_kernel) + + # Single-cube baseline + engine1 = _engine() + shard_single = TensorArgShard( + sip=0, cube=0, pe=0, + pa=hbm_pa_c0, nbytes=32, offset_bytes=0, + ) + h1 = engine1.submit(KernelLaunchMsg( + correlation_id="t", request_id="single_cube", + kernel_ref=KernelRef(name="test_multi_cube", kind="builtin"), + args=(TensorArg(shards=(shard_single,)),), + target_cubes=(0,), target_pe=0, + )) + engine1.wait(h1) + comp1, trace1 = engine1.get_completion(h1) + single_ns = trace1["total_ns"] + + # Multi-cube: shards in cube=0 and cube=1 + engine2 = _engine() + shard_c0 = TensorArgShard( + sip=0, cube=0, pe=0, + pa=hbm_pa_c0, nbytes=32, offset_bytes=0, + ) + shard_c1 = TensorArgShard( + sip=0, cube=1, pe=0, + pa=hbm_pa_c1, nbytes=32, offset_bytes=0, + ) + h2 = engine2.submit(KernelLaunchMsg( + correlation_id="t", request_id="multi_cube", + kernel_ref=KernelRef(name="test_multi_cube", kind="builtin"), + args=(TensorArg(shards=(shard_c0, shard_c1)),), + target_pe=0, + )) + engine2.wait(h2) + comp2, trace2 = engine2.get_completion(h2) + multi_ns = trace2["total_ns"] + + assert comp1.ok is True + assert comp2.ok is True + assert single_ns > 0 + assert multi_ns > 0 + assert multi_ns >= single_ns, ( + f"Multi-cube ({multi_ns}ns) should be >= single-cube ({single_ns}ns)" + ) + + # Determinism check + engine3 = _engine() + h3 = 
engine3.submit(KernelLaunchMsg( + correlation_id="t", request_id="multi_cube_det", + kernel_ref=KernelRef(name="test_multi_cube", kind="builtin"), + args=(TensorArg(shards=(shard_c0, shard_c1)),), + target_pe=0, + )) + engine3.wait(h3) + _, trace3 = engine3.get_completion(h3) + assert trace2["total_ns"] == trace3["total_ns"], ( + f"Determinism violation: {trace2['total_ns']} != {trace3['total_ns']}" + ) + clear_registry() + + +# ── 23. Multi-SIP kernel launch (ADR-0013 V4) ─────────────────── + + +def test_multi_sip_kernel_launch(): + """Engine submits KernelLaunchMsg to two SIPs via separate PCIE_EPs. + + Validates ADR-0013 V4 (multi-SIP tray): + - Shards in sip=0/cube=0/pe=0 and sip=1/cube=0/pe=0 + - Completion ok=True, total_ns > 0 + - Multi-SIP latency >= single-SIP latency + - Deterministic across runs + """ + clear_registry() + hbm_pa_s0 = _hbm_pa(sip=0, cube=0, pe_id=0) + hbm_pa_s1 = _hbm_pa(sip=1, cube=0, pe_id=0) + + # Kernel uses cycles only — no HBM access, so it runs correctly on any SIP + # (a real multi-SIP kernel would use SIP-local PA for each PE) + def simple_kernel(a_ptr, tl): + tl.cycles(50) + + register_kernel("test_multi_sip", simple_kernel) + + # Single-SIP baseline + engine1 = _engine() + shard_single = TensorArgShard( + sip=0, cube=0, pe=0, + pa=hbm_pa_s0, nbytes=32, offset_bytes=0, + ) + h1 = engine1.submit(KernelLaunchMsg( + correlation_id="t", request_id="single_sip", + kernel_ref=KernelRef(name="test_multi_sip", kind="builtin"), + args=(TensorArg(shards=(shard_single,)),), + target_cubes=(0,), target_pe=0, + )) + engine1.wait(h1) + comp1, trace1 = engine1.get_completion(h1) + single_ns = trace1["total_ns"] + + # Multi-SIP: shards in sip=0 and sip=1 + engine2 = _engine() + shard_s0 = TensorArgShard( + sip=0, cube=0, pe=0, + pa=hbm_pa_s0, nbytes=32, offset_bytes=0, + ) + shard_s1 = TensorArgShard( + sip=1, cube=0, pe=0, + pa=hbm_pa_s1, nbytes=32, offset_bytes=0, + ) + h2 = engine2.submit(KernelLaunchMsg( + correlation_id="t", 
request_id="multi_sip", + kernel_ref=KernelRef(name="test_multi_sip", kind="builtin"), + args=(TensorArg(shards=(shard_s0, shard_s1)),), + target_pe=0, + )) + engine2.wait(h2) + comp2, trace2 = engine2.get_completion(h2) + multi_ns = trace2["total_ns"] + + assert comp1.ok is True + assert comp2.ok is True + assert single_ns > 0 + assert multi_ns > 0 + assert multi_ns >= single_ns, ( + f"Multi-SIP ({multi_ns}ns) should be >= single-SIP ({single_ns}ns)" + ) + + # Determinism check + engine3 = _engine() + h3 = engine3.submit(KernelLaunchMsg( + correlation_id="t", request_id="multi_sip_det", + kernel_ref=KernelRef(name="test_multi_sip", kind="builtin"), + args=(TensorArg(shards=(shard_s0, shard_s1)),), + target_pe=0, + )) + engine3.wait(h3) + _, trace3 = engine3.get_completion(h3) + assert trace2["total_ns"] == trace3["total_ns"], ( + f"Determinism violation: {trace2['total_ns']} != {trace3['total_ns']}" + ) + clear_registry() diff --git a/tests/test_phase_a_components.py b/tests/test_phase_a_components.py new file mode 100644 index 0000000..4360832 --- /dev/null +++ b/tests/test_phase_a_components.py @@ -0,0 +1,269 @@ +"""Phase A component infrastructure tests (ADR-0015). 
+ +Verifies: + - TransitComponent, IoCpuComponent apply overhead_ns via run() + - HbmCtrlComponent and SramComponent act as terminal nodes (succeed done) + - MCpuComponent forwards when not terminal; completes when terminal + no ctx + - ComponentRegistry resolves impl strings to correct concrete classes + - GraphEngine passes ComponentContext to every component + - ComponentContext.router and .resolver are correctly populated +""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any + +import pytest +import simpy + +from kernbench.components.base import ComponentBase, ComponentRegistry +from kernbench.components.context import ComponentContext +from kernbench.components.impls import ( + HbmCtrlComponent, + IoCpuComponent, + MCpuComponent, + PcieEpComponent, + SramComponent, + TransitComponent, +) +from kernbench.sim_engine.engine import GraphEngine +from kernbench.sim_engine.transaction import Transaction +from kernbench.topology.builder import load_topology +from kernbench.topology.types import Node + +TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml" + + +def _node(impl: str, attrs: dict | None = None) -> Node: + return Node(id="test.node", kind="test", impl=impl, attrs=attrs or {}, pos_mm=None) + + +def _run_worker(comp: ComponentBase, env: simpy.Environment, txn: Transaction) -> None: + """Wire one in_port, start the component, inject txn, run env until done.""" + in_store: simpy.Store = simpy.Store(env) + comp.in_ports["src"] = in_store + comp.start(env) + env.process(_inject(in_store, txn)) + env.run(until=txn.done) + + +def _inject(store: simpy.Store, txn: Transaction): + yield store.put(txn) + + +# ── 1. 
# run() latency: TransitComponent ───────────────────────────────


def test_transit_component_run_overhead_ns():
    """Driving ``TransitComponent.run()`` advances sim time by ``overhead_ns`` (7.5)."""
    env = simpy.Environment()
    transit = TransitComponent(_node("forwarding_v1", {"overhead_ns": 7.5}))

    def drive():
        yield from transit.run(env, nbytes=1024)

    env.process(drive())
    env.run()
    assert env.now == pytest.approx(7.5)


def test_transit_component_run_zero_overhead_ns():
    """With ``overhead_ns=0`` the run finishes and sim time stays at 0."""
    env = simpy.Environment()
    transit = TransitComponent(_node("noc_v1", {"overhead_ns": 0.0}))
    finished = []

    def drive():
        yield from transit.run(env, nbytes=512)
        # Reached only once run() has been fully consumed.
        finished.append(True)

    env.process(drive())
    env.run()
    assert finished == [True]
    assert env.now == pytest.approx(0.0)


# ── 2. run() latency: IoCpuComponent ────────────────────────────────


def test_io_cpu_component_run_overhead_ns():
    """Driving ``IoCpuComponent.run()`` advances sim time by ``overhead_ns`` (10.0)."""
    env = simpy.Environment()
    io_cpu = IoCpuComponent(_node("io_cpu_v1", {"overhead_ns": 10.0}))

    def drive():
        yield from io_cpu.run(env, nbytes=2048)

    env.process(drive())
    env.run()
    assert env.now == pytest.approx(10.0)


# ── 3.
# Terminal: HbmCtrlComponent succeeds done ──────────────────────


def test_hbm_ctrl_terminal_succeeds_done():
    """Check that HbmCtrlComponent, as the last hop, triggers ``txn.done``.

    The transaction's path contains only this node, so the component is the
    terminal hop; after the worker has run, the done event must be triggered.
    """
    ctrl = HbmCtrlComponent(_node("hbm_ctrl_v1", {"overhead_ns": 0.0, "capacity": 1}))
    sim = simpy.Environment()
    completion = sim.event()
    request = Transaction(
        request=None,
        path=["test.node"],
        step=0,
        nbytes=256,
        done=completion,
    )

    _run_worker(ctrl, sim, request)

    assert completion.triggered


def test_hbm_ctrl_resource_serializes_requests():
    """Check that two requests on a capacity-1 HbmCtrlComponent run back to back.

    Each request costs ``overhead_ns=5.0``; the test expects the second one
    to complete at t=10, i.e. after the first rather than alongside it.
    """
    ctrl = HbmCtrlComponent(_node("hbm_ctrl_v1", {"overhead_ns": 5.0, "capacity": 1}))
    sim = simpy.Environment()
    inbox: simpy.Store = simpy.Store(sim)
    ctrl.in_ports["src"] = inbox
    ctrl.start(sim)

    first_done, second_done = sim.event(), sim.event()
    requests = [
        Transaction(request=None, path=["test.node"], step=0, nbytes=0, done=marker)
        for marker in (first_done, second_done)
    ]

    def _feed():
        # Queue both transactions in order on the single input port.
        for request in requests:
            yield inbox.put(request)

    sim.process(_feed())
    sim.run(until=second_done)

    # Both must be done; with serialization: t=5 + t=10
    assert first_done.triggered
    assert second_done.triggered
    assert sim.now == pytest.approx(10.0)


# ── 4. Terminal: SramComponent succeeds done ─────────────────────────


def test_sram_terminal_succeeds_done():
    """Check that SramComponent, as the last hop, triggers ``txn.done``.

    With ``overhead_ns=2.0`` the done event must be triggered and the clock
    is expected to read 2.0.
    """
    sram = SramComponent(_node("sram_v1", {"overhead_ns": 2.0}))
    sim = simpy.Environment()
    completion = sim.event()
    request = Transaction(
        request=None,
        path=["test.node"],
        step=0,
        nbytes=512,
        done=completion,
    )

    _run_worker(sram, sim, request)

    assert completion.triggered
    assert sim.now == pytest.approx(2.0)


# ── 5.
# MCpuComponent: forward when not terminal ──────────────────────


def test_m_cpu_forwards_when_not_terminal():
    """Check that MCpuComponent hands a transaction on to the next hop.

    The transaction sits at index 1 of a three-node path, so the component
    is not the terminal hop. Exactly one message must arrive on the "next"
    out-port with its step advanced to 2, at t=5 (the configured overhead).
    """
    mcpu = MCpuComponent(_node("m_cpu_v1", {"overhead_ns": 5.0}))
    sim = simpy.Environment()

    # Wire in_port and out_port for a two-hop path [src, test.node, next]
    upstream: simpy.Store = simpy.Store(sim)
    downstream: simpy.Store = simpy.Store(sim)
    mcpu.in_ports["src"] = upstream
    mcpu.out_ports["next"] = downstream
    mcpu.start(sim)

    completion = sim.event()
    hops = ["src", "test.node", "next"]
    # step=1: currently at test.node; next_hop = "next"
    request = Transaction(request=None, path=hops, step=1, nbytes=128, done=completion)

    received: list[Any] = []

    def _sink():
        # Stand in for the downstream node: record the message, then complete it.
        arrived = yield downstream.get()
        received.append(arrived)
        arrived.done.succeed()

    def _feed():
        yield upstream.put(request)

    sim.process(_sink())
    sim.process(_feed())
    sim.run(until=completion)

    assert len(received) == 1
    assert received[0].step == 2  # advanced
    assert sim.now == pytest.approx(5.0)


# ── 6. MCpuComponent: terminal with no ctx just completes ────────────


def test_m_cpu_terminal_no_ctx_completes():
    """Check that a context-less MCpuComponent completes a terminal transaction.

    The component is built with ``ctx=None`` and the path contains only this
    node; the done event must be triggered once the worker has run.
    """
    mcpu = MCpuComponent(_node("m_cpu_v1", {"overhead_ns": 0.0}), ctx=None)
    sim = simpy.Environment()
    completion = sim.event()
    request = Transaction(
        request=None,
        path=["test.node"],
        step=0,
        nbytes=64,
        done=completion,
    )

    _run_worker(mcpu, sim, request)

    assert completion.triggered


# ── 7.
ComponentRegistry resolves impl strings ─────────────────────── + + +@pytest.mark.parametrize("impl,expected_cls", [ + ("forwarding_v1", TransitComponent), + ("noc_v1", TransitComponent), + ("ucie_v1", TransitComponent), + ("xbar_v1", TransitComponent), + ("pcie_ep_v1", PcieEpComponent), + ("io_cpu_v1", IoCpuComponent), + ("m_cpu_v1", MCpuComponent), + ("hbm_ctrl_v1", HbmCtrlComponent), + ("sram_v1", SramComponent), +]) +def test_registry_resolves_impl(impl, expected_cls): + """ComponentRegistry.create() returns the correct concrete class for each impl.""" + node = _node(impl, {"overhead_ns": 0.0}) + comp = ComponentRegistry.create(node) + assert isinstance(comp, expected_cls) + + +# ── 8. GraphEngine passes ComponentContext to components ───────────── + + +def test_engine_passes_ctx_to_components(): + """GraphEngine injects a non-None ComponentContext into every component.""" + graph = load_topology(TOPOLOGY_PATH) + engine = GraphEngine(graph) + for node_id, comp in engine._components.items(): + assert comp.ctx is not None, f"{node_id}: ctx is None" + assert isinstance(comp.ctx, ComponentContext), f"{node_id}: ctx wrong type" + + +def test_engine_ctx_router_and_resolver_populated(): + """ComponentContext.router and .resolver are PathRouter / AddressResolver instances.""" + from kernbench.policy.routing.router import AddressResolver, PathRouter + + graph = load_topology(TOPOLOGY_PATH) + engine = GraphEngine(graph) + # Spot-check one component + first_comp = next(iter(engine._components.values())) + assert isinstance(first_comp.ctx.router, PathRouter) + assert isinstance(first_comp.ctx.resolver, AddressResolver) diff --git a/tests/test_phyaddr.py b/tests/test_phyaddr.py new file mode 100644 index 0000000..2cbb252 --- /dev/null +++ b/tests/test_phyaddr.py @@ -0,0 +1,268 @@ +import pytest + +from kernbench.policy.address.allocator import AddressConfig, AllocationError, PEMemAllocator +from kernbench.policy.address.phyaddr import PhysAddr, PhysAddrError, UnitType + 
_MB = 1 << 20
_GB = 1 << 30

# Topology-matching config: 48GB HBM / 8 slices / 16MB TCM / 4MB reserved / 32MB SRAM
_CFG = AddressConfig(
    sip_count=2,
    cubes_per_sip=16,
    pes_per_cube=8,
    hbm_bytes_per_cube=48 * _GB,
    hbm_slices_per_cube=8,
    tcm_bytes_per_pe=16 * _MB,
    tcm_scheduler_reserved_bytes=4 * _MB,
    sram_bytes_per_cube=32 * _MB,
)


# ── Immutability & value semantics ──────────────────────────────────


def test_physaddr_immutable():
    """PhysAddr rejects attribute assignment, is hashable, and compares by value."""
    pa = PhysAddr.hbm_addr(rack_id=0, sip_id=0, cube_id=0, hbm_offset=0)
    with pytest.raises(AttributeError):
        pa.rack_id = 1  # type: ignore[misc]
    # hashable: hash() raises TypeError for unhashable objects
    hash(pa)
    # comparable: an independently built address with equal fields is equal
    pa2 = PhysAddr.hbm_addr(rack_id=0, sip_id=0, cube_id=0, hbm_offset=0)
    assert pa == pa2


# ── HBM encode/decode roundtrip ────────────────────────────────────


def test_hbm_encode_decode_roundtrip():
    """An HBM address survives encode() -> decode() with all fields intact."""
    pa = PhysAddr.hbm_addr(rack_id=2, sip_id=3, cube_id=5, hbm_offset=0x1000)
    dec = PhysAddr.decode(pa.encode())
    assert dec.rack_id == 2
    assert dec.sip_id == 3
    assert dec.cube_id == 5
    assert dec.kind == "hbm"
    assert dec.hbm_offset == 0x1000


# ── PE resource encode/decode roundtrip ─────────────────────────────


def test_pe_resource_encode_decode_roundtrip():
    """A hand-packed PE-resource address decodes back to its component fields."""
    # manually build local_offset matching bit layout:
    # unit_type at bit 34, pe_id at bit 30, ext at bit 29, sub_offset in the low bits
    local_offset = (UnitType.PE << 34) | (3 << 30) | (1 << 29) | 0xFF
    pa = PhysAddr(
        rack_id=1, sip_id=2, sip_seg=7, local_offset=local_offset,
        kind="pe_resource", cube_id=7,
        unit_type=UnitType.PE, pe_id=3, ext=1, sub_offset=0xFF,
    )
    dec = PhysAddr.decode(pa.encode())
    assert dec.kind == "pe_resource"
    assert dec.unit_type == UnitType.PE
    assert dec.pe_id == 3
    assert dec.ext == 1
    assert dec.sub_offset == 0xFF


# ── pe_hbm_addr factory ────────────────────────────────────────────


def test_pe_hbm_addr_factory():
    """pe_hbm_addr() places a PE-local offset at pe_id * slice_size + offset."""
    slice_bytes = 6 * _GB  # 6 GB per PE slice
    pa = PhysAddr.pe_hbm_addr(
        rack_id=0, sip_id=0, cube_id=0,
        pe_id=2, pe_local_hbm_offset=1024, slice_size_bytes=slice_bytes,
    )
    assert pa.kind == "hbm"
    assert pa.cube_id == 0
    assert pa.hbm_offset == 2 * slice_bytes + 1024


def test_pe_hbm_addr_overflow():
    """A PE-local offset equal to the slice size is rejected with PhysAddrError."""
    slice_bytes = 6 * _GB
    with pytest.raises(PhysAddrError, match="pe_local_hbm_offset"):
        PhysAddr.pe_hbm_addr(
            rack_id=0, sip_id=0, cube_id=0,
            pe_id=0, pe_local_hbm_offset=slice_bytes, slice_size_bytes=slice_bytes,
        )


# ── Invalid unit_type decode (fix #1) ──────────────────────────────


def test_invalid_unit_type_raises():
    """Decoding an address whose unit_type field holds 7 raises PhysAddrError."""
    # Craft a PE-resource address with unit_type=7 (invalid)
    local_offset = (7 << 34) | (0 << 30) | 0
    pa_raw = PhysAddr(
        rack_id=0, sip_id=0, sip_seg=0, local_offset=local_offset,
    )
    raw = pa_raw.encode()
    with pytest.raises(PhysAddrError, match="unit_type"):
        PhysAddr.decode(raw)


# ── hbm_pe_id utility (fix #3) ─────────────────────────────────────


def test_hbm_pe_id_utility():
    """hbm_pe_id() recovers the owning PE from an HBM offset and slice size."""
    slice_bytes = 6 * _GB  # 6 GB
    pa = PhysAddr.pe_hbm_addr(
        rack_id=0, sip_id=0, cube_id=0,
        pe_id=5, pe_local_hbm_offset=256, slice_size_bytes=slice_bytes,
    )
    assert PhysAddr.hbm_pe_id(pa.hbm_offset, slice_bytes) == 5


# ── UnitType.SRAM exists (fix #5) ──────────────────────────────────


def test_sram_unit_type_exists():
    """UnitType.SRAM is defined and has the value 2."""
    assert UnitType.SRAM == 2


# ── cube_sram_addr factory + roundtrip ──────────────────────────────


def test_cube_sram_addr_roundtrip():
    """cube_sram_addr() yields an SRAM pe_resource address that roundtrips."""
    pa = PhysAddr.cube_sram_addr(
        rack_id=0, sip_id=1, cube_id=3, sram_offset=0x800,
    )
    assert pa.kind == "pe_resource"
    assert pa.unit_type == UnitType.SRAM
    assert pa.cube_id == 3
    assert pa.sub_offset == 0x800
    # encode → decode roundtrip
    dec = PhysAddr.decode(pa.encode())
    assert dec.unit_type == UnitType.SRAM
    assert dec.cube_id == 3
    assert dec.sub_offset == 0x800


def test_cube_sram_addr_range_check():
    """An sram_offset of 1 << 29 is rejected with PhysAddrError."""
    with pytest.raises(PhysAddrError):
        PhysAddr.cube_sram_addr(
            rack_id=0, sip_id=0, cube_id=0,
            sram_offset=(1 << 29),  # exceeds 29-bit sub_offset
        )


# ── pe_tcm_addr factory + roundtrip ────────────────────────────────


def test_pe_tcm_addr_roundtrip():
    """pe_tcm_addr() yields a PE pe_resource address that roundtrips."""
    pa = PhysAddr.pe_tcm_addr(
        rack_id=0, sip_id=0, cube_id=2, pe_id=7, tcm_offset=0x400,
    )
    assert pa.kind == "pe_resource"
    assert pa.unit_type == UnitType.PE
    assert pa.pe_id == 7
    assert pa.cube_id == 2
    assert pa.sub_offset == 0x400
    # encode → decode roundtrip
    dec = PhysAddr.decode(pa.encode())
    assert dec.unit_type == UnitType.PE
    assert dec.pe_id == 7
    assert dec.sub_offset == 0x400


def test_pe_tcm_addr_range_check():
    """A tcm_offset of 1 << 29 is rejected with PhysAddrError."""
    with pytest.raises(PhysAddrError):
        PhysAddr.pe_tcm_addr(
            rack_id=0, sip_id=0, cube_id=0, pe_id=0,
            tcm_offset=(1 << 29),  # exceeds 29-bit sub_offset
        )


# ── AddressConfig ───────────────────────────────────────────────────


def test_address_config_derived_sizes():
    """_CFG derives a 6 GB HBM slice and 12 MB of allocatable TCM."""
    assert _CFG.hbm_slice_bytes == 6 * _GB
    assert _CFG.tcm_allocatable_bytes == 12 * _MB


# ── PEMemAllocator: HBM ────────────────────────────────────────────


def _make_alloc(pe_id: int = 0) -> PEMemAllocator:
    """Return a fresh allocator for rack 0 / sip 0 / cube 0 and the given PE, using _CFG."""
    return PEMemAllocator(rack_id=0, sip_id=0, cube_id=0, pe_id=pe_id, cfg=_CFG)


def test_allocator_hbm_basic():
    """The first HBM allocation for PE3 lands at the start of PE3's slice."""
    a = _make_alloc(pe_id=3)
    pa = a.alloc_hbm(4096)
    assert pa.kind == "hbm"
    assert pa.sip_id == 0
    assert pa.cube_id == 0
    # hbm_offset should be pe3's slice start
    assert pa.hbm_offset == 3 * 6 * _GB


def test_allocator_hbm_sequential():
    """Consecutive HBM allocations are laid out back to back."""
    a = _make_alloc()
    pa1 = a.alloc_hbm(1024)
    pa2 = a.alloc_hbm(2048)
    assert pa1.hbm_offset == 0  # pe0 slice start + 0
    assert pa2.hbm_offset == 1024  # pe0 slice start + 1024


def test_allocator_hbm_overflow():
    """Allocating past the 6 GB slice raises AllocationError mentioning HBM."""
    a = _make_alloc()
    # Leave 256 bytes free, then ask for 512.
    a.alloc_hbm(6 * _GB - 256)
    with pytest.raises(AllocationError, match="HBM"):
        a.alloc_hbm(512)


# ── PEMemAllocator: TCM ────────────────────────────────────────────
+def test_allocator_tcm_basic(): + a = _make_alloc(pe_id=5) + pa = a.alloc_tcm(256) + assert pa.kind == "pe_resource" + assert pa.unit_type == UnitType.PE + assert pa.pe_id == 5 + assert pa.sub_offset == 0 + + +def test_allocator_tcm_respects_reserved(): + a = _make_alloc() + # allocatable = 12 MB, should succeed + a.alloc_tcm(12 * _MB) + assert a.tcm_used == 12 * _MB + assert a.tcm_total == 12 * _MB + + +def test_allocator_tcm_overflow(): + a = _make_alloc() + a.alloc_tcm(12 * _MB) + with pytest.raises(AllocationError, match="TCM"): + a.alloc_tcm(1) + + +# ── PEMemAllocator: stats & determinism ───────────────────────────── + + +def test_allocator_stats(): + a = _make_alloc() + a.alloc_hbm(1000) + a.alloc_tcm(500) + assert a.hbm_used == 1000 + assert a.hbm_total == 6 * _GB + assert a.tcm_used == 500 + assert a.tcm_total == 12 * _MB + + +def test_allocator_deterministic(): + a1 = _make_alloc(pe_id=2) + a2 = _make_alloc(pe_id=2) + assert a1.alloc_hbm(4096) == a2.alloc_hbm(4096) + assert a1.alloc_tcm(256) == a2.alloc_tcm(256) diff --git a/tests/test_probe.py b/tests/test_probe.py new file mode 100644 index 0000000..eb70e4b --- /dev/null +++ b/tests/test_probe.py @@ -0,0 +1,221 @@ +"""Tests for H2D writes and PE DMA probe latency invariants. + +H2D tests use MemoryWriteMsg (pcie_ep → io_cpu → m_cpu → hbm_ctrl → response). +PE DMA tests use PeDmaMsg (direct pe_dma → xbar → hbm_ctrl injection). 
+""" +from pathlib import Path + +from kernbench.policy.address.phyaddr import PhysAddr +from kernbench.policy.routing.router import AddressResolver, PathRouter +from kernbench.runtime_api.kernel import MemoryWriteMsg, PeDmaMsg +from kernbench.sim_engine.engine import GraphEngine +from kernbench.topology.builder import load_topology + +TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml" + + +def _engine(): + return GraphEngine(load_topology(TOPOLOGY_PATH)) + + +def _hbm_pa(sip: int = 0, cube: int = 0, pe_id: int = 0) -> int: + slice_bytes = 48 * (1 << 30) // 8 + pa = PhysAddr.pe_hbm_addr( + rack_id=0, sip_id=sip, cube_id=cube, pe_id=pe_id, + pe_local_hbm_offset=0x1000, slice_size_bytes=slice_bytes, + ) + return pa.encode() + + +def _h2d_latency(dst_cube: int, dst_pe: int = 0) -> float: + engine = _engine() + msg = MemoryWriteMsg( + correlation_id="probe", request_id=f"h2d-c{dst_cube}-p{dst_pe}", + dst_sip=0, dst_cube=dst_cube, dst_pe=dst_pe, + dst_pa=_hbm_pa(sip=0, cube=dst_cube, pe_id=dst_pe), nbytes=4096, + pattern="zero", target_pe=dst_pe, + ) + h = engine.submit(msg) + engine.wait(h) + _, trace = engine.get_completion(h) + return trace["total_ns"] + + +# ── 1. Single-PE write completes ────────────────────────────────── + + +def test_single_pe_write_completes(): + """MemoryWriteMsg(target_pe=0) must complete with ok=True, latency > 0.""" + engine = _engine() + msg = MemoryWriteMsg( + correlation_id="probe", request_id="pe-local", + dst_sip=0, dst_cube=0, dst_pe=0, + dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096, + pattern="zero", target_pe=0, + ) + h = engine.submit(msg) + engine.wait(h) + comp, trace = engine.get_completion(h) + assert comp.ok is True + assert trace["total_ns"] > 0 + + +# ── 2. 
Cross-cube write positive latency ───────────────────────── + + +def test_cross_cube_write_positive(): + """Cross-cube MemoryWriteMsg(target_pe=0) must complete with latency > 0.""" + lat = _h2d_latency(dst_cube=1, dst_pe=0) + assert lat > 0 + + +# ── 3. H2D latency monotonicity ────────────────────────────────── + + +def test_h2d_latency_monotonic(): + """1hop < 2hop < 3hop < 4hop.""" + cubes = [0, 4, 8, 12] + latencies: list[tuple[int, float]] = [] + for cube in cubes: + lat = _h2d_latency(dst_cube=cube, dst_pe=0) + latencies.append((cube, lat)) + + for i in range(len(latencies) - 1): + assert latencies[i][1] < latencies[i + 1][1], ( + f"cube{latencies[i][0]}({latencies[i][1]:.2f}) " + f"must < cube{latencies[i + 1][0]}({latencies[i + 1][1]:.2f})" + ) + + +# ── 4. Single-PE write deterministic ───────────────────────────── + + +def test_single_pe_write_deterministic(): + """Same MemoryWriteMsg on two engines must produce identical latency.""" + msg = MemoryWriteMsg( + correlation_id="probe", request_id="det", + dst_sip=0, dst_cube=0, dst_pe=0, + dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096, + pattern="zero", target_pe=0, + ) + e1 = _engine() + h1 = e1.submit(msg) + e1.wait(h1) + _, t1 = e1.get_completion(h1) + + e2 = _engine() + h2 = e2.submit(msg) + e2.wait(h2) + _, t2 = e2.get_completion(h2) + + assert t1["total_ns"] == t2["total_ns"] + + +# ── 5. Cut-through (wormhole) wire model invariants ────────────── + + +def test_h2d_local_cube_cut_through(): + """H2D to local cube with cut-through should be < 65ns for 4096B. + + Full command path: pcie_ep → io_cpu → ucie → noc → m_cpu + DMA: m_cpu → noc → xbar → hbm_ctrl (drain once at terminal) + Plus response path back. + With store-and-forward each hop would serialize; cut-through keeps it low. 
+ """ + lat = _h2d_latency(dst_cube=0, dst_pe=0) + assert lat < 65.0, f"Local H2D {lat:.2f}ns; cut-through expects < 65ns" + + +def test_h2d_remote_cube_cut_through(): + """H2D to 1-hop remote cube: cut-through drain dominates, not per-hop serialization. + + With store-and-forward, each hop would serialize 4096B, total >> 100ns. + With cut-through, drain happens once at bottleneck. + """ + lat = _h2d_latency(dst_cube=4, dst_pe=0) + assert lat < 80.0, f"Remote H2D {lat:.2f}ns; cut-through expects < 80ns" + + +# ── 6. PE DMA: direct injection tests ───────────────────────── + + +def _graph(): + return load_topology(TOPOLOGY_PATH) + + +def _pe_dma_latency(src_cube: int, src_pe: int, dst_pe: int) -> float: + engine = _engine() + msg = PeDmaMsg( + correlation_id="probe", request_id=f"dma-c{src_cube}-p{src_pe}-s{dst_pe}", + src_sip=0, src_cube=src_cube, src_pe=src_pe, + dst_pa=_hbm_pa(sip=0, cube=src_cube, pe_id=dst_pe), nbytes=4096, + ) + h = engine.submit(msg) + engine.wait(h) + _, trace = engine.get_completion(h) + return trace["total_ns"] + + +def _pe_dma_bottleneck(src_cube: int, src_pe: int, dst_pe: int) -> float | None: + graph = _graph() + edge_map = {(e.src, e.dst): e for e in graph.edges} + resolver = AddressResolver(graph) + router = PathRouter(graph) + pa = _hbm_pa(sip=0, cube=src_cube, pe_id=dst_pe) + pa_obj = PhysAddr.decode(pa) + dst_node = resolver.resolve(pa_obj) + pe_ref = f"sip0.cube{src_cube}.pe{src_pe}" + path = router.find_path(pe_ref, dst_node) + bws: list[float] = [] + for i in range(len(path) - 1): + e = edge_map.get((path[i], path[i + 1])) + if e and e.bw_gbs: + bws.append(e.bw_gbs) + return min(bws) if bws else None + + +def test_pe_dma_local_completes(): + """PeDmaMsg to local slice0 must complete with ok=True, latency > 0.""" + engine = _engine() + msg = PeDmaMsg( + correlation_id="probe", request_id="dma-local", + src_sip=0, src_cube=0, src_pe=0, + dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096, + ) + h = engine.submit(msg) + 
engine.wait(h) + comp, trace = engine.get_completion(h) + assert comp.ok is True + assert trace["total_ns"] > 0 + + +def test_pe_dma_local_bottleneck_256(): + """PE DMA pe0→slice0 (local): bottleneck = 256 GB/s (direct xbar→hbm).""" + bn = _pe_dma_bottleneck(src_cube=0, src_pe=0, dst_pe=0) + assert bn == 256.0, f"Local PE DMA bottleneck {bn}, expected 256.0" + + +def test_pe_dma_chain_bottleneck_128(): + """PE DMA pe0→slice1 (xbar chain): bottleneck = 128 GB/s.""" + bn = _pe_dma_bottleneck(src_cube=0, src_pe=0, dst_pe=1) + assert bn == 128.0, f"Chain PE DMA bottleneck {bn}, expected 128.0" + + +def test_pe_dma_deterministic(): + """Same PeDmaMsg on two engines must produce identical latency.""" + msg = PeDmaMsg( + correlation_id="probe", request_id="det", + src_sip=0, src_cube=0, src_pe=0, + dst_pa=_hbm_pa(sip=0, cube=0, pe_id=0), nbytes=4096, + ) + e1 = _engine() + h1 = e1.submit(msg) + e1.wait(h1) + _, t1 = e1.get_completion(h1) + + e2 = _engine() + h2 = e2.submit(msg) + e2.wait(h2) + _, t2 = e2.get_completion(h2) + + assert t1["total_ns"] == t2["total_ns"] diff --git a/tests/test_routing.py b/tests/test_routing.py new file mode 100644 index 0000000..b44c147 --- /dev/null +++ b/tests/test_routing.py @@ -0,0 +1,226 @@ +import pytest + +from pathlib import Path + +from kernbench.policy.address.phyaddr import PhysAddr, UnitType +from kernbench.policy.routing.router import AddressResolver, PathRouter, RoutingError +from kernbench.topology.builder import load_topology + +TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml" + + +def _graph(): + return load_topology(TOPOLOGY_PATH) + + +# ── AddressResolver ────────────────────────────────────────────────── + + +def test_resolve_hbm_addr(): + """HBM address -> sip{S}.cube{C}.hbm_ctrl.slice{P}""" + g = _graph() + resolver = AddressResolver(g) + # hbm_offset=0x1000, slice_size=6GB -> slice 0 + pa = PhysAddr.hbm_addr(rack_id=0, sip_id=0, cube_id=3, hbm_offset=0x1000) + assert resolver.resolve(pa) == 
"sip0.cube3.hbm_ctrl.slice0" + + +def test_resolve_hbm_addr_slice4(): + """HBM address in PE4's slice range -> slice4.""" + g = _graph() + resolver = AddressResolver(g) + # slice_size = 6GB; PE4 offset starts at 4*6GB = 24GB = 0x600000000 + pa = PhysAddr.hbm_addr(rack_id=0, sip_id=0, cube_id=0, hbm_offset=0x600000000) + assert resolver.resolve(pa) == "sip0.cube0.hbm_ctrl.slice4" + + +def test_resolve_pe_tcm_addr(): + """PE TCM address → sip{S}.cube{C}.pe{P}.pe_tcm""" + g = _graph() + resolver = AddressResolver(g) + pa = PhysAddr.pe_tcm_addr(rack_id=0, sip_id=1, cube_id=5, pe_id=7, tcm_offset=0x400) + assert resolver.resolve(pa) == "sip1.cube5.pe7.pe_tcm" + + +def test_resolve_sram_addr(): + """SRAM address → sip{S}.cube{C}.sram""" + g = _graph() + resolver = AddressResolver(g) + pa = PhysAddr.cube_sram_addr(rack_id=0, sip_id=0, cube_id=10, sram_offset=0x800) + assert resolver.resolve(pa) == "sip0.cube10.sram" + + +def test_resolve_mcpu_addr(): + """MCPU pe_resource address → sip{S}.cube{C}.m_cpu""" + g = _graph() + resolver = AddressResolver(g) + pa = PhysAddr( + rack_id=0, sip_id=0, sip_seg=2, local_offset=(UnitType.MCPU << 34), + kind="pe_resource", cube_id=2, unit_type=UnitType.MCPU, + ) + assert resolver.resolve(pa) == "sip0.cube2.m_cpu" + + +def test_resolve_nonexistent_node(): + """Address pointing to a node outside the compiled topology raises RoutingError.""" + g = _graph() + resolver = AddressResolver(g) + # sip_id=15 doesn't exist in the 2-SIP topology + pa = PhysAddr.hbm_addr(rack_id=0, sip_id=15, cube_id=0, hbm_offset=0) + with pytest.raises(RoutingError): + resolver.resolve(pa) + + +# ── PathRouter: local HBM (same xbar half) ────────────────────────── + + +def test_path_local_hbm_same_half(): + """PE0 -> slice0 (local): pe_dma -> xbar.pe0 -> hbm_ctrl.slice0 (no chain hops).""" + g = _graph() + router = PathRouter(g) + path = router.find_path("sip0.cube0.pe0", "sip0.cube0.hbm_ctrl.slice0") + assert path[0] == "sip0.cube0.pe0.pe_dma" + assert 
"sip0.cube0.xbar.pe0" in path + assert path[-1] == "sip0.cube0.hbm_ctrl.slice0" + # local access: no bridge and no chain traversal (shortest path = 3 nodes) + assert not any("bridge" in n for n in path) + assert len(path) == 3 # pe_dma → xbar.pe0 → slice0 + + +# ── PathRouter: same-half remote HBM ──────────────────────────────── + + +def test_path_same_half_remote_hbm(): + """PE0 -> slice1: same-half chain traversal pe0→pe1, no bridge.""" + g = _graph() + router = PathRouter(g) + path = router.find_path("sip0.cube0.pe0", "sip0.cube0.hbm_ctrl.slice1") + assert path[0] == "sip0.cube0.pe0.pe_dma" + assert "sip0.cube0.xbar.pe0" in path # enter at pe0 + assert "sip0.cube0.xbar.pe1" in path # chain hop to pe1 + assert path[-1] == "sip0.cube0.hbm_ctrl.slice1" + assert not any("bridge" in n for n in path) + assert len(path) == 4 # pe_dma → xbar.pe0 → xbar.pe1 → slice1 + + +# ── PathRouter: cross-half HBM ────────────────────────────────────── + + +def test_path_cross_half_hbm(): + """PE0 -> slice4 (cross-half): pe_dma → xbar.pe0 → bridge.left → xbar.pe4 → slice4.""" + g = _graph() + router = PathRouter(g) + path = router.find_path("sip0.cube0.pe0", "sip0.cube0.hbm_ctrl.slice4") + assert path[0] == "sip0.cube0.pe0.pe_dma" + assert "sip0.cube0.xbar.pe0" in path + assert any("bridge" in n for n in path), "cross-half HBM must traverse bridge" + assert "sip0.cube0.xbar.pe4" in path + assert path[-1] == "sip0.cube0.hbm_ctrl.slice4" + # Shortest cross-half path: pe_dma → xbar.pe0 → bridge.left → xbar.pe4 → slice4 + assert len(path) == 5 + + +def test_path_cross_half_requires_bridge(): + """PE4 (bottom) -> slice2 (top) requires bridge traversal.""" + g = _graph() + router = PathRouter(g) + path = router.find_path("sip0.cube0.pe4", "sip0.cube0.hbm_ctrl.slice2") + assert any("bridge" in n for n in path), "cross-half HBM must traverse bridge" + assert any("xbar.pe" in n for n in path) + assert path[-1] == "sip0.cube0.hbm_ctrl.slice2" + + +def test_cross_half_distance_greater(): + 
"""Cross-half HBM access must have greater distance than local-half.""" + g = _graph() + router = PathRouter(g) + _, dist_local = router.find_path_with_distance( + "sip0.cube0.pe0", "sip0.cube0.hbm_ctrl.slice0") + _, dist_cross = router.find_path_with_distance( + "sip0.cube0.pe0", "sip0.cube0.hbm_ctrl.slice4") + assert dist_cross > dist_local + + +def test_path_same_half_remote_longer(): + """Same-half remote HBM (PE0->slice3) has greater distance than local (PE0->slice0).""" + g = _graph() + router = PathRouter(g) + _, dist_local = router.find_path_with_distance( + "sip0.cube0.pe0", "sip0.cube0.hbm_ctrl.slice0") + _, dist_remote = router.find_path_with_distance( + "sip0.cube0.pe0", "sip0.cube0.hbm_ctrl.slice3") + assert dist_remote > dist_local, ( + f"same-half remote ({dist_remote:.2f}mm) must > local ({dist_local:.2f}mm)" + ) + + +def test_path_remote_cube_hbm(): + """PE0 in cube0 can reach HBM in cube1 via UCIe (ADR-0004 D4).""" + g = _graph() + router = PathRouter(g) + path = router.find_path("sip0.cube0.pe0", "sip0.cube1.hbm_ctrl.slice0") + assert path[0] == "sip0.cube0.pe0.pe_dma" + assert path[-1] == "sip0.cube1.hbm_ctrl.slice0" + # inter-cube path must cross a UCIe link + assert any("ucie" in n for n in path), "remote cube path must traverse UCIe" + # must not be trivially short (needs noc + ucie + remote noc + xbar) + assert len(path) >= 5 + + +# ── PathRouter: SRAM via NOC ──────────────────────────────────────── + + +def test_path_sram_via_noc(): + """PE → SRAM must go through NOC (non-HBM data path).""" + g = _graph() + router = PathRouter(g) + path = router.find_path("sip0.cube0.pe0", "sip0.cube0.sram") + assert path[0] == "sip0.cube0.pe0.pe_dma" + assert "sip0.cube0.noc" in path + assert path[-1] == "sip0.cube0.sram" + # should NOT go through xbar (SRAM is non-HBM path) + assert not any("xbar" in n for n in path) + + +# ── PathRouter: PE TCM (local) ────────────────────────────────────── + + +def test_path_local_tcm(): + """PE0 → own TCM is 
PE-internal, not via xbar or noc.""" + g = _graph() + router = PathRouter(g) + path = router.find_path("sip0.cube0.pe0", "sip0.cube0.pe0.pe_tcm") + assert path[0] == "sip0.cube0.pe0.pe_dma" + assert path[-1] == "sip0.cube0.pe0.pe_tcm" + # PE-internal path, no fabric + assert not any("xbar" in n or "noc" in n for n in path) + + +# ── PathRouter: distance monotonic ────────────────────────────────── + + +def test_path_distance_positive(): + """All routed paths must have accumulated distance > 0 (ADR-0002 D4).""" + g = _graph() + router = PathRouter(g) + _, dist = router.find_path_with_distance("sip0.cube0.pe0", "sip0.cube0.hbm_ctrl.slice0") + assert dist > 0 + + +def test_path_deterministic(): + """Same (src, dst) must always produce the same path.""" + g = _graph() + r1 = PathRouter(g) + r2 = PathRouter(g) + p1 = r1.find_path("sip0.cube0.pe3", "sip0.cube0.hbm_ctrl.slice3") + p2 = r2.find_path("sip0.cube0.pe3", "sip0.cube0.hbm_ctrl.slice3") + assert p1 == p2 + + +def test_remote_cube_path_no_routing_error(): + """Routing to remote cube HBM must not raise RoutingError (ADR-0004 D4).""" + g = _graph() + router = PathRouter(g) + # cube0.PE0 -> cube1.slice0 (adjacent cube, E direction) + path = router.find_path("sip0.cube0.pe0", "sip0.cube1.hbm_ctrl.slice0") + assert len(path) >= 1 # succeeds without exception diff --git a/tests/test_tensor.py b/tests/test_tensor.py new file mode 100644 index 0000000..a89109f --- /dev/null +++ b/tests/test_tensor.py @@ -0,0 +1,282 @@ +import pytest + +from kernbench.policy.address.allocator import AddressConfig, AllocationError, PEMemAllocator +from kernbench.policy.placement.dp import ( + ShardSpec, + column_wise, + tiled_column_major, + replicate, + row_wise, + tiled_row_major, +) +from kernbench.runtime_api.kernel import ( + KernelLaunchMsg, + KernelRef, + MemoryReadMsg, + MemoryWriteMsg, + ScalarArg, + TensorArg, + TensorArgShard, +) +from kernbench.runtime_api.tensor import ( + TensorHandle, + TensorShard, + deploy_tensor, + 
dtype_itemsize, +) + +_MB = 1 << 20 +_GB = 1 << 30 + +_CFG = AddressConfig( + sip_count=2, + cubes_per_sip=16, + pes_per_cube=8, + hbm_bytes_per_cube=48 * _GB, + hbm_slices_per_cube=8, + tcm_bytes_per_pe=16 * _MB, + tcm_scheduler_reserved_bytes=4 * _MB, + sram_bytes_per_cube=32 * _MB, +) + + +def _make_allocators(num_pe: int = 8) -> dict[int, PEMemAllocator]: + return { + i: PEMemAllocator(rack_id=0, sip_id=0, cube_id=0, pe_id=i, cfg=_CFG) + for i in range(num_pe) + } + + +# ── Tensor types ───────────────────────────────────────────────────── + + +def test_tensor_shard_immutable(): + ts = TensorShard(sip=0, cube=0, pe=0, pa=0x1000, nbytes=4096, offset_bytes=0) + with pytest.raises(AttributeError): + ts.pa = 0x2000 # type: ignore[misc] + # hashable + {ts} + + +def test_tensor_handle_nbytes(): + th = TensorHandle( + name="A", + shape=(1024, 512), + dtype="fp16", + itemsize=2, + shards=(), + ) + assert th.nbytes == 1024 * 512 * 2 # 1 MB + + +# ── Message types (ADR-0012) ───────────────────────────────────────── + + +def test_memory_write_msg_fields(): + msg = MemoryWriteMsg( + correlation_id="c0", + request_id="r0", + dst_sip=0, + dst_cube=3, + dst_pe=5, + dst_pa=0xDEAD, + nbytes=4096, + pattern="zero", + ) + assert msg.msg_type == "memory_write" + assert msg.src_kind == "pattern" + assert msg.dst_pa == 0xDEAD + assert msg.pattern == "zero" + with pytest.raises(AttributeError): + msg.nbytes = 0 # type: ignore[misc] + + +def test_memory_read_msg_fields(): + msg = MemoryReadMsg( + correlation_id="c0", + request_id="r1", + src_sip=1, + src_cube=2, + src_pe=7, + src_pa=0xBEEF, + nbytes=2048, + ) + assert msg.msg_type == "memory_read" + assert msg.src_pa == 0xBEEF + assert msg.nbytes == 2048 + + +def test_kernel_launch_msg_fields(): + shard = TensorArgShard(sip=0, cube=0, pe=0, pa=0x100, nbytes=1024, offset_bytes=0) + targ = TensorArg(shards=(shard,)) + sarg = ScalarArg(dtype="fp32", value=1.0) + kref = KernelRef(name="gemm", kind="builtin") + msg = KernelLaunchMsg( + 
correlation_id="c0", + request_id="r2", + kernel_ref=kref, + args=(targ, sarg), + ) + assert msg.msg_type == "kernel_launch" + assert msg.kernel_ref.name == "gemm" + assert len(msg.args) == 2 + assert msg.args[0].arg_kind == "tensor" + assert msg.args[1].arg_kind == "scalar" + + +# ── Placement: column_wise ─────────────────────────────────────────── + + +def test_column_wise_placement(): + """(1024, 512) fp16 across 8 PEs → K axis split → 8 shards, each (1024, 64) = 128KB""" + shards = column_wise(shape=(1024, 512), itemsize=2, num_pe=8) + assert len(shards) == 8 + expected_nbytes = 1024 * 64 * 2 # 128 KB + for i, s in enumerate(shards): + assert s.pe_index == i + assert s.nbytes == expected_nbytes + # offsets are contiguous + assert shards[0].offset_bytes == 0 + assert shards[1].offset_bytes == expected_nbytes + # total coverage + assert sum(s.nbytes for s in shards) == 1024 * 512 * 2 + + +# ── Placement: row_wise ────────────────────────────────────────────── + + +def test_row_wise_placement(): + """(1024, 512) fp16 across 8 PEs → M axis split → 8 shards, each (128, 512) = 128KB""" + shards = row_wise(shape=(1024, 512), itemsize=2, num_pe=8) + assert len(shards) == 8 + expected_nbytes = 128 * 512 * 2 # 128 KB + for i, s in enumerate(shards): + assert s.pe_index == i + assert s.nbytes == expected_nbytes + assert shards[0].offset_bytes == 0 + assert sum(s.nbytes for s in shards) == 1024 * 512 * 2 + + +# ── Placement: replicate ───────────────────────────────────────────── + + +def test_replicate_placement(): + """(1024, 512) fp16 across 8 PEs → each PE gets full copy = 1MB""" + shards = replicate(shape=(1024, 512), itemsize=2, num_pe=8) + assert len(shards) == 8 + full_nbytes = 1024 * 512 * 2 # 1 MB + for i, s in enumerate(shards): + assert s.pe_index == i + assert s.nbytes == full_nbytes + assert s.offset_bytes == 0 # each is a full copy + + +# ── Placement: tiled_column_major ───────────────────────────────────── + + +def test_tiled_column_major(): + """(1024, 
512) tile=(256, 128) → 4×4=16 tiles, column-major → round-robin 8 PEs""" + shards = tiled_column_major( + shape=(1024, 512), itemsize=2, num_pe=8, tile_m=256, tile_k=128, + ) + # 4 tiles along M, 4 tiles along K → 16 tiles total + assert len(shards) == 16 + tile_bytes = 256 * 128 * 2 # 64 KB per tile + for s in shards: + assert s.nbytes == tile_bytes + # column-major: iterate K first, then M + # tile (m=0,k=0) → PE0, tile (m=0,k=1) → PE1, ..., (m=0,k=3) → PE3 + # tile (m=1,k=0) → PE4, tile (m=1,k=1) → PE5, ..., (m=1,k=3) → PE7 + # tile (m=2,k=0) → PE0, ... + assert shards[0].pe_index == 0 + assert shards[1].pe_index == 1 + assert shards[7].pe_index == 7 + assert shards[8].pe_index == 0 # wraps around + # total coverage + assert sum(s.nbytes for s in shards) == 1024 * 512 * 2 + + +# ── Placement: tiled_row_major ──────────────────────────────────────── + + +def test_tiled_row_major(): + """(1024, 512) tile=(256, 128) → 4×4=16 tiles, row-major → round-robin 8 PEs""" + shards = tiled_row_major( + shape=(1024, 512), itemsize=2, num_pe=8, tile_m=256, tile_k=128, + ) + assert len(shards) == 16 + tile_bytes = 256 * 128 * 2 + for s in shards: + assert s.nbytes == tile_bytes + # row-major: iterate M first, then K + # tile (m=0,k=0) → PE0, tile (m=1,k=0) → PE1, ..., (m=3,k=0) → PE3 + # tile (m=0,k=1) → PE4, tile (m=1,k=1) → PE5, ..., (m=3,k=1) → PE7 + # tile (m=0,k=2) → PE0, ... 
+ assert shards[0].pe_index == 0 + assert shards[1].pe_index == 1 + assert shards[7].pe_index == 7 + assert shards[8].pe_index == 0 # wraps around + # total coverage + assert sum(s.nbytes for s in shards) == 1024 * 512 * 2 + + +# ── deploy_tensor ──────────────────────────────────────────────────── + + +def test_deploy_tensor_hbm(): + """Deploy with column_wise placement → TensorHandle with valid PA shards.""" + allocs = _make_allocators() + placement = column_wise(shape=(1024, 512), itemsize=2, num_pe=8) + th = deploy_tensor( + name="W", + shape=(1024, 512), + dtype="fp16", + placement=placement, + allocators=allocs, + mem_kind="hbm", + ) + assert th.name == "W" + assert th.shape == (1024, 512) + assert th.dtype == "fp16" + assert th.itemsize == 2 + assert len(th.shards) == 8 + # each shard has a distinct PA + pas = [s.pa for s in th.shards] + assert len(set(pas)) == 8 + # each shard placed on correct PE + for i, s in enumerate(th.shards): + assert s.pe == i + assert s.sip == 0 + assert s.cube == 0 + + +def test_deploy_tensor_tcm(): + """Deploy with TCM → uses pe_tcm_addr allocation.""" + allocs = _make_allocators() + placement = [ShardSpec(pe_index=0, offset_bytes=0, nbytes=256)] + th = deploy_tensor( + name="small", + shape=(128,), + dtype="fp16", + placement=placement, + allocators=allocs, + mem_kind="tcm", + ) + assert len(th.shards) == 1 + assert th.shards[0].pe == 0 + assert th.shards[0].nbytes == 256 + + +def test_deploy_tensor_overflow(): + """Allocation exceeding PE HBM capacity raises AllocationError.""" + allocs = _make_allocators() + # 6 GB per PE slice, try to allocate 7 GB + big_shard = ShardSpec(pe_index=0, offset_bytes=0, nbytes=7 * _GB) + with pytest.raises(AllocationError): + deploy_tensor( + name="toobig", + shape=(1,), + dtype="int8", + placement=[big_shard], + allocators=allocs, + ) diff --git a/tests/test_topology_compile.py b/tests/test_topology_compile.py new file mode 100644 index 0000000..e8c4359 --- /dev/null +++ 
b/tests/test_topology_compile.py @@ -0,0 +1,409 @@ +from pathlib import Path + +from kernbench.topology.builder import load_topology + +TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml" + + +def _graph(): + return load_topology(TOPOLOGY_PATH) + + +# ── Full graph: node counts ────────────────────────────────────────── + + +def test_full_graph_node_count(): + g = _graph() + # 1 switch + # + 2 SIPs × (1 IO × 2 comps + 16 cubes × (cube_comps + 8 PEs × 6 pe_comps)) + # cube_comps: 9 (noc, m_cpu, sram, 2 bridge, 4 ucie) + # + 8 xbar.pe{0..7} [replaced xbar.top/xbar.bottom] + # + 8 hbm_slices = 25 + # = 1 + 2*(2 + 16*(25+48)) = 1 + 2*(2+1168) = 1 + 2340 = 2341 + assert len(g.nodes) == 2341 + + +def test_full_graph_edge_count(): + g = _graph() + # Per cube: 144 (88 cube-fabric + 56 PE-internal) + # cube-fabric: 8 pe→xbar.pe + 8 pe→noc + 8 noc→pe_cpu + # + 8 xbar.pe→slice + 8 slice→xbar.pe (bidirectional for response) + # + 12 xbar chain (3 pairs × 2 dir × 2 halves) + # + 8 xbar.pe↔bridge (pe0↔bL, pe4↔bL, pe3↔bR, pe7↔bR, ×2 dir each) + # + 4 noc→ucie + 4 ucie→noc (bidirectional) + # + 8 noc→xbar.pe + 8 xbar.pe→noc (bidirectional for response) + # + 1 m_cpu→noc + 1 noc→m_cpu + 1 noc→sram + 1 sram→noc = 88 + # Per SIP: 16*144 + 48 inter-cube(bidirectional) + 8 io↔cube(bidirectional) + # + 1 io_internal + 1 switch→io = 2362 + # Total: 2 * 2362 = 4724 + assert len(g.edges) == 4724 + + +# ── Full graph: specific nodes exist ───────────────────────────────── + + +def test_system_switch_exists(): + g = _graph() + assert "fabric.switch0" in g.nodes + assert g.nodes["fabric.switch0"].kind == "switch" + assert g.nodes["fabric.switch0"].pos_mm is None # abstract + + +def test_io_chiplet_nodes_exist(): + g = _graph() + for s in range(2): + assert f"sip{s}.io0.pcie_ep" in g.nodes + assert f"sip{s}.io0.io_cpu" in g.nodes + + +def test_cube_component_nodes_exist(): + g = _graph() + cp = "sip0.cube0" + for name in ("noc", "m_cpu", + "bridge.left", "bridge.right", + "ucie-N", 
"ucie-S", "ucie-E", "ucie-W", + "sram"): + assert f"{cp}.{name}" in g.nodes + # xbar.top/xbar.bottom replaced by per-PE xbar entry nodes + assert "sip0.cube0.xbar.top" not in g.nodes + assert "sip0.cube0.xbar.bottom" not in g.nodes + for pe in range(8): + node_id = f"{cp}.xbar.pe{pe}" + assert node_id in g.nodes, f"{node_id} missing" + assert g.nodes[node_id].kind == "xbar" + # HBM slices (one per PE) + for s in range(8): + assert f"{cp}.hbm_ctrl.slice{s}" in g.nodes + assert g.nodes[f"{cp}.hbm_ctrl.slice{s}"].kind == "hbm_ctrl" + + +def test_pe_component_nodes_exist(): + g = _graph() + for comp in ("pe_cpu", "pe_scheduler", "pe_dma", "pe_gemm", "pe_math", "pe_tcm"): + assert f"sip0.cube0.pe0.{comp}" in g.nodes + assert f"sip1.cube15.pe7.{comp}" in g.nodes + + +# ── Full graph: positions ──────────────────────────────────────────── + + +def test_hbm_ctrl_slices_at_cube_center(): + g = _graph() + # cube0 origin = (0, 0), cx=8.5, cy=7.0, hbm_ctrl at (cx-2, cy) + # all slices share the same physical position + for s in range(8): + node = g.nodes[f"sip0.cube0.hbm_ctrl.slice{s}"] + assert node.pos_mm == (6.5, 7.0) + + +def test_hbm_ctrl_slices_cube5_position(): + g = _graph() + # cube5 = col=1, row=1 -> origin = (1*18, 1*15) = (18, 15) + # hbm_ctrl = (18 + 6.5, 15 + 7.0) = (24.5, 22.0) + node = g.nodes["sip0.cube5.hbm_ctrl.slice0"] + assert node.pos_mm == (24.5, 22.0) + + +def test_ucie_ports_at_cube_edges(): + g = _graph() + # cube0 origin = (0, 0), cube_w=17, cube_h=14 + # UCIe nodes inset by half-size so edges touch boundary + assert g.nodes["sip0.cube0.ucie-N"].pos_mm == (8.5, 0.6) + assert g.nodes["sip0.cube0.ucie-S"].pos_mm == (8.5, 13.4) + assert g.nodes["sip0.cube0.ucie-W"].pos_mm == (1.0, 7.0) + assert g.nodes["sip0.cube0.ucie-E"].pos_mm == (16.0, 7.0) + + +# ── Full graph: edges ──────────────────────────────────────────────── + + +def _edge_set(g): + return {(e.src, e.dst) for e in g.edges} + + +def test_inter_cube_ucie_edges(): + es = _edge_set(_graph()) + # 
cube0 (0,0) E → cube1 (1,0) W + assert ("sip0.cube0.ucie-E", "sip0.cube1.ucie-W") in es + # cube0 (0,0) S → cube4 (0,1) N + assert ("sip0.cube0.ucie-S", "sip0.cube4.ucie-N") in es + + +def test_io_to_cube_edges(): + es = _edge_set(_graph()) + # io0 connects to cubes (0,0)..(3,0) on N side + assert ("sip0.io0.io_cpu", "sip0.cube0.ucie-N") in es + assert ("sip0.io0.io_cpu", "sip0.cube3.ucie-N") in es + + +def test_switch_to_io_edges(): + es = _edge_set(_graph()) + assert ("fabric.switch0", "sip0.io0.pcie_ep") in es + assert ("fabric.switch0", "sip1.io0.pcie_ep") in es + + +def test_pe_to_xbar_edges(): + es = _edge_set(_graph()) + cp = "sip0.cube0" + # Each PE connects to its own xbar entry (per-PE chain model) + for pe in range(8): + assert (f"{cp}.pe{pe}.pe_dma", f"{cp}.xbar.pe{pe}") in es + # Old shared xbar.top/bottom edges must NOT exist + assert (f"{cp}.pe0.pe_dma", f"{cp}.xbar.top") not in es + assert (f"{cp}.pe4.pe_dma", f"{cp}.xbar.bottom") not in es + + +def test_command_path_m_cpu_noc_pe_cpu(): + es = _edge_set(_graph()) + cp = "sip0.cube0" + # m_cpu ↔ noc (bidirectional) + assert (f"{cp}.m_cpu", f"{cp}.noc") in es + assert (f"{cp}.noc", f"{cp}.m_cpu") in es + # noc → pe_cpu for each PE + assert (f"{cp}.noc", f"{cp}.pe0.pe_cpu") in es + assert (f"{cp}.noc", f"{cp}.pe7.pe_cpu") in es + + +def test_pe_internal_edges(): + es = _edge_set(_graph()) + pp = "sip0.cube0.pe0" + assert (f"{pp}.pe_cpu", f"{pp}.pe_scheduler") in es + assert (f"{pp}.pe_scheduler", f"{pp}.pe_dma") in es + assert (f"{pp}.pe_scheduler", f"{pp}.pe_gemm") in es + assert (f"{pp}.pe_scheduler", f"{pp}.pe_math") in es + assert (f"{pp}.pe_dma", f"{pp}.pe_tcm") in es + assert (f"{pp}.pe_gemm", f"{pp}.pe_tcm") in es + assert (f"{pp}.pe_math", f"{pp}.pe_tcm") in es + + +def test_xbar_to_hbm_slice_edges(): + """Each xbar.pe{i} connects only to its own (local) HBM slice.""" + es = _edge_set(_graph()) + cp = "sip0.cube0" + # xbar.pe_i -> slice_i only (local Y-direction access) + for pe in range(8): + 
assert (f"{cp}.xbar.pe{pe}", f"{cp}.hbm_ctrl.slice{pe}") in es + # Negative: xbar.pe_i must NOT directly connect to a different slice + assert (f"{cp}.xbar.pe0", f"{cp}.hbm_ctrl.slice1") not in es + assert (f"{cp}.xbar.pe0", f"{cp}.hbm_ctrl.slice4") not in es + assert (f"{cp}.xbar.pe4", f"{cp}.hbm_ctrl.slice0") not in es + + +# ── Views: system ──────────────────────────────────────────────────── + + +def test_system_view_nodes(): + v = _graph().system_view + assert "fabric.switch0" in v.nodes + assert "sip0" in v.nodes + assert "sip1" in v.nodes + assert "sip0.io0" in v.nodes + assert "sip1.io0" in v.nodes + + +# ── Views: SIP ─────────────────────────────────────────────────────── + + +def test_sip_view_cube_count(): + v = _graph().sip_view + cube_nodes = [n for n in v.nodes if n.startswith("cube")] + assert len(cube_nodes) == 16 + + +def test_sip_view_io_chiplets(): + v = _graph().sip_view + assert "io0" in v.nodes + + +def test_sip_view_cube_positions(): + v = _graph().sip_view + # cube0 (0,0): center = (8.5, 6+7.0) = (8.5, 13.0) [io_margin=6] + x, y = v.nodes["cube0"].pos_mm + assert x == 8.5 + assert y == 13.0 + # cube1 (1,0): center = (18+8.5, 13.0) = (26.5, 13.0) + x1, y1 = v.nodes["cube1"].pos_mm + assert x1 == 26.5 + assert y1 == 13.0 + + +# ── Views: cube ────────────────────────────────────────────────────── + + +def test_cube_view_has_all_components(): + v = _graph().cube_view + expected = {"ucie-N", "ucie-S", "ucie-W", "ucie-E", + "m_cpu", "hbm_ctrl", + "bridge.left", "bridge.right", "noc", "sram", + "xbar.pe0", "xbar.pe1", "xbar.pe2", "xbar.pe3", + "xbar.pe4", "xbar.pe5", "xbar.pe6", "xbar.pe7", + "pe0", "pe1", "pe2", "pe3", "pe4", "pe5", "pe6", "pe7"} + assert set(v.nodes.keys()) == expected + + +def test_cube_view_hbm_at_center(): + v = _graph().cube_view + assert v.nodes["hbm_ctrl"].pos_mm == (6.5, 7.0) + assert v.nodes["noc"].pos_mm == (10.5, 7.0) + assert v.width_mm == 17.0 + assert v.height_mm == 14.0 + + +def 
test_cube_view_pe_corner_mapping(): + v = _graph().cube_view + ves = {(e.src, e.dst) for e in v.edges} + # Each PE connects to its own xbar entry (chain model) + for i in range(8): + assert (f"pe{i}", f"xbar.pe{i}") in ves + # Old shared xbar.top/bottom mapping must not exist + assert ("pe0", "xbar.top") not in ves + assert ("pe4", "xbar.bottom") not in ves + + +# ── Views: PE ──────────────────────────────────────────────────────── + + +def test_pe_view_has_all_components(): + v = _graph().pe_view + assert set(v.nodes.keys()) == { + "pe_cpu", "pe_scheduler", "pe_dma", "pe_gemm", "pe_math", "pe_tcm" + } + + +def test_pe_view_edges(): + v = _graph().pe_view + ves = {(e.src, e.dst) for e in v.edges} + assert ("pe_cpu", "pe_scheduler") in ves + assert ("pe_scheduler", "pe_dma") in ves + assert ("pe_scheduler", "pe_gemm") in ves + assert ("pe_scheduler", "pe_math") in ves + assert ("pe_dma", "pe_tcm") in ves + assert ("pe_gemm", "pe_tcm") in ves + assert ("pe_math", "pe_tcm") in ves + + +# ── SRAM ──────────────────────────────────────────────────────────── + + +def test_sram_node_exists(): + g = _graph() + assert "sip0.cube0.sram" in g.nodes + assert g.nodes["sip0.cube0.sram"].kind == "sram" + + +def test_noc_to_sram_edges(): + es = _edge_set(_graph()) + cp = "sip0.cube0" + assert (f"{cp}.noc", f"{cp}.sram") in es + assert (f"{cp}.sram", f"{cp}.noc") in es + + +# ── PE_DMA → NOC (non-HBM data path) ─────────────────────────────── + + +def test_pe_dma_to_noc_edges(): + es = _edge_set(_graph()) + cp = "sip0.cube0" + for i in range(8): + assert (f"{cp}.pe{i}.pe_dma", f"{cp}.noc") in es + + +# ── Bridge connects XBAR halves (not NOC) ────────────────────────── + + +def test_bridge_connects_xbar_halves(): + """bridge.left connects leftmost PE nodes (pe0 top, pe4 bottom). 
+ bridge.right connects rightmost PE nodes (pe3 top, pe7 bottom).""" + es = _edge_set(_graph()) + cp = "sip0.cube0" + # bridge.left ↔ pe0 (top-left) and pe4 (bottom-left) + assert (f"{cp}.xbar.pe0", f"{cp}.bridge.left") in es + assert (f"{cp}.bridge.left", f"{cp}.xbar.pe0") in es + assert (f"{cp}.xbar.pe4", f"{cp}.bridge.left") in es + assert (f"{cp}.bridge.left", f"{cp}.xbar.pe4") in es + # bridge.right ↔ pe3 (top-right) and pe7 (bottom-right) + assert (f"{cp}.xbar.pe3", f"{cp}.bridge.right") in es + assert (f"{cp}.bridge.right", f"{cp}.xbar.pe3") in es + assert (f"{cp}.xbar.pe7", f"{cp}.bridge.right") in es + assert (f"{cp}.bridge.right", f"{cp}.xbar.pe7") in es + # Old xbar.top/bottom ↔ bridge edges must NOT exist + assert (f"{cp}.xbar.top", f"{cp}.bridge.left") not in es + assert (f"{cp}.xbar.bottom", f"{cp}.bridge.left") not in es + + +def test_no_bridge_to_noc_edges(): + es = _edge_set(_graph()) + cp = "sip0.cube0" + assert (f"{cp}.bridge.left", f"{cp}.noc") not in es + assert (f"{cp}.bridge.right", f"{cp}.noc") not in es + + +# ── Cube view: new edges ──────────────────────────────────────────── + + +def test_cube_view_pe_to_noc(): + v = _graph().cube_view + ves = {(e.src, e.dst) for e in v.edges} + for i in range(8): + assert (f"pe{i}", "noc") in ves + + +def test_cube_view_sram(): + v = _graph().cube_view + assert "sram" in v.nodes + ves = {(e.src, e.dst) for e in v.edges} + assert ("noc", "sram") in ves + assert ("sram", "noc") in ves + + +def test_cube_view_bridge_xbar(): + v = _graph().cube_view + ves = {(e.src, e.dst) for e in v.edges} + # bridge.left connects pe0 (top-left) ↔ pe4 (bottom-left) + assert ("xbar.pe0", "bridge.left") in ves + assert ("bridge.left", "xbar.pe0") in ves + assert ("xbar.pe4", "bridge.left") in ves + assert ("bridge.left", "xbar.pe4") in ves + # bridge.right connects pe3 (top-right) ↔ pe7 (bottom-right) + assert ("xbar.pe3", "bridge.right") in ves + assert ("bridge.right", "xbar.pe3") in ves + assert ("xbar.pe7", 
"bridge.right") in ves + assert ("bridge.right", "xbar.pe7") in ves + + +# ── Chain xbar: new topology edges ────────────────────────────────── + + +def test_xbar_chain_edges(): + """Adjacent xbar.pe nodes within each half are bidirectionally connected.""" + es = _edge_set(_graph()) + cp = "sip0.cube0" + # Top chain: pe0 ↔ pe1 ↔ pe2 ↔ pe3 (NW→NE direction) + for a, b in [(0, 1), (1, 2), (2, 3)]: + assert (f"{cp}.xbar.pe{a}", f"{cp}.xbar.pe{b}") in es, f"missing pe{a}→pe{b}" + assert (f"{cp}.xbar.pe{b}", f"{cp}.xbar.pe{a}") in es, f"missing pe{b}→pe{a}" + # Bottom chain: pe4 ↔ pe5 ↔ pe6 ↔ pe7 + for a, b in [(4, 5), (5, 6), (6, 7)]: + assert (f"{cp}.xbar.pe{a}", f"{cp}.xbar.pe{b}") in es, f"missing pe{a}→pe{b}" + assert (f"{cp}.xbar.pe{b}", f"{cp}.xbar.pe{a}") in es, f"missing pe{b}→pe{a}" + # Negative: no cross-chain direct edges + assert (f"{cp}.xbar.pe0", f"{cp}.xbar.pe2") not in es + assert (f"{cp}.xbar.pe0", f"{cp}.xbar.pe4") not in es + + +def test_ucie_noc_reverse_edges(): + """UCIe ports must have reverse edges back to NOC (bidirectional).""" + es = _edge_set(_graph()) + cp = "sip0.cube1" # non-edge cube to avoid io-cube edges + for port in ("N", "S", "E", "W"): + assert (f"{cp}.ucie-{port}", f"{cp}.noc") in es, \ + f"missing ucie-{port}->noc reverse edge" + + +def test_noc_to_xbar_pe_edges(): + """NOC connects to all xbar.pe nodes (for remote cube HBM access).""" + es = _edge_set(_graph()) + cp = "sip0.cube0" + for pe in range(8): + assert (f"{cp}.noc", f"{cp}.xbar.pe{pe}") in es, \ + f"missing noc->xbar.pe{pe}" diff --git a/tests/test_topology_load.py b/tests/test_topology_load.py new file mode 100644 index 0000000..a30edf5 --- /dev/null +++ b/tests/test_topology_load.py @@ -0,0 +1,60 @@ +from pathlib import Path + +from kernbench.topology.builder import _read_spec, resolve_topology + +TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml" + + +def test_topology_yaml_loads_without_error(): + # _compile_graph is still stubbed (returns None); load 
must not raise + resolve_topology(str(TOPOLOGY_PATH)) + + +def test_pe_layout_structure(): + spec = _read_spec(TOPOLOGY_PATH) + pe_layout = spec["cube"]["pe_layout"] + assert set(pe_layout["corners"]) == {"NW", "NE", "SW", "SE"} + assert pe_layout["pe_per_corner"] == 2 + # derived total must equal original pe_per_cube: 8 + assert pe_layout["pe_per_corner"] * len(pe_layout["corners"]) == 8 + + +def test_pe_template_components(): + spec = _read_spec(TOPOLOGY_PATH) + comps = spec["cube"]["pe_template"]["components"] + assert set(comps.keys()) == { + "pe_cpu", "pe_scheduler", "pe_dma", "pe_gemm", "pe_math", "pe_tcm" + } + + +def test_pe_template_links_present(): + spec = _read_spec(TOPOLOGY_PATH) + links = spec["cube"]["pe_template"]["links"] + required = { + "pe_cpu_to_scheduler_mm", + "scheduler_to_dma_mm", + "scheduler_to_gemm_mm", + "scheduler_to_math_mm", + "dma_to_tcm_bw_gbs", "dma_to_tcm_mm", + "gemm_to_tcm_bw_gbs", "gemm_to_tcm_mm", + "math_to_tcm_bw_gbs", "math_to_tcm_mm", + } + assert required.issubset(set(links.keys())) + + +def test_pe_dma_not_in_cube_components(): + spec = _read_spec(TOPOLOGY_PATH) + assert "pe_dma" not in spec["cube"]["components"] + + +def test_pe_per_cube_removed(): + spec = _read_spec(TOPOLOGY_PATH) + assert "pe_per_cube" not in spec["cube"].get("device", {}) + + +def test_shared_resource_accel_slot(): + # ADR-0014 D4: PE_GEMM and PE_MATH share PE_ACCEL capacity = 1 + spec = _read_spec(TOPOLOGY_PATH) + comps = spec["cube"]["pe_template"]["components"] + assert comps["pe_gemm"]["attrs"]["shared_resource"] == "accel_slot" + assert comps["pe_math"]["attrs"]["shared_resource"] == "accel_slot" diff --git a/tests/test_topology_visualize.py b/tests/test_topology_visualize.py new file mode 100644 index 0000000..848c42d --- /dev/null +++ b/tests/test_topology_visualize.py @@ -0,0 +1,81 @@ +from pathlib import Path + +from kernbench.topology.builder import load_topology +from kernbench.topology.visualizer import emit_diagrams + +TOPOLOGY_PATH = 
Path(__file__).parent.parent / "topology.yaml" + +VIEW_FILES = ["system_view.svg", "sip_view.svg", "cube_view.svg", "pe_view.svg"] + + +def _emit(tmp_path: Path) -> list[Path]: + graph = load_topology(TOPOLOGY_PATH) + return emit_diagrams(graph, tmp_path) + + +def test_emit_creates_all_svg_files(tmp_path): + created = _emit(tmp_path) + assert len(created) == 4 + for name in VIEW_FILES: + assert (tmp_path / name).exists() + assert (tmp_path / name).stat().st_size > 0 + + +def test_svg_output_is_deterministic(tmp_path): + graph = load_topology(TOPOLOGY_PATH) + emit_diagrams(graph, tmp_path) + first = {name: (tmp_path / name).read_text() for name in VIEW_FILES} + emit_diagrams(graph, tmp_path) + second = {name: (tmp_path / name).read_text() for name in VIEW_FILES} + for name in VIEW_FILES: + assert first[name] == second[name], f"{name} is not deterministic" + + +def test_cube_svg_contains_hbm_ctrl(tmp_path): + _emit(tmp_path) + svg = (tmp_path / "cube_view.svg").read_text() + assert "HBM CTRL" in svg + + +def test_cube_svg_contains_ucie_ports(tmp_path): + _emit(tmp_path) + svg = (tmp_path / "cube_view.svg").read_text() + for port in ("UCIe-N", "UCIe-S", "UCIe-W", "UCIe-E"): + assert port in svg + + +def test_cube_svg_contains_pe_nodes(tmp_path): + _emit(tmp_path) + svg = (tmp_path / "cube_view.svg").read_text() + for i in range(8): + assert f"PE{i}" in svg + + +def test_pe_svg_contains_all_components(tmp_path): + _emit(tmp_path) + svg = (tmp_path / "pe_view.svg").read_text() + for comp in ("PE CPU", "PE SCHEDULER", "PE DMA", "PE GEMM", "PE MATH", "PE TCM"): + assert comp in svg + + +def test_sip_svg_contains_cubes(tmp_path): + _emit(tmp_path) + svg = (tmp_path / "sip_view.svg").read_text() + assert "CUBE (0,0)" in svg + assert "CUBE (3,3)" in svg + + +def test_system_svg_contains_switch_and_sips(tmp_path): + _emit(tmp_path) + svg = (tmp_path / "system_view.svg").read_text() + assert "Fabric Switch" in svg + assert "SIP 0" in svg + assert "SIP 1" in svg + + +def 
test_svg_is_valid_xml(tmp_path): + _emit(tmp_path) + for name in VIEW_FILES: + svg = (tmp_path / name).read_text() + assert svg.startswith("") diff --git a/tests/test_triton_emu.py b/tests/test_triton_emu.py new file mode 100644 index 0000000..036fc06 --- /dev/null +++ b/tests/test_triton_emu.py @@ -0,0 +1,349 @@ +"""Tests for Triton emulator: TLContext, command generation, kernel registry.""" +from kernbench.common.pe_commands import ( + CompletionHandle, + CompositeCmd, + DmaReadCmd, + DmaWriteCmd, + GemmCmd, + MathCmd, + PeCpuOverheadCmd, + TensorHandle, + WaitCmd, +) +from kernbench.triton_emu.registry import clear_registry, get_kernel, register_kernel +from kernbench.triton_emu.tl_context import TLContext, run_kernel + + +def _ctx(**kwargs) -> TLContext: + return TLContext(dispatch_cycles=0, **kwargs) + + +def _ctx_with_overhead(**kwargs) -> TLContext: + return TLContext(dispatch_cycles=1, **kwargs) + + +# ── 1. tl.load → DmaReadCmd ────────────────────────────────────── + + +def test_tl_load_generates_dma_read(): + tl = _ctx() + h = tl.load(0x1000, shape=(32, 64), dtype="f16") + assert isinstance(h, TensorHandle) + assert h.shape == (32, 64) + assert h.nbytes == 32 * 64 * 2 + cmds = tl.commands + assert len(cmds) == 1 + assert isinstance(cmds[0], DmaReadCmd) + assert cmds[0].src_pa == 0x1000 + assert cmds[0].nbytes == 32 * 64 * 2 + + +# ── 2. tl.store → DmaWriteCmd ──────────────────────────────────── + + +def test_tl_store_generates_dma_write(): + tl = _ctx() + h = tl.load(0x1000, shape=(16, 16), dtype="f32") + tl.store(0x2000, h) + cmds = [c for c in tl.commands if isinstance(c, DmaWriteCmd)] + assert len(cmds) == 1 + assert cmds[0].dst_pa == 0x2000 + assert cmds[0].nbytes == 16 * 16 * 4 + + +# ── 3. 
# tl.dot → GemmCmd ──────────────────────────────────────────


def test_tl_dot_generates_gemm_cmd():
    """tl.dot on (32, 64) x (64, 16) handles records exactly one GemmCmd.

    The returned handle must have shape (32, 16) and the recorded command
    must carry m=32, k=64, n=16.
    """
    tl = _ctx()
    a = tl.load(0x1000, shape=(32, 64), dtype="f16")
    b = tl.load(0x2000, shape=(64, 16), dtype="f16")
    out = tl.dot(a, b)
    assert out.shape == (32, 16)
    cmds = [c for c in tl.commands if isinstance(c, GemmCmd)]
    assert len(cmds) == 1
    assert cmds[0].m == 32
    assert cmds[0].k == 64
    assert cmds[0].n == 16


# ── 4. tl.exp, tl.sqrt etc. → MathCmd ────────────────────────────


def test_tl_math_unary_ops():
    """Each unary helper returns a same-shape handle and records one MathCmd."""
    tl = _ctx()
    x = tl.load(0x1000, shape=(8, 8), dtype="f16")
    unary_ops = [
        ("exp", tl.exp), ("log", tl.log), ("sqrt", tl.sqrt),
        ("abs", tl.abs), ("sigmoid", tl.sigmoid),
        ("cos", tl.cos), ("sin", tl.sin),
    ]
    for _, op_fn in unary_ops:
        result = op_fn(x)
        assert isinstance(result, TensorHandle)
        assert result.shape == x.shape

    ops = [c.op for c in tl.commands if isinstance(c, MathCmd)]
    # One command per helper, in call order; the expected names come from
    # the same table that drives the calls so the two cannot drift apart.
    assert ops == [op_name for op_name, _ in unary_ops]


# ── 5. a + b, a * b → MathCmd ────────────────────────────────────


def test_tl_math_binary_ops():
    """Arithmetic operators on handles record add/sub/mul/div MathCmds in order."""
    tl = _ctx()

    def binary_kernel(tl):
        a = tl.load(0x1000, shape=(4, 4), dtype="f16")
        b = tl.load(0x2000, shape=(4, 4), dtype="f16")
        _ = a + b
        _ = a - b
        _ = a * b
        _ = a / b

    # The operators are exercised inside run_kernel because, per the notes
    # left in the original test, they need an active context.
    run_kernel(binary_kernel, tl)
    ops = [c.op for c in tl.commands if isinstance(c, MathCmd)]
    assert ops == ["add", "sub", "mul", "div"]


# ── 6. 
# tl.sum, tl.max → MathCmd with axis ────────────────────────


def test_tl_reduction_ops():
    """Reductions keep the reduced axis at size 1 and record op plus axis."""
    ctx = _ctx()
    src = ctx.load(0x1000, shape=(32, 64), dtype="f16")
    row_sum = ctx.sum(src, axis=1)
    col_max = ctx.max(src, axis=0)
    assert row_sum.shape == (32, 1)
    assert col_max.shape == (1, 64)

    recorded = [cmd for cmd in ctx.commands if isinstance(cmd, MathCmd)]
    sum_cmd = recorded[0]
    assert sum_cmd.op == "sum"
    assert sum_cmd.axis == 1
    max_cmd = recorded[1]
    assert max_cmd.op == "max"
    assert max_cmd.axis == 0


# ── 7. tl.composite → CompositeCmd + CompletionHandle ────────────


def test_tl_composite_nonblocking():
    """tl.composite returns a CompletionHandle and records one CompositeCmd."""
    ctx = _ctx()
    lhs = ctx.load(0x1000, shape=(32, 64), dtype="f16")
    rhs = ctx.load(0x2000, shape=(64, 32), dtype="f16")
    handle = ctx.composite(op="gemm", a=lhs, b=rhs, out_ptr=0x3000)
    assert isinstance(handle, CompletionHandle)

    composites = [cmd for cmd in ctx.commands if isinstance(cmd, CompositeCmd)]
    assert len(composites) == 1
    (only,) = composites
    assert only.op == "gemm"
    assert only.out_pa == 0x3000
    # Output is M×N elements of a 2-byte dtype.
    assert only.out_nbytes == 32 * 32 * 2


# ── 8. tl.wait(handle) → WaitCmd ─────────────────────────────────


def test_tl_wait_specific():
    """Waiting on a handle records a single WaitCmd that carries that handle."""
    ctx = _ctx()
    operand = ctx.load(0x1000, shape=(4, 4), dtype="f16")
    handle = ctx.composite(op="gemm", a=operand, b=operand, out_ptr=0x2000)
    ctx.wait(handle)

    waits = [cmd for cmd in ctx.commands if isinstance(cmd, WaitCmd)]
    assert len(waits) == 1
    assert waits[0].handle == handle


# ── 9. tl.wait() → WaitCmd(handle=None) ──────────────────────────


def test_tl_wait_all():
    """Waiting with no argument records a single WaitCmd whose handle is None."""
    ctx = _ctx()
    ctx.wait()

    waits = [cmd for cmd in ctx.commands if isinstance(cmd, WaitCmd)]
    assert len(waits) == 1
    assert waits[0].handle is None


# ── 10. tl.cycles → PeCpuOverheadCmd ─────────────────────────────


def test_tl_cycles():
    """tl.cycles(10) records exactly one PeCpuOverheadCmd with cycles == 10."""
    ctx = _ctx()
    ctx.cycles(10)

    assert len(ctx.commands) == 1
    overhead = ctx.commands[0]
    assert isinstance(overhead, PeCpuOverheadCmd)
    assert overhead.cycles == 10


# ── 11. 
# tl.program_id ────────────────────────────────────────────


def test_tl_program_id():
    """program_id(0) and num_programs(0) echo the constructor's pe_id / num_programs."""
    tl = TLContext(pe_id=5, num_programs=8)
    assert tl.program_id(0) == 5
    assert tl.num_programs(0) == 8


# ── 12. tl.arange, tl.zeros, tl.full ─────────────────────────────


def test_tl_arange_zeros_full():
    """Constructor helpers return handles with the requested shape and byte size."""
    tl = _ctx()
    r = tl.arange(0, 16, dtype="i32")
    assert r.shape == (16,)
    assert r.dtype == "i32"

    z = tl.zeros((4, 8), dtype="f16")
    assert z.shape == (4, 8)
    assert z.nbytes == 4 * 8 * 2  # 2 bytes per f16 element

    f = tl.full((2, 3), value=1.0, dtype="f32")
    assert f.shape == (2, 3)
    assert f.nbytes == 2 * 3 * 4  # 4 bytes per f32 element


# ── 13. tl.trans → shape change, no command ───────────────────────


def test_tl_trans_shape():
    """tl.trans swaps the two dimensions, keeps the handle id, records no command."""
    tl = _ctx()
    h = tl.load(0x1000, shape=(32, 64), dtype="f16")
    t = tl.trans(h)
    assert t.shape == (64, 32)
    assert t.id == h.id  # same underlying data
    # Only DmaReadCmd from load, no command from trans
    assert len(tl.commands) == 1
    assert isinstance(tl.commands[0], DmaReadCmd)


# ── 14. Kernel registry ──────────────────────────────────────────


def test_kernel_registry():
    """A registered kernel is returned by identity from get_kernel."""
    def my_kernel(tl):
        pass

    clear_registry()
    try:
        register_kernel("test_kern", my_kernel)
        assert get_kernel("test_kern") is my_kernel
    finally:
        # Clean up even when an assertion fails so the entry cannot leak
        # into tests that run afterwards.
        clear_registry()


def test_kernel_registry_missing():
    """Looking up a name that was never registered raises KeyError."""
    import pytest  # kept function-local, as in the original test

    clear_registry()
    with pytest.raises(KeyError):
        get_kernel("nonexistent")


def test_kernel_registry_duplicate():
    """Registering the same name twice raises ValueError."""
    import pytest  # kept function-local, as in the original test

    clear_registry()
    try:
        register_kernel("dup", lambda tl: None)
        with pytest.raises(ValueError):
            register_kernel("dup", lambda tl: None)
    finally:
        # Clean up even when the expected error is not raised.
        clear_registry()


# ── 15. 
GEMM kernel → correct command sequence ─────────────────── + + +def test_gemm_kernel_command_sequence(): + """32×64 × 64×32 GEMM kernel produces [DmaRead, DmaRead, Composite].""" + def gemm_kernel(a_ptr, b_ptr, out_ptr, tl): + pid = tl.program_id(0) + a = tl.load(a_ptr, shape=(32, 64), dtype="f16") + b = tl.load(b_ptr + pid * 64 * 32 * 2, shape=(64, 32), dtype="f16") + tl.composite(op="gemm", a=a, b=b, out_ptr=out_ptr + pid * 32 * 32 * 2) + + tl = _ctx(pe_id=3) + run_kernel(gemm_kernel, tl, a_ptr=0x1000, b_ptr=0x2000, out_ptr=0x3000) + types = [type(c).__name__ for c in tl.commands] + assert types == ["DmaReadCmd", "DmaReadCmd", "CompositeCmd"] + + +# ── 16. Attention kernel → correct command sequence ─────────────── + + +def test_attention_kernel_command_sequence(): + """Attention kernel: load→dot→math ops→dot→store.""" + def attention_kernel(q_ptr, k_ptr, v_ptr, out_ptr, tl, + seq_len=16, head_dim=8): + pid = tl.program_id(0) + q = tl.load(q_ptr, shape=(seq_len, head_dim), dtype="f16") + k = tl.load(k_ptr, shape=(head_dim, seq_len), dtype="f16") + scores = tl.dot(q, k) + row_max = tl.max(scores, axis=1) + scores = scores - row_max + scores = tl.exp(scores) + row_sum = tl.sum(scores, axis=1) + scores = scores / row_sum + v = tl.load(v_ptr, shape=(seq_len, head_dim), dtype="f16") + out = tl.dot(scores, v) + tl.store(out_ptr, out) + + tl = _ctx(pe_id=0) + run_kernel( + attention_kernel, tl, + q_ptr=0x1000, k_ptr=0x2000, v_ptr=0x3000, out_ptr=0x4000, + ) + types = [type(c).__name__ for c in tl.commands] + # load, load, dot, max, sub, exp, sum, div, load, dot, store + assert types == [ + "DmaReadCmd", "DmaReadCmd", # load Q, K + "GemmCmd", # Q @ K + "MathCmd", "MathCmd", "MathCmd", # max, sub, exp + "MathCmd", "MathCmd", # sum, div + "DmaReadCmd", # load V + "GemmCmd", # scores @ V + "DmaWriteCmd", # store output + ] + # Verify math ops + math_cmds = [c for c in tl.commands if isinstance(c, MathCmd)] + math_ops = [c.op for c in math_cmds] + assert math_ops == ["max", 
"sub", "exp", "sum", "div"] + + +# ── 17. Dispatch overhead auto-inserted ─────────────────────────── + + +def test_dispatch_overhead_inserted(): + """Each tl API call auto-inserts PeCpuOverheadCmd when dispatch_cycles > 0.""" + tl = _ctx_with_overhead() + a = tl.load(0x1000, shape=(4, 4), dtype="f16") + tl.store(0x2000, a) + types = [type(c).__name__ for c in tl.commands] + # overhead, load, overhead, store + assert types == [ + "PeCpuOverheadCmd", "DmaReadCmd", + "PeCpuOverheadCmd", "DmaWriteCmd", + ] + + +# ── 18. where operation ────────────────────────────────────────── + + +def test_tl_where(): + tl = _ctx() + cond = tl.load(0x1000, shape=(4, 4), dtype="i32") + a = tl.load(0x2000, shape=(4, 4), dtype="f16") + b = tl.load(0x3000, shape=(4, 4), dtype="f16") + out = tl.where(cond, a, b) + assert isinstance(out, TensorHandle) + math_cmds = [c for c in tl.commands if isinstance(c, MathCmd)] + assert len(math_cmds) == 1 + assert math_cmds[0].op == "where" + assert len(math_cmds[0].inputs) == 3 diff --git a/topology.yaml b/topology.yaml new file mode 100644 index 0000000..62c9fe8 --- /dev/null +++ b/topology.yaml @@ -0,0 +1,126 @@ + +system: + ns_per_mm: 0.01 # wire propagation delay: 10 ps/mm (on-chip silicon) + + sips: + count: 2 + + components: + switch: { kind: switch, impl: switch_v1, attrs: { overhead_ns: 5.0 } } + + links: + io_ep_to_switch: + kind: pcie + bw_gbs_per_ep: 768.0 + distance_mm: 20.0 + +sip: + cube_mesh: { w: 4, h: 4 } + + iochiplet: + components: + pcie_ep: { kind: pcie_ep, impl: pcie_ep_v1, attrs: { overhead_ns: 5.0 } } + io_cpu: { kind: io_cpu, impl: io_cpu_v1, attrs: { overhead_ns: 10.0 } } + links: + pcie_ep_to_io_cpu_bw_gbs: 256.0 # matches system.links.io_ep_to_switch.bw_gbs_per_ep + pcie_ep_to_io_cpu_mm: 1.0 + io_cpu_to_ucie_bw_gbs: 512.0 # matches ucie.phy_bw_gbs per PHY + io_cpu_to_ucie_mm: 1.5 + instances: + - id: io0 + place: { side: N, offset_norm: 0.5 } + ucie: { phy_bw_gbs: 512.0, phys: [P0, P1, P2, P3] } + cube_ports: + - { cube: 
{xy: [0,0]}, cube_side: N, phy: P0, distance_mm: 2.0 } + - { cube: {xy: [1,0]}, cube_side: N, phy: P1, distance_mm: 2.0 } + - { cube: {xy: [2,0]}, cube_side: N, phy: P2, distance_mm: 2.0 } + - { cube: {xy: [3,0]}, cube_side: N, phy: P3, distance_mm: 2.0 } + + links: + inter_cube_mesh: + bw_gbs_per_ucie_phy: 512.0 + distance_mm_across_seam: 1.0 + routing: { algo: xy } + +cube: + geometry: + cube_mm: { w: 17.0, h: 14.0 } + hbm_mm: { w: 9.0, h: 5.0 } + ucie_mm: { size: 2.0 } + + pe_layout: + corners: [NW, NE, SW, SE] # N corners → xbar top row; S corners → xbar bottom row + pe_per_corner: 2 # total PEs per cube: 4 * 2 = 8 + + pe_template: + components: + pe_cpu: { kind: pe_cpu, impl: pe_cpu_v1, attrs: { overhead_ns: 2.0 } } + pe_scheduler: { kind: pe_scheduler, impl: pe_scheduler_v1, attrs: { overhead_ns: 1.0 } } + pe_dma: { kind: pe_dma, impl: pe_dma_v1, attrs: { rd_engines: 1, wr_engines: 1 } } + pe_gemm: { kind: pe_gemm, impl: pe_gemm_v1, attrs: { overhead_ns: 0.0, shared_resource: accel_slot, peak_tflops_f16: 8.0 } } + pe_math: { kind: pe_math, impl: pe_math_v1, attrs: { overhead_ns: 0.0, shared_resource: accel_slot } } + pe_tcm: { kind: pe_tcm, impl: pe_tcm_v1, attrs: + { size_mb: 16 } } + links: + pe_cpu_to_scheduler_mm: 0.5 + scheduler_to_dma_mm: 0.5 + scheduler_to_gemm_mm: 0.5 + scheduler_to_math_mm: 0.5 + dma_to_tcm_bw_gbs: 512.0 + dma_to_tcm_mm: 0.5 + gemm_to_tcm_bw_gbs: 512.0 # GEMM reads inputs from TCM (ADR-0014 D5) + gemm_to_tcm_mm: 0.5 + math_to_tcm_bw_gbs: 512.0 + math_to_tcm_mm: 0.5 + + memory_map: + hbm_total_gb_per_cube: 48 + hbm_slices_per_cube: 8 + hbm_total_bw_gbs: 1024.0 + + components: + noc: { kind: noc, impl: noc_2d_mesh_v1, attrs: { overhead_ns: 0.0 } } + m_cpu: { kind: m_cpu, impl: m_cpu_v1, attrs: { overhead_ns: 5.0 } } + xbar: + pe: { kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 2.0 } } + bridges: + - { id: left, kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 1.0 } } + - { id: right, kind: xbar, impl: xbar_v1, attrs: { overhead_ns: 
1.0 } } + hbm_ctrl: { kind: hbm_ctrl, impl: hbm_ctrl_v1, attrs: { capacity: 1 } } + sram: { kind: sram, impl: sram_v1, attrs: { size_mb: 32, overhead_ns: 2.0 } } + + ucie: + decompose: true + ports: [N, S, E, W] + overhead_ns: 1.0 + + links: + pe_to_xbar_bw_gbs: 256.0 # per-PE effective (2048 / 8 PEs) + xbar_to_hbm_bw_gbs: 256.0 # per-PE effective (2048 / 8 PEs) + xbar_to_bridge_bw_gbs: 128.0 # bridge BW (same as xbar chain BW) + xbar_x_bw_gbs: 128.0 # X-direction BW for xbar chain traversal + xbar_chain_intra_corner_mm: 2.0 # xbar wire distance within same corner PE pair + xbar_chain_inter_corner_mm: 10.0 # xbar wire distance between corner pairs (NW↔NE, SW↔SE) + xbar_row_n_to_bridge_mm: 3.0 + xbar_row_s_to_bridge_mm: 3.0 + xbar_to_hbm_mm: 2.5 + pe_to_xbar_row_n_mm: 6.0 + pe_to_xbar_row_s_mm: 6.0 + pe_dma_to_noc_mm: 0.0 # noc is distributed; distance modeled as 0 + pe_dma_to_noc_bw_gbs: 512.0 # PE non-HBM data path BW + noc_to_xbar: + per_connection_bw_gbs: 128.0 # BW per NOC connection + noc_to_sram_mm: 0.0 # noc is distributed; distance modeled as 0 + noc_to_sram: + per_connection_bw_gbs: 128.0 # BW per NOC connection + n_connections: 4 # 4 × 128 = 512 GB/s aggregate + noc_to_ucie: + per_connection_bw_gbs: 128.0 # BW per NOC connection + n_connections: 4 # 4 × 128 = 512 GB/s = UCIe PHY BW + m_cpu_to_noc_mm: 0.0 # noc is distributed; distance modeled as 0 + noc_to_pe_cpu_mm: 0.0 # noc is distributed; distance modeled as 0 + +visualization: + emit_views: [system, sip, cube] + sip_ids: [0] + cubes: [0, 9, 15]