commit - release 1
@@ -0,0 +1,15 @@
|
|||||||
|
{
|
||||||
|
"permissions": {
|
||||||
|
"allow": [
|
||||||
|
"Bash(python -c \":*)",
|
||||||
|
"Bash(xargs ls -la)",
|
||||||
|
"Bash(wc -l /c/Users/ywkang/kernbench/src/kernbench/policy/address/phyaddr.py /c/Users/ywkang/kernbench/src/kernbench/runtime_api/*.py)",
|
||||||
|
"Bash(python scripts/gen_placement_diagrams.py)",
|
||||||
|
"Bash(python -c:*)",
|
||||||
|
"Bash(python -m kernbench.cli.main probe --topology topology.yaml)",
|
||||||
|
"Bash(xargs grep -l \"class.*ComponentBase\\\\|class.*DefaultComponent\")",
|
||||||
|
"Bash(python -m pytest tests/test_probe.py -v)",
|
||||||
|
"Bash(python -m pytest tests/test_probe.py tests/test_component_registry.py -v)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,36 @@
|
|||||||
|
{
|
||||||
|
"permissions": {
|
||||||
|
"allow": [
|
||||||
|
"Bash(python -m ruff check --select I --fix src/kernbench/runtime_api/context.py src/kernbench/runtime_api/bench_runner.py src/kernbench/cli/main.py)",
|
||||||
|
"Bash(python -m pytest tests/ -q)",
|
||||||
|
"Bash(python -m pytest tests/ -v)",
|
||||||
|
"Bash(python -m pytest tests/test_topology_load.py tests/test_cli.py -v)",
|
||||||
|
"Bash(python -c \":*)",
|
||||||
|
"Bash(python -m pytest tests/ -q --tb=no)",
|
||||||
|
"Bash(python -m pytest tests/ -v --tb=short)",
|
||||||
|
"Bash(python -m pytest tests/test_component_registry.py -v --tb=line)",
|
||||||
|
"Bash(python -m pytest tests/ --ignore=tests/test_component_registry.py -q --tb=no)",
|
||||||
|
"Bash(python -m pytest tests/ -q --tb=short)",
|
||||||
|
"Bash(python -m pytest --tb=short -q)",
|
||||||
|
"Bash(python -m pytest tests/test_phase_a_components.py -v)",
|
||||||
|
"Bash(python -m pytest --tb=short)",
|
||||||
|
"Bash(python -m pytest tests/ -x -q)",
|
||||||
|
"Bash(python -m pytest tests/test_probe.py::test_h2d_latency_monotonic -x -q)",
|
||||||
|
"Bash(python -m pytest tests/ -x --tb=short)",
|
||||||
|
"Bash(python -m pytest tests/ --tb=line)",
|
||||||
|
"Bash(python -m pytest tests/ --tb=short)",
|
||||||
|
"Bash(python -m kernbench probe --topology topology.yaml)",
|
||||||
|
"Bash(python -m pytest tests/ --tb=short -q)",
|
||||||
|
"Bash(python -m pytest tests/test_probe.py tests/test_component_registry.py tests/test_topology_compile.py -v)",
|
||||||
|
"Bash(python -m pytest tests/test_tensor.py tests/test_engine.py tests/test_probe.py tests/test_component_registry.py -v)",
|
||||||
|
"Bash(grep -l \"class.*CPU\" \"/c/Users/ywkang/kernbench/src/kernbench/components/impls\"/*.py)",
|
||||||
|
"Bash(grep -n \"^class \" \"/c/Users/ywkang/kernbench/src/kernbench/components/impls\"/*.py)",
|
||||||
|
"Bash(python -m pytest tests/test_engine.py tests/test_probe.py tests/test_component_registry.py -v)",
|
||||||
|
"Bash(grep -E \"\\\\.\\(py|md\\)$\")",
|
||||||
|
"Bash(python -m pytest tests/test_pe_components.py -v)",
|
||||||
|
"Bash(python -m pytest tests/test_triton_emu.py -v)",
|
||||||
|
"Bash(python -m pytest tests/test_pe_components.py tests/test_triton_emu.py -v)",
|
||||||
|
"Bash(python -m pytest tests/test_pe_components.py::test_mcpu_multi_pe_kernel_launch tests/test_pe_components.py::test_qkv_gemm_bench_multi_pe_completes -v)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,12 @@
|
|||||||
|
root = true
|
||||||
|
|
||||||
|
[*]
|
||||||
|
charset = utf-8
|
||||||
|
end_of_line = lf
|
||||||
|
insert_final_newline = true
|
||||||
|
trim_trailing_whitespace = true
|
||||||
|
indent_style = space
|
||||||
|
indent_size = 4
|
||||||
|
|
||||||
|
[*.md]
|
||||||
|
trim_trailing_whitespace = false
|
||||||
@@ -0,0 +1,28 @@
|
|||||||
|
# OS / Editor
|
||||||
|
.DS_Store
|
||||||
|
.vscode/.history/
|
||||||
|
*.swp
|
||||||
|
|
||||||
|
# Python
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*.pyd
|
||||||
|
.pytest_cache/
|
||||||
|
.mypy_cache/
|
||||||
|
.ruff_cache/
|
||||||
|
|
||||||
|
# Virtualenv
|
||||||
|
.venv/
|
||||||
|
|
||||||
|
# Packaging
|
||||||
|
dist/
|
||||||
|
build/
|
||||||
|
*.egg-info/
|
||||||
|
|
||||||
|
# Env
|
||||||
|
.env
|
||||||
|
.env.*
|
||||||
|
!.env.example
|
||||||
|
|
||||||
|
# Logs
|
||||||
|
*.log
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
[MESSAGES CONTROL]
|
||||||
|
disable=missing-function-docstring,missing-module-docstring
|
||||||
@@ -0,0 +1,16 @@
|
|||||||
|
{
|
||||||
|
"recommendations": [
|
||||||
|
"ms-python.python",
|
||||||
|
"ms-python.vscode-pylance",
|
||||||
|
"charliermarsh.ruff",
|
||||||
|
"ms-python.debugpy",
|
||||||
|
"eamodio.gitlens",
|
||||||
|
"usernamehw.errorlens",
|
||||||
|
"wayou.vscode-todo-highlight",
|
||||||
|
"christian-kohler.path-intellisense",
|
||||||
|
"editorconfig.editorconfig",
|
||||||
|
"ms-azuretools.vscode-docker",
|
||||||
|
"humao.rest-client",
|
||||||
|
"shd101wyy.markdown-preview-enhanced"
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,55 @@
|
|||||||
|
{
|
||||||
|
"version": "0.2.0",
|
||||||
|
"configurations": [
|
||||||
|
{
|
||||||
|
"name": "Python: Current File",
|
||||||
|
"type": "python",
|
||||||
|
"request": "launch",
|
||||||
|
"program": "${file}",
|
||||||
|
"console": "integratedTerminal",
|
||||||
|
"justMyCode": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Run kernbench CLI",
|
||||||
|
"type": "python",
|
||||||
|
"request": "launch",
|
||||||
|
"module": "kernbench.cli.main",
|
||||||
|
"console": "integratedTerminal",
|
||||||
|
"justMyCode": true,
|
||||||
|
"args": [
|
||||||
|
"run",
|
||||||
|
"--topology", "topology.yaml",
|
||||||
|
"--bench", "qkv_gemm"
|
||||||
|
],
|
||||||
|
"env": {
|
||||||
|
"PYTHONPATH": "${workspaceFolder}/src"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Run KernBench Probe",
|
||||||
|
"type": "python",
|
||||||
|
"request": "launch",
|
||||||
|
"module": "kernbench.cli.main",
|
||||||
|
"console": "integratedTerminal",
|
||||||
|
"justMyCode": true,
|
||||||
|
"args": [
|
||||||
|
"probe",
|
||||||
|
"--topology", "topology.yaml"
|
||||||
|
],
|
||||||
|
"env": {
|
||||||
|
"PYTHONPATH": "${workspaceFolder}/src"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Pytest: Run All (terminal)",
|
||||||
|
"type": "python",
|
||||||
|
"request": "launch",
|
||||||
|
"module": "pytest",
|
||||||
|
"args": [
|
||||||
|
"-q"
|
||||||
|
],
|
||||||
|
"console": "integratedTerminal",
|
||||||
|
"justMyCode": true
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,39 @@
|
|||||||
|
{
|
||||||
|
"editor.formatOnSave": true,
|
||||||
|
"editor.formatOnSaveMode": "file",
|
||||||
|
"editor.tabSize": 4,
|
||||||
|
"editor.insertSpaces": true,
|
||||||
|
"editor.rulers": [
|
||||||
|
100
|
||||||
|
],
|
||||||
|
"editor.minimap.enabled": false,
|
||||||
|
"files.eol": "\n",
|
||||||
|
"files.trimTrailingWhitespace": true,
|
||||||
|
"files.insertFinalNewline": true,
|
||||||
|
"search.useIgnoreFiles": true,
|
||||||
|
"git.autofetch": true,
|
||||||
|
"python.venvFolders": [".venv"],
|
||||||
|
"python.testing.pytestEnabled": true,
|
||||||
|
"python.testing.unittestEnabled": false,
|
||||||
|
"python.testing.autoTestDiscoverOnSaveEnabled": true,
|
||||||
|
"python.testing.pytestArgs": [
|
||||||
|
"tests", "-q", "-vv", "-s", "--tb=short", "--no-header"
|
||||||
|
],
|
||||||
|
"python.analysis.typeCheckingMode": "basic",
|
||||||
|
"python.analysis.autoImportCompletions": true,
|
||||||
|
"python.terminal.activateEnvironment": true,
|
||||||
|
"ruff.lint.enable": true,
|
||||||
|
"ruff.format.enable": true,
|
||||||
|
"editor.defaultFormatter": "charliermarsh.ruff",
|
||||||
|
"[python]": {
|
||||||
|
"editor.defaultFormatter": "charliermarsh.ruff"
|
||||||
|
},
|
||||||
|
"editor.codeActionsOnSave": {
|
||||||
|
"source.fixAll": "explicit",
|
||||||
|
"source.organizeImports": "explicit"
|
||||||
|
},
|
||||||
|
"python.analysis.extraPaths": [
|
||||||
|
"${workspaceFolder}/src"
|
||||||
|
]
|
||||||
|
|
||||||
|
}
|
||||||
@@ -0,0 +1,127 @@
|
|||||||
|
{
|
||||||
|
"version": "2.0.0",
|
||||||
|
"tasks": [
|
||||||
|
|
||||||
|
{
|
||||||
|
"label": "venv: create",
|
||||||
|
"type": "shell",
|
||||||
|
"command": "python -m venv .venv",
|
||||||
|
"problemMatcher": []
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"label": "deps: install",
|
||||||
|
"type": "shell",
|
||||||
|
"command": "${command:python.interpreterPath}",
|
||||||
|
"args": [
|
||||||
|
"-m",
|
||||||
|
"pip",
|
||||||
|
"install",
|
||||||
|
"-U",
|
||||||
|
"pip",
|
||||||
|
"&&",
|
||||||
|
"${command:python.interpreterPath}",
|
||||||
|
"-m",
|
||||||
|
"pip",
|
||||||
|
"install",
|
||||||
|
"-e",
|
||||||
|
".[dev]"
|
||||||
|
],
|
||||||
|
"problemMatcher": [],
|
||||||
|
"dependsOn": "venv: create"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"label": "Run KernBench CLI",
|
||||||
|
"type": "shell",
|
||||||
|
"command": "${command:python.interpreterPath}",
|
||||||
|
"args": [
|
||||||
|
"-m",
|
||||||
|
"kernbench.cli.main",
|
||||||
|
"run",
|
||||||
|
"--topology",
|
||||||
|
"topology.yaml",
|
||||||
|
"--bench",
|
||||||
|
"qkv_gemm"
|
||||||
|
],
|
||||||
|
"options": { "cwd": "${workspaceFolder}" },
|
||||||
|
"problemMatcher": [],
|
||||||
|
"presentation": {
|
||||||
|
"reveal": "silent",
|
||||||
|
"panel": "shared",
|
||||||
|
"clear": true
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"label": "Run KernBench Probe",
|
||||||
|
"type": "shell",
|
||||||
|
"command": "${command:python.interpreterPath}",
|
||||||
|
"args": [
|
||||||
|
"-m",
|
||||||
|
"kernbench.cli.main",
|
||||||
|
"probe",
|
||||||
|
"--topology",
|
||||||
|
"topology.yaml"
|
||||||
|
],
|
||||||
|
"options": { "cwd": "${workspaceFolder}" },
|
||||||
|
"problemMatcher": [],
|
||||||
|
"presentation": {
|
||||||
|
"reveal": "silent",
|
||||||
|
"panel": "shared",
|
||||||
|
"clear": true
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"label": "Pytest: Run All (terminal)",
|
||||||
|
"type": "shell",
|
||||||
|
"command": "${command:python.interpreterPath}",
|
||||||
|
"args": [
|
||||||
|
"-m",
|
||||||
|
"pytest",
|
||||||
|
"-q",
|
||||||
|
"-s"
|
||||||
|
],
|
||||||
|
"problemMatcher": [],
|
||||||
|
"group": "test"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"label": "lint",
|
||||||
|
"type": "shell",
|
||||||
|
"command": "${command:python.interpreterPath}",
|
||||||
|
"args": [
|
||||||
|
"-m",
|
||||||
|
"ruff",
|
||||||
|
"check",
|
||||||
|
"."
|
||||||
|
],
|
||||||
|
"problemMatcher": []
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"label": "format",
|
||||||
|
"type": "shell",
|
||||||
|
"command": "${command:python.interpreterPath}",
|
||||||
|
"args": [
|
||||||
|
"-m",
|
||||||
|
"ruff",
|
||||||
|
"format",
|
||||||
|
"."
|
||||||
|
],
|
||||||
|
"problemMatcher": []
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"label": "which python",
|
||||||
|
"type": "shell",
|
||||||
|
"command": "${command:python.interpreterPath}",
|
||||||
|
"args": [
|
||||||
|
"-c",
|
||||||
|
"import sys;print(sys.executable)"
|
||||||
|
],
|
||||||
|
"problemMatcher": []
|
||||||
|
}
|
||||||
|
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,196 @@
|
|||||||
|
# Claude Code Instructions (Repo)
|
||||||
|
|
||||||
|
This repository uses Claude Code with strict architectural and verification rules.
|
||||||
|
SPEC.md and ADRs are the source of truth.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Terminology
|
||||||
|
|
||||||
|
- runtime API:
|
||||||
|
Host-facing public API used by benchmarks and user code (e.g., tensor deployment, kernel launch).
|
||||||
|
- simulation engine (sim_engine):
|
||||||
|
Discrete-event engine responsible for request injection, scheduling, and completion tracking.
|
||||||
|
- components:
|
||||||
|
Device-side nodes modeling hardware behavior (IO_CPU, M_CPU, PE_CPU, routers, engines, etc.).
|
||||||
|
|
||||||
|
## Authority & Scope
|
||||||
|
|
||||||
|
- SPEC.md defines the architectural contract.
|
||||||
|
- ADRs (docs/adr/ADR-*.md) define non-trivial architectural decisions.
|
||||||
|
- If a change conflicts with SPEC.md or an ADR:
|
||||||
|
- STOP.
|
||||||
|
- Explain the conflict.
|
||||||
|
- Propose options (keep spec, update ADR, or narrow scope).
|
||||||
|
- Do NOT silently change architecture.
|
||||||
|
- The repository structure reflects architectural intent; Claude Code MUST respect existing module boundaries and file locations.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Design Questions
|
||||||
|
|
||||||
|
- Design / architecture questions are ALWAYS allowed.
|
||||||
|
- Design questions MUST NOT modify:
|
||||||
|
- production code
|
||||||
|
- test code
|
||||||
|
- SPEC.md
|
||||||
|
- ADRs
|
||||||
|
- If a design question implies a change, default to Phase 1.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Change & Test Protocol (Mandatory)
|
||||||
|
|
||||||
|
All non-trivial changes MUST follow a two-phase process.
|
||||||
|
Design discussion is always allowed; production code changes are allowed only in Phase 2, after explicit approval.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Phase 1 — Proposal + Verification
|
||||||
|
|
||||||
|
(No Production Code Changes)
|
||||||
|
|
||||||
|
#### Purpose
|
||||||
|
|
||||||
|
- Decide *what* to change and *how it will be validated*
|
||||||
|
- Establish verification coverage BEFORE touching production code
|
||||||
|
|
||||||
|
#### Phase 1 MUST include
|
||||||
|
|
||||||
|
1) **Design Proposal**
|
||||||
|
|
||||||
|
- Explain the design change.
|
||||||
|
- Explain why the change is needed.
|
||||||
|
- Explain consistency with SPEC.md and relevant ADRs.
|
||||||
|
|
||||||
|
2) **Verification Plan**
|
||||||
|
|
||||||
|
- SPEC requirement(s) / ADR(s) affected (e.g., R1/R2/R5, ADR-0002).
|
||||||
|
- Tests that validate the change:
|
||||||
|
- existing tests to run, and/or
|
||||||
|
- new tests to add.
|
||||||
|
- Concrete input cases used by the tests:
|
||||||
|
- topology (SIP / CUBE / PE layout)
|
||||||
|
- request parameters (src, dst, size_bytes).
|
||||||
|
- Expected observable assertions, such as:
|
||||||
|
- hop trace contains key waypoints,
|
||||||
|
- latency invariants (e.g., > 0, monotonic increase),
|
||||||
|
- deterministic route selection.
|
||||||
|
- **expected changes (or no changes) in generated diagrams**, if applicable.
|
||||||
|
|
||||||
|
If the Verification Plan is missing or vague, STOP.
|
||||||
|
|
||||||
|
#### Allowed in Phase 1
|
||||||
|
|
||||||
|
- Creating or modifying **test code only**
|
||||||
|
- Running tests and reporting results
|
||||||
|
|
||||||
|
#### Forbidden in Phase 1
|
||||||
|
|
||||||
|
- Any production code changes
|
||||||
|
- Any SPEC.md or ADR modifications
|
||||||
|
- Any production diff output
|
||||||
|
|
||||||
|
#### Phase 1 Output
|
||||||
|
|
||||||
|
- Proposal + Verification Plan
|
||||||
|
- Tests added/modified (if any)
|
||||||
|
- Test execution results (PASS / FAIL)
|
||||||
|
- Clear recommendation:
|
||||||
|
- "No Phase 2 needed" OR
|
||||||
|
- "Await approval for Phase 2"
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Phase 2 — Apply + Verify + Rollback
|
||||||
|
|
||||||
|
#### Trigger
|
||||||
|
|
||||||
|
Phase 2 is triggered ONLY by the exact user approval phrase:
|
||||||
|
|
||||||
|
**"ok"**
|
||||||
|
|
||||||
|
#### Phase 2 Rules
|
||||||
|
|
||||||
|
- Output **minimal unified diffs only**
|
||||||
|
- Modify ONLY production files declared in Phase 1
|
||||||
|
- Do NOT include explanations, comments, or unchanged code
|
||||||
|
- Automatically apply the diff to the working tree
|
||||||
|
|
||||||
|
#### Mandatory Verification
|
||||||
|
|
||||||
|
- Run the tests defined in the Phase 1 Verification Plan
|
||||||
|
|
||||||
|
#### Success Path
|
||||||
|
|
||||||
|
If ALL tests PASS:
|
||||||
|
|
||||||
|
- Keep the applied changes
|
||||||
|
- Ensure generated diagrams (if affected) are consistent
|
||||||
|
- Report success concisely
|
||||||
|
|
||||||
|
#### Failure Path (Mandatory)
|
||||||
|
|
||||||
|
If ANY test FAILS:
|
||||||
|
|
||||||
|
- Immediately rollback ALL Phase 2 changes
|
||||||
|
- Do NOT keep partial changes
|
||||||
|
- Report:
|
||||||
|
- failing test names
|
||||||
|
- error messages / assertions
|
||||||
|
- brief hypothesis of the root cause
|
||||||
|
- Return to Phase 1 state
|
||||||
|
|
||||||
|
Tests must NEVER be weakened, removed, or altered to force Phase 2 to pass.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## What Counts as "Non-Trivial"
|
||||||
|
|
||||||
|
(Protocol Required)
|
||||||
|
|
||||||
|
Any of the following:
|
||||||
|
|
||||||
|
- routing policy or ordering changes
|
||||||
|
- topology builder changes (nodes, links, parameters)
|
||||||
|
- address decoding / PhysAddr behavior
|
||||||
|
- latency composition rules
|
||||||
|
- changes affecting determinism or connectivity
|
||||||
|
- changes touching two or more production files
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Allowed Exceptions
|
||||||
|
|
||||||
|
(Protocol Still Required)
|
||||||
|
|
||||||
|
- comments or docstrings
|
||||||
|
- formatting-only changes
|
||||||
|
- type annotation changes with no runtime behavior change
|
||||||
|
|
||||||
|
In exceptions, Phase 1 MUST explicitly state:
|
||||||
|
**"No behavior change; tests unchanged."**
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## CLI Semantics
|
||||||
|
|
||||||
|
- `kernbench run --device <id>` runs the benchmark on a single device.
|
||||||
|
- Omitting `--device` runs the benchmark on all devices discovered in the topology (logically parallel).
|
||||||
|
- Device enumeration is handled by the CLI only; benchmarks MUST remain single-device.
|
||||||
|
|
||||||
|
## Derived Artifacts (Clarification)
|
||||||
|
|
||||||
|
- Generated diagrams under `docs/diagrams/` are **derived artifacts**, not production code.
|
||||||
|
- Creating or updating files in `docs/diagrams/`:
|
||||||
|
- does NOT count as a production code change,
|
||||||
|
- does NOT require Phase 2 approval,
|
||||||
|
- MUST be consistent with SPEC.md and ADRs.
|
||||||
|
|
||||||
|
## Enforcement Defaults
|
||||||
|
|
||||||
|
- If unsure whether a change is non-trivial → treat it as non-trivial.
|
||||||
|
- If unsure whether Phase 2 is allowed → STOP and ask.
|
||||||
|
- SPEC.md and ADRs are the final authority.
|
||||||
|
- runtime API MUST NOT hardcode topology/routing or internal hop sequences.
|
||||||
|
- sim_engine MUST remain independent of runtime API semantics (no tensor/kernel policy logic).
|
||||||
@@ -0,0 +1,13 @@
|
|||||||
|
# Python Project (VS Code Template)
|
||||||
|
|
||||||
|
## Quick start
|
||||||
|
1. Create venv + install dev deps (editable):
|
||||||
|
- VS Code: Run Task → `deps: install`
|
||||||
|
2. Run tests:
|
||||||
|
- VS Code: Run Task → `Pytest: Run All (terminal)`
|
||||||
|
3. Lint / format:
|
||||||
|
- `lint`, `format` tasks
|
||||||
|
|
||||||
|
## Structure
|
||||||
|
- `src/` app code
|
||||||
|
- `tests/` pytest
|
||||||
@@ -0,0 +1,327 @@
|
|||||||
|
# KernBench System-Level Simulator — SPEC
|
||||||
|
|
||||||
|
This document defines the architectural contract for the KernBench
|
||||||
|
system-level discrete-event simulator for our AI Accelerator SIP-based systems.
|
||||||
|
All implementations, tests, and changes MUST conform to this SPEC.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 0. Goal
|
||||||
|
|
||||||
|
Build a **system-level, discrete-event simulator** to evaluate the performance of
|
||||||
|
**LLM kernels running on our AI Accelerator SIP-based systems**, under varying
|
||||||
|
**SIP architectures, topologies, and interconnect configurations**.
|
||||||
|
|
||||||
|
The simulator models **data-movement and control paths across the full hardware
|
||||||
|
hierarchy** and computes **end-to-end execution latency** for kernel executions
|
||||||
|
dispatched to Processing Elements (PEs).
|
||||||
|
|
||||||
|
Primary objectives:
|
||||||
|
|
||||||
|
- compare LLM kernel execution latency under different system configurations
|
||||||
|
- model PE↔HBM, PE↔PE, CUBE↔CUBE, and SIP↔SIP communication and control paths
|
||||||
|
- guarantee deterministic, verifiable behavior with strong debuggability
|
||||||
|
- support visual inspection of the modeled system at multiple abstraction levels
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 0.1 Golden Invariants (Must NOT be violated)
|
||||||
|
|
||||||
|
- End-to-end latency is computed **strictly by explicit traversal** over modeled
|
||||||
|
components and links.
|
||||||
|
- Every routed request MUST incur **latency > 0**.
|
||||||
|
- Routing decisions MUST be **deterministic** given
|
||||||
|
(topology + routing policy + request).
|
||||||
|
- All valid request flows MUST have explicit connectivity in the model.
|
||||||
|
- No hidden shortcuts, implicit bypasses, or magic paths are allowed.
|
||||||
|
- Architectural decisions documented in ADRs override local optimizations.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 0.2 Architectural References (ADRs)
|
||||||
|
|
||||||
|
Major architectural decisions are documented in ADRs and referenced by number.
|
||||||
|
|
||||||
|
- ADR-0001: PhysAddr layout & address decoding contract
|
||||||
|
- ADR-0002: Routing distance, ordering, and bypass rules
|
||||||
|
- ADR-0003: Target system hierarchy & modeling scope (Tray / SIP / CUBE / PE / IO chiplet)
|
||||||
|
- ADR-0004: Memory semantics & local-HBM bandwidth guarantee contract
|
||||||
|
- ADR-0005: Diagram views (SIP / CUBE / PE) and distance-aware layout rules
|
||||||
|
- ADR-0006: Topology compilation, distance extraction, and automatic diagram generation
|
||||||
|
- ADR-0007: runtime_api vs sim_engine responsibility boundaries
|
||||||
|
- ADR-0008: Tensor deployment and allocation (Host allocator, PA-first)
|
||||||
|
- ADR-0009: Kernel execution fan-out and completion semantics
|
||||||
|
- ADR-0010: CLI device selection and multi-device execution semantics
|
||||||
|
- ADR-0011: Memory addressing simplification (PA-first)
|
||||||
|
- ADR-0012: Host ↔ IO_CPU message schema (PA-first, PE-tagged shards)
|
||||||
|
- ADR-0013: Verification strategy and Phase 1 test plan
|
||||||
|
|
||||||
|
SPEC MUST remain consistent with accepted ADRs.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Core Requirements
|
||||||
|
|
||||||
|
### R1. Correct Routing and Control Path
|
||||||
|
|
||||||
|
- A request MUST traverse the correct sequence of components based on:
|
||||||
|
- source location,
|
||||||
|
- destination address or placement tags,
|
||||||
|
- routing policy and available topology connectivity.
|
||||||
|
- Local vs remote traffic MUST be distinguishable:
|
||||||
|
- same SIP vs different SIP,
|
||||||
|
- same CUBE vs different CUBE,
|
||||||
|
- (optional) same PE-group vs cross PE-group.
|
||||||
|
- Routing behavior MUST be reproducible and deterministic.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### R2. Latency is Computed by Traversal
|
||||||
|
|
||||||
|
End-to-end latency is the sum of:
|
||||||
|
|
||||||
|
- per-node fixed latency (processing / router delay),
|
||||||
|
- per-link latency (fixed and/or size-aware serialization: bytes / BW),
|
||||||
|
- per-service latency (e.g., memory controller service time).
|
||||||
|
|
||||||
|
The simulator MUST:
|
||||||
|
|
||||||
|
- support both fixed and size-aware latency,
|
||||||
|
- emit hop-by-hop traces with timestamps and component identifiers.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### R3. Topology is Configurable and Variable
|
||||||
|
|
||||||
|
Topology MUST NOT be hardcoded.
|
||||||
|
|
||||||
|
The simulator MUST accept multiple topologies (YAML / JSON / dict), varying:
|
||||||
|
|
||||||
|
- SIP count,
|
||||||
|
- CUBE count per SIP,
|
||||||
|
- PE count per CUBE,
|
||||||
|
- on-chip fabric structure (e.g., mesh / NoC / XBAR),
|
||||||
|
- IO chiplets and interconnects,
|
||||||
|
- link bandwidth, latency, and capacity parameters.
|
||||||
|
|
||||||
|
Given a topology:
|
||||||
|
|
||||||
|
- all required request flows MUST have valid connectivity,
|
||||||
|
- missing links are a topology construction error, not a routing error.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### R4. DI-First Component Design (Swappable Implementations)
|
||||||
|
|
||||||
|
All components MUST be replaceable behind stable interfaces, including:
|
||||||
|
|
||||||
|
- routers and fabrics (NoC, bridges, switches),
|
||||||
|
- XBAR-like selectors,
|
||||||
|
- DMA engines and queues,
|
||||||
|
- memory controllers and services (HBM, TCM, queues),
|
||||||
|
- management and control processors (modeled components).
|
||||||
|
|
||||||
|
The simulator MUST:
|
||||||
|
|
||||||
|
- use dependency injection (DI) to bind node specifications to implementation classes,
|
||||||
|
- allow component swapping without changing test logic,
|
||||||
|
- avoid leaking routing or policy logic into unrelated components.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### R5. Multi-Domain Communication Modeling
|
||||||
|
|
||||||
|
The simulator MUST model communication across hierarchical domains, including:
|
||||||
|
|
||||||
|
- PE ↔ local HBM
|
||||||
|
- PE ↔ remote HBM in the same CUBE
|
||||||
|
- PE ↔ remote HBM in other CUBEs within the same SIP
|
||||||
|
- PE ↔ remote HBM in other SIPs
|
||||||
|
- PE ↔ PE messaging (e.g., IPCQ)
|
||||||
|
- PE ↔ IO chiplets
|
||||||
|
- CUBE ↔ CUBE (e.g., via UCIe)
|
||||||
|
- SIP ↔ SIP (e.g., via PCIe or UAL)
|
||||||
|
|
||||||
|
Policy-based bypass is allowed ONLY if:
|
||||||
|
|
||||||
|
- the bypass path is explicitly represented in the model,
|
||||||
|
- the bypass incurs non-zero latency,
|
||||||
|
- the bypass is visible in traces and diagrams.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### R6. Verification-Driven Development
|
||||||
|
|
||||||
|
Development MUST follow a verification-driven workflow:
|
||||||
|
|
||||||
|
- behavior is validated by tests with meaningful input cases,
|
||||||
|
- tests encode SPEC-defined invariants, not incidental implementation details,
|
||||||
|
- changes without clear verification coverage are not allowed.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### R7. Runtime API
|
||||||
|
|
||||||
|
The simulator MUST provide a host-facing runtime API that:
|
||||||
|
|
||||||
|
- exposes tensor deployment and kernel execution operations,
|
||||||
|
- submits requests only to endpoint components (e.g., IO_CPU),
|
||||||
|
- owns host-side tensor handles and allocation metadata as PA shard maps,
|
||||||
|
- remains topology-agnostic and does not perform routing or fan-out.
|
||||||
|
|
||||||
|
Tensor deployment in Phase 0 produces **device physical-address (PA) shard mappings**.
|
||||||
|
Each shard explicitly identifies its target `(sip, cube, pe)` and PA range.
|
||||||
|
No separate host-visible allocation RPC (e.g., AllocateTensorMeta) exists.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### R8. Simulation Engine
|
||||||
|
|
||||||
|
The simulator MUST include a discrete-event simulation engine that:
|
||||||
|
|
||||||
|
- injects requests into the system graph,
|
||||||
|
- schedules events deterministically,
|
||||||
|
- tracks completion via correlation identifiers,
|
||||||
|
- decomposes runtime API operations into explicit graph requests
|
||||||
|
(e.g., MemoryWrite, MemoryRead, KernelLaunch).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### R9. CLI Execution Semantics
|
||||||
|
|
||||||
|
The CLI MUST support executing benchmarks:
|
||||||
|
|
||||||
|
- on a specified device.
|
||||||
|
|
||||||
|
Benchmarks are executed once per invocation within a single simulation instance.
|
||||||
|
If multiple devices are present in the topology, a benchmark MAY interact with
|
||||||
|
multiple devices internally, but the CLI does not launch multiple independent
|
||||||
|
benchmark instances by default.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### R10. Memory Addressing (Phase 0)
|
||||||
|
|
||||||
|
In Phase 0, the simulator uses a **PA-first memory model**:
|
||||||
|
|
||||||
|
- All memory operations use device physical addresses (PA) only.
|
||||||
|
- Virtual addressing, MMU/IOMMU, and address translation latency are out of scope.
|
||||||
|
- Tensor placement is represented as a list of PA shards, each explicitly tagged
|
||||||
|
with `(sip, cube, pe)`.
|
||||||
|
|
||||||
|
All memory access latency MUST be modeled explicitly via graph traversal.
|
||||||
|
No implicit translation or hidden latency is allowed.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Model Concepts
|
||||||
|
|
||||||
|
### 2.1 Graph Execution Model
|
||||||
|
|
||||||
|
- Nodes represent modeled components (PE blocks, XBAR, NoC, bridges,
|
||||||
|
HBM controllers, IO components, etc.).
|
||||||
|
- Directed edges represent interconnect links with latency and bandwidth attributes.
|
||||||
|
- Execution model:
|
||||||
|
- a node receives a request,
|
||||||
|
- incurs node or service latency,
|
||||||
|
- emits the request to the next hop via a link,
|
||||||
|
- repeats until the destination service completes.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 2.2 Routing
|
||||||
|
|
||||||
|
Routing MAY be implemented as:
|
||||||
|
|
||||||
|
- policy-based routing (code-driven),
|
||||||
|
- routing tables (config-driven),
|
||||||
|
- topology-driven routing (e.g., mesh XY),
|
||||||
|
- or a hybrid approach.
|
||||||
|
|
||||||
|
Routing MUST:
|
||||||
|
|
||||||
|
- consume decoded address domains or explicit placement tags,
|
||||||
|
- operate only on explicit topology connectivity,
|
||||||
|
- remain deterministic.
|
||||||
|
|
||||||
|
Kernel execution requests reference tensors via PA shard mappings.
|
||||||
|
Each shard explicitly identifies its target PE, allowing IO_CPU to
|
||||||
|
deterministically fan-out execution without relying on PA decoding.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Inputs and Identity
|
||||||
|
|
||||||
|
### 3.1 Node Identity Scheme
|
||||||
|
|
||||||
|
Nodes MUST have stable, parsable identifiers sufficient for domain inference
|
||||||
|
and trace-based debugging.
|
||||||
|
|
||||||
|
Example patterns:
|
||||||
|
|
||||||
|
- `tray.host_cpu`
|
||||||
|
- `sip{S}.io{I}.pcie_ep`
|
||||||
|
- `sip{S}.cube{C}.fabric`
|
||||||
|
- `sip{S}.cube{C}.pe{P}`
|
||||||
|
- `sip{S}.cube{C}.hbm_ctrl`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 3.2 Link Specifications
|
||||||
|
|
||||||
|
A link MAY include:
|
||||||
|
|
||||||
|
- fixed latency (ns),
|
||||||
|
- bandwidth (GB/s) for serialization latency,
|
||||||
|
- optional capacity for contention modeling.
|
||||||
|
|
||||||
|
Topology builders MUST ensure:
|
||||||
|
|
||||||
|
- required links exist,
|
||||||
|
- link parameters are consistent with topology intent.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Output, Debuggability, and Diagrams
|
||||||
|
|
||||||
|
The simulator MUST provide:
|
||||||
|
|
||||||
|
- per-request hop-by-hop traces with timestamps,
|
||||||
|
- clear error messages for missing connectivity
|
||||||
|
(e.g., "no link for A → B"),
|
||||||
|
- reproducible, inspectable representations of the modeled system.
|
||||||
|
|
||||||
|
Diagrams are **derived artifacts** of the simulator model:
|
||||||
|
|
||||||
|
- They MUST be generatable from the **compiled topology** and **distance metadata**
|
||||||
|
used by execution and routing.
|
||||||
|
- Generation MAY be performed lazily or cached by the implementation,
|
||||||
|
as long as outputs remain consistent with the compiled topology.
|
||||||
|
|
||||||
|
Diagram abstraction levels and distance-aware layout rules are defined in ADR-0005.
|
||||||
|
Automatic diagram generation and output conventions are defined in ADR-0006.
|
||||||
|
|
||||||
|
By default, generated diagrams are written under:
|
||||||
|
|
||||||
|
- `docs/diagrams/`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Non-Goals (for now)
|
||||||
|
|
||||||
|
The following are explicitly out of scope:
|
||||||
|
|
||||||
|
- cycle-accurate microarchitecture modeling,
|
||||||
|
- detailed cache coherence protocols,
|
||||||
|
- full PCIe / CXL protocol correctness.
|
||||||
|
|
||||||
|
These MAY be layered later via additional components and policies.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. Decision Boundaries
|
||||||
|
|
||||||
|
- SPEC.md defines architectural intent and invariants.
|
||||||
|
- Code implements SPEC and MUST NOT introduce hidden invariants.
|
||||||
|
- Tests validate SPEC-defined behavior and MUST NOT encode fixed topology assumptions.
|
||||||
|
- ADRs record non-trivial architectural decisions and MUST be referenced when relevant.
|
||||||
@@ -0,0 +1,2 @@
|
|||||||
|
def run(ctx):
    """Print the banner line of the IPCQ all-reduce kernel bench.

    ``ctx`` is accepted but not used by this function; nothing is returned.
    """
    banner = "IPCQ all reduce kernel bench"
    print(banner)
|
||||||
@@ -0,0 +1,37 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import importlib
|
||||||
|
from collections.abc import Callable
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from kernbench.runtime_api.context import RuntimeContext
|
||||||
|
|
||||||
|
BenchFn = Callable[[RuntimeContext], Any]
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_bench(bench_id: str) -> BenchFn:
    """
    Resolve a bench id into a callable bench function.

    Expected layout (repo root):
      benches/<bench_id>.py
        def run(ctx: RuntimeContext) -> Any

    Args:
        bench_id: Name of the module under the ``benches`` package.
            Leading and trailing whitespace is ignored.

    Returns:
        The ``run`` callable defined by ``benches.<bench_id>``.

    Raises:
        ValueError: If ``bench_id`` is empty, if the bench module does not
            exist, if the bench module exists but one of its own imports
            cannot be found, or if the module has no callable ``run``.
    """
    bench_id = bench_id.strip()
    if not bench_id:
        raise ValueError("Bench id is empty.")

    module_path = f"benches.{bench_id}"

    try:
        mod = importlib.import_module(module_path)
    except ModuleNotFoundError as e:
        missing = e.name
        # "Unknown bench" is only accurate when the bench module itself (or a
        # package on the way to it) is what could not be found.
        bench_itself_missing = (
            missing is None
            or missing == module_path
            or module_path.startswith(f"{missing}.")
        )
        if bench_itself_missing:
            raise ValueError(
                f"Unknown bench '{bench_id}'. Expected module {module_path}.py"
            ) from e
        # The bench module was found, but something it imports is missing.
        # Keep ValueError so existing callers still catch it, but say what is
        # actually wrong instead of claiming the bench does not exist.
        raise ValueError(
            f"Bench '{bench_id}' ({module_path}) failed to import: "
            f"missing module '{missing}'."
        ) from e

    run_fn = getattr(mod, "run", None)
    if run_fn is None:
        raise ValueError(f"Bench module {module_path} must define a 'run(ctx)' function.")
    if not callable(run_fn):
        raise ValueError(f"'run' in {module_path} is not callable.")

    return run_fn
|
||||||
@@ -0,0 +1,39 @@
|
|||||||
|
"""QKV GEMM benchmark: Q*K^T projection on a single PE.
|
||||||
|
|
||||||
|
Demonstrates the full host-to-PE kernel launch pipeline:
|
||||||
|
Host → PCIE_EP → IO_CPU → M_CPU → NOC → PE_CPU → PE_SCHEDULER → engines
|
||||||
|
|
||||||
|
Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait()
|
||||||
|
- Tensor a is loaded into TCM via DMA
|
||||||
|
- Tensor b stays in HBM; PE_SCHEDULER streams it per-tile (32x64x32)
|
||||||
|
"""
|
||||||
|
from kernbench.policy.placement.dp import DPPolicy
|
||||||
|
|
||||||
|
# GEMM dimensions: (M, K) x (K, N) → (M, N)
|
||||||
|
M, K, N = 128, 256, 128
|
||||||
|
DTYPE = "f16"
|
||||||
|
|
||||||
|
|
||||||
|
def _gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"):
|
||||||
|
"""QKV GEMM kernel: out = a @ b.
|
||||||
|
|
||||||
|
a is loaded into TCM (DMA_READ).
|
||||||
|
b is referenced in HBM (tl.ref, no DMA — scheduler streams per-tile).
|
||||||
|
"""
|
||||||
|
a = tl.load(a_ptr, shape=(M, K), dtype=DTYPE)
|
||||||
|
b = tl.ref(b_ptr, shape=(K, N), dtype=DTYPE)
|
||||||
|
handle = tl.composite(op="gemm", a=a, b=b, out_ptr=out_ptr)
|
||||||
|
tl.wait(handle)
|
||||||
|
|
||||||
|
|
||||||
|
def run(ctx):
    """Allocate the GEMM operands on ``ctx`` and launch the ``qkv_gemm`` kernel."""
    # Placement: ``a`` is replicated at both cube and PE level; ``b`` and
    # ``out`` are replicated across cubes and column_wise at PE level.
    lhs_policy = DPPolicy(cube="replicate", pe="replicate")
    lhs = ctx.zeros((M, K), dtype=DTYPE, dp=lhs_policy, name="a")

    rhs_policy = DPPolicy(cube="replicate", pe="column_wise")
    rhs = ctx.zeros((K, N), dtype=DTYPE, dp=rhs_policy, name="b")

    out_policy = DPPolicy(cube="replicate", pe="column_wise")
    result = ctx.empty((M, N), dtype=DTYPE, dp=out_policy, name="out")

    # Hand the three tensors plus the GEMM dimensions to the kernel.
    ctx.launch("qkv_gemm", _gemm_kernel, lhs, rhs, result, M, K, N)
|
||||||
@@ -0,0 +1,39 @@
|
|||||||
|
"""QKV GEMM benchmark: Q*K^T projection on all PEs in a cube (multi-PE).
|
||||||
|
|
||||||
|
Column-parallel GEMM: a is replicated (cube-level), b/out are column-sharded.
|
||||||
|
M_CPU fans out KernelLaunchMsg to all 8 PE_CPUs (ADR-0009 D3).
|
||||||
|
|
||||||
|
Kernel: tl.load(a) + tl.ref(b) + tl.composite(gemm) + tl.wait()
|
||||||
|
- Tensor a is loaded into TCM via DMA
|
||||||
|
- Tensor b stays in HBM; PE_SCHEDULER streams it per-tile (32x64x32)
|
||||||
|
"""
|
||||||
|
from kernbench.policy.placement.dp import DPPolicy
|
||||||
|
|
||||||
|
# GEMM dimensions: (M, K) x (K, N) -> (M, N)
|
||||||
|
M, K, N = 128, 256, 128
|
||||||
|
DTYPE = "f16"
|
||||||
|
|
||||||
|
|
||||||
|
def _gemm_kernel(a_ptr, b_ptr, out_ptr, M, K, N, tl, DTYPE="f16"):
|
||||||
|
"""QKV GEMM kernel: out = a @ b.
|
||||||
|
|
||||||
|
a is loaded into TCM (DMA_READ).
|
||||||
|
b is referenced in HBM (tl.ref, no DMA -- scheduler streams per-tile).
|
||||||
|
"""
|
||||||
|
a = tl.load(a_ptr, shape=(M, K), dtype=DTYPE)
|
||||||
|
b = tl.ref(b_ptr, shape=(K, N), dtype=DTYPE)
|
||||||
|
handle = tl.composite(op="gemm", a=a, b=b, out_ptr=out_ptr)
|
||||||
|
tl.wait(handle)
|
||||||
|
|
||||||
|
|
||||||
|
def run(ctx):
    """Allocate the GEMM operands on ``ctx`` and launch ``qkv_gemm_multi``."""
    # Placement: ``a`` is replicated at both cube and PE level; ``b`` and
    # ``out`` are replicated across cubes and column_wise at PE level
    # (described in this bench as an N-axis split).
    policy_a = DPPolicy(cube="replicate", pe="replicate")
    tensor_a = ctx.zeros((M, K), dtype=DTYPE, dp=policy_a, name="a")

    policy_b = DPPolicy(cube="replicate", pe="column_wise")
    tensor_b = ctx.zeros((K, N), dtype=DTYPE, dp=policy_b, name="b")

    policy_out = DPPolicy(cube="replicate", pe="column_wise")
    tensor_out = ctx.empty((M, N), dtype=DTYPE, dp=policy_out, name="out")

    # Launch the GEMM kernel with the tensors and the problem dimensions.
    ctx.launch("qkv_gemm_multi", _gemm_kernel, tensor_a, tensor_b, tensor_out, M, K, N)
|
||||||
@@ -0,0 +1,108 @@
|
|||||||
|
# ADR-0001: PhysAddr Layout & Address Decoding Contract
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Accepted
|
||||||
|
|
||||||
|
## Date
|
||||||
|
|
||||||
|
2026-02-27
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
The KernBench Graph Latency Simulator must route requests deterministically and compute end-to-end latency strictly by graph traversal.
|
||||||
|
To model local vs remote traffic (same/different SIP, same/different CUBE, optional PE-group), requests need a stable, parsable address/location scheme that:
|
||||||
|
|
||||||
|
- can be decoded into routing domains (SIP/CUBE/HBM/PE-resource, etc.)
|
||||||
|
- remains topology-agnostic (no hardcoded counts)
|
||||||
|
- supports swappable policy and DI-first components without leaking topology assumptions into node implementations
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
We define a **PhysAddr value object** and an **address decoding contract** that converts an integer address into routing domains.
|
||||||
|
|
||||||
|
### D1. PhysAddr is an immutable value object
|
||||||
|
|
||||||
|
- PhysAddr is immutable and comparable as a pure value.
|
||||||
|
- Any allocator returns a **fully specified PhysAddr** (not partial metadata).
|
||||||
|
- No global state may be required to interpret a PhysAddr.
|
||||||
|
|
||||||
|
### D2. PhysAddr fields (logical contract)
|
||||||
|
|
||||||
|
PhysAddr must be able to represent at least:
|
||||||
|
|
||||||
|
- `rack_id` (optional but reserved for scale-out)
|
||||||
|
- `sip_id` (device / SIP domain)
|
||||||
|
- `sip_seg` (SIP-level segment/window selection, e.g., cube window)
|
||||||
|
- `local_offset` (offset within the chosen segment/window)
|
||||||
|
|
||||||
|
Decoded/derived fields may include (optional):
|
||||||
|
|
||||||
|
- `cube_id`
|
||||||
|
- `kind` (e.g., HBM vs PE-resource vs raw)
|
||||||
|
- `unit_type` / `pe_id` (if PE-level addressing is modeled)
|
||||||
|
|
||||||
|
**Important:** The exact bit allocation may evolve, but the *semantic fields above* must remain decodable without hidden assumptions.
|
||||||
|
|
||||||
|
### D3. Decoding is deterministic and policy-compatible
|
||||||
|
|
||||||
|
- Decoding must deterministically map an integer address to:
|
||||||
|
- destination SIP domain (`sip_id`)
|
||||||
|
- destination sub-domain (`cube_id` if applicable)
|
||||||
|
- destination target kind (HBM/PE-resource/other)
|
||||||
|
- Decoding must not depend on runtime topology sizes; it may depend on **explicit topology parameters** provided through configuration (e.g., segment size, slice size), and those parameters must live in the topology/config layer (not in arbitrary components).
|
||||||
|
|
||||||
|
### D4. Topology-derived constants live in the topology layer
|
||||||
|
|
||||||
|
Constants such as segment sizes (e.g., HBM slice size / window size) are derived from topology configuration (YAML/JSON/dict) and are provided to the decoder via DI/config.
|
||||||
|
They must not be hardcoded in node implementations.
|
||||||
|
|
||||||
|
### D5. Routing consumes decoded domains, not raw bits
|
||||||
|
|
||||||
|
Routing policy uses decoded domains:
|
||||||
|
|
||||||
|
- `src` location (sip/cube/pe or node_id)
|
||||||
|
- `dst` domains derived from PhysAddr decoding
|
||||||
|
- `size_bytes` for size-aware link latency
|
||||||
|
Routing must not inspect raw bit-fields directly except inside the decoding module.
|
||||||
|
|
||||||
|
## Alternatives Considered
|
||||||
|
|
||||||
|
1) **Use raw integers everywhere, decode ad-hoc in routing**
|
||||||
|
|
||||||
|
- Rejected: leads to duplicated logic, inconsistent routing, and hidden assumptions embedded in multiple components.
|
||||||
|
|
||||||
|
2) **Hardcode topology sizes (SIP/CUBE/PE counts) into decoding**
|
||||||
|
|
||||||
|
- Rejected: violates SPEC (R3) and breaks swappability and configuration-driven topologies.
|
||||||
|
|
||||||
|
3) **Put decoding inside memory controllers or routers**
|
||||||
|
|
||||||
|
- Rejected: leaks policy into components and undermines DI-first, swappable implementations (SPEC R4).
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
### Positive
|
||||||
|
|
||||||
|
- Deterministic routing domains enable clear test invariants for local vs remote paths (SPEC R1, R5).
|
||||||
|
- Keeps topology variability (SPEC R3) while preserving consistent semantics.
|
||||||
|
- DI-first: decoder can be swapped or extended without changing components or tests (SPEC R4).
|
||||||
|
|
||||||
|
### Tradeoffs / Costs
|
||||||
|
|
||||||
|
- Requires explicit configuration for any topology-derived sizes.
|
||||||
|
- Introduces a single “blessed” decoding module that must remain stable and well-tested.
|
||||||
|
|
||||||
|
## Implementation Notes (Non-normative)
|
||||||
|
|
||||||
|
- Recommended module boundary:
|
||||||
|
- `src/kernbench/policy/address/phyaddr.py`
|
||||||
|
|
||||||
|
- Tests should cover:
|
||||||
|
- deterministic decoding
|
||||||
|
- local vs remote classification from decoded fields
|
||||||
|
- invariants: “allocator returns full PhysAddr”, “decoding requires no global state”
|
||||||
|
|
||||||
|
## Links
|
||||||
|
|
||||||
|
- SPEC.md: R1 (routing), R3 (configurable topology), R4 (DI-first), R5 (multi-domain comm)
|
||||||
@@ -0,0 +1,103 @@
|
|||||||
|
# ADR-0002: Routing Distance, Ordering & Bypass Rules
|
||||||
|
|
||||||
|
## Status
|
||||||
|
Accepted
|
||||||
|
|
||||||
|
## Date
|
||||||
|
2026-02-27
|
||||||
|
|
||||||
|
## Context
|
||||||
|
The KernBench Graph Latency Simulator must compare kernel execution time
|
||||||
|
across different architectures and topologies by computing end-to-end
|
||||||
|
latency from graph traversal.
|
||||||
|
|
||||||
|
To support meaningful comparison:
|
||||||
|
- routing must be deterministic
|
||||||
|
- latency must reflect actual interconnect structure
|
||||||
|
- local vs remote traffic must be distinguishable
|
||||||
|
- “bypass” optimizations must not undermine debuggability or correctness
|
||||||
|
|
||||||
|
The simulator also aims to avoid software-managed metadata and hidden
|
||||||
|
shortcuts that obscure control paths.
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
### D1. Distance is accumulated latency, not hop count
|
||||||
|
- Routing “distance” is defined as the **sum of per-node and per-link latency**.
|
||||||
|
- Hop count alone must not be used for ordering or path selection.
|
||||||
|
- Size-aware serialization latency (bytes / BW) contributes to distance.
|
||||||
|
|
||||||
|
### D2. Routing order is derived from graph traversal
|
||||||
|
- The chosen route is the path with minimum accumulated latency
|
||||||
|
given the constructed graph and routing policy.
|
||||||
|
- Deterministic ordering must be guaranteed for identical inputs
|
||||||
|
(topology + policy + request).
|
||||||
|
|
||||||
|
### D3. Bypass is explicit and graph-represented
|
||||||
|
- Any bypass (e.g., local cube HBM access via XBAR instead of NOC) must be:
|
||||||
|
- explicitly represented as a graph path, and
|
||||||
|
- subject to latency accumulation like any other path.
|
||||||
|
- Example: PE_DMA has dual egress — one to XBAR (HBM path) and one to NOC (non-HBM path).
|
||||||
|
Both are explicit graph edges; neither is a “bypass” — they are distinct data paths
|
||||||
|
serving different memory domains.
|
||||||
|
- Implicit or “magic” bypass paths are disallowed.
|
||||||
|
|
||||||
|
### D4. No zero-latency end-to-end paths
|
||||||
|
|
||||||
|
- Every routed request must incur **end-to-end** latency > 0.
|
||||||
|
- Individual fabric segments (e.g., NOC hops) MAY have distance_mm = 0
|
||||||
|
when the fabric is distributed and distance is not meaningful at that granularity.
|
||||||
|
This is allowed because other components on the same path (e.g., PE_DMA, SRAM,
|
||||||
|
UCIe endpoints) contribute non-zero latency, ensuring the end-to-end invariant holds.
|
||||||
|
- Fully zero-latency end-to-end paths are disallowed, except for explicit
|
||||||
|
test-only stubs clearly marked as such.
|
||||||
|
|
||||||
|
### D5. Policy vs topology responsibility split
|
||||||
|
- Topology builder:
|
||||||
|
- defines nodes and links and their latency/BW parameters
|
||||||
|
- Routing policy:
|
||||||
|
- selects among available graph paths based on decoded domains
|
||||||
|
- Routing policy must not assume missing links; missing connectivity
|
||||||
|
is a topology construction error.
|
||||||
|
|
||||||
|
### D6. No software-managed routing metadata
|
||||||
|
- Routing decisions must not rely on per-request software-managed metadata
|
||||||
|
that tracks distance, hop count, or ordering outside the graph model.
|
||||||
|
- All distance/order computation is derived from traversal itself.
|
||||||
|
|
||||||
|
## Alternatives Considered
|
||||||
|
|
||||||
|
1) **Hop-count based routing**
|
||||||
|
- Rejected: ignores heterogeneous latency/BW and misrepresents
|
||||||
|
architectural differences.
|
||||||
|
|
||||||
|
2) **Implicit local shortcuts**
|
||||||
|
- Rejected: breaks debuggability and violates traversal-based latency.
|
||||||
|
|
||||||
|
3) **Software-managed distance metadata**
|
||||||
|
- Rejected: increases control overhead and obscures routing semantics.
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
### Positive
|
||||||
|
- Clear, debuggable hop-by-hop traces (SPEC R2, R4).
|
||||||
|
- Architecture comparisons reflect real interconnect structure.
|
||||||
|
- Routing behavior is reproducible and deterministic.
|
||||||
|
|
||||||
|
### Tradeoffs / Costs
|
||||||
|
- Graph construction must be correct and complete.
|
||||||
|
- Bypass modeling requires explicit graph representation,
|
||||||
|
which slightly increases topology description complexity.
|
||||||
|
|
||||||
|
## Implementation Notes (Non-normative)
|
||||||
|
- Recommended responsibilities:
|
||||||
|
- Graph builder: ensure all required paths exist.
|
||||||
|
- Router: select next hop based on decoded domains and policy.
|
||||||
|
- Tests should assert:
|
||||||
|
- non-zero end-to-end latency
|
||||||
|
- deterministic routing for identical inputs
|
||||||
|
- bypass paths appear explicitly in emitted traces
|
||||||
|
|
||||||
|
## Links
|
||||||
|
- SPEC.md: R1 (routing), R2 (latency), R3 (topology), R5 (multi-domain comm)
|
||||||
|
- ADR-0001: PhysAddr layout & decoding contract
|
||||||
@@ -0,0 +1,64 @@
|
|||||||
|
# ADR-0003: Target System Hierarchy & Modeling Scope
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Accepted
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
We need a system-level simulator to evaluate LLM kernel performance on our AI Accelerator platform.
|
||||||
|
The platform is organized as a compute tray containing multiple identical SIPs connected via PCIe or UAL
|
||||||
|
through switching fabrics, with a host CPU issuing commands/kernels.
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
We model the system hierarchy explicitly:
|
||||||
|
|
||||||
|
### D1. Tray-level
|
||||||
|
|
||||||
|
- A compute tray contains:
|
||||||
|
- Host CPU (issues requests / coordinates runtime & data placement)
|
||||||
|
- Multiple identical SIPs (accelerators)
|
||||||
|
- Interconnect fabric between SIPs (PCIe and/or UAL via switches)
|
||||||
|
|
||||||
|
### D2. SIP-level
|
||||||
|
|
||||||
|
- A SIP is a multi-die package composed of:
|
||||||
|
- Multiple CUBEs (HBM die + compute PEs + UCIe)
|
||||||
|
- One or more IO chiplets (host/SIP interfaces)
|
||||||
|
- IO chiplets:
|
||||||
|
- provide interfaces: PCIe-EP, IO_CPU, optionally UAL-EP
|
||||||
|
- can be multiple per SIP
|
||||||
|
- placement constrained to SIP shoreline (top/bottom/left/right); each shoreline may host 1–2 IO chiplets
|
||||||
|
|
||||||
|
### D3. CUBE-level
|
||||||
|
|
||||||
|
- A CUBE contains:
|
||||||
|
- HBM + memory controller (HBM_CTRL)
|
||||||
|
- XBAR (top/bottom): HBM pseudo-channel crossbar, PE's dedicated path to HBM
|
||||||
|
- Bridge (left/right): connects XBAR.top ↔ XBAR.bottom for cross-half HBM access
|
||||||
|
- NOC: distributed on-die fabric spanning the entire cube (distance modeled as 0);
|
||||||
|
carries non-HBM traffic including inter-cube (UCIe), command (M_CPU↔PE_CPU), and shared SRAM access
|
||||||
|
- Shared SRAM: cube-level shared memory accessible by all PEs via NOC
|
||||||
|
- management/control CPU (M_CPU) coordinating PE command distribution and completion aggregation
|
||||||
|
- multiple PEs
|
||||||
|
- up to 4 UCIe endpoints (N/E/W/S) for CUBE↔CUBE and CUBE↔IO connectivity
|
||||||
|
|
||||||
|
### D4. PE-level
|
||||||
|
|
||||||
|
- A PE can execute one kernel instance
|
||||||
|
- PE contains internal control + accelerators (modeled at PE view granularity):
|
||||||
|
- PE_CPU, command handler, PE_TCM, DMA/GEMM/MATH engines, internal queues
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
- The simulator supports abstraction by “views”:
|
||||||
|
- SIP view hides PE internals
|
||||||
|
- CUBE view treats each PE as a single block
|
||||||
|
- PE view expands PE internals
|
||||||
|
- Topology remains parameterized; sizes/counts/links come from configuration.
|
||||||
|
|
||||||
|
## Links
|
||||||
|
|
||||||
|
- SPEC R3/R5
|
||||||
|
- ADR-0005 (diagram views)
|
||||||
@@ -0,0 +1,64 @@
|
|||||||
|
# ADR-0004: Memory Semantics & Local-HBM Bandwidth Guarantee
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Accepted
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
Accurately modeling PE↔HBM behavior is essential for kernel latency estimation.
|
||||||
|
Each PE has a notion of “local HBM” that must guarantee full HBM bandwidth, independent of intervening on-die fabric bandwidth.
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
### D1. Local HBM definition
|
||||||
|
|
||||||
|
- Each PE is assigned a logically defined “local HBM” region.
|
||||||
|
- Local HBM corresponds to the pseudo-channel subset directly attached to that PE’s DMA path
|
||||||
|
via the XBAR (top or bottom, depending on PE corner placement).
|
||||||
|
- The path is: PE_DMA → XBAR.top/bottom → HBM_CTRL.
|
||||||
|
- The mapping (HBM pseudo-channels → PE local regions) is derived from topology configuration.
|
||||||
|
|
||||||
|
### D2. Local HBM bandwidth guarantee contract
|
||||||
|
|
||||||
|
- Accesses from a PE to its local HBM MUST guarantee full HBM read/write bandwidth
|
||||||
|
independent of intervening fabric bandwidth limits.
|
||||||
|
- This guarantee is modeled by:
|
||||||
|
- a dedicated logical path and/or service model that enforces HBM BW at the PE-local-HBM interaction point,
|
||||||
|
- while still incurring non-zero latency along explicitly modeled components.
|
||||||
|
|
||||||
|
### D3. Cross-half HBM semantics
|
||||||
|
|
||||||
|
- A PE connected to XBAR.bottom that accesses HBM pseudo-channels on the XBAR.top half
|
||||||
|
(or vice versa) traverses a bridge:
|
||||||
|
- PE_DMA → XBAR.bottom → bridge → XBAR.top → HBM_CTRL
|
||||||
|
- Bridge bandwidth may limit cross-half HBM access relative to local-half access.
|
||||||
|
|
||||||
|
### D4. Non-local HBM semantics (inter-cube / inter-SIP)
|
||||||
|
|
||||||
|
- Accesses from a PE to HBM in a different cube or SIP MAY be limited by:
|
||||||
|
- NOC bandwidth within the cube,
|
||||||
|
- inter-cube UCIe links,
|
||||||
|
- inter-SIP fabric (PCIe/UAL).
|
||||||
|
- These paths MUST be explicit and traceable.
|
||||||
|
|
||||||
|
### D5. Shared SRAM semantics
|
||||||
|
|
||||||
|
- Each CUBE contains a shared SRAM accessible by all PEs in that CUBE.
|
||||||
|
- Access path: PE_DMA → NOC → shared SRAM.
|
||||||
|
- Shared SRAM bandwidth is limited by the NOC↔SRAM link bandwidth.
|
||||||
|
- Shared SRAM is not part of the HBM address space; it is a separate memory domain.
|
||||||
|
|
||||||
|
## Verification Notes
|
||||||
|
|
||||||
|
Tests should cover:
|
||||||
|
|
||||||
|
- local-HBM case: BW matches HBM BW regardless of fabric BW parameter
|
||||||
|
- cross-half HBM case: latency includes bridge traversal
|
||||||
|
- non-local cases (inter-cube/inter-SIP): BW/latency respond to fabric/link parameters
|
||||||
|
- shared SRAM case: access via NOC with correct BW
|
||||||
|
|
||||||
|
## Links
|
||||||
|
|
||||||
|
- SPEC R2/R5
|
||||||
|
- ADR-0002 (distance/order & explicit bypass)
|
||||||
@@ -0,0 +1,186 @@
|
|||||||
|
# ADR-0005: Diagram Views & Distance-Aware Layout Rules
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Accepted
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
We require verifiable and inspectable system modeling for a large-scale,
|
||||||
|
parameterized AI Accelerator system.
|
||||||
|
|
||||||
|
Humans must be able to:
|
||||||
|
|
||||||
|
- visually inspect the modeled topology,
|
||||||
|
- reason about communication structure and relative distance,
|
||||||
|
- do so at multiple abstraction levels without being overwhelmed by detail.
|
||||||
|
|
||||||
|
The simulator models distance (accumulated latency) as a first-class concept.
|
||||||
|
Diagrams must reflect this distance by default.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Global Defaults
|
||||||
|
|
||||||
|
- All diagrams MUST be **distance-aware by default**.
|
||||||
|
- All diagrams MUST render **representative views** of the architecture.
|
||||||
|
- Instance indices (e.g., sip0, cube2, pe3) MUST NOT be required for diagram generation.
|
||||||
|
- Instance indices MAY be used ONLY:
|
||||||
|
- to define a distance anchor in asymmetric or debugging scenarios, or
|
||||||
|
- when explicitly requested.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Representative Rendering Rule
|
||||||
|
|
||||||
|
- All CUBEs share the same internal structure.
|
||||||
|
- All PEs share the same internal structure.
|
||||||
|
|
||||||
|
Therefore:
|
||||||
|
|
||||||
|
- SIP-level diagrams render representative CUBEs and IO chiplets.
|
||||||
|
- CUBE-level diagrams render representative PEs as opaque blocks.
|
||||||
|
- PE-level diagrams render a representative PE with fully expanded internals.
|
||||||
|
|
||||||
|
Diagrams MUST NOT depend on specific SIP, CUBE, or PE indices
|
||||||
|
unless explicitly requested.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Diagram Views
|
||||||
|
|
||||||
|
### View A — SIP-Level Diagram
|
||||||
|
|
||||||
|
**Purpose**
|
||||||
|
Explain system-scale structure and connectivity.
|
||||||
|
|
||||||
|
**Visible elements**
|
||||||
|
|
||||||
|
- SIP boundaries (optional)
|
||||||
|
- CUBEs (opaque blocks)
|
||||||
|
- IO chiplets (opaque blocks)
|
||||||
|
- Optional UCIe stubs only if needed to clarify connectivity
|
||||||
|
|
||||||
|
**Hidden elements**
|
||||||
|
|
||||||
|
- PE internals
|
||||||
|
- CUBE internal fabric
|
||||||
|
- IO chiplet internals
|
||||||
|
|
||||||
|
**Visible links**
|
||||||
|
|
||||||
|
- Host ↔ IO chiplets (PCIe)
|
||||||
|
- SIP ↔ SIP (PCIe / UAL via switches)
|
||||||
|
- IO ↔ CUBE (on-package links)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### View B — CUBE-Level Diagram
|
||||||
|
|
||||||
|
**Purpose**
|
||||||
|
Explain cube-internal structure and data/control flow.
|
||||||
|
|
||||||
|
**Visible elements**
|
||||||
|
|
||||||
|
- XBAR (top/bottom): HBM pseudo-channel crossbar
|
||||||
|
- Bridge (left/right): cross-half HBM connectors between XBAR.top and XBAR.bottom
|
||||||
|
- NOC: distributed on-die fabric for non-HBM traffic
|
||||||
|
- HBM subsystem (HBM_CTRL)
|
||||||
|
- Shared SRAM: cube-level shared memory
|
||||||
|
- Management CPU (M_CPU)
|
||||||
|
- PEs as opaque blocks (PE[0..N−1])
|
||||||
|
- UCIe endpoints (N/E/W/S) as ports
|
||||||
|
|
||||||
|
**Hidden elements**
|
||||||
|
|
||||||
|
- PE internals
|
||||||
|
|
||||||
|
**Visible links**
|
||||||
|
|
||||||
|
- PE → XBAR (HBM data path, top or bottom by corner placement)
|
||||||
|
- PE → NOC (non-HBM data path)
|
||||||
|
- XBAR ↔ bridge ↔ XBAR (cross-half HBM access)
|
||||||
|
- XBAR → HBM_CTRL
|
||||||
|
- NOC ↔ UCIe endpoints
|
||||||
|
- NOC ↔ shared SRAM
|
||||||
|
- M_CPU ↔ NOC (command path)
|
||||||
|
- NOC → PE_CPU (command delivery, collapsed into PE block)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### View C — PE-Level Diagram
|
||||||
|
|
||||||
|
**Purpose**
|
||||||
|
Explain internal PE behavior and execution structure.
|
||||||
|
|
||||||
|
**Visible elements**
|
||||||
|
|
||||||
|
- PE_CPU
|
||||||
|
- Command handler / scheduler
|
||||||
|
- PE_TCM (local SRAM)
|
||||||
|
- HW accelerators (DMA, GEMM, MATH, etc.)
|
||||||
|
- Local HBM interface
|
||||||
|
- Optional IPCQ / messaging endpoints
|
||||||
|
|
||||||
|
**Visible links**
|
||||||
|
|
||||||
|
- Control paths (CPU → scheduler → engines)
|
||||||
|
- Data paths (engines ↔ TCM, DMA ↔ local HBM)
|
||||||
|
- External fabric ports as abstract ports only
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Distance-Aware Layout (Default)
|
||||||
|
|
||||||
|
### Distance definition
|
||||||
|
|
||||||
|
- Distance is defined as **accumulated latency**, consistent with ADR-0002.
|
||||||
|
- Distance is computed from a single anchor node.
|
||||||
|
|
||||||
|
### Default anchor selection
|
||||||
|
|
||||||
|
- SIP view: IO chiplet (or Host CPU if present)
|
||||||
|
- CUBE view: a representative PE
|
||||||
|
- PE view: PE_CPU or Command Handler
|
||||||
|
|
||||||
|
Anchors are **implicit defaults** and MUST NOT be required to be specified.
|
||||||
|
|
||||||
|
### Layout rules
|
||||||
|
|
||||||
|
- Diagrams MUST be laid out in layers based on distance buckets.
|
||||||
|
- Layout direction MUST be consistent within a view type
|
||||||
|
(preferred: left-to-right).
|
||||||
|
- Nodes with equal distance MUST have stable ordering
|
||||||
|
(by role or identifier, deterministically).
|
||||||
|
|
||||||
|
Cycles MAY be rendered using dashed or curved edges for readability,
|
||||||
|
without affecting distance semantics.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Generation Contract (for Tools / Claude Code)
|
||||||
|
|
||||||
|
When generating diagrams:
|
||||||
|
|
||||||
|
- Assume distance-aware layout by default.
|
||||||
|
- Assume representative rendering by default.
|
||||||
|
- Do NOT ask for SIP/CUBE/PE indices unless required.
|
||||||
|
- Do NOT expand hidden abstraction levels.
|
||||||
|
- Prefer architectural clarity over micro-hop fidelity.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
- Diagrams are stable across topology scaling.
|
||||||
|
- Changes in distance or routing policy are reflected visually.
|
||||||
|
- Diagrams serve as verifiable artifacts derived from the simulator model,
|
||||||
|
not as hand-maintained documentation.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Links
|
||||||
|
|
||||||
|
- SPEC Section 4 (Output, Debuggability, and Diagrams)
|
||||||
|
- ADR-0002 (Routing distance semantics)
|
||||||
|
- ADR-0006 (Topology compilation & automatic diagram generation)
|
||||||
@@ -0,0 +1,130 @@
|
|||||||
|
# ADR-0006: Topology Compilation, Distance Extraction, and Automatic Diagram Generation
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Accepted
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
The simulator compiles topology configuration (e.g., topology.yaml) into an explicit model graph,
|
||||||
|
and computes routing and accumulated latency (distance).
|
||||||
|
Diagrams should be generated from these authoritative artifacts to ensure consistency and avoid
|
||||||
|
hand-maintained topology drawings.
|
||||||
|
|
||||||
|
Additionally, for usability, diagrams should be emitted automatically into a stable location
|
||||||
|
so that developers can preview them immediately in the repository.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
### D1. Topology compilation is the single source of truth
|
||||||
|
|
||||||
|
- topology.yaml (or equivalent config) is compiled into:
|
||||||
|
- an explicit system graph,
|
||||||
|
- node/link attributes,
|
||||||
|
- routing policies.
|
||||||
|
This compiled graph is the authoritative representation of the system.
|
||||||
|
|
||||||
|
### D2. Distance extraction during compilation
|
||||||
|
|
||||||
|
- During or immediately after topology compilation, the simulator MUST compute distance metadata
|
||||||
|
(accumulated latency) consistent with ADR-0002.
|
||||||
|
- Distance metadata MUST be sufficient to support distance-aware diagram layout as defined in ADR-0005.
|
||||||
|
- Distributed fabric segments (e.g., NOC) MAY have distance_mm = 0 per ADR-0002 D4;
|
||||||
|
layout placement for such nodes uses explicit position metadata rather than distance buckets.
|
||||||
|
|
||||||
|
### D3. Diagram generation is a derived artifact
|
||||||
|
|
||||||
|
- Diagrams MUST be generated from:
|
||||||
|
- the compiled topology graph,
|
||||||
|
- extracted distance metadata,
|
||||||
|
- view/layout rules defined in ADR-0005.
|
||||||
|
- Diagram generation MUST NOT require additional hand-written topology descriptions.
|
||||||
|
|
||||||
|
### D4. Automatic diagram emission to the repository
|
||||||
|
|
||||||
|
- As part of topology compilation, the implementation MUST produce the following diagrams by default:
|
||||||
|
- SIP-level diagram (representative, distance-aware)
|
||||||
|
- CUBE-level diagram (representative, distance-aware)
|
||||||
|
- PE-level diagram (representative, distance-aware)
|
||||||
|
- The default output directory is:
|
||||||
|
- `docs/diagrams/`
|
||||||
|
- The generator MUST overwrite or update existing diagram files only when the compiled topology (or the diagram rules) has changed.
|
||||||
|
|
||||||
|
### D5. View-specific projection and layout
|
||||||
|
|
||||||
|
For each view (SIP / CUBE / PE):
|
||||||
|
|
||||||
|
- The generator MUST project the compiled graph into a reduced view graph:
|
||||||
|
- hide/collapse nodes according to ADR-0005,
|
||||||
|
- preserve connectivity semantics relevant to that view,
|
||||||
|
- compute distance buckets and assign layout layers deterministically.
|
||||||
|
- CUBE-level projection MUST include:
|
||||||
|
- XBAR (top/bottom), bridge (left/right), NOC, HBM_CTRL, shared SRAM, M_CPU, UCIe ports,
|
||||||
|
and PEs as opaque blocks.
|
||||||
|
- Distinct edge kinds for HBM path (PE→XBAR) vs non-HBM path (PE→NOC).
|
||||||
|
- Default anchors are implicit (ADR-0005) and MUST NOT require instance indices.
|
||||||
|
|
||||||
|
### D6. Output formats and determinism
|
||||||
|
|
||||||
|
- The generator MUST output at least one of:
|
||||||
|
- Mermaid (Markdown-native)
|
||||||
|
- Graphviz DOT (rank-based control)
|
||||||
|
- SVG (mm-accurate layout, no external dependencies)
|
||||||
|
- SVG is preferred when mm-accurate position metadata is available from the compiled topology.
|
||||||
|
- Output MUST be deterministic:
|
||||||
|
- same topology + same rules → identical diagram text
|
||||||
|
- File naming MUST be deterministic and stable (see "Output Conventions").
|
||||||
|
|
||||||
|
### D7. Performance and caching
|
||||||
|
|
||||||
|
- Diagram generation MAY be lazy and/or cached, as long as the outputs in `docs/diagrams/`
|
||||||
|
remain consistent with the compiled topology.
|
||||||
|
- The implementation SHOULD use a cache key based on:
|
||||||
|
- topology content hash,
|
||||||
|
- routing policy version,
|
||||||
|
- diagram rules version,
|
||||||
|
- view type (SIP/CUBE/PE).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Output Conventions
|
||||||
|
|
||||||
|
### Directory
|
||||||
|
|
||||||
|
- `docs/diagrams/` is the canonical output directory for generated diagrams.
|
||||||
|
|
||||||
|
### File names (recommended, deterministic)
|
||||||
|
|
||||||
|
- `system_view.svg` / `system_view.mmd` / `system_view.dot`
|
||||||
|
- `sip_view.svg` / `sip_view.mmd` / `sip_view.dot`
|
||||||
|
- `cube_view.svg` / `cube_view.mmd` / `cube_view.dot`
|
||||||
|
- `pe_view.svg` / `pe_view.mmd` / `pe_view.dot`
|
||||||
|
|
||||||
|
Optionally, for multi-topology workflows:
|
||||||
|
|
||||||
|
- `sip_view__{topology_id}.svg`
|
||||||
|
- `cube_view__{topology_id}.svg`
|
||||||
|
- `pe_view__{topology_id}.svg`
|
||||||
|
|
||||||
|
### Repository policy
|
||||||
|
|
||||||
|
- Generated diagram files MAY be committed to the repository to enable diff-based review.
|
||||||
|
- If committed, they MUST be reproducible from topology compilation.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
- Diagrams are always consistent with simulator behavior.
|
||||||
|
- Architectural changes automatically propagate to visualizations.
|
||||||
|
- Diagram diffs become meaningful indicators of architectural change.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Links
|
||||||
|
|
||||||
|
- SPEC Section 4 (Output, Debuggability, and Diagrams)
|
||||||
|
- ADR-0002 (Distance semantics)
|
||||||
|
- ADR-0005 (Diagram views and layout rules)
|
||||||
@@ -0,0 +1,89 @@
|
|||||||
|
# ADR-0007: Runtime API and Simulation Engine Boundaries
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Accepted
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
The simulator consists of multiple layers with distinct responsibilities:
|
||||||
|
|
||||||
|
- a host-facing API layer used by benchmarks and user code,
|
||||||
|
- a discrete-event simulation engine that executes requests,
|
||||||
|
- device components that model hardware behavior.
|
||||||
|
|
||||||
|
Without strict boundaries, orchestration logic can leak into components,
|
||||||
|
or simulation internals can become entangled with user-facing APIs.
|
||||||
|
|
||||||
|
This ADR defines clear responsibility boundaries between:
|
||||||
|
|
||||||
|
- runtime API,
|
||||||
|
- simulation engine (sim_engine),
|
||||||
|
- hardware components.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
### D1. Runtime API is host-facing orchestration only
|
||||||
|
|
||||||
|
The runtime API represents host/driver-level behavior and MUST:
|
||||||
|
|
||||||
|
- expose high-level operations (tensor deployment, kernel launch),
|
||||||
|
- submit requests only to endpoint components (e.g., IO_CPU),
|
||||||
|
- await completion via futures/handles,
|
||||||
|
- own and persist host-side metadata (tensor allocation maps, kernel bindings).
|
||||||
|
|
||||||
|
The runtime API MUST NOT:
|
||||||
|
|
||||||
|
- hardcode hop-by-hop routing or fan-out,
|
||||||
|
- directly invoke internal components (M_CPU, PE_CPU, engines),
|
||||||
|
- embed topology- or routing-specific assumptions.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D2. Simulation engine executes and schedules requests
|
||||||
|
|
||||||
|
The simulation engine (sim_engine) MUST:
|
||||||
|
|
||||||
|
- inject requests into the compiled topology graph,
|
||||||
|
- schedule and execute events using a discrete-event model,
|
||||||
|
- manage correlation ids and completion tracking,
|
||||||
|
- decompose operations into low-level requests when required
|
||||||
|
(e.g., MemoryWrite events).
|
||||||
|
|
||||||
|
The simulation engine MUST NOT:
|
||||||
|
|
||||||
|
- define tensor semantics,
|
||||||
|
- define kernel execution policies,
|
||||||
|
- expose internal graph details to the runtime API.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D3. Components own fan-out and aggregation
|
||||||
|
|
||||||
|
Device-side components MUST:
|
||||||
|
|
||||||
|
- fan out requests to downstream domains
|
||||||
|
(IO_CPU → M_CPU → PE_CPU → schedulers/engines),
|
||||||
|
- aggregate completion and failure signals,
|
||||||
|
- propagate results deterministically upstream.
|
||||||
|
|
||||||
|
Neither the runtime API nor the simulation engine may orchestrate
|
||||||
|
component-level fan-out explicitly.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
- Runtime APIs remain stable as topology and routing evolve.
|
||||||
|
- Simulation internals can change without affecting user-facing code.
|
||||||
|
- Component implementations remain swappable via DI.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Links
|
||||||
|
|
||||||
|
- SPEC R4, R7, R8
|
||||||
|
- ADR-0008 (Tensor deployment)
|
||||||
|
- ADR-0009 (Kernel execution)
|
||||||
@@ -0,0 +1,100 @@
|
|||||||
|
# ADR-0008: Tensor Deployment and Allocation (Host Allocator, PA-first)
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Accepted
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
Benchmarks require PyTorch-like tensor semantics:
|
||||||
|
|
||||||
|
- tensor creation (empty, fill),
|
||||||
|
- deployment to accelerator devices (tensor.to()).
|
||||||
|
|
||||||
|
In the realistic system, host software manages allocation/mapping and installs
|
||||||
|
mappings for DMA/MMU. For Phase 0 we simplify (ADR-0011):
|
||||||
|
|
||||||
|
- device memory operations use PA only,
|
||||||
|
- VA/MMU/IOMMU is not modeled.
|
||||||
|
|
||||||
|
To keep the host↔device interface minimal, we avoid a separate
|
||||||
|
AllocateTensorMeta message. Instead, host allocation produces a PA shard map
|
||||||
|
that is used directly by MemoryWrite/Read and KernelLaunch.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
### D1. Tensor is a host-owned handle with PA shard mapping
|
||||||
|
|
||||||
|
A Tensor object is a host-owned handle that encapsulates:
|
||||||
|
|
||||||
|
- shape and dtype,
|
||||||
|
- initialization intent,
|
||||||
|
- device placement and allocation metadata as a PA shard map.
|
||||||
|
|
||||||
|
After deployment, the Tensor handle MUST contain:
|
||||||
|
|
||||||
|
- a list of shards, each with (sip,cube,pe,pa,nbytes,offset_bytes).
|
||||||
|
|
||||||
|
This PA shard mapping is the single source of truth for kernel argument binding.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D2. Deployment uses a host allocator (Phase 0)
|
||||||
|
|
||||||
|
In Phase 0, tensor deployment produces PA shard mappings via a host allocator:
|
||||||
|
|
||||||
|
- placement (split/replicate/hybrid) is decided by a DP policy,
|
||||||
|
- allocation assigns PA ranges at the PE level and returns shard mappings,
|
||||||
|
- the Tensor handle stores the resulting shard list deterministically.
|
||||||
|
|
||||||
|
No separate host-visible device allocation RPC is required in Phase 0.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D3. Data initialization and transfer uses MemoryWrite/Read only
|
||||||
|
|
||||||
|
Any data initialization or transfer implied by a tensor (e.g., fill, copy)
|
||||||
|
MUST be represented using Host ↔ IO_CPU messages only:
|
||||||
|
|
||||||
|
- MemoryWrite
|
||||||
|
- MemoryRead
|
||||||
|
|
||||||
|
Rules:
|
||||||
|
|
||||||
|
- MemoryWrite/Read MUST reference PA + (sip,cube,pe) tags (ADR-0012).
|
||||||
|
- Allocation metadata MUST NOT be sent as a separate allocation message.
|
||||||
|
- Bulk tensor data MUST NOT be embedded in Phase 0 messages.
|
||||||
|
|
||||||
|
The simulation engine schedules MemoryWrite/Read through the graph so that
|
||||||
|
latency is computed by explicit traversal.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D4. Extension path (non-breaking)
|
||||||
|
|
||||||
|
Future ADRs MAY introduce optional VA/MMU/IOMMU modeling by adding:
|
||||||
|
|
||||||
|
- virtual addressing in tensor handles,
|
||||||
|
- mapping install steps,
|
||||||
|
- translation latency/page granularity.
|
||||||
|
|
||||||
|
The Phase 0 PA shard map remains a valid fast-path configuration.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
- Host↔IO_CPU contract remains minimal (MemoryRead/Write + KernelLaunch).
|
||||||
|
- KernelLaunch can pass per-PE data placement explicitly via shard tags.
|
||||||
|
- Early implementation stays simple and testable.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Links
|
||||||
|
|
||||||
|
- ADR-0011 (PA-first)
|
||||||
|
- ADR-0012 (Host↔IO_CPU schema)
|
||||||
|
- ADR-0007 (runtime_api vs sim_engine boundaries)
|
||||||
|
- ADR-0009 (Kernel execution)
|
||||||
@@ -0,0 +1,74 @@
|
|||||||
|
# ADR-0009: Kernel Execution Messaging and Completion Semantics
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Accepted
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
Kernel execution is initiated by the host and proceeds through
|
||||||
|
device control components:
|
||||||
|
|
||||||
|
Host → IO_CPU → M_CPU → PE_CPU → schedulers → engines
|
||||||
|
|
||||||
|
Completion propagates in reverse order.
|
||||||
|
|
||||||
|
To keep benchmarks simple and topology-agnostic,
|
||||||
|
kernel execution must be endpoint-driven with deterministic aggregation.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
### D1. Kernel launch is an endpoint request
|
||||||
|
|
||||||
|
A kernel launch is initiated by submitting a single KernelLaunch request
|
||||||
|
to the IO_CPU endpoint.
|
||||||
|
|
||||||
|
The runtime API MUST:
|
||||||
|
|
||||||
|
- construct the kernel launch request,
|
||||||
|
- submit it to IO_CPU,
|
||||||
|
- await a single completion result.
|
||||||
|
|
||||||
|
The runtime API MUST NOT orchestrate internal fan-out.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D2. Tensor arguments are passed by metadata
|
||||||
|
|
||||||
|
KernelLaunch requests MUST reference tensor arguments via:
|
||||||
|
|
||||||
|
- host-owned tensor handles, or
|
||||||
|
- resolved device address maps derived from those handles.
|
||||||
|
|
||||||
|
Bulk tensor data MUST NOT be embedded in kernel launch messages.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D3. Fan-out and aggregation are component responsibilities
|
||||||
|
|
||||||
|
- IO_CPU fans out work to M_CPUs.
|
||||||
|
- M_CPU fans out work to PE_CPUs.
|
||||||
|
- PE_CPU manages kernel execution and engine dispatch.
|
||||||
|
|
||||||
|
Completion semantics:
|
||||||
|
|
||||||
|
- M_CPU completes when all targeted PEs complete or a failure policy triggers.
|
||||||
|
- IO_CPU completes when all targeted CUBEs complete or a failure policy triggers.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D4. Completion and failure propagation
|
||||||
|
|
||||||
|
- All messages MUST carry correlation identifiers.
|
||||||
|
- Completion and failure MUST propagate deterministically to the host.
|
||||||
|
- The simulation engine provides futures/handles to observe completion.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Links
|
||||||
|
|
||||||
|
- SPEC R1, R2, R7, R8
|
||||||
|
- ADR-0007 (Runtime API boundaries)
|
||||||
|
- ADR-0008 (Tensor deployment)
|
||||||
@@ -0,0 +1,62 @@
|
|||||||
|
# ADR-0010: CLI Device Selection and Multi-Device Execution Semantics
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Accepted
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
Benchmarks represent device-agnostic workloads that operate on a single device.
|
||||||
|
Users may want to run a benchmark:
|
||||||
|
|
||||||
|
- on a specific device, or
|
||||||
|
- across all devices in the system.
|
||||||
|
|
||||||
|
Device enumeration must not leak into benchmarks or runtime APIs.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
### D1. Benchmarks are single-device by design
|
||||||
|
|
||||||
|
- A benchmark MUST define behavior for a single device only.
|
||||||
|
- A benchmark MUST accept a device identifier as input.
|
||||||
|
- Benchmarks MUST NOT enumerate or loop over multiple devices.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D2. CLI controls device selection
|
||||||
|
|
||||||
|
The `kernbench run` command supports an optional `--device` argument:
|
||||||
|
|
||||||
|
- If `--device <id>` is specified:
|
||||||
|
- the benchmark executes once for the specified device.
|
||||||
|
|
||||||
|
- If `--device` is omitted:
|
||||||
|
- the benchmark executes once using all the SIPs discovered in the topology.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D3. Multi-device execution is logically parallel
|
||||||
|
|
||||||
|
When running on multiple devices:
|
||||||
|
|
||||||
|
- benchmark executions are submitted to a single simulation engine instance,
|
||||||
|
- executions are logically parallel in simulation time,
|
||||||
|
- inter-device contention is naturally modeled.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D4. Runtime API and simulation engine remain device-scoped
|
||||||
|
|
||||||
|
- Runtime API calls operate on one device per invocation.
|
||||||
|
- The simulation engine schedules all requests deterministically.
|
||||||
|
- Neither layer enumerates devices.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Links
|
||||||
|
|
||||||
|
- SPEC R7, R8
|
||||||
|
- ADR-0007 (Runtime API boundaries)
|
||||||
@@ -0,0 +1,65 @@
|
|||||||
|
# ADR-0011: Memory Addressing Simplification (PA-first)
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Accepted
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
A realistic system uses host-side virtual addressing and an MMU/IOMMU-style
|
||||||
|
translation path for DMA: host allocates physical memory at PE level, maps it
|
||||||
|
into a virtual address space, installs mappings, and DMA requests use virtual
|
||||||
|
addresses that are translated to physical addresses.
|
||||||
|
|
||||||
|
For early development, we want a minimal, deterministic model that enables:
|
||||||
|
|
||||||
|
- correct routing and latency accounting through the graph,
|
||||||
|
- stable tensor deployment and kernel execution semantics,
|
||||||
|
- future extension toward VA/MMU without rewriting workflows.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
### D1. Phase 0 model is PA-only
|
||||||
|
|
||||||
|
The simulator uses a PA-first model:
|
||||||
|
|
||||||
|
- All device memory accesses (MemoryRead/MemoryWrite) operate on device physical
|
||||||
|
addresses (PA) plus size.
|
||||||
|
- Tensor handles store PA-based shard mappings after deployment.
|
||||||
|
- KernelLaunch passes tensor arguments as PA-based mappings (or references to them).
|
||||||
|
- MMU/IOMMU concepts (virtual address spaces, page tables, translation latency)
|
||||||
|
are NOT modeled in Phase 0.
|
||||||
|
|
||||||
|
### D2. Allocation produces PA mappings
|
||||||
|
|
||||||
|
Device allocation selects PE-local memory regions and returns PA mappings
|
||||||
|
sufficient to execute kernels and issue DMA requests.
|
||||||
|
|
||||||
|
### D3. Extension path (non-breaking)
|
||||||
|
|
||||||
|
A future ADR MAY introduce an optional VA/MMU layer by:
|
||||||
|
|
||||||
|
- introducing virtual addresses in tensor handles,
|
||||||
|
- adding a mapping-install step,
|
||||||
|
- modeling translation latency and page granularity.
|
||||||
|
|
||||||
|
The Phase 0 PA model remains a valid fast-path configuration.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
- Early implementation stays simple and testable.
|
||||||
|
- All latency remains explicit via graph traversal, not hidden translation.
|
||||||
|
- Future VA/MMU modeling can be added without breaking existing benchmarks.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Links
|
||||||
|
|
||||||
|
- ADR-0007 (runtime_api vs sim_engine boundaries)
|
||||||
|
- ADR-0008 (tensor deployment)
|
||||||
|
- ADR-0009 (kernel execution)
|
||||||
|
- SPEC R2 (latency by traversal)
|
||||||
@@ -0,0 +1,232 @@
|
|||||||
|
# ADR-0012: Host ↔ IO_CPU Message Schema (PA-first, PE-tagged)
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Accepted
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
Phase 0 uses a PA-first memory model (ADR-0011):
|
||||||
|
|
||||||
|
- memory operations use device physical addresses (PA) only,
|
||||||
|
- VA/MMU/IOMMU is not modeled.
|
||||||
|
|
||||||
|
The host-facing runtime API interacts with the device via the IO_CPU endpoint.
|
||||||
|
We define stable, minimal message schemas for Host ↔ IO_CPU so that:
|
||||||
|
|
||||||
|
- benchmarks remain stable,
|
||||||
|
- IO_CPU-internal fan-out/aggregation can evolve independently,
|
||||||
|
- completion and failure propagation is deterministic.
|
||||||
|
|
||||||
|
We also require PE-tagging (Option A): each shard explicitly carries (sip,cube,pe)
|
||||||
|
so IO_CPU can deterministically route/fan-out without relying on PA decoding.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
### D1. Contract scope
|
||||||
|
|
||||||
|
This schema is the stable contract ONLY for Host ↔ IO_CPU.
|
||||||
|
|
||||||
|
Messages beyond IO_CPU (to M_CPU, PE_CPU, schedulers, engines) are component-internal
|
||||||
|
and are NOT part of this host contract in Phase 0.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D2. Required message set
|
||||||
|
|
||||||
|
The runtime API MUST use only these message types for Host ↔ IO_CPU:
|
||||||
|
|
||||||
|
- MemoryWrite
|
||||||
|
- MemoryRead
|
||||||
|
- KernelLaunch
|
||||||
|
|
||||||
|
All operations required by benchmarks (tensor init/copy, kernel run) MUST be expressible
|
||||||
|
with these messages.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D3. Common envelope (mandatory for all requests)
|
||||||
|
|
||||||
|
All Host ↔ IO_CPU requests MUST include:
|
||||||
|
|
||||||
|
- `msg_type: str`
|
||||||
|
- `correlation_id: str`
|
||||||
|
- generated by the host
|
||||||
|
- used to match responses deterministically
|
||||||
|
- `request_id: str`
|
||||||
|
- unique within a correlation_id
|
||||||
|
- `target_device: str`
|
||||||
|
- device identifier (e.g., "sip:0")
|
||||||
|
- `timestamp_tag: str | None` (optional)
|
||||||
|
- debug tag only; MUST NOT affect determinism
|
||||||
|
|
||||||
|
All Host ↔ IO_CPU responses MUST include:
|
||||||
|
|
||||||
|
- `correlation_id: str`
|
||||||
|
- `request_id: str`
|
||||||
|
- `completion: Completion`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D4. Completion schema (mandatory)
|
||||||
|
|
||||||
|
`Completion` MUST have:
|
||||||
|
|
||||||
|
- `ok: bool`
|
||||||
|
- `error_code: str | None`
|
||||||
|
- `error_message: str | None`
|
||||||
|
|
||||||
|
Rules:
|
||||||
|
|
||||||
|
- If `ok == true` then `error_code` and `error_message` MUST be null.
|
||||||
|
- If `ok == false` then `error_code` MUST be non-null.
|
||||||
|
- Completion semantics MUST be deterministic.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D5. MemoryWrite schema (PA-first, PE-tagged)
|
||||||
|
|
||||||
|
`MemoryWrite` represents a host-initiated write/initialize operation to device memory.
|
||||||
|
|
||||||
|
Mandatory fields:
|
||||||
|
|
||||||
|
- common envelope fields (D3)
|
||||||
|
- destination placement tags (Option A):
|
||||||
|
- `dst_sip: int`
|
||||||
|
- `dst_cube: int`
|
||||||
|
- `dst_pe: int`
|
||||||
|
- `dst_pa: int`
|
||||||
|
- destination physical address in the destination PE's address space
|
||||||
|
- `nbytes: int`
|
||||||
|
- `src_kind: "pattern" | "host_buffer_ref"`
|
||||||
|
- Phase 0 MUST support "pattern"
|
||||||
|
- `pattern: Pattern | None`
|
||||||
|
- required if `src_kind == "pattern"`
|
||||||
|
|
||||||
|
`Pattern` (Phase 0 mandatory support):
|
||||||
|
|
||||||
|
- `pattern_kind: "zero" | "fill_u8" | "fill_u16" | "fill_u32" | "fill_fp16" | "fill_fp32"`
|
||||||
|
- `value: number | None`
|
||||||
|
- required for fill_*; ignored for zero
|
||||||
|
|
||||||
|
Optional fields:
|
||||||
|
|
||||||
|
- `dst_mem_kind: "HBM" | "TCM" | "AUTO"` (default "AUTO")
|
||||||
|
- `debug_label: str | None`
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
|
||||||
|
- This message MUST NOT embed bulk tensor data in Phase 0.
|
||||||
|
- All latency MUST come from explicit graph traversal and modeled components.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D6. MemoryRead schema (PA-first, PE-tagged)
|
||||||
|
|
||||||
|
`MemoryRead` represents a host-initiated read from device memory.
|
||||||
|
|
||||||
|
Mandatory fields:
|
||||||
|
|
||||||
|
- common envelope fields (D3)
|
||||||
|
- source placement tags (Option A):
|
||||||
|
- `src_sip: int`
|
||||||
|
- `src_cube: int`
|
||||||
|
- `src_pe: int`
|
||||||
|
- `src_pa: int`
|
||||||
|
- `nbytes: int`
|
||||||
|
|
||||||
|
Optional fields:
|
||||||
|
|
||||||
|
- `dst_kind: "host_sink" | "discard"` (default "host_sink")
|
||||||
|
- `debug_label: str | None`
|
||||||
|
|
||||||
|
Response payload:
|
||||||
|
|
||||||
|
- actual bytes are NOT required in Phase 0 (latency/traces focus)
|
||||||
|
- implementations MAY return lightweight stats or hashes later via a new ADR
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D7. KernelLaunch schema (PA-first, PE-tagged shards)
|
||||||
|
|
||||||
|
`KernelLaunch` represents launching a kernel on a target device via IO_CPU.
|
||||||
|
|
||||||
|
Mandatory fields:
|
||||||
|
|
||||||
|
- common envelope fields (D3)
|
||||||
|
- `kernel_ref: KernelRef`
|
||||||
|
- `args: list[KernelArg]`
|
||||||
|
|
||||||
|
`KernelRef` MUST have:
|
||||||
|
|
||||||
|
- `name: str`
|
||||||
|
- `kind: "deployed" | "builtin"`
|
||||||
|
- `deploy_pa: int | None` — PA where kernel binary was deployed (required for "deployed")
|
||||||
|
- `deploy_sip: int` — SIP where binary resides
|
||||||
|
- `deploy_cube: int` — cube where binary resides
|
||||||
|
- `deploy_pe: int` — PE where binary resides
|
||||||
|
- `nbytes_code: int` — kernel binary size (for BW modeling)
|
||||||
|
|
||||||
|
Kernel binaries MUST be pre-deployed to device memory via MemoryWrite.
|
||||||
|
KernelLaunch MUST NOT embed kernel source code or IR in the launch message.
|
||||||
|
|
||||||
|
`KernelArg` supports tensor args by PA mapping and scalars by value.
|
||||||
|
|
||||||
|
Tensor arg (mandatory):
|
||||||
|
|
||||||
|
- `arg_kind: "tensor"`
|
||||||
|
- `tensor_pa_map: TensorPAMap`
|
||||||
|
|
||||||
|
`TensorPAMap` MUST have:
|
||||||
|
|
||||||
|
- `shards: list[TensorShard]`
|
||||||
|
|
||||||
|
`TensorShard` MUST have (Option A, enforced):
|
||||||
|
|
||||||
|
- `sip: int`
|
||||||
|
- `cube: int`
|
||||||
|
- `pe: int`
|
||||||
|
- `pa: int`
|
||||||
|
- `nbytes: int`
|
||||||
|
- `offset_bytes: int`
|
||||||
|
|
||||||
|
Scalar arg (mandatory):
|
||||||
|
|
||||||
|
- `arg_kind: "scalar"`
|
||||||
|
- `dtype: "i32" | "i64" | "fp16" | "fp32" | "bool"`
|
||||||
|
- `value: number | bool`
|
||||||
|
|
||||||
|
Optional KernelLaunch fields:
|
||||||
|
|
||||||
|
- `grid: dict | None`
|
||||||
|
- `meta: dict | None`
|
||||||
|
- `failure_policy: "fail_fast" | "collect_all"` (default "fail_fast")
|
||||||
|
- `debug_label: str | None`
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
|
||||||
|
- KernelLaunch MUST NOT embed bulk tensor data.
|
||||||
|
- KernelLaunch MUST be submitted only to the IO_CPU endpoint.
|
||||||
|
- IO_CPU MUST fan out work internally using the shard (sip,cube,pe) tags.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Verification Notes
|
||||||
|
|
||||||
|
Tests SHOULD validate:
|
||||||
|
|
||||||
|
- schema validation rejects missing mandatory fields,
|
||||||
|
- deterministic correlation/response matching,
|
||||||
|
- MemoryWrite/Read/KernelLaunch produce explicit hop traces,
|
||||||
|
- all routed requests incur latency > 0.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Links
|
||||||
|
|
||||||
|
- ADR-0011 (PA-first memory addressing)
|
||||||
|
- ADR-0007 (runtime_api vs sim_engine boundaries)
|
||||||
|
- ADR-0009 (kernel execution fan-out/aggregation)
|
||||||
|
- SPEC R2, R7, R8
|
||||||
@@ -0,0 +1,139 @@
|
|||||||
|
# ADR-0013: Verification Strategy and Phase 1 Test Plan
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Accepted
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
KernBench is a system-level simulator whose correctness is defined by:
|
||||||
|
|
||||||
|
- adherence to SPEC-defined invariants,
|
||||||
|
- determinism and debuggability,
|
||||||
|
- explicit modeling of routing and latency.
|
||||||
|
|
||||||
|
Given the evolving implementation, we need a stable verification strategy
|
||||||
|
that prevents architectural drift while allowing incremental development.
|
||||||
|
|
||||||
|
This ADR defines the Phase 1 verification plan and what constitutes
|
||||||
|
"correct behavior" for early implementations.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
### D1. Verification is contract-based
|
||||||
|
|
||||||
|
Verification MUST be derived from:
|
||||||
|
|
||||||
|
- SPEC requirements,
|
||||||
|
- accepted ADRs.
|
||||||
|
|
||||||
|
Tests MUST validate architectural contracts, not incidental implementation details.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D2. Phase 1 verification scope
|
||||||
|
|
||||||
|
Phase 1 verification focuses on:
|
||||||
|
|
||||||
|
- message contract validity (ADR-0012),
|
||||||
|
- routing and fan-out semantics at the IO_CPU boundary (ADR-0009),
|
||||||
|
- PA-first memory addressing and shard tagging (ADR-0011),
|
||||||
|
- core latency and trace invariants (SPEC 0.1, R2).
|
||||||
|
|
||||||
|
Microarchitectural accuracy, bandwidth contention, and cycle-level behavior
|
||||||
|
are explicitly out of scope in Phase 1.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D3. Required Phase 1 verification cases
|
||||||
|
|
||||||
|
The following verification cases MUST be supported by the implementation:
|
||||||
|
|
||||||
|
#### V1. Message schema validation
|
||||||
|
|
||||||
|
- KernelLaunch requests missing `(sip, cube, pe)` in any tensor shard MUST be rejected.
|
||||||
|
- MemoryWrite/MemoryRead requests missing destination/source placement tags MUST be rejected.
|
||||||
|
- Completion results MUST follow the `ok / error_code / error_message` contract.
|
||||||
|
|
||||||
|
#### V2. IO_CPU fan-out and aggregation
|
||||||
|
|
||||||
|
Given:
|
||||||
|
|
||||||
|
- a topology with one SIP, one CUBE, and two PEs,
|
||||||
|
- a KernelLaunch request containing two tensor shards targeting different PEs,
|
||||||
|
|
||||||
|
The system MUST:
|
||||||
|
|
||||||
|
- submit a single KernelLaunch to IO_CPU,
|
||||||
|
- fan out work internally to both PEs,
|
||||||
|
- aggregate completion and return a single deterministic completion to the host.
|
||||||
|
|
||||||
|
#### V3. Latency and trace invariants
|
||||||
|
|
||||||
|
For any valid request:
|
||||||
|
|
||||||
|
- the hop-by-hop trace MUST be non-empty,
|
||||||
|
- total latency MUST be greater than zero,
|
||||||
|
- repeated runs with identical inputs MUST produce identical traces.
|
||||||
|
|
||||||
|
#### V4. Topology independence and cross-domain coverage
|
||||||
|
|
||||||
|
Verification cases MUST pass for multiple topology shapes, including:
|
||||||
|
|
||||||
|
- minimal: (1 SIP, 1 CUBE, 1 PE)
|
||||||
|
- multi-PE: (1 SIP, 1 CUBE, N PEs)
|
||||||
|
- multi-CUBE within a SIP: (1 SIP, M CUBEs, ≥1 PE per CUBE)
|
||||||
|
- multi-SIP tray: (K SIPs, ≥1 CUBE per SIP, ≥1 PE per CUBE)
|
||||||
|
|
||||||
|
For multi-CUBE and multi-SIP topologies, Phase 1 verification focuses on:
|
||||||
|
|
||||||
|
- explicit connectivity (required links exist),
|
||||||
|
- deterministic routing and control-path traversal,
|
||||||
|
- non-empty traces and latency > 0 for representative cross-domain requests
|
||||||
|
(inter-CUBE and inter-SIP paths).
|
||||||
|
|
||||||
|
Tests MUST NOT hardcode topology sizes, node ids, or link counts.
|
||||||
|
Instead, tests MUST derive expectations from the compiled topology metadata.
|
||||||
|
---
|
||||||
|
|
||||||
|
### D4. Phase 1 artifacts
|
||||||
|
|
||||||
|
Phase 1 MAY include:
|
||||||
|
|
||||||
|
- verification-only test code,
|
||||||
|
- topology fixtures,
|
||||||
|
- trace inspection utilities.
|
||||||
|
|
||||||
|
Phase 1 MUST NOT require:
|
||||||
|
|
||||||
|
- production code changes solely to satisfy tests,
|
||||||
|
- weakening or removing tests to allow progress.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D5. Phase 2 enforcement
|
||||||
|
|
||||||
|
Phase 2 (Apply) MUST:
|
||||||
|
|
||||||
|
- run the Phase 1 verification cases,
|
||||||
|
- roll back all changes if any verification fails,
|
||||||
|
- preserve tests as authoritative contracts.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
- Architectural correctness is enforced early.
|
||||||
|
- Tests serve as executable documentation of system behavior.
|
||||||
|
- Implementation remains flexible without losing rigor.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Links
|
||||||
|
|
||||||
|
- SPEC 0.1, R2, R6
|
||||||
|
- ADR-0011 (PA-first memory addressing)
|
||||||
|
- ADR-0012 (Host ↔ IO_CPU message schema)
|
||||||
|
- ADR-0009 (Kernel execution semantics)
|
||||||
@@ -0,0 +1,364 @@
|
|||||||
|
# ADR-0014: PE Internal Execution Model (PE_CPU, PE_SCHEDULER, and Composite Commands)
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Proposed
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
ADR-0003 (system hierarchy) and ADR-0009 (kernel execution semantics) reference PE internals but do not define:
|
||||||
|
|
||||||
|
- the dispatch model inside a PE,
|
||||||
|
- the responsibilities of PE_SCHEDULER,
|
||||||
|
- the PE_TCM-centric dataflow contract used by accelerator engines.
|
||||||
|
|
||||||
|
We need a deterministic and debuggable PE-internal execution contract that supports:
|
||||||
|
|
||||||
|
- simple single-engine commands
|
||||||
|
- composite commands that build a tiled pipeline across DMA and accelerator engines
|
||||||
|
|
||||||
|
The simulator must produce deterministic traces and allow modeling of PE-internal pipelining without introducing nondeterministic engine scheduling.
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
### D1. PE internal component roles
|
||||||
|
|
||||||
|
Each PE contains the following logical components.
|
||||||
|
|
||||||
|
**PE_CPU**
|
||||||
|
|
||||||
|
- Executes kernel instruction stream or kernel control logic.
|
||||||
|
- Generates PE commands.
|
||||||
|
- Submits commands to PE_SCHEDULER.
|
||||||
|
- PE_CPU does NOT enqueue work directly into engine queues.
|
||||||
|
|
||||||
|
**PE_SCHEDULER**
|
||||||
|
|
||||||
|
- The sole dispatcher inside a PE.
|
||||||
|
- Receives commands from PE_CPU.
|
||||||
|
- Expands composite commands into sub-commands.
|
||||||
|
- Tracks dependencies and command state.
|
||||||
|
- Dispatches work to engine queues.
|
||||||
|
- Manages tile scheduling for composite commands.
|
||||||
|
|
||||||
|
**PE_DMA**
|
||||||
|
|
||||||
|
- Handles memory transfers between PE_TCM and external memory domains.
|
||||||
|
- PE_DMA has **dual egress** at the CUBE level:
|
||||||
|
- **→ XBAR**: dedicated path to HBM (local and cross-half via bridge)
|
||||||
|
- **→ NOC**: path to non-HBM destinations (shared SRAM, inter-cube UCIe, etc.)
|
||||||
|
- Supported directions include:
|
||||||
|
- HBM → PE_TCM (via XBAR)
|
||||||
|
- PE_TCM → HBM (via XBAR)
|
||||||
|
- PE_TCM ↔ shared SRAM (via NOC)
|
||||||
|
- PE_TCM → other memory domains (via NOC, if supported by topology)
|
||||||
|
|
||||||
|
**PE_GEMM**
|
||||||
|
|
||||||
|
- Matrix multiplication engine.
|
||||||
|
- Reads activations from PE_TCM.
|
||||||
|
- May stream weights directly from HBM.
|
||||||
|
|
||||||
|
**PE_MATH**
|
||||||
|
|
||||||
|
- Element-wise computation engine.
|
||||||
|
- Reads and writes PE_TCM.
|
||||||
|
|
||||||
|
**PE_TCM**
|
||||||
|
|
||||||
|
- Local SRAM used as the staging memory for accelerator operations.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D2. Command lifecycle and queues
|
||||||
|
|
||||||
|
PE_SCHEDULER maintains three logical structures.
|
||||||
|
|
||||||
|
**SubmissionQueue**
|
||||||
|
|
||||||
|
- Written by PE_CPU.
|
||||||
|
- Contains incoming PE commands waiting to be processed.
|
||||||
|
|
||||||
|
**InflightTable**
|
||||||
|
|
||||||
|
- Owned and mutated only by PE_SCHEDULER.
|
||||||
|
- Tracks:
|
||||||
|
- expanded sub-commands
|
||||||
|
- dependency state
|
||||||
|
- engine assignment
|
||||||
|
- completion status
|
||||||
|
|
||||||
|
**CompletionQueue**
|
||||||
|
|
||||||
|
- Written by PE_SCHEDULER.
|
||||||
|
- Contains final completion records for commands.
|
||||||
|
|
||||||
|
**Single-writer rule**
|
||||||
|
|
||||||
|
- Only PE_SCHEDULER is allowed to mutate command completion state.
|
||||||
|
- Engine components must report completion via explicit completion events/messages.
|
||||||
|
|
||||||
|
**Command completion**
|
||||||
|
|
||||||
|
A command becomes DONE when:
|
||||||
|
|
||||||
|
- all sub-commands have completed, and
|
||||||
|
- PE_SCHEDULER publishes a completion record to CompletionQueue.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D3. Dispatch modes
|
||||||
|
|
||||||
|
PE commands are divided into two categories.
|
||||||
|
|
||||||
|
#### D3.1 Simple command
|
||||||
|
|
||||||
|
A simple command expands to exactly one engine sub-command.
|
||||||
|
|
||||||
|
Examples include:
|
||||||
|
|
||||||
|
- DMA transfer
|
||||||
|
- GEMM compute
|
||||||
|
- MATH compute
|
||||||
|
|
||||||
|
Execution flow:
|
||||||
|
|
||||||
|
```
|
||||||
|
PE_CPU → SubmissionQueue → PE_SCHEDULER → engine queue → engine execution → completion event → PE_SCHEDULER → CompletionQueue
|
||||||
|
```
|
||||||
|
|
||||||
|
#### D3.2 Composite command (tiled pipeline)
|
||||||
|
|
||||||
|
Composite commands implement tiled pipelined execution across engines.
|
||||||
|
|
||||||
|
Each tile executes the following pipeline:
|
||||||
|
|
||||||
|
```
|
||||||
|
Input DMA (READ)
|
||||||
|
→ Compute (GEMM or MATH)
|
||||||
|
→ Output DMA (WRITE)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Tiling rule**
|
||||||
|
|
||||||
|
If the DMA payload exceeds the hardware tile size, PE_SCHEDULER splits the transfer into tiles.
|
||||||
|
Each tile is assigned a monotonically increasing `tile_id`.
|
||||||
|
|
||||||
|
**Tile dependency rules**
|
||||||
|
|
||||||
|
For tile `t`:
|
||||||
|
|
||||||
|
- Compute must wait for input DMA: `DMA_READ(t) → COMPUTE(t)`
|
||||||
|
- Output DMA must wait for compute: `COMPUTE(t) → DMA_WRITE(t)`
|
||||||
|
- All dependencies are enforced by PE_SCHEDULER.
|
||||||
|
|
||||||
|
**Overlap policy (Phase 0 default)**
|
||||||
|
|
||||||
|
Operations for different tiles may overlap when engine resources permit.
|
||||||
|
|
||||||
|
Allowed overlaps:
|
||||||
|
|
||||||
|
```
|
||||||
|
DMA_READ(t+1) ∥ COMPUTE(t)
|
||||||
|
DMA_WRITE(t−1) ∥ COMPUTE(t)
|
||||||
|
DMA_READ(t+1) ∥ DMA_WRITE(t)
|
||||||
|
```
|
||||||
|
|
||||||
|
Disallowed overlaps:
|
||||||
|
|
||||||
|
```
|
||||||
|
GEMM(t) ∥ GEMM(t′)
|
||||||
|
MATH(t) ∥ MATH(t′)
|
||||||
|
GEMM(t) ∥ MATH(t′)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D4. Engine execution model (Phase 0 default)
|
||||||
|
|
||||||
|
Each engine behaves as a deterministic service resource.
|
||||||
|
|
||||||
|
**DMA engine**
|
||||||
|
|
||||||
|
PE_DMA contains two independent channels.
|
||||||
|
|
||||||
|
```
|
||||||
|
DMA_READ capacity = 1
|
||||||
|
DMA_WRITE capacity = 1
|
||||||
|
```
|
||||||
|
|
||||||
|
Rules:
|
||||||
|
|
||||||
|
- DMA_READ and DMA_WRITE may execute concurrently.
|
||||||
|
- Multiple READs cannot overlap.
|
||||||
|
- Multiple WRITEs cannot overlap.
|
||||||
|
|
||||||
|
Example allowed:
|
||||||
|
|
||||||
|
```
|
||||||
|
DMA_READ(t+1) ∥ DMA_WRITE(t)
|
||||||
|
```
|
||||||
|
|
||||||
|
Example not allowed:
|
||||||
|
|
||||||
|
```
|
||||||
|
DMA_READ(t) ∥ DMA_READ(t+1)
|
||||||
|
DMA_WRITE(t) ∥ DMA_WRITE(t+1)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Compute engine**
|
||||||
|
|
||||||
|
Compute operations share a single compute resource.
|
||||||
|
|
||||||
|
```
|
||||||
|
PE_ACCEL capacity = 1
|
||||||
|
```
|
||||||
|
|
||||||
|
Both GEMM and MATH require this shared compute slot.
|
||||||
|
|
||||||
|
Consequences:
|
||||||
|
|
||||||
|
- GEMM ∥ GEMM not allowed
|
||||||
|
- MATH ∥ MATH not allowed
|
||||||
|
- GEMM ∥ MATH not allowed
|
||||||
|
|
||||||
|
Only one compute operation can run in a PE at a time.
|
||||||
|
|
||||||
|
**Compute opcode restriction**
|
||||||
|
|
||||||
|
Composite commands contain one compute opcode only.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
|
||||||
|
```
|
||||||
|
COMPOSITE_GEMM
|
||||||
|
COMPOSITE_MATH
|
||||||
|
```
|
||||||
|
|
||||||
|
Mixed compute pipelines such as `GEMM → MATH` are not supported in Phase 0.
|
||||||
|
|
||||||
|
**Engine completion signaling**
|
||||||
|
|
||||||
|
Every engine emits a completion event when a sub-command finishes.
|
||||||
|
Completion events are delivered to PE_SCHEDULER.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D5. Dataflow model
|
||||||
|
|
||||||
|
Compute operations use a TCM-centric dataflow model.
|
||||||
|
|
||||||
|
**Input path (HBM)**
|
||||||
|
|
||||||
|
```
|
||||||
|
HBM → XBAR → PE_DMA (DMA_READ) → PE_TCM
|
||||||
|
```
|
||||||
|
|
||||||
|
**Input path (shared SRAM)**
|
||||||
|
|
||||||
|
```
|
||||||
|
Shared SRAM → NOC → PE_DMA (DMA_READ) → PE_TCM
|
||||||
|
```
|
||||||
|
|
||||||
|
**Compute stage**
|
||||||
|
|
||||||
|
Compute engines read input tensors from PE_TCM.
|
||||||
|
|
||||||
|
```
|
||||||
|
PE_TCM → GEMM / MATH
|
||||||
|
```
|
||||||
|
|
||||||
|
Weights for GEMM may optionally stream directly from HBM (via XBAR).
|
||||||
|
|
||||||
|
**Output path (HBM)**
|
||||||
|
|
||||||
|
Compute results are written to PE_TCM, then DMA writes to HBM.
|
||||||
|
|
||||||
|
```
|
||||||
|
PE_TCM → PE_DMA (DMA_WRITE) → XBAR → HBM
|
||||||
|
```
|
||||||
|
|
||||||
|
**Output path (shared SRAM)**
|
||||||
|
|
||||||
|
```
|
||||||
|
PE_TCM → PE_DMA (DMA_WRITE) → NOC → Shared SRAM
|
||||||
|
```
|
||||||
|
|
||||||
|
#### D5.1 PE_TCM partitioning and ownership boundary
|
||||||
|
|
||||||
|
The PE_TCM address space is partitioned into two logical regions.
|
||||||
|
|
||||||
|
**SchedulerReservedTCM**
|
||||||
|
|
||||||
|
- A staging region owned exclusively by PE_SCHEDULER.
|
||||||
|
- This region is used for composite command tile buffers.
|
||||||
|
- PE_SCHEDULER:
|
||||||
|
- partitions this region into tile buffers
|
||||||
|
- assigns buffers for DMA_READ, COMPUTE, and DMA_WRITE stages
|
||||||
|
- guarantees input/output buffer separation
|
||||||
|
- manages tile buffer lifetime
|
||||||
|
|
||||||
|
**AllocatableTCM**
|
||||||
|
|
||||||
|
- General-purpose region managed by PEMemAllocator.
|
||||||
|
- Used by host or DP-visible allocations.
|
||||||
|
|
||||||
|
**Visibility rule (hard isolation)**
|
||||||
|
|
||||||
|
- PEMemAllocator must not see or allocate memory inside SchedulerReservedTCM.
|
||||||
|
- SchedulerReservedTCM is excluded from allocator-managed ranges by construction.
|
||||||
|
- This prevents DP or host allocations from interfering with scheduler staging buffers.
|
||||||
|
|
||||||
|
**Tile buffer rules**
|
||||||
|
|
||||||
|
Within SchedulerReservedTCM:
|
||||||
|
|
||||||
|
- input buffers and output buffers must not overlap
|
||||||
|
- PE_SCHEDULER assigns tile buffers for DMA and compute stages
|
||||||
|
- tile buffers remain valid until the corresponding DMA_WRITE completes
|
||||||
|
- Buffer reuse is allowed only after the tile lifetime finishes.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D6. Observability and trace contract
|
||||||
|
|
||||||
|
The simulator must emit deterministic trace events.
|
||||||
|
|
||||||
|
Required events include:
|
||||||
|
|
||||||
|
- `command_submitted`
|
||||||
|
- `sub_command_dispatched`
|
||||||
|
- `engine_start`
|
||||||
|
- `engine_complete`
|
||||||
|
- `tile_ready`
|
||||||
|
- `command_complete`
|
||||||
|
|
||||||
|
Trace ordering must be deterministic for identical inputs.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D7. Topology representation
|
||||||
|
|
||||||
|
PE internal components are declared in `cube.pe_template`.
|
||||||
|
|
||||||
|
The template is instantiated once per PE.
|
||||||
|
|
||||||
|
PE instances are derived from `cube.pe_layout`.
|
||||||
|
|
||||||
|
External connectivity such as:
|
||||||
|
|
||||||
|
- PE_DMA → XBAR (HBM data path)
|
||||||
|
- PE_DMA → NOC (non-HBM data path: shared SRAM, inter-cube UCIe)
|
||||||
|
- NOC → PE_CPU (command path from M_CPU)
|
||||||
|
|
||||||
|
is modeled at the CUBE level (see ADR-0003 D3).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Links
|
||||||
|
|
||||||
|
- SPEC R3, R4
|
||||||
|
- ADR-0003 D4 (PE-level system hierarchy)
|
||||||
|
- ADR-0005 View C (PE-level diagram)
|
||||||
|
- ADR-0008 D2 (PA-level allocation at PE scope; PEMemAllocator is the per-PE allocator instance)
|
||||||
|
- ADR-0009 D3 (kernel execution fan-out and PE_CPU dispatch)
|
||||||
@@ -0,0 +1,178 @@
|
|||||||
|
# ADR-0015: Component Port/Wire Model and Fabric Routing
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Proposed
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
ADR-0007 D2 assigns path-walking and low-level request decomposition to the simulation engine.
|
||||||
|
In practice, the engine iterates the topology path and calls `run()` on each component
|
||||||
|
sequentially — conflating routing policy with component behavior and preventing realistic
|
||||||
|
hardware modeling (queues, contention, fan-out).
|
||||||
|
|
||||||
|
ADR-0007 D3 already states that components own fan-out and aggregation, but the current
|
||||||
|
implementation does not enforce this for fabric traversal.
|
||||||
|
|
||||||
|
This ADR defines:
|
||||||
|
|
||||||
|
- how components communicate via typed port queues,
|
||||||
|
- how propagation delay is modeled (wire processes),
|
||||||
|
- the fabric path for Memory R/W through M_CPU.DMA,
|
||||||
|
- the reduced role of the simulation engine,
|
||||||
|
- M_CPU.DMA as an internal subcomponent of M_CPU.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
### D1. Component port model
|
||||||
|
|
||||||
|
Each component has typed input/output ports modeled as SimPy Stores:
|
||||||
|
|
||||||
|
```
|
||||||
|
in_ports: dict[str, simpy.Store] # keyed by source node_id
|
||||||
|
out_ports: dict[str, simpy.Store] # keyed by destination node_id
|
||||||
|
```
|
||||||
|
|
||||||
|
Ports are created at engine initialization based on graph edges.
|
||||||
|
Each directed edge (src → dst) results in:
|
||||||
|
|
||||||
|
- `src.out_ports[dst]` — the sending end
|
||||||
|
- `dst.in_ports[src]` — the receiving end
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D2. Wire process (propagation delay)
|
||||||
|
|
||||||
|
For each directed edge (src, dst) in the topology graph, a SimPy wire process
|
||||||
|
models propagation delay:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def wire_process(env, out_port, in_port, delay_ns):
|
||||||
|
while True:
|
||||||
|
cmd = yield out_port.get()
|
||||||
|
yield env.timeout(delay_ns)
|
||||||
|
yield in_port.put(cmd)
|
||||||
|
```
|
||||||
|
|
||||||
|
Wire processes are started at engine initialization.
|
||||||
|
Bandwidth (BW) constraints are enforced by the sending component's out_port capacity or token model,
|
||||||
|
not by the wire process itself.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D3. Engine role (reduced)
|
||||||
|
|
||||||
|
The simulation engine MUST:
|
||||||
|
|
||||||
|
- wire components at initialization (create port Stores, start wire processes),
|
||||||
|
- identify the entry component for each request type (PCIE_EP),
|
||||||
|
- put the request into the entry component's in_port,
|
||||||
|
- wait for a completion event.
|
||||||
|
|
||||||
|
The simulation engine MUST NOT:
|
||||||
|
|
||||||
|
- walk the topology path during request execution,
|
||||||
|
- call component `run()` methods directly,
|
||||||
|
- track per-hop latency or decompose fan-out.
|
||||||
|
|
||||||
|
This supersedes ADR-0007 D2's "decompose operations into low-level requests" clause.
|
||||||
|
ADR-0007 D2 must be amended accordingly.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D4. Unified fabric path for Memory R/W and Kernel Launch
|
||||||
|
|
||||||
|
Both Memory R/W and Kernel Launch use the same fabric path to reach the target cube's M_CPU.
|
||||||
|
The difference is what M_CPU does upon receiving the request.
|
||||||
|
|
||||||
|
**Forward path (IO_CPU → target M_CPU):**
|
||||||
|
|
||||||
|
```
|
||||||
|
IO_CPU
|
||||||
|
→ [transit cubes: ucie_out → wire → ucie_in → noc → ucie_out] (zero or more)
|
||||||
|
→ target cube: ucie_in → noc → M_CPU
|
||||||
|
```
|
||||||
|
|
||||||
|
**At M_CPU (diverges by operation type):**
|
||||||
|
|
||||||
|
```
|
||||||
|
Memory R/W: M_CPU → M_CPU.DMA → noc → hbm_ctrl
|
||||||
|
Kernel Launch: M_CPU → PE[0..n] (parallel fan-out)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Completion path (reverse, same fabric):**
|
||||||
|
|
||||||
|
```
|
||||||
|
Memory R/W: hbm_ctrl → noc → M_CPU.DMA → M_CPU
|
||||||
|
Kernel Launch: PE[0..n] all complete → M_CPU (aggregation)
|
||||||
|
|
||||||
|
M_CPU → [transit cubes: ucie → noc → ucie] → IO_CPU → runtime_api
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D5. M_CPU.DMA is an internal subcomponent of M_CPU
|
||||||
|
|
||||||
|
M_CPU.DMA is NOT a separate topology node.
|
||||||
|
It is an internal subcomponent owned by the M_CPU component implementation.
|
||||||
|
|
||||||
|
M_CPU.DMA:
|
||||||
|
|
||||||
|
- owns the DMA READ and DMA WRITE queues (capacity=1 each, per ADR-0014 D4),
|
||||||
|
- issues memory requests over the NOC to hbm_ctrl,
|
||||||
|
- receives completion from hbm_ctrl via the NOC,
|
||||||
|
- reports completion to M_CPU,
|
||||||
|
- is created and managed inside M_CPU's `__init__` and `run()`.
|
||||||
|
|
||||||
|
M_CPU.DMA does not appear as a node in the compiled topology graph.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D6. Transit cube forwarding
|
||||||
|
|
||||||
|
A cube that is not the target of a memory or kernel request acts as a transit node.
|
||||||
|
Transit cubes forward requests without consuming them:
|
||||||
|
|
||||||
|
```
|
||||||
|
ucie_in (from upstream) → noc → ucie_out (to downstream)
|
||||||
|
```
|
||||||
|
|
||||||
|
The transit forwarding decision is implemented entirely within the ucie_in component.
|
||||||
|
The noc and ucie_out components in a transit cube forward the packet without modification.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### D7. _formula_latency is preserved as a lower-bound cross-check
|
||||||
|
|
||||||
|
The path-based formula latency function (`_formula_latency`) is preserved in the engine
|
||||||
|
as a lower bound for correctness verification.
|
||||||
|
|
||||||
|
Invariant:
|
||||||
|
|
||||||
|
- Phase 0: `_formula_latency == component model total_ns`
|
||||||
|
- Phase 1+: `_formula_latency <= component model total_ns` (contention adds queueing)
|
||||||
|
|
||||||
|
This function is independent of the port/wire model and requires only the topology graph.
|
||||||
|
It is used for shard comparison in `_route_kernel` and as a regression guard.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
- Components model realistic hardware behavior (queues, contention, fan-out).
|
||||||
|
- Propagation delay is modeled accurately per edge.
|
||||||
|
- Engine is decoupled from routing policy.
|
||||||
|
- Component implementations remain swappable via DI (ADR-0007 D3).
|
||||||
|
- ADR-0007 D2 must be amended to remove path-walking from engine responsibilities.
|
||||||
|
- ADR-0009 D3 should be updated to reference the unified fabric path (D4 above).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Links
|
||||||
|
|
||||||
|
- ADR-0007 D2 (to be amended: engine path-walking clause)
|
||||||
|
- ADR-0009 D3 (kernel execution fan-out; fabric path to be referenced)
|
||||||
|
- ADR-0014 D4 (DMA engine capacity=1)
|
||||||
|
- ADR-0012 D1 (host ↔ IO_CPU message schema; M_CPU.DMA is component-internal)
|
||||||
@@ -0,0 +1,363 @@
|
|||||||
|
# 실무 DI 패턴: kernbench 구현으로 배우는 Dependency Injection
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 슬라이드 1 — 오늘 이야기할 것
|
||||||
|
|
||||||
|
**질문:** 코드를 어떻게 설계해야 테스트하기 쉽고, 갈아끼우기 쉬울까?
|
||||||
|
|
||||||
|
**답:** Dependency Injection (DI)
|
||||||
|
|
||||||
|
오늘은 이론이 아니라 **실제로 돌아가는 시뮬레이터 코드**를 보면서 배웁니다.
|
||||||
|
|
||||||
|
```
|
||||||
|
kernbench
|
||||||
|
└── AI 가속기 하드웨어를 Python으로 시뮬레이션하는 프레임워크
|
||||||
|
- 수십 개의 하드웨어 컴포넌트 (NOC, HBM, PE, CPU...)
|
||||||
|
- 각 컴포넌트는 런타임에 교체 가능
|
||||||
|
- 테스트에서 Mock 컴포넌트로 즉시 대체 가능
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 슬라이드 2 — DI가 없으면 어떤 일이 생기나
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ❌ DI 없는 코드
|
||||||
|
class IoCpuComponent:
|
||||||
|
def run(self, env, nbytes):
|
||||||
|
router = PathRouter() # 직접 생성 — 교체 불가
|
||||||
|
hbm = HbmCtrlComponent() # 직접 생성 — 교체 불가
|
||||||
|
yield env.timeout(10.0)
|
||||||
|
```
|
||||||
|
|
||||||
|
**문제:**
|
||||||
|
- 테스트할 때 실제 `PathRouter`와 `HbmCtrl`이 항상 따라온다
|
||||||
|
- 컴포넌트를 Mock으로 바꾸려면 **소스 코드를 수정**해야 한다
|
||||||
|
- 다른 topology(다른 라우팅 전략)를 쓰고 싶으면 **또 수정**
|
||||||
|
|
||||||
|
> 클래스가 자기 의존성을 스스로 만들면, 그 클래스는 의존성과 결합된다
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 슬라이드 3 — DI의 핵심 원칙
|
||||||
|
|
||||||
|
**의존성은 밖에서 만들어서 안으로 넣어준다**
|
||||||
|
|
||||||
|
```
|
||||||
|
┌────────────────────────────┐
|
||||||
|
│ 조립자 (Assembler) │ ← 누가 무엇을 쓸지 결정
|
||||||
|
│ GraphEngine.__init__ │
|
||||||
|
└────────────┬───────────────┘
|
||||||
|
│ ctx 주입
|
||||||
|
▼
|
||||||
|
┌────────────────────────────┐
|
||||||
|
│ 컴포넌트 (Component) │ ← 어떻게 동작하는지만 알면 됨
|
||||||
|
│ IoCpuComponent │
|
||||||
|
│ self.ctx.router.find_path(...) ← 그냥 사용
|
||||||
|
└────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
**세 가지 역할 분리:**
|
||||||
|
1. **Interface** — 무엇을 할 수 있는가 (`ComponentBase`)
|
||||||
|
2. **Implementation** — 어떻게 하는가 (`IoCpuComponent`, `HbmCtrlComponent`, ...)
|
||||||
|
3. **Assembler** — 무엇을 연결할 것인가 (`GraphEngine`)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 슬라이드 4 — 패턴 1: Constructor Injection
|
||||||
|
|
||||||
|
> 생성자로 의존성을 받는다
|
||||||
|
|
||||||
|
```python
|
||||||
|
# kernbench/components/base.py
|
||||||
|
|
||||||
|
class ComponentBase(ABC):
|
||||||
|
def __init__(self, node: Node, ctx: ComponentContext | None = None):
|
||||||
|
self.node = node
|
||||||
|
self.ctx = ctx # 외부에서 주입받은 의존성
|
||||||
|
self.in_ports: dict[str, simpy.Store] = {}
|
||||||
|
self.out_ports: dict[str, simpy.Store] = {}
|
||||||
|
```
|
||||||
|
|
||||||
|
```python
|
||||||
|
# 사용 측 — ctx를 직접 만들지 않는다
|
||||||
|
class IoCpuComponent(ComponentBase):
|
||||||
|
def _dispatch(self, env, txn):
|
||||||
|
path = self.ctx.router.find_node_path(...) # ctx는 이미 들어와 있음
|
||||||
|
yield self.out_ports[next_hop].put(...)
|
||||||
|
```
|
||||||
|
|
||||||
|
**언제 쓰나:**
|
||||||
|
- 컴포넌트가 살아있는 동안 의존성이 바뀌지 않을 때
|
||||||
|
- 의존성 없이는 컴포넌트가 동작하지 않을 때 (필수 의존성)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 슬라이드 5 — Context Object 패턴
|
||||||
|
|
||||||
|
> 의존성이 많아지면 묶어서 하나로
|
||||||
|
|
||||||
|
```python
|
||||||
|
# kernbench/components/context.py
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class ComponentContext:
|
||||||
|
router: PathRouter # 라우팅 정책
|
||||||
|
resolver: AddressResolver # 주소 해석
|
||||||
|
positions: dict[str, ...] # 물리적 위치 정보
|
||||||
|
ns_per_mm: float # 전파 지연 상수
|
||||||
|
edge_map: dict[...] # 엣지 정보
|
||||||
|
spec: dict # 토폴로지 스펙
|
||||||
|
```
|
||||||
|
|
||||||
|
**왜 Context로 묶나?**
|
||||||
|
- 생성자 인자가 6개면 → 의존성을 추가할 때마다 시그니처 변경
|
||||||
|
- Context 하나면 → 새 필드 추가해도 기존 컴포넌트 무영향
|
||||||
|
- 컴포넌트는 **필요한 것만 꺼내 쓴다**
|
||||||
|
|
||||||
|
```python
|
||||||
|
class TwoDMeshNocComponent(ComponentBase):
|
||||||
|
def _route(self, env, txn):
|
||||||
|
src_pos = self.ctx.positions.get(prev_hop) # 위치만 사용
|
||||||
|
ns_per_mm = self.ctx.ns_per_mm # 상수만 사용
|
||||||
|
# router, resolver 등은 건드리지 않음
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 슬라이드 6 — 패턴 2: Registry + Factory
|
||||||
|
|
||||||
|
> 문자열 키 → 클래스 매핑으로 런타임 교체
|
||||||
|
|
||||||
|
```python
|
||||||
|
# kernbench/components/base.py
|
||||||
|
|
||||||
|
class ComponentRegistry:
|
||||||
|
_registry: dict[str, type[ComponentBase]] = {}
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def register(cls, impl: str, component_cls: type[ComponentBase]):
|
||||||
|
cls._registry[impl] = component_cls
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def create(cls, node, overrides=None, ctx=None) -> ComponentBase:
|
||||||
|
if overrides and node.impl in overrides:
|
||||||
|
return overrides[node.impl](node, ctx) # 1순위: 호출자 override
|
||||||
|
if node.impl in cls._registry:
|
||||||
|
return cls._registry[node.impl](node, ctx) # 2순위: 등록된 구현
|
||||||
|
return DefaultComponent(node, ctx) # 3순위: 기본값 fallback
|
||||||
|
```
|
||||||
|
|
||||||
|
**Resolution 우선순위:**
|
||||||
|
```
|
||||||
|
overrides[impl] ← 테스트/실험용 주입
|
||||||
|
↓ (없으면)
|
||||||
|
_registry[impl] ← 프로덕션 구현
|
||||||
|
↓ (없으면)
|
||||||
|
DefaultComponent ← 안전한 fallback
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 슬라이드 7 — Registry 등록 방식
|
||||||
|
|
||||||
|
```python
|
||||||
|
# kernbench/components/impls/__init__.py
|
||||||
|
|
||||||
|
from kernbench.components.base import ComponentRegistry
|
||||||
|
from kernbench.components.impls.noc import TwoDMeshNocComponent
|
||||||
|
from kernbench.components.impls.io_cpu import IoCpuComponent
|
||||||
|
# ...
|
||||||
|
|
||||||
|
ComponentRegistry.register("noc_2d_mesh_v1", TwoDMeshNocComponent)
|
||||||
|
ComponentRegistry.register("io_cpu_v1", IoCpuComponent)
|
||||||
|
ComponentRegistry.register("hbm_ctrl_v1", HbmCtrlComponent)
|
||||||
|
# ...
|
||||||
|
```
|
||||||
|
|
||||||
|
**topology.yaml (설정 파일)**
|
||||||
|
```yaml
|
||||||
|
nodes:
|
||||||
|
- id: sip0.cube0.noc
|
||||||
|
impl: noc_2d_mesh_v1 # ← 이 문자열이 Registry 키
|
||||||
|
```
|
||||||
|
|
||||||
|
**흐름:**
|
||||||
|
```
|
||||||
|
YAML → impl 문자열 → Registry.create() → 실제 컴포넌트 인스턴스
|
||||||
|
```
|
||||||
|
|
||||||
|
impl 문자열만 바꾸면 동작이 바뀐다. 코드 수정 없음.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 슬라이드 8 — 패턴 3: Override Injection (테스트용)
|
||||||
|
|
||||||
|
> 호출자가 특정 impl만 갈아끼운다
|
||||||
|
|
||||||
|
```python
|
||||||
|
# tests/test_component_registry.py
|
||||||
|
|
||||||
|
class SpyXbar(ComponentBase):
|
||||||
|
calls = 0
|
||||||
|
|
||||||
|
def run(self, env, nbytes):
|
||||||
|
SpyXbar.calls += 1
|
||||||
|
yield env.timeout(0)
|
||||||
|
|
||||||
|
|
||||||
|
# 테스트에서 xbar_v1만 SpyXbar로 교체
|
||||||
|
engine = GraphEngine(
|
||||||
|
graph,
|
||||||
|
component_overrides={"xbar_v1": SpyXbar} # ← 이것만 추가
|
||||||
|
)
|
||||||
|
|
||||||
|
result = engine.run(msg)
|
||||||
|
assert SpyXbar.calls > 0 # Xbar가 실제로 호출됐는지 검증
|
||||||
|
```
|
||||||
|
|
||||||
|
**핵심:** 테스트 코드가 프로덕션 코드를 **수정하지 않는다**
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 슬라이드 9 — 조립자: GraphEngine
|
||||||
|
|
||||||
|
> 컴포넌트를 생성하고 연결하는 유일한 곳
|
||||||
|
|
||||||
|
```python
|
||||||
|
# kernbench/sim_engine/engine.py
|
||||||
|
|
||||||
|
class GraphEngine:
|
||||||
|
def __init__(self, graph, component_overrides=None):
|
||||||
|
|
||||||
|
# 1. 공유 의존성 생성
|
||||||
|
ctx = ComponentContext(
|
||||||
|
router=PathRouter(graph),
|
||||||
|
resolver=AddressResolver(graph),
|
||||||
|
positions={nid: n.pos_mm for nid, n in graph.nodes.items()},
|
||||||
|
ns_per_mm=...,
|
||||||
|
)
|
||||||
|
|
||||||
|
# 2. 컴포넌트 생성 (DI: ctx 주입)
|
||||||
|
self._components = {
|
||||||
|
node_id: ComponentRegistry.create(node, component_overrides, ctx)
|
||||||
|
for node_id, node in graph.nodes.items()
|
||||||
|
}
|
||||||
|
|
||||||
|
# 3. 포트 연결 (배선)
|
||||||
|
for e in graph.edges:
|
||||||
|
store = simpy.Store(self._env)
|
||||||
|
self._components[e.src].out_ports[e.dst] = store
|
||||||
|
self._components[e.dst].in_ports[e.src] = store
|
||||||
|
```
|
||||||
|
|
||||||
|
**생성 → 주입 → 연결** — 이 세 단계가 한 곳에서만 일어난다
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 슬라이드 10 — 전체 구조 한눈에 보기
|
||||||
|
|
||||||
|
```
|
||||||
|
topology.yaml
|
||||||
|
│ impl: "noc_2d_mesh_v1"
|
||||||
|
▼
|
||||||
|
GraphEngine.__init__() ← 조립자
|
||||||
|
│
|
||||||
|
├── ComponentContext 생성 ← 공유 의존성 묶음
|
||||||
|
│ ├── PathRouter
|
||||||
|
│ ├── AddressResolver
|
||||||
|
│ └── positions, ns_per_mm, ...
|
||||||
|
│
|
||||||
|
├── ComponentRegistry.create(node, overrides, ctx)
|
||||||
|
│ ├── overrides["noc_2d_mesh_v1"]? → SpyNoc (테스트)
|
||||||
|
│ ├── registry["noc_2d_mesh_v1"]? → TwoDMeshNocComponent (프로덕션)
|
||||||
|
│ └── fallback → DefaultComponent
|
||||||
|
│
|
||||||
|
└── 포트 배선: out_ports / in_ports 연결
|
||||||
|
|
||||||
|
Component (TwoDMeshNocComponent)
|
||||||
|
└── self.ctx.positions, self.ctx.ns_per_mm 사용
|
||||||
|
(라우터, 리졸버는 건드리지 않음 — 필요한 것만)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 슬라이드 11 — 무엇을 얻었나
|
||||||
|
|
||||||
|
| 상황 | DI 없이 | DI 적용 시 |
|
||||||
|
|------|---------|---------|
|
||||||
|
| NOC 알고리즘 교체 | 소스 코드 수정 | YAML에서 impl 문자열 변경 |
|
||||||
|
| Xbar 동작 검증 | 실제 HW 전부 구동 | `overrides={"xbar_v1": SpyXbar}` |
|
||||||
|
| 새 컴포넌트 추가 | 기존 코드 수정 | `register("new_v1", NewComp)` |
|
||||||
|
| 컨텍스트 필드 추가 | 모든 생성자 수정 | `ComponentContext`에 필드 추가 |
|
||||||
|
| 테스트 격리 | 불가능 | 필요한 것만 override |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 슬라이드 12 — 실무 적용 체크리스트
|
||||||
|
|
||||||
|
**설계할 때 물어볼 것:**
|
||||||
|
|
||||||
|
1. **이 클래스가 직접 `new`(생성)하는 것은 무엇인가?**
|
||||||
|
→ 생성하는 것 = 교체할 수 없는 것. 생성자로 받을 수 없는지 검토.
|
||||||
|
|
||||||
|
2. **의존성이 3개 이상이면?**
|
||||||
|
→ Context Object로 묶어라.
|
||||||
|
|
||||||
|
3. **테스트에서 이 클래스를 단독으로 실행할 수 있는가?**
|
||||||
|
→ 없다면 DI가 필요하다는 신호.
|
||||||
|
|
||||||
|
4. **설정(YAML/config)으로 동작을 바꾸고 싶은가?**
|
||||||
|
→ Registry + 문자열 키 패턴.
|
||||||
|
|
||||||
|
5. **누가 조립하는가?**
|
||||||
|
→ 조립자는 하나여야 한다. 컴포넌트 안에 조립 로직이 있으면 안 된다.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 슬라이드 13 — 안티패턴: 이것은 하지 말자
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ❌ 서비스 로케이터 (컴포넌트 안에서 registry 호출)
|
||||||
|
class BadComponent(ComponentBase):
|
||||||
|
def run(self, env, nbytes):
|
||||||
|
router = ComponentRegistry.get("router") # 컴포넌트가 직접 찾는다
|
||||||
|
...
|
||||||
|
|
||||||
|
# ❌ 전역 싱글톤 직접 참조
|
||||||
|
class BadComponent(ComponentBase):
|
||||||
|
def run(self, env, nbytes):
|
||||||
|
router = GlobalRouter.instance() # 교체 불가
|
||||||
|
...
|
||||||
|
|
||||||
|
# ❌ 생성자 안에서 의존성 생성
|
||||||
|
class BadComponent(ComponentBase):
|
||||||
|
def __init__(self, node):
|
||||||
|
self.router = PathRouter(node.graph) # 테스트에서 격리 불가
|
||||||
|
```
|
||||||
|
|
||||||
|
**공통 문제:** 컴포넌트가 자기 의존성을 스스로 해결한다 → 결합도 증가
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 슬라이드 14 — 요약
|
||||||
|
|
||||||
|
> **DI = 의존성의 생성과 사용을 분리하는 것**
|
||||||
|
|
||||||
|
```
|
||||||
|
생성 → Registry / Assembler (GraphEngine)
|
||||||
|
사용 → Component (IoCpuComponent, TwoDMeshNocComponent, ...)
|
||||||
|
```
|
||||||
|
|
||||||
|
**kernbench에서 배운 패턴 3가지:**
|
||||||
|
|
||||||
|
1. **Constructor Injection** — 필수 의존성은 생성자로
|
||||||
|
2. **Context Object** — 의존성 묶음을 하나의 dataclass로
|
||||||
|
3. **Registry + Override** — 문자열 키로 구현체 선택, 테스트에서 교체
|
||||||
|
|
||||||
|
**결과:** 141개 테스트, YAML 한 줄로 컴포넌트 교체, 프로덕션 코드 수정 없이 Mock 주입
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
*참고 코드: kernbench/src/kernbench/components/*
|
||||||
@@ -0,0 +1,26 @@
|
|||||||
|
# Generated Diagrams
|
||||||
|
|
||||||
|
This directory contains diagrams generated from topology compilation.
|
||||||
|
|
||||||
|
## What these files are
|
||||||
|
- Derived artifacts generated from:
|
||||||
|
- compiled topology graph
|
||||||
|
- distance (accumulated latency) metadata
|
||||||
|
- view/layout rules (ADR-0005)
|
||||||
|
|
||||||
|
These files are meant for quick visual inspection and review.
|
||||||
|
|
||||||
|
## Default outputs
|
||||||
|
- SIP view: `sip_view.mmd` (and/or `sip_view.dot`)
|
||||||
|
- CUBE view: `cube_view.mmd` (and/or `cube_view.dot`)
|
||||||
|
- PE view: `pe_view.mmd` (and/or `pe_view.dot`)
|
||||||
|
|
||||||
|
## How to preview
|
||||||
|
- In VS Code:
|
||||||
|
- open a `.mmd` file (or a `.md` file containing Mermaid blocks) and use Markdown Preview
|
||||||
|
- for `.dot`, use a Graphviz preview extension or `dot -Tpng`
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
- Diagrams are representative and distance-aware by default.
|
||||||
|
- Instance indices are not required unless debugging asymmetry.
|
||||||
|
- Outputs should be deterministic for the same topology and rules.
|
||||||
@@ -0,0 +1,156 @@
|
|||||||
|
<svg xmlns="http://www.w3.org/2000/svg" width="556" height="472" viewBox="0 0 556 472">
|
||||||
|
<title>cube</title>
|
||||||
|
<rect width="556" height="472" fill="#f8fafc"/>
|
||||||
|
<text x="278" y="18" text-anchor="middle" font-family="monospace" font-size="14" font-weight="bold" fill="#1e293b">CUBE VIEW</text>
|
||||||
|
<rect x="40.0" y="40.0" width="476.0" height="392.0" rx="6" fill="none" stroke="#475569" stroke-width="2" stroke-dasharray="8,4"/>
|
||||||
|
<rect x="152.0" y="166.0" width="252.0" height="140.0" rx="4" fill="#d1fae5" stroke="#10b981" stroke-width="1.5" stroke-dasharray="6,3" opacity="0.5"/>
|
||||||
|
<text x="278.0" y="278.0" text-anchor="middle" font-family="monospace" font-size="11" fill="#047857" opacity="0.7">HBM</text>
|
||||||
|
<polyline points="82.0,82.0 82.0,95.0 82.0,95.0 82.0,138.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="82.0" y="92.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
|
||||||
|
<polyline points="82.0,82.0 82.0,144.0 334.0,144.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
|
||||||
|
<polyline points="334.0,236.0 334.0,144.0 82.0,144.0 82.0,82.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
|
||||||
|
<polyline points="166.0,82.0 166.0,95.0 166.0,95.0 166.0,138.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="166.0" y="92.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
|
||||||
|
<polyline points="166.0,82.0 166.0,154.0 334.0,154.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
|
||||||
|
<polyline points="334.0,236.0 334.0,144.0 166.0,144.0 166.0,82.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
|
||||||
|
<polyline points="390.0,82.0 390.0,95.0 390.0,95.0 390.0,138.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="390.0" y="92.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
|
||||||
|
<polyline points="390.0,82.0 390.0,164.0 334.0,164.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
|
||||||
|
<polyline points="334.0,236.0 334.0,144.0 390.0,144.0 390.0,82.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
|
||||||
|
<polyline points="474.0,82.0 474.0,95.0 474.0,95.0 474.0,138.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="474.0" y="92.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
|
||||||
|
<polyline points="474.0,82.0 474.0,174.0 334.0,174.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
|
||||||
|
<polyline points="334.0,236.0 334.0,144.0 474.0,144.0 474.0,82.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
|
||||||
|
<polyline points="82.0,390.0 82.0,347.0 82.0,347.0 82.0,334.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="82.0" y="344.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
|
||||||
|
<polyline points="82.0,390.0 82.0,338.0 334.0,338.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
|
||||||
|
<polyline points="334.0,236.0 334.0,298.0 82.0,298.0 82.0,390.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
|
||||||
|
<polyline points="166.0,390.0 166.0,347.0 166.0,347.0 166.0,334.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="166.0" y="344.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
|
||||||
|
<polyline points="166.0,390.0 166.0,348.0 334.0,348.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
|
||||||
|
<polyline points="334.0,236.0 334.0,298.0 166.0,298.0 166.0,390.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
|
||||||
|
<polyline points="390.0,390.0 390.0,347.0 390.0,347.0 390.0,334.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="390.0" y="344.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
|
||||||
|
<polyline points="390.0,390.0 390.0,358.0 334.0,358.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
|
||||||
|
<polyline points="334.0,236.0 334.0,298.0 390.0,298.0 390.0,390.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
|
||||||
|
<polyline points="474.0,390.0 474.0,347.0 474.0,347.0 474.0,334.0" fill="none" stroke="#f97316" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="474.0" y="344.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">6.0mm 256GB/s</text>
|
||||||
|
<polyline points="474.0,390.0 474.0,368.0 334.0,368.0 334.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
|
||||||
|
<polyline points="334.0,236.0 334.0,298.0 474.0,298.0 474.0,390.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
|
||||||
|
<polyline points="82.0,138.0 222.0,138.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="152.0" y="183.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
|
||||||
|
<polyline points="166.0,138.0 222.0,138.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="194.0" y="183.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
|
||||||
|
<polyline points="390.0,138.0 222.0,138.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="306.0" y="183.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
|
||||||
|
<polyline points="474.0,138.0 222.0,138.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="348.0" y="183.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
|
||||||
|
<polyline points="82.0,334.0 222.0,334.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="152.0" y="281.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
|
||||||
|
<polyline points="166.0,334.0 222.0,334.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="194.0" y="281.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
|
||||||
|
<polyline points="390.0,334.0 222.0,334.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="306.0" y="281.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
|
||||||
|
<polyline points="474.0,334.0 222.0,334.0 222.0,236.0" fill="none" stroke="#10b981" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="348.0" y="281.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.5mm 256GB/s</text>
|
||||||
|
<line x1="82.0" y1="138.0" x2="166.0" y2="138.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="124.0" y="134.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
|
||||||
|
<line x1="166.0" y1="138.0" x2="82.0" y2="138.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="124.0" y="134.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
|
||||||
|
<line x1="166.0" y1="138.0" x2="390.0" y2="138.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="278.0" y="134.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">10.0mm 128GB/s</text>
|
||||||
|
<line x1="390.0" y1="138.0" x2="166.0" y2="138.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="278.0" y="134.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">10.0mm 128GB/s</text>
|
||||||
|
<line x1="390.0" y1="138.0" x2="474.0" y2="138.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="432.0" y="134.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
|
||||||
|
<line x1="474.0" y1="138.0" x2="390.0" y2="138.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="432.0" y="134.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
|
||||||
|
<line x1="82.0" y1="334.0" x2="166.0" y2="334.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="124.0" y="330.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
|
||||||
|
<line x1="166.0" y1="334.0" x2="82.0" y2="334.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="124.0" y="330.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
|
||||||
|
<line x1="166.0" y1="334.0" x2="390.0" y2="334.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="278.0" y="330.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">10.0mm 128GB/s</text>
|
||||||
|
<line x1="390.0" y1="334.0" x2="166.0" y2="334.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="278.0" y="330.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">10.0mm 128GB/s</text>
|
||||||
|
<line x1="390.0" y1="334.0" x2="474.0" y2="334.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="432.0" y="330.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
|
||||||
|
<line x1="474.0" y1="334.0" x2="390.0" y2="334.0" stroke="#94a3b8" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="432.0" y="330.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">2.0mm 128GB/s</text>
|
||||||
|
<polyline points="82.0,138.0 110.0,138.0 110.0,292.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="96.0" y="211.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
|
||||||
|
<polyline points="110.0,292.0 82.0,292.0 82.0,138.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="96.0" y="211.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
|
||||||
|
<polyline points="82.0,334.0 110.0,334.0 110.0,292.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="96.0" y="309.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
|
||||||
|
<polyline points="110.0,292.0 82.0,292.0 82.0,334.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="96.0" y="309.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
|
||||||
|
<polyline points="474.0,138.0 446.0,138.0 446.0,292.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="460.0" y="211.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
|
||||||
|
<polyline points="446.0,292.0 474.0,292.0 474.0,138.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="460.0" y="211.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
|
||||||
|
<polyline points="474.0,334.0 446.0,334.0 446.0,292.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="460.0" y="309.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
|
||||||
|
<polyline points="446.0,292.0 474.0,292.0 474.0,334.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="460.0" y="309.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.0mm 512GB/s</text>
|
||||||
|
<polyline points="334.0,236.0 334.0,131.4 278.0,131.4 278.0,56.8" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.6"/>
|
||||||
|
<polyline points="334.0,236.0 334.0,310.6 278.0,310.6 278.0,415.2" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.6"/>
|
||||||
|
<polyline points="334.0,236.0 334.0,221.0 488.0,221.0 488.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.6"/>
|
||||||
|
<polyline points="334.0,236.0 334.0,221.0 68.0,221.0 68.0,236.0" fill="none" stroke="#a78bfa" stroke-width="1" opacity="0.6"/>
|
||||||
|
<polyline points="446.0,194.0 446.0,200.0 334.0,200.0 334.0,236.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
|
||||||
|
<polyline points="334.0,236.0 334.0,200.0 446.0,200.0 446.0,194.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.6"/>
|
||||||
|
<polyline points="334.0,236.0 110.0,236.0 110.0,194.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.8"/>
|
||||||
|
<polyline points="110.0,194.0 334.0,194.0 334.0,236.0" fill="none" stroke="#f59e0b" stroke-width="1" opacity="0.8"/>
|
||||||
|
<rect x="250.0" y="40.0" width="56.0" height="33.6" rx="4" fill="#3b82f6" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="278.0" y="60.8" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">UCIe-N</text>
|
||||||
|
<rect x="250.0" y="398.4" width="56.0" height="33.6" rx="4" fill="#3b82f6" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="278.0" y="419.2" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">UCIe-S</text>
|
||||||
|
<rect x="460.0" y="219.2" width="56.0" height="33.6" rx="4" fill="#3b82f6" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="488.0" y="240.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">UCIe-E</text>
|
||||||
|
<rect x="40.0" y="219.2" width="56.0" height="33.6" rx="4" fill="#3b82f6" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="68.0" y="240.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">UCIe-W</text>
|
||||||
|
<rect x="306.0" y="219.2" width="56.0" height="33.6" rx="4" fill="#a78bfa" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="334.0" y="240.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">NOC</text>
|
||||||
|
<rect x="418.0" y="177.2" width="56.0" height="33.6" rx="4" fill="#f59e0b" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="446.0" y="198.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">M CPU</text>
|
||||||
|
<rect x="194.0" y="219.2" width="56.0" height="33.6" rx="4" fill="#10b981" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="222.0" y="240.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#ffffff">HBM CTRL</text>
|
||||||
|
<rect x="82.0" y="177.2" width="56.0" height="33.6" rx="4" fill="#f59e0b" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="110.0" y="198.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">SRAM</text>
|
||||||
|
<rect x="82.0" y="275.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="110.0" y="296.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">Bridge LEFT</text>
|
||||||
|
<rect x="418.0" y="275.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="446.0" y="296.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">Bridge RIGHT</text>
|
||||||
|
<rect x="56.8" y="68.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="82.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE0</text>
|
||||||
|
<rect x="54.0" y="121.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="82.0" y="142.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE0</text>
|
||||||
|
<rect x="140.8" y="68.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="166.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE1</text>
|
||||||
|
<rect x="138.0" y="121.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="166.0" y="142.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE1</text>
|
||||||
|
<rect x="364.8" y="68.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="390.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE2</text>
|
||||||
|
<rect x="362.0" y="121.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="390.0" y="142.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE2</text>
|
||||||
|
<rect x="448.8" y="68.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="474.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE3</text>
|
||||||
|
<rect x="446.0" y="121.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="474.0" y="142.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE3</text>
|
||||||
|
<rect x="56.8" y="376.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="82.0" y="394.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE4</text>
|
||||||
|
<rect x="54.0" y="317.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="82.0" y="338.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE4</text>
|
||||||
|
<rect x="140.8" y="376.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="166.0" y="394.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE5</text>
|
||||||
|
<rect x="138.0" y="317.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="166.0" y="338.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE5</text>
|
||||||
|
<rect x="364.8" y="376.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="390.0" y="394.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE6</text>
|
||||||
|
<rect x="362.0" y="317.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="390.0" y="338.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE6</text>
|
||||||
|
<rect x="448.8" y="376.0" width="50.4" height="28.0" rx="4" fill="#94a3b8" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="474.0" y="394.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">PE7</text>
|
||||||
|
<rect x="446.0" y="317.2" width="56.0" height="33.6" rx="4" fill="#f97316" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="474.0" y="338.0" text-anchor="middle" font-family="monospace" font-size="8" fill="#1e293b">XBAR PE7</text>
|
||||||
|
</svg>
|
||||||
|
After Width: | Height: | Size: 18 KiB |
@@ -0,0 +1,31 @@
|
|||||||
|
<svg xmlns="http://www.w3.org/2000/svg" width="500" height="360" viewBox="0 0 500 360">
|
||||||
|
<title>pe</title>
|
||||||
|
<rect width="500" height="360" fill="#f8fafc"/>
|
||||||
|
<text x="250" y="18" text-anchor="middle" font-family="monospace" font-size="14" font-weight="bold" fill="#1e293b">PE VIEW</text>
|
||||||
|
<line x1="92.5" y1="180.0" x2="180.0" y2="180.0" stroke="#94a3b8" stroke-width="1.5" opacity="0.8"/>
|
||||||
|
<text x="136.2" y="176.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">0.5mm</text>
|
||||||
|
<polyline points="180.0,180.0 180.0,92.5 285.0,92.5" fill="none" stroke="#94a3b8" stroke-width="1.5" opacity="0.8"/>
|
||||||
|
<text x="232.5" y="132.2" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">0.5mm</text>
|
||||||
|
<line x1="180.0" y1="180.0" x2="285.0" y2="180.0" stroke="#94a3b8" stroke-width="1.5" opacity="0.8"/>
|
||||||
|
<text x="232.5" y="176.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">0.5mm</text>
|
||||||
|
<polyline points="180.0,180.0 180.0,267.5 285.0,267.5" fill="none" stroke="#94a3b8" stroke-width="1.5" opacity="0.8"/>
|
||||||
|
<text x="232.5" y="219.8" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">0.5mm</text>
|
||||||
|
<polyline points="285.0,92.5 390.0,92.5 390.0,180.0" fill="none" stroke="#94a3b8" stroke-width="1.5" opacity="0.8"/>
|
||||||
|
<text x="337.5" y="132.2" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">0.5mm 512GB/s</text>
|
||||||
|
<line x1="285.0" y1="180.0" x2="390.0" y2="180.0" stroke="#94a3b8" stroke-width="1.5" opacity="0.8"/>
|
||||||
|
<text x="337.5" y="176.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">0.5mm 512GB/s</text>
|
||||||
|
<polyline points="285.0,267.5 390.0,267.5 390.0,180.0" fill="none" stroke="#94a3b8" stroke-width="1.5" opacity="0.8"/>
|
||||||
|
<text x="337.5" y="219.8" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">0.5mm 512GB/s</text>
|
||||||
|
<rect x="48.8" y="155.5" width="87.5" height="49.0" rx="4" fill="#ef4444" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="92.5" y="184.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">PE CPU</text>
|
||||||
|
<rect x="136.2" y="155.5" width="87.5" height="49.0" rx="4" fill="#f59e0b" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="180.0" y="184.0" text-anchor="middle" font-family="monospace" font-size="9" fill="#1e293b">PE SCHEDULER</text>
|
||||||
|
<rect x="241.2" y="68.0" width="87.5" height="49.0" rx="4" fill="#3b82f6" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="285.0" y="96.5" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">PE DMA</text>
|
||||||
|
<rect x="241.2" y="155.5" width="87.5" height="49.0" rx="4" fill="#8b5cf6" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="285.0" y="184.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">PE GEMM</text>
|
||||||
|
<rect x="241.2" y="243.0" width="87.5" height="49.0" rx="4" fill="#ec4899" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="285.0" y="271.5" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">PE MATH</text>
|
||||||
|
<rect x="346.2" y="155.5" width="87.5" height="49.0" rx="4" fill="#10b981" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="390.0" y="184.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#ffffff">PE TCM</text>
|
||||||
|
</svg>
|
||||||
|
After Width: | Height: | Size: 3.2 KiB |
@@ -0,0 +1,72 @@
|
|||||||
|
<svg xmlns="http://www.w3.org/2000/svg" width="820" height="500" viewBox="0 0 820 500" font-family="monospace">
|
||||||
|
<rect width="820" height="500" fill="#f8fafc" rx="6"/>
|
||||||
|
<text x="410" y="32" text-anchor="middle" font-size="16" font-weight="bold" fill="#1e293b">Placement: column_wise</text>
|
||||||
|
<text x="410.0" y="54.0" text-anchor="middle" font-size="12" fill="#475569" font-weight="normal">Tensor (1024×512) fp16 → K axis split into 8 parts</text>
|
||||||
|
<text x="320.0" y="82.0" text-anchor="middle" font-size="11" fill="#475569" font-weight="normal">← K=512 →</text>
|
||||||
|
<text x="68.0" y="250.0" text-anchor="middle" font-size="11" fill="#475569" transform="rotate(-90 68.0 250.0)">↑ M=1024 ↓</text>
|
||||||
|
<rect x="80.0" y="90.0" width="60.0" height="320.0" fill="#3b82f6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="110.0" y="246.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE0</text>
|
||||||
|
<text x="110.0" y="262.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(1024×64)</text>
|
||||||
|
<rect x="140.0" y="90.0" width="60.0" height="320.0" fill="#10b981" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="170.0" y="246.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE1</text>
|
||||||
|
<text x="170.0" y="262.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(1024×64)</text>
|
||||||
|
<rect x="200.0" y="90.0" width="60.0" height="320.0" fill="#f59e0b" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="230.0" y="246.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE2</text>
|
||||||
|
<text x="230.0" y="262.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">(1024×64)</text>
|
||||||
|
<rect x="260.0" y="90.0" width="60.0" height="320.0" fill="#ef4444" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="290.0" y="246.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE3</text>
|
||||||
|
<text x="290.0" y="262.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(1024×64)</text>
|
||||||
|
<rect x="320.0" y="90.0" width="60.0" height="320.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="350.0" y="246.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE4</text>
|
||||||
|
<text x="350.0" y="262.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(1024×64)</text>
|
||||||
|
<rect x="380.0" y="90.0" width="60.0" height="320.0" fill="#ec4899" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="410.0" y="246.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE5</text>
|
||||||
|
<text x="410.0" y="262.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(1024×64)</text>
|
||||||
|
<rect x="440.0" y="90.0" width="60.0" height="320.0" fill="#06b6d4" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="470.0" y="246.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE6</text>
|
||||||
|
<text x="470.0" y="262.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">(1024×64)</text>
|
||||||
|
<rect x="500.0" y="90.0" width="60.0" height="320.0" fill="#f97316" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="530.0" y="246.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE7</text>
|
||||||
|
<text x="530.0" y="262.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(1024×64)</text>
|
||||||
|
<rect x="80.0" y="90.0" width="480.0" height="320.0" fill="none" stroke="#1e293b" stroke-width="2" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="110.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=0 B</text>
|
||||||
|
<text x="110.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
|
||||||
|
<text x="170.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=128 KB</text>
|
||||||
|
<text x="170.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
|
||||||
|
<text x="230.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=256 KB</text>
|
||||||
|
<text x="230.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
|
||||||
|
<text x="290.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=384 KB</text>
|
||||||
|
<text x="290.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
|
||||||
|
<text x="350.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=512 KB</text>
|
||||||
|
<text x="350.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
|
||||||
|
<text x="410.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=640 KB</text>
|
||||||
|
<text x="410.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
|
||||||
|
<text x="470.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=768 KB</text>
|
||||||
|
<text x="470.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
|
||||||
|
<text x="530.0" y="426.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">off=896 KB</text>
|
||||||
|
<text x="530.0" y="440.0" text-anchor="middle" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
|
||||||
|
<text x="670.0" y="100.0" text-anchor="middle" font-size="12" fill="#1e293b" font-weight="bold">PE Legend</text>
|
||||||
|
<rect x="620.0" y="106.0" width="16.0" height="16.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="642.0" y="118.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE0</text>
|
||||||
|
<rect x="620.0" y="128.0" width="16.0" height="16.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="642.0" y="140.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE1</text>
|
||||||
|
<rect x="620.0" y="150.0" width="16.0" height="16.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="642.0" y="162.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE2</text>
|
||||||
|
<rect x="620.0" y="172.0" width="16.0" height="16.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="642.0" y="184.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE3</text>
|
||||||
|
<rect x="620.0" y="194.0" width="16.0" height="16.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="642.0" y="206.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE4</text>
|
||||||
|
<rect x="620.0" y="216.0" width="16.0" height="16.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="642.0" y="228.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE5</text>
|
||||||
|
<rect x="620.0" y="238.0" width="16.0" height="16.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="642.0" y="250.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE6</text>
|
||||||
|
<rect x="620.0" y="260.0" width="16.0" height="16.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="642.0" y="272.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE7</text>
|
||||||
|
<rect x="620.0" y="320.0" width="167.0" height="120.0" fill="#e2e8f0" stroke="#94a3b8" stroke-width="1" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="630.0" y="338.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Strategy: column_wise</text>
|
||||||
|
<text x="630.0" y="356.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Split axis: K</text>
|
||||||
|
<text x="630.0" y="374.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Shards: 8</text>
|
||||||
|
<text x="630.0" y="392.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Each: (1024, 64)</text>
|
||||||
|
<text x="630.0" y="410.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Each: 128 KB</text>
|
||||||
|
<text x="630.0" y="428.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Total: 1 MB</text>
|
||||||
|
</svg>
|
||||||
|
After Width: | Height: | Size: 8.1 KiB |
@@ -0,0 +1,47 @@
|
|||||||
|
<svg xmlns="http://www.w3.org/2000/svg" width="820" height="500" viewBox="0 0 820 500" font-family="monospace">
|
||||||
|
<rect width="820" height="500" fill="#f8fafc" rx="6"/>
|
||||||
|
<text x="410" y="32" text-anchor="middle" font-size="16" font-weight="bold" fill="#1e293b">Placement: replicate</text>
|
||||||
|
<text x="410.0" y="54.0" text-anchor="middle" font-size="12" fill="#475569" font-weight="normal">Tensor (1024×512) fp16 → full copy to each PE</text>
|
||||||
|
<rect x="60.0" y="90.0" width="163.0" height="162.0" fill="#3b82f6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="141.5" y="157.0" text-anchor="middle" font-size="14" fill="#fff" font-weight="bold">PE0</text>
|
||||||
|
<text x="141.5" y="177.0" text-anchor="middle" font-size="11" fill="#fff" font-weight="normal">(1024×512)</text>
|
||||||
|
<text x="141.5" y="193.0" text-anchor="middle" font-size="10" fill="#fff" font-weight="normal">1 MB</text>
|
||||||
|
<text x="141.5" y="207.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">offset=0</text>
|
||||||
|
<rect x="239.0" y="90.0" width="163.0" height="162.0" fill="#10b981" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="320.5" y="157.0" text-anchor="middle" font-size="14" fill="#fff" font-weight="bold">PE1</text>
|
||||||
|
<text x="320.5" y="177.0" text-anchor="middle" font-size="11" fill="#fff" font-weight="normal">(1024×512)</text>
|
||||||
|
<text x="320.5" y="193.0" text-anchor="middle" font-size="10" fill="#fff" font-weight="normal">1 MB</text>
|
||||||
|
<text x="320.5" y="207.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">offset=0</text>
|
||||||
|
<rect x="418.0" y="90.0" width="163.0" height="162.0" fill="#f59e0b" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="499.5" y="157.0" text-anchor="middle" font-size="14" fill="#000" font-weight="bold">PE2</text>
|
||||||
|
<text x="499.5" y="177.0" text-anchor="middle" font-size="11" fill="#000" font-weight="normal">(1024×512)</text>
|
||||||
|
<text x="499.5" y="193.0" text-anchor="middle" font-size="10" fill="#000" font-weight="normal">1 MB</text>
|
||||||
|
<text x="499.5" y="207.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">offset=0</text>
|
||||||
|
<rect x="597.0" y="90.0" width="163.0" height="162.0" fill="#ef4444" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="678.5" y="157.0" text-anchor="middle" font-size="14" fill="#fff" font-weight="bold">PE3</text>
|
||||||
|
<text x="678.5" y="177.0" text-anchor="middle" font-size="11" fill="#fff" font-weight="normal">(1024×512)</text>
|
||||||
|
<text x="678.5" y="193.0" text-anchor="middle" font-size="10" fill="#fff" font-weight="normal">1 MB</text>
|
||||||
|
<text x="678.5" y="207.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">offset=0</text>
|
||||||
|
<rect x="60.0" y="268.0" width="163.0" height="162.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="141.5" y="335.0" text-anchor="middle" font-size="14" fill="#fff" font-weight="bold">PE4</text>
|
||||||
|
<text x="141.5" y="355.0" text-anchor="middle" font-size="11" fill="#fff" font-weight="normal">(1024×512)</text>
|
||||||
|
<text x="141.5" y="371.0" text-anchor="middle" font-size="10" fill="#fff" font-weight="normal">1 MB</text>
|
||||||
|
<text x="141.5" y="385.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">offset=0</text>
|
||||||
|
<rect x="239.0" y="268.0" width="163.0" height="162.0" fill="#ec4899" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="320.5" y="335.0" text-anchor="middle" font-size="14" fill="#fff" font-weight="bold">PE5</text>
|
||||||
|
<text x="320.5" y="355.0" text-anchor="middle" font-size="11" fill="#fff" font-weight="normal">(1024×512)</text>
|
||||||
|
<text x="320.5" y="371.0" text-anchor="middle" font-size="10" fill="#fff" font-weight="normal">1 MB</text>
|
||||||
|
<text x="320.5" y="385.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">offset=0</text>
|
||||||
|
<rect x="418.0" y="268.0" width="163.0" height="162.0" fill="#06b6d4" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="499.5" y="335.0" text-anchor="middle" font-size="14" fill="#000" font-weight="bold">PE6</text>
|
||||||
|
<text x="499.5" y="355.0" text-anchor="middle" font-size="11" fill="#000" font-weight="normal">(1024×512)</text>
|
||||||
|
<text x="499.5" y="371.0" text-anchor="middle" font-size="10" fill="#000" font-weight="normal">1 MB</text>
|
||||||
|
<text x="499.5" y="385.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">offset=0</text>
|
||||||
|
<rect x="597.0" y="268.0" width="163.0" height="162.0" fill="#f97316" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="678.5" y="335.0" text-anchor="middle" font-size="14" fill="#fff" font-weight="bold">PE7</text>
|
||||||
|
<text x="678.5" y="355.0" text-anchor="middle" font-size="11" fill="#fff" font-weight="normal">(1024×512)</text>
|
||||||
|
<text x="678.5" y="371.0" text-anchor="middle" font-size="10" fill="#fff" font-weight="normal">1 MB</text>
|
||||||
|
<text x="678.5" y="385.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">offset=0</text>
|
||||||
|
<rect x="60.0" y="450.0" width="496.0" height="30.0" fill="#e2e8f0" stroke="#94a3b8" stroke-width="1" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="70.0" y="468.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Strategy: replicate | Shards: 8 | Each: 1 MB | Total mem: 8 MB</text>
|
||||||
|
</svg>
|
||||||
|
After Width: | Height: | Size: 5.2 KiB |
@@ -0,0 +1,72 @@
|
|||||||
|
<svg xmlns="http://www.w3.org/2000/svg" width="820" height="560" viewBox="0 0 820 560" font-family="monospace">
|
||||||
|
<rect width="820" height="560" fill="#f8fafc" rx="6"/>
|
||||||
|
<text x="410" y="32" text-anchor="middle" font-size="16" font-weight="bold" fill="#1e293b">Placement: row_wise</text>
|
||||||
|
<text x="410.0" y="54.0" text-anchor="middle" font-size="12" fill="#475569" font-weight="normal">Tensor (1024×512) fp16 → M axis split into 8 parts</text>
|
||||||
|
<text x="240.0" y="82.0" text-anchor="middle" font-size="11" fill="#475569" font-weight="normal">← K=512 →</text>
|
||||||
|
<text x="68.0" y="290.0" text-anchor="middle" font-size="11" fill="#475569" transform="rotate(-90 68.0 290.0)">↑ M=1024 ↓</text>
|
||||||
|
<rect x="80.0" y="90.0" width="320.0" height="50.0" fill="#3b82f6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="240.0" y="111.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE0</text>
|
||||||
|
<text x="240.0" y="127.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(128×512)</text>
|
||||||
|
<rect x="80.0" y="140.0" width="320.0" height="50.0" fill="#10b981" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="240.0" y="161.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE1</text>
|
||||||
|
<text x="240.0" y="177.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(128×512)</text>
|
||||||
|
<rect x="80.0" y="190.0" width="320.0" height="50.0" fill="#f59e0b" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="240.0" y="211.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE2</text>
|
||||||
|
<text x="240.0" y="227.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">(128×512)</text>
|
||||||
|
<rect x="80.0" y="240.0" width="320.0" height="50.0" fill="#ef4444" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="240.0" y="261.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE3</text>
|
||||||
|
<text x="240.0" y="277.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(128×512)</text>
|
||||||
|
<rect x="80.0" y="290.0" width="320.0" height="50.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="240.0" y="311.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE4</text>
|
||||||
|
<text x="240.0" y="327.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(128×512)</text>
|
||||||
|
<rect x="80.0" y="340.0" width="320.0" height="50.0" fill="#ec4899" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="240.0" y="361.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE5</text>
|
||||||
|
<text x="240.0" y="377.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(128×512)</text>
|
||||||
|
<rect x="80.0" y="390.0" width="320.0" height="50.0" fill="#06b6d4" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="240.0" y="411.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE6</text>
|
||||||
|
<text x="240.0" y="427.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">(128×512)</text>
|
||||||
|
<rect x="80.0" y="440.0" width="320.0" height="50.0" fill="#f97316" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="240.0" y="461.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE7</text>
|
||||||
|
<text x="240.0" y="477.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">(128×512)</text>
|
||||||
|
<rect x="80.0" y="90.0" width="320.0" height="400.0" fill="none" stroke="#1e293b" stroke-width="2" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="410.0" y="111.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=0 B</text>
|
||||||
|
<text x="410.0" y="125.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
|
||||||
|
<text x="410.0" y="161.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=128 KB</text>
|
||||||
|
<text x="410.0" y="175.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
|
||||||
|
<text x="410.0" y="211.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=256 KB</text>
|
||||||
|
<text x="410.0" y="225.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
|
||||||
|
<text x="410.0" y="261.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=384 KB</text>
|
||||||
|
<text x="410.0" y="275.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
|
||||||
|
<text x="410.0" y="311.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=512 KB</text>
|
||||||
|
<text x="410.0" y="325.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
|
||||||
|
<text x="410.0" y="361.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=640 KB</text>
|
||||||
|
<text x="410.0" y="375.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
|
||||||
|
<text x="410.0" y="411.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=768 KB</text>
|
||||||
|
<text x="410.0" y="425.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
|
||||||
|
<text x="410.0" y="461.0" text-anchor="start" font-size="9" fill="#475569" font-weight="normal">off=896 KB</text>
|
||||||
|
<text x="410.0" y="475.0" text-anchor="start" font-size="9" fill="#64748b" font-weight="normal">128 KB</text>
|
||||||
|
<text x="630.0" y="100.0" text-anchor="middle" font-size="12" fill="#1e293b" font-weight="bold">PE Legend</text>
|
||||||
|
<rect x="580.0" y="106.0" width="16.0" height="16.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="602.0" y="118.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE0</text>
|
||||||
|
<rect x="580.0" y="128.0" width="16.0" height="16.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="602.0" y="140.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE1</text>
|
||||||
|
<rect x="580.0" y="150.0" width="16.0" height="16.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="602.0" y="162.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE2</text>
|
||||||
|
<rect x="580.0" y="172.0" width="16.0" height="16.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="602.0" y="184.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE3</text>
|
||||||
|
<rect x="580.0" y="194.0" width="16.0" height="16.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="602.0" y="206.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE4</text>
|
||||||
|
<rect x="580.0" y="216.0" width="16.0" height="16.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="602.0" y="228.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE5</text>
|
||||||
|
<rect x="580.0" y="238.0" width="16.0" height="16.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="602.0" y="250.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE6</text>
|
||||||
|
<rect x="580.0" y="260.0" width="16.0" height="16.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="602.0" y="272.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE7</text>
|
||||||
|
<rect x="580.0" y="320.0" width="146.0" height="120.0" fill="#e2e8f0" stroke="#94a3b8" stroke-width="1" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="590.0" y="338.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Strategy: row_wise</text>
|
||||||
|
<text x="590.0" y="356.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Split axis: M</text>
|
||||||
|
<text x="590.0" y="374.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Shards: 8</text>
|
||||||
|
<text x="590.0" y="392.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Each: (128, 512)</text>
|
||||||
|
<text x="590.0" y="410.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Each: 128 KB</text>
|
||||||
|
<text x="590.0" y="428.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Total: 1 MB</text>
|
||||||
|
</svg>
|
||||||
|
After Width: | Height: | Size: 8.1 KiB |
@@ -0,0 +1,116 @@
|
|||||||
|
<svg xmlns="http://www.w3.org/2000/svg" width="820" height="620" viewBox="0 0 820 620" font-family="monospace">
|
||||||
|
<rect width="820" height="620" fill="#f8fafc" rx="6"/>
|
||||||
|
<text x="410" y="32" text-anchor="middle" font-size="16" font-weight="bold" fill="#1e293b">Placement: tiled_column_major</text>
|
||||||
|
<text x="410.0" y="54.0" text-anchor="middle" font-size="11" fill="#475569" font-weight="normal">Tensor (1024×512) fp16, tile=(256×128) → 4×4=16 tiles, column-major (K first)</text>
|
||||||
|
<text x="280.0" y="82.0" text-anchor="middle" font-size="11" fill="#475569" font-weight="normal">← K=512 →</text>
|
||||||
|
<text x="68.0" y="290.0" text-anchor="middle" font-size="11" fill="#475569" transform="rotate(-90 68.0 290.0)">↑ M=1024 ↓</text>
|
||||||
|
<rect x="80.0" y="90.0" width="100.0" height="100.0" fill="#3b82f6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="130.0" y="136.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE0</text>
|
||||||
|
<text x="130.0" y="152.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t0</text>
|
||||||
|
<rect x="180.0" y="90.0" width="100.0" height="100.0" fill="#10b981" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="230.0" y="136.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE1</text>
|
||||||
|
<text x="230.0" y="152.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t1</text>
|
||||||
|
<rect x="280.0" y="90.0" width="100.0" height="100.0" fill="#f59e0b" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="330.0" y="136.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE2</text>
|
||||||
|
<text x="330.0" y="152.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t2</text>
|
||||||
|
<rect x="380.0" y="90.0" width="100.0" height="100.0" fill="#ef4444" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="430.0" y="136.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE3</text>
|
||||||
|
<text x="430.0" y="152.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t3</text>
|
||||||
|
<rect x="80.0" y="190.0" width="100.0" height="100.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="130.0" y="236.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE4</text>
|
||||||
|
<text x="130.0" y="252.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t4</text>
|
||||||
|
<rect x="180.0" y="190.0" width="100.0" height="100.0" fill="#ec4899" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="230.0" y="236.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE5</text>
|
||||||
|
<text x="230.0" y="252.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t5</text>
|
||||||
|
<rect x="280.0" y="190.0" width="100.0" height="100.0" fill="#06b6d4" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="330.0" y="236.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE6</text>
|
||||||
|
<text x="330.0" y="252.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t6</text>
|
||||||
|
<rect x="380.0" y="190.0" width="100.0" height="100.0" fill="#f97316" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="430.0" y="236.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE7</text>
|
||||||
|
<text x="430.0" y="252.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t7</text>
|
||||||
|
<rect x="80.0" y="290.0" width="100.0" height="100.0" fill="#3b82f6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="130.0" y="336.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE0</text>
|
||||||
|
<text x="130.0" y="352.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t8</text>
|
||||||
|
<rect x="180.0" y="290.0" width="100.0" height="100.0" fill="#10b981" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="230.0" y="336.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE1</text>
|
||||||
|
<text x="230.0" y="352.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t9</text>
|
||||||
|
<rect x="280.0" y="290.0" width="100.0" height="100.0" fill="#f59e0b" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="330.0" y="336.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE2</text>
|
||||||
|
<text x="330.0" y="352.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t10</text>
|
||||||
|
<rect x="380.0" y="290.0" width="100.0" height="100.0" fill="#ef4444" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="430.0" y="336.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE3</text>
|
||||||
|
<text x="430.0" y="352.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t11</text>
|
||||||
|
<rect x="80.0" y="390.0" width="100.0" height="100.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="130.0" y="436.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE4</text>
|
||||||
|
<text x="130.0" y="452.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t12</text>
|
||||||
|
<rect x="180.0" y="390.0" width="100.0" height="100.0" fill="#ec4899" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="230.0" y="436.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE5</text>
|
||||||
|
<text x="230.0" y="452.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t13</text>
|
||||||
|
<rect x="280.0" y="390.0" width="100.0" height="100.0" fill="#06b6d4" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="330.0" y="436.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE6</text>
|
||||||
|
<text x="330.0" y="452.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t14</text>
|
||||||
|
<rect x="380.0" y="390.0" width="100.0" height="100.0" fill="#f97316" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="430.0" y="436.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE7</text>
|
||||||
|
<text x="430.0" y="452.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t15</text>
|
||||||
|
<rect x="80.0" y="90.0" width="400.0" height="400.0" fill="none" stroke="#1e293b" stroke-width="2" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="130.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=0..127</text>
|
||||||
|
<text x="230.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=128..255</text>
|
||||||
|
<text x="330.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=256..383</text>
|
||||||
|
<text x="430.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=384..511</text>
|
||||||
|
<text x="64.0" y="140.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=0..255</text>
|
||||||
|
<text x="64.0" y="240.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=256..511</text>
|
||||||
|
<text x="64.0" y="340.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=512..767</text>
|
||||||
|
<text x="64.0" y="440.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=768..1023</text>
|
||||||
|
<text x="590.0" y="90.0" text-anchor="middle" font-size="12" fill="#1e293b" font-weight="bold">PE Legend</text>
|
||||||
|
<rect x="540.0" y="96.0" width="16.0" height="16.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="562.0" y="108.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE0</text>
|
||||||
|
<rect x="540.0" y="118.0" width="16.0" height="16.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="562.0" y="130.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE1</text>
|
||||||
|
<rect x="540.0" y="140.0" width="16.0" height="16.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="562.0" y="152.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE2</text>
|
||||||
|
<rect x="540.0" y="162.0" width="16.0" height="16.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="562.0" y="174.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE3</text>
|
||||||
|
<rect x="540.0" y="184.0" width="16.0" height="16.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="562.0" y="196.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE4</text>
|
||||||
|
<rect x="540.0" y="206.0" width="16.0" height="16.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="562.0" y="218.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE5</text>
|
||||||
|
<rect x="540.0" y="228.0" width="16.0" height="16.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="562.0" y="240.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE6</text>
|
||||||
|
<rect x="540.0" y="250.0" width="16.0" height="16.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="562.0" y="262.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE7</text>
|
||||||
|
<text x="540.0" y="310.0" text-anchor="middle" font-size="12" fill="#1e293b" font-weight="bold">Tile Assignment Order</text>
|
||||||
|
<rect x="540.0" y="318.0" width="12.0" height="12.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="328.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 0 → PE0 (0,0) off=0 B</text>
|
||||||
|
<rect x="540.0" y="334.0" width="12.0" height="12.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="344.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 1 → PE1 (0,1) off=256 B</text>
|
||||||
|
<rect x="540.0" y="350.0" width="12.0" height="12.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="360.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 2 → PE2 (0,2) off=512 B</text>
|
||||||
|
<rect x="540.0" y="366.0" width="12.0" height="12.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="376.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 3 → PE3 (0,3) off=768 B</text>
|
||||||
|
<rect x="540.0" y="382.0" width="12.0" height="12.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="392.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 4 → PE4 (1,0) off=256 KB</text>
|
||||||
|
<rect x="540.0" y="398.0" width="12.0" height="12.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="408.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 5 → PE5 (1,1) off=256 KB</text>
|
||||||
|
<rect x="540.0" y="414.0" width="12.0" height="12.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="424.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 6 → PE6 (1,2) off=256 KB</text>
|
||||||
|
<rect x="540.0" y="430.0" width="12.0" height="12.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="440.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 7 → PE7 (1,3) off=256 KB</text>
|
||||||
|
<rect x="540.0" y="446.0" width="12.0" height="12.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="456.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 8 → PE0 (2,0) off=512 KB</text>
|
||||||
|
<rect x="540.0" y="462.0" width="12.0" height="12.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="472.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 9 → PE1 (2,1) off=512 KB</text>
|
||||||
|
<rect x="540.0" y="478.0" width="12.0" height="12.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="488.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t10 → PE2 (2,2) off=512 KB</text>
|
||||||
|
<rect x="540.0" y="494.0" width="12.0" height="12.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="504.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t11 → PE3 (2,3) off=512 KB</text>
|
||||||
|
<rect x="540.0" y="510.0" width="12.0" height="12.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="520.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t12 → PE4 (3,0) off=768 KB</text>
|
||||||
|
<rect x="540.0" y="526.0" width="12.0" height="12.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="536.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t13 → PE5 (3,1) off=768 KB</text>
|
||||||
|
<rect x="540.0" y="542.0" width="12.0" height="12.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="552.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t14 → PE6 (3,2) off=768 KB</text>
|
||||||
|
<rect x="540.0" y="558.0" width="12.0" height="12.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="568.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t15 → PE7 (3,3) off=768 KB</text>
|
||||||
|
<rect x="80.0" y="560.0" width="608.0" height="30.0" fill="#e2e8f0" stroke="#94a3b8" stroke-width="1" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="90.0" y="578.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Strategy: tiled_column_major | Tile: (256×128)=64 KB | Tiles: 16 | Total: 1 MB</text>
|
||||||
|
</svg>
|
||||||
|
After Width: | Height: | Size: 14 KiB |
@@ -0,0 +1,116 @@
|
|||||||
|
<svg xmlns="http://www.w3.org/2000/svg" width="820" height="620" viewBox="0 0 820 620" font-family="monospace">
|
||||||
|
<rect width="820" height="620" fill="#f8fafc" rx="6"/>
|
||||||
|
<text x="410" y="32" text-anchor="middle" font-size="16" font-weight="bold" fill="#1e293b">Placement: tiled_row_major</text>
|
||||||
|
<text x="410.0" y="54.0" text-anchor="middle" font-size="11" fill="#475569" font-weight="normal">Tensor (1024×512) fp16, tile=(256×128) → 4×4=16 tiles, row-major (M first)</text>
|
||||||
|
<text x="280.0" y="82.0" text-anchor="middle" font-size="11" fill="#475569" font-weight="normal">← K=512 →</text>
|
||||||
|
<text x="68.0" y="290.0" text-anchor="middle" font-size="11" fill="#475569" transform="rotate(-90 68.0 290.0)">↑ M=1024 ↓</text>
|
||||||
|
<rect x="80.0" y="90.0" width="100.0" height="100.0" fill="#3b82f6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="130.0" y="136.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE0</text>
|
||||||
|
<text x="130.0" y="152.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t0</text>
|
||||||
|
<rect x="80.0" y="190.0" width="100.0" height="100.0" fill="#10b981" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="130.0" y="236.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE1</text>
|
||||||
|
<text x="130.0" y="252.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t1</text>
|
||||||
|
<rect x="80.0" y="290.0" width="100.0" height="100.0" fill="#f59e0b" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="130.0" y="336.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE2</text>
|
||||||
|
<text x="130.0" y="352.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t2</text>
|
||||||
|
<rect x="80.0" y="390.0" width="100.0" height="100.0" fill="#ef4444" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="130.0" y="436.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE3</text>
|
||||||
|
<text x="130.0" y="452.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t3</text>
|
||||||
|
<rect x="180.0" y="90.0" width="100.0" height="100.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="230.0" y="136.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE4</text>
|
||||||
|
<text x="230.0" y="152.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t4</text>
|
||||||
|
<rect x="180.0" y="190.0" width="100.0" height="100.0" fill="#ec4899" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="230.0" y="236.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE5</text>
|
||||||
|
<text x="230.0" y="252.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t5</text>
|
||||||
|
<rect x="180.0" y="290.0" width="100.0" height="100.0" fill="#06b6d4" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="230.0" y="336.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE6</text>
|
||||||
|
<text x="230.0" y="352.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t6</text>
|
||||||
|
<rect x="180.0" y="390.0" width="100.0" height="100.0" fill="#f97316" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="230.0" y="436.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE7</text>
|
||||||
|
<text x="230.0" y="452.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t7</text>
|
||||||
|
<rect x="280.0" y="90.0" width="100.0" height="100.0" fill="#3b82f6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="330.0" y="136.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE0</text>
|
||||||
|
<text x="330.0" y="152.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t8</text>
|
||||||
|
<rect x="280.0" y="190.0" width="100.0" height="100.0" fill="#10b981" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="330.0" y="236.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE1</text>
|
||||||
|
<text x="330.0" y="252.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t9</text>
|
||||||
|
<rect x="280.0" y="290.0" width="100.0" height="100.0" fill="#f59e0b" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="330.0" y="336.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE2</text>
|
||||||
|
<text x="330.0" y="352.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t10</text>
|
||||||
|
<rect x="280.0" y="390.0" width="100.0" height="100.0" fill="#ef4444" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="330.0" y="436.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE3</text>
|
||||||
|
<text x="330.0" y="452.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t11</text>
|
||||||
|
<rect x="380.0" y="90.0" width="100.0" height="100.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="430.0" y="136.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE4</text>
|
||||||
|
<text x="430.0" y="152.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t12</text>
|
||||||
|
<rect x="380.0" y="190.0" width="100.0" height="100.0" fill="#ec4899" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="430.0" y="236.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE5</text>
|
||||||
|
<text x="430.0" y="252.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t13</text>
|
||||||
|
<rect x="380.0" y="290.0" width="100.0" height="100.0" fill="#06b6d4" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="430.0" y="336.0" text-anchor="middle" font-size="12" fill="#000" font-weight="bold">PE6</text>
|
||||||
|
<text x="430.0" y="352.0" text-anchor="middle" font-size="9" fill="#000" font-weight="normal">t14</text>
|
||||||
|
<rect x="380.0" y="390.0" width="100.0" height="100.0" fill="#f97316" stroke="#334155" stroke-width="1.5" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="430.0" y="436.0" text-anchor="middle" font-size="12" fill="#fff" font-weight="bold">PE7</text>
|
||||||
|
<text x="430.0" y="452.0" text-anchor="middle" font-size="9" fill="#fff" font-weight="normal">t15</text>
|
||||||
|
<rect x="80.0" y="90.0" width="400.0" height="400.0" fill="none" stroke="#1e293b" stroke-width="2" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="130.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=0..127</text>
|
||||||
|
<text x="230.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=128..255</text>
|
||||||
|
<text x="330.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=256..383</text>
|
||||||
|
<text x="430.0" y="506.0" text-anchor="middle" font-size="9" fill="#475569" font-weight="normal">k=384..511</text>
|
||||||
|
<text x="64.0" y="140.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=0..255</text>
|
||||||
|
<text x="64.0" y="240.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=256..511</text>
|
||||||
|
<text x="64.0" y="340.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=512..767</text>
|
||||||
|
<text x="64.0" y="440.0" text-anchor="end" font-size="9" fill="#475569" font-weight="normal">m=768..1023</text>
|
||||||
|
<text x="590.0" y="90.0" text-anchor="middle" font-size="12" fill="#1e293b" font-weight="bold">PE Legend</text>
|
||||||
|
<rect x="540.0" y="96.0" width="16.0" height="16.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="562.0" y="108.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE0</text>
|
||||||
|
<rect x="540.0" y="118.0" width="16.0" height="16.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="562.0" y="130.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE1</text>
|
||||||
|
<rect x="540.0" y="140.0" width="16.0" height="16.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="562.0" y="152.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE2</text>
|
||||||
|
<rect x="540.0" y="162.0" width="16.0" height="16.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="562.0" y="174.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE3</text>
|
||||||
|
<rect x="540.0" y="184.0" width="16.0" height="16.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="562.0" y="196.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE4</text>
|
||||||
|
<rect x="540.0" y="206.0" width="16.0" height="16.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="562.0" y="218.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE5</text>
|
||||||
|
<rect x="540.0" y="228.0" width="16.0" height="16.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="562.0" y="240.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE6</text>
|
||||||
|
<rect x="540.0" y="250.0" width="16.0" height="16.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="562.0" y="262.0" text-anchor="start" font-size="11" fill="#1e293b" font-weight="normal">PE7</text>
|
||||||
|
<text x="540.0" y="310.0" text-anchor="middle" font-size="12" fill="#1e293b" font-weight="bold">Tile Assignment Order</text>
|
||||||
|
<rect x="540.0" y="318.0" width="12.0" height="12.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="328.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 0 → PE0 (0,0) off=0 B</text>
|
||||||
|
<rect x="540.0" y="334.0" width="12.0" height="12.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="344.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 1 → PE1 (1,0) off=256 KB</text>
|
||||||
|
<rect x="540.0" y="350.0" width="12.0" height="12.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="360.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 2 → PE2 (2,0) off=512 KB</text>
|
||||||
|
<rect x="540.0" y="366.0" width="12.0" height="12.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="376.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 3 → PE3 (3,0) off=768 KB</text>
|
||||||
|
<rect x="540.0" y="382.0" width="12.0" height="12.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="392.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 4 → PE4 (0,1) off=256 B</text>
|
||||||
|
<rect x="540.0" y="398.0" width="12.0" height="12.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="408.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 5 → PE5 (1,1) off=256 KB</text>
|
||||||
|
<rect x="540.0" y="414.0" width="12.0" height="12.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="424.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 6 → PE6 (2,1) off=512 KB</text>
|
||||||
|
<rect x="540.0" y="430.0" width="12.0" height="12.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="440.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 7 → PE7 (3,1) off=768 KB</text>
|
||||||
|
<rect x="540.0" y="446.0" width="12.0" height="12.0" fill="#3b82f6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="456.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 8 → PE0 (0,2) off=512 B</text>
|
||||||
|
<rect x="540.0" y="462.0" width="12.0" height="12.0" fill="#10b981" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="472.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t 9 → PE1 (1,2) off=256 KB</text>
|
||||||
|
<rect x="540.0" y="478.0" width="12.0" height="12.0" fill="#f59e0b" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="488.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t10 → PE2 (2,2) off=512 KB</text>
|
||||||
|
<rect x="540.0" y="494.0" width="12.0" height="12.0" fill="#ef4444" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="504.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t11 → PE3 (3,2) off=768 KB</text>
|
||||||
|
<rect x="540.0" y="510.0" width="12.0" height="12.0" fill="#8b5cf6" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="520.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t12 → PE4 (0,3) off=768 B</text>
|
||||||
|
<rect x="540.0" y="526.0" width="12.0" height="12.0" fill="#ec4899" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="536.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t13 → PE5 (1,3) off=256 KB</text>
|
||||||
|
<rect x="540.0" y="542.0" width="12.0" height="12.0" fill="#06b6d4" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="552.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t14 → PE6 (2,3) off=512 KB</text>
|
||||||
|
<rect x="540.0" y="558.0" width="12.0" height="12.0" fill="#f97316" stroke="#334155" stroke-width="1.0" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="558.0" y="568.0" text-anchor="start" font-size="9" fill="#334155" font-weight="normal">t15 → PE7 (3,3) off=768 KB</text>
|
||||||
|
<rect x="80.0" y="560.0" width="587.0" height="30.0" fill="#e2e8f0" stroke="#94a3b8" stroke-width="1" fill-opacity="1.0" rx="2"/>
|
||||||
|
<text x="90.0" y="578.0" text-anchor="start" font-size="10" fill="#334155" font-weight="normal">Strategy: tiled_row_major | Tile: (256×128)=64 KB | Tiles: 16 | Total: 1 MB</text>
|
||||||
|
</svg>
|
||||||
|
After Width: | Height: | Size: 14 KiB |
@@ -0,0 +1,95 @@
|
|||||||
|
<svg xmlns="http://www.w3.org/2000/svg" width="648" height="648" viewBox="0 0 648 648">
|
||||||
|
<title>sip</title>
|
||||||
|
<rect width="648" height="648" fill="#f8fafc"/>
|
||||||
|
<text x="324" y="18" text-anchor="middle" font-family="monospace" font-size="14" font-weight="bold" fill="#1e293b">SIP VIEW</text>
|
||||||
|
<line x1="108.0" y1="144.0" x2="252.0" y2="144.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="180.0" y="140.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
|
||||||
|
<line x1="108.0" y1="144.0" x2="108.0" y2="264.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="108.0" y="200.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
|
||||||
|
<line x1="252.0" y1="144.0" x2="396.0" y2="144.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="324.0" y="140.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
|
||||||
|
<line x1="252.0" y1="144.0" x2="252.0" y2="264.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="252.0" y="200.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
|
||||||
|
<line x1="396.0" y1="144.0" x2="540.0" y2="144.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="468.0" y="140.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
|
||||||
|
<line x1="396.0" y1="144.0" x2="396.0" y2="264.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="396.0" y="200.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
|
||||||
|
<line x1="540.0" y1="144.0" x2="540.0" y2="264.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="540.0" y="200.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
|
||||||
|
<line x1="108.0" y1="264.0" x2="252.0" y2="264.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="180.0" y="260.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
|
||||||
|
<line x1="108.0" y1="264.0" x2="108.0" y2="384.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="108.0" y="320.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
|
||||||
|
<line x1="252.0" y1="264.0" x2="396.0" y2="264.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="324.0" y="260.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
|
||||||
|
<line x1="252.0" y1="264.0" x2="252.0" y2="384.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="252.0" y="320.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
|
||||||
|
<line x1="396.0" y1="264.0" x2="540.0" y2="264.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="468.0" y="260.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
|
||||||
|
<line x1="396.0" y1="264.0" x2="396.0" y2="384.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="396.0" y="320.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
|
||||||
|
<line x1="540.0" y1="264.0" x2="540.0" y2="384.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="540.0" y="320.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
|
||||||
|
<line x1="108.0" y1="384.0" x2="252.0" y2="384.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="180.0" y="380.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
|
||||||
|
<line x1="108.0" y1="384.0" x2="108.0" y2="504.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="108.0" y="440.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
|
||||||
|
<line x1="252.0" y1="384.0" x2="396.0" y2="384.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="324.0" y="380.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
|
||||||
|
<line x1="252.0" y1="384.0" x2="252.0" y2="504.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="252.0" y="440.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
|
||||||
|
<line x1="396.0" y1="384.0" x2="540.0" y2="384.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="468.0" y="380.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
|
||||||
|
<line x1="396.0" y1="384.0" x2="396.0" y2="504.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="396.0" y="440.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
|
||||||
|
<line x1="540.0" y1="384.0" x2="540.0" y2="504.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="540.0" y="440.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
|
||||||
|
<line x1="108.0" y1="504.0" x2="252.0" y2="504.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="180.0" y="500.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
|
||||||
|
<line x1="252.0" y1="504.0" x2="396.0" y2="504.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="324.0" y="500.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
|
||||||
|
<line x1="396.0" y1="504.0" x2="540.0" y2="504.0" stroke="#3b82f6" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="468.0" y="500.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">1.0mm 512GB/s</text>
|
||||||
|
<polyline points="324.0,56.0 108.0,56.0 108.0,144.0" fill="none" stroke="#0ea5e9" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="216.0" y="96.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.5mm 512GB/s</text>
|
||||||
|
<polyline points="324.0,56.0 252.0,56.0 252.0,144.0" fill="none" stroke="#0ea5e9" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="288.0" y="96.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.5mm 512GB/s</text>
|
||||||
|
<polyline points="324.0,56.0 396.0,56.0 396.0,144.0" fill="none" stroke="#0ea5e9" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="360.0" y="96.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.5mm 512GB/s</text>
|
||||||
|
<polyline points="324.0,56.0 540.0,56.0 540.0,144.0" fill="none" stroke="#0ea5e9" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="432.0" y="96.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">3.5mm 512GB/s</text>
|
||||||
|
<rect x="84.0" y="128.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="108.0" y="148.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (0,0)</text>
|
||||||
|
<rect x="228.0" y="128.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="252.0" y="148.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (1,0)</text>
|
||||||
|
<rect x="372.0" y="128.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="396.0" y="148.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (2,0)</text>
|
||||||
|
<rect x="516.0" y="128.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="540.0" y="148.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (3,0)</text>
|
||||||
|
<rect x="84.0" y="248.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="108.0" y="268.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (0,1)</text>
|
||||||
|
<rect x="228.0" y="248.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="252.0" y="268.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (1,1)</text>
|
||||||
|
<rect x="372.0" y="248.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="396.0" y="268.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (2,1)</text>
|
||||||
|
<rect x="516.0" y="248.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="540.0" y="268.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (3,1)</text>
|
||||||
|
<rect x="84.0" y="368.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="108.0" y="388.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (0,2)</text>
|
||||||
|
<rect x="228.0" y="368.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="252.0" y="388.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (1,2)</text>
|
||||||
|
<rect x="372.0" y="368.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="396.0" y="388.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (2,2)</text>
|
||||||
|
<rect x="516.0" y="368.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="540.0" y="388.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (3,2)</text>
|
||||||
|
<rect x="84.0" y="488.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="108.0" y="508.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (0,3)</text>
|
||||||
|
<rect x="228.0" y="488.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="252.0" y="508.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (1,3)</text>
|
||||||
|
<rect x="372.0" y="488.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="396.0" y="508.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (2,3)</text>
|
||||||
|
<rect x="516.0" y="488.0" width="48.0" height="32.0" rx="4" fill="#cbd5e1" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="540.0" y="508.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#1e293b">CUBE (3,3)</text>
|
||||||
|
<rect x="308.0" y="50.0" width="32.0" height="12.0" rx="4" fill="#0ea5e9" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="324.0" y="60.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#ffffff">IO io0</text>
|
||||||
|
</svg>
|
||||||
|
After Width: | Height: | Size: 10 KiB |
@@ -0,0 +1,19 @@
|
|||||||
|
<svg xmlns="http://www.w3.org/2000/svg" width="768" height="396" viewBox="0 0 768 396">
|
||||||
|
<title>system</title>
|
||||||
|
<rect width="768" height="396" fill="#f8fafc"/>
|
||||||
|
<text x="384" y="18" text-anchor="middle" font-family="monospace" font-size="14" font-weight="bold" fill="#1e293b">SYSTEM VIEW</text>
|
||||||
|
<polyline points="384.0,60.0 182.0,60.0 182.0,120.0" fill="none" stroke="#6366f1" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="283.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">20.0mm 256GB/s</text>
|
||||||
|
<polyline points="384.0,60.0 586.0,60.0 586.0,120.0" fill="none" stroke="#6366f1" stroke-width="1" opacity="0.8"/>
|
||||||
|
<text x="485.0" y="86.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#64748b">20.0mm 256GB/s</text>
|
||||||
|
<rect x="374.0" y="57.0" width="20.0" height="6.0" rx="4" fill="#6366f1" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="384.0" y="64.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#ffffff">Fabric Switch</text>
|
||||||
|
<rect x="62.0" y="138.0" width="240.0" height="200.0" rx="4" fill="#e0e7ff" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="182.0" y="242.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">SIP 0</text>
|
||||||
|
<rect x="174.0" y="117.0" width="16.0" height="6.0" rx="4" fill="#0ea5e9" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="182.0" y="124.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#ffffff">IO io0</text>
|
||||||
|
<rect x="466.0" y="138.0" width="240.0" height="200.0" rx="4" fill="#e0e7ff" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="586.0" y="242.0" text-anchor="middle" font-family="monospace" font-size="10" fill="#1e293b">SIP 1</text>
|
||||||
|
<rect x="578.0" y="117.0" width="16.0" height="6.0" rx="4" fill="#0ea5e9" stroke="#475569" stroke-width="1"/>
|
||||||
|
<text x="586.0" y="124.0" text-anchor="middle" font-family="monospace" font-size="7" fill="#ffffff">IO io0</text>
|
||||||
|
</svg>
|
||||||
|
After Width: | Height: | Size: 1.9 KiB |
@@ -0,0 +1,381 @@
|
|||||||
|
# Latency Model
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
kernbench uses a discrete-event simulation (SimPy) to compute end-to-end latency.
|
||||||
|
Every request flows through a graph of **components** connected by **wires**.
|
||||||
|
The total latency reported is the **actual SimPy wall-clock** (`env.now` delta),
|
||||||
|
not a static formula—so contention and queueing are captured automatically.
|
||||||
|
|
||||||
|
```
|
||||||
|
total_ns (actual) = wire_prop + component_overhead + drain + queueing
|
||||||
|
├── deterministic ──────────────────┘ │
|
||||||
|
└── contention-dependent ────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
## Three Deterministic Cost Components
|
||||||
|
|
||||||
|
### 1. Wire Propagation
|
||||||
|
|
||||||
|
```
|
||||||
|
wire_ns = distance_mm × ns_per_mm (global: 0.01 = 10 ps/mm)
|
||||||
|
```
|
||||||
|
|
||||||
|
Every edge in the topology graph has a `distance_mm`. A SimPy wire process
|
||||||
|
delays each message by `wire_ns` before delivering it to the next component.
|
||||||
|
For on-chip silicon this is ~10 ps/mm; the same constant applies everywhere
|
||||||
|
since all links are on-die or interposer. Wire propagation is typically <1 ns
|
||||||
|
and negligible compared to other costs.
|
||||||
|
|
||||||
|
### 2. Component Overhead (`overhead_ns`)
|
||||||
|
|
||||||
|
```
|
||||||
|
component_ns = node.attrs["overhead_ns"]
|
||||||
|
```
|
||||||
|
|
||||||
|
Each component on the path adds a fixed processing delay via `yield env.timeout(overhead_ns)`.
|
||||||
|
This models arbitration, protocol processing, pipeline stages, etc.
|
||||||
|
|
||||||
|
| Component | overhead_ns | Meaning |
|
||||||
|
|-----------|-------------|---------|
|
||||||
|
| pcie_ep | 5.0 | PCIe protocol processing |
|
||||||
|
| io_cpu | 10.0 | Command decode / dispatch |
|
||||||
|
| m_cpu | 5.0 | DMA scheduling |
|
||||||
|
| fabric switch | 5.0 | Packet arbitration |
|
||||||
|
| xbar | 2.0 | Crossbar arbitration |
|
||||||
|
| xbar bridge | 1.0 | Bridge traversal between xbar halves |
|
||||||
|
| ucie | 1.0 | UCIe protocol overhead per port |
|
||||||
|
| noc (2D mesh) | 0.0 | Hop delay modeled internally via manhattan distance |
|
||||||
|
| hbm_ctrl | 0.0 | Access time captured in drain_ns |
|
||||||
|
| pe_cpu | 2.0 | Command dispatch |
|
||||||
|
| pe_scheduler | 1.0 | PE-internal scheduling |
|
||||||
|
| pe_gemm/math | 0.0 | Placeholder; will use flops-based model |
|
||||||
|
|
||||||
|
### 3. Drain (Serialization Delay)
|
||||||
|
|
||||||
|
```
|
||||||
|
drain_ns = nbytes / bottleneck_bw_gbs
|
||||||
|
```
|
||||||
|
|
||||||
|
**Wormhole (cut-through) model**: data flows through intermediate nodes as a
|
||||||
|
pipeline. Serialization cost is paid **once** at the terminal node, not at
|
||||||
|
every hop. The bottleneck is the minimum `bw_gbs` across all edges in the path.
|
||||||
|
|
||||||
|
Example: 4096 bytes through a path with bottleneck 128 GB/s → `4096 / 128 = 32.0 ns`.
|
||||||
|
|
||||||
|
### Formula (Theoretical Lower Bound)
|
||||||
|
|
||||||
|
```
|
||||||
|
formula_ns = Σ(wire_prop) + Σ(overhead_ns) + drain_ns
|
||||||
|
```
|
||||||
|
|
||||||
|
This is the latency with **zero contention**—no other request competing for
|
||||||
|
any resource. The engine provides `_formula_latency()` for verification.
|
||||||
|
With no contention: `actual == formula`. With contention: `actual > formula`.
|
||||||
|
|
||||||
|
### Diagram: PE DMA Read (pe0 → local slice0, 4096 bytes)
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
sequenceDiagram
|
||||||
|
participant D as pe_dma
|
||||||
|
participant X as xbar.pe0
|
||||||
|
participant H as hbm_ctrl.slice0
|
||||||
|
|
||||||
|
D->>X: txn (4096B)
|
||||||
|
Note over X: overhead 2.0 ns
|
||||||
|
X->>H: txn (wire 0.025 ns)
|
||||||
|
Note over H: acquire Resource
|
||||||
|
Note over H: overhead 0 ns
|
||||||
|
Note over H: drain 4096/256 = 16.0 ns
|
||||||
|
Note over H: release Resource
|
||||||
|
H-->>D: done.succeed()
|
||||||
|
|
||||||
|
    Note over D,H: total_ns = 18.025 ns<br/>formula = wire(0.025) + ovhd(2.0) + drain(16.0) = 18.025 ns<br/>actual == formula (no contention)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Diagram: Two Requests — No Contention vs HOL Blocking
|
||||||
|
|
||||||
|
#### Case 1: Different slices (parallel, no contention)
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
sequenceDiagram
|
||||||
|
participant A as Request A
|
||||||
|
participant S0 as hbm_ctrl.slice0<br/>Resource(cap=1)
|
||||||
|
participant S1 as hbm_ctrl.slice1<br/>Resource(cap=1)
|
||||||
|
|
||||||
|
Note over A,S1: t=2 ns — both requests arrive at their own slice
|
||||||
|
A->>S0: A (4KB)
|
||||||
|
A->>S1: B (4KB)
|
||||||
|
Note over S0: acquire (immediate)
|
||||||
|
Note over S1: acquire (immediate)
|
||||||
|
Note over S0: drain 16.0 ns
|
||||||
|
Note over S1: drain 16.0 ns
|
||||||
|
Note over S0: t=18 release
|
||||||
|
Note over S1: t=18 release
|
||||||
|
|
||||||
|
Note over A,S1: A actual = 18 ns, B actual = 18 ns<br/>No waiting — separate Resources
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Case 2: Same slice (HOL blocking)
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
sequenceDiagram
|
||||||
|
participant A as Request A (4KB)
|
||||||
|
participant Q as hbm_ctrl.slice0<br/>Resource(cap=1)
|
||||||
|
participant B as Request B (64B)
|
||||||
|
|
||||||
|
Note over A,B: t=0 — A arrives first
|
||||||
|
A->>Q: acquire (immediate)
|
||||||
|
Note over Q: drain A = 16.0 ns
|
||||||
|
|
||||||
|
Note over B,Q: t=5 — B arrives, yield req → BLOCKED
|
||||||
|
B--xQ: waiting...
|
||||||
|
|
||||||
|
Note over Q: t=16 — A drain done, release
|
||||||
|
Q->>B: B acquires resource
|
||||||
|
Note over Q: drain B = 0.25 ns
|
||||||
|
Note over Q: t=16.25 — B done, release
|
||||||
|
|
||||||
|
Note over A,B: A actual = 16.0 ns (== formula)<br/>B actual = 11.25 ns (formula 0.25 + queueing 11.0)<br/>HOL blocking: short request waits behind long drain
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## How SimPy Tracks Latency
|
||||||
|
|
||||||
|
### Measurement
|
||||||
|
|
||||||
|
```python
|
||||||
|
start_ns = env.now
|
||||||
|
yield txn_done # wait for the transaction to complete
|
||||||
|
total_ns = env.now - start_ns # ← this is what probe reports
|
||||||
|
```
|
||||||
|
|
||||||
|
`env.now` is SimPy's simulation clock. It only advances when a process `yield`s
|
||||||
|
a timeout or waits on a resource/store. The delta between start and done captures
|
||||||
|
**everything**: wire delays, component overheads, drain, and any queueing.
|
||||||
|
|
||||||
|
### Component Pipeline
|
||||||
|
|
||||||
|
Each component is a SimPy process:
|
||||||
|
|
||||||
|
```
|
||||||
|
_fan_in (per in_port) → _inbox (Store) → _worker → out_ports
|
||||||
|
```
|
||||||
|
|
||||||
|
1. **`_fan_in`**: relays messages from each `in_port` into a shared `_inbox` Store.
|
||||||
|
2. **`_worker`**: pulls from `_inbox`, spawns `_forward_txn` per message.
|
||||||
|
3. **`_forward_txn`**: calls `run()` (overhead), then puts to `out_ports[next_hop]`.
|
||||||
|
|
||||||
|
The worker uses `env.process()` (pipeline model), so multiple messages can be
|
||||||
|
in-flight through the same component concurrently. Contention happens when
|
||||||
|
they compete for shared resources (e.g., `simpy.Resource` in hbm_ctrl).
|
||||||
|
|
||||||
|
### Wire Process
|
||||||
|
|
||||||
|
```python
|
||||||
|
while True:
|
||||||
|
msg = yield out_port.get() # wait for sender
|
||||||
|
yield env.timeout(prop_ns) # propagation delay
|
||||||
|
yield in_port.put(msg) # deliver to receiver
|
||||||
|
```
|
||||||
|
|
||||||
|
Each directed edge has its own wire process. Messages are delayed by exactly
|
||||||
|
`distance_mm × ns_per_mm`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Contention and Queueing
|
||||||
|
|
||||||
|
Queueing delay is **not a separate formula term**—it emerges from SimPy's
|
||||||
|
event scheduling when multiple requests compete for the same resource.
|
||||||
|
|
||||||
|
### Where Contention Occurs
|
||||||
|
|
||||||
|
| Resource | SimPy Type | Capacity | Effect |
|
||||||
|
|----------|-----------|----------|--------|
|
||||||
|
| hbm_ctrl | `simpy.Resource` | 1 | Serializes HBM access |
|
||||||
|
| m_cpu DMA read engine | `simpy.Resource` | 1 | Serializes DMA reads |
|
||||||
|
| m_cpu DMA write engine | `simpy.Resource` | 1 | Serializes DMA writes |
|
||||||
|
| pe_dma channels | `simpy.Resource` | configurable | Serializes PE DMA ops |
|
||||||
|
| component inbox | `simpy.Store` | unbounded | No backpressure (FIFO) |
|
||||||
|
|
||||||
|
### How Queueing Works
|
||||||
|
|
||||||
|
```python
|
||||||
|
# hbm_ctrl._worker
|
||||||
|
with self._resource.request() as req:
|
||||||
|
yield req # ← BLOCKS if resource is occupied
|
||||||
|
yield from self.run(env, txn.nbytes)
|
||||||
|
yield env.timeout(drain_ns)
|
||||||
|
```
|
||||||
|
|
||||||
|
If request A holds the resource and request B arrives:
|
||||||
|
- B's `yield req` blocks until A releases the resource
|
||||||
|
- SimPy advances B's `env.now` by A's remaining service time
|
||||||
|
- This "extra" time shows up in B's `total_ns` automatically
|
||||||
|
|
||||||
|
```
|
||||||
|
No contention: actual_ns == formula_ns
|
||||||
|
Contention: actual_ns > formula_ns
|
||||||
|
queueing_delay = actual_ns - formula_ns
|
||||||
|
```
|
||||||
|
|
||||||
|
### Head-of-Line (HOL) Blocking at hbm_ctrl
|
||||||
|
|
||||||
|
The `simpy.Resource` is held for the **entire** `with` block—both overhead and
|
||||||
|
drain. The resource is NOT released between overhead and drain:
|
||||||
|
|
||||||
|
```python
|
||||||
|
with self._resource.request() as req:
|
||||||
|
yield req # acquire (or wait)
|
||||||
|
yield from self.run(env, txn.nbytes) # overhead_ns ─┐
|
||||||
|
yield env.timeout(drain_ns) # drain_ns │ resource held
|
||||||
|
# ← resource released here ───────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
This means a short request arriving during a long request's drain must wait
|
||||||
|
for the full remaining drain time—classic head-of-line blocking:
|
||||||
|
|
||||||
|
```
|
||||||
|
Request A: 4 KB, drain = 16.0 ns (arrives at t=0)
|
||||||
|
Request B: 64 B, drain = 0.25 ns (arrives at t=5)
|
||||||
|
|
||||||
|
Timeline:
|
||||||
|
t=0.00 A acquires resource
|
||||||
|
t=0.00 A: overhead (0 ns)
|
||||||
|
t=0.00 A: drain starts (16.0 ns)
|
||||||
|
t=5.00 B arrives → yield req → BLOCKED (A holds resource)
|
||||||
|
t=16.00 A: drain done → resource released
|
||||||
|
t=16.00 B acquires resource
|
||||||
|
t=16.00 B: overhead (0 ns)
|
||||||
|
t=16.25 B: drain done → resource released
|
||||||
|
|
||||||
|
B actual = 11.25 ns (waited 11.0 + own 0.25)
|
||||||
|
B formula = 0.25 ns
|
||||||
|
B queueing = 11.0 ns ← HOL blocking penalty
|
||||||
|
```
|
||||||
|
|
||||||
|
**Why this is physically realistic**: An HBM channel processes one burst at a
|
||||||
|
time. While data is being serialized onto the channel (drain), no other request
|
||||||
|
can use that channel. The FIFO ordering (`simpy.Resource` default) reflects
|
||||||
|
the simplest controller scheduling policy.
|
||||||
|
|
||||||
|
**Alternative: priority scheduling**: If needed, `simpy.PriorityResource` can
|
||||||
|
prioritize shorter requests (Shortest Job First), but this is not currently
|
||||||
|
used since FIFO matches typical HBM controller behavior.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Worked Example: Two Concurrent PE DMA Reads
|
||||||
|
|
||||||
|
Setup: PE0 and PE1 in cube0 both read 4096 bytes from their local HBM slices
|
||||||
|
(slice0 and slice1), submitted to the **same engine** at the same time.
|
||||||
|
|
||||||
|
### Paths
|
||||||
|
|
||||||
|
```
|
||||||
|
DMA A: pe0.pe_dma → xbar.pe0 → hbm_ctrl.slice0
|
||||||
|
DMA B: pe1.pe_dma → xbar.pe1 → hbm_ctrl.slice1
|
||||||
|
```
|
||||||
|
|
||||||
|
### No Contention (different HBM slices)
|
||||||
|
|
||||||
|
Since slice0 and slice1 are **separate** hbm_ctrl instances, each with its own
|
||||||
|
`simpy.Resource(capacity=1)`, there is no resource competition.
|
||||||
|
|
||||||
|
```
|
||||||
|
DMA A timeline:
|
||||||
|
t=0.00 pe_dma dequeues txn
|
||||||
|
t=0.00 xbar.pe0: overhead_ns=2.0 → t=2.00
|
||||||
|
t=2.00 wire prop (2.5mm × 0.01 = 0.025) → t=2.025
|
||||||
|
t=2.025 hbm_ctrl.slice0: yield req → immediate (no contention)
|
||||||
|
t=2.025 hbm_ctrl.slice0: overhead_ns=0 → t=2.025
|
||||||
|
t=2.025 drain_ns = 4096/256 = 16.0 → t=18.025
|
||||||
|
t=18.025 done
|
||||||
|
|
||||||
|
DMA B timeline: (identical, on its own slice)
|
||||||
|
t=0.00 → ... → t=18.025 done
|
||||||
|
```
|
||||||
|
|
||||||
|
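Both complete at 18.025 ns in this simplified timeline (the probe table below reports 18.09 ns for the same case, with a larger total Wire term of 0.08 ns). `actual == formula` for both.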
|
||||||
|
|
||||||
|
### With Contention (same HBM slice)
|
||||||
|
|
||||||
|
Now suppose both PE0 and PE1 read from **slice0**:
|
||||||
|
|
||||||
|
```
|
||||||
|
DMA A: pe0.pe_dma → xbar.pe0 → hbm_ctrl.slice0
|
||||||
|
DMA B: pe1.pe_dma → xbar.pe1 → xbar.pe0 → hbm_ctrl.slice0
|
||||||
|
(chain traversal to reach slice0)
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
DMA A timeline:
|
||||||
|
t=0.00 xbar.pe0(2.0) → wire → hbm_ctrl.slice0
|
||||||
|
t=2.025 yield req → immediate (first to arrive)
|
||||||
|
t=18.025 drain 16.0 → release resource → done
|
||||||
|
actual_A = 18.025 ns (== formula)
|
||||||
|
|
||||||
|
DMA B timeline:
|
||||||
|
t=0.00 xbar.pe1(2.0) → xbar.pe0(2.0) → wire → hbm_ctrl.slice0
|
||||||
|
t=4.035 yield req → BLOCKED (A holds resource until t=18.025)
|
||||||
|
t=18.025 acquire resource
|
||||||
|
t=50.025 drain 32.0 → release → done
|
||||||
|
actual_B = 50.025 ns
|
||||||
|
|
||||||
|
B's drain uses the bottleneck BW of B's own path (xbar_x_bw = 128 GB/s),
|
||||||
|
not A's 256 GB/s: drain = 4096/128 = 32.0 ns
|
||||||
|
|
||||||
|
formula_B = wire(0.035) + overhead(4.0) + drain(32.0) = 36.035 ns
|
||||||
|
actual_B = formula_B + queueing = 36.035 + 13.99 = 50.025 ns
|
||||||
|
queueing = 18.025 - 4.035 = 13.99 ns (time waiting for A to release hbm_ctrl)
|
||||||
|
```
|
||||||
|
|
||||||
|
The key insight: **queueing delay is not in the formula**. It only appears in
|
||||||
|
the actual SimPy simulation when resources are contested. The probe reports
|
||||||
|
`actual_ns`, which includes all queueing. To see pure queueing overhead,
|
||||||
|
compare `actual_ns` vs `formula_ns` (available in PE DMA traces).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Probe Output Explained
|
||||||
|
|
||||||
|
```
|
||||||
|
=== PE DMA Latency ===
|
||||||
|
Case Target Actual Ovhd Drain Wire Ovhd% Drain% Eff.BW BN.BW Util%
|
||||||
|
pe-local-hbm c0.pe0->c0.slice0 18.09 2.0 16.0 0.08 11.1% 88.5% 226.49 256.0 88.5%
|
||||||
|
pe-cross-half-hbm c0.pe0->c0.slice4 37.14 5.0 32.0 0.14 13.5% 86.1% 110.27 128.0 86.1%
|
||||||
|
```
|
||||||
|
|
||||||
|
| Column | Meaning |
|
||||||
|
|--------|---------|
|
||||||
|
| **Actual** | SimPy measured `env.now` delta (includes contention if any) |
|
||||||
|
| **Ovhd** | Sum of `overhead_ns` for all components on the forward path |
|
||||||
|
| **Drain** | `nbytes / bottleneck_bw` — serialization at terminal |
|
||||||
|
| **Wire** | Sum of `distance_mm × ns_per_mm` for all edges |
|
||||||
|
| **Ovhd%** | `Ovhd / Actual × 100` — fraction of time spent in component processing |
|
||||||
|
| **Drain%** | `Drain / Actual × 100` — fraction of time spent in data transfer |
|
||||||
|
| **Eff.BW** | `nbytes / Actual` — achieved bandwidth |
|
||||||
|
| **BN.BW** | Bottleneck bandwidth (min `bw_gbs` on path) |
|
||||||
|
| **Util%** | `Eff.BW / BN.BW × 100` — how close to theoretical max BW |
|
||||||
|
|
||||||
|
### Why Util% < 100%
|
||||||
|
|
||||||
|
`Util% = Drain% = drain_ns / actual_ns`. The gap from 100% is the overhead
|
||||||
|
fraction. For small transfers (4KB), overhead is significant relative to drain.
|
||||||
|
For large transfers, drain dominates and utilization approaches 100%.
|
||||||
|
|
||||||
|
```
|
||||||
|
4 KB: Ovhd=2.0, Drain=16.0 → Util=88.5% (overhead is 11% of time)
|
||||||
|
64 KB: Ovhd=2.0, Drain=256.0 → Util=99.2% (overhead is <1% of time)
|
||||||
|
```
|
||||||
|
|
||||||
|
### H2D Path: Why Ovhd% is ~40%
|
||||||
|
|
||||||
|
H2D traverses many components (pcie_ep → io_cpu → ucie → noc → m_cpu → noc →
|
||||||
|
xbar → hbm_ctrl + response path). Total forward overhead is ~23 ns vs drain
|
||||||
|
of 32 ns for 4KB, so overhead is comparable to data transfer time—resulting
|
||||||
|
in ~55% utilization. This is expected for small command-path transfers.
|
||||||
@@ -0,0 +1,30 @@
|
|||||||
|
[build-system]
|
||||||
|
requires = ["setuptools>=68", "wheel"]
|
||||||
|
build-backend = "setuptools.build_meta"
|
||||||
|
|
||||||
|
[project]
|
||||||
|
name = "kernbench"
|
||||||
|
version = "0.1.0"
|
||||||
|
requires-python = ">=3.10"
|
||||||
|
dependencies = ["pytest", "simpy", "pyyaml"]
|
||||||
|
|
||||||
|
[project.scripts]
|
||||||
|
kernbench = "kernbench.cli.main:main"
|
||||||
|
|
||||||
|
[project.optional-dependencies]
|
||||||
|
dev = [
|
||||||
|
"pytest>=7",
|
||||||
|
"ruff>=0.4.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
[tool.ruff]
|
||||||
|
line-length = 100
|
||||||
|
target-version = "py310"
|
||||||
|
fix = false
|
||||||
|
|
||||||
|
[tool.ruff.lint]
|
||||||
|
select = ["E", "F", "I", "B", "UP"]
|
||||||
|
ignore = ["F401"]
|
||||||
|
|
||||||
|
[tool.pytest.ini_options]
|
||||||
|
addopts = ["--disable-warnings"]
|
||||||
@@ -0,0 +1,393 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Generate SVG diagrams illustrating each placement strategy.
|
||||||
|
|
||||||
|
Example tensor: (M=1024, K=512) fp16 (itemsize=2), 8 PEs.
|
||||||
|
Tiled variants use tile_m=256, tile_k=128.
|
||||||
|
|
||||||
|
Output: docs/diagrams/placement_*.svg
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import math
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# ── Diagram parameters ──────────────────────────────────────────────
|
||||||
|
# Example tensor shape: M rows by K columns.
M = 1024
K = 512
ITEMSIZE = 2  # bytes per element (fp16, per the module docstring)
NUM_PE = 8
# Tile shape used by the tiled placement variants.
TILE_M = 256
TILE_K = 128

# Fill colour for each PE, indexed by PE number.
PE_COLORS = [
    "#3b82f6",  # PE0 blue
    "#10b981",  # PE1 emerald
    "#f59e0b",  # PE2 amber
    "#ef4444",  # PE3 red
    "#8b5cf6",  # PE4 violet
    "#ec4899",  # PE5 pink
    "#06b6d4",  # PE6 cyan
    "#f97316",  # PE7 orange
]
# Label colour drawn on top of the matching PE_COLORS entry.
PE_TEXT_COLORS = [
    "#fff",  # PE0
    "#fff",  # PE1
    "#000",  # PE2
    "#fff",  # PE3
    "#fff",  # PE4
    "#fff",  # PE5
    "#000",  # PE6
    "#fff",  # PE7
]

# Output directory: <parent of this script's directory>/docs/diagrams.
OUT_DIR = Path(__file__).parent.parent / "docs" / "diagrams"
|
||||||
|
|
||||||
|
# ── SVG helpers ─────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _svg_header(w: int, h: int, title: str) -> str:
|
||||||
|
return (
|
||||||
|
f'<svg xmlns="http://www.w3.org/2000/svg" width="{w}" height="{h}"'
|
||||||
|
f' viewBox="0 0 {w} {h}" font-family="monospace">\n'
|
||||||
|
f'<rect width="{w}" height="{h}" fill="#f8fafc" rx="6"/>\n'
|
||||||
|
f'<text x="{w // 2}" y="32" text-anchor="middle" font-size="16"'
|
||||||
|
f' font-weight="bold" fill="#1e293b">{title}</text>\n'
|
||||||
|
)
|
||||||
|
|
||||||
|
def _svg_footer() -> str:
|
||||||
|
return "</svg>\n"
|
||||||
|
|
||||||
|
def _rect(x: float, y: float, w: float, h: float, fill: str,
|
||||||
|
stroke: str = "#334155", sw: float = 1.0, opacity: float = 1.0) -> str:
|
||||||
|
return (
|
||||||
|
f'<rect x="{x:.1f}" y="{y:.1f}" width="{w:.1f}" height="{h:.1f}"'
|
||||||
|
f' fill="{fill}" stroke="{stroke}" stroke-width="{sw}"'
|
||||||
|
f' fill-opacity="{opacity}" rx="2"/>\n'
|
||||||
|
)
|
||||||
|
|
||||||
|
def _text(x: float, y: float, txt: str, size: int = 11,
|
||||||
|
anchor: str = "middle", fill: str = "#1e293b",
|
||||||
|
weight: str = "normal") -> str:
|
||||||
|
return (
|
||||||
|
f'<text x="{x:.1f}" y="{y:.1f}" text-anchor="{anchor}"'
|
||||||
|
f' font-size="{size}" fill="{fill}" font-weight="{weight}">{txt}</text>\n'
|
||||||
|
)
|
||||||
|
|
||||||
|
def _line(x1: float, y1: float, x2: float, y2: float,
|
||||||
|
stroke: str = "#94a3b8", sw: float = 1) -> str:
|
||||||
|
return (
|
||||||
|
f'<line x1="{x1:.1f}" y1="{y1:.1f}" x2="{x2:.1f}" y2="{y2:.1f}"'
|
||||||
|
f' stroke="{stroke}" stroke-width="{sw}"/>\n'
|
||||||
|
)
|
||||||
|
|
||||||
|
def _format_bytes(n: int) -> str:
|
||||||
|
if n >= (1 << 20):
|
||||||
|
return f"{n >> 20} MB"
|
||||||
|
if n >= (1 << 10):
|
||||||
|
return f"{n >> 10} KB"
|
||||||
|
return f"{n} B"
|
||||||
|
|
||||||
|
def _legend(x: float, y0: float, num_pe: int = NUM_PE) -> str:
    """Return SVG for a vertical PE colour legend.

    A bold "PE Legend" heading is drawn at ``y0``; below it, one row per PE
    (22 units apart) shows a 16x16 swatch in that PE's colour and its
    "PE<i>" label.
    """
    fragments = [_text(x + 50, y0, "PE Legend", size=12, weight="bold")]
    for pe in range(num_pe):
        baseline = y0 + 18 + pe * 22
        fragments.append(_rect(x, baseline - 12, 16, 16, PE_COLORS[pe]))
        fragments.append(_text(x + 22, baseline, f"PE{pe}", size=11, anchor="start"))
    return "".join(fragments)
|
||||||
|
|
||||||
|
def _axes(gx: float, gy: float, gw: float, gh: float,
          m_label: str = "M=1024", k_label: str = "K=512") -> str:
    """Return SVG labels for the two tensor axes of a grid.

    The K label is centred horizontally just above the grid; the M label
    is centred vertically to the left of the grid and rotated by -90
    degrees around its own anchor point.
    """
    k_axis = _text(gx + gw / 2, gy - 8, f"← {k_label} →", size=11, fill="#475569")

    # Anchor of the rotated M label; it also serves as the rotation centre.
    pivot_x = gx - 12
    pivot_y = gy + gh / 2
    m_axis = (
        f'<text x="{pivot_x:.1f}" y="{pivot_y:.1f}" text-anchor="middle"'
        f' font-size="11" fill="#475569"'
        f' transform="rotate(-90 {pivot_x:.1f} {pivot_y:.1f})">↑ {m_label} ↓</text>\n'
    )
    return k_axis + m_axis
|
||||||
|
|
||||||
|
def _info_box(x: float, y: float, lines: list[str]) -> str:
    """Return SVG for an info box containing one text row per entry.

    The box width is derived from the longest line (7 units per character
    plus 20 of padding) and the height from the number of lines (18 units
    per line plus 12 of padding).

    Args:
        x, y: Top-left corner of the box.
        lines: Text rows, drawn top to bottom.  May be empty, in which case
            only the minimal padded box is drawn.
    """
    # default=0 keeps an empty ``lines`` from raising ValueError in max().
    longest = max((len(line) for line in lines), default=0)
    box_w = longest * 7 + 20
    box_h = len(lines) * 18 + 12

    fragments = [_rect(x, y, box_w, box_h, "#e2e8f0", stroke="#94a3b8", sw=1)]
    fragments.extend(
        _text(x + 10, y + 18 + row * 18, line, size=10, anchor="start", fill="#334155")
        for row, line in enumerate(lines)
    )
    return "".join(fragments)
|
||||||
|
|
||||||
|
# ── Grid drawing ────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _draw_grid(
    gx: float, gy: float, gw: float, gh: float,
    cells: list[dict],  # [{row, col, rspan, cspan, pe, label?, offset?}]
    rows: int, cols: int,
    cell_labels: bool = True,
) -> str:
    """Return SVG for a grid of PE-coloured cells plus an outer border.

    The grid area (``gx``, ``gy``, ``gw``, ``gh``) is divided into ``rows`` x
    ``cols`` equal cells.  Each entry of ``cells`` is drawn at its
    ``row``/``col`` position, spanning ``rspan``/``cspan`` cells (default 1),
    filled with the colour of its ``pe`` and labelled "PE<pe>".  When
    ``cell_labels`` is true and the entry has a ``label``, that label is
    drawn underneath the PE name.
    """
    cell_w = gw / cols
    cell_h = gh / rows
    fragments = []
    for cell in cells:
        left = gx + cell["col"] * cell_w
        top = gy + cell["row"] * cell_h
        width = cell.get("cspan", 1) * cell_w
        height = cell.get("rspan", 1) * cell_h
        pe = cell["pe"]
        fragments.append(
            _rect(left, top, width, height, PE_COLORS[pe], stroke="#334155", sw=1.5)
        )

        # PE name slightly above the cell centre.
        mid_x = left + width / 2
        mid_y = top + height / 2
        fragments.append(
            _text(mid_x, mid_y - 4, f"PE{pe}", size=12,
                  fill=PE_TEXT_COLORS[pe], weight="bold")
        )
        if not cell_labels or "label" not in cell:
            continue
        fragments.append(
            _text(mid_x, mid_y + 12, cell["label"], size=9, fill=PE_TEXT_COLORS[pe])
        )

    # Unfilled outline drawn last so it sits on top of the cells.
    fragments.append(_rect(gx, gy, gw, gh, "none", stroke="#1e293b", sw=2))
    return "".join(fragments)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Strategy-specific generators ────────────────────────────────────
|
||||||
|
|
||||||
|
def gen_column_wise() -> str:
    """Render the SVG for column-wise placement (K split into NUM_PE parts).

    The grid is a single row of NUM_PE columns, one per PE.  Below each
    column the shard's byte offset and byte size are printed.
    """
    W, H = 820, 500
    gx, gy, gw, gh = 80, 90, 480, 320
    chunk_k = K // NUM_PE
    chunk_bytes = M * chunk_k * ITEMSIZE

    parts = [
        _svg_header(W, H, "Placement: column_wise"),
        _text(W // 2, 54, f"Tensor ({M}×{K}) fp16 → K axis split into {NUM_PE} parts",
              size=12, fill="#475569"),
        _axes(gx, gy, gw, gh),
    ]

    cells = [
        {
            "row": 0, "col": pe, "rspan": 1, "cspan": 1,
            "pe": pe,
            "label": f"({M}×{chunk_k})",
        }
        for pe in range(NUM_PE)
    ]
    parts.append(_draw_grid(gx, gy, gw, gh, cells, rows=1, cols=NUM_PE))

    # Offset and size captions centred under each column.
    col_w = gw / NUM_PE
    for pe in range(NUM_PE):
        centre_x = gx + pe * col_w + col_w / 2
        shard_offset = pe * chunk_bytes
        parts.append(_text(centre_x, gy + gh + 16, f"off={_format_bytes(shard_offset)}",
                           size=9, fill="#475569"))
        parts.append(_text(centre_x, gy + gh + 30, f"{_format_bytes(chunk_bytes)}",
                           size=9, fill="#64748b"))

    parts.append(_legend(620, 100))
    parts.append(_info_box(620, 320, [
        "Strategy: column_wise",
        "Split axis: K",
        f"Shards: {NUM_PE}",
        f"Each: ({M}, {chunk_k})",
        f"Each: {_format_bytes(chunk_bytes)}",
        f"Total: {_format_bytes(M * K * ITEMSIZE)}",
    ]))
    parts.append(_svg_footer())
    return "".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def gen_row_wise() -> str:
    """Render the SVG for row-wise placement (M split into NUM_PE parts).

    The grid is a single column of NUM_PE rows, one per PE.  To the right
    of each row the shard's byte offset and byte size are printed.
    """
    W, H = 820, 560
    gx, gy, gw, gh = 80, 90, 320, 400
    chunk_m = M // NUM_PE
    chunk_bytes = chunk_m * K * ITEMSIZE

    parts = [
        _svg_header(W, H, "Placement: row_wise"),
        _text(W // 2, 54, f"Tensor ({M}×{K}) fp16 → M axis split into {NUM_PE} parts",
              size=12, fill="#475569"),
        _axes(gx, gy, gw, gh),
    ]

    cells = [
        {
            "row": pe, "col": 0, "rspan": 1, "cspan": 1,
            "pe": pe,
            "label": f"({chunk_m}×{K})",
        }
        for pe in range(NUM_PE)
    ]
    parts.append(_draw_grid(gx, gy, gw, gh, cells, rows=NUM_PE, cols=1))

    # Offset and size captions to the right of each row.
    row_h = gh / NUM_PE
    caption_x = gx + gw + 10
    for pe in range(NUM_PE):
        centre_y = gy + pe * row_h + row_h / 2
        shard_offset = pe * chunk_bytes
        parts.append(_text(caption_x, centre_y - 4, f"off={_format_bytes(shard_offset)}",
                           size=9, anchor="start", fill="#475569"))
        parts.append(_text(caption_x, centre_y + 10, f"{_format_bytes(chunk_bytes)}",
                           size=9, anchor="start", fill="#64748b"))

    parts.append(_legend(580, 100))
    parts.append(_info_box(580, 320, [
        "Strategy: row_wise",
        "Split axis: M",
        f"Shards: {NUM_PE}",
        f"Each: ({chunk_m}, {K})",
        f"Each: {_format_bytes(chunk_bytes)}",
        f"Total: {_format_bytes(M * K * ITEMSIZE)}",
    ]))
    parts.append(_svg_footer())
    return "".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def gen_replicate() -> str:
    """Render the SVG for replicate placement (a full copy on every PE).

    The NUM_PE copies are drawn as separate boxes laid out in 2 rows of 4
    columns; each box shows the PE name, tensor shape, size and offset.
    """
    W, H = 820, 500
    full_bytes = M * K * ITEMSIZE

    # Box layout: 4 columns x 2 rows inside a 700 x 340 area, 16 units apart.
    cols, rows = 4, 2
    margin_x, margin_y = 60, 90
    gap = 16
    box_w = (700 - (cols - 1) * gap) / cols
    box_h = (340 - (rows - 1) * gap) / rows

    parts = [
        _svg_header(W, H, "Placement: replicate"),
        _text(W // 2, 54, f"Tensor ({M}×{K}) fp16 → full copy to each PE",
              size=12, fill="#475569"),
    ]
    for pe in range(NUM_PE):
        grid_row, grid_col = divmod(pe, cols)
        left = margin_x + grid_col * (box_w + gap)
        top = margin_y + grid_row * (box_h + gap)
        ink = PE_TEXT_COLORS[pe]
        parts.append(_rect(left, top, box_w, box_h, PE_COLORS[pe], stroke="#334155", sw=1.5))
        parts.append(_text(left + box_w / 2, top + box_h / 2 - 14, f"PE{pe}",
                           size=14, fill=ink, weight="bold"))
        parts.append(_text(left + box_w / 2, top + box_h / 2 + 6, f"({M}×{K})",
                           size=11, fill=ink))
        parts.append(_text(left + box_w / 2, top + box_h / 2 + 22,
                           f"{_format_bytes(full_bytes)}", size=10, fill=ink))
        parts.append(_text(left + box_w / 2, top + box_h / 2 + 36, "offset=0",
                           size=9, fill=ink))

    summary = (
        f"Strategy: replicate | Shards: {NUM_PE} | Each: {_format_bytes(full_bytes)}"
        f" | Total mem: {_format_bytes(full_bytes * NUM_PE)}"
    )
    parts.append(_info_box(60, 450, [summary]))
    parts.append(_svg_footer())
    return "".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def gen_tiled(column_major: bool) -> str:
    """Render the SVG for a 2D tiled placement.

    The tensor is cut into (TILE_M x TILE_K) tiles.  Tiles are numbered in
    visiting order and assigned to PEs round-robin (``idx % NUM_PE``).

    With ``column_major=True`` ("tiled_column_major") the K tile index varies
    fastest, i.e. tiles are visited one grid row after another.  Otherwise
    ("tiled_row_major") the M tile index varies fastest, i.e. tiles are
    visited one grid column after another.
    """
    if column_major:
        name, order = "tiled_column_major", "column-major (K first)"
    else:
        name, order = "tiled_row_major", "row-major (M first)"

    tiles_m = M // TILE_M
    tiles_k = K // TILE_K
    total_tiles = tiles_m * tiles_k
    tile_bytes = TILE_M * TILE_K * ITEMSIZE
    row_bytes = K * ITEMSIZE

    W, H = 820, 620
    gx, gy, gw, gh = 80, 90, 400, 400
    parts = [
        _svg_header(W, H, f"Placement: {name}"),
        _text(W // 2, 54,
              f"Tensor ({M}×{K}) fp16, tile=({TILE_M}×{TILE_K}) → "
              f"{tiles_m}×{tiles_k}={total_tiles} tiles, {order}",
              size=11, fill="#475569"),
        _axes(gx, gy, gw, gh),
    ]

    # Visiting order of (mi, ki) tile coordinates; see the docstring.
    if column_major:
        visit_order = [(mi, ki) for mi in range(tiles_m) for ki in range(tiles_k)]
    else:
        visit_order = [(mi, ki) for ki in range(tiles_k) for mi in range(tiles_m)]

    # "offset" is the byte offset of the tile's first element within the
    # tensor, using row_bytes as the stride between consecutive rows.
    cells = [
        {
            "row": mi, "col": ki, "rspan": 1, "cspan": 1,
            "pe": idx % NUM_PE,
            "label": f"t{idx}",
            "offset": (mi * TILE_M * row_bytes) + (ki * TILE_K * ITEMSIZE),
            "idx": idx,
        }
        for idx, (mi, ki) in enumerate(visit_order)
    ]
    parts.append(_draw_grid(gx, gy, gw, gh, cells, rows=tiles_m, cols=tiles_k))

    # K index range of each tile column, printed below the grid.
    col_w = gw / tiles_k
    for ki in range(tiles_k):
        centre_x = gx + ki * col_w + col_w / 2
        parts.append(_text(centre_x, gy + gh + 16,
                           f"k={ki * TILE_K}..{(ki + 1) * TILE_K - 1}",
                           size=9, fill="#475569"))

    # M index range of each tile row, printed left of the grid.
    row_h = gh / tiles_m
    for mi in range(tiles_m):
        centre_y = gy + mi * row_h + row_h / 2
        parts.append(_text(gx - 16, centre_y,
                           f"m={mi * TILE_M}..{(mi + 1) * TILE_M - 1}",
                           size=9, anchor="end", fill="#475569"))

    parts.append(_legend(540, 90))

    # Assignment table: one line per tile in assignment order, cut off once
    # a row would come within 20 units of the bottom edge.
    table_y = 310
    parts.append(_text(540, table_y, "Tile Assignment Order", size=12, weight="bold"))
    by_index = sorted(cells, key=lambda cell: cell["idx"])
    for line_no, cell in enumerate(by_index):
        text_y = table_y + 18 + line_no * 16
        if text_y > H - 20:
            break
        pe = cell["pe"]
        parts.append(_rect(540, text_y - 10, 12, 12, PE_COLORS[pe]))
        parts.append(_text(558, text_y,
                           f"t{cell['idx']:>2d} → PE{pe} ({cell['row']},{cell['col']})"
                           f" off={_format_bytes(cell['offset'])}",
                           size=9, anchor="start", fill="#334155"))

    summary = (
        f"Strategy: {name} | Tile: ({TILE_M}×{TILE_K})={_format_bytes(tile_bytes)}"
        f" | Tiles: {total_tiles} | Total: {_format_bytes(M * K * ITEMSIZE)}"
    )
    parts.append(_info_box(80, H - 60, [summary]))
    parts.append(_svg_footer())
    return "".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Main ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def main() -> None:
    """Render every placement diagram and write it as an SVG file in OUT_DIR.

    The output directory is created if needed.  All diagrams are rendered
    before the first file is written; one confirmation line is printed per
    file, followed by the total number of diagrams.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    # File name -> rendered SVG text, in the order the files are written.
    rendered = {
        "placement_column_wise.svg": gen_column_wise(),
        "placement_row_wise.svg": gen_row_wise(),
        "placement_replicate.svg": gen_replicate(),
        "placement_tiled_column_major.svg": gen_tiled(column_major=True),
        "placement_tiled_row_major.svg": gen_tiled(column_major=False),
    }

    for filename in rendered:
        target = OUT_DIR / filename
        target.write_text(rendered[filename], encoding="utf-8")
        print(f" wrote {target}")

    total = len(rendered)
    print(f"\nGenerated {total} placement diagrams.")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -0,0 +1,64 @@
|
|||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from benches.loader import resolve_bench
|
||||||
|
from kernbench.cli.probe import cmd_probe
|
||||||
|
from kernbench.cli.report import format_report
|
||||||
|
from kernbench.common.types import SimEngine
|
||||||
|
from kernbench.runtime_api.bench_runner import run_bench
|
||||||
|
from kernbench.runtime_api.types import DeviceSelector, resolve_device
|
||||||
|
from kernbench.sim_engine.engine import GraphEngine
|
||||||
|
from kernbench.topology.builder import resolve_topology
|
||||||
|
|
||||||
|
|
||||||
|
def build_parser() -> argparse.ArgumentParser:
    """Build the ``kernbench`` command-line parser.

    Two subcommands are registered, ``run`` and ``probe``; one of them is
    required.  Each subparser stores its handler callable in the parsed
    namespace under ``_handler`` so the caller can dispatch on it.

    Returns:
        The configured top-level parser.
    """
    parser = argparse.ArgumentParser(prog="kernbench")
    commands = parser.add_subparsers(dest="cmd", required=True)

    # -- kernbench run --
    run_cmd = commands.add_parser("run", help="Run a benchmark")
    run_cmd.add_argument("--topology", required=True)
    run_cmd.add_argument("--bench", required=True)
    run_cmd.add_argument(
        "--device",
        default=None,
        help="Target device: 'all' or 'sip:<N>' (default: all)",
    )
    run_cmd.set_defaults(_handler=cmd_run)

    # -- kernbench probe --
    probe_cmd = commands.add_parser(
        "probe",
        help="Probe latency and BW for predefined traffic patterns",
    )
    probe_cmd.add_argument("--topology", required=True)
    probe_cmd.add_argument(
        "--case",
        default="all",
        help="Case name or 'all' (default: all)",
    )
    probe_cmd.set_defaults(_handler=cmd_probe)

    return parser
|
||||||
|
|
||||||
|
|
||||||
|
def engine_factory(topology: object, device: DeviceSelector) -> SimEngine:
    """Create a GraphEngine for *topology*.

    Args:
        topology: Either a wrapper exposing a ``topology_obj`` attribute or
            the topology object itself.
        device: Accepted for the factory signature; not used here.

    Returns:
        A new ``GraphEngine`` built on the unwrapped topology object.
    """
    try:
        graph = topology.topology_obj
    except AttributeError:
        # No wrapper attribute: the argument already is the topology object.
        graph = topology
    return GraphEngine(graph)
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_run(args) -> int:
    """Handle ``kernbench run``: execute one benchmark and print its results.

    Reads ``args.topology``, ``args.bench`` and ``args.device``, runs the
    benchmark through ``run_bench`` and prints the formatted trace report
    (when traces were collected) and the result summary.

    Returns:
        ``0`` when the run's completion is ok, otherwise ``1``.
    """
    print("> Running benchmark with:", args)

    topology = resolve_topology(args.topology)
    bench_fn = resolve_bench(args.bench)
    target = resolve_device(args.device)

    outcome = run_bench(
        topology=topology,
        bench_fn=bench_fn,
        device=target,
        engine_factory=engine_factory,
    )

    # The spec (if any) lives on the unwrapped topology object.
    graph = getattr(topology, "topology_obj", topology)
    spec = getattr(graph, "spec", None)
    if outcome.traces:
        report = format_report(outcome.traces, title=args.bench, spec=spec)
        print(report)
    # NOTE(review): the summary is printed unconditionally here; confirm it
    # is not meant to be nested under the ``traces`` check above.
    print(outcome.summary_text())

    if outcome.completion.ok:
        return 0
    return 1
|
||||||
|
|
||||||
|
|
||||||
|
def main(argv=None) -> int:
    """Parse *argv* and dispatch to the selected subcommand handler.

    Args:
        argv: Argument list handed to ``parse_args``; ``None`` is passed
            through unchanged.

    Returns:
        The handler's return value converted to ``int``.
    """
    namespace = build_parser().parse_args(argv)
    handler = namespace._handler
    return int(handler(namespace))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(main())
|
||||||
@@ -0,0 +1,248 @@
|
|||||||
|
"""kernbench probe: latency and BW verification utility.
|
||||||
|
|
||||||
|
Runs predefined traffic patterns through the simulation engine and reports
|
||||||
|
latency, effective bandwidth, bottleneck bandwidth, and utilization for each
|
||||||
|
case. Validates monotonicity invariants across hop counts and access types.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from kernbench.policy.address.phyaddr import PhysAddr
|
||||||
|
from kernbench.policy.routing.router import AddressResolver, PathRouter
|
||||||
|
from kernbench.runtime_api.kernel import MemoryWriteMsg, PeDmaMsg
|
||||||
|
from kernbench.sim_engine.engine import GraphEngine
|
||||||
|
from kernbench.topology.builder import load_topology
|
||||||
|
from kernbench.topology.types import TopologyGraph
|
||||||
|
|
||||||
|
|
||||||
|
# -- Helpers ----------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _hbm_pa(sip: int, cube: int, pe_id: int, spec: dict) -> int:
    """Return the encoded physical address of a PE's HBM probe location.

    The slice size is derived from the ``cube.memory_map`` section of
    *spec*: ``hbm_total_gb_per_cube`` scaled by ``1 << 30`` and floor-divided
    by ``hbm_slices_per_cube``.  The address always uses rack 0 and a fixed
    PE-local HBM offset of ``0x1000``.

    Args:
        sip: SIP id of the target.
        cube: Cube id of the target.
        pe_id: PE id of the target.
        spec: Topology spec dict.

    Returns:
        The value of ``PhysAddr.pe_hbm_addr(...).encode()``.
    """
    memory_map = spec["cube"]["memory_map"]
    total_bytes = memory_map["hbm_total_gb_per_cube"] * (1 << 30)
    slice_size = total_bytes // memory_map["hbm_slices_per_cube"]
    address = PhysAddr.pe_hbm_addr(
        rack_id=0,
        sip_id=sip,
        cube_id=cube,
        pe_id=pe_id,
        pe_local_hbm_offset=0x1000,
        slice_size_bytes=slice_size,
    )
    return address.encode()
|
||||||
|
|
||||||
|
|
||||||
|
def _build_edge_map(graph: TopologyGraph) -> dict[tuple[str, str], object]:
    """Index the graph's edges by their ``(src, dst)`` node-id pair.

    When several edges share the same pair, the last one in ``graph.edges``
    is the one kept.
    """
    by_endpoints: dict[tuple[str, str], object] = {}
    for edge in graph.edges:
        by_endpoints[(edge.src, edge.dst)] = edge
    return by_endpoints
|
||||||
|
|
||||||
|
|
||||||
|
def _formula_breakdown(
    path: list[str], nbytes: int, edge_map: dict, graph: TopologyGraph,
) -> tuple[float, float, float, float]:
    """Return ``(wire_ns, overhead_ns, drain_ns, formula_ns)`` for a path.

    * ``wire_ns``     -- sum of ``distance_mm * ns_per_mm`` over the path's
      edges found in *edge_map*; ``ns_per_mm`` comes from
      ``graph.spec["system"]`` and defaults to ``0.01``.
    * ``overhead_ns`` -- sum of the ``overhead_ns`` attribute of every path
      node found in ``graph.nodes`` (missing attribute counts as 0).
    * ``drain_ns``    -- ``nbytes`` divided by the smallest non-zero edge
      bandwidth on the path, or ``0.0`` when no edge has a bandwidth.
    * ``formula_ns``  -- the sum of the three values above.
    """
    ns_per_mm = graph.spec.get("system", {}).get("ns_per_mm", 0.01)

    # One pass over consecutive node pairs collects wire delay and bandwidths.
    wire_ns = 0.0
    bandwidths = []
    for hop in zip(path, path[1:]):
        edge = edge_map.get(hop)
        if not edge:
            continue
        wire_ns += edge.distance_mm * ns_per_mm
        if edge.bw_gbs:
            bandwidths.append(edge.bw_gbs)

    overhead_ns = 0.0
    for node_id in path:
        node = graph.nodes.get(node_id)
        if node:
            overhead_ns += float(node.attrs.get("overhead_ns", 0.0))

    if bandwidths:
        drain_ns = nbytes / min(bandwidths)
    else:
        drain_ns = 0.0

    formula_ns = wire_ns + overhead_ns + drain_ns
    return wire_ns, overhead_ns, drain_ns, formula_ns
|
||||||
|
|
||||||
|
|
||||||
|
def _bottleneck_bw(path: list[str], edge_map: dict) -> float | None:
    """Return the smallest edge bandwidth along *path*, or ``None``.

    Per-request bottleneck: a single request uses one connection.  Hops
    with no entry in *edge_map* and edges whose ``bw_gbs`` is falsy are
    ignored; ``None`` is returned when nothing remains.
    """
    candidates = [
        edge.bw_gbs
        for hop in zip(path, path[1:])
        if (edge := edge_map.get(hop)) and edge.bw_gbs
    ]
    return min(candidates, default=None)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def _fmt_bw(bw: float | None) -> str:
    """Format a bandwidth with one decimal place; ``None`` becomes ``"-"``."""
    if bw is None:
        return "-"
    return f"{bw:.1f}"
|
||||||
|
|
||||||
|
|
||||||
|
def _fmt_util(eff: float, bn: float | None) -> str:
    """Format ``eff / bn`` as a percentage with one decimal place.

    Returns ``"-"`` when the bottleneck bandwidth *bn* is ``None`` or not
    positive.
    """
    if bn is None or bn <= 0:
        return "-"
    percent = eff / bn * 100
    return f"{percent:.1f}%"
|
||||||
|
|
||||||
|
|
||||||
|
def _short_name(node_id: str) -> str:
    """Shorten a dotted node id to its last two segments.

    Two segments are kept to avoid ambiguity (``xbar.pe0`` vs ``pe0``); an
    id with fewer than two segments is returned unchanged.
    """
    # Splitting from the right at most twice leaves the last two segments
    # as the final two list items.
    tail = node_id.rsplit(".", 2)[-2:]
    return ".".join(tail)
|
||||||
|
|
||||||
|
|
||||||
|
def _short_path(path: list[str]) -> str:
    """Render a node path as shortened node names joined by ``" -> "``."""
    short_names = map(_short_name, path)
    return " -> ".join(short_names)
|
||||||
|
|
||||||
|
|
||||||
|
# -- Probe runner -----------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def run_probe(topology_path: str, case_filter: str | None = None) -> int:
    """Run the predefined probe cases on a topology and print result tables.

    Two sections are printed: H2D write latency over a varying hop count,
    and PE DMA latency for direct PE-level injection.  Every case sends a
    4096-byte request through a fresh ``GraphEngine``.

    Args:
        topology_path: Path of the topology file; ``~`` is expanded and the
            path is resolved before loading.
        case_filter: Name of a single case to run, or ``None`` / ``"all"``
            to run every case.  A name that matches no case leaves both
            tables without rows.

    Returns:
        Always ``0``; a failed monotonicity check is only reported in the
        printed output.
    """
    graph = load_topology(Path(topology_path).expanduser().resolve())
    edge_map = _build_edge_map(graph)
    resolver = AddressResolver(graph)
    router = PathRouter(graph)

    nbytes = 4096
    # ``None`` means "run everything"; otherwise only the named case runs.
    wanted = None if case_filter is None or case_filter == "all" else case_filter

    _probe_h2d(graph, edge_map, resolver, router, nbytes, wanted)
    _probe_pe_dma(graph, edge_map, resolver, router, nbytes, wanted)

    print()
    return 0


# Column header shared by both probe tables (breakdown + bandwidth columns).
_BREAKDOWN_HEADER = (
    f" {'Ovhd':>6} {'Drain':>6} {'Wire':>5} {'Ovhd%':>6} {'Drain%':>7}"
    f" {'Eff.BW':>8} {'BN.BW':>8} {'Util%':>6}"
)


def _timed_submit(engine: GraphEngine, msg: object, nbytes: int) -> tuple[float, float]:
    """Submit *msg*, wait for it, and return ``(total_ns, effective_bw)``."""
    handle = engine.submit(msg)
    engine.wait(handle)
    _, trace = engine.get_completion(handle)
    total_ns = trace["total_ns"]
    eff_bw = nbytes / total_ns if total_ns > 0 else 0.0
    return total_ns, eff_bw


def _breakdown_columns(
    path: list[str],
    nbytes: int,
    total_ns: float,
    eff_bw: float,
    edge_map: dict,
    graph: TopologyGraph,
) -> tuple[float | None, str]:
    """Return ``(bottleneck_bw, row_text)`` for the shared table columns."""
    bn_bw = _bottleneck_bw(path, edge_map)
    wire, ovhd, drain, _ = _formula_breakdown(path, nbytes, edge_map, graph)
    # Percentages are relative to the measured latency, not the formula sum.
    ovhd_pct = ovhd / total_ns * 100 if total_ns > 0 else 0
    drain_pct = drain / total_ns * 100 if total_ns > 0 else 0
    text = (
        f" {ovhd:>6.1f} {drain:>6.1f} {wire:>5.2f} {ovhd_pct:>5.1f}% {drain_pct:>5.1f}%"
        f" {eff_bw:>8.2f} {_fmt_bw(bn_bw):>8} {_fmt_util(eff_bw, bn_bw):>6}"
    )
    return bn_bw, text


def _probe_h2d(
    graph: TopologyGraph,
    edge_map: dict,
    resolver: AddressResolver,
    router: PathRouter,
    nbytes: int,
    wanted: str | None,
) -> None:
    """Print the H2D write table, its monotonicity check and route details."""
    # (name, target cube, hop count shown in the table)
    h2d_cases = [
        ("h2d-1hop", 0, 1),
        ("h2d-2hop", 4, 2),
        ("h2d-3hop", 8, 3),
        ("h2d-4hop", 12, 4),
    ]
    spec = graph.spec
    latencies: list[float] = []
    routes: list[tuple[str, list[str], list[str], list[str]]] = []

    print()
    print("=== H2D Write Latency (IO->HBM, varying hop count) ===")
    print(f" {'Case':<14} {'Target':<16} {'Hops':>4} {'Actual':>8}" + _BREAKDOWN_HEADER)
    print(" " + "-" * 115)

    for name, cube, hops in h2d_cases:
        if wanted is not None and wanted != name:
            continue
        engine = GraphEngine(graph)
        pa = _hbm_pa(sip=0, cube=cube, pe_id=0, spec=spec)
        msg = MemoryWriteMsg(
            correlation_id="probe", request_id=name,
            dst_sip=0, dst_cube=cube, dst_pe=0,
            dst_pa=pa, nbytes=nbytes, pattern="zero",
        )
        total_ns, eff_bw = _timed_submit(engine, msg, nbytes)

        dst_node = resolver.resolve(PhysAddr.decode(pa))
        pcie_ep = resolver.find_pcie_ep(0)
        io_cpu = resolver.find_io_cpu(0)
        m_cpu = resolver.find_m_cpu(0, cube)
        leg1 = router.find_node_path(pcie_ep, io_cpu)
        leg2 = router.find_node_path(io_cpu, m_cpu)
        leg3 = router.find_mcpu_dma_path(m_cpu, dst_node)

        # Forward path breakdown only (response path is implicit in total_ns).
        # Consecutive legs share their junction node, hence the [1:] slices.
        fwd_path = leg1 + leg2[1:] + leg3[1:]
        _, columns = _breakdown_columns(fwd_path, nbytes, total_ns, eff_bw, edge_map, graph)

        latencies.append(total_ns)
        routes.append((name, leg1, leg2, leg3))
        print(f" {name:<14} cube{cube}.pe0{'':<8} {hops:>4} {total_ns:>8.2f}" + columns)

    if len(latencies) >= 2:
        mono = all(a < b for a, b in zip(latencies, latencies[1:]))
        sym = "[v]" if mono else "[x]"
        print(f" {sym} Monotonic increase: {'PASS' if mono else 'FAIL'}")

    if routes:
        print()
        print(" Route Details:")
        print(f" {'Case':<14} {'Leg':>4} Path")
        print(" " + "-" * 80)
        for name, leg1, leg2, leg3 in routes:
            print(f" {name:<14} {'L1':>4} {_short_path(leg1)}")
            print(f" {'':<14} {'L2':>4} {_short_path(leg2)}")
            print(f" {'':<14} {'L3':>4} {_short_path(leg3)}")


def _probe_pe_dma(
    graph: TopologyGraph,
    edge_map: dict,
    resolver: AddressResolver,
    router: PathRouter,
    nbytes: int,
    wanted: str | None,
) -> None:
    """Print the PE DMA table, the bottleneck note and route details."""
    # === PE DMA → HBM (direct PE-level injection) ===
    # (name, sip, src_cube, src_pe, dst_cube, dst_pe)
    pe_cases = [
        ("pe-local-hbm", 0, 0, 0, 0, 0),  # pe0 → slice0 (local, 256 GB/s)
        ("pe-same-half-hbm", 0, 0, 0, 0, 1),  # pe0 → slice1 (xbar chain, 128 GB/s)
        ("pe-cross-half-hbm", 0, 0, 0, 0, 4),  # pe0 → slice4 (xbar chain, 128 GB/s)
        ("pe-cross-cube-hbm", 0, 0, 0, 1, 0),  # cube0.pe0 → cube1.slice0 (NOC, 128 GB/s)
    ]
    spec = graph.spec
    bottlenecks: list[tuple[str, float | None]] = []
    routes: list[tuple[str, list[str]]] = []

    print()
    print("=== PE DMA Latency (pe_dma -> xbar -> HBM, direct injection) ===")
    print(f" {'Case':<22} {'Target':<28} {'Actual':>8}" + _BREAKDOWN_HEADER)
    print(" " + "-" * 120)

    for name, sip, src_cube, src_pe, dst_cube, dst_pe in pe_cases:
        if wanted is not None and wanted != name:
            continue
        engine = GraphEngine(graph)
        dst_pa = _hbm_pa(sip=sip, cube=dst_cube, pe_id=dst_pe, spec=spec)
        msg = PeDmaMsg(
            correlation_id="probe", request_id=name,
            src_sip=sip, src_cube=src_cube, src_pe=src_pe,
            dst_pa=dst_pa, nbytes=nbytes,
        )
        total_ns, eff_bw = _timed_submit(engine, msg, nbytes)

        pe_ref = f"sip{sip}.cube{src_cube}.pe{src_pe}"
        dst_node = resolver.resolve(PhysAddr.decode(dst_pa))
        dma_path = router.find_path(pe_ref, dst_node)
        bn_bw, columns = _breakdown_columns(dma_path, nbytes, total_ns, eff_bw, edge_map, graph)

        target_str = f"c{src_cube}.pe{src_pe}->c{dst_cube}.slice{dst_pe}"
        bottlenecks.append((name, bn_bw))
        routes.append((name, dma_path))
        print(f" {name:<22} {target_str:<28} {total_ns:>8.2f}" + columns)

    if len(bottlenecks) >= 2:
        # Compare the first "local" case against the first non-local one.
        local = [bn for case, bn in bottlenecks if "local" in case]
        chain = [bn for case, bn in bottlenecks if "local" not in case]
        if local and chain:
            print(f" * Local BN: {_fmt_bw(local[0])} GB/s, "
                  f"Chain/NOC BN: {_fmt_bw(chain[0])} GB/s")

    if routes:
        print()
        print(" Route Details:")
        print(f" {'Case':<22} Path")
        print(" " + "-" * 80)
        for name, dma_path in routes:
            print(f" {name:<22} {_short_path(dma_path)}")
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_probe(args) -> int:
    """Handle ``kernbench probe``: forward the parsed arguments to run_probe.

    ``args.case`` is optional; ``"all"`` is used when the attribute is absent.
    """
    topology = args.topology
    case_name = getattr(args, "case", "all")
    return run_probe(topology, case_name)
|
||||||
@@ -0,0 +1,175 @@
|
|||||||
|
"""Performance report formatter for bench results."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
|
||||||
|
# Bit width for each accepted dtype spelling.
_DTYPE_BITS: dict[str, int] = {
    # 16-bit floating point
    "f16": 16,
    "fp16": 16,
    "float16": 16,
    "bf16": 16,
    # 32-bit floating point
    "f32": 32,
    "fp32": 32,
    "float32": 32,
    # integers
    "i8": 8,
    "int8": 8,
    "i16": 16,
    "int16": 16,
    "i32": 32,
    "int32": 32,
}
|
||||||
|
|
||||||
|
|
||||||
|
def format_report(
    traces: list[dict],
    title: str = "Benchmark",
    spec: dict | None = None,
) -> str:
    """Format collected traces into a human-readable performance report.

    The report has a title rule followed by up to three tables: a command
    table for every trace whose ``phase`` is not ``"kernel"``, a kernel
    table for the ``"kernel"`` traces, and a per-PE summary that appears
    only when deploy traces cover more than one PE.

    Args:
        traces: Trace dicts; missing keys fall back to defaults.
        title: Text placed in the title rule.
        spec: Topology spec dict for peak TFLOPS / BW extraction.

    Returns:
        The report lines joined with newlines.
    """
    peak_tflops_f16, peak_hbm_bw_gbs = _extract_peaks(spec)
    num_pes = _count_pes(spec)

    deploy_entries = [t for t in traces if t.get("phase") not in ("kernel",)]
    kernel_entries = [t for t in traces if t.get("phase") == "kernel"]

    # ── Title ──
    # The command-table header also fixes the width of the title rule.
    cmd_hdr = (f"{'Cmd':<10} {'Name':<12} {'SIP':>4} {'Cube':>5} {'PE':>4} {'Bytes':>10} "
               f"{'Lat(ns)':>10} {'Xfer(ns)':>10} {'Proc(ns)':>10} "
               f"{'BW(GB/s)':>10} {'MinBW':>10} {'Util%':>7}")
    heading = f"-- {title} Performance Report "
    out: list[str] = [heading + "-" * max(0, len(cmd_hdr) - len(heading))]

    # ── Command summary ──
    if deploy_entries:
        out += ["", cmd_hdr, "-" * len(cmd_hdr)]
        for entry in deploy_entries:
            lat = entry.get("total_ns", 0.0)
            nb = entry.get("nbytes", 0)
            sip = entry.get("sip", "-")
            pe = entry.get("pe", "-")
            cube = entry.get("cube", "-")
            cmd = entry.get("phase", "deploy")
            xfer_ns = entry.get("xfer_ns", 0.0)
            has_xfer = xfer_ns > 0
            # Without a transfer time the derived columns are reported as 0.
            proc_ns = lat - xfer_ns if has_xfer else 0.0
            bw = nb / lat if lat > 0 else 0.0
            min_bw = nb / xfer_ns if has_xfer else 0.0
            util = (xfer_ns / lat * 100) if lat > 0 and has_xfer else 0.0
            row = (
                f"{cmd:<10} {entry.get('name', '?'):<12} {str(sip):>4} {str(cube):>5} {str(pe):>4} {nb:>10} "
                f"{lat:>10.1f} {xfer_ns:>10.1f} {proc_ns:>10.1f} "
                f"{bw:>10.1f} {min_bw:>10.1f} {util:>6.1f}%"
            )
            out.append(row)

    # ── Kernel summary ──
    if kernel_entries:
        k_hdr = (f"{'Phase':<10} {'Name':<12} {'PE':>4} {'E2E(ns)':>10} "
                 f"{'PE(ns)':>10} {'DMA(ns)':>10} {'Comp(ns)':>10} "
                 f"{'Bound':<8} {'TFLOPS':>8} {'Peak':>8} {'Util%':>7}")
        out += ["", k_hdr, "-" * len(k_hdr)]
        for entry in kernel_entries:
            e2e_ns = entry.get("total_ns", 0.0)
            pe_ns = entry.get("pe_exec_ns", e2e_ns)
            dma_ns = entry.get("dma_ns", 0.0)
            compute_ns = entry.get("compute_ns", 0.0)
            target_pe = entry.get("target_pe", "-")
            scalars = entry.get("scalars", [])
            pe_str = "all" if target_pe == "all" else str(target_pe)
            n_active = num_pes if target_pe == "all" else 1

            # Bound indicator based on measured DMA vs compute time
            if dma_ns > 0 or compute_ns > 0:
                bound = "memory" if dma_ns >= compute_ns else "compute"
            else:
                bound = "-"

            achieved = _calc_tflops(scalars, pe_ns)
            peak_total = peak_tflops_f16 * n_active
            util = (achieved / peak_total * 100) if peak_total > 0 else 0.0
            row = (
                f"{'kernel':<10} {entry.get('name', '?'):<12} {pe_str:>4} {e2e_ns:>10.1f} "
                f"{pe_ns:>10.1f} {dma_ns:>10.1f} {compute_ns:>10.1f} "
                f"{bound:<8} {achieved:>8.3f} {peak_total:>8.1f} {util:>6.1f}%"
            )
            out.append(row)

    # ── Per-PE summary ──
    pe_deploy = _per_pe_deploy(deploy_entries)
    if len(pe_deploy) > 1:
        pe_title = (f"-- Per-PE Summary (peak: {peak_tflops_f16:.1f} TFLOPS/PE, "
                    f"{peak_hbm_bw_gbs:.0f} GB/s HBM BW) ")
        pe_hdr = (f"{'PE':>4} {'Deploy(ns)':>10} {'BW(GB/s)':>10} {'BW Util':>8} "
                  f"{'Kernel(ns)':>10} {'TFLOPS':>8} {'Util':>7}")
        pe_width = max(len(pe_title), len(pe_hdr))
        out += [
            "",
            pe_title + "-" * max(0, pe_width - len(pe_title)),
            pe_hdr,
            "-" * pe_width,
        ]

        # Kernel figures are identical on every row: total kernel time and
        # the first kernel's scalars, split evenly over the deployed PEs.
        k_ns = sum(e.get("pe_exec_ns", e.get("total_ns", 0.0)) for e in kernel_entries)
        k_scalars = kernel_entries[0].get("scalars", []) if kernel_entries else []
        n_active = len(pe_deploy)
        total_achieved = _calc_tflops(k_scalars, k_ns)
        per_pe_tflops = total_achieved / n_active if n_active > 0 else 0.0
        pe_util = (per_pe_tflops / peak_tflops_f16 * 100) if peak_tflops_f16 > 0 else 0.0

        for pe_id in sorted(pe_deploy):
            d_ns, d_bytes = pe_deploy[pe_id]
            d_bw = d_bytes / d_ns if d_ns > 0 else 0.0
            d_util = (d_bw / peak_hbm_bw_gbs * 100) if peak_hbm_bw_gbs > 0 else 0.0
            out.append(
                f"{pe_id:>4} {d_ns:>10.1f} {d_bw:>10.1f} {d_util:>7.1f}% "
                f"{k_ns:>10.1f} {per_pe_tflops:>8.3f} {pe_util:>6.1f}%"
            )
        # NOTE(review): the trailing blank line is emitted as part of the
        # per-PE section; confirm it is not meant to be unconditional.
        out.append("")

    return "\n".join(out)
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_peaks(spec: dict | None) -> tuple[float, float]:
    """Extract peak TFLOPS (f16) and HBM BW (GB/s) from spec.

    The TFLOPS value is read from
    ``cube.pe_template.components.pe_gemm.attrs.peak_tflops_f16`` and the
    bandwidth from ``cube.links.xbar_to_hbm_bw_gbs``.  Missing keys, or a
    ``None`` spec, yield ``0.0``.
    """
    if spec is None:
        return 0.0, 0.0

    cube = spec.get("cube", {})

    gemm_attrs = (
        cube.get("pe_template", {})
        .get("components", {})
        .get("pe_gemm", {})
        .get("attrs", {})
    )
    peak_tflops = float(gemm_attrs.get("peak_tflops_f16", 0.0))

    hbm_bw = float(cube.get("links", {}).get("xbar_to_hbm_bw_gbs", 0.0))
    return peak_tflops, hbm_bw
|
||||||
|
|
||||||
|
|
||||||
|
def _count_pes(spec: dict | None) -> int:
    """Return the PE count derived from ``cube.pe_layout`` in *spec*.

    The count is ``pe_per_corner`` (default 2) times the number of entries
    in ``corners`` (default NW, NE, SW, SE).  A ``None`` spec yields 8.
    """
    if spec is None:
        return 8
    layout = spec.get("cube", {}).get("pe_layout", {})
    per_corner = layout.get("pe_per_corner", 2)
    corner_names = layout.get("corners", ["NW", "NE", "SW", "SE"])
    return per_corner * len(corner_names)
|
||||||
|
|
||||||
|
|
||||||
|
def _calc_tflops(scalars: list, latency_ns: float) -> float:
    """Calculate achieved TFLOPS from scalar args [M, K, N] and latency.

    The operation count is ``2 * M * K * N``, taken from the first three
    scalars.  Returns ``0.0`` when fewer than three scalars are given or
    the latency is not positive.
    """
    enough_dims = len(scalars) >= 3
    if not enough_dims or latency_ns <= 0:
        return 0.0
    m, k, n = scalars[:3]
    flops = 2.0 * m * k * n
    seconds = latency_ns * 1e-9
    return flops / seconds / 1e12
|
||||||
|
|
||||||
|
|
||||||
|
def _per_pe_deploy(deploy_entries: list[dict]) -> dict[int, tuple[float, int]]:
    """Aggregate deploy latency and bytes per PE.

    Entries are grouped by their ``pe`` key (default 0).  The result maps
    each PE to ``(summed total_ns, summed nbytes)``, with PEs in order of
    first appearance.
    """
    totals: dict[int, tuple[float, int]] = {}
    for entry in deploy_entries:
        pe = entry.get("pe", 0)
        lat = entry.get("total_ns", 0.0)
        nb = entry.get("nbytes", 0)
        seen = totals.get(pe)
        if seen is None:
            # First entry for this PE is stored as-is.
            totals[pe] = (lat, nb)
        else:
            totals[pe] = (seen[0] + lat, seen[1] + nb)
    return totals
|
||||||
@@ -0,0 +1,150 @@
|
|||||||
|
"""PE-internal command types and handles (ADR-0014).
|
||||||
|
|
||||||
|
Generated by triton_emu (TLContext) and consumed by PE component
|
||||||
|
implementations (PE_CPU, PE_SCHEDULER, PE_DMA, PE_GEMM, PE_MATH).
|
||||||
|
|
||||||
|
Command lifecycle:
|
||||||
|
Triton kernel → TLContext → [PeCommand list] → PE_CPU → PE_SCHEDULER → engines
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import TYPE_CHECKING, Any, Literal
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
|
||||||
|
# ── Handles ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class TensorHandle:
    """Immutable, opaque reference to a tensor held in PE_TCM.

    Handles are what tl.load, tl.dot, tl.exp, etc. return.  They carry the
    metadata needed for command generation; ``data`` is reserved for a
    future validate mode (numpy array).
    """

    # Identifier of the tensor.
    id: str
    # Physical address in HBM/TCM.
    pa: int
    # Tensor dimensions.
    shape: tuple[int, ...]
    # Element type name.
    dtype: str
    # Total byte size.
    nbytes: int
    # Reserved for validate mode.
    data: object = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class CompletionHandle:
    """Immutable, opaque handle for a non-blocking composite command.

    Produced by tl.composite and consumed by tl.wait.
    """

    # Identifier of the composite command.
    id: str
|
||||||
|
|
||||||
|
|
||||||
|
# ── PE Commands ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class DmaReadCmd:
    """DMA READ command: HBM → PE_TCM."""

    # Handle of the tensor this read belongs to.
    handle: TensorHandle
    # Source physical address.
    src_pa: int
    # Number of bytes to transfer.
    nbytes: int
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class DmaWriteCmd:
    """DMA WRITE command: PE_TCM → HBM."""

    # Handle of the tensor this write belongs to.
    handle: TensorHandle
    # Destination physical address.
    dst_pa: int
    # Number of bytes to transfer.
    nbytes: int
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class GemmCmd:
    """GEMM engine command: matrix multiply on TCM data.

    Computes ``out = a @ b`` with all operands in TCM.
    """

    # Left operand.
    a: TensorHandle
    # Right operand.
    b: TensorHandle
    # Result tensor.
    out: TensorHandle
    # Problem dimensions.
    m: int
    k: int
    n: int
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class MathCmd:
    """MATH engine command: unary, binary or reduction op on TCM data.

    Values of ``op``:
        "exp", "log", "sqrt", "abs", "sigmoid", "cos", "sin",
        "add", "sub", "mul", "div", "where",
        "sum", "max", "min"
    """

    # Name of the math operation.
    op: str
    # Operand tensors.
    inputs: tuple[TensorHandle, ...]
    # Result tensor.
    out: TensorHandle
    # Axis for reductions.
    axis: int | None = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class CompositeCmd:
    """Composite command: a tiled DMA_READ + COMPUTE + DMA_WRITE pipeline.

    The command is non-blocking: it is submitted to PE_SCHEDULER, which
    manages tile splitting and pipeline overlaps (ADR-0014 D3.2).
    """

    # Handle used to wait for this composite.
    completion: CompletionHandle
    # Which compute kind the pipeline runs.
    op: Literal["gemm", "math"]
    # First operand.
    a: TensorHandle
    # Second operand; may be None.
    b: TensorHandle | None
    # Physical address of the output.
    out_pa: int
    # Byte size of the output.
    out_nbytes: int
    # For op="math": which math operation.
    math_op: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class WaitCmd:
    """Wait for one specific composite, or for all pending composites."""

    # Composite to wait for; None = wait all.
    handle: CompletionHandle | None = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class PeCpuOverheadCmd:
    """Scalar execution overhead on PE_CPU, expressed in cycles."""

    # Overhead in cycles.
    cycles: int
|
||||||
|
|
||||||
|
|
||||||
|
# Union of every PE command type.
PeCommand = (
    DmaReadCmd
    | DmaWriteCmd
    | GemmCmd
    | MathCmd
    | CompositeCmd
    | WaitCmd
    | PeCpuOverheadCmd
)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class PeInternalTxn:
    """PE-internal message flowing PE_CPU → PE_SCHEDULER → engines.

    Each instance carries a single PeCommand and a completion event.
    During the replay phase PE_CPU creates one PeInternalTxn per command
    and sends it to PE_SCHEDULER, which routes it to the appropriate engine
    (PE_DMA, PE_GEMM, PE_MATH).  The engine signals ``done`` on completion.
    """

    # The command to execute.
    command: PeCommand
    # Succeeded when the engine completes this command.
    done: simpy.Event
    # e.g. "sip0.cube0.pe0" — needed by PE_DMA for path resolution.
    pe_prefix: str = ""
    # Per-transaction result dict; every instance gets its own.
    result_data: dict[str, Any] = field(default_factory=dict)
|
||||||
@@ -0,0 +1,29 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any, NewType, Protocol, TypeAlias
|
||||||
|
|
||||||
|
RequestHandle = NewType("RequestHandle", str)
|
||||||
|
|
||||||
|
Trace: TypeAlias = Any
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class Completion:
    """Immutable completion status of a request."""

    # Whether the request completed successfully.
    ok: bool
    # Optional error code.
    error_code: str | None = None
    # Optional error message.
    error_message: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class SimEngine(Protocol):
    """Contract for a backend simulation/runner engine.

    An engine must be able to:
    - accept requests created by RuntimeContext (submit/dispatch)
    - report completion and an optional trace for a given handle
    """

    def submit(self, request: Any) -> RequestHandle:
        """Accept *request* and return the handle that identifies it."""
        ...

    def wait(self, handle: RequestHandle) -> None:
        """Wait on the request identified by *handle*."""
        ...

    def get_completion(self, handle: RequestHandle) -> tuple[Completion, Trace | None]:
        """Return the completion and optional trace for *handle*."""
        ...
|
||||||
@@ -0,0 +1,4 @@
|
|||||||
|
from kernbench.components.base import ComponentBase, ComponentRegistry
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
|
||||||
|
__all__ = ["ComponentBase", "ComponentRegistry", "ComponentContext"]
|
||||||
@@ -0,0 +1,167 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from collections.abc import Generator
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
|
||||||
|
class ComponentBase(ABC):
|
||||||
|
"""Base class for all SimPy component implementations (ADR-0007 D3, ADR-0015).
|
||||||
|
|
||||||
|
Each component corresponds to one node in the compiled topology graph.
|
||||||
|
It models the processing overhead at that node as a SimPy generator,
|
||||||
|
allowing future implementations to add queueing and contention.
|
||||||
|
|
||||||
|
Port model (ADR-0015 D1):
|
||||||
|
in_ports[src_node_id] — SimPy Store for incoming messages from src
|
||||||
|
out_ports[dst_node_id] — SimPy Store for outgoing messages to dst
|
||||||
|
Ports are wired by GraphEngine at initialization; wire processes model
|
||||||
|
propagation delay between connected ports (ADR-0015 D2).
|
||||||
|
|
||||||
|
Context (ADR-0015 D4):
|
||||||
|
ctx — ComponentContext with router and resolver.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
|
||||||
|
self.node = node
|
||||||
|
self.ctx = ctx
|
||||||
|
self.in_ports: dict[str, simpy.Store] = {}
|
||||||
|
self.out_ports: dict[str, simpy.Store] = {}
|
||||||
|
|
||||||
|
def start(self, env: simpy.Environment) -> None:
|
||||||
|
"""Called once after all ports are wired.
|
||||||
|
|
||||||
|
Default: starts a fan-in collector and a generic forwarding worker.
|
||||||
|
The worker calls self.run() for per-component latency, then routes the
|
||||||
|
Transaction to the next hop or signals done (duck-typed; no direct
|
||||||
|
Transaction import to avoid circular dependencies).
|
||||||
|
|
||||||
|
Override in components that need custom fan-out / aggregation logic
|
||||||
|
(e.g. MCpuComponent, IoCpuComponent for kernel launch).
|
||||||
|
"""
|
||||||
|
if not self.in_ports:
|
||||||
|
return
|
||||||
|
self._inbox: simpy.Store = simpy.Store(env)
|
||||||
|
for port in self.in_ports.values():
|
||||||
|
env.process(self._fan_in(port))
|
||||||
|
env.process(self._worker(env))
|
||||||
|
|
||||||
|
def _fan_in(self, port: simpy.Store) -> Generator:
|
||||||
|
"""Relay messages from one in_port into the shared inbox."""
|
||||||
|
while True:
|
||||||
|
msg = yield port.get()
|
||||||
|
yield self._inbox.put(msg)
|
||||||
|
|
||||||
|
def _worker(self, env: simpy.Environment) -> Generator:
|
||||||
|
"""Generic forwarding worker: spawns _forward_txn per message (pipeline)."""
|
||||||
|
while True:
|
||||||
|
txn: Any = yield self._inbox.get()
|
||||||
|
env.process(self._forward_txn(env, txn))
|
||||||
|
|
||||||
|
def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
    """SimPy process: charge this node's latency, then pass *txn* along.

    After ``run()`` completes, a transaction with a next hop is advanced and
    put on the matching out_port. A transaction with no next hop is at its
    terminal: an optional positive ``drain_ns`` is waited out and then
    ``txn.done`` is succeeded.
    """
    yield from self.run(env, txn.nbytes)

    target = txn.next_hop  # duck-typed: Transaction.next_hop
    if not target:
        # Terminal hop: drain_ns is optional on the transaction object.
        tail_ns = getattr(txn, "drain_ns", 0.0)
        if tail_ns > 0:
            yield env.timeout(tail_ns)
        txn.done.succeed()
        return

    yield self.out_ports[target].put(txn.advance())
|
||||||
|
|
||||||
|
@abstractmethod
def run(self, env: simpy.Environment, nbytes: int) -> Generator:
    """Model this node's processing cost as a SimPy generator.

    Implementations yield one or more events, typically
    ``env.timeout(overhead_ns)`` or a latency computed from *nbytes*.
    ``_forward_txn`` drives it with ``yield from``; subclass handlers may
    call it as well.
    """
    ...
|
||||||
|
|
||||||
|
|
||||||
|
class PeEngineBase(ComponentBase):
    """Common base for the engines inside a PE (PE_DMA, PE_GEMM, PE_MATH).

    What it adds on top of ComponentBase:
      * ``_pe_prefix`` - the node id with its last dotted segment removed
        (e.g. "sip0.cube0.pe0").
      * A ``_worker`` that understands two message kinds: PeInternalTxn goes
        to ``handle_command()``, anything else goes to the inherited
        ``_forward_txn()``.
      * ``init_resources(env)`` - a hook invoked by ``start()`` before the
        worker is spawned.

    Subclasses must implement ``handle_command(env, pe_txn)`` and
    ``run(env, nbytes)``, and may override ``init_resources(env)`` to create
    DMA channels or similar resources.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        # Keep everything left of the final "."; an id without a dot is kept whole.
        prefix, *_ = node.id.rsplit(".", 1)
        self._pe_prefix: str = prefix

    def start(self, env: simpy.Environment) -> None:
        """Run the resource hook first, then the standard start-up."""
        self.init_resources(env)
        super().start(env)

    def init_resources(self, env: simpy.Environment) -> None:
        """Subclass hook for resource creation; a no-op by default."""

    def _worker(self, env: simpy.Environment) -> Generator:
        """SimPy process: route each inbox message to the matching handler."""
        # Imported here rather than at module level, as in the original code.
        from kernbench.common.pe_commands import PeInternalTxn

        while True:
            message: Any = yield self._inbox.get()
            handler = (
                self.handle_command
                if isinstance(message, PeInternalTxn)
                else self._forward_txn
            )
            env.process(handler(env, message))

    @abstractmethod
    def handle_command(self, env: simpy.Environment, pe_txn: Any) -> Generator:
        """Handle one PE-internal command (a PeInternalTxn).

        Implementations do the engine-specific work (resource acquisition,
        compute, ...) and must call ``pe_txn.done.succeed()`` when finished.
        """
        ...
|
||||||
|
|
||||||
|
|
||||||
|
class ComponentRegistry:
    """Dependency-injection registry from ``node.impl`` to component class.

    ``create(node, overrides, ctx)`` looks the impl string up in this order:
      1. ``overrides`` supplied by the caller,
      2. the global ``_registry`` filled through ``register()``,
      3. otherwise a ValueError - there is deliberately no fallback class.
    """

    _registry: dict[str, type[ComponentBase]] = {}

    @classmethod
    def register(cls, impl: str, component_cls: type[ComponentBase]) -> None:
        """Bind *impl* to *component_cls*, replacing any earlier binding."""
        cls._registry[impl] = component_cls

    @classmethod
    def create(
        cls,
        node: Node,
        overrides: dict[str, type[ComponentBase]] | None = None,
        ctx: ComponentContext | None = None,
    ) -> ComponentBase:
        """Instantiate the component class selected by ``node.impl``.

        Raises:
            ValueError: if neither *overrides* nor the registry knows the impl.
        """
        wanted = node.impl
        # Walk the lookup tables in precedence order; skip absent/empty ones.
        for table in (overrides, cls._registry):
            if table and wanted in table:
                factory = table[wanted]
                return factory(node, ctx)
        raise ValueError(
            f"No component registered for impl '{node.impl}' (node: {node.id}). "
            f"Register it in kernbench.components.impls.__init__."
        )
|
||||||
@@ -0,0 +1,52 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
from kernbench.policy.routing.router import AddressResolver, PathRouter
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ComponentContext:
    """Bundle of topology services handed to every component.

    Components that route or resolve addresses (IoCpuComponent,
    MCpuComponent, ...) depend on it; TransitComponent does not use it.
    It is supplied through ``ComponentRegistry.create(node, overrides, ctx=ctx)``.
    """

    router: PathRouter
    resolver: AddressResolver
    positions: dict[str, tuple[float, float] | None]  # node_id -> pos_mm
    ns_per_mm: float  # wire propagation constant (from topology spec)
    edge_map: dict[tuple[str, str], Any] = field(default_factory=dict)
    spec: dict = field(default_factory=dict)  # topology spec (cube layout, PE count, etc.)

    def get_shared_resource(
        self, env: simpy.Environment, key: str, capacity: int = 1,
    ) -> simpy.Resource:
        """Fetch the SimPy Resource registered under *key*, creating it if new.

        Lets several engines of one PE contend for a single resource (e.g.
        an accel_slot shared by PE_GEMM and PE_MATH). Keys are expected to be
        PE-scoped, such as "sip0.cube0.pe0.accel_slot". *capacity* is only
        applied when the resource is first created.
        """
        try:
            pool = self._shared_resources
        except AttributeError:
            # The cache is not a dataclass field; it is attached on first use.
            pool = {}
            self._shared_resources: dict[str, simpy.Resource] = pool
        if key not in pool:
            pool[key] = simpy.Resource(env, capacity=capacity)
        return pool[key]

    def compute_drain_ns(self, path: list[str], nbytes: int) -> float:
        """Return *nbytes* divided by the smallest ``bw_gbs`` found on *path*.

        Consecutive node pairs with no edge_map entry, or whose edge has a
        missing/falsy ``bw_gbs``, are ignored. When no usable bandwidth is
        found the result is 0.0.
        """
        bottleneck = float("inf")
        for hop in zip(path, path[1:]):
            link = self.edge_map.get(hop)
            if not link:
                continue
            bandwidth = getattr(link, "bw_gbs", None)
            if bandwidth:
                bottleneck = min(bottleneck, bandwidth)
        return 0.0 if bottleneck == float("inf") else nbytes / bottleneck
|
||||||
@@ -0,0 +1,54 @@
|
|||||||
|
"""Concrete component implementations.
|
||||||
|
|
||||||
|
Each module registers its component(s) with ComponentRegistry on import.
|
||||||
|
Import this package to activate all built-in implementations.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from kernbench.components.base import ComponentRegistry
|
||||||
|
from kernbench.components.impls.forwarding import TransitComponent
|
||||||
|
from kernbench.components.impls.hbm_ctrl import HbmCtrlComponent
|
||||||
|
from kernbench.components.impls.io_cpu import IoCpuComponent
|
||||||
|
from kernbench.components.impls.m_cpu import MCpuComponent
|
||||||
|
from kernbench.components.impls.noc import TwoDMeshNocComponent
|
||||||
|
from kernbench.components.impls.pcie_ep import PcieEpComponent
|
||||||
|
from kernbench.components.impls.pe_cpu import PeCpuComponent
|
||||||
|
from kernbench.components.impls.pe_dma import PeDmaComponent
|
||||||
|
from kernbench.components.impls.pe_gemm import PeGemmComponent
|
||||||
|
from kernbench.components.impls.pe_math import PeMathComponent
|
||||||
|
from kernbench.components.impls.pe_scheduler import PeSchedulerComponent
|
||||||
|
from kernbench.components.impls.pe_tcm import PeTcmComponent
|
||||||
|
from kernbench.components.impls.sram import SramComponent
|
||||||
|
|
||||||
|
# node.impl string -> component class. Several pass-through fabric impls
# share TransitComponent. Registration happens in the order listed here.
_BUILTIN_IMPLS = {
    "forwarding_v1": TransitComponent,
    "switch_v1": TransitComponent,
    "noc_v1": TransitComponent,
    "noc_2d_mesh_v1": TwoDMeshNocComponent,
    "ucie_v1": TransitComponent,
    "xbar_v1": TransitComponent,
    "pcie_ep_v1": PcieEpComponent,
    "io_cpu_v1": IoCpuComponent,
    "m_cpu_v1": MCpuComponent,
    "hbm_ctrl_v1": HbmCtrlComponent,
    "sram_v1": SramComponent,
    "pe_cpu_v1": PeCpuComponent,
    "pe_scheduler_v1": PeSchedulerComponent,
    "pe_dma_v1": PeDmaComponent,
    "pe_gemm_v1": PeGemmComponent,
    "pe_math_v1": PeMathComponent,
    "pe_tcm_v1": PeTcmComponent,
}
for _impl, _component_cls in _BUILTIN_IMPLS.items():
    ComponentRegistry.register(_impl, _component_cls)
del _impl, _component_cls

# Public names re-exported by this package.
__all__ = [
    "HbmCtrlComponent",
    "IoCpuComponent",
    "MCpuComponent",
    "PcieEpComponent",
    "PeCpuComponent",
    "PeDmaComponent",
    "PeGemmComponent",
    "PeMathComponent",
    "PeSchedulerComponent",
    "PeTcmComponent",
    "TransitComponent",
    "TwoDMeshNocComponent",
    "SramComponent",
]
|
||||||
@@ -0,0 +1,27 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Generator
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
from kernbench.components.base import ComponentBase
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
|
||||||
|
class TransitComponent(ComponentBase):
    """Pass-through component used for NOC, UCIe and XBAR style nodes.

    The only cost it models is the node's ``overhead_ns`` attribute; the
    actual forwarding to the next hop is the inherited ``_forward_txn()``.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Wait ``overhead_ns`` (0.0 when the attribute is absent); *nbytes* is unused."""
        delay_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(delay_ns)
|
||||||
@@ -0,0 +1,101 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Generator
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
from kernbench.components.base import ComponentBase
|
||||||
|
from kernbench.sim_engine.transaction import Transaction
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
|
||||||
|
class HbmCtrlComponent(ComponentBase):
    """HBM controller: terminal node that models HBM access latency.

    Reads and writes use two independent SimPy Resources, so one read and
    one write can be in flight together while requests of the same kind
    queue behind each other. Both resources take their capacity from the
    node's ``capacity`` attribute (1 when absent).

    When an access finishes, a ResponseMsg is sent back along the reversed
    path so the response latency is modeled through the fabric as well.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        # Created in start(), once the SimPy environment is known.
        self._read: simpy.Resource | None = None
        self._write: simpy.Resource | None = None

    def start(self, env: simpy.Environment) -> None:
        """Create the read/write channels, then do the standard start-up."""
        slots = int(self.node.attrs.get("capacity", 1))
        self._read = simpy.Resource(env, capacity=slots)
        self._write = simpy.Resource(env, capacity=slots)
        super().start(env)

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Wait the node's ``overhead_ns`` (0.0 when absent); *nbytes* is unused."""
        delay_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(delay_ns)

    def _select_channel(self, txn: Any) -> simpy.Resource:
        """Pick the write channel for write requests, the read channel otherwise.

        A request counts as a write when it is a MemoryWriteMsg, or a
        PeDmaMsg whose ``is_write`` is truthy.
        """
        from kernbench.runtime_api.kernel import MemoryWriteMsg, PeDmaMsg

        assert self._read is not None and self._write is not None
        request = txn.request
        wants_write = isinstance(request, MemoryWriteMsg) or (
            isinstance(request, PeDmaMsg) and request.is_write
        )
        return self._write if wants_write else self._read

    def _worker(self, env: simpy.Environment) -> Generator:
        """SimPy process: hand every inbox txn to its own ``_handle_txn`` process."""
        while True:
            incoming: Any = yield self._inbox.get()
            env.process(self._handle_txn(env, incoming))

    def _handle_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """SimPy process: hold the channel for run() plus drain, then respond."""
        lane = self._select_channel(txn)
        with lane.request() as grant:
            yield grant
            yield from self.run(env, txn.nbytes)
            # drain_ns is optional on the transaction; only positive values wait.
            tail_ns = getattr(txn, "drain_ns", 0.0)
            if tail_ns > 0:
                yield env.timeout(tail_ns)
        # The channel is released before the response is sent.
        yield from self._send_response(env, txn)

    def _send_response(self, env: simpy.Environment, txn: Any) -> Generator:
        """Answer *txn*: either complete it directly or send a ResponseMsg back.

        PeDmaMsg is a direct probe with no IO_CPU/M_CPU aggregation on its
        path, so ``txn.done`` is succeeded immediately. The same happens when
        the reversed path has fewer than two nodes or no context is set.
        Otherwise a response Transaction is put on the out_port towards the
        first hop of the reversed path.
        """
        from kernbench.runtime_api.kernel import PeDmaMsg

        if isinstance(txn.request, PeDmaMsg):
            txn.done.succeed()
            return

        way_back = list(reversed(txn.path))
        if len(way_back) < 2 or not self.ctx:
            txn.done.succeed()
            return

        from kernbench.runtime_api.kernel import ResponseMsg

        # The node id is parsed positionally: segment 1 carries the cube
        # number and segment 3 the slice number (e.g. "...cube0...slice3").
        segments = self.node.id.split(".")
        cube_no = int(segments[1].replace("cube", ""))
        slice_no = int(segments[3].replace("slice", ""))
        reply = ResponseMsg(
            correlation_id=txn.request.correlation_id,
            request_id=txn.request.request_id,
            src_cube=cube_no, src_pe=slice_no, success=True,
        )
        reply_txn = Transaction(
            request=reply, path=way_back, step=0,
            nbytes=0, done=env.event(), is_response=True,
        )
        yield self.out_ports[way_back[1]].put(reply_txn.advance())
|
||||||
@@ -0,0 +1,145 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Generator
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
from kernbench.components.base import ComponentBase
|
||||||
|
from kernbench.sim_engine.transaction import Transaction
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
|
||||||
|
class IoCpuComponent(ComponentBase):
    """IO_CPU component: multi-cube fan-out with response aggregation.

    Forward path:
      1. Applies the node's overhead_ns processing overhead.
      2. Resolves the target cube(s) from the request.
      3. Sends one sub-Transaction to each routable target cube's M_CPU.

    Response path:
      Response transactions are counted per request_id. When as many
      responses have arrived as sub-Transactions were sent, the parent
      transaction's ``done`` event is succeeded.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        # Pending fan-out tracking: request_id -> (expected, received, parent_txn_done)
        self._pending: dict[str, tuple[int, int, simpy.Event]] = {}

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Wait the node's ``overhead_ns`` (0.0 when absent); *nbytes* is unused."""
        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(overhead_ns)

    def _worker(self, env: simpy.Environment) -> Generator:
        """SimPy process: count responses, dispatch forward transactions.

        The overhead for a forward transaction is charged inside this loop,
        so forward transactions pay it one after another; the fan-out itself
        runs in a separate process.
        """
        while True:
            txn: Any = yield self._inbox.get()
            if getattr(txn, "is_response", False):
                self._collect_response(txn)
            else:
                yield from self.run(env, txn.nbytes)
                env.process(self._dispatch_to_m_cpus(env, txn))

    def _collect_response(self, resp_txn: Any) -> None:
        """Count one cube response; complete the parent when all have arrived.

        Responses whose request_id is not being tracked are ignored.
        """
        key = resp_txn.request.request_id
        if key not in self._pending:
            return
        expected, received, parent_done = self._pending[key]
        received += 1
        if received >= expected:
            parent_done.succeed()
            del self._pending[key]
        else:
            self._pending[key] = (expected, received, parent_done)

    def _dispatch_to_m_cpus(self, env: simpy.Environment, txn: Any) -> Generator:
        """SimPy process: send a sub-Transaction to each target cube's M_CPU.

        This method does not wait for the responses; completion of ``txn``
        is driven by ``_collect_response``. If the targets cannot be
        resolved, or none of them is routable, ``txn.done`` is succeeded
        right away.
        """
        request = txn.request
        try:
            cube_targets = self._resolve_cube_targets(request)
        except Exception:
            # Best-effort, as before: an unresolvable request completes
            # immediately instead of crashing the simulation process.
            txn.done.succeed()
            return

        routes = self._resolve_m_cpu_paths(cube_targets)
        if not routes:
            txn.done.succeed()
            return

        # Expect exactly as many responses as sub-Transactions are sent.
        # Counting the unresolved targets instead would leave txn.done
        # waiting forever for responses that can never arrive.
        # Registered before the first put so an early response is counted.
        self._pending[request.request_id] = (len(routes), 0, txn.done)

        for path in routes:
            sub_txn = Transaction(
                request=request, path=path, step=0,
                nbytes=txn.nbytes, done=env.event(),
                result_data=txn.result_data,
            )
            yield self.out_ports[path[1]].put(sub_txn.advance())

    def _resolve_m_cpu_paths(self, cube_targets: list[tuple[int, int]]) -> list[list[str]]:
        """Return the node path to each target M_CPU, dropping unroutable ones.

        A target is dropped when the M_CPU lookup or the path search raises,
        or when the path has fewer than two nodes (no next hop to send to).
        """
        routes: list[list[str]] = []
        for sip, cube in cube_targets:
            try:
                m_cpu_id = self.ctx.resolver.find_m_cpu(sip, cube)
                path = self.ctx.router.find_node_path(self.node.id, m_cpu_id)
            except Exception:
                # Exception types of the resolver/router are not visible
                # here, so the original broad best-effort skip is kept.
                continue
            if len(path) >= 2:
                routes.append(path)
        return routes

    def _resolve_cube_targets(self, request: Any) -> list[tuple[int, int]]:
        """Return the (sip, cube) pairs this request must be fanned out to.

        ``target_cubes`` defaults to "all" when the request lacks it.

        * MemoryWriteMsg / MemoryReadMsg: with "all", the single cube decoded
          from the destination/source PA (falling back to the request's own
          cube field); otherwise every listed cube on the request's SIP.
        * KernelLaunchMsg: with an explicit list, those cubes on this
          IO_CPU's SIP; with "all", the distinct cubes of the tensor
          argument shards that live on this SIP, in first-seen order.
        * Any other request type: no targets.
        """
        from kernbench.runtime_api.kernel import KernelLaunchMsg, MemoryReadMsg, MemoryWriteMsg

        target_cubes = getattr(request, "target_cubes", "all")

        if isinstance(request, MemoryWriteMsg):
            sip = request.dst_sip
            if target_cubes == "all":
                cube = self._cube_from_pa(request.dst_pa, fallback=request.dst_cube)
                return [(sip, cube)]
            return [(sip, c) for c in target_cubes]

        if isinstance(request, MemoryReadMsg):
            sip = request.src_sip
            if target_cubes == "all":
                cube = self._cube_from_pa(request.src_pa, fallback=request.src_cube)
                return [(sip, cube)]
            return [(sip, c) for c in target_cubes]

        if isinstance(request, KernelLaunchMsg):
            my_sip = self._my_sip()
            if target_cubes != "all":
                return [(my_sip, c) for c in target_cubes]
            # "all": derive from tensor shards, filtered to this SIP.
            # dict.fromkeys de-duplicates while keeping first-seen order.
            local_shards = (
                (shard.sip, shard.cube)
                for arg in request.args
                if arg.arg_kind == "tensor"
                for shard in arg.shards
                if shard.sip == my_sip
            )
            return list(dict.fromkeys(local_shards))

        return []

    def _cube_from_pa(self, pa_val: int, fallback: int) -> int:
        """Return the cube_id decoded from *pa_val*, or *fallback* if decoding fails."""
        from kernbench.policy.address.phyaddr import PhysAddr

        try:
            return PhysAddr.decode(pa_val).cube_id
        except Exception:
            # Deliberate best-effort: any decode failure uses the fallback.
            return fallback

    def _my_sip(self) -> int:
        """Extract this IO_CPU's SIP ID from its node ID (e.g. 'sip0.io0.io_cpu' -> 0)."""
        return int(self.node.id.split(".")[0].replace("sip", ""))
|
||||||
@@ -0,0 +1,269 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Generator
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
from kernbench.components.base import ComponentBase
|
||||||
|
from kernbench.sim_engine.transaction import Transaction
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
|
||||||
|
class MCpuComponent(ComponentBase):
|
||||||
|
"""M_CPU component: multi-PE DMA fan-out with response aggregation.
|
||||||
|
|
||||||
|
Forward path (ADR-0015 D5):
|
||||||
|
When a forward Transaction arrives at m_cpu (terminal hop), M_CPU fans out
|
||||||
|
DMA sub-Transactions to target PEs' HBM slices. target_pe on the request
|
||||||
|
controls fan-out: int → single PE, "all" → all PEs in the cube.
|
||||||
|
|
||||||
|
Response path:
|
||||||
|
ResponseMsg from each hbm_ctrl arrives back at m_cpu. Once all PE responses
|
||||||
|
are collected, m_cpu sends an aggregate ResponseMsg on the reverse command
|
||||||
|
path back to io_cpu.
|
||||||
|
|
||||||
|
Transit:
|
||||||
|
When m_cpu is NOT the terminal hop (transit or response relay), the
|
||||||
|
Transaction is forwarded normally to the next hop.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
|
||||||
|
super().__init__(node, ctx)
|
||||||
|
# Pending fan-out tracking: request_id → (expected, received, all_done_event)
|
||||||
|
self._pending: dict[str, tuple[int, int, simpy.Event]] = {}
|
||||||
|
# Store parent txn for response sending: request_id → parent_txn
|
||||||
|
self._parent_txns: dict[str, Any] = {}
|
||||||
|
# DMA engine resources (ADR-0015 D5, ADR-0014 D4): capacity=1 each
|
||||||
|
self._dma_write: simpy.Resource | None = None
|
||||||
|
self._dma_read: simpy.Resource | None = None
|
||||||
|
|
||||||
|
def start(self, env: simpy.Environment) -> None:
|
||||||
|
self._dma_write = simpy.Resource(env, capacity=1)
|
||||||
|
self._dma_read = simpy.Resource(env, capacity=1)
|
||||||
|
super().start(env)
|
||||||
|
|
||||||
|
def run(self, env: simpy.Environment, nbytes: int) -> Generator:
|
||||||
|
overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))
|
||||||
|
yield env.timeout(overhead_ns)
|
||||||
|
|
||||||
|
def _worker(self, env: simpy.Environment) -> Generator:
|
||||||
|
"""Dispatch forward txns, collect response txns."""
|
||||||
|
from kernbench.runtime_api.kernel import KernelLaunchMsg
|
||||||
|
|
||||||
|
while True:
|
||||||
|
txn: Any = yield self._inbox.get()
|
||||||
|
if getattr(txn, "is_response", False):
|
||||||
|
self._collect_response(txn)
|
||||||
|
else:
|
||||||
|
yield from self.run(env, txn.nbytes)
|
||||||
|
next_hop = txn.next_hop
|
||||||
|
if next_hop:
|
||||||
|
yield self.out_ports[next_hop].put(txn.advance())
|
||||||
|
elif self.ctx is not None and txn.request is not None:
|
||||||
|
if isinstance(txn.request, KernelLaunchMsg):
|
||||||
|
env.process(self._kernel_launch_fanout(env, txn))
|
||||||
|
else:
|
||||||
|
env.process(self._dma_fanout(env, txn))
|
||||||
|
else:
|
||||||
|
txn.done.succeed()
|
||||||
|
|
||||||
|
def _collect_response(self, resp_txn: Any) -> None:
|
||||||
|
"""Receive a PE response and increment the aggregation counter."""
|
||||||
|
key = resp_txn.request.request_id
|
||||||
|
if key not in self._pending:
|
||||||
|
return
|
||||||
|
expected, received, all_done = self._pending[key]
|
||||||
|
received += 1
|
||||||
|
if received >= expected:
|
||||||
|
all_done.succeed()
|
||||||
|
del self._pending[key]
|
||||||
|
else:
|
||||||
|
self._pending[key] = (expected, received, all_done)
|
||||||
|
|
||||||
|
def _dma_fanout(self, env: simpy.Environment, txn: Any) -> Generator:
|
||||||
|
"""Fan out DMA sub-Transactions to target PE(s), wait for responses,
|
||||||
|
then send aggregate response on reverse command path.
|
||||||
|
|
||||||
|
Each DMA transfer acquires the DMA resource (capacity=1 per ADR-0014 D4),
|
||||||
|
so multi-PE fan-out is serialized through the DMA engine.
|
||||||
|
"""
|
||||||
|
from kernbench.runtime_api.kernel import MemoryWriteMsg
|
||||||
|
|
||||||
|
request = txn.request
|
||||||
|
target_pe = getattr(request, "target_pe", "all")
|
||||||
|
|
||||||
|
dst_nodes = self._resolve_dma_destinations(request, target_pe)
|
||||||
|
if not dst_nodes:
|
||||||
|
txn.done.succeed()
|
||||||
|
return
|
||||||
|
|
||||||
|
# Setup aggregation
|
||||||
|
all_done = env.event()
|
||||||
|
self._pending[request.request_id] = (len(dst_nodes), 0, all_done)
|
||||||
|
self._parent_txns[request.request_id] = txn
|
||||||
|
|
||||||
|
# Select DMA resource based on operation type
|
||||||
|
dma_res = self._dma_write if isinstance(request, MemoryWriteMsg) else self._dma_read
|
||||||
|
|
||||||
|
# Fan out DMA sub-txns (serialized through DMA resource)
|
||||||
|
max_drain_ns = 0.0
|
||||||
|
for dst_node in dst_nodes:
|
||||||
|
try:
|
||||||
|
dma_path = self.ctx.router.find_mcpu_dma_path(self.node.id, dst_node)
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
if len(dma_path) < 2:
|
||||||
|
continue
|
||||||
|
drain_ns = self.ctx.compute_drain_ns(dma_path, txn.nbytes)
|
||||||
|
max_drain_ns = max(max_drain_ns, drain_ns)
|
||||||
|
sub_txn = Transaction(
|
||||||
|
request=request, path=dma_path, step=0,
|
||||||
|
nbytes=txn.nbytes, done=env.event(),
|
||||||
|
drain_ns=drain_ns,
|
||||||
|
)
|
||||||
|
with dma_res.request() as req:
|
||||||
|
yield req
|
||||||
|
yield self.out_ports[dma_path[1]].put(sub_txn.advance())
|
||||||
|
|
||||||
|
# Wait for all PE responses
|
||||||
|
yield all_done
|
||||||
|
txn.result_data["xfer_ns"] = max_drain_ns
|
||||||
|
del self._parent_txns[request.request_id]
|
||||||
|
|
||||||
|
# Send aggregate response on reverse command path
|
||||||
|
reverse_path = list(reversed(txn.path))
|
||||||
|
if len(reverse_path) >= 2:
|
||||||
|
from kernbench.runtime_api.kernel import ResponseMsg
|
||||||
|
|
||||||
|
parts = self.node.id.split(".")
|
||||||
|
cube_id = int(parts[1].replace("cube", ""))
|
||||||
|
resp_msg = ResponseMsg(
|
||||||
|
correlation_id=request.correlation_id,
|
||||||
|
request_id=request.request_id,
|
||||||
|
src_cube=cube_id, src_pe=-1, success=True,
|
||||||
|
)
|
||||||
|
resp_txn = Transaction(
|
||||||
|
request=resp_msg, path=reverse_path, step=0,
|
||||||
|
nbytes=0, done=env.event(), is_response=True,
|
||||||
|
)
|
||||||
|
yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
|
||||||
|
else:
|
||||||
|
txn.done.succeed()
|
||||||
|
|
||||||
|
def _kernel_launch_fanout(self, env: simpy.Environment, txn: Any) -> Generator:
|
||||||
|
"""Fan out KernelLaunchMsg to target PE_CPU(s) via NOC (ADR-0009 D3).
|
||||||
|
|
||||||
|
Routes through find_node_path (M_CPU → NOC → PE_CPU command edges).
|
||||||
|
Waits for sub_txn.done directly — no ResponseMsg needed for PE direction.
|
||||||
|
Then sends aggregate ResponseMsg back to IO_CPU on the reverse path.
|
||||||
|
"""
|
||||||
|
request = txn.request
|
||||||
|
target_pe = getattr(request, "target_pe", "all")
|
||||||
|
cube_prefix = self.node.id.rsplit(".", 1)[0] # e.g. "sip0.cube0"
|
||||||
|
pe_ids = self._resolve_pe_ids(target_pe)
|
||||||
|
|
||||||
|
if not pe_ids:
|
||||||
|
txn.done.succeed()
|
||||||
|
return
|
||||||
|
|
||||||
|
# Fan out to each PE_CPU and collect done events
|
||||||
|
sub_dones: list[simpy.Event] = []
|
||||||
|
sub_txns: list[Transaction] = []
|
||||||
|
for pe_id in pe_ids:
|
||||||
|
pe_cpu_id = f"{cube_prefix}.pe{pe_id}.pe_cpu"
|
||||||
|
try:
|
||||||
|
path = self.ctx.router.find_node_path(self.node.id, pe_cpu_id)
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
if len(path) < 2:
|
||||||
|
continue
|
||||||
|
sub_done = env.event()
|
||||||
|
sub_txn = Transaction(
|
||||||
|
request=request, path=path, step=0,
|
||||||
|
nbytes=0, done=sub_done,
|
||||||
|
)
|
||||||
|
yield self.out_ports[path[1]].put(sub_txn.advance())
|
||||||
|
sub_dones.append(sub_done)
|
||||||
|
sub_txns.append(sub_txn)
|
||||||
|
|
||||||
|
if not sub_dones:
|
||||||
|
txn.done.succeed()
|
||||||
|
return
|
||||||
|
|
||||||
|
# Wait for all PE_CPUs to complete
|
||||||
|
for sd in sub_dones:
|
||||||
|
yield sd
|
||||||
|
|
||||||
|
# Aggregate PE-internal metrics (max across PEs)
|
||||||
|
pe_exec_values = [st.result_data.get("pe_exec_ns", 0.0) for st in sub_txns]
|
||||||
|
if pe_exec_values:
|
||||||
|
txn.result_data["pe_exec_ns"] = max(pe_exec_values)
|
||||||
|
dma_values = [st.result_data.get("dma_ns", 0.0) for st in sub_txns]
|
||||||
|
if dma_values:
|
||||||
|
txn.result_data["dma_ns"] = max(dma_values)
|
||||||
|
compute_values = [st.result_data.get("compute_ns", 0.0) for st in sub_txns]
|
||||||
|
if compute_values:
|
||||||
|
txn.result_data["compute_ns"] = max(compute_values)
|
||||||
|
|
||||||
|
# Send aggregate response on reverse command path back to IO_CPU
|
||||||
|
reverse_path = list(reversed(txn.path))
|
||||||
|
if len(reverse_path) >= 2:
|
||||||
|
from kernbench.runtime_api.kernel import ResponseMsg
|
||||||
|
|
||||||
|
parts = self.node.id.split(".")
|
||||||
|
cube_id = int(parts[1].replace("cube", ""))
|
||||||
|
resp_msg = ResponseMsg(
|
||||||
|
correlation_id=request.correlation_id,
|
||||||
|
request_id=request.request_id,
|
||||||
|
src_cube=cube_id, src_pe=-1, success=True,
|
||||||
|
)
|
||||||
|
resp_txn = Transaction(
|
||||||
|
request=resp_msg, path=reverse_path, step=0,
|
||||||
|
nbytes=0, done=env.event(), is_response=True,
|
||||||
|
)
|
||||||
|
yield self.out_ports[reverse_path[1]].put(resp_txn.advance())
|
||||||
|
else:
|
||||||
|
txn.done.succeed()
|
||||||
|
|
||||||
|
def _resolve_dma_destinations(self, request: Any, target_pe: int | str) -> list[str]:
    """Work out which HBM node ids a DMA request fans out to.

    Resolution order:
      1. An integer ``target_pe`` selects the single matching slice of the
         local cube.
      2. Otherwise, if the request carries ``dst_pa`` (or, failing that,
         ``src_pa``), the physical address is decoded and handed to the
         context's resolver; the node it names is the only destination.
      3. Otherwise (or when step 2 raises), every slice of the local cube is
         returned.

    Args:
        request: Message object; only its optional ``dst_pa`` / ``src_pa``
            attributes are read.
        target_pe: Slice index, or a non-int marker such as ``"all"``.

    Returns:
        List of destination node-id strings.
    """
    local_cube = self.node.id.rsplit(".", 1)[0]  # e.g. "sip0.cube0"

    if isinstance(target_pe, int):
        return [f"{local_cube}.hbm_ctrl.slice{target_pe}"]

    # Prefer the destination address; a falsy dst_pa falls back to src_pa.
    raw_pa = getattr(request, "dst_pa", None) or getattr(request, "src_pa", None)
    if raw_pa is not None:
        from kernbench.policy.address.phyaddr import PhysAddr

        try:
            return [self.ctx.resolver.resolve(PhysAddr.decode(raw_pa))]
        except Exception:
            # Best effort: any decode/resolve failure falls through to the
            # "all local slices" answer below.
            pass

    # No usable PA (e.g. a kernel launch): address every slice in this cube.
    slice_count = 8
    if self.ctx and self.ctx.spec:
        cube_cfg = self.ctx.spec.get("cube", {})
        slice_count = cube_cfg.get("memory_map", {}).get("hbm_slices_per_cube", 8)
    return [f"{local_cube}.hbm_ctrl.slice{idx}" for idx in range(slice_count)]
|
||||||
|
|
||||||
|
def _resolve_pe_ids(self, target_pe: int | str) -> list[int]:
    """List the PE ids a kernel launch fans out to.

    An integer ``target_pe`` yields just that id.  Any other value (e.g.
    ``"all"``) yields ``0 .. n-1`` where ``n`` is read from
    ``spec["cube"]["memory_map"]["hbm_slices_per_cube"]`` and defaults to 8
    when the context, the spec or the key is missing.
    """
    if not isinstance(target_pe, int):
        count = 8
        if self.ctx and self.ctx.spec:
            memory_map = self.ctx.spec.get("cube", {}).get("memory_map", {})
            count = memory_map.get("hbm_slices_per_cube", 8)
        return list(range(count))
    return [target_pe]
|
||||||
@@ -0,0 +1,187 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Generator
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
from kernbench.components.base import ComponentBase
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
|
||||||
|
class TwoDMeshNocComponent(ComponentBase):
    """2D mesh NOC represented by one routing node.

    Latency:
        A traversal is charged for the XY (Manhattan) route between the
        positions of the previous hop and the next hop.  The route is cut at
        the cube's grid coordinates; each piece costs
        ``dist_mm * ctx.ns_per_mm``.  ``overhead_ns`` (from ``node.attrs``)
        is added once, on the first piece.

    Contention:
        Every directed segment is backed by its own ``simpy.Resource`` with
        capacity 1, created on first use.  The next segment is requested
        before the timeout of the current one, so a free downstream segment
        is taken over without an additional wait.  Transactions whose routes
        contain the same directed segment queue on that segment's resource.

    Concurrency:
        The worker starts a separate SimPy process per incoming transaction,
        so transactions only serialize on segment resources, not on the node.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        self._env: simpy.Environment | None = None
        # Directed-segment key -> capacity-1 resource, filled lazily.
        self._links: dict[tuple, simpy.Resource] = {}
        # Sorted, 2-decimal-rounded coordinates of the nodes in this cube.
        self._x_grid: list[float] = []
        self._y_grid: list[float] = []

    def start(self, env: simpy.Environment) -> None:
        """Remember the environment, build the grid, then start the base component."""
        self._env = env
        self._build_grid()
        super().start(env)

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Zero-delay step; the real latency is applied in ``_route``."""
        yield env.timeout(0)

    # ── Grid construction ────────────────────────────────────────────

    def _build_grid(self) -> None:
        """Collect the distinct x / y coordinates of all positioned nodes in this cube."""
        if not self.ctx:
            return
        member_prefix = self.node.id.rsplit(".", 1)[0] + "."
        placed = [
            pos
            for node_id, pos in self.ctx.positions.items()
            if node_id.startswith(member_prefix) and pos is not None
        ]
        self._x_grid = sorted({round(p[0], 2) for p in placed})
        self._y_grid = sorted({round(p[1], 2) for p in placed})

    def _get_link(self, key: tuple) -> simpy.Resource:
        """Return the resource for ``key``, creating it the first time it is asked for."""
        link = self._links.get(key)
        if link is None:
            assert self._env is not None
            link = simpy.Resource(self._env, capacity=1)
            self._links[key] = link
        return link

    # ── Worker ───────────────────────────────────────────────────────

    def _worker(self, env: simpy.Environment) -> Generator:
        """Pull transactions from the inbox and route each in its own process."""
        while True:
            incoming: Any = yield self._inbox.get()
            env.process(self._route(env, incoming))

    def _route(self, env: simpy.Environment, txn: Any) -> Generator:
        """Apply traversal latency to ``txn`` and pass it on (or finish it).

        With both neighbour positions known the transaction crosses the XY
        segments between them; otherwise only ``overhead_ns`` is charged.
        A transaction without a next hop waits its ``drain_ns`` (if positive)
        and has its ``done`` event triggered.
        """
        if txn.step > 0:
            prev_hop = txn.path[txn.step - 1]
        else:
            prev_hop = None
        next_hop = txn.next_hop
        overhead_ns = float(self.node.attrs.get("overhead_ns", 0.0))

        segments: list[tuple[tuple, float]] = []
        if prev_hop and next_hop and self.ctx:
            origin = self.ctx.positions.get(prev_hop)
            target = self.ctx.positions.get(next_hop)
            if origin and target:
                segments = self._xy_links(origin, target)

        if segments:
            yield from self._traverse(env, segments, overhead_ns)
        else:
            yield env.timeout(overhead_ns)

        if not next_hop:
            drain = getattr(txn, "drain_ns", 0.0)
            if drain > 0:
                yield env.timeout(drain)
            txn.done.succeed()
            return
        yield self.out_ports[next_hop].put(txn.advance())

    # ── XY routing and pipelined link traversal ──────────────────────

    def _traverse(
        self,
        env: simpy.Environment,
        links: list[tuple[tuple, float]],
        overhead_ns: float,
    ) -> Generator:
        """Cross ``links`` in order, requesting each next link ahead of time."""
        ns_per_mm = self.ctx.ns_per_mm  # type: ignore[union-attr]

        # Take the first link before any time is spent.
        held_res = self._get_link(links[0][0])
        held_req = held_res.request()
        yield held_req

        last_idx = len(links) - 1
        for idx, (_, dist_mm) in enumerate(links):
            # Queue for the following link before crossing the current one.
            upcoming = None
            if idx < last_idx:
                following = self._get_link(links[idx + 1][0])
                upcoming = (following, following.request())

            yield env.timeout(dist_mm * ns_per_mm + (overhead_ns if idx == 0 else 0.0))
            held_res.release(held_req)

            if upcoming is not None:
                held_res, held_req = upcoming
                yield held_req  # usually granted already

    def _xy_links(
        self,
        src: tuple[float, float],
        dst: tuple[float, float],
    ) -> list[tuple[tuple, float]]:
        """Build the XY route: horizontal pieces first, then vertical ones.

        Returns ``(link_key, dist_mm)`` pairs.  A key is
        ``(axis, band, lo, hi, direction)`` with all coordinates rounded to two
        decimals; the horizontal band is the grid row nearest the source y and
        the vertical band is the grid column nearest the destination x.
        """
        x0, y0 = src
        x1, y1 = dst
        route: list[tuple[tuple, float]] = []

        def add_run(axis, band, start, stop, grid, forward, backward):
            for p, q in self._segments(start, stop, grid):
                span = abs(q - p)
                if span > 1e-9:
                    heading = forward if q > p else backward
                    key = (axis, round(band, 2), round(min(p, q), 2), round(max(p, q), 2), heading)
                    route.append((key, span))

        # Horizontal run at y≈y0
        if abs(x0 - x1) > 1e-9:
            add_run("H", self._snap(y0, self._y_grid), x0, x1, self._x_grid, "E", "W")

        # Vertical run at x≈x1
        if abs(y0 - y1) > 1e-9:
            add_run("V", self._snap(x1, self._x_grid), y0, y1, self._y_grid, "S", "N")

        return route

    @staticmethod
    def _snap(val: float, grid: list[float]) -> float:
        """Return the grid value closest to ``val`` (first wins on ties); ``val`` if grid is empty."""
        nearest = val
        smallest_gap = None
        for candidate in grid:
            gap = abs(candidate - val)
            if smallest_gap is None or gap < smallest_gap:
                nearest, smallest_gap = candidate, gap
        return nearest

    @staticmethod
    def _segments(a: float, b: float, grid: list[float]) -> list[tuple[float, float]]:
        """Split the span from ``a`` to ``b`` at the grid values strictly inside it.

        Pairs are oriented and ordered in travel direction (from ``a`` towards
        ``b``).  A span shorter than 1e-9 produces no pairs.
        """
        if abs(a - b) < 1e-9:
            return []
        lo, hi = (a, b) if a < b else (b, a)
        waypoints = [lo, *(g for g in grid if lo + 1e-9 < g < hi - 1e-9), hi]
        pairs = list(zip(waypoints, waypoints[1:]))
        if a > b:
            pairs = [(far, near) for near, far in reversed(pairs)]
        return pairs
|
||||||
@@ -0,0 +1,27 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Generator
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
from kernbench.components.base import ComponentBase
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
|
||||||
|
class PcieEpComponent(ComponentBase):
    """PCIe endpoint that charges a fixed protocol-processing delay.

    ``run`` waits ``overhead_ns`` (read from ``node.attrs``, default 0);
    forwarding itself is left to the inherited machinery.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Wait for the configured PCIe overhead; ``nbytes`` is not used."""
        delay = self.node.attrs.get("overhead_ns", 0.0)
        yield env.timeout(float(delay))
|
||||||
@@ -0,0 +1,154 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Generator
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
from kernbench.components.base import ComponentBase
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
|
||||||
|
class PeCpuComponent(ComponentBase):
    """PE_CPU: kernel execution controller (Stage 2).

    Kernel launches run in two phases (ADR-0014 D1):
      Phase 1 (compile): fetch the kernel from the registry and run it against
          a TLContext, which records a list of PE commands.
      Phase 2 (replay): walk that list, sending commands to PE_SCHEDULER as
          PeInternalTxn and waiting where the command requires it.

    Any transaction that is not a kernel launch is forwarded unchanged.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        self._pe_prefix = node.id.rsplit(".", 1)[0]  # e.g. "sip0.cube0.pe0"

        def index_or_zero(extract: Any) -> int:
            # Missing or non-numeric id fields fall back to index 0.
            try:
                return int(extract())
            except (IndexError, ValueError):
                return 0

        self._pe_idx = index_or_zero(lambda: self._pe_prefix.rsplit("pe", 1)[1])
        # sip / cube indices are needed to match shards in multi-SIP/cube runs.
        fields = node.id.split(".")
        self._sip_idx = index_or_zero(lambda: fields[0].replace("sip", ""))
        self._cube_idx = index_or_zero(lambda: fields[1].replace("cube", ""))

    def _find_shard(self, shards: tuple) -> Any:
        """Return the shard for this PE's (sip, cube, pe).

        When no shard matches, fall back to the shard at position ``pe`` index,
        clamped to the last shard.
        """
        exact = next(
            (
                s
                for s in shards
                if s.sip == self._sip_idx and s.cube == self._cube_idx and s.pe == self._pe_idx
            ),
            None,
        )
        if exact is not None:
            return exact
        return shards[min(self._pe_idx, len(shards) - 1)]

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Wait ``overhead_ns`` from ``node.attrs`` (default 0); ``nbytes`` is not used."""
        delay = self.node.attrs.get("overhead_ns", 0.0)
        yield env.timeout(float(delay))

    def _worker(self, env: simpy.Environment) -> Generator:
        """Serve the inbox: kernel launches are executed, everything else forwarded."""
        while True:
            msg: Any = yield self._inbox.get()
            from kernbench.runtime_api.kernel import KernelLaunchMsg

            is_launch = hasattr(msg, "request") and isinstance(msg.request, KernelLaunchMsg)
            handler = self._execute_kernel if is_launch else self._forward_txn
            yield from handler(env, msg)

    def _execute_kernel(self, env: simpy.Environment, txn: Any) -> Generator:
        """Compile the requested kernel into commands and replay them.

        On completion ``txn.result_data`` holds ``pe_exec_ns`` (replay wall
        time) plus ``dma_ns`` / ``compute_ns`` summed over the result data of
        all composite commands, and ``txn.done`` is triggered.
        """
        from kernbench.common.pe_commands import (
            CompositeCmd,
            PeCpuOverheadCmd,
            PeInternalTxn,
            WaitCmd,
        )
        from kernbench.triton_emu.registry import get_kernel
        from kernbench.triton_emu.tl_context import TLContext, run_kernel

        request = txn.request

        # Phase 1: charge the PE_CPU setup overhead, then trace the kernel.
        yield from self.run(env, 0)

        kernel_fn = get_kernel(request.kernel_ref.name)
        tl = TLContext(pe_id=self._pe_idx, dispatch_cycles=0)

        # Tensor args become the PA of this PE's shard, scalar args their value;
        # other arg kinds contribute nothing.
        call_args: list = []
        for arg in request.args:
            kind = arg.arg_kind
            if kind == "tensor":
                call_args.append(self._find_shard(arg.shards).pa)
            elif kind == "scalar":
                call_args.append(arg.value)

        run_kernel(kernel_fn, tl, *call_args)

        # Phase 2: replay the recorded commands through PE_SCHEDULER.
        replay_started = env.now
        scheduler_id = f"{self._pe_prefix}.pe_scheduler"
        pending: dict[str, simpy.Event] = {}  # completion id -> done event
        composite_results: list[dict] = []  # result_data dicts of composite txns

        for cmd in tl.commands:
            if isinstance(cmd, PeCpuOverheadCmd):
                yield env.timeout(cmd.cycles)
                continue

            if isinstance(cmd, WaitCmd):
                if cmd.handle is None:
                    # No handle: wait for everything still outstanding.
                    for evt in pending.values():
                        yield evt
                    pending.clear()
                else:
                    evt = pending.pop(cmd.handle.id, None)
                    if evt:
                        yield evt
                continue

            # Everything else is handed to the scheduler.
            finished = env.event()
            pe_txn = PeInternalTxn(
                command=cmd,
                done=finished,
                pe_prefix=self._pe_prefix,
            )
            if isinstance(cmd, CompositeCmd):
                # Non-blocking: remember the completion for a later wait.
                composite_results.append(pe_txn.result_data)
                yield self.out_ports[scheduler_id].put(pe_txn)
                pending[cmd.completion.id] = finished
            else:
                # Blocking: wait right here until the engine is done.
                yield self.out_ports[scheduler_id].put(pe_txn)
                yield finished

        # Drain whatever was never explicitly waited for.
        for evt in pending.values():
            yield evt

        results = txn.result_data
        results["pe_exec_ns"] = env.now - replay_started
        results["dma_ns"] = sum((rd.get("dma_ns", 0.0) for rd in composite_results), 0.0)
        results["compute_ns"] = sum(
            (rd.get("compute_ns", 0.0) for rd in composite_results), 0.0
        )

        # Tell the sender that the launch has completed.
        txn.done.succeed()
|
||||||
@@ -0,0 +1,116 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Generator
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
from kernbench.components.base import PeEngineBase
|
||||||
|
from kernbench.sim_engine.transaction import Transaction
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from kernbench.common.pe_commands import PeInternalTxn
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
|
||||||
|
class PeDmaComponent(PeEngineBase):
    """PE_DMA: dual-channel DMA engine with READ and WRITE resources.

    Each channel has capacity=1 (ADR-0014 D4):
      - DMA_READ and DMA_WRITE may execute concurrently.
      - Multiple READs cannot overlap; multiple WRITEs cannot overlap.

    Handles two message types:
      - Transaction: external fabric messages (PeDmaMsg probes, M_CPU DMA)
      - PeInternalTxn: PE-internal commands from PE_SCHEDULER
        (DmaReadCmd → HBM read, DmaWriteCmd → HBM write)
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        # Channels are created in init_resources() once an environment exists.
        self._dma_read: simpy.Resource | None = None
        self._dma_write: simpy.Resource | None = None

    def init_resources(self, env: simpy.Environment) -> None:
        """Create the capacity-1 READ and WRITE channel resources."""
        self._dma_read = simpy.Resource(env, capacity=1)
        self._dma_write = simpy.Resource(env, capacity=1)

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Zero-delay step; DMA timing comes from the path the transfer takes."""
        yield env.timeout(0)

    def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
        """Execute a PE-internal DMA command: resolve PA → path → transfer.

        A ``DmaReadCmd`` uses the READ channel and ``cmd.src_pa``; a
        ``DmaWriteCmd`` uses the WRITE channel and ``cmd.dst_pa``.  Any other
        command is acknowledged immediately.  The channel is held only while
        the sub-transaction is issued; completion is awaited afterwards.

        ``pe_txn.done`` is always triggered, including when the resolved path
        has no hop after its first entry (see the comment at that branch).
        """
        from kernbench.common.pe_commands import DmaReadCmd, DmaWriteCmd
        from kernbench.policy.address.phyaddr import PhysAddr
        from kernbench.runtime_api.kernel import PeDmaMsg

        cmd = pe_txn.command
        assert self._dma_read is not None and self._dma_write is not None

        # Determine direction and target PA
        if isinstance(cmd, DmaReadCmd):
            dma_res = self._dma_read
            target_pa = cmd.src_pa
            is_write = False
        elif isinstance(cmd, DmaWriteCmd):
            dma_res = self._dma_write
            target_pa = cmd.dst_pa
            is_write = True
        else:
            pe_txn.done.succeed()
            return

        # Resolve PA → destination node and compute the path to it.
        pa = PhysAddr.decode(target_pa)
        dst_node = self.ctx.resolver.resolve(pa)
        path = self.ctx.router.find_path(self._pe_prefix, dst_node)
        drain_ns = self.ctx.compute_drain_ns(path, cmd.nbytes)

        # Acquire DMA channel (command issue serialization)
        with dma_res.request() as req:
            yield req
            # Sub-transaction carrying a PeDmaMsg towards the destination.
            # NOTE(review): src_sip / src_cube / src_pe are hard-coded to 0
            # rather than derived from this PE — confirm that is intended.
            sub_done = env.event()
            sub_request = PeDmaMsg(
                correlation_id="pe_internal",
                request_id=f"dma_{id(pe_txn)}",
                src_sip=0, src_cube=0, src_pe=0,
                dst_pa=target_pa, nbytes=cmd.nbytes,
                is_write=is_write,
            )
            sub_txn = Transaction(
                request=sub_request, path=path, step=0,
                nbytes=cmd.nbytes, done=sub_done, drain_ns=drain_ns,
            )
            if len(path) > 1:
                # Hand the sub-transaction to the hop after the path's first entry.
                yield self.out_ports[path[1]].put(sub_txn.advance())
            else:
                # No downstream hop: nobody else would ever trigger sub_done,
                # so the wait below would block forever.  Finish the transfer
                # here the same way _forward_txn finishes a terminal
                # transaction: drain, then signal completion.
                if drain_ns and drain_ns > 0:
                    yield env.timeout(drain_ns)
                sub_done.succeed()
        # DMA channel released after issue

        # Wait for transfer completion, then acknowledge the PE-internal txn.
        yield sub_done
        pe_txn.done.succeed()

    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """Forward an external Transaction while holding the selected channel.

        With a next hop the transaction is advanced and put on that port.
        Without one, its ``drain_ns`` (if positive) is waited and ``txn.done``
        is triggered.
        """
        dma_res = self._select_channel(txn)
        with dma_res.request() as req:
            yield req
            next_hop = txn.next_hop
            if next_hop:
                yield self.out_ports[next_hop].put(txn.advance())
            else:
                drain = getattr(txn, "drain_ns", 0.0)
                if drain > 0:
                    yield env.timeout(drain)
                txn.done.succeed()

    def _select_channel(self, txn: Any) -> simpy.Resource:
        """Pick the WRITE channel for ``MemoryWriteMsg`` requests, READ otherwise.

        NOTE(review): a PeDmaMsg with ``is_write=True`` is not a
        MemoryWriteMsg instance check here and therefore lands on the READ
        channel — confirm whether that is intended.
        """
        from kernbench.runtime_api.kernel import MemoryWriteMsg

        assert self._dma_read is not None and self._dma_write is not None
        if isinstance(txn.request, MemoryWriteMsg):
            return self._dma_write
        return self._dma_read
|
||||||
@@ -0,0 +1,90 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Generator
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
from kernbench.components.base import PeEngineBase
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from kernbench.common.pe_commands import PeInternalTxn
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
|
||||||
|
# Bit width of one element for each accepted dtype spelling; the GEMM latency
# model uses it to scale the f16 peak TFLOPS.
_DTYPE_BITS: dict[str, int] = {
    alias: bits
    for bits, aliases in (
        (16, ("f16", "fp16", "float16", "bf16")),
        (32, ("f32", "fp32", "float32")),
        (8, ("i8", "int8")),
        (16, ("i16", "int16")),
        (32, ("i32", "int32")),
    )
    for alias in aliases
}
|
||||||
|
|
||||||
|
|
||||||
|
class PeGemmComponent(PeEngineBase):
    """PE_GEMM: matrix-multiply engine that shares the accel slot (ADR-0014 D4).

    When ``node.attrs["shared_resource"]`` names a resource, the engine takes
    it from the context under ``"<pe_prefix>.<name>"`` and holds it for the
    duration of every command, so it is mutually exclusive with whatever else
    in the same PE uses that resource (PE_MATH per the ADR).

    Compute latency model:
        FLOPs            = 2 * M * K * N
        effective_tflops = peak_tflops_f16 * (16 / dtype_bits)
        compute_ns       = FLOPs / (effective_tflops * 1e3)
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        self._accel: simpy.Resource | None = None
        self._peak_tflops_f16: float = float(node.attrs.get("peak_tflops_f16", 0.0))

    def init_resources(self, env: simpy.Environment) -> None:
        """Look up the shared accel resource, if one is configured and a context exists."""
        resource_name = self.node.attrs.get("shared_resource")
        if not (resource_name and self.ctx):
            return
        self._accel = self.ctx.get_shared_resource(
            env, f"{self._pe_prefix}.{resource_name}"
        )

    def _compute_ns(self, m: int, k: int, n: int, dtype: str) -> float:
        """Return the modelled GEMM time in ns.

        Without a positive peak TFLOPS figure the node's ``overhead_ns`` is
        returned instead.  Unknown dtypes are treated as 16-bit.
        """
        peak = self._peak_tflops_f16
        if peak <= 0:
            return float(self.node.attrs.get("overhead_ns", 0.0))
        dtype_bits = _DTYPE_BITS.get(dtype, 16)
        effective_tflops = peak * (16.0 / dtype_bits)
        flops = 2.0 * m * k * n
        return flops / (effective_tflops * 1e3)

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Wait ``overhead_ns`` from ``node.attrs`` (default 0); ``nbytes`` is not used."""
        delay = self.node.attrs.get("overhead_ns", 0.0)
        yield env.timeout(float(delay))

    def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
        """Spend the command's compute time (under the accel slot if any), then signal done.

        A ``GemmCmd`` costs ``_compute_ns(m, k, n, a.dtype)``; any other
        command costs the plain ``run`` overhead.
        """
        from kernbench.common.pe_commands import GemmCmd

        cmd = pe_txn.command

        def occupy() -> Generator:
            if isinstance(cmd, GemmCmd):
                yield env.timeout(self._compute_ns(cmd.m, cmd.k, cmd.n, cmd.a.dtype))
            else:
                yield from self.run(env, 0)

        if self._accel:
            with self._accel.request() as slot:
                yield slot
                yield from occupy()
        else:
            yield from occupy()
        pe_txn.done.succeed()

    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """Forward via the base class, holding the accel slot while doing so if one exists."""
        if not self._accel:
            yield from super()._forward_txn(env, txn)
            return
        with self._accel.request() as slot:
            yield slot
            yield from super()._forward_txn(env, txn)
|
||||||
@@ -0,0 +1,54 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Generator
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
from kernbench.components.base import PeEngineBase
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from kernbench.common.pe_commands import PeInternalTxn
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
|
||||||
|
class PeMathComponent(PeEngineBase):
    """PE_MATH: element-wise engine that shares the accel slot (ADR-0014 D4).

    When ``node.attrs["shared_resource"]`` names a resource, the engine takes
    it from the context under ``"<pe_prefix>.<name>"`` and holds it for the
    duration of every command, making it mutually exclusive with the other
    user of that resource in the same PE (PE_GEMM per the ADR).
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        super().__init__(node, ctx)
        self._accel: simpy.Resource | None = None

    def init_resources(self, env: simpy.Environment) -> None:
        """Look up the shared accel resource, if one is configured and a context exists."""
        resource_name = self.node.attrs.get("shared_resource")
        if not (resource_name and self.ctx):
            return
        self._accel = self.ctx.get_shared_resource(
            env, f"{self._pe_prefix}.{resource_name}"
        )

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Wait ``overhead_ns`` from ``node.attrs`` (default 0); ``nbytes`` is not used."""
        delay = self.node.attrs.get("overhead_ns", 0.0)
        yield env.timeout(float(delay))

    def handle_command(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
        """Spend the ``run`` overhead (under the accel slot if any), then signal done."""
        if not self._accel:
            yield from self.run(env, 0)
        else:
            with self._accel.request() as slot:
                yield slot
                yield from self.run(env, 0)
        pe_txn.done.succeed()

    def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
        """Forward via the base class, holding the accel slot while doing so if one exists."""
        if not self._accel:
            yield from super()._forward_txn(env, txn)
            return
        with self._accel.request() as slot:
            yield slot
            yield from super()._forward_txn(env, txn)
|
||||||
@@ -0,0 +1,245 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Generator
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
from kernbench.components.base import ComponentBase
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from kernbench.common.pe_commands import PeInternalTxn
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
|
||||||
|
class PeSchedulerComponent(ComponentBase):
|
||||||
|
"""PE_SCHEDULER: sole dispatcher inside a PE (ADR-0014 D1).
|
||||||
|
|
||||||
|
Receives PeInternalTxn from PE_CPU, routes to the appropriate engine:
|
||||||
|
- DmaReadCmd / DmaWriteCmd → PE_DMA
|
||||||
|
- GemmCmd → PE_GEMM
|
||||||
|
- MathCmd → PE_MATH
|
||||||
|
- CompositeCmd → tiled pipeline (Stage 3: ADR-0014 D3.2)
|
||||||
|
|
||||||
|
Composite GEMM pipeline (32x64x32 tiles):
|
||||||
|
DMA_READ(b_tile_t) → COMPUTE(t) → DMA_WRITE(out_tile_t)
|
||||||
|
with overlap: READ(t+1) || COMPUTE(t) || WRITE(t-1)
|
||||||
|
|
||||||
|
Applies scheduler overhead_ns before dispatching each command.
|
||||||
|
Non-PeInternalTxn messages are forwarded via inherited _forward_txn().
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Scheduler tile dimensions (ADR-0014 D3.2)
|
||||||
|
TILE_M = 32
|
||||||
|
TILE_K = 64
|
||||||
|
TILE_N = 32
|
||||||
|
|
||||||
|
# Command → engine suffix dispatch table.
|
||||||
|
# New engines: add a single entry here (e.g. ConvCmd: "pe_conv").
|
||||||
|
_CMD_DISPATCH: dict[type, str] = {}
|
||||||
|
|
||||||
|
@classmethod
def _ensure_dispatch_table(cls) -> None:
    """Fill ``_CMD_DISPATCH`` on first use; later calls are no-ops.

    The command classes are imported here rather than at module import time,
    and the table maps each command type to the suffix of the engine node
    that handles it.
    """
    if not cls._CMD_DISPATCH:
        from kernbench.common.pe_commands import DmaReadCmd, DmaWriteCmd, GemmCmd, MathCmd

        cls._CMD_DISPATCH = {
            **dict.fromkeys((DmaReadCmd, DmaWriteCmd), "pe_dma"),
            GemmCmd: "pe_gemm",
            MathCmd: "pe_math",
        }
|
||||||
|
|
||||||
|
def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
    """Initialise the base component, derive the PE prefix and prepare the dispatch table."""
    super().__init__(node, ctx)
    # Everything before the last "." of the node id names the owning PE.
    pe_prefix, *_ = node.id.rsplit(".", 1)
    self._pe_prefix = pe_prefix
    self._ensure_dispatch_table()
|
||||||
|
|
||||||
|
def run(self, env: simpy.Environment, nbytes: int) -> Generator:
    """Wait the scheduler's ``overhead_ns`` from ``node.attrs`` (default 0); ``nbytes`` is not used."""
    delay = self.node.attrs.get("overhead_ns", 0.0)
    yield env.timeout(float(delay))
|
||||||
|
|
||||||
|
def _worker(self, env: simpy.Environment) -> Generator:
    """Serve the inbox forever.

    A ``PeInternalTxn`` is dispatched in its own SimPy process, so the loop
    goes straight back to the inbox.  Any other message is forwarded inline,
    and the loop resumes only once that forwarding has finished.
    """
    from kernbench.common.pe_commands import PeInternalTxn

    while True:
        incoming: Any = yield self._inbox.get()
        if not isinstance(incoming, PeInternalTxn):
            yield from self._forward_txn(env, incoming)
            continue
        env.process(self._dispatch(env, incoming))
|
||||||
|
|
||||||
|
def _dispatch(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
    """Send one PeInternalTxn where it belongs, after the scheduler overhead.

    Commands whose exact type is in the dispatch table are put on the port
    of the matching engine.  A ``CompositeCmd`` goes through the tiled
    pipeline.  Anything else is acknowledged immediately.
    """
    from kernbench.common.pe_commands import CompositeCmd

    # Scheduler overhead is charged before any routing decision.
    yield from self.run(env, 0)

    cmd = pe_txn.command
    engine_suffix = self._CMD_DISPATCH.get(type(cmd))

    if engine_suffix is None:
        if isinstance(cmd, CompositeCmd):
            # Not a plain forward: the scheduler drives the pipeline itself.
            yield from self._dispatch_composite(env, pe_txn)
        else:
            # Unknown command: nothing to run, report completion right away.
            pe_txn.done.succeed()
        return

    yield self.out_ports[f"{self._pe_prefix}.{engine_suffix}"].put(pe_txn)
|
||||||
|
|
||||||
|
def _dispatch_composite(self, env: simpy.Environment, pe_txn: PeInternalTxn) -> Generator:
    """Run a composite command through the matching pipeline (ADR-0014 D3.2).

    A command with ``op == "gemm"`` and a ``b`` operand uses the GEMM
    pipeline; every other composite command uses the math pipeline.
    """
    from kernbench.common.pe_commands import CompositeCmd

    cmd = pe_txn.command
    assert isinstance(cmd, CompositeCmd)

    wants_gemm = cmd.op == "gemm" and cmd.b is not None
    pipeline = self._pipeline_gemm if wants_gemm else self._pipeline_math
    yield from pipeline(env, pe_txn, cmd)
|
||||||
|
|
||||||
|
def _pipeline_gemm(self, env: simpy.Environment, pe_txn: PeInternalTxn, cmd: Any) -> Generator:
    """Tiled GEMM pipeline: stream b tiles from HBM, compute, write results.

    Tensor a is in TCM (loaded via tl.load). Tensor b is in HBM (via tl.ref).
    Each tile passes through three stages, issued in this order:
        DMA_READ(b_tile_t) -> COMPUTE(t) -> DMA_WRITE(out_tile_t)

    Tiles are enumerated with the K-tile index as the slow axis
    (``tile_idx // n_tiles_n``) and the N-tile index as the fast axis
    (``tile_idx % n_tiles_n``); edge tiles are clamped to the remaining
    extent.

    NOTE(review): the design intent is stated as
    READ(t+1) || COMPUTE(t) || WRITE(t-1). In this loop a tile's read and
    compute are both awaited before the next tile's read is issued, so the
    only request left outstanding across iterations is the DMA_WRITE of the
    previous tile. Confirm whether deeper overlap is expected.

    On completion the method stores two timings in ``pe_txn.result_data``
    and succeeds ``pe_txn.done``:

    * ``dma_ns``: for every tile, the time from issuing its DMA_READ until
      that read has completed (this includes the wait on the previous
      compute), plus the time spent waiting for the final DMA_WRITE;
    * ``compute_ns``: for every tile, the time from issuing its GEMM until
      it has completed (this includes the wait on the previous DMA_WRITE).
    """
    from kernbench.common.pe_commands import (
        DmaReadCmd,
        DmaWriteCmd,
        GemmCmd,
        PeInternalTxn as PeTxn,
        TensorHandle,
    )

    pp = self._pe_prefix
    a = cmd.a  # already in TCM
    b = cmd.b  # HBM reference (via tl.ref)

    M, K_a = a.shape[-2], a.shape[-1]
    K_b, N = b.shape[-2], b.shape[-1]
    dtype = a.dtype
    # Element size is derived from b's byte count; 2 is the fallback when b
    # has no elements.
    dtype_bytes = b.nbytes // (K_b * N) if (K_b * N) > 0 else 2

    # Tile counts (at least one tile per axis)
    n_tiles_k = max(1, (K_a + self.TILE_K - 1) // self.TILE_K)
    n_tiles_n = max(1, (N + self.TILE_N - 1) // self.TILE_N)
    n_tiles = n_tiles_k * n_tiles_n

    prev_compute_done = None
    prev_write_done = None
    total_dma_ns = 0.0
    total_compute_ns = 0.0

    for tile_idx in range(n_tiles):
        tk = tile_idx // n_tiles_n
        tn = tile_idx % n_tiles_n

        k_start = tk * self.TILE_K
        n_start = tn * self.TILE_N
        tile_k = min(self.TILE_K, K_a - k_start)
        tile_n = min(self.TILE_N, N - n_start)
        tile_nbytes = tile_k * tile_n * dtype_bytes

        # --- Stage 1: DMA_READ b_tile from HBM ---
        read_done = env.event()
        b_tile_pa = b.pa + (k_start * N + n_start) * dtype_bytes
        b_tile_handle = TensorHandle(
            id=f"b_tile_{tile_idx}", pa=b_tile_pa,
            shape=(tile_k, tile_n), dtype=dtype, nbytes=tile_nbytes,
        )
        read_cmd = DmaReadCmd(handle=b_tile_handle, src_pa=b_tile_pa, nbytes=tile_nbytes)
        read_txn = PeTxn(command=read_cmd, done=read_done, pe_prefix=pp)
        t0 = env.now
        yield self.out_ports[f"{pp}.pe_dma"].put(read_txn)

        # Wait for previous compute before starting this tile's compute
        if prev_compute_done is not None:
            yield prev_compute_done

        # Wait for this tile's DMA_READ
        yield read_done
        total_dma_ns += env.now - t0

        # --- Stage 2: COMPUTE (GEMM) ---
        compute_done = env.event()
        out_handle = TensorHandle(
            id=f"out_tile_{tile_idx}", pa=0,
            shape=(M, tile_n), dtype=dtype,
            nbytes=M * tile_n * dtype_bytes,
        )
        compute_cmd = GemmCmd(a=a, b=b_tile_handle, out=out_handle,
                              m=M, k=tile_k, n=tile_n)
        compute_txn = PeTxn(command=compute_cmd, done=compute_done, pe_prefix=pp)
        t0 = env.now
        yield self.out_ports[f"{pp}.pe_gemm"].put(compute_txn)

        # Wait for previous write (DMA_WRITE serialization)
        if prev_write_done is not None:
            yield prev_write_done

        # Wait for compute of THIS tile
        yield compute_done
        total_compute_ns += env.now - t0
        prev_compute_done = compute_done

        # --- Stage 3: DMA_WRITE out_tile to HBM ---
        # The write is only issued here; it is awaited by the next iteration
        # or, for the last tile, after the loop. No timestamp is taken at
        # this point because nothing would read it.
        write_done = env.event()
        out_tile_pa = cmd.out_pa + n_start * dtype_bytes
        write_nbytes = M * tile_n * dtype_bytes
        write_cmd = DmaWriteCmd(handle=out_handle, dst_pa=out_tile_pa, nbytes=write_nbytes)
        write_txn = PeTxn(command=write_cmd, done=write_done, pe_prefix=pp)
        yield self.out_ports[f"{pp}.pe_dma"].put(write_txn)
        prev_write_done = write_done

    # Wait for final write
    if prev_write_done is not None:
        t0 = env.now
        yield prev_write_done
        total_dma_ns += env.now - t0

    pe_txn.result_data["dma_ns"] = total_dma_ns
    pe_txn.result_data["compute_ns"] = total_compute_ns
    pe_txn.done.succeed()
|
||||||
|
|
||||||
|
def _pipeline_math(self, env: simpy.Environment, pe_txn: PeInternalTxn, cmd: Any) -> Generator:
    """Non-GEMM composite: one MATH command, then one DMA_WRITE (no tiling).

    The MATH command uses ``cmd.math_op`` (``"identity"`` when it is falsy)
    with ``cmd.a`` as both its single input and its output. Once it has
    completed, ``cmd.out_nbytes`` bytes are written to ``cmd.out_pa`` via the
    PE DMA port, and ``pe_txn.done`` is succeeded after the write finishes.
    """
    from kernbench.common.pe_commands import (
        DmaWriteCmd,
        MathCmd,
        PeInternalTxn as PeTxn,
    )

    prefix = self._pe_prefix

    # Step 1: issue the MATH command and wait for it.
    math_finished = env.event()
    math_cmd = MathCmd(
        op=cmd.math_op or "identity",
        inputs=(cmd.a,),
        out=cmd.a,
    )
    math_txn = PeTxn(command=math_cmd, done=math_finished, pe_prefix=prefix)
    yield self.out_ports[f"{prefix}.pe_math"].put(math_txn)
    yield math_finished

    # Step 2: DMA_WRITE the result to HBM and wait for it.
    store_finished = env.event()
    store_cmd = DmaWriteCmd(handle=cmd.a, dst_pa=cmd.out_pa, nbytes=cmd.out_nbytes)
    store_txn = PeTxn(command=store_cmd, done=store_finished, pe_prefix=prefix)
    yield self.out_ports[f"{prefix}.pe_dma"].put(store_txn)
    yield store_finished

    pe_txn.done.succeed()
|
||||||
@@ -0,0 +1,25 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Generator
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
from kernbench.components.base import ComponentBase
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
|
||||||
|
class PeTcmComponent(ComponentBase):
    """PE_TCM: tightly-coupled memory / local SRAM staging buffer.

    Terminal storage component for PE-internal dataflow (ADR-0014 D5).
    Phase 0: ``run`` applies the node's ``overhead_ns`` attribute as a delay.
    NOTE(review): drain_ns handling is not implemented in this class;
    presumably it is applied by the base class -- confirm.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        """Delegate construction to ``ComponentBase`` unchanged."""
        super().__init__(node, ctx)

    def run(self, env, nbytes: int) -> Generator:
        """Wait for ``overhead_ns`` (node attribute, default 0.0).

        ``nbytes`` is accepted for interface compatibility and is not used.
        """
        delay_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(delay_ns)
|
||||||
@@ -0,0 +1,59 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Generator
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
from kernbench.components.base import ComponentBase
|
||||||
|
from kernbench.sim_engine.transaction import Transaction
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
|
||||||
|
class SramComponent(ComponentBase):
    """Cube SRAM: terminal component that models SRAM access latency.

    Each transaction is delayed by the node's ``overhead_ns`` attribute and
    by the transaction's own ``drain_ns`` (when positive). Afterwards a
    ResponseMsg is sent back along the reversed request path.
    """

    def __init__(self, node: Node, ctx: ComponentContext | None = None) -> None:
        """Delegate construction to ``ComponentBase`` unchanged."""
        super().__init__(node, ctx)

    def run(self, env: simpy.Environment, nbytes: int) -> Generator:
        """Wait for ``overhead_ns`` (node attribute, default 0.0).

        ``nbytes`` is accepted for interface compatibility and is not used.
        """
        delay_ns = float(self.node.attrs.get("overhead_ns", 0.0))
        yield env.timeout(delay_ns)

    def _worker(self, env: simpy.Environment) -> Generator:
        """Terminal worker: process, apply drain, send response."""
        while True:
            txn: Any = yield self._inbox.get()
            yield from self.run(env, txn.nbytes)

            # Transactions without a drain_ns attribute are treated as 0.0.
            drain_ns = getattr(txn, "drain_ns", 0.0)
            if drain_ns > 0:
                yield env.timeout(drain_ns)

            yield from self._send_response(env, txn)

    def _send_response(self, env: simpy.Environment, txn: Any) -> Generator:
        """Create a ResponseMsg and send it on the reverse path.

        When the reversed path has fewer than two nodes, or this component
        has no context, there is nowhere to send a response and the
        transaction is completed directly via ``txn.done.succeed()``.
        """
        return_path = list(reversed(txn.path))
        if len(return_path) < 2 or not self.ctx:
            txn.done.succeed()
            return

        from kernbench.runtime_api.kernel import ResponseMsg

        # The cube number is taken from the second dotted component of the
        # node id, which has the form "cube<N>".
        id_parts = self.node.id.split(".")
        cube_id = int(id_parts[1].replace("cube", ""))

        response = ResponseMsg(
            correlation_id=txn.request.correlation_id,
            request_id=txn.request.request_id,
            src_cube=cube_id,
            src_pe=-1,
            success=True,
        )
        response_txn = Transaction(
            request=response,
            path=return_path,
            step=0,
            nbytes=0,
            done=env.event(),
            is_response=True,
        )
        # return_path[0] is this node; the first hop is return_path[1].
        yield self.out_ports[return_path[1]].put(response_txn.advance())
|
||||||
@@ -0,0 +1,85 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from kernbench.policy.address.phyaddr import PhysAddr
|
||||||
|
|
||||||
|
|
||||||
|
class AllocationError(Exception):
    """Raised when a memory allocation request cannot be satisfied."""
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class AddressConfig:
    """Immutable sizing parameters consumed by the memory allocators."""

    # Topology counts
    sip_count: int
    cubes_per_sip: int
    pes_per_cube: int
    # HBM sizing
    hbm_bytes_per_cube: int
    hbm_slices_per_cube: int
    # TCM sizing
    tcm_bytes_per_pe: int
    tcm_scheduler_reserved_bytes: int
    # SRAM sizing
    sram_bytes_per_cube: int

    @property
    def hbm_slice_bytes(self) -> int:
        """Size of one HBM slice: cube HBM bytes floor-divided by slice count."""
        total, slices = self.hbm_bytes_per_cube, self.hbm_slices_per_cube
        return total // slices

    @property
    def tcm_allocatable_bytes(self) -> int:
        """TCM bytes per PE left after the scheduler reservation is removed."""
        reserved = self.tcm_scheduler_reserved_bytes
        return self.tcm_bytes_per_pe - reserved
|
||||||
|
|
||||||
|
|
||||||
|
class PEMemAllocator:
    """Cursor-based allocator for one PE's HBM slice and TCM.

    Each allocation returns the address at the current cursor and then
    advances the cursor by the requested size. Nothing is ever freed.
    """

    def __init__(
        self, rack_id: int, sip_id: int, cube_id: int, pe_id: int, cfg: AddressConfig,
    ) -> None:
        """Bind the allocator to one PE location and start both cursors at 0."""
        self._rack_id = rack_id
        self._sip_id = sip_id
        self._cube_id = cube_id
        self._pe_id = pe_id
        self._cfg = cfg
        self._hbm_cursor = 0
        self._tcm_cursor = 0

    def alloc_hbm(self, nbytes: int) -> PhysAddr:
        """Reserve ``nbytes`` in this PE's HBM slice and return its address.

        Raises:
            AllocationError: if ``nbytes`` is negative or does not fit in the
                remaining part of the slice.
        """
        # A negative size would pass the overflow check below and move the
        # cursor backwards, so later allocations would alias earlier ones.
        if nbytes < 0:
            raise AllocationError(f"HBM allocation size must be non-negative: {nbytes}")
        remaining = self._cfg.hbm_slice_bytes - self._hbm_cursor
        if nbytes > remaining:
            raise AllocationError(
                f"HBM overflow: need {nbytes}, "
                f"available {remaining}"
            )
        pa = PhysAddr.pe_hbm_addr(
            rack_id=self._rack_id, sip_id=self._sip_id, cube_id=self._cube_id,
            pe_id=self._pe_id, pe_local_hbm_offset=self._hbm_cursor,
            slice_size_bytes=self._cfg.hbm_slice_bytes,
        )
        # The cursor only advances once the address has been built.
        self._hbm_cursor += nbytes
        return pa

    def alloc_tcm(self, nbytes: int) -> PhysAddr:
        """Reserve ``nbytes`` in this PE's allocatable TCM and return its address.

        Raises:
            AllocationError: if ``nbytes`` is negative or does not fit in the
                remaining allocatable TCM.
        """
        if nbytes < 0:
            raise AllocationError(f"TCM allocation size must be non-negative: {nbytes}")
        remaining = self._cfg.tcm_allocatable_bytes - self._tcm_cursor
        if nbytes > remaining:
            raise AllocationError(
                f"TCM overflow: need {nbytes}, "
                f"available {remaining}"
            )
        pa = PhysAddr.pe_tcm_addr(
            rack_id=self._rack_id, sip_id=self._sip_id, cube_id=self._cube_id,
            pe_id=self._pe_id, tcm_offset=self._tcm_cursor,
        )
        self._tcm_cursor += nbytes
        return pa

    @property
    def hbm_used(self) -> int:
        """Bytes of the HBM slice handed out so far."""
        return self._hbm_cursor

    @property
    def hbm_total(self) -> int:
        """Capacity of the HBM slice (``cfg.hbm_slice_bytes``)."""
        return self._cfg.hbm_slice_bytes

    @property
    def tcm_used(self) -> int:
        """Bytes of TCM handed out so far."""
        return self._tcm_cursor

    @property
    def tcm_total(self) -> int:
        """Allocatable TCM capacity (``cfg.tcm_allocatable_bytes``)."""
        return self._cfg.tcm_allocatable_bytes
|
||||||
@@ -0,0 +1,184 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from enum import IntEnum
|
||||||
|
from typing import Literal
|
||||||
|
|
||||||
|
MAX_51 = (1 << 51) - 1
|
||||||
|
|
||||||
|
|
||||||
|
class PhysAddrError(Exception):
    """Raised when a physical-address field or value is out of range."""
|
||||||
|
|
||||||
|
|
||||||
|
def _chk_range(name: str, v: int, bits: int) -> None:
|
||||||
|
if not (0 <= v < (1 << bits)):
|
||||||
|
raise PhysAddrError(f"{name} out of range for {bits} bits: {v}")
|
||||||
|
|
||||||
|
|
||||||
|
def _chk_max(name: str, v: int, maxv: int) -> None:
|
||||||
|
if not (0 <= v <= maxv):
|
||||||
|
raise PhysAddrError(f"{name} out of range (0..{maxv}): {v}")
|
||||||
|
|
||||||
|
|
||||||
|
class UnitType(IntEnum):
    """Unit selector stored in a PE-resource address (the ``unit_type`` field)."""

    PE = 0
    MCPU = 1
    SRAM = 2
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class PhysAddr:
    """
    51-bit physical address value object.

    Layout:
      [50:47] rack_id       (4)
      [46:43] sip_id        (4)
      [42:38] sip_seg       (5)   # cube_id
      [37:0]  local_offset  (38)  => each segment is 256GB

    local_offset:
      [37] selector: 1 = HBM window (128GB reserved), 0 = PE resource window

    PE resource window (selector 0), as read by ``decode``:
      [36:34] unit_type (3)
      [33:30] pe_id     (4)
      [29]    ext       (1)
      [28:0]  sub_offset (29)

    Only rack_id, sip_id, sip_seg and local_offset are packed by ``encode``;
    the remaining fields are a decoded view filled in by ``decode`` and by
    the ``*_addr`` constructors.
    """

    rack_id: int
    sip_id: int
    sip_seg: int
    local_offset: int

    kind: Literal["hbm", "pe_resource", "raw"] = "raw"
    cube_id: int = 0
    unit_type: UnitType = UnitType.PE
    pe_id: int = 0
    ext: int = 0
    sub_offset: int = 0
    hbm_offset: int = 0

    HBM_WINDOW_BYTES = 1 << 37  # 128GB

    def encode(self) -> int:
        """Pack rack_id, sip_id, sip_seg and local_offset into one integer.

        Raises:
            PhysAddrError: if any of the four fields exceeds its bit width.
        """
        field_widths = (
            ("rack_id", self.rack_id, 4),
            ("sip_id", self.sip_id, 4),
            ("sip_seg", self.sip_seg, 5),
            ("local_offset", self.local_offset, 38),
        )
        for label, value, width in field_widths:
            _chk_range(label, value, width)

        packed = (
            self.local_offset
            | (self.sip_seg << 38)
            | (self.sip_id << 43)
            | (self.rack_id << 47)
        )
        if packed < 0 or packed > MAX_51:
            raise PhysAddrError("address exceeds 51-bit space")
        return packed

    @staticmethod
    def decode(addr: int) -> PhysAddr:
        """Split a 51-bit integer into a PhysAddr with its decoded view.

        Raises:
            PhysAddrError: if ``addr`` is outside 0..MAX_51, or if a
                PE-resource address carries an unknown unit_type value.
        """
        if addr < 0 or addr > MAX_51:
            raise PhysAddrError("addr must be a 51-bit value")

        local = addr & ((1 << 38) - 1)
        seg = (addr >> 38) & 0x1F
        common = dict(
            rack_id=(addr >> 47) & 0xF,
            sip_id=(addr >> 43) & 0xF,
            sip_seg=seg,
            local_offset=local,
            cube_id=seg,  # the segment number is the cube id
        )

        # Bit 37 selects the HBM window.
        if (local >> 37) & 0x1:
            window_offset = int(local & ((1 << 37) - 1))
            return PhysAddr(kind="hbm", hbm_offset=window_offset, **common)

        # PE resource decode
        unit_bits = int((local >> 34) & 0x7)
        try:
            unit = UnitType(unit_bits)
        except ValueError:
            raise PhysAddrError(f"unknown unit_type: {unit_bits}") from None

        return PhysAddr(
            kind="pe_resource",
            unit_type=unit,
            pe_id=int((local >> 30) & 0xF),
            ext=int((local >> 29) & 0x1),
            sub_offset=int(local & ((1 << 29) - 1)),
            hbm_offset=0,
            **common,
        )

    @staticmethod
    def hbm_addr(*, rack_id: int, sip_id: int, cube_id: int, hbm_offset: int) -> PhysAddr:
        """Build an HBM-window address for a cube-relative ``hbm_offset``.

        Raises:
            PhysAddrError: if cube_id exceeds 31 or hbm_offset needs more
                than 37 bits.
        """
        _chk_max("cube_id", cube_id, 31)
        _chk_range("hbm_offset", hbm_offset, 37)
        offset = int(hbm_offset)
        return PhysAddr(
            rack_id=rack_id,
            sip_id=sip_id,
            sip_seg=cube_id,
            local_offset=(1 << 37) | offset,  # selector bit set
            kind="hbm",
            cube_id=cube_id,
            hbm_offset=offset,
        )

    @staticmethod
    def pe_hbm_addr(
        *,
        rack_id: int,
        sip_id: int,
        cube_id: int,
        pe_id: int,
        pe_local_hbm_offset: int,
        slice_size_bytes: int,
    ) -> PhysAddr:
        """Build an HBM address inside the slice that belongs to ``pe_id``.

        The cube-relative offset is
        ``pe_id * slice_size_bytes + pe_local_hbm_offset``.

        Raises:
            PhysAddrError: if cube_id or pe_id is out of range, the local
                offset falls outside the slice, or the resulting offset
                leaves the 128GB HBM window.
        """
        _chk_max("cube_id", cube_id, 31)
        _chk_range("pe_id", pe_id, 4)
        if pe_local_hbm_offset < 0 or pe_local_hbm_offset >= slice_size_bytes:
            raise PhysAddrError("pe_local_hbm_offset out of PE local slice range")

        slice_base = int(pe_id) * int(slice_size_bytes)
        cube_offset = slice_base + int(pe_local_hbm_offset)
        if cube_offset < 0 or cube_offset >= PhysAddr.HBM_WINDOW_BYTES:
            raise PhysAddrError("HBM offset exceeds reserved 128GB window")

        return PhysAddr.hbm_addr(
            rack_id=rack_id, sip_id=sip_id, cube_id=cube_id, hbm_offset=cube_offset
        )

    @staticmethod
    def hbm_pe_id(hbm_offset: int, slice_size_bytes: int) -> int:
        """Return the index of the slice that contains ``hbm_offset``."""
        slice_index = hbm_offset // slice_size_bytes
        return slice_index

    @staticmethod
    def cube_sram_addr(
        *, rack_id: int, sip_id: int, cube_id: int, sram_offset: int,
    ) -> PhysAddr:
        """Build a PE-resource address that targets the cube SRAM.

        Raises:
            PhysAddrError: if cube_id exceeds 31 or sram_offset needs more
                than 29 bits.
        """
        _chk_max("cube_id", cube_id, 31)
        _chk_range("sram_offset", sram_offset, 29)
        return PhysAddr(
            rack_id=rack_id,
            sip_id=sip_id,
            sip_seg=cube_id,
            local_offset=(UnitType.SRAM << 34) | sram_offset,
            kind="pe_resource",
            cube_id=cube_id,
            unit_type=UnitType.SRAM,
            sub_offset=sram_offset,
        )

    @staticmethod
    def pe_tcm_addr(
        *, rack_id: int, sip_id: int, cube_id: int, pe_id: int, tcm_offset: int,
    ) -> PhysAddr:
        """Build a PE-resource address that targets the TCM of ``pe_id``.

        Raises:
            PhysAddrError: if cube_id exceeds 31, pe_id needs more than 4
                bits, or tcm_offset needs more than 29 bits.
        """
        _chk_max("cube_id", cube_id, 31)
        _chk_range("pe_id", pe_id, 4)
        _chk_range("tcm_offset", tcm_offset, 29)
        return PhysAddr(
            rack_id=rack_id,
            sip_id=sip_id,
            sip_seg=cube_id,
            local_offset=(UnitType.PE << 34) | (pe_id << 30) | tcm_offset,
            kind="pe_resource",
            cube_id=cube_id,
            unit_type=UnitType.PE,
            pe_id=pe_id,
            sub_offset=tcm_offset,
        )
|
||||||
@@ -0,0 +1,174 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from math import ceil
|
||||||
|
from typing import Literal
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class DPPolicy:
    """Two-level data-parallel policy: one choice per cube, one per PE."""

    # How the tensor is distributed across cubes.
    cube: Literal["replicate", "shard_m", "shard_k"] = "replicate"
    # How each cube's portion is distributed across its PEs.
    pe: Literal["replicate", "column_wise", "row_wise"] = "replicate"
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_dp_policy(
    policy: DPPolicy,
    *,
    shape: tuple[int, int],
    itemsize: int,
    num_pe: int,
    num_cubes: int = 1,
) -> list[ShardSpec]:
    """Resolve a DPPolicy into a list[ShardSpec] with two-level resolution.

    With ``num_cubes <= 1`` only the pe-level policy is applied and the
    cube-level policy is not inspected. Otherwise the cube-level policy
    fixes a per-cube shape and byte offset, the pe-level resolver is run on
    that shape for every cube, and each resulting shard is re-indexed with
    the flat index ``cube_id * num_pe + pe_id``.

    Raises:
        ValueError: for an unknown pe-level policy, or (when
            ``num_cubes > 1``) an unknown cube-level policy.
    """
    pe_resolvers = {
        "replicate": replicate,
        "column_wise": column_wise,
        "row_wise": row_wise,
    }
    resolver = pe_resolvers.get(policy.pe)
    if resolver is None:
        raise ValueError(f"Unknown pe-level policy: {policy.pe}")

    if num_cubes <= 1:
        return resolver(shape=shape, itemsize=itemsize, num_pe=num_pe)

    # Two-level resolution: cube-level, then pe-level.
    M, K = shape

    # The per-cube shape and the byte distance between consecutive cubes do
    # not depend on the cube index, so they are worked out once.
    cube_policy = policy.cube
    if cube_policy == "replicate":
        cube_shape = (M, K)
        bytes_per_cube = 0
    elif cube_policy == "shard_m":
        rows_per_cube = M // num_cubes
        cube_shape = (rows_per_cube, K)
        bytes_per_cube = rows_per_cube * K * itemsize
    elif cube_policy == "shard_k":
        cols_per_cube = K // num_cubes
        cube_shape = (M, cols_per_cube)
        bytes_per_cube = M * cols_per_cube * itemsize
    else:
        raise ValueError(f"Unknown cube-level policy: {policy.cube}")

    all_shards: list[ShardSpec] = []
    for cube_id in range(num_cubes):
        cube_offset = cube_id * bytes_per_cube
        first_flat_index = cube_id * num_pe

        # Resolve pe-level within this cube's shape, then remap pe_index to
        # the flat index and shift the offset by the cube's base offset.
        all_shards.extend(
            ShardSpec(
                pe_index=first_flat_index + local.pe_index,
                offset_bytes=cube_offset + local.offset_bytes,
                nbytes=local.nbytes,
            )
            for local in resolver(shape=cube_shape, itemsize=itemsize, num_pe=num_pe)
        )

    return all_shards
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class ShardSpec:
    """One shard: the PE it is assigned to, its byte offset, and its size."""

    pe_index: int
    offset_bytes: int
    nbytes: int
|
||||||
|
|
||||||
|
|
||||||
|
def column_wise(
    *, shape: tuple[int, int], itemsize: int, num_pe: int,
) -> list[ShardSpec]:
    """Split the K axis into num_pe equal parts. Each PE gets (M, K // P).

    Shard ``i`` starts at ``i * chunk_bytes`` where
    ``chunk_bytes = M * (K // num_pe) * itemsize``.
    NOTE(review): when K is not a multiple of num_pe the remainder columns
    are not assigned to any shard -- confirm callers guarantee divisibility.
    """
    rows, cols = shape
    shard_bytes = rows * (cols // num_pe) * itemsize
    return [
        ShardSpec(pe_index=pe, offset_bytes=pe * shard_bytes, nbytes=shard_bytes)
        for pe in range(num_pe)
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def row_wise(
    *, shape: tuple[int, int], itemsize: int, num_pe: int,
) -> list[ShardSpec]:
    """Split the M axis into num_pe equal parts. Each PE gets (M // P, K).

    Shard ``i`` starts at ``i * chunk_bytes`` where
    ``chunk_bytes = (M // num_pe) * K * itemsize``.
    NOTE(review): when M is not a multiple of num_pe the remainder rows are
    not assigned to any shard -- confirm callers guarantee divisibility.
    """
    rows, cols = shape
    shard_bytes = (rows // num_pe) * cols * itemsize
    return [
        ShardSpec(pe_index=pe, offset_bytes=pe * shard_bytes, nbytes=shard_bytes)
        for pe in range(num_pe)
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def replicate(
    *, shape: tuple[int, int], itemsize: int, num_pe: int,
) -> list[ShardSpec]:
    """Full copy per PE. Each PE gets (M, K) at offset 0."""
    rows, cols = shape
    tensor_bytes = rows * cols * itemsize
    copies = []
    for pe in range(num_pe):
        copies.append(ShardSpec(pe_index=pe, offset_bytes=0, nbytes=tensor_bytes))
    return copies
|
||||||
|
|
||||||
|
|
||||||
|
def tiled_column_major(
    *, shape: tuple[int, int], itemsize: int, num_pe: int,
    tile_m: int, tile_k: int,
) -> list[ShardSpec]:
    """2D tiling, column-major order (K axis first), round-robin across PEs.

    Tiles are emitted with the K-tile index varying fastest. The n-th tile
    emitted is assigned to PE ``n % num_pe``; its offset is the byte offset
    of its first element in a row-major (M, K) tensor.
    NOTE(review): every shard reports ``tile_m * tile_k * itemsize`` bytes,
    including partial edge tiles -- presumably tiles are padded; confirm.
    """
    M, K = shape
    tiles_m = ceil(M / tile_m)
    tiles_k = ceil(K / tile_k)
    tile_bytes = tile_m * tile_k * itemsize
    row_bytes = K * itemsize

    shards = []
    for mi in range(tiles_m):
        row_base = mi * tile_m * row_bytes
        for ki in range(tiles_k):
            # Position of this tile in emission order.
            sequence = mi * tiles_k + ki
            shards.append(ShardSpec(
                pe_index=sequence % num_pe,
                offset_bytes=row_base + (ki * tile_k * itemsize),
                nbytes=tile_bytes,
            ))
    return shards
|
||||||
|
|
||||||
|
|
||||||
|
def tiled_row_major(
    *, shape: tuple[int, int], itemsize: int, num_pe: int,
    tile_m: int, tile_k: int,
) -> list[ShardSpec]:
    """2D tiling, row-major order (M axis first), round-robin across PEs.

    Tiles are emitted with the M-tile index varying fastest. The n-th tile
    emitted is assigned to PE ``n % num_pe``; its offset is the byte offset
    of its first element in a row-major (M, K) tensor.
    NOTE(review): every shard reports ``tile_m * tile_k * itemsize`` bytes,
    including partial edge tiles -- presumably tiles are padded; confirm.
    """
    M, K = shape
    tiles_m = ceil(M / tile_m)
    tiles_k = ceil(K / tile_k)
    tile_bytes = tile_m * tile_k * itemsize
    row_bytes = K * itemsize

    shards = []
    for ki in range(tiles_k):
        col_base = ki * tile_k * itemsize
        for mi in range(tiles_m):
            # Position of this tile in emission order.
            sequence = ki * tiles_m + mi
            shards.append(ShardSpec(
                pe_index=sequence % num_pe,
                offset_bytes=(mi * tile_m * row_bytes) + col_base,
                nbytes=tile_bytes,
            ))
    return shards
|
||||||
@@ -0,0 +1,184 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import heapq
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
from kernbench.policy.address.phyaddr import PhysAddr, UnitType
|
||||||
|
from kernbench.topology.types import TopologyGraph
|
||||||
|
|
||||||
|
|
||||||
|
class RoutingError(Exception):
    """Raised when an address or node cannot be resolved, or no path exists."""
|
||||||
|
|
||||||
|
|
||||||
|
class AddressResolver:
    """Resolve a PhysAddr to the destination node_id in the compiled graph.

    Also provides named node lookups (find_m_cpu, find_pcie_ep, ...) so that
    component implementations never construct node_id strings directly.
    Centralising the naming convention here means a single change propagates
    everywhere (ADR-0015 D4).
    """

    def __init__(self, graph: TopologyGraph) -> None:
        """Snapshot the graph's node ids and derive the HBM slice size.

        The slice size is ``hbm_total_gb_per_cube * 2**30`` floor-divided by
        ``hbm_slices_per_cube``, both read from
        ``graph.spec["cube"]["memory_map"]``.
        """
        self._node_ids = set(graph.nodes)
        memory_map = graph.spec["cube"]["memory_map"]
        cube_hbm_bytes = memory_map["hbm_total_gb_per_cube"] * (1 << 30)
        self._slice_size_bytes = cube_hbm_bytes // memory_map["hbm_slices_per_cube"]

    # -- Physical-address resolution ------------------------------------

    def resolve(self, addr: PhysAddr) -> str:
        """Return the node id that ``addr`` targets.

        Raises:
            RoutingError: for an unsupported address kind or unit type, or
                when the resulting node id is not part of the topology.
        """
        cube_prefix = f"sip{addr.sip_id}.cube{addr.cube_id}"

        if addr.kind == "hbm":
            slice_index = PhysAddr.hbm_pe_id(addr.hbm_offset, self._slice_size_bytes)
            node_id = f"{cube_prefix}.hbm_ctrl.slice{slice_index}"
        elif addr.kind == "pe_resource":
            suffix_by_unit = {
                UnitType.PE: f"pe{addr.pe_id}.pe_tcm",
                UnitType.SRAM: "sram",
                UnitType.MCPU: "m_cpu",
            }
            suffix = suffix_by_unit.get(addr.unit_type)
            if suffix is None:
                raise RoutingError(f"unsupported unit_type: {addr.unit_type}")
            node_id = f"{cube_prefix}.{suffix}"
        else:
            raise RoutingError(f"unsupported address kind: {addr.kind}")

        if node_id not in self._node_ids:
            raise RoutingError(f"node {node_id} not found in topology")
        return node_id

    # -- Named node lookups ---------------------------------------------

    def find_m_cpu(self, sip: int, cube: int) -> str:
        """Return the M_CPU node id of a cube; RoutingError if it is absent."""
        candidate = f"sip{sip}.cube{cube}.m_cpu"
        if candidate in self._node_ids:
            return candidate
        raise RoutingError(f"M_CPU not found: {candidate}")

    def find_pcie_ep(self, sip: int, io_id: str = "io0") -> str:
        """Return the PCIE_EP node id of an IO block; RoutingError if absent."""
        candidate = f"sip{sip}.{io_id}.pcie_ep"
        if candidate in self._node_ids:
            return candidate
        raise RoutingError(f"PCIE_EP not found: {candidate}")

    def find_io_cpu(self, sip: int, io_id: str = "io0") -> str:
        """Return the IO_CPU node id of an IO block; RoutingError if absent."""
        candidate = f"sip{sip}.{io_id}.io_cpu"
        if candidate in self._node_ids:
            return candidate
        raise RoutingError(f"IO_CPU not found: {candidate}")

    def find_all_pcie_eps(self) -> list[str]:
        """Return all PCIE_EP node ids across all SIPs, sorted."""
        endpoints = [nid for nid in self._node_ids if nid.endswith(".pcie_ep")]
        endpoints.sort()
        return endpoints
|
||||||
|
|
||||||
|
|
||||||
|
class PathRouter:
    """Find data-path from a source PE (or arbitrary node) to a destination node.

    Three adjacency graphs are maintained:
      _adj          -- excludes command edges (used by PE DMA routing, find_path)
      _adj_all      -- includes all edges (used by component-to-component routing,
                       find_node_path; required because M_CPU<->NOC links are "command")
      _adj_mcpu_dma -- excludes pe_internal / pe_to_xbar edges (used by
                       find_mcpu_dma_path for cross-cube M_CPU DMA)

    The edge weight is ``routing_weight_mm`` when it is set, otherwise
    ``distance_mm``.
    """

    # Edge kinds excluded from M_CPU DMA adjacency: prevents routing through
    # PE-internal pipeline nodes when computing DMA paths.
    _MCPU_DMA_EXCLUDE = {"pe_internal", "pe_to_xbar"}

    def __init__(self, graph: TopologyGraph) -> None:
        """Build the three adjacency maps from ``graph.edges``."""
        self._adj: dict[str, list[tuple[str, float]]] = defaultdict(list)
        self._adj_all: dict[str, list[tuple[str, float]]] = defaultdict(list)
        self._adj_mcpu_dma: dict[str, list[tuple[str, float]]] = defaultdict(list)
        for e in graph.edges:
            w = e.routing_weight_mm if e.routing_weight_mm is not None else e.distance_mm
            self._adj_all[e.src].append((e.dst, w))
            if e.kind != "command":
                self._adj[e.src].append((e.dst, w))
            # Independent of the command filter above: the M_CPU DMA search
            # starts at the M_CPU node, whose NOC links are command edges.
            if e.kind not in self._MCPU_DMA_EXCLUDE:
                self._adj_mcpu_dma[e.src].append((e.dst, w))

    def find_path(self, src_pe: str, dst_node: str) -> list[str]:
        """PE DMA routing: prepends .pe_dma, excludes command edges."""
        start = f"{src_pe}.pe_dma"
        return self._run_dijkstra(self._adj, start, dst_node)

    def find_path_with_distance(self, src_pe: str, dst_node: str) -> tuple[list[str], float]:
        """Same search as find_path, also returning the total path weight."""
        start = f"{src_pe}.pe_dma"
        return self._run_dijkstra_with_dist(self._adj, start, dst_node)

    def find_mcpu_dma_path(self, m_cpu_id: str, dst_hbm_slice_id: str) -> list[str]:
        """M_CPU DMA path: never routes through PE-internal nodes (ADR-0015 D5).

        Same-cube: deterministic [m_cpu, noc, xbar.pe_i, hbm_ctrl.slice_i],
        built from the node ids without consulting the graph.
        Cross-cube: Dijkstra via _adj_mcpu_dma (pe_internal/pe_to_xbar excluded)
                    -> routes through NOC -> UCIe -> target cube NOC -> xbar -> HBM.
        """
        # The first two dotted components ("sipX.cubeY") identify the cube.
        m_cube = ".".join(m_cpu_id.split(".")[:2])
        d_cube = ".".join(dst_hbm_slice_id.split(".")[:2])
        if m_cube == d_cube:
            slice_idx = int(dst_hbm_slice_id.rsplit("slice", 1)[1])
            return [
                m_cpu_id,
                f"{m_cube}.noc",
                f"{m_cube}.xbar.pe{slice_idx}",
                dst_hbm_slice_id,
            ]
        return self._run_dijkstra(self._adj_mcpu_dma, m_cpu_id, dst_hbm_slice_id)

    def find_node_path(self, src: str, dst: str) -> list[str]:
        """General routing between arbitrary nodes, including command edges.

        Used by components (IoCpuComponent, MCpuComponent) that route through
        M_CPU<->NOC command-kind links.
        """
        return self._run_dijkstra(self._adj_all, src, dst)

    def _run_dijkstra(
        self,
        adj: dict[str, list[tuple[str, float]]],
        start: str,
        goal: str,
    ) -> list[str]:
        """Return only the node list of the shortest path from start to goal."""
        path, _ = self._run_dijkstra_with_dist(adj, start, goal)
        return path

    def _run_dijkstra_with_dist(
        self,
        adj: dict[str, list[tuple[str, float]]],
        start: str,
        goal: str,
    ) -> tuple[list[str], float]:
        """Shortest path from start to goal over ``adj`` and its total weight.

        Returns:
            ``(path, distance)`` where ``path`` lists the node ids from
            ``start`` to ``goal`` inclusive.

        Raises:
            RoutingError: if ``goal`` cannot be reached from ``start``.
        """
        if start == goal:
            return [start], 0.0
        best: dict[str, float] = {start: 0.0}
        prev: dict[str, str] = {}
        heap: list[tuple[float, str]] = [(0.0, start)]
        while heap:
            d, node = heapq.heappop(heap)
            if node == goal:
                # Walk the predecessor chain back to the start.
                path: list[str] = []
                cur = goal
                while cur != start:
                    path.append(cur)
                    cur = prev[cur]
                path.append(start)
                path.reverse()
                return path, d
            if d > best.get(node, float("inf")):
                continue  # stale heap entry; a shorter route was already found
            # .get() instead of indexing: indexing a defaultdict would insert
            # an empty entry for every dead-end node visited, and a plain
            # dict would raise KeyError for such nodes.
            for neighbor, edge_dist in adj.get(node, ()):
                new_d = d + edge_dist
                if new_d < best.get(neighbor, float("inf")):
                    best[neighbor] = new_d
                    prev[neighbor] = node
                    heapq.heappush(heap, (new_d, neighbor))
        raise RoutingError(f"no path from {start} to {goal}")

    # -- backward-compat shims (used by existing tests) -------------------

    def _dijkstra(self, start: str, goal: str) -> list[str]:
        """Shortest path over the non-command adjacency graph."""
        return self._run_dijkstra(self._adj, start, goal)

    def _dijkstra_with_dist(self, start: str, goal: str) -> tuple[list[str], float]:
        """Shortest path and its weight over the non-command adjacency graph."""
        return self._run_dijkstra_with_dist(self._adj, start, goal)
|
||||||
@@ -0,0 +1,96 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Callable
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from kernbench.common.types import Completion, SimEngine, Trace
|
||||||
|
|
||||||
|
from .context import RuntimeContext
|
||||||
|
from .types import BenchResult, DeviceSelector
|
||||||
|
|
||||||
|
|
||||||
|
class CompletionPolicy(str, Enum):
    """How ``run_bench`` derives the overall result from the submitted requests."""

    # Report the completion of the handle that was submitted last.
    LAST_SUBMITTED = "last_submitted"
    # Requires trace/timestamps or engine support; stub for now.
    LAST_COMPLETED = "last_completed"
    # Report the first failing completion, or a synthetic OK if none failed.
    ALL_OK_FAIL_FAST = "all_ok_fail_fast"
|
||||||
|
|
||||||
|
|
||||||
|
# A benchmark body: receives the RuntimeContext and submits requests through it.
BenchFn = Callable[[RuntimeContext], Any]

# Builds the simulation engine from (topology, device selector).
EngineFactory = Callable[[object, DeviceSelector], SimEngine]
|
||||||
|
|
||||||
|
|
||||||
|
def run_bench(
    *,
    topology: object,
    bench_fn: BenchFn,
    device: DeviceSelector,
    engine_factory: EngineFactory,
    correlation_id: str = "bench0",
    completion_policy: CompletionPolicy = CompletionPolicy.LAST_SUBMITTED,
) -> BenchResult:
    """Run one benchmark function against a freshly built engine.

    Args:
        topology: compiled topology object (opaque to the runtime here). If it
            exposes ``topology_obj``, the spec is read from that object.
        bench_fn: callable that receives the RuntimeContext and submits requests.
        device: DeviceSelector ("all" or "sip:<N>").
        engine_factory: builds the sim engine for the given topology and device.
        correlation_id: identifier copied into the context and the result.
        completion_policy: how the overall completion/result is determined.

    Returns:
        A BenchResult. When the bench submitted nothing, the completion is a
        failure with error code ``NO_REQUESTS``.
    """
    engine = engine_factory(topology, device)

    # Extract spec from TopologyHandle or TopologyGraph.
    spec_owner = getattr(topology, "topology_obj", topology)
    ctx = RuntimeContext(
        engine=engine,
        target_device=device,
        correlation_id=correlation_id,
        spec=getattr(spec_owner, "spec", None),
    )

    bench_fn(ctx)
    ctx.wait_all()

    # An empty trace list is reported as None.
    collected_traces = ctx._traces or None
    handles = ctx.handles()

    def _result(completion, trace):
        return BenchResult(
            completion=completion,
            correlation_id=correlation_id,
            trace=trace,
            traces=collected_traces,
        )

    if not handles:
        no_requests = Completion(
            ok=False, error_code="NO_REQUESTS", error_message="Bench submitted no requests"
        )
        return _result(no_requests, None)

    if completion_policy == CompletionPolicy.ALL_OK_FAIL_FAST:
        last_trace: Trace | None = None
        for handle in handles:
            outcome, trace = engine.get_completion(handle)
            if trace is not None:
                last_trace = trace
            if not outcome.ok:
                return _result(outcome, last_trace)
        return _result(Completion(ok=True), last_trace)

    # LAST_SUBMITTED, and the LAST_COMPLETED placeholder (needs engine support
    # for timing), both report the handle that was submitted last.
    completion, trace = engine.get_completion(handles[-1])
    return _result(completion, trace)
|
||||||
@@ -0,0 +1,282 @@
|
|||||||
|
# kernbench/runtime_api/context.py
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from kernbench.common.types import Completion, RequestHandle, SimEngine
|
||||||
|
|
||||||
|
from .types import DeviceSelector
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class RuntimeContext:
    """Per-benchmark facade over a simulation engine.

    Records every handle submitted through it, remembers which handles have
    been waited on, and offers PyTorch-like helpers (``zeros``, ``empty``,
    ``launch``) that translate tensor operations into engine requests.
    """

    engine: SimEngine
    target_device: DeviceSelector
    correlation_id: str
    spec: dict | None = None

    # Internal bookkeeping; none of these are constructor arguments.
    _handles: list[RequestHandle] = field(default_factory=list, init=False)
    _completed: set[RequestHandle] = field(default_factory=set, init=False)
    _allocators: dict[int, Any] = field(default_factory=dict, init=False)
    _tensor_counter: int = field(default=0, init=False)
    _traces: list[dict] = field(default_factory=list, init=False)

    def submit(self, request: Any) -> RequestHandle:
        """Hand ``request`` to the engine and remember the returned handle.

        Raises:
            AttributeError: if the engine has no ``submit`` attribute.
        """
        engine_submit = getattr(self.engine, "submit", None)
        if engine_submit is None:
            raise AttributeError("Engine does not implement submit(request) -> RequestHandle.")
        new_handle: RequestHandle = engine_submit(request)  # type: ignore[call-arg]
        self._handles.append(new_handle)
        return new_handle

    def is_completed(self, handle: RequestHandle) -> bool:
        """Return True once ``wait`` has been called for ``handle``."""
        return handle in self._completed

    def wait(self, handle: RequestHandle, *, _meta: dict | None = None) -> Completion:
        """Block until ``handle`` is done and return its completion.

        The engine's optional ``wait`` is invoked only the first time a handle
        is waited on. When ``_meta`` is given and the engine returned a trace,
        a trace entry (trace fields overlaid with ``_meta``) is recorded.
        """
        seen_before = handle in self._completed
        if not seen_before:
            engine_wait = getattr(self.engine, "wait", None)
            if engine_wait is not None:
                engine_wait(handle)  # type: ignore[misc]

        completion, trace = self.engine.get_completion(handle)
        if seen_before:
            return completion

        self._completed.add(handle)
        if _meta is None or trace is None:
            return completion

        # Non-dict traces are wrapped so that metadata can still be attached.
        if isinstance(trace, dict):
            record = dict(trace)
        else:
            record = {"raw": trace}
        record.update(_meta)
        self._traces.append(record)
        return completion

    def wait_all(self) -> None:
        """Wait on every submitted handle that has not been waited on yet."""
        for pending in self._handles:
            if pending in self._completed:
                continue
            self.wait(pending)

    def handles(self) -> list[RequestHandle]:
        """Return a copy of the submitted handles, in submission order."""
        return self._handles.copy()

    # ── PyTorch-like tensor API ──────────────────────────────────────

    def _ensure_allocators(self) -> dict:
        """Lazily build one PEMemAllocator per (sip, cube, pe) from ``spec``.

        Allocators are keyed by the flat index
        ``sip_id * cubes_per_sip * pes_per_cube + cube_id * pes_per_cube + pe_id``.

        Raises:
            RuntimeError: if ``spec`` is None.
        """
        if self._allocators:
            return self._allocators
        if self.spec is None:
            raise RuntimeError(
                "RuntimeContext.spec is required for tensor operations. "
                "Pass spec=graph.spec when creating RuntimeContext."
            )
        from kernbench.policy.address.allocator import AddressConfig, PEMemAllocator

        system_spec = self.spec.get("system", {})
        cube_spec = self.spec.get("cube", {})
        memory_map = cube_spec.get("memory_map", {})
        tcm_attrs = (
            cube_spec.get("pe_template", {})
            .get("components", {})
            .get("pe_tcm", {})
            .get("attrs", {})
        )

        sips_spec = system_spec.get("sips", {})
        sip_count = sips_spec.get("count", 1)
        cubes_per_sip = sips_spec.get("cubes_per_sip", 16)

        layout = cube_spec.get("pe_layout", {})
        corners = layout.get("corners", ["NW", "NE", "SW", "SE"])
        pes_per_cube = layout.get("pe_per_corner", 2) * len(corners)

        hbm_gb = memory_map.get("hbm_total_gb_per_cube", 48)
        hbm_slices = memory_map.get("hbm_slices_per_cube", 8)
        tcm_mb = tcm_attrs.get("size_mb", 16)

        mib = 1 << 20
        gib = 1 << 30
        cfg = AddressConfig(
            sip_count=sip_count,
            cubes_per_sip=cubes_per_sip,
            pes_per_cube=pes_per_cube,
            hbm_bytes_per_cube=hbm_gb * gib,
            hbm_slices_per_cube=hbm_slices,
            tcm_bytes_per_pe=tcm_mb * mib,
            tcm_scheduler_reserved_bytes=4 * mib,
            sram_bytes_per_cube=32 * mib,
        )

        # Remember the geometry; _create_tensor reads it when resolving DP policies.
        self._pes_per_cube = pes_per_cube
        self._num_cubes = cubes_per_sip
        self._num_sips = sip_count

        for sip_id in range(sip_count):
            for cube_id in range(cubes_per_sip):
                for pe_id in range(pes_per_cube):
                    flat_idx = (sip_id * cubes_per_sip + cube_id) * pes_per_cube + pe_id
                    self._allocators[flat_idx] = PEMemAllocator(
                        rack_id=0, sip_id=sip_id, cube_id=cube_id, pe_id=pe_id, cfg=cfg,
                    )
        return self._allocators

    def _next_tensor_name(self) -> str:
        """Return the next auto-generated tensor name (``t1``, ``t2``, ...)."""
        self._tensor_counter += 1
        return f"t{self._tensor_counter}"

    def zeros(
        self,
        shape: tuple[int, ...],
        dtype: str = "f16",
        *,
        placement: list | None = None,
        dp: Any = None,
        name: str | None = None,
    ):
        """Create a tensor and write it with the ``"zero"`` pattern (like torch.zeros)."""
        return self._create_tensor(shape, dtype, placement, name, pattern="zero", dp=dp)

    def empty(
        self,
        shape: tuple[int, ...],
        dtype: str = "f16",
        *,
        placement: list | None = None,
        dp: Any = None,
        name: str | None = None,
    ):
        """Allocate a tensor without submitting any write (like torch.empty)."""
        return self._create_tensor(shape, dtype, placement, name, pattern=None, dp=dp)

    def _create_tensor(
        self,
        shape: tuple[int, ...],
        dtype: str,
        placement: list | None,
        name: str | None,
        pattern: str | None,
        dp: Any = None,
    ):
        """Build a Tensor, allocate its shards and optionally write a pattern.

        A ``dp`` that is a DPPolicy takes priority over ``placement``; with
        neither, the whole tensor becomes one shard on PE index 0. When
        ``pattern`` is not None, one MemoryWriteMsg per shard is submitted
        and waited on before returning.
        """
        from kernbench.policy.placement.dp import DPPolicy, ShardSpec, resolve_dp_policy
        from kernbench.runtime_api.kernel import MemoryWriteMsg
        from kernbench.runtime_api.tensor import Tensor, deploy_tensor, dtype_itemsize

        tensor_name = name or self._next_tensor_name()
        tensor = Tensor(shape=shape, dtype=dtype, name=tensor_name)

        dp_policy: DPPolicy | None = None
        policy_given = dp is not None and isinstance(dp, DPPolicy)
        if policy_given:
            dp_policy = dp
            # Populates the geometry attributes read just below.
            self._ensure_allocators()
            placement = resolve_dp_policy(
                dp,
                shape=(shape[0], shape[1]),
                itemsize=dtype_itemsize(dtype),
                num_pe=self._pes_per_cube,
                num_cubes=self._num_sips * self._num_cubes,
            )
        elif placement is None:
            placement = [ShardSpec(pe_index=0, offset_bytes=0, nbytes=tensor.nbytes)]

        # Infer target_pe from placement: multi-PE → "all", single PE → pe_index.
        distinct_pes = {spec.pe_index for spec in placement}
        if len(distinct_pes) > 1:
            target_pe: int | str = "all"
        else:
            target_pe = next(iter(distinct_pes))
        tensor.to(placement=placement, target_pe=target_pe, dp_policy=dp_policy)

        # Allocate PAs via PEMemAllocator.
        tensor._handle = deploy_tensor(
            name=tensor_name,
            shape=shape,
            dtype=dtype,
            placement=placement,
            allocators=self._ensure_allocators(),
        )

        if pattern is None:
            return tensor

        # Submit one MemoryWriteMsg per shard (deploy data to device).
        for shard in tensor._handle.shards:
            write_msg = MemoryWriteMsg(
                correlation_id=self.correlation_id,
                request_id=f"deploy_{tensor_name}_pe{shard.pe}",
                dst_sip=shard.sip, dst_cube=shard.cube, dst_pe=shard.pe,
                dst_pa=shard.pa, nbytes=shard.nbytes, pattern=pattern,
                target_cubes=(shard.cube,), target_pe=shard.pe,
            )
            write_handle = self.submit(write_msg)
            self.wait(write_handle, _meta={
                "phase": "memory_write", "name": tensor_name,
                "sip": shard.sip, "cube": shard.cube, "pe": shard.pe,
                "nbytes": shard.nbytes,
            })
        return tensor

    def launch(
        self,
        kernel_name: str,
        kernel_fn: Any,
        *args: Any,
        **kwargs: Any,
    ) -> RequestHandle:
        """Register and launch a kernel (like a fused torch op).

        Positional args: Tensor objects become TensorArg, int/float become
        ScalarArg; anything else is skipped. Keyword args: int/float values
        become ScalarArg (name is discarded, order preserved). The launch is
        submitted and waited on before its handle is returned.
        """
        from kernbench.runtime_api.kernel import (
            KernelLaunchMsg,
            KernelRef,
            ScalarArg,
        )
        from kernbench.runtime_api.tensor import Tensor
        from kernbench.triton_emu.registry import register_kernel

        # Registration is treated as idempotent: a ValueError is ignored.
        try:
            register_kernel(kernel_name, kernel_fn)
        except ValueError:
            pass

        kernel_args: list = []
        target_pe: int | str = 0

        for arg in args:
            if isinstance(arg, Tensor):
                kernel_args.append(arg.to_tensor_arg())
                # Infer target_pe from tensor DP metadata; "all" is sticky.
                meta = arg._dp_metadata
                if meta is not None:
                    wanted = meta.target_pe
                    if wanted == "all":
                        target_pe = "all"
                    elif isinstance(wanted, int) and target_pe != "all":
                        target_pe = wanted
            elif isinstance(arg, (int, float)):
                scalar_dtype = "f32" if isinstance(arg, float) else "i32"
                kernel_args.append(ScalarArg(dtype=scalar_dtype, value=arg))

        for value in kwargs.values():
            if isinstance(value, (int, float)):
                scalar_dtype = "f32" if isinstance(value, float) else "i32"
                kernel_args.append(ScalarArg(dtype=scalar_dtype, value=value))

        # Determine target cubes from all tensor shards; default to cube 0.
        cube_set: set[int] = {
            shard.cube
            for arg in args
            if isinstance(arg, Tensor) and arg._handle is not None
            for shard in arg._handle.shards
        }
        target_cubes = tuple(sorted(cube_set)) if cube_set else (0,)

        # Collect scalar values for GEMM FLOP calculation.
        scalar_vals = [ka.value for ka in kernel_args if hasattr(ka, "value")]

        launch_msg = KernelLaunchMsg(
            correlation_id=self.correlation_id,
            request_id=kernel_name,
            kernel_ref=KernelRef(name=kernel_name, kind="builtin"),
            args=tuple(kernel_args),
            target_cubes=target_cubes,
            target_pe=target_pe,
        )
        launch_handle = self.submit(launch_msg)
        self.wait(launch_handle, _meta={
            "phase": "kernel", "name": kernel_name,
            "target_pe": target_pe, "scalars": scalar_vals,
        })
        return launch_handle
|
||||||
@@ -0,0 +1,123 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Literal, TypeAlias
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class MemoryWriteMsg:
    """Immutable request to write ``nbytes`` at ``dst_pa`` on a destination PE."""

    # Identification
    correlation_id: str
    request_id: str
    # Destination coordinates and physical address
    dst_sip: int
    dst_cube: int
    dst_pe: int
    dst_pa: int
    nbytes: int
    # Data source: a named fill pattern or a host buffer reference
    src_kind: Literal["pattern", "host_buffer_ref"] = "pattern"
    pattern: str | None = None
    # Targeting; "all" is the default for both
    target_cubes: tuple[int, ...] | Literal["all"] = "all"
    target_pe: int | Literal["all"] = "all"
    # Message discriminator
    msg_type: Literal["memory_write"] = "memory_write"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class MemoryReadMsg:
    """Immutable request to read ``nbytes`` from ``src_pa`` on a source PE."""

    # Identification
    correlation_id: str
    request_id: str
    # Source coordinates and physical address
    src_sip: int
    src_cube: int
    src_pe: int
    src_pa: int
    nbytes: int
    # Targeting; "all" is the default for both
    target_cubes: tuple[int, ...] | Literal["all"] = "all"
    target_pe: int | Literal["all"] = "all"
    # Message discriminator
    msg_type: Literal["memory_read"] = "memory_read"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class KernelRef:
    """Pointer to the kernel a launch should run.

    A launch message never carries source code or IR. It either names a
    binary that was already written to device memory (via MemoryWriteMsg) or
    selects one of the simulator's built-in timing models:

    - ``kind="deployed"``: binary pre-deployed to HBM/SRAM at ``deploy_pa``.
    - ``kind="builtin"``: built-in timing model, identified by ``name``.
    """

    name: str
    kind: Literal["deployed", "builtin"]
    # Location and size of the deployed binary.
    deploy_pa: int | None = None
    deploy_sip: int = 0
    deploy_cube: int = 0
    deploy_pe: int = 0
    nbytes_code: int = 0
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class TensorArgShard:
    """One shard of a tensor kernel argument: where it lives and how big it is."""

    # Device coordinates of the shard
    sip: int
    cube: int
    pe: int
    # Physical address and size of the shard
    pa: int
    nbytes: int
    # Byte offset of this shard (copied from the placement spec)
    offset_bytes: int
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class TensorArg:
    """Kernel argument that refers to a tensor through its shards."""

    shards: tuple[TensorArgShard, ...]
    # Discriminator separating tensor arguments from scalar ones.
    arg_kind: Literal["tensor"] = "tensor"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class ScalarArg:
    """Kernel argument carrying a single numeric value and its dtype tag."""

    dtype: str
    value: float | int
    # Discriminator separating scalar arguments from tensor ones.
    arg_kind: Literal["scalar"] = "scalar"
|
||||||
|
|
||||||
|
|
||||||
|
# Any argument accepted in KernelLaunchMsg.args: a tensor reference or a scalar.
KernelArg: TypeAlias = TensorArg | ScalarArg
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class KernelLaunchMsg:
    """Immutable request to launch the kernel named by ``kernel_ref``."""

    # Identification
    correlation_id: str
    request_id: str
    # What to run and with which arguments (order is preserved)
    kernel_ref: KernelRef
    args: tuple[KernelArg, ...]
    # Targeting; "all" is the default for both
    target_cubes: tuple[int, ...] | Literal["all"] = "all"
    target_pe: int | Literal["all"] = "all"
    # Message discriminator
    msg_type: Literal["kernel_launch"] = "kernel_launch"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class ResponseMsg:
    """Device→Host reply that reports the outcome of a PE execution."""

    # Identification of the request being answered
    correlation_id: str
    request_id: str
    # Origin of the response
    src_cube: int
    src_pe: int
    # Whether the execution succeeded
    success: bool
    # Message discriminator
    msg_type: Literal["response"] = "response"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class PeDmaMsg:
    """Transfer request injected by the host directly at the PE_DMA level.

    The probe utility uses it to measure PE→HBM latency without going
    through the full PE_CPU → scheduler → DMA pipeline.
    """

    # Identification
    correlation_id: str
    request_id: str
    # PE that issues the DMA
    src_sip: int
    src_cube: int
    src_pe: int
    # Target physical address and transfer size
    dst_pa: int
    nbytes: int
    # Direction flag; defaults to False (not a write)
    is_write: bool = False
    # Message discriminator
    msg_type: Literal["pe_dma"] = "pe_dma"
|
||||||
@@ -0,0 +1,166 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import math
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Literal
|
||||||
|
|
||||||
|
from kernbench.policy.address.allocator import PEMemAllocator
|
||||||
|
from kernbench.policy.placement.dp import DPPolicy, ShardSpec
|
||||||
|
from kernbench.runtime_api.kernel import TensorArg, TensorArgShard
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class TensorShard:
    """One allocated piece of a deployed tensor."""

    # Device coordinates of the allocator that produced the shard
    sip: int
    cube: int
    pe: int
    # Encoded physical address and size of the shard
    pa: int
    nbytes: int
    # Byte offset taken from the placement spec
    offset_bytes: int
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class TensorHandle:
    """Immutable record of a deployed tensor and the shards backing it."""

    name: str
    shape: tuple[int, ...]
    dtype: str
    itemsize: int
    shards: tuple[TensorShard, ...]

    @property
    def nbytes(self) -> int:
        """Total size in bytes: element count times ``itemsize``."""
        element_count = math.prod(self.shape)
        return element_count * self.itemsize
|
||||||
|
|
||||||
|
|
||||||
|
# Bytes per element for every accepted dtype spelling.
_DTYPE_ITEMSIZE = {
    # 16-bit floats
    "fp16": 2,
    "float16": 2,
    "f16": 2,
    "bf16": 2,
    # 32-bit floats
    "fp32": 4,
    "float32": 4,
    "f32": 4,
    # integers
    "int8": 1,
    "i8": 1,
    "int16": 2,
    "i16": 2,
    "int32": 4,
    "i32": 4,
}


def dtype_itemsize(dtype: str) -> int:
    """Return the element size in bytes for ``dtype``.

    Raises:
        ValueError: if ``dtype`` is not one of the known spellings.
    """
    size = _DTYPE_ITEMSIZE.get(dtype)
    if size is None:
        raise ValueError(f"unsupported dtype: {dtype}")
    return size
|
||||||
|
|
||||||
|
|
||||||
|
def deploy_tensor(
    *,
    name: str,
    shape: tuple[int, ...],
    dtype: str,
    placement: list[ShardSpec],
    allocators: dict[int, PEMemAllocator],
    mem_kind: Literal["hbm", "tcm"] = "hbm",
) -> TensorHandle:
    """Allocate memory for every shard in ``placement`` and return a handle.

    Each ShardSpec selects an allocator by ``pe_index``. The shard is
    allocated from HBM when ``mem_kind`` is ``"hbm"`` and from TCM otherwise,
    and its encoded physical address is stored in the resulting TensorShard.

    Raises:
        ValueError: if ``dtype`` is unsupported (checked before any allocation).
    """
    isize = dtype_itemsize(dtype)
    use_hbm = mem_kind == "hbm"

    def _allocate(spec: ShardSpec) -> TensorShard:
        allocator = allocators[spec.pe_index]
        if use_hbm:
            address = allocator.alloc_hbm(spec.nbytes)
        else:
            address = allocator.alloc_tcm(spec.nbytes)
        return TensorShard(
            sip=allocator._sip_id,
            cube=allocator._cube_id,
            pe=allocator._pe_id,
            pa=address.encode(),
            nbytes=spec.nbytes,
            offset_bytes=spec.offset_bytes,
        )

    # The comprehension allocates in placement order, one shard at a time.
    allocated = tuple([_allocate(spec) for spec in placement])
    return TensorHandle(
        name=name,
        shape=shape,
        dtype=dtype,
        itemsize=isize,
        shards=allocated,
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ── PyTorch-like Tensor API ──────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class DPMetadata:
    """Data-parallel placement details kept on ``Tensor._dp_metadata``."""

    # Shard layout of the tensor.
    placement: list[ShardSpec]
    # Policy object the placement came from, when there was one.
    dp_policy: DPPolicy | None = None
    sip: int = 0
    cube: int = 0
    # int → single PE, "all" → all PEs
    target_pe: int | str = 0
|
||||||
|
|
||||||
|
|
||||||
|
class Tensor:
    """PyTorch-like tensor object used by benchmark code.

    Usage::

        a = ctx.zeros((M, K), dtype="f16")
        a = ctx.zeros((M, K), dtype="f16", placement=dp.replicate(num_pe=8))
        ctx.launch("kernel_name", kernel_fn, a, b, out, M=M, K=K)
    """

    def __init__(
        self,
        shape: tuple[int, ...],
        dtype: str = "f16",
        name: str = "",
    ) -> None:
        """Create an undeployed tensor; placement and handle are filled in later."""
        self.shape = shape
        self.dtype = dtype
        self.name = name
        # Set by to(); describes the data-parallel placement.
        self._dp_metadata: DPMetadata | None = None
        # Assigned after deployment; holds the allocated shards.
        self._handle: TensorHandle | None = None

    @property
    def itemsize(self) -> int:
        """Size of one element in bytes, derived from ``dtype``."""
        return dtype_itemsize(self.dtype)

    @property
    def nbytes(self) -> int:
        """Total size in bytes: element count times ``itemsize``."""
        element_count = math.prod(self.shape)
        return element_count * self.itemsize

    @property
    def pa(self) -> int:
        """Primary PA (first shard). Used as kernel pointer argument.

        Raises:
            RuntimeError: if the tensor has no handle or the handle has no shards.
        """
        handle = self._handle
        if handle is not None and handle.shards:
            return handle.shards[0].pa
        raise RuntimeError(f"Tensor '{self.name}' is not deployed yet")

    def to(
        self,
        placement: list[ShardSpec] | None = None,
        *,
        dp_policy: DPPolicy | None = None,
        sip: int = 0,
        cube: int = 0,
        target_pe: int | str = 0,
    ) -> Tensor:
        """Record DP placement metadata on this tensor and return ``self``.

        Without an explicit ``placement`` the whole tensor is one shard on
        PE index 0.
        """
        if placement is None:
            whole_tensor = ShardSpec(pe_index=0, offset_bytes=0, nbytes=self.nbytes)
            placement = [whole_tensor]
        self._dp_metadata = DPMetadata(
            placement=placement,
            dp_policy=dp_policy,
            sip=sip,
            cube=cube,
            target_pe=target_pe,
        )
        return self

    def to_tensor_arg(self) -> TensorArg:
        """Convert the deployed shards into a TensorArg for KernelLaunchMsg.

        Raises:
            RuntimeError: if the tensor has not been deployed yet.
        """
        handle = self._handle
        if handle is None:
            raise RuntimeError(f"Tensor '{self.name}' is not deployed yet")
        arg_shards = tuple(
            TensorArgShard(
                sip=shard.sip,
                cube=shard.cube,
                pe=shard.pe,
                pa=shard.pa,
                nbytes=shard.nbytes,
                offset_bytes=shard.offset_bytes,
            )
            for shard in handle.shards
        )
        return TensorArg(shards=arg_shards)
|
||||||
@@ -0,0 +1,71 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from kernbench.common.types import Completion, Trace
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class BenchResult:
    """Outcome of one benchmark run."""

    completion: Completion
    correlation_id: str
    # Trace of the request that decided the result, if the engine produced one.
    trace: Trace | None = None
    # Trace entries collected during the run, or None when there are none.
    traces: list[dict] | None = None

    def summary_text(self) -> str:
        """Return a one-line ``[OK]`` / ``[FAIL:<code>]`` summary.

        A missing error code is shown as ``ERROR``; a missing message leaves
        no trailing whitespace.
        """
        outcome = self.completion
        if outcome.ok:
            return f"[OK] correlation_id={self.correlation_id}"
        code = outcome.error_code or "ERROR"
        msg = outcome.error_message or ""
        line = f"[FAIL:{code}] correlation_id={self.correlation_id} {msg}"
        return line.rstrip()
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class DeviceSelector:
    """Selects which device(s) a benchmark targets.

    Supported values of ``raw``:
      - "all"     : all SIPs in the tray topology
      - "sip:<N>" : a single SIP index
    """

    raw: str  # "all" or "sip:<N>"

    @property
    def is_all(self) -> bool:
        """True when the selector is exactly ``"all"``."""
        return self.raw == "all"

    @property
    def sip_index(self) -> int:
        """Return N for a ``"sip:<N>"`` selector.

        Raises:
            ValueError: for the "all" selector, or when ``raw`` is malformed.
        """
        if self.is_all:
            raise ValueError("DeviceSelector is 'all'; no single sip_index.")
        parsed = re.fullmatch(r"sip:(\d+)", self.raw)
        if parsed:
            return int(parsed.group(1))
        raise ValueError(
            f"Invalid device '{self.raw}'. Expected 'all' or 'sip:<N>' (e.g., sip:0)."
        )
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_device(raw: str | None) -> DeviceSelector:
    """Turn the CLI ``--device`` string into a DeviceSelector.

    Semantics:
      - omitted, empty or whitespace-only -> "all"
      - otherwise the value is stripped and lower-cased and must be
        "all" or "sip:<N>"

    Raises:
        ValueError: if the normalized value matches neither form.
    """
    if raw is None:
        return DeviceSelector(raw="all")
    trimmed = raw.strip()
    if trimmed == "":
        return DeviceSelector(raw="all")

    raw = trimmed.lower()
    if raw == "all":
        return DeviceSelector(raw="all")
    if re.fullmatch(r"sip:(\d+)", raw):
        return DeviceSelector(raw=raw)
    raise ValueError(f"Invalid device '{raw}'. Expected 'all' or 'sip:<N>' (e.g., sip:0).")
|
||||||
@@ -0,0 +1,31 @@
|
|||||||
|
# kernbench/engine/dummy.py
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from kernbench.common.types import Completion, RequestHandle, SimEngine, Trace
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class DummyEngine(SimEngine):
    """Engine stand-in that completes every request successfully at submit time."""

    topology: object
    device_raw: str
    # Number of requests submitted so far; used to number handles.
    _n: int = 0
    # Always replaced with a fresh dict in __post_init__.
    _store: dict[str, tuple[Completion, Trace | None]] = None  # type: ignore

    def __post_init__(self) -> None:
        """Start every instance with its own empty result store."""
        self._store = {}

    def submit(self, request: Any) -> RequestHandle:
        """Record an OK completion for ``request`` and return a new handle."""
        self._n += 1
        handle = RequestHandle(f"h{self._n}")
        # Request processing / simulation / scheduling would be performed here.
        trace = {"request": request, "device": self.device_raw}
        self._store[str(handle)] = (Completion(ok=True), trace)
        return handle

    def get_completion(self, handle: RequestHandle) -> tuple[Completion, Trace | None]:
        """Return the ``(completion, trace)`` pair stored for ``handle``."""
        key = str(handle)
        return self._store[key]

    def wait(self, handle: RequestHandle) -> None:
        """Do nothing: results are already stored when ``submit`` returns."""
        pass
|
||||||
@@ -0,0 +1,298 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
from kernbench.common.types import Completion, RequestHandle, Trace
|
||||||
|
import kernbench.components.impls # noqa: F401 — registers built-in implementations
|
||||||
|
from kernbench.components.base import ComponentBase, ComponentRegistry
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
from kernbench.policy.address.phyaddr import PhysAddr
|
||||||
|
from kernbench.policy.routing.router import AddressResolver, PathRouter
|
||||||
|
from kernbench.runtime_api.kernel import KernelLaunchMsg, MemoryReadMsg, MemoryWriteMsg, PeDmaMsg
|
||||||
|
from kernbench.sim_engine.transaction import Transaction
|
||||||
|
from kernbench.topology.types import Edge, TopologyGraph
|
||||||
|
|
||||||
|
|
||||||
|
class GraphEngine:
|
||||||
|
"""simpy-based discrete-event simulation engine.
|
||||||
|
|
||||||
|
Phase B: engine injects a Transaction into the PCIE_EP host queue for
|
||||||
|
each request. Components handle their own routing:
|
||||||
|
Path 1: PCIE_EP → IO_CPU (engine-computed path, pre-loaded in Transaction)
|
||||||
|
Path 2: IO_CPU → M_CPU (IO_CPU dispatches, fire-and-forget callback)
|
||||||
|
Path 3: M_CPU.DMA → HBM (M_CPU dispatches, fire-and-forget callback)
|
||||||
|
|
||||||
|
Component implementations are DI-injectable via component_overrides (ADR-0007 D3).
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
    self,
    graph: TopologyGraph,
    *,
    component_overrides: dict[str, type[ComponentBase]] | None = None,
) -> None:
    """Build the simulation: components, ports, wires and host queues.

    Args:
        graph: topology whose nodes become components and whose edges
            become stores with a propagation-delay wire process.
        component_overrides: optional component implementations handed to
            ComponentRegistry.create (ADR-0007 D3).
    """
    self._env = simpy.Environment()
    self._resolver = AddressResolver(graph)
    self._router = PathRouter(graph)
    self._nodes = graph.nodes
    self._edge_map: dict[tuple[str, str], Edge] = {(e.src, e.dst): e for e in graph.edges}
    system_spec = graph.spec.get("system", {})
    self._ns_per_mm: float = system_spec.get("ns_per_mm", 0.01)
    self._results: dict[str, tuple[Completion, Trace]] = {}
    self._events: dict[str, simpy.Event] = {}
    self._counter = 0

    overrides = component_overrides or {}
    ctx = ComponentContext(
        router=self._router,
        resolver=self._resolver,
        positions={nid: n.pos_mm for nid, n in graph.nodes.items()},
        ns_per_mm=self._ns_per_mm,
        edge_map=self._edge_map,
        spec=graph.spec,
    )
    self._components: dict[str, ComponentBase] = {}
    for node_id, node in graph.nodes.items():
        self._components[node_id] = ComponentRegistry.create(node, overrides, ctx)

    # Only edges whose two endpoints both have a component get wired.
    linked = []
    for edge in graph.edges:
        producer = self._components.get(edge.src)
        consumer = self._components.get(edge.dst)
        if producer is None or consumer is None:
            continue
        linked.append((edge, producer, consumer))

    # Wire ports: one Store per directed edge (ADR-0015 D1).
    for edge, producer, consumer in linked:
        channel: simpy.Store = simpy.Store(self._env)
        producer.out_ports[edge.dst] = channel
        consumer.in_ports[edge.src] = channel

    # Wire processes: propagation delay per edge (ADR-0015 D2).
    # Cut-through (wormhole) model: wires apply propagation only.
    # Serialization (drain) is computed per-path and applied once at the terminal.
    # Ports are looked up again only after every store exists.
    for edge, producer, consumer in linked:
        prop_ns = edge.distance_mm * self._ns_per_mm
        relay = self._wire(producer.out_ports[edge.dst], consumer.in_ports[edge.src], prop_ns)
        self._env.process(relay)

    # Attach host queues to PCIE_EP in_ports before start() (ADR-0015 D3).
    self._host_queues: dict[str, simpy.Store] = {}
    for pcie_ep_id in self._resolver.find_all_pcie_eps():
        ep_queue: simpy.Store = simpy.Store(self._env)
        self._components[pcie_ep_id].in_ports["host"] = ep_queue
        self._host_queues[pcie_ep_id] = ep_queue

    # Attach host queues to PE_DMA nodes for direct PE DMA injection.
    self._pe_dma_queues: dict[str, simpy.Store] = {}
    for node_id, node in graph.nodes.items():
        if node.kind != "pe_dma":
            continue
        dma_queue = simpy.Store(self._env)
        self._components[node_id].in_ports["host"] = dma_queue
        self._pe_dma_queues[node_id] = dma_queue

    # Start components after all ports are wired (ADR-0015 D3).
    for component in self._components.values():
        component.start(self._env)
|
||||||
|
|
||||||
|
def submit(self, request: Any) -> RequestHandle:
    """Register *request* with the engine and return a handle for it.

    A fresh handle ``h<N>`` is allocated from the running counter, a
    completion event is created on the environment and stored under the
    handle's string form, and ``self._process`` is scheduled on the
    environment to drive the request.  The call itself does not run the
    simulation; use ``wait`` for that.
    """
    self._counter += 1
    handle = RequestHandle(f"h{self._counter}")
    key = str(handle)

    completion_event = self._env.event()
    self._events[key] = completion_event
    self._env.process(self._process(key, request, completion_event))
    return handle
|
||||||
|
|
||||||
|
def wait(self, handle: RequestHandle) -> None:
    """Run the environment until the request behind *handle* has completed.

    Returns immediately when the stored completion event is already
    triggered.  Raises ``KeyError`` if the handle was never submitted.
    """
    pending = self._events[str(handle)]
    if pending.triggered:
        return
    self._env.run(until=pending)
|
||||||
|
|
||||||
|
def get_completion(self, handle: RequestHandle) -> tuple[Completion, Trace | None]:
    """Return the ``(completion, trace)`` pair recorded for *handle*.

    Raises ``KeyError`` when no result has been stored under the handle's
    string form (for example, the request has not finished yet).
    """
    key = str(handle)
    return self._results[key]
|
||||||
|
|
||||||
|
# ── internal ────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _wire(self, out_port: simpy.Store, in_port: simpy.Store, prop_ns: float):
    """SimPy process: forward messages from *out_port* to *in_port* forever.

    Each message taken from *out_port* is held for *prop_ns* (skipped when
    the delay is not positive) and then put into *in_port*.

    Cut-through (wormhole) model: only propagation is applied here;
    serialization (drain) is computed per-path and applied once at the
    terminal component, not at every wire hop.
    """
    delayed = prop_ns > 0
    while True:
        message = yield out_port.get()
        if delayed:
            yield self._env.timeout(prop_ns)
        yield in_port.put(message)
|
||||||
|
|
||||||
|
def _process(self, key: str, request: Any, done: simpy.Event):
    """SimPy process that drives one submitted request to completion.

    ``PeDmaMsg`` requests are delegated to ``_process_pe_dma``.  For every
    other request the entry points from ``_entry_points`` decide the shape:

    * no entry points: an empty trace is stored and *done* succeeds at once;
    * one entry point: a single Transaction is injected at that PCIe EP;
    * several: one Transaction per entry is injected and all are awaited
      (ADR-0007), with ``pe_exec_ns`` aggregated as the maximum.

    The result ``(Completion(ok=True), trace)`` is stored under *key* before
    *done* is succeeded.
    """
    if isinstance(request, PeDmaMsg):
        yield from self._process_pe_dma(key, request, done)
        return

    entries = self._entry_points(request)
    if not entries:
        # Nothing to inject: report a zero-length, zero-byte completion.
        self._results[key] = (Completion(ok=True), {"total_ns": 0.0, "nbytes": 0})
        done.succeed()
        return

    started_at = self._env.now
    total_nbytes = 0
    root_txn: Transaction | None = None

    if len(entries) == 1:
        # Single-SIP: direct inject (common path, no extra events).
        pcie_ep_id, io_cpu_id, total_nbytes = entries[0]
        route = self._router.find_node_path(pcie_ep_id, io_cpu_id)
        leg_done = self._env.event()
        root_txn = Transaction(
            request=request, path=route, step=0, nbytes=total_nbytes, done=leg_done,
        )
        yield self._host_queues[pcie_ep_id].put(root_txn)
        yield leg_done
    else:
        # Multi-SIP: inject per SIP, aggregate completions (ADR-0007).
        pending: list[simpy.Event] = []
        sub_txns: list[Transaction] = []
        for pcie_ep_id, io_cpu_id, nbytes in entries:
            # The reported size is the largest per-SIP payload, not the sum.
            total_nbytes = max(total_nbytes, nbytes)
            route = self._router.find_node_path(pcie_ep_id, io_cpu_id)
            leg_done = self._env.event()
            sub = Transaction(
                request=request, path=route, step=0, nbytes=nbytes, done=leg_done,
            )
            yield self._host_queues[pcie_ep_id].put(sub)
            pending.append(leg_done)
            sub_txns.append(sub)

        for leg_done in pending:
            yield leg_done

        # Aggregate pe_exec_ns across SIPs (max); the first sub-transaction
        # becomes the carrier of the aggregated metrics.
        pe_vals = [
            value
            for value in (sub.result_data.get("pe_exec_ns") for sub in sub_txns)
            if value is not None
        ]
        if pe_vals:
            root_txn = sub_txns[0]
            root_txn.result_data["pe_exec_ns"] = max(pe_vals)

    result_trace: dict[str, Any] = {
        "total_ns": self._env.now - started_at,
        "nbytes": total_nbytes,
    }
    if root_txn is not None and root_txn.result_data:
        result_trace.update(root_txn.result_data)

    self._results[key] = (Completion(ok=True), result_trace)
    done.succeed()
|
||||||
|
|
||||||
|
def _process_pe_dma(self, key: str, request: PeDmaMsg, done: simpy.Event):
    """Inject a Transaction directly at PE_DMA for PE→HBM latency measurement.

    The source PE is named from the request's ``src_sip`` / ``src_cube`` /
    ``src_pe`` fields, the destination node is resolved from ``dst_pa``, and
    the path's drain time is attached to the Transaction.  After the leg
    completes, the stored trace holds the measured ``total_ns``, the
    analytic ``formula_ns`` and ``nbytes``; then *done* is succeeded.
    """
    pe_prefix = f"sip{request.src_sip}.cube{request.src_cube}.pe{request.src_pe}"
    dma_node_id = f"{pe_prefix}.pe_dma"
    target_node = self._resolver.resolve(PhysAddr.decode(request.dst_pa))
    route = self._router.find_path(pe_prefix, target_node)
    drain_ns = self._path_drain_ns(route, request.nbytes)

    injected_at = self._env.now
    leg_done = self._env.event()
    txn = Transaction(
        request=request,
        path=route,
        step=0,
        nbytes=request.nbytes,
        done=leg_done,
        drain_ns=drain_ns,
    )
    yield self._pe_dma_queues[dma_node_id].put(txn)
    yield leg_done

    trace = {
        "total_ns": self._env.now - injected_at,
        "formula_ns": self._formula_latency(route, request.nbytes),
        "nbytes": request.nbytes,
    }
    self._results[key] = (Completion(ok=True), trace)
    done.succeed()
|
||||||
|
|
||||||
|
def _path_drain_ns(self, path: list[str], nbytes: int) -> float:
    """Wormhole drain time: nbytes / bottleneck_bw along path.

    The bottleneck is the smallest ``bw_gbs`` among the consecutive-hop
    edges found in ``self._edge_map``.  Hops without an edge entry, or whose
    edge has no (or zero) bandwidth, are ignored.  Returns ``0.0`` when no
    hop contributes a bandwidth.
    """
    hop_edges = (self._edge_map.get(hop) for hop in zip(path, path[1:]))
    bottleneck = min(
        (edge.bw_gbs for edge in hop_edges if edge and edge.bw_gbs),
        default=float("inf"),
    )
    if bottleneck == float("inf"):
        return 0.0
    return nbytes / bottleneck
|
||||||
|
|
||||||
|
def _formula_latency(self, path: list[str], nbytes: int) -> float:
    """Lower-bound formula latency (ADR-0015 D7).

    formula = Σ(wire propagation) + Σ(component overhead_ns) + drain_ns

    Phase 0: formula == actual (no contention).
    Phase 1+: formula <= actual (contention adds queueing).

    Hops missing from ``self._edge_map`` and nodes missing from
    ``self._nodes`` contribute nothing; a node without an ``overhead_ns``
    attribute counts as zero.
    """
    latency = 0.0

    # Wire propagation: distance of every known hop scaled by ns-per-mm.
    for hop in zip(path, path[1:]):
        wire = self._edge_map.get(hop)
        if wire:
            latency += wire.distance_mm * self._ns_per_mm

    # Fixed per-component overhead for every known node on the path.
    for node_id in path:
        component = self._nodes.get(node_id)
        if component:
            latency += float(component.attrs.get("overhead_ns", 0.0))

    # Drain is applied once for the whole path.
    latency += self._path_drain_ns(path, nbytes)
    return latency
|
||||||
|
|
||||||
|
def _entry_points(self, request: Any) -> list[tuple[str, str, int]]:
    """Return list of (pcie_ep_id, io_cpu_id, nbytes) per target SIP.

    For Memory{Write,Read}: single SIP entry (``dst_sip`` for writes,
    ``src_sip`` for reads).
    For KernelLaunchMsg: one entry per distinct SIP in tensor shards, in
    first-seen order; the entry's size is that of the first shard seen for
    the SIP.  Non-tensor arguments are ignored.

    Raises:
        ValueError: for any other request type.
    """

    def endpoints(sip):
        # PCIe endpoint is looked up before the IO CPU, as in each entry tuple.
        return self._resolver.find_pcie_ep(sip), self._resolver.find_io_cpu(sip)

    if isinstance(request, MemoryWriteMsg):
        return [(*endpoints(request.dst_sip), request.nbytes)]

    if isinstance(request, MemoryReadMsg):
        return [(*endpoints(request.src_sip), request.nbytes)]

    if isinstance(request, KernelLaunchMsg):
        visited: set[int] = set()
        per_sip: list[tuple[str, str, int]] = []
        for arg in request.args:
            if arg.arg_kind != "tensor":
                continue
            for shard in arg.shards:
                if shard.sip in visited:
                    continue
                visited.add(shard.sip)
                per_sip.append((*endpoints(shard.sip), shard.nbytes))
        return per_sip

    raise ValueError(f"unsupported request type: {type(request)}")
|
||||||
@@ -0,0 +1,49 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Transaction:
    """In-flight request traversing the device fabric hop-by-hop (ADR-0015 D4).

    A Transaction carries a host request through one leg of the device fabric.
    Each component on the path reads from its in_port, processes (overhead_ns
    or other latency), and advances the Transaction to the next hop via
    out_port.  Wire processes (ADR-0015 D2) model propagation delay between
    hops.

    Multi-leg flows (e.g. IO_CPU → M_CPU as leg 1, M_CPU.DMA → HBM as leg 2)
    use separate Transactions: the terminal component of leg 1 creates leg 2
    and waits for leg 2's done before succeeding leg 1's done.
    """

    # Original host request (MemoryReadMsg, KernelLaunchMsg, …).
    request: Any
    # Node-id sequence for this leg.
    path: list[str]
    # Index of the component currently holding this Transaction.
    step: int
    # Payload size (bytes).
    nbytes: int
    # Succeeded when this leg completes.
    done: simpy.Event
    # Wormhole drain time: nbytes / bottleneck_bw (applied once at terminal).
    drain_ns: float = 0.0
    # True when carrying ResponseMsg on reverse path.
    is_response: bool = False
    # PE-level metrics (pe_exec_ns, etc.).
    result_data: dict[str, Any] = field(default_factory=dict)

    @property
    def next_hop(self) -> str | None:
        """Node id of the next component, or None if this is the terminal hop."""
        following = self.step + 1
        if following < len(self.path):
            return self.path[following]
        return None

    def advance(self) -> Transaction:
        """Return a copy of this Transaction advanced one step along the path.

        Only ``step`` differs; ``path``, ``done`` and ``result_data`` are the
        same objects as on the original, so metrics written through the copy
        are visible on every earlier hop's Transaction.
        """
        carried = {
            "request": self.request,
            "path": self.path,
            "nbytes": self.nbytes,
            "done": self.done,
            "drain_ns": self.drain_ns,
            "is_response": self.is_response,
            "result_data": self.result_data,
        }
        return Transaction(step=self.step + 1, **carried)
|
||||||
@@ -0,0 +1,965 @@
|
|||||||
|
# kernbench/topology/builder.py
|
||||||
|
"""
|
||||||
|
Topology compiler: parses topology.yaml and produces a fully-instantiated
|
||||||
|
TopologyGraph with nodes, edges, and representative view projections.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
from .types import Edge, Node, TopologyGraph, TopologyHandle, ViewGraph
|
||||||
|
|
||||||
|
|
||||||
|
# (dx, dy) offset in mm of each PE-internal component from its PE's center.
# Intra-PE distances are small (~0.5 mm); component names without an entry
# here are placed at the PE center by the lookup in _instantiate_cube.
_PE_COMP_OFFSETS: dict[str, tuple[float, float]] = {
    "pe_cpu": (-0.3, 0.0),
    "pe_scheduler": (-0.15, 0.0),
    "pe_dma": (0.0, -0.15),
    "pe_gemm": (0.0, 0.0),
    "pe_math": (0.0, 0.15),
    "pe_tcm": (0.3, 0.0),
}
|
||||||
|
|
||||||
|
|
||||||
|
# ── Public API ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_topology(path_str: str) -> TopologyHandle:
    """Validate *path_str* and build the compiled topology graph for it.

    The path is user-expanded and resolved before being checked.

    Raises:
        FileNotFoundError: the resolved path does not exist.
        ValueError: the resolved path exists but is not a regular file.
    """
    topo_path = Path(path_str).expanduser().resolve()
    if not topo_path.is_file():
        # Distinguish "nothing there" from "there, but not a file".
        if not topo_path.exists():
            raise FileNotFoundError(f"Topology file not found: {topo_path}")
        raise ValueError(f"Topology path is not a file: {topo_path}")

    compiled = load_topology(topo_path)
    return TopologyHandle(path=topo_path, topology_obj=compiled)
|
||||||
|
|
||||||
|
|
||||||
|
def load_topology(path: Path) -> TopologyGraph:
    """Read the topology spec at *path*, validate it, and compile the graph."""
    raw_spec = _read_spec(path)
    _validate_spec(raw_spec)
    compiled = _compile_graph(raw_spec)
    return compiled
|
||||||
|
|
||||||
|
|
||||||
|
def _read_spec(path: Path) -> dict[str, Any]:
    """Read the YAML topology spec at *path* and return its root mapping.

    Raises:
        ValueError: the file is not valid YAML (the message carries the
            1-based line/column when the parser reports a position), the
            document is empty, or its root is not a mapping.
    """
    try:
        with path.open("r", encoding="utf-8") as stream:
            loaded = yaml.safe_load(stream)
    except yaml.YAMLError as err:
        mark = getattr(err, "problem_mark", None)
        # Parser marks are 0-based; report them 1-based.
        where = "" if mark is None else f" (line {mark.line + 1}, column {mark.column + 1})"
        raise ValueError(f"Failed to parse YAML topology: {path}{where}") from err

    if loaded is None:
        raise ValueError(f"Topology YAML is empty: {path}")
    if isinstance(loaded, dict):
        return loaded
    raise ValueError(
        f"Topology YAML root must be a mapping/dict: {path} (got {type(loaded).__name__})"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _validate_spec(spec: dict) -> None:
    """Check that *spec* has the top-level sections the graph compiler reads.

    ``_compile_graph`` indexes ``spec["system"]``, ``spec["sip"]`` and
    ``spec["cube"]`` as mappings unconditionally, so a spec lacking any of
    them would otherwise fail there with a bare ``KeyError``/``TypeError``.
    Reporting it here gives the same ``ValueError`` style as ``_read_spec``.

    Raises:
        ValueError: a required section is missing or is not a mapping.
    """
    # TODO: full schema validation (nested keys, value types, ranges).
    for section in ("system", "sip", "cube"):
        if section not in spec:
            raise ValueError(f"Topology spec is missing required section: {section!r}")
        value = spec[section]
        if not isinstance(value, dict):
            raise ValueError(
                f"Topology spec section {section!r} must be a mapping/dict "
                f"(got {type(value).__name__})"
            )
|
||||||
|
|
||||||
|
|
||||||
|
# ── Graph Compiler ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _compile_graph(spec: dict) -> TopologyGraph:
    """Build fully-instantiated flat graph + representative view projections.

    The system-level nodes are created once; then, for each SIP, the IO
    chiplets, the cubes (row-major over the cube mesh), the inter-cube UCIe
    mesh, the IO→cube links and the switch→IO links are added in that order.
    """
    nodes: dict[str, Node] = {}
    edges: list[Edge] = []

    system = spec["system"]
    sip_spec = spec["sip"]
    cube_spec = spec["cube"]

    mesh_dims = sip_spec["cube_mesh"]
    mesh_w, mesh_h = mesh_dims["w"], mesh_dims["h"]
    cube_dims = cube_spec["geometry"]["cube_mm"]
    cube_w, cube_h = cube_dims["w"], cube_dims["h"]
    seam = sip_spec["links"]["inter_cube_mesh"]["distance_mm_across_seam"]
    # Distance between the origins of neighbouring cubes: cube size + seam.
    stride_x = cube_w + seam
    stride_y = cube_h + seam

    # System-level
    _instantiate_system(nodes, system)

    # Per-SIP
    for sip_id in range(system["sips"]["count"]):
        sp = f"sip{sip_id}"

        # IO chiplets
        _instantiate_io_chiplets(
            nodes, edges, sp, sip_spec, cube_w, cube_h, mesh_w, mesh_h, seam,
        )

        # Cubes + PEs, numbered row-major across the mesh.
        for cube_no in range(mesh_h * mesh_w):
            row, col = divmod(cube_no, mesh_w)
            cube_origin = (col * stride_x, row * stride_y)
            _instantiate_cube(nodes, edges, f"{sp}.cube{cube_no}", cube_spec, cube_origin)

        # Inter-cube UCIe mesh
        _add_inter_cube_edges(edges, sp, mesh_w, mesh_h, sip_spec)
        # IO → cube UCIe
        _add_io_to_cube_edges(edges, sp, sip_spec, mesh_w)
        # Switch → IO pcie_ep
        _add_system_to_io_edges(edges, sp, sip_spec, system)

    # Build views
    return TopologyGraph(
        spec=spec,
        nodes=nodes,
        edges=edges,
        system_view=_build_system_view(spec),
        sip_view=_build_sip_view(spec),
        cube_view=_build_cube_view(spec),
        pe_view=_build_pe_view(spec),
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ── Layout helpers ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _cube_local_positions(cube_w: float, cube_h: float) -> dict[str, tuple[float, float]]:
    """Cube-internal component positions relative to cube origin (0,0) at top-left.

    Returns a mapping from anchor name to ``(x, y)``.  ``xbar.top`` and
    ``xbar.bottom`` serve only as Y references for the per-PE xbar nodes of
    the top and bottom halves.
    """
    mid_x = cube_w / 2
    mid_y = cube_h / 2
    # UCIe node half-sizes (default 2.0×1.2mm) — inset so edges touch boundary.
    ucie_half_h = 0.6
    ucie_half_w = 1.0

    east_x = cube_w - 2.5
    upper_y = mid_y - 1.5
    lower_y = mid_y + 2.0

    positions: dict[str, tuple[float, float]] = {}
    positions["ucie-N"] = (mid_x, ucie_half_h)
    positions["ucie-S"] = (mid_x, cube_h - ucie_half_h)
    positions["ucie-W"] = (ucie_half_w, mid_y)
    positions["ucie-E"] = (cube_w - ucie_half_w, mid_y)
    positions["m_cpu"] = (east_x, upper_y)
    positions["xbar.top"] = (mid_x, 3.5)
    positions["hbm_ctrl"] = (mid_x - 2.0, mid_y)
    positions["xbar.bottom"] = (mid_x, cube_h - 3.5)
    positions["bridge.left"] = (2.5, lower_y)
    positions["bridge.right"] = (east_x, lower_y)
    positions["noc"] = (mid_x + 2.0, mid_y)
    positions["sram"] = (2.5, upper_y)
    return positions
|
||||||
|
|
||||||
|
|
||||||
|
def _corner_pe_positions(cube_w: float, cube_h: float) -> dict[str, list[tuple[float, float]]]:
    """PE center positions per corner, relative to cube origin.

    Each corner (``NW``, ``NE``, ``SW``, ``SE``) maps to two ``(x, y)``
    centers ordered west to east.
    """
    west_xs = (1.5, 4.5)
    east_xs = (cube_w - 4.5, cube_w - 1.5)
    north_y = 1.5
    south_y = cube_h - 1.5

    layout = (
        ("NW", west_xs, north_y),
        ("NE", east_xs, north_y),
        ("SW", west_xs, south_y),
        ("SE", east_xs, south_y),
    )
    return {corner: [(x, y) for x in xs] for corner, xs, y in layout}
|
||||||
|
|
||||||
|
|
||||||
|
# ── Instantiation: system ───────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _instantiate_system(nodes: dict[str, Node], system: dict) -> None:
    """Add system-level nodes (fabric switch).

    A single switch node with the fixed id ``fabric.switch0`` is created
    from ``system["components"]["switch"]``; it has no position, and its
    attrs default to an empty dict when the spec omits them.
    """
    switch_spec = system["components"]["switch"]
    switch_id = "fabric.switch0"
    nodes[switch_id] = Node(
        id=switch_id,
        kind=switch_spec["kind"],
        impl=switch_spec["impl"],
        attrs=switch_spec.get("attrs", {}),
        pos_mm=None,
        label="Switch",
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ── Instantiation: IO chiplets ──────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _instantiate_io_chiplets(
    nodes: dict[str, Node],
    edges: list[Edge],
    sp: str,
    sip_spec: dict,
    cube_w: float,
    cube_h: float,
    mesh_w: int,
    mesh_h: int,
    seam: float,
) -> None:
    """Add IO chiplet nodes and internal pcie_ep → io_cpu edges.

    Every chiplet instance gets a ``pcie_ep`` and an ``io_cpu`` node,
    horizontally centred on the cube mesh.  Instances on side ``"N"`` sit
    above the mesh (negative Y); any other side value is placed below it.
    """
    io_spec = sip_spec["iochiplet"]
    comp = io_spec["components"]
    links = io_spec["links"]

    # Overall mesh extent: all cubes plus the seams between them.
    mesh_total_w = mesh_w * cube_w + (mesh_w - 1) * seam
    mesh_total_h = mesh_h * cube_h + (mesh_h - 1) * seam
    center_x = mesh_total_w / 2

    for inst in io_spec["instances"]:
        prefix = f"{sp}.{inst['id']}"
        if inst["place"]["side"] == "N":
            pcie_y, cpu_y = -5.0, -3.0
        else:
            pcie_y, cpu_y = mesh_total_h + 5.0, mesh_total_h + 3.0

        # pcie_ep
        ep_spec = comp["pcie_ep"]
        ep_id = f"{prefix}.pcie_ep"
        nodes[ep_id] = Node(
            id=ep_id,
            kind=ep_spec["kind"],
            impl=ep_spec["impl"],
            attrs=ep_spec["attrs"],
            pos_mm=(center_x, pcie_y),
            label="PCIe EP",
        )

        # io_cpu
        cpu_spec = comp["io_cpu"]
        cpu_id = f"{prefix}.io_cpu"
        nodes[cpu_id] = Node(
            id=cpu_id,
            kind=cpu_spec["kind"],
            impl=cpu_spec["impl"],
            attrs=cpu_spec["attrs"],
            pos_mm=(center_x, cpu_y),
            label="IO CPU",
        )

        # Internal edge (one direction only: pcie_ep → io_cpu).
        internal = Edge(
            src=ep_id,
            dst=cpu_id,
            distance_mm=links["pcie_ep_to_io_cpu_mm"],
            bw_gbs=links["pcie_ep_to_io_cpu_bw_gbs"],
            kind="io_internal",
        )
        edges.append(internal)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Instantiation: cube + PEs ───────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _instantiate_cube(
    nodes: dict[str, Node],
    edges: list[Edge],
    cp: str,
    cube: dict,
    origin: tuple[float, float],
) -> None:
    """Add all cube-internal nodes and edges, including PE instances.

    Args:
        nodes: node table to extend, keyed by node id.
        edges: edge list to extend.
        cp: id prefix of this cube (prepended to every node id created here).
        cube: the cube section of the topology spec.
        origin: position of the cube's local (0, 0); every node position is
            the cube-local position shifted by this origin.

    Nodes are added in the order UCIe ports, noc/m_cpu/sram, HBM controller
    slices, bridges, then per PE its xbar entry node followed by the PE
    template components.  Edges are added per PE first, then the cube fabric
    (xbar↔hbm, xbar chain, bridges, ucie↔noc, noc↔xbar, m_cpu↔noc, noc↔sram).
    """
    dims = cube["geometry"]["cube_mm"]
    cube_w = dims["w"]
    cube_h = dims["h"]
    ox, oy = origin
    local_pos = _cube_local_positions(cube_w, cube_h)
    clinks = cube["links"]
    n_slices = cube["memory_map"]["hbm_slices_per_cube"]

    def placed(anchor: str) -> tuple[float, float]:
        # Shift a cube-local anchor by the cube origin.
        lx, ly = local_pos[anchor]
        return (ox + lx, oy + ly)

    # ── UCIe ports ──
    ucie = cube["ucie"]
    ucie_overhead_ns = ucie["overhead_ns"]
    ucie_ports = ucie["ports"]
    for port in ucie_ports:
        port_id = f"{cp}.ucie-{port}"
        port_pos = placed(f"ucie-{port}")
        nodes[port_id] = Node(
            id=port_id,
            kind="ucie_port",
            impl="ucie_v1",
            attrs={"overhead_ns": ucie_overhead_ns},
            pos_mm=port_pos,
            label=f"UCIe-{port}",
        )

    # ── Named components: noc, m_cpu, sram ──
    for name in ("noc", "m_cpu", "sram"):
        named_spec = cube["components"][name]
        named_id = f"{cp}.{name}"
        named_pos = placed(name)
        nodes[named_id] = Node(
            id=named_id,
            kind=named_spec["kind"],
            impl=named_spec["impl"],
            attrs=named_spec["attrs"],
            pos_mm=named_pos,
            label=name.upper().replace("_", " "),
        )

    # ── HBM controller slices (all share the hbm_ctrl position) ──
    hbm_spec = cube["components"]["hbm_ctrl"]
    hbm_pos = placed("hbm_ctrl")
    for sl in range(n_slices):
        slice_id = f"{cp}.hbm_ctrl.slice{sl}"
        nodes[slice_id] = Node(
            id=slice_id,
            kind=hbm_spec["kind"],
            impl=hbm_spec["impl"],
            attrs=hbm_spec["attrs"],
            pos_mm=hbm_pos,
            label=f"HBM SLICE{sl}",
        )

    # ── Bridges ──
    for bridge in cube["components"]["xbar"]["bridges"]:
        bridge_name = bridge["id"]
        bridge_id = f"{cp}.bridge.{bridge_name}"
        bridge_pos = placed(f"bridge.{bridge_name}")
        nodes[bridge_id] = Node(
            id=bridge_id,
            kind=bridge["kind"],
            impl=bridge["impl"],
            attrs=bridge["attrs"],
            pos_mm=bridge_pos,
            label=f"Bridge {bridge_name.upper()}",
        )

    # ── PE instances + per-PE xbar entry nodes ──
    corners = cube["pe_layout"]["corners"]
    pe_per_corner = cube["pe_layout"]["pe_per_corner"]
    corner_pos = _corner_pe_positions(cube_w, cube_h)
    pe_tmpl = cube["pe_template"]
    pe_links = pe_tmpl["links"]

    xbar_pe_spec = cube["components"]["xbar"]["pe"]
    xbar_top_y = local_pos["xbar.top"][1]
    xbar_bot_y = local_pos["xbar.bottom"][1]
    noc_id = f"{cp}.noc"

    for corner_no, corner in enumerate(corners):
        # Northern corners attach to the top xbar row, the others to the bottom.
        top_half = corner in ("NW", "NE")
        xbar_y = xbar_top_y if top_half else xbar_bot_y
        pe_xbar_mm_key = "pe_to_xbar_row_n_mm" if top_half else "pe_to_xbar_row_s_mm"

        for slot in range(pe_per_corner):
            # PEs are numbered consecutively across corners, in corner order.
            pe_idx = corner_no * pe_per_corner + slot
            pp = f"{cp}.pe{pe_idx}"
            pe_cx, pe_cy = corner_pos[corner][slot]

            # Per-PE xbar entry node: PE's X, the half's xbar row Y.
            xbar_nid = f"{cp}.xbar.pe{pe_idx}"
            nodes[xbar_nid] = Node(
                id=xbar_nid,
                kind=xbar_pe_spec["kind"],
                impl=xbar_pe_spec["impl"],
                attrs=xbar_pe_spec["attrs"],
                pos_mm=(ox + pe_cx, oy + xbar_y),
                label=f"XBAR PE{pe_idx}",
            )

            # PE template components, offset from the PE center.
            for comp_name, comp_spec in pe_tmpl["components"].items():
                comp_id = f"{pp}.{comp_name}"
                dx, dy = _PE_COMP_OFFSETS.get(comp_name, (0.0, 0.0))
                nodes[comp_id] = Node(
                    id=comp_id,
                    kind=comp_spec["kind"],
                    impl=comp_spec["impl"],
                    attrs=comp_spec["attrs"],
                    pos_mm=(ox + pe_cx + dx, oy + pe_cy + dy),
                    label=comp_name.upper().replace("_", " "),
                )

            # PE-internal edges
            _add_pe_internal_edges(edges, pp, pe_links)

            dma_id = f"{pp}.pe_dma"
            # PE_DMA → xbar.pe_i (HBM data path)
            edges.append(Edge(
                src=dma_id,
                dst=xbar_nid,
                distance_mm=clinks[pe_xbar_mm_key],
                bw_gbs=clinks["pe_to_xbar_bw_gbs"],
                kind="pe_to_xbar",
            ))
            # PE_DMA → noc (non-HBM data path: SRAM, inter-cube, etc.)
            edges.append(Edge(
                src=dma_id,
                dst=noc_id,
                distance_mm=clinks["pe_dma_to_noc_mm"],
                bw_gbs=clinks["pe_dma_to_noc_bw_gbs"],
                kind="pe_to_noc",
            ))
            # noc → PE_CPU (command delivery)
            edges.append(Edge(
                src=noc_id,
                dst=f"{pp}.pe_cpu",
                distance_mm=clinks["noc_to_pe_cpu_mm"],
                kind="command",
            ))

    # ── Cube fabric edges ──

    # xbar.pe_i ↔ hbm_ctrl.slice_i (local Y-path, bidirectional for response)
    for i in range(n_slices):
        xbar_id = f"{cp}.xbar.pe{i}"
        hbm_id = f"{cp}.hbm_ctrl.slice{i}"
        for src, dst, kind in (
            (xbar_id, hbm_id, "xbar_to_hbm"),
            (hbm_id, xbar_id, "hbm_to_xbar"),
        ):
            edges.append(Edge(
                src=src,
                dst=dst,
                distance_mm=clinks["xbar_to_hbm_mm"],
                bw_gbs=clinks["xbar_to_hbm_bw_gbs"],
                kind=kind,
            ))

    # xbar chain: neighbouring entries within each half are linked both ways
    # (e.g. pe0↔pe1↔pe2↔pe3 on top, pe4↔pe5↔pe6↔pe7 on the bottom for 8 slices).
    half = n_slices // 2
    for half_start in (0, half):
        for i in range(half_start, half_start + half - 1):
            # A link leaving the last PE of a corner crosses to the next corner.
            intra = ((i - half_start) % pe_per_corner) != (pe_per_corner - 1)
            if intra:
                x_dist = clinks["xbar_chain_intra_corner_mm"]
            else:
                x_dist = clinks["xbar_chain_inter_corner_mm"]
            for a, b in ((i, i + 1), (i + 1, i)):
                edges.append(Edge(
                    src=f"{cp}.xbar.pe{a}",
                    dst=f"{cp}.xbar.pe{b}",
                    distance_mm=x_dist,
                    bw_gbs=clinks["xbar_x_bw_gbs"],
                    kind="xbar_chain",
                ))

    # bridge connections: each bridge joins the first (left) or last (right)
    # xbar entry of the top half with its counterpart in the bottom half.
    for bridge_name, pe_top, pe_bot in (
        ("left", 0, half),
        ("right", half - 1, n_slices - 1),
    ):
        br_node = f"{cp}.bridge.{bridge_name}"
        for pe_i, br_mm_key in (
            (pe_top, "xbar_row_n_to_bridge_mm"),
            (pe_bot, "xbar_row_s_to_bridge_mm"),
        ):
            xbar_node = f"{cp}.xbar.pe{pe_i}"
            edges.append(Edge(
                src=xbar_node,
                dst=br_node,
                distance_mm=clinks[br_mm_key],
                bw_gbs=clinks["xbar_to_bridge_bw_gbs"],
                kind="xbar_to_bridge",
            ))
            edges.append(Edge(
                src=br_node,
                dst=xbar_node,
                distance_mm=clinks[br_mm_key],
                bw_gbs=clinks["xbar_to_bridge_bw_gbs"],
                kind="bridge_to_xbar",
            ))

    # ucie ↔ noc (UCIe-NOC boundary, zero wire distance): all inbound edges
    # are added first, then all outbound ones.
    noc_ucie = clinks["noc_to_ucie"]
    for port in ucie_ports:
        edges.append(Edge(
            src=f"{cp}.ucie-{port}",
            dst=noc_id,
            distance_mm=0.0,
            bw_gbs=noc_ucie["per_connection_bw_gbs"],
            n_connections=noc_ucie["n_connections"],
            kind="ucie_to_noc",
        ))
    for port in ucie_ports:
        edges.append(Edge(
            src=noc_id,
            dst=f"{cp}.ucie-{port}",
            distance_mm=0.0,
            bw_gbs=noc_ucie["per_connection_bw_gbs"],
            n_connections=noc_ucie["n_connections"],
            kind="noc_to_ucie",
        ))

    # noc ↔ xbar.pe{i}: wire delay is 0 (NOC traversal latency computed by
    # TwoDMeshNocComponent); routing_weight_mm=50.0 steers PE DMA Dijkstra
    # away from this path (prefer direct pe_dma→xbar).  The bandwidth is
    # optional in the spec and is None when the section is absent.
    noc_xbar_bw = clinks.get("noc_to_xbar", {}).get("per_connection_bw_gbs")
    for i in range(n_slices):
        xbar_id = f"{cp}.xbar.pe{i}"
        for src, dst, kind in (
            (noc_id, xbar_id, "noc_to_xbar"),
            (xbar_id, noc_id, "xbar_to_noc"),
        ):
            edges.append(Edge(
                src=src,
                dst=dst,
                distance_mm=0.0,
                bw_gbs=noc_xbar_bw,
                routing_weight_mm=50.0,
                kind=kind,
            ))

    # m_cpu ↔ noc (command dispatch, both directions)
    m_cpu_id = f"{cp}.m_cpu"
    for src, dst in ((m_cpu_id, noc_id), (noc_id, m_cpu_id)):
        edges.append(Edge(
            src=src,
            dst=dst,
            distance_mm=clinks["m_cpu_to_noc_mm"],
            kind="command",
        ))

    # noc ↔ sram (shared SRAM access).
    # NOTE(review): both directions carry kind "noc_to_sram", unlike the other
    # bidirectional pairs above — confirm this is intentional.
    noc_sram = clinks["noc_to_sram"]
    sram_id = f"{cp}.sram"
    for src, dst in ((noc_id, sram_id), (sram_id, noc_id)):
        edges.append(Edge(
            src=src,
            dst=dst,
            distance_mm=clinks["noc_to_sram_mm"],
            bw_gbs=noc_sram["per_connection_bw_gbs"],
            n_connections=noc_sram["n_connections"],
            kind="noc_to_sram",
        ))
|
||||||
|
|
||||||
|
|
||||||
|
def _add_pe_internal_edges(edges: list[Edge], pp: str, pe_links: dict) -> None:
    """Add PE-internal edges for a single PE instance.

    Appends, in order: pe_cpu → pe_scheduler; pe_scheduler → each engine
    (pe_dma, pe_gemm, pe_math); each engine → pe_tcm.  Only the engine → TCM
    edges carry a bandwidth.
    """
    scheduler_id = f"{pp}.pe_scheduler"
    tcm_id = f"{pp}.pe_tcm"

    edges.append(Edge(
        src=f"{pp}.pe_cpu",
        dst=scheduler_id,
        distance_mm=pe_links["pe_cpu_to_scheduler_mm"],
        kind="pe_internal",
    ))

    # Engine name paired with the short name used in the link keys.
    engines = (("pe_dma", "dma"), ("pe_gemm", "gemm"), ("pe_math", "math"))

    for engine, short in engines:
        edges.append(Edge(
            src=scheduler_id,
            dst=f"{pp}.{engine}",
            distance_mm=pe_links[f"scheduler_to_{short}_mm"],
            kind="pe_internal",
        ))

    for engine, short in engines:
        edges.append(Edge(
            src=f"{pp}.{engine}",
            dst=tcm_id,
            distance_mm=pe_links[f"{short}_to_tcm_mm"],
            bw_gbs=pe_links[f"{short}_to_tcm_bw_gbs"],
            kind="pe_internal",
        ))
|
||||||
|
|
||||||
|
|
||||||
|
# ── Inter-cube / IO / system edges ──────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _add_inter_cube_edges(
    edges: list[Edge], sp: str, mesh_w: int, mesh_h: int, sip_spec: dict,
) -> None:
    """Add UCIe mesh edges between adjacent cubes within a SIP.

    Cubes are visited row-major; each cube is linked in both directions to
    its eastern neighbour (ucie-E ↔ ucie-W) and then to its southern
    neighbour (ucie-S ↔ ucie-N), when those neighbours exist.
    """
    mesh = sip_spec["links"]["inter_cube_mesh"]
    bw = mesh["bw_gbs_per_ucie_phy"]
    dist = mesh["distance_mm_across_seam"]

    def link_both_ways(near: str, far: str) -> None:
        # Forward edge first, then the reverse edge.
        for src, dst in ((near, far), (far, near)):
            edges.append(Edge(
                src=src, dst=dst, distance_mm=dist, bw_gbs=bw, kind="ucie_mesh",
            ))

    for row in range(mesh_h):
        for col in range(mesh_w):
            here = row * mesh_w + col
            if col + 1 < mesh_w:
                east = here + 1
                link_both_ways(f"{sp}.cube{here}.ucie-E", f"{sp}.cube{east}.ucie-W")
            if row + 1 < mesh_h:
                south = (row + 1) * mesh_w + col
                link_both_ways(f"{sp}.cube{here}.ucie-S", f"{sp}.cube{south}.ucie-N")
|
||||||
|
|
||||||
|
|
||||||
|
def _add_io_to_cube_edges(
|
||||||
|
edges: list[Edge], sp: str, sip_spec: dict, mesh_w: int,
|
||||||
|
) -> None:
|
||||||
|
"""Add IO chiplet io_cpu ↔ cube UCIe edges (bidirectional for response)."""
|
||||||
|
io_links = sip_spec["iochiplet"]["links"]
|
||||||
|
io_to_ucie_mm = io_links["io_cpu_to_ucie_mm"]
|
||||||
|
io_to_ucie_bw = io_links["io_cpu_to_ucie_bw_gbs"]
|
||||||
|
for inst in sip_spec["iochiplet"]["instances"]:
|
||||||
|
iid = inst["id"]
|
||||||
|
io_cpu_id = f"{sp}.{iid}.io_cpu"
|
||||||
|
for port in inst["cube_ports"]:
|
||||||
|
cube_col, cube_row = port["cube"]["xy"]
|
||||||
|
cube_id = cube_row * mesh_w + cube_col
|
||||||
|
cube_side = port["cube_side"]
|
||||||
|
ucie_id = f"{sp}.cube{cube_id}.ucie-{cube_side}"
|
||||||
|
edges.append(Edge(
|
||||||
|
src=io_cpu_id, dst=ucie_id,
|
||||||
|
distance_mm=io_to_ucie_mm + port["distance_mm"],
|
||||||
|
bw_gbs=io_to_ucie_bw,
|
||||||
|
kind="io_to_cube",
|
||||||
|
))
|
||||||
|
edges.append(Edge(
|
||||||
|
src=ucie_id, dst=io_cpu_id,
|
||||||
|
distance_mm=io_to_ucie_mm + port["distance_mm"],
|
||||||
|
bw_gbs=io_to_ucie_bw,
|
||||||
|
kind="cube_to_io",
|
||||||
|
))
|
||||||
|
|
||||||
|
|
||||||
|
def _add_system_to_io_edges(
|
||||||
|
edges: list[Edge], sp: str, sip_spec: dict, system: dict,
|
||||||
|
) -> None:
|
||||||
|
"""Add fabric switch → IO chiplet PCIe edges."""
|
||||||
|
sw_id = "fabric.switch0"
|
||||||
|
sys_link = system["links"]["io_ep_to_switch"]
|
||||||
|
for inst in sip_spec["iochiplet"]["instances"]:
|
||||||
|
pcie_ep_id = f"{sp}.{inst['id']}.pcie_ep"
|
||||||
|
edges.append(Edge(
|
||||||
|
src=sw_id, dst=pcie_ep_id,
|
||||||
|
distance_mm=sys_link["distance_mm"],
|
||||||
|
bw_gbs=sys_link["bw_gbs_per_ep"],
|
||||||
|
kind="pcie",
|
||||||
|
))
|
||||||
|
|
||||||
|
|
||||||
|
# ── View builders ────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _build_system_view(spec: dict) -> ViewGraph:
    """Build the system-level view: fabric switch, SIP blocks, IO chiplets.

    SIPs are laid out left to right as fixed-size blocks; every IO chiplet of
    every SIP gets a node on the SIP's north or south edge and a PCIe edge
    from the switch.

    Args:
        spec: Full topology spec; reads ``system`` and
            ``sip.iochiplet.instances``.

    Returns:
        A ``ViewGraph`` named ``"system"``.
    """
    system = spec["system"]
    n_sips = system["sips"]["count"]

    # Schematic drawing dimensions in mm.
    sip_w, sip_h = 71.0, 59.0
    gap = 30.0
    sip_top = 20.0  # vertical offset of the SIP row below the switch
    canvas_w = n_sips * sip_w + (n_sips - 1) * gap
    canvas_h = sip_h + sip_top

    switch_spec = system["components"]["switch"]
    switch_id = "fabric.switch0"
    nodes: dict[str, Node] = {
        switch_id: Node(
            id=switch_id, kind=switch_spec["kind"], impl=switch_spec["impl"],
            attrs=switch_spec.get("attrs", {}),
            pos_mm=(canvas_w / 2, 5.0), label="Fabric Switch",
        ),
    }
    view_edges: list[Edge] = []

    for index in range(n_sips):
        left = index * (sip_w + gap)
        sip_id = f"sip{index}"
        nodes[sip_id] = Node(
            id=sip_id, kind="sip", impl="",
            attrs={"w_mm": sip_w, "h_mm": sip_h},
            pos_mm=(left + sip_w / 2, sip_top + sip_h / 2),
            label=f"SIP {index}",
        )

        for inst in spec["sip"]["iochiplet"]["instances"]:
            chiplet = inst["id"]
            io_id = f"{sip_id}.{chiplet}"
            # North-side chiplets sit on the top edge, all others on the bottom.
            on_north = inst["place"]["side"] == "N"
            io_y = sip_top if on_north else sip_top + sip_h
            nodes[io_id] = Node(
                id=io_id, kind="iochiplet", impl="",
                attrs={}, pos_mm=(left + sip_w / 2, io_y), label=f"IO {chiplet}",
            )

            pcie_link = system["links"]["io_ep_to_switch"]
            view_edges.append(Edge(
                src=switch_id, dst=io_id,
                distance_mm=pcie_link["distance_mm"],
                bw_gbs=pcie_link["bw_gbs_per_ep"],
                kind="pcie",
            ))

    return ViewGraph(
        name="system", nodes=nodes, edges=view_edges,
        width_mm=canvas_w, height_mm=canvas_h,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _build_sip_view(spec: dict) -> ViewGraph:
    """Build the SIP-level view: cube mesh plus IO chiplets.

    Cubes are opaque blocks placed on a grid whose pitch is the cube size
    plus the inter-cube seam.  Mesh edges are emitted once per adjacent pair
    (east, then south); IO chiplets sit in a margin above or below the mesh
    with one edge per cube port.

    Args:
        spec: Full topology spec; reads ``sip`` and ``cube.geometry``.

    Returns:
        A ``ViewGraph`` named ``"sip"``.
    """
    sip_spec = spec["sip"]
    cube_spec = spec["cube"]
    mesh_dims = sip_spec["cube_mesh"]
    cols = mesh_dims["w"]
    rows = mesh_dims["h"]
    die = cube_spec["geometry"]["cube_mm"]
    cube_w = die["w"]
    cube_h = die["h"]
    mesh_link = sip_spec["links"]["inter_cube_mesh"]
    seam = mesh_link["distance_mm_across_seam"]

    pitch_x = cube_w + seam
    pitch_y = cube_h + seam
    mesh_total_w = cols * cube_w + (cols - 1) * seam
    mesh_total_h = rows * cube_h + (rows - 1) * seam
    io_margin = 6.0  # space reserved above and below the mesh for IO chiplets
    canvas_w = mesh_total_w
    canvas_h = mesh_total_h + 2 * io_margin

    nodes: dict[str, Node] = {}
    view_edges: list[Edge] = []

    # Cube blocks and mesh edges, visited in row-major order.
    for row in range(rows):
        for col in range(cols):
            index = row * cols + col
            cube_id = f"cube{index}"
            nodes[cube_id] = Node(
                id=cube_id, kind="cube", impl="",
                attrs={"w_mm": cube_w, "h_mm": cube_h, "col": col, "row": row},
                pos_mm=(
                    col * pitch_x + cube_w / 2,
                    io_margin + row * pitch_y + cube_h / 2,
                ),
                label=f"CUBE ({col},{row})",
            )

            # East neighbour first, then south neighbour.
            adjacent = []
            if col + 1 < cols:
                adjacent.append(index + 1)
            if row + 1 < rows:
                adjacent.append(index + cols)
            for other in adjacent:
                view_edges.append(Edge(
                    src=cube_id, dst=f"cube{other}",
                    distance_mm=seam,
                    bw_gbs=mesh_link["bw_gbs_per_ucie_phy"],
                    kind="ucie_mesh",
                ))

    # IO chiplets, centred horizontally over the mesh.
    io_links = sip_spec["iochiplet"]["links"]
    for inst in sip_spec["iochiplet"]["instances"]:
        chiplet = inst["id"]
        on_north = inst["place"]["side"] == "N"
        io_y = 2.0 if on_north else canvas_h - 2.0
        nodes[chiplet] = Node(
            id=chiplet, kind="iochiplet", impl="",
            attrs={}, pos_mm=(mesh_total_w / 2, io_y), label=f"IO {chiplet}",
        )
        for port in inst["cube_ports"]:
            port_col, port_row = port["cube"]["xy"]
            target = port_row * cols + port_col
            view_edges.append(Edge(
                src=chiplet, dst=f"cube{target}",
                distance_mm=io_links["io_cpu_to_ucie_mm"] + port["distance_mm"],
                bw_gbs=io_links["io_cpu_to_ucie_bw_gbs"],
                kind="io_to_cube",
            ))

    return ViewGraph(
        name="sip", nodes=nodes, edges=view_edges,
        width_mm=canvas_w, height_mm=canvas_h,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _build_cube_view(spec: dict) -> ViewGraph:
    """Build the cube-level view of one representative cube.

    PEs appear as opaque blocks, each paired with its own ``xbar.pe<i>``
    entry node; ``hbm_ctrl`` is a single representative node.  Nodes and
    edges are emitted in a fixed order (UCIe ports, named components,
    bridges, PEs, then fabric edges) so the result is deterministic.

    Args:
        spec: Full topology spec; reads ``cube``.

    Returns:
        A ``ViewGraph`` named ``"cube"`` sized to the cube die.
    """
    cube = spec["cube"]
    cube_w = cube["geometry"]["cube_mm"]["w"]
    cube_h = cube["geometry"]["cube_mm"]["h"]
    local_pos = _cube_local_positions(cube_w, cube_h)
    clinks = cube["links"]
    n_slices = cube["memory_map"]["hbm_slices_per_cube"]

    nodes: dict[str, Node] = {}
    view_edges: list[Edge] = []

    def connect(src, dst, kind, mm, bw=None, lanes=None):
        # Single place where view edges are created; None keeps Edge defaults.
        view_edges.append(Edge(
            src=src, dst=dst, distance_mm=mm,
            bw_gbs=bw, n_connections=lanes, kind=kind,
        ))

    # ── Nodes: UCIe ports ──
    for port in cube["ucie"]["ports"]:
        port_id = f"ucie-{port}"
        x_mm, y_mm = local_pos[port_id]
        nodes[port_id] = Node(
            id=port_id, kind="ucie_port", impl="ucie_v1",
            attrs={}, pos_mm=(x_mm, y_mm), label=f"UCIe-{port}",
        )

    # ── Nodes: named components ──
    for comp_name in ("noc", "m_cpu", "hbm_ctrl", "sram"):
        comp = cube["components"][comp_name]
        x_mm, y_mm = local_pos[comp_name]
        nodes[comp_name] = Node(
            id=comp_name, kind=comp["kind"], impl=comp["impl"],
            attrs=comp["attrs"], pos_mm=(x_mm, y_mm),
            label=comp_name.upper().replace("_", " "),
        )

    # ── Nodes: xbar bridges ──
    for bridge in cube["components"]["xbar"]["bridges"]:
        short = bridge["id"]
        bridge_id = f"bridge.{short}"
        x_mm, y_mm = local_pos[bridge_id]
        nodes[bridge_id] = Node(
            id=bridge_id, kind=bridge["kind"], impl=bridge["impl"],
            attrs=bridge["attrs"], pos_mm=(x_mm, y_mm),
            label=f"Bridge {short.upper()}",
        )

    # ── Nodes + edges: PEs and their xbar entry points ──
    corners = cube["pe_layout"]["corners"]
    pe_per_corner = cube["pe_layout"]["pe_per_corner"]
    corner_pos = _corner_pe_positions(cube_w, cube_h)
    xbar_pe_spec = cube["components"]["xbar"]["pe"]
    xbar_top_y = local_pos["xbar.top"][1]
    xbar_bot_y = local_pos["xbar.bottom"][1]

    for corner_no, corner in enumerate(corners):
        # North corners attach to the top xbar row, the rest to the bottom row.
        on_top_row = corner in ("NW", "NE")
        row_y = xbar_top_y if on_top_row else xbar_bot_y
        row_mm_key = "pe_to_xbar_row_n_mm" if on_top_row else "pe_to_xbar_row_s_mm"

        for slot in range(pe_per_corner):
            pe_no = corner_no * pe_per_corner + slot  # running PE index
            pe_id = f"pe{pe_no}"
            xbar_id = f"xbar.pe{pe_no}"
            pe_x, pe_y = corner_pos[corner][slot]

            nodes[pe_id] = Node(
                id=pe_id, kind="pe", impl="",
                attrs={"corner": corner}, pos_mm=(pe_x, pe_y),
                label=f"PE{pe_no}",
            )
            nodes[xbar_id] = Node(
                id=xbar_id, kind=xbar_pe_spec["kind"], impl=xbar_pe_spec["impl"],
                attrs=xbar_pe_spec["attrs"], pos_mm=(pe_x, row_y),
                label=f"XBAR PE{pe_no}",
            )

            # PE -> xbar.pe_i (HBM data path)
            connect(pe_id, xbar_id, "pe_to_xbar",
                    clinks[row_mm_key], clinks["pe_to_xbar_bw_gbs"])
            # PE -> noc (non-HBM data path)
            connect(pe_id, "noc", "pe_to_noc",
                    clinks["pe_dma_to_noc_mm"], clinks["pe_dma_to_noc_bw_gbs"])
            # noc -> PE (command delivery)
            connect("noc", pe_id, "command", clinks["noc_to_pe_cpu_mm"])

    # ── Edges: xbar.pe_i -> hbm_ctrl ──
    for slice_no in range(n_slices):
        connect(f"xbar.pe{slice_no}", "hbm_ctrl", "xbar_to_hbm",
                clinks["xbar_to_hbm_mm"], clinks["xbar_to_hbm_bw_gbs"])

    # ── Edges: xbar chain, one bidirectional chain per half ──
    half = n_slices // 2
    for base in (0, half):
        for i in range(base, base + half - 1):
            # The hop from the last PE of a corner crosses to the next corner.
            same_corner = ((i - base) % pe_per_corner) != (pe_per_corner - 1)
            if same_corner:
                hop_mm = clinks["xbar_chain_intra_corner_mm"]
            else:
                hop_mm = clinks["xbar_chain_inter_corner_mm"]
            for a, b in ((i, i + 1), (i + 1, i)):
                connect(f"xbar.pe{a}", f"xbar.pe{b}", "xbar_chain",
                        hop_mm, clinks["xbar_x_bw_gbs"])

    # ── Edges: bridges join the end of the top row to the bottom row ──
    bridge_ends = (("left", 0, half), ("right", half - 1, n_slices - 1))
    for side, top_pe, bottom_pe in bridge_ends:
        bridge_id = f"bridge.{side}"
        row_links = (
            (top_pe, "xbar_row_n_to_bridge_mm"),
            (bottom_pe, "xbar_row_s_to_bridge_mm"),
        )
        for pe_no, bridge_mm_key in row_links:
            xbar_id = f"xbar.pe{pe_no}"
            connect(xbar_id, bridge_id, "xbar_to_bridge",
                    clinks[bridge_mm_key], clinks["xbar_to_bridge_bw_gbs"])
            connect(bridge_id, xbar_id, "bridge_to_xbar",
                    clinks[bridge_mm_key], clinks["xbar_to_bridge_bw_gbs"])

    # ── Edges: noc -> UCIe ports (drawn with zero distance) ──
    noc_ucie = clinks["noc_to_ucie"]
    for port in cube["ucie"]["ports"]:
        connect("noc", f"ucie-{port}", "noc_to_ucie", 0.0,
                noc_ucie["per_connection_bw_gbs"], noc_ucie["n_connections"])

    # ── Edges: m_cpu <-> noc (command dispatch, both directions) ──
    connect("m_cpu", "noc", "command", clinks["m_cpu_to_noc_mm"])
    connect("noc", "m_cpu", "command", clinks["m_cpu_to_noc_mm"])

    # ── Edges: noc <-> sram (shared SRAM access, both directions) ──
    noc_sram = clinks["noc_to_sram"]
    connect("noc", "sram", "noc_to_sram", clinks["noc_to_sram_mm"],
            noc_sram["per_connection_bw_gbs"], noc_sram["n_connections"])
    connect("sram", "noc", "noc_to_sram", clinks["noc_to_sram_mm"],
            noc_sram["per_connection_bw_gbs"], noc_sram["n_connections"])

    return ViewGraph(
        name="cube", nodes=nodes, edges=view_edges,
        width_mm=cube_w, height_mm=cube_h,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _build_pe_view(spec: dict) -> ViewGraph:
    """Build the PE-level view of one representative PE.

    Every component of ``cube.pe_template.components`` becomes a node at a
    fixed position on a 12 x 8 mm canvas.  Edges run pe_cpu -> pe_scheduler,
    pe_scheduler -> each engine, and each engine -> pe_tcm.

    Args:
        spec: Full topology spec; reads ``cube.pe_template``.

    Returns:
        A ``ViewGraph`` named ``"pe"``.

    Raises:
        KeyError: If the template contains a component with no entry in the
            fixed position table.
    """
    template = spec["cube"]["pe_template"]
    pe_links = template["links"]
    canvas_w, canvas_h = 12.0, 8.0

    # Hand-placed layout: control on the left, engines in a column, TCM right.
    positions = {
        "pe_cpu": (1.5, 4.0),
        "pe_scheduler": (4.0, 4.0),
        "pe_dma": (7.0, 1.5),
        "pe_gemm": (7.0, 4.0),
        "pe_math": (7.0, 6.5),
        "pe_tcm": (10.0, 4.0),
    }

    nodes: dict[str, Node] = {}
    for name, comp in template["components"].items():
        x_mm, y_mm = positions[name]
        nodes[name] = Node(
            id=name, kind=comp["kind"], impl=comp["impl"],
            attrs=comp["attrs"], pos_mm=(x_mm, y_mm),
            label=name.upper().replace("_", " "),
        )

    # (engine, scheduler->engine distance key, engine->tcm distance key, bw key)
    engines = (
        ("pe_dma", "scheduler_to_dma_mm", "dma_to_tcm_mm", "dma_to_tcm_bw_gbs"),
        ("pe_gemm", "scheduler_to_gemm_mm", "gemm_to_tcm_mm", "gemm_to_tcm_bw_gbs"),
        ("pe_math", "scheduler_to_math_mm", "math_to_tcm_mm", "math_to_tcm_bw_gbs"),
    )

    view_edges: list[Edge] = [
        Edge(
            src="pe_cpu", dst="pe_scheduler",
            distance_mm=pe_links["pe_cpu_to_scheduler_mm"],
            kind="pe_internal",
        ),
    ]
    # Control fan-out first, then the data paths into TCM.
    view_edges += [
        Edge(
            src="pe_scheduler", dst=engine,
            distance_mm=pe_links[sched_key],
            kind="pe_internal",
        )
        for engine, sched_key, _, _ in engines
    ]
    view_edges += [
        Edge(
            src=engine, dst="pe_tcm",
            distance_mm=pe_links[tcm_mm_key],
            bw_gbs=pe_links[tcm_bw_key],
            kind="pe_internal",
        )
        for engine, _, tcm_mm_key, tcm_bw_key in engines
    ]

    return ViewGraph(
        name="pe", nodes=nodes, edges=view_edges,
        width_mm=canvas_w, height_mm=canvas_h,
    )
|
||||||
@@ -0,0 +1,56 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Node:
    """A single component in a topology graph or view.

    Fields:
        id: Node identifier; edges refer to nodes by this string.
        kind: Component kind name.
        impl: Implementation name (may be an empty string).
        attrs: Free-form attribute mapping.
        pos_mm: ``(x_mm, y_mm)`` position, or ``None`` for abstract nodes.
        label: Display label; defaults to the empty string.
    """

    id: str
    kind: str
    impl: str
    attrs: dict[str, Any]
    pos_mm: tuple[float, float] | None
    label: str = ""
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Edge:
    """A directed link between two nodes.

    Fields:
        src: Id of the source node.
        dst: Id of the destination node.
        distance_mm: Physical wire-delay distance
            (ns = distance_mm * ns_per_mm).
        routing_weight_mm: Dijkstra cost; ``None`` means use ``distance_mm``.
        bw_gbs: Bandwidth, or ``None`` when not specified.
        n_connections: Connection count for multi-connection links; a single
            request uses one connection.  ``None`` when not specified.
        kind: Edge kind name; defaults to ``"link"``.
    """

    src: str
    dst: str
    distance_mm: float
    routing_weight_mm: float | None = None
    bw_gbs: float | None = None
    n_connections: int | None = None
    kind: str = "link"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ViewGraph:
    """One view of the topology: its nodes, edges and canvas size.

    Fields:
        name: View name -- "system", "sip", "cube" or "pe".
        nodes: Nodes of the view, keyed by node id.
        edges: Edges of the view.
        width_mm: Canvas width in mm.
        height_mm: Canvas height in mm.
    """

    name: str
    nodes: dict[str, Node]
    edges: list[Edge]
    width_mm: float
    height_mm: float
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class TopologyGraph:
    """A compiled topology: the source spec, the flat graph and four views.

    Fields:
        spec: The topology spec this graph was built from.
        nodes: Full instantiated flat graph nodes (used by sim_engine).
        edges: Full instantiated flat graph edges (used by sim_engine).
        system_view: Representative system view (used by the visualizer).
        sip_view: Representative SIP view.
        cube_view: Representative cube view.
        pe_view: Representative PE view.
    """

    spec: dict[str, Any]

    # Flat graph; each instance gets its own fresh containers.
    nodes: dict[str, Node] = field(default_factory=dict)
    edges: list[Edge] = field(default_factory=list)

    # View projections; None until they have been built.
    system_view: ViewGraph | None = None
    sip_view: ViewGraph | None = None
    cube_view: ViewGraph | None = None
    pe_view: ViewGraph | None = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class TopologyHandle:
    """Immutable pairing of a topology file path with its compiled graph.

    Fields:
        path: Path associated with the topology.
        topology_obj: The compiled graph, or ``None``
            (None until _compile_graph is implemented).
    """

    path: Path
    topology_obj: TopologyGraph | None
|
||||||
@@ -0,0 +1,367 @@
|
|||||||
|
# kernbench/topology/visualizer.py
|
||||||
|
"""
|
||||||
|
SVG diagram generator for TopologyGraph views.
|
||||||
|
|
||||||
|
Produces mm-accurate, deterministic SVG files for each view level
|
||||||
|
(system, SIP, cube, PE) per ADR-0005 and ADR-0006.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from .types import Edge, Node, TopologyGraph, ViewGraph
|
||||||
|
|
||||||
|
# ── Color palette by component kind ─────────────────────────────────
|
||||||
|
|
||||||
|
# Fill colour (hex) for a node, looked up by Node.kind.
# Kinds missing from this table fall back to a default at the lookup site.
_KIND_COLORS: dict[str, str] = {
    "switch": "#6366f1",  # indigo
    "sip": "#e0e7ff",  # light indigo
    "iochiplet": "#0ea5e9",  # sky blue
    "pcie_ep": "#0ea5e9",
    "io_cpu": "#0ea5e9",
    "ucie_port": "#3b82f6",  # blue
    "noc": "#a78bfa",  # purple
    "m_cpu": "#f59e0b",  # amber
    "xbar": "#f97316",  # orange
    "hbm_ctrl": "#10b981",  # emerald
    "pe": "#94a3b8",  # slate
    "pe_cpu": "#ef4444",  # red
    "pe_scheduler": "#f59e0b",  # amber
    "pe_dma": "#3b82f6",  # blue
    "pe_gemm": "#8b5cf6",  # violet
    "pe_math": "#ec4899",  # pink
    "pe_tcm": "#10b981",  # emerald
    "sram": "#f59e0b",  # amber
    "cube": "#cbd5e1",  # slate-300
}

# Stroke colour (hex) for an edge, looked up by Edge.kind.
# Kinds missing from this table fall back to a default at the lookup site.
_EDGE_COLORS: dict[str, str] = {
    "pcie": "#6366f1",
    "io_internal": "#0ea5e9",
    "io_to_cube": "#0ea5e9",
    "ucie_mesh": "#3b82f6",
    "pe_to_xbar": "#f97316",
    "xbar_to_hbm": "#10b981",
    "xbar_to_bridge": "#a78bfa",
    "bridge_to_xbar": "#a78bfa",
    "noc_to_ucie": "#a78bfa",
    "pe_to_noc": "#a78bfa",
    "noc_to_sram": "#f59e0b",
    "command": "#f59e0b",
    "pe_internal": "#94a3b8",
}
|
||||||
|
|
||||||
|
# ── Node sizing ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Box size used for any node kind that has no entry in _KIND_SIZE.
_DEFAULT_NODE_W = 2.0  # mm
_DEFAULT_NODE_H = 1.2  # mm

# Per-kind box size as (width_mm, height_mm).
_KIND_SIZE: dict[str, tuple[float, float]] = {
    "sip": (60.0, 50.0),
    "cube": (6.0, 4.0),
    "iochiplet": (4.0, 1.5),
    "switch": (5.0, 1.5),
}
|
||||||
|
|
||||||
|
|
||||||
|
# ── Public API ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def emit_diagrams(graph: TopologyGraph, out_dir: Path) -> list[Path]:
    """Write one SVG file per available view of *graph* into *out_dir*.

    The directory (and any missing parents) is created first.  Views that are
    ``None`` are skipped.  Files are named ``<view attribute>.svg`` and are
    written as UTF-8.

    Args:
        graph: Topology whose ``system_view``, ``sip_view``, ``cube_view``
            and ``pe_view`` are rendered, in that order.
        out_dir: Destination directory.

    Returns:
        Paths of the files written, in rendering order.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    written: list[Path] = []
    for attr in ("system_view", "sip_view", "cube_view", "pe_view"):
        view = getattr(graph, attr)
        if view is not None:
            target = out_dir / f"{attr}.svg"
            target.write_text(_render_view_svg(view), encoding="utf-8")
            written.append(target)
    return written
|
||||||
|
|
||||||
|
|
||||||
|
# ── SVG rendering ────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _render_view_svg(view: ViewGraph) -> str:
    """Render *view* as a complete SVG document string.

    Drawing order is: header, background, title, cube-only decorations,
    edges, then nodes, so that nodes are painted on top of edges.  Edges
    whose endpoints are not both in ``view.nodes`` are skipped.

    Args:
        view: The view to render.

    Returns:
        The SVG markup, one element per line.
    """
    scale = _pick_scale(view)
    pad = 40  # px of padding on every side
    node_sizes = _compute_node_sizes(view, scale)

    # Canvas size in px.
    w_px = int(view.width_mm * scale + 2 * pad)
    h_px = int(view.height_mm * scale + 2 * pad)

    svg: list[str] = [
        _svg_header(w_px, h_px, view.name),
        # Background
        f' <rect width="{w_px}" height="{h_px}" fill="#f8fafc"/>',
        # Title
        (
            f' <text x="{w_px // 2}" y="18" text-anchor="middle" '
            f'font-family="monospace" font-size="14" font-weight="bold" fill="#1e293b">'
            f'{view.name.upper()} VIEW</text>'
        ),
    ]

    # The cube view additionally gets a die outline and an HBM backdrop.
    if view.name == "cube":
        _draw_cube_boundary(svg, view, scale, pad)
        _draw_hbm_block(svg, view, scale, pad)

    # Shared across all edges so fan-out edges receive distinct offsets.
    fanout_counter: dict[str, int] = {}
    known = view.nodes
    for edge in view.edges:
        if edge.src not in known or edge.dst not in known:
            continue
        _draw_edge(svg, edge, view, node_sizes, scale, pad, fanout_counter)

    for node in known.values():
        _draw_node(svg, node, node_sizes, scale, pad)

    svg.append("</svg>")
    return "\n".join(svg)
|
||||||
|
|
||||||
|
|
||||||
|
def _pick_scale(view: ViewGraph) -> float:
|
||||||
|
"""Pixels per mm, chosen per view type."""
|
||||||
|
return {
|
||||||
|
"system": 4.0,
|
||||||
|
"sip": 8.0,
|
||||||
|
"cube": 28.0,
|
||||||
|
"pe": 35.0,
|
||||||
|
}.get(view.name, 10.0)
|
||||||
|
|
||||||
|
|
||||||
|
def _compute_node_sizes(
    view: ViewGraph, scale: float,
) -> dict[str, tuple[float, float]]:
    """Return the pixel box size ``(w_px, h_px)`` of every node in *view*.

    Sizes come from the per-kind table with a default fallback, with two
    overrides: in the PE view every node is 2.5 x 1.4 mm, and in the cube
    view PE nodes are 1.8 x 1.0 mm.

    Args:
        view: View whose nodes are sized.
        scale: Pixels per mm.

    Returns:
        Mapping from node id to ``(w_px, h_px)``.
    """
    in_pe_view = view.name == "pe"
    in_cube_view = view.name == "cube"
    fallback = (_DEFAULT_NODE_W, _DEFAULT_NODE_H)

    def footprint_mm(node):
        # Table lookup first; the view-specific overrides take precedence.
        w_mm, h_mm = _KIND_SIZE.get(node.kind, fallback)
        if in_pe_view:
            return 2.5, 1.4
        if in_cube_view and node.kind == "pe":
            return 1.8, 1.0
        return w_mm, h_mm

    sizes: dict[str, tuple[float, float]] = {}
    for node_id, node in view.nodes.items():
        w_mm, h_mm = footprint_mm(node)
        sizes[node_id] = (w_mm * scale, h_mm * scale)
    return sizes
|
||||||
|
|
||||||
|
|
||||||
|
def _svg_header(w: int, h: int, title: str) -> str:
|
||||||
|
return (
|
||||||
|
f'<svg xmlns="http://www.w3.org/2000/svg" '
|
||||||
|
f'width="{w}" height="{h}" viewBox="0 0 {w} {h}">\n'
|
||||||
|
f' <title>{title}</title>'
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _draw_cube_boundary(
|
||||||
|
parts: list[str], view: ViewGraph, scale: float, pad: int,
|
||||||
|
) -> None:
|
||||||
|
"""Draw the cube die outline as a dashed rectangle."""
|
||||||
|
bx = pad
|
||||||
|
by = pad
|
||||||
|
bw = view.width_mm * scale
|
||||||
|
bh = view.height_mm * scale
|
||||||
|
parts.append(
|
||||||
|
f' <rect x="{bx:.1f}" y="{by:.1f}" '
|
||||||
|
f'width="{bw:.1f}" height="{bh:.1f}" '
|
||||||
|
f'rx="6" fill="none" stroke="#475569" stroke-width="2" '
|
||||||
|
f'stroke-dasharray="8,4"/>'
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _draw_hbm_block(
|
||||||
|
parts: list[str], view: ViewGraph, scale: float, pad: int,
|
||||||
|
) -> None:
|
||||||
|
"""Draw HBM area as a filled rectangle in cube view."""
|
||||||
|
# HBM area: centered at (8.5, 7.0), size 9x5 -> x=[4.0,13.0], y=[4.5,9.5]
|
||||||
|
hbm_x = 4.0 * scale + pad
|
||||||
|
hbm_y = 4.5 * scale + pad
|
||||||
|
hbm_w = 9.0 * scale
|
||||||
|
hbm_h = 5.0 * scale
|
||||||
|
parts.append(
|
||||||
|
f' <rect x="{hbm_x:.1f}" y="{hbm_y:.1f}" '
|
||||||
|
f'width="{hbm_w:.1f}" height="{hbm_h:.1f}" '
|
||||||
|
f'rx="4" fill="#d1fae5" stroke="#10b981" stroke-width="1.5" '
|
||||||
|
f'stroke-dasharray="6,3" opacity="0.5"/>'
|
||||||
|
)
|
||||||
|
cx = 8.5 * scale + pad
|
||||||
|
cy = 8.5 * scale + pad
|
||||||
|
parts.append(
|
||||||
|
f' <text x="{cx:.1f}" y="{cy:.1f}" text-anchor="middle" '
|
||||||
|
f'font-family="monospace" font-size="11" fill="#047857" opacity="0.7">'
|
||||||
|
f'HBM</text>'
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _draw_node(
|
||||||
|
parts: list[str],
|
||||||
|
node: Node,
|
||||||
|
sizes: dict[str, tuple[float, float]],
|
||||||
|
scale: float,
|
||||||
|
pad: int,
|
||||||
|
) -> None:
|
||||||
|
"""Draw a single node as a rounded rectangle with label."""
|
||||||
|
if node.pos_mm is None:
|
||||||
|
return
|
||||||
|
px = node.pos_mm[0] * scale + pad
|
||||||
|
py = node.pos_mm[1] * scale + pad
|
||||||
|
w, h = sizes.get(node.id, (40, 24))
|
||||||
|
|
||||||
|
x = px - w / 2
|
||||||
|
y = py - h / 2
|
||||||
|
fill = _KIND_COLORS.get(node.kind, "#e2e8f0")
|
||||||
|
text_color = "#ffffff" if _is_dark(fill) else "#1e293b"
|
||||||
|
|
||||||
|
parts.append(
|
||||||
|
f' <rect x="{x:.1f}" y="{y:.1f}" width="{w:.1f}" height="{h:.1f}" '
|
||||||
|
f'rx="4" fill="{fill}" stroke="#475569" stroke-width="1"/>'
|
||||||
|
)
|
||||||
|
|
||||||
|
label = node.label or node.id
|
||||||
|
font_size = _label_font_size(w, label)
|
||||||
|
parts.append(
|
||||||
|
f' <text x="{px:.1f}" y="{py + 4:.1f}" text-anchor="middle" '
|
||||||
|
f'font-family="monospace" font-size="{font_size}" fill="{text_color}">'
|
||||||
|
f'{_escape(label)}</text>'
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Fan-out edge kinds that need offset routing ─────────────────────
|
||||||
|
|
||||||
|
# Edge kinds that _draw_edge routes through a staggered horizontal channel
# (only in the cube view) instead of a plain line or L-bend.
_FANOUT_KINDS = {"pe_to_xbar", "pe_to_noc", "command", "noc_to_ucie"}
|
||||||
|
|
||||||
|
|
||||||
|
def _draw_edge(
|
||||||
|
parts: list[str],
|
||||||
|
edge: Edge,
|
||||||
|
view: ViewGraph,
|
||||||
|
sizes: dict[str, tuple[float, float]],
|
||||||
|
scale: float,
|
||||||
|
pad: int,
|
||||||
|
fanout_counter: dict[str, int],
|
||||||
|
) -> None:
|
||||||
|
"""Draw an edge with orthogonal (90-degree) routing for fan-out kinds."""
|
||||||
|
nodes = view.nodes
|
||||||
|
src_node = nodes[edge.src]
|
||||||
|
dst_node = nodes[edge.dst]
|
||||||
|
if src_node.pos_mm is None or dst_node.pos_mm is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
x1 = src_node.pos_mm[0] * scale + pad
|
||||||
|
y1 = src_node.pos_mm[1] * scale + pad
|
||||||
|
x2 = dst_node.pos_mm[0] * scale + pad
|
||||||
|
y2 = dst_node.pos_mm[1] * scale + pad
|
||||||
|
|
||||||
|
color = _EDGE_COLORS.get(edge.kind, "#94a3b8")
|
||||||
|
width = "1.5" if edge.kind == "pe_internal" else "1"
|
||||||
|
opacity = "0.6" if edge.kind in ("command", "noc_to_ucie") else "0.8"
|
||||||
|
|
||||||
|
if edge.kind in _FANOUT_KINDS and view.name == "cube":
|
||||||
|
# Orthogonal routing: src→horizontal→vertical→dst with per-edge offset.
|
||||||
|
group_key = f"{edge.kind}:{edge.dst}"
|
||||||
|
idx = fanout_counter.get(group_key, 0)
|
||||||
|
fanout_counter[group_key] = idx + 1
|
||||||
|
|
||||||
|
# Route: go vertically from src to a staggered horizontal channel,
|
||||||
|
# then horizontally to dst x, then vertically to dst.
|
||||||
|
mid_y = (y1 + y2) / 2 + (idx - 1.5) * 10 # spread channels vertically
|
||||||
|
|
||||||
|
parts.append(
|
||||||
|
f' <polyline points="{x1:.1f},{y1:.1f} {x1:.1f},{mid_y:.1f} '
|
||||||
|
f'{x2:.1f},{mid_y:.1f} {x2:.1f},{y2:.1f}" '
|
||||||
|
f'fill="none" stroke="{color}" stroke-width="{width}" opacity="{opacity}"/>'
|
||||||
|
)
|
||||||
|
|
||||||
|
# Label on the horizontal segment
|
||||||
|
if edge.distance_mm > 0:
|
||||||
|
lx = (x1 + x2) / 2
|
||||||
|
label = f"{edge.distance_mm:.1f}mm"
|
||||||
|
if edge.bw_gbs:
|
||||||
|
label += f" {edge.bw_gbs:.0f}GB/s"
|
||||||
|
parts.append(
|
||||||
|
f' <text x="{lx:.1f}" y="{mid_y - 3:.1f}" text-anchor="middle" '
|
||||||
|
f'font-family="monospace" font-size="7" fill="#64748b">'
|
||||||
|
f'{label}</text>'
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
# Non-fanout: orthogonal L-bend
|
||||||
|
if abs(x2 - x1) > 1 and abs(y2 - y1) > 1:
|
||||||
|
# PE view: vertical-first for left→right edges (scheduler→engines),
|
||||||
|
# horizontal-first for right→right edges (engines→tcm)
|
||||||
|
if view.name == "pe":
|
||||||
|
if src_node.pos_mm[0] < view.width_mm / 2:
|
||||||
|
# Source in left half: vertical-first (scheduler fan-out)
|
||||||
|
parts.append(
|
||||||
|
f' <polyline points="{x1:.1f},{y1:.1f} {x1:.1f},{y2:.1f} {x2:.1f},{y2:.1f}" '
|
||||||
|
f'fill="none" stroke="{color}" stroke-width="{width}" opacity="{opacity}"/>'
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Source in right half: horizontal-first (dma/math→tcm)
|
||||||
|
parts.append(
|
||||||
|
f' <polyline points="{x1:.1f},{y1:.1f} {x2:.1f},{y1:.1f} {x2:.1f},{y2:.1f}" '
|
||||||
|
f'fill="none" stroke="{color}" stroke-width="{width}" opacity="{opacity}"/>'
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
parts.append(
|
||||||
|
f' <polyline points="{x1:.1f},{y1:.1f} {x2:.1f},{y1:.1f} {x2:.1f},{y2:.1f}" '
|
||||||
|
f'fill="none" stroke="{color}" stroke-width="{width}" opacity="{opacity}"/>'
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
parts.append(
|
||||||
|
f' <line x1="{x1:.1f}" y1="{y1:.1f}" x2="{x2:.1f}" y2="{y2:.1f}" '
|
||||||
|
f'stroke="{color}" stroke-width="{width}" opacity="{opacity}"/>'
|
||||||
|
)
|
||||||
|
|
||||||
|
# Distance label at midpoint
|
||||||
|
if edge.distance_mm > 0:
|
||||||
|
mx = (x1 + x2) / 2
|
||||||
|
my = (y1 + y2) / 2
|
||||||
|
label = f"{edge.distance_mm:.1f}mm"
|
||||||
|
if edge.bw_gbs:
|
||||||
|
label += f" {edge.bw_gbs:.0f}GB/s"
|
||||||
|
parts.append(
|
||||||
|
f' <text x="{mx:.1f}" y="{my - 4:.1f}" text-anchor="middle" '
|
||||||
|
f'font-family="monospace" font-size="7" fill="#64748b">'
|
||||||
|
f'{label}</text>'
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Helpers ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _is_dark(hex_color: str) -> bool:
|
||||||
|
"""Check if a hex color is dark (for white text)."""
|
||||||
|
h = hex_color.lstrip("#")
|
||||||
|
r, g, b = int(h[0:2], 16), int(h[2:4], 16), int(h[4:6], 16)
|
||||||
|
return (r * 0.299 + g * 0.587 + b * 0.114) < 140
|
||||||
|
|
||||||
|
|
||||||
|
def _label_font_size(box_width: float, label: str) -> int:
|
||||||
|
"""Choose font size to fit label in box."""
|
||||||
|
char_w = len(label) * 7
|
||||||
|
if char_w > box_width * 0.9:
|
||||||
|
return max(7, int(box_width * 0.9 / len(label) * 1.4))
|
||||||
|
return 10
|
||||||
|
|
||||||
|
|
||||||
|
def _escape(text: str) -> str:
|
||||||
|
"""Escape XML special characters."""
|
||||||
|
return text.replace("&", "&").replace("<", "<").replace(">", ">")
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
"""Triton emulator: fake tl module for kernel performance simulation.
|
||||||
|
|
||||||
|
Provides TLContext (the fake `tl` parameter) that kernels use to express
|
||||||
|
memory access patterns and compute operations. Kernel functions are plain
|
||||||
|
Python — no yield, no async — and generate a PeCommand trace that PE_CPU
|
||||||
|
replays through SimPy.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from kernbench.triton_emu.registry import register_kernel, get_kernel
|
||||||
|
from kernbench.triton_emu.tl_context import TLContext
|
||||||
|
"""
|
||||||
@@ -0,0 +1,30 @@
|
|||||||
|
"""Kernel registry: maps kernel names to plain Python kernel callables.
|
||||||
|
|
||||||
|
Benchmarks register kernel functions here; PE_CPU looks them up by
|
||||||
|
KernelRef.name at execution time.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Callable
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
# Module-level registry: kernel name -> registered kernel callable.
_kernels: dict[str, Callable[..., None]] = {}
|
||||||
|
|
||||||
|
|
||||||
|
def register_kernel(name: str, fn: Callable[..., None]) -> None:
    """Add *fn* to the registry under *name*.

    Raises:
        ValueError: if a kernel is already registered under *name*.
    """
    already_present = name in _kernels
    if already_present:
        raise ValueError(f"kernel '{name}' already registered")
    _kernels[name] = fn
|
||||||
|
|
||||||
|
|
||||||
|
def get_kernel(name: str) -> Callable[..., None]:
    """Return the kernel callable registered under *name*.

    Raises:
        KeyError: if no kernel has been registered under *name*.
    """
    try:
        return _kernels[name]
    except KeyError:
        # "from None" keeps the traceback free of the internal dict lookup.
        raise KeyError(f"kernel '{name}' not registered") from None
|
||||||
|
|
||||||
|
|
||||||
|
def clear_registry() -> None:
    """Clear all registered kernels (for testing).

    The registry dict is emptied in place, so every name becomes
    available for registration again.
    """
    _kernels.clear()
|
||||||
@@ -0,0 +1,356 @@
|
|||||||
|
"""TLContext: fake Triton Language module for kernel performance simulation.
|
||||||
|
|
||||||
|
Passed as the `tl` parameter to kernel functions. Each API call records a
|
||||||
|
PeCommand in the internal trace. After the kernel returns, PE_CPU replays
|
||||||
|
the command list through SimPy.
|
||||||
|
|
||||||
|
Kernel code looks like standard Python — no yield, no async:
|
||||||
|
|
||||||
|
def my_kernel(a_ptr, b_ptr, out_ptr, tl):
|
||||||
|
pid = tl.program_id(0)
|
||||||
|
a = tl.load(a_ptr, shape=(32, 64), dtype="f16")
|
||||||
|
b = tl.load(b_ptr + pid * stride, shape=(64, 32), dtype="f16")
|
||||||
|
tl.composite(op="gemm", a=a, b=b, out_ptr=out_ptr)
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import math
|
||||||
|
from typing import Literal
|
||||||
|
|
||||||
|
from kernbench.common.pe_commands import (
|
||||||
|
CompletionHandle,
|
||||||
|
CompositeCmd,
|
||||||
|
DmaReadCmd,
|
||||||
|
DmaWriteCmd,
|
||||||
|
GemmCmd,
|
||||||
|
MathCmd,
|
||||||
|
PeCommand,
|
||||||
|
PeCpuOverheadCmd,
|
||||||
|
TensorHandle,
|
||||||
|
WaitCmd,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Element size in bytes for each known dtype name.
_DTYPE_BYTES: dict[str, int] = {
    "f16": 2, "f32": 4, "f64": 8,
    "bf16": 2,
    "i8": 1, "i16": 2, "i32": 4, "i64": 8,
    "u8": 1, "u16": 2, "u32": 4, "u64": 8,
}
|
||||||
|
|
||||||
|
|
||||||
|
class TLContext:
    """Fake Triton Language context.

    Calls on this object do no real work; they append PeCommand entries
    to an internal list that is exposed through ``commands``.

    Args:
        pe_id: program instance index (returned by program_id).
        num_programs: total number of program instances.
        dispatch_cycles: PE_CPU overhead per tl API call (auto-inserted by
            load, store, dot, the math helpers, where and composite).
    """

    def __init__(
        self,
        pe_id: int = 0,
        num_programs: int = 1,
        dispatch_cycles: int = 1,
    ) -> None:
        self._pe_id = pe_id
        self._num_programs = num_programs
        self._dispatch_cycles = dispatch_cycles
        # Recorded command trace, in call order.
        self._commands: list[PeCommand] = []
        # Counters behind the "t<N>" handle ids and "c<N>" completion ids.
        self._handle_counter = 0
        self._completion_counter = 0

    @property
    def commands(self) -> list[PeCommand]:
        """Return the recorded command trace (the internal list, not a copy)."""
        return self._commands

    # ── helpers ────────────────────────────────────────────────────

    def _next_handle_id(self) -> str:
        """Return the next tensor handle id ("t1", "t2", ...)."""
        self._handle_counter += 1
        return f"t{self._handle_counter}"

    def _next_completion_id(self) -> str:
        """Return the next completion id ("c1", "c2", ...)."""
        self._completion_counter += 1
        return f"c{self._completion_counter}"

    def _dtype_bytes(self, dtype: str) -> int:
        """Return the element size of *dtype*; unknown names fall back to 2."""
        return _DTYPE_BYTES.get(dtype, 2)

    def _nbytes(self, shape: tuple[int, ...], dtype: str) -> int:
        """Return the byte size of a tensor: product of *shape* times element size."""
        return math.prod(shape) * self._dtype_bytes(dtype)

    def _emit_dispatch_overhead(self) -> None:
        """Append a PeCpuOverheadCmd when dispatch_cycles is positive."""
        if self._dispatch_cycles > 0:
            self._commands.append(PeCpuOverheadCmd(cycles=self._dispatch_cycles))

    def _make_handle(
        self, pa: int, shape: tuple[int, ...], dtype: str,
    ) -> TensorHandle:
        """Build a TensorHandle with a fresh id and a computed nbytes."""
        return TensorHandle(
            id=self._next_handle_id(),
            pa=pa, shape=shape, dtype=dtype,
            nbytes=self._nbytes(shape, dtype),
        )

    # ── Reference (no DMA, metadata only) ────────────────────────

    def ref(
        self, ptr: int, shape: tuple[int, ...], dtype: str = "f16",
    ) -> TensorHandle:
        """Create a TensorHandle referencing HBM data without issuing DMA.

        Used when the scheduler will stream data per-tile (e.g., tensor b
        in a composite GEMM). No command is generated.
        """
        return self._make_handle(pa=ptr, shape=shape, dtype=dtype)

    # ── Data Movement (blocking, DMA engine) ──────────────────────

    def load(
        self, ptr: int, shape: tuple[int, ...], dtype: str = "f16",
    ) -> TensorHandle:
        """Load tensor from HBM to TCM. Returns TensorHandle.

        Records the dispatch overhead followed by a DmaReadCmd whose byte
        count is the handle's nbytes.
        """
        self._emit_dispatch_overhead()
        handle = self._make_handle(pa=ptr, shape=shape, dtype=dtype)
        self._commands.append(DmaReadCmd(
            handle=handle, src_pa=ptr, nbytes=handle.nbytes,
        ))
        return handle

    def store(self, ptr: int, handle: TensorHandle) -> None:
        """Store tensor from TCM to HBM.

        Records the dispatch overhead followed by a DmaWriteCmd to *ptr*.
        """
        self._emit_dispatch_overhead()
        self._commands.append(DmaWriteCmd(
            handle=handle, dst_pa=ptr, nbytes=handle.nbytes,
        ))

    # ── GEMM Engine (blocking) ────────────────────────────────────

    def dot(self, a: TensorHandle, b: TensorHandle) -> TensorHandle:
        """Matrix multiply: out = a @ b. Both operands must be in TCM.

        a: (M, K), b: (K, N) → out: (M, N)

        Leading (batch) dimensions of the output are taken from ``a``;
        the output dtype is ``a.dtype`` and its handle has pa=0.

        Raises:
            ValueError: if either operand has fewer than two dimensions,
                or the inner dimensions differ.
        """
        # The check admits any rank >= 2 even though the message says "2D".
        if len(a.shape) < 2 or len(b.shape) < 2:
            raise ValueError("dot requires 2D tensors")
        m, k = a.shape[-2], a.shape[-1]
        k2, n = b.shape[-2], b.shape[-1]
        if k != k2:
            raise ValueError(f"dot shape mismatch: a.K={k} != b.K={k2}")
        out_shape = (*a.shape[:-2], m, n)
        out_dtype = a.dtype
        out = self._make_handle(pa=0, shape=out_shape, dtype=out_dtype)
        self._emit_dispatch_overhead()
        self._commands.append(GemmCmd(a=a, b=b, out=out, m=m, k=k, n=n))
        return out

    # ── MATH Engine: unary (blocking) ─────────────────────────────

    def _unary_math(self, op: str, x: TensorHandle) -> TensorHandle:
        """Record a one-input MathCmd; the output mirrors x's shape and dtype."""
        out = self._make_handle(pa=0, shape=x.shape, dtype=x.dtype)
        self._emit_dispatch_overhead()
        self._commands.append(MathCmd(op=op, inputs=(x,), out=out))
        return out

    def exp(self, x: TensorHandle) -> TensorHandle:
        """Record an elementwise "exp" MathCmd on x."""
        return self._unary_math("exp", x)

    def log(self, x: TensorHandle) -> TensorHandle:
        """Record an elementwise "log" MathCmd on x."""
        return self._unary_math("log", x)

    def sqrt(self, x: TensorHandle) -> TensorHandle:
        """Record an elementwise "sqrt" MathCmd on x."""
        return self._unary_math("sqrt", x)

    def abs(self, x: TensorHandle) -> TensorHandle:
        """Record an elementwise "abs" MathCmd on x."""
        return self._unary_math("abs", x)

    def sigmoid(self, x: TensorHandle) -> TensorHandle:
        """Record an elementwise "sigmoid" MathCmd on x."""
        return self._unary_math("sigmoid", x)

    def cos(self, x: TensorHandle) -> TensorHandle:
        """Record an elementwise "cos" MathCmd on x."""
        return self._unary_math("cos", x)

    def sin(self, x: TensorHandle) -> TensorHandle:
        """Record an elementwise "sin" MathCmd on x."""
        return self._unary_math("sin", x)

    # ── MATH Engine: reduction (blocking) ─────────────────────────

    def _reduction(
        self, op: str, x: TensorHandle, axis: int,
    ) -> TensorHandle:
        """Record a reduction MathCmd along *axis*.

        The output keeps x's rank; the reduced dimension is set to 1.
        """
        out_shape = list(x.shape)
        out_shape[axis] = 1
        out = self._make_handle(pa=0, shape=tuple(out_shape), dtype=x.dtype)
        self._emit_dispatch_overhead()
        self._commands.append(MathCmd(op=op, inputs=(x,), out=out, axis=axis))
        return out

    def sum(self, x: TensorHandle, axis: int) -> TensorHandle:
        """Record a "sum" reduction of x along *axis*."""
        return self._reduction("sum", x, axis)

    def max(self, x: TensorHandle, axis: int) -> TensorHandle:
        """Record a "max" reduction of x along *axis*."""
        return self._reduction("max", x, axis)

    def min(self, x: TensorHandle, axis: int) -> TensorHandle:
        """Record a "min" reduction of x along *axis*."""
        return self._reduction("min", x, axis)

    # ── MATH Engine: binary (blocking) ────────────────────────────

    def _binary_math(
        self, op: str, a: TensorHandle, b: TensorHandle,
    ) -> TensorHandle:
        """Record a two-input MathCmd.

        The output takes a's shape and dtype; b's shape is not checked here.
        """
        out = self._make_handle(pa=0, shape=a.shape, dtype=a.dtype)
        self._emit_dispatch_overhead()
        self._commands.append(MathCmd(op=op, inputs=(a, b), out=out))
        return out

    def where(
        self, cond: TensorHandle, a: TensorHandle, b: TensorHandle,
    ) -> TensorHandle:
        """Record a "where" MathCmd with inputs (cond, a, b).

        The output takes a's shape and dtype.
        """
        out = self._make_handle(pa=0, shape=a.shape, dtype=a.dtype)
        self._emit_dispatch_overhead()
        self._commands.append(MathCmd(op="where", inputs=(cond, a, b), out=out))
        return out

    # ── Index / Scalar (PE_CPU, no engine) ────────────────────────

    def program_id(self, axis: int = 0) -> int:
        """Return program instance index (*axis* is accepted but not used)."""
        return self._pe_id

    def num_programs(self, axis: int = 0) -> int:
        """Return total number of program instances (*axis* is not used)."""
        return self._num_programs

    def arange(self, start: int, end: int, dtype: str = "i32") -> TensorHandle:
        """Create index range tensor in TCM.

        Only a handle of shape ``(end - start,)`` is created; no command
        is recorded.
        """
        n = end - start
        return self._make_handle(pa=0, shape=(n,), dtype=dtype)

    def zeros(self, shape: tuple[int, ...], dtype: str = "f16") -> TensorHandle:
        """Create zero-filled tensor in TCM (handle only, no command recorded)."""
        return self._make_handle(pa=0, shape=shape, dtype=dtype)

    def full(
        self, shape: tuple[int, ...], value: float | int, dtype: str = "f16",
    ) -> TensorHandle:
        """Create constant-filled tensor in TCM.

        Handle only, no command recorded; *value* is not stored anywhere.
        """
        return self._make_handle(pa=0, shape=shape, dtype=dtype)

    # ── Metadata (no compute, no DMA) ─────────────────────────────

    def trans(self, x: TensorHandle) -> TensorHandle:
        """Transpose — shape change only, no command generated.

        The returned handle reuses x's id, pa, dtype, nbytes and data, with
        the last two dimensions swapped.

        Raises:
            ValueError: if x has fewer than two dimensions.
        """
        if len(x.shape) < 2:
            raise ValueError("trans requires at least 2D tensor")
        new_shape = (*x.shape[:-2], x.shape[-1], x.shape[-2])
        return TensorHandle(
            id=x.id, pa=x.pa, shape=new_shape,
            dtype=x.dtype, nbytes=x.nbytes, data=x.data,
        )

    # ── Composite + Control ───────────────────────────────────────

    def composite(
        self,
        op: Literal["gemm", "math"],
        a: TensorHandle,
        b: TensorHandle | None = None,
        out_ptr: int = 0,
        math_op: str | None = None,
    ) -> CompletionHandle:
        """Submit a composite command (non-blocking, tiled pipeline).

        For a "gemm" with *b* given, the output size is M * N elements of
        a's dtype (last two dims of a, last dim of b; leading dims are not
        counted). In every other case it equals ``a.nbytes``.

        Returns CompletionHandle for use with wait().
        """
        # Compute output size based on op
        if op == "gemm" and b is not None:
            # k is unpacked but not used below; inner dims are not validated.
            m, k = a.shape[-2], a.shape[-1]
            n = b.shape[-1]
            out_dtype = a.dtype
            out_nbytes = m * n * self._dtype_bytes(out_dtype)
        else:
            out_nbytes = a.nbytes

        completion = CompletionHandle(id=self._next_completion_id())
        self._emit_dispatch_overhead()
        self._commands.append(CompositeCmd(
            completion=completion, op=op,
            a=a, b=b, out_pa=out_ptr, out_nbytes=out_nbytes,
            math_op=math_op,
        ))
        return completion

    def wait(self, handle: CompletionHandle | None = None) -> None:
        """Wait for a specific composite or all pending composites.

        Records a WaitCmd only; no dispatch overhead is added.
        """
        self._commands.append(WaitCmd(handle=handle))

    def cycles(self, n: int) -> None:
        """Declare PE_CPU scalar execution overhead (cycles).

        Records a PeCpuOverheadCmd of *n* cycles; no dispatch overhead is
        added on top.
        """
        self._commands.append(PeCpuOverheadCmd(cycles=n))
|
||||||
|
|
||||||
|
|
||||||
|
# ── TensorHandle arithmetic operators ─────────────────────────────
|
||||||
|
# Enables: a + b, a * b, a - b, a / b in kernel code.
|
||||||
|
# Each creates a MathCmd through the TLContext that is active on the
# current thread; the operators look it up via a thread-local reference.
|
||||||
|
|
||||||
|
|
||||||
|
def _enable_tensor_ops() -> None:
    """Install arithmetic operators on TensorHandle.

    Runs once at import time. Each operator records a MathCmd through the
    TLContext stored in a thread-local slot; the slot's setter and getter
    are exposed as ``TLContext._set_active`` / ``TLContext._get_active``.
    """
    import threading

    state = threading.local()

    def set_active_context(ctx: TLContext | None) -> None:
        state.ctx = ctx

    def get_active_context() -> TLContext:
        active = getattr(state, "ctx", None)
        if active is not None:
            return active
        raise RuntimeError("TensorHandle ops require an active TLContext")

    def _binop(op: str):
        def method(self: TensorHandle, other: TensorHandle) -> TensorHandle:
            return get_active_context()._binary_math(op, self, other)

        return method

    # Operator dunder -> MathCmd op name.
    for dunder, op_name in (
        ("__add__", "add"),
        ("__sub__", "sub"),
        ("__mul__", "mul"),
        ("__truediv__", "div"),
    ):
        setattr(TensorHandle, dunder, _binop(op_name))

    # Context management hooks used by run_kernel.
    TLContext._set_active = staticmethod(set_active_context)  # type: ignore[attr-defined]
    TLContext._get_active = staticmethod(get_active_context)  # type: ignore[attr-defined]


_enable_tensor_ops()
|
||||||
|
|
||||||
|
|
||||||
|
def run_kernel(
    kernel_fn,
    tl_ctx: TLContext,
    *args,
    **kwargs,
) -> list[PeCommand]:
    """Execute a kernel function with the given TLContext and return commands.

    Sets tl_ctx as the active context for TensorHandle operators, calls
    ``kernel_fn(*args, tl=tl_ctx, **kwargs)``, then restores whichever
    context was active before (None when there was none), so a nested
    call does not clobber the outer kernel's context.

    Returns:
        The command list recorded on tl_ctx.
    """
    try:
        previous = TLContext._get_active()  # type: ignore[attr-defined]
    except RuntimeError:
        # The getter raises when no context is active on this thread.
        previous = None
    TLContext._set_active(tl_ctx)  # type: ignore[attr-defined]
    try:
        kernel_fn(*args, tl=tl_ctx, **kwargs)
    finally:
        TLContext._set_active(previous)  # type: ignore[attr-defined]
    return tl_ctx.commands
|
||||||
@@ -0,0 +1,22 @@
|
|||||||
|
import kernbench.cli.main as cli_main
|
||||||
|
|
||||||
|
|
||||||
|
def test_cli_main_arg_parsing(monkeypatch):
    """The `run` subcommand parses --topology/--bench and leaves device unset."""

    def fake_cmd_run(args) -> int:
        assert args.cmd == "run"
        assert args.topology == "topology.yaml"
        assert args.bench == "qkv_gemm"
        # Identity check: None is a singleton, so "is" is the correct test.
        assert args.device is None
        return 0

    # monkey patch the handler to test arg parsing without running the actual bench
    monkeypatch.setattr(cli_main, "cmd_run", fake_cmd_run)
    rc = cli_main.main(["run", "--topology", "topology.yaml", "--bench", "qkv_gemm"])
    assert rc == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_cli_main():
    """Run the real `run` subcommand and expect exit code 0.

    NOTE(review): "topology.yaml" is a relative path, so this presumably
    depends on the working directory pytest is started from — confirm.
    """

    rc = cli_main.main(["run", "--topology", "topology.yaml", "--bench", "qkv_gemm"])
    assert rc == 0
|
||||||
@@ -0,0 +1,187 @@
|
|||||||
|
"""Tests for the SimPy component model and DI registry (ADR-0007 D3).
|
||||||
|
|
||||||
|
Phase 1 verification: all tests FAIL until Phase 2 implements production code.
|
||||||
|
|
||||||
|
Latency invariant after refactor:
|
||||||
|
total_ns = Σ(wire propagation) + Σ(component.run() overhead_ns) + nbytes / bottleneck_bw
|
||||||
|
This is identical to the current formula for Phase 0 (no contention).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from kernbench.components.base import ComponentBase, ComponentRegistry
|
||||||
|
from kernbench.components.impls.forwarding import TransitComponent
|
||||||
|
from kernbench.policy.address.phyaddr import PhysAddr
|
||||||
|
from kernbench.runtime_api.kernel import MemoryReadMsg
|
||||||
|
from kernbench.sim_engine.engine import GraphEngine
|
||||||
|
from kernbench.topology.builder import load_topology
|
||||||
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
# topology.yaml located one directory above this test module's directory.
TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
|
||||||
|
|
||||||
|
|
||||||
|
def _graph():
    """Load and return the topology graph from TOPOLOGY_PATH."""
    return load_topology(TOPOLOGY_PATH)
|
||||||
|
|
||||||
|
|
||||||
|
def _hbm_pa(pe_id: int = 0) -> int:
    """Return an encoded HBM address for *pe_id* on rack 0, sip 0, cube 0."""
    total_hbm_bytes = 48 * (1 << 30)
    per_slice_bytes = total_hbm_bytes // 8
    address = PhysAddr.pe_hbm_addr(
        rack_id=0,
        sip_id=0,
        cube_id=0,
        pe_id=pe_id,
        pe_local_hbm_offset=0x1000,
        slice_size_bytes=per_slice_bytes,
    )
    return address.encode()
|
||||||
|
|
||||||
|
|
||||||
|
def _node(impl: str, overhead_ns: float = 0.0) -> Node:
    """Build a throwaway xbar Node with the given impl and overhead_ns."""
    attributes = {"overhead_ns": overhead_ns}
    return Node(
        id="test",
        kind="xbar",
        impl=impl,
        attrs=attributes,
        pos_mm=None,
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ── 1. unknown impl → error ──────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_registry_unknown_impl_raises_error():
    """ComponentRegistry.create rejects an impl that was never registered."""
    unknown = _node("totally_unknown_v99", overhead_ns=5.0)
    expectation = pytest.raises(ValueError, match="No component registered")
    with expectation:
        ComponentRegistry.create(unknown)
|
||||||
|
|
||||||
|
|
||||||
|
# ── 2. TransitComponent yields exactly overhead_ns via simpy timeout ──
|
||||||
|
|
||||||
|
|
||||||
|
def test_transit_component_yields_overhead_ns():
    """Running a TransitComponent advances sim time by its overhead_ns."""
    component = TransitComponent(_node("xbar_v1", overhead_ns=3.0))
    env = simpy.Environment()

    def drive():
        yield from component.run(env, nbytes=4096)

    env.process(drive())
    env.run()

    elapsed = env.now
    assert elapsed == pytest.approx(3.0)
|
||||||
|
|
||||||
|
|
||||||
|
def test_transit_component_zero_overhead_ns():
    """A zero-overhead TransitComponent still completes, at time 0."""
    component = TransitComponent(_node("noc_v1", overhead_ns=0.0))
    env = simpy.Environment()
    finished = []

    def drive():
        yield from component.run(env, nbytes=1024)
        # Reached only if run() terminates.
        finished.append(True)

    env.process(drive())
    env.run()

    assert finished == [True]
    assert env.now == pytest.approx(0.0)
|
||||||
|
|
||||||
|
|
||||||
|
# ── 3. DI override: custom component is invoked by engine ────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_engine_component_override_is_called():
    """A component passed via component_overrides is run by the engine."""

    class SpyXbar(ComponentBase):
        # Class-level counter so every instance the engine creates reports here.
        calls = 0

        def run(self, env, nbytes):
            SpyXbar.calls += 1
            yield env.timeout(0)

    SpyXbar.calls = 0
    topology = _graph()
    engine = GraphEngine(topology, component_overrides={"xbar_v1": SpyXbar})
    read_msg = MemoryReadMsg(
        correlation_id="c",
        request_id="r",
        src_sip=0,
        src_cube=0,
        src_pe=0,
        src_pa=_hbm_pa(pe_id=0),
        nbytes=4096,
    )
    handle = engine.submit(read_msg)
    engine.wait(handle)

    # PE0→slice0 path passes through xbar.pe0 (impl=xbar_v1)
    assert SpyXbar.calls > 0
|
||||||
|
|
||||||
|
|
||||||
|
# ── 4. behavior unchanged: total_ns matches existing formula ─────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_engine_component_model_same_latency_as_before():
    """Pin total_ns of a 4096B PE0→slice0 local HBM read (Phase B model).

    Wires are cut-through (wormhole): they contribute propagation only.
    Serialization (drain) is computed per path and applied once at the
    terminal.

    Forward path:
      Path 1: pcie_ep(5.0) + wire(1.0mm=0.01) + io_cpu(10.0)
      Path 2: wire(3.5mm=0.035) + ucie-N(1.0)
              + 2DMeshNOC(ucie-N→m_cpu: Manhattan 10.9mm=0.109) + m_cpu(5.0)
      Path 3 DMA (m_cpu→noc→xbar.pe0→hbm_ctrl.slice0):
              + 2DMeshNOC(m_cpu→xbar.pe0: Manhattan 15.0mm=0.15)
              + xbar.pe0(2.0) + wire(2.5mm=0.025) + hbm_ctrl(0.0)
              + drain_ns(4096/128 = 32.0, bottleneck = noc_to_xbar 128 GB/s)

    Response path (reverse, nbytes=0, drain=0):
      DMA response: hbm_ctrl→xbar.pe0→noc→m_cpu (propagation + xbar overhead_ns)
      Command response: m_cpu→noc→ucie-N→io_cpu (propagation + ucie overhead_ns)

    Total: ~58.648 ns
    """
    engine = GraphEngine(_graph())
    read_msg = MemoryReadMsg(
        correlation_id="c",
        request_id="r",
        src_sip=0,
        src_cube=0,
        src_pe=0,
        src_pa=_hbm_pa(pe_id=0),
        nbytes=4096,
    )
    handle = engine.submit(read_msg)
    engine.wait(handle)
    _completion, trace = engine.get_completion(handle)

    total_ns = trace["total_ns"]
    assert total_ns == pytest.approx(58.648, rel=1e-4)
|
||||||
|
|
||||||
|
|
||||||
|
# ── 5. override is scoped: only targeted impl is replaced ────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_engine_override_is_scoped_to_impl():
    """Replacing only xbar_v1 with a zero-latency impl saves exactly 4.0 ns.

    xbar.pe0 carries overhead_ns=2.0 and is crossed on both the forward DMA
    path and the reverse response path, so the override removes
    2.0 ns × 2 = 4.0 ns while every other component keeps its latency.
    """

    class ZeroXbar(ComponentBase):
        def run(self, env, nbytes):
            yield env.timeout(0)

    topology = _graph()
    engine_default = GraphEngine(topology)
    engine_override = GraphEngine(topology, component_overrides={"xbar_v1": ZeroXbar})

    read_msg = MemoryReadMsg(
        correlation_id="c",
        request_id="r",
        src_sip=0,
        src_cube=0,
        src_pe=0,
        src_pa=_hbm_pa(pe_id=0),
        nbytes=4096,
    )

    def total_ns_on(engine):
        # Submit the shared message and return the traced total latency.
        handle = engine.submit(read_msg)
        engine.wait(handle)
        _completion, trace = engine.get_completion(handle)
        return trace["total_ns"]

    default_ns = total_ns_on(engine_default)
    override_ns = total_ns_on(engine_override)

    # ZeroXbar removes overhead_ns=2.0 from xbar.pe0 on forward + response = 4.0 ns faster
    assert override_ns < default_ns
    assert default_ns - override_ns == pytest.approx(4.0, rel=1e-6)
|
||||||
@@ -0,0 +1,405 @@
|
|||||||
|
import pytest
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from kernbench.common.types import Completion, RequestHandle
|
||||||
|
from kernbench.policy.address.phyaddr import PhysAddr
|
||||||
|
from kernbench.runtime_api.kernel import (
|
||||||
|
KernelLaunchMsg,
|
||||||
|
KernelRef,
|
||||||
|
MemoryReadMsg,
|
||||||
|
MemoryWriteMsg,
|
||||||
|
ScalarArg,
|
||||||
|
TensorArg,
|
||||||
|
TensorArgShard,
|
||||||
|
)
|
||||||
|
from kernbench.sim_engine.engine import GraphEngine
|
||||||
|
from kernbench.topology.builder import load_topology
|
||||||
|
|
||||||
|
# topology.yaml located one directory above this test module's directory.
TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
|
||||||
|
|
||||||
|
|
||||||
|
def _engine():
    """Return a fresh GraphEngine built from the topology at TOPOLOGY_PATH."""
    return GraphEngine(load_topology(TOPOLOGY_PATH))
|
||||||
|
|
||||||
|
|
||||||
|
def _hbm_pa(sip: int = 0, cube: int = 0, pe_id: int = 0) -> int:
    """Return an encoded HBM physical address inside one PE's HBM slice."""
    # 48 GB split into 8 slices -> 6 GB per slice
    total_hbm_bytes = 48 * (1 << 30)
    per_slice_bytes = total_hbm_bytes // 8
    address = PhysAddr.pe_hbm_addr(
        rack_id=0,
        sip_id=sip,
        cube_id=cube,
        pe_id=pe_id,
        pe_local_hbm_offset=0x1000,
        slice_size_bytes=per_slice_bytes,
    )
    return address.encode()
|
||||||
|
|
||||||
|
|
||||||
|
def _sram_pa(sip: int = 0, cube: int = 0) -> int:
    """Return an encoded SRAM physical address (offset 0x800) on rack 0."""
    return PhysAddr.cube_sram_addr(
        rack_id=0,
        sip_id=sip,
        cube_id=cube,
        sram_offset=0x800,
    ).encode()
|
||||||
|
|
||||||
|
|
||||||
|
# ── 1. submit returns handle ────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_engine_submit_returns_handle():
    """submit() hands back a non-empty string handle."""
    engine = _engine()
    write_msg = MemoryWriteMsg(
        correlation_id="c0",
        request_id="r0",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(),
        nbytes=4096,
        pattern="zero",
    )
    returned = engine.submit(write_msg)
    assert isinstance(returned, str)
    assert len(returned) > 0
|
||||||
|
|
||||||
|
|
||||||
|
# ── 2. memory write completion ──────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_engine_memory_write_completion():
    """A MemoryWrite request finishes with ok=True."""
    engine = _engine()
    write_msg = MemoryWriteMsg(
        correlation_id="c0",
        request_id="r1",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(),
        nbytes=4096,
        pattern="zero",
    )
    handle = engine.submit(write_msg)
    engine.wait(handle)
    completion, _trace = engine.get_completion(handle)
    assert completion.ok is True
|
||||||
|
|
||||||
|
|
||||||
|
# ── 3. memory read completion ───────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_engine_memory_read_completion():
    """A MemoryRead request finishes with ok=True."""
    engine = _engine()
    read_msg = MemoryReadMsg(
        correlation_id="c0",
        request_id="r2",
        src_sip=0,
        src_cube=0,
        src_pe=0,
        src_pa=_hbm_pa(),
        nbytes=4096,
    )
    handle = engine.submit(read_msg)
    engine.wait(handle)
    completion, _trace = engine.get_completion(handle)
    assert completion.ok is True
|
||||||
|
|
||||||
|
|
||||||
|
# ── 4. latency positive ────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_engine_latency_positive():
    """The trace reports a strictly positive total_ns (ADR-0002 D4)."""
    engine = _engine()
    write_msg = MemoryWriteMsg(
        correlation_id="c0",
        request_id="r3",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(),
        nbytes=4096,
        pattern="zero",
    )
    handle = engine.submit(write_msg)
    engine.wait(handle)
    _completion, trace = engine.get_completion(handle)
    total_ns = trace["total_ns"]
    assert total_ns > 0
|
||||||
|
|
||||||
|
|
||||||
|
# ── 5. trace has total_ns and nbytes ───────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_engine_trace_has_total_ns_and_nbytes():
    """The trace exposes 'total_ns' and echoes the request's 'nbytes'."""
    engine = _engine()
    write_msg = MemoryWriteMsg(
        correlation_id="c0",
        request_id="r4",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(),
        nbytes=4096,
        pattern="zero",
    )
    handle = engine.submit(write_msg)
    engine.wait(handle)
    _completion, trace = engine.get_completion(handle)
    for required_key in ("total_ns", "nbytes"):
        assert required_key in trace
    assert trace["nbytes"] == 4096
|
||||||
|
|
||||||
|
|
||||||
|
# ── 6. latency includes node overhead_ns ────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_engine_latency_includes_node_overhead_ns():
    """total_ns is at least the summed overhead_ns of components on the path."""
    engine = _engine()
    write_msg = MemoryWriteMsg(
        correlation_id="c0",
        request_id="r7",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(),
        nbytes=4096,
        pattern="zero",
    )
    handle = engine.submit(write_msg)
    engine.wait(handle)
    _completion, trace = engine.get_completion(handle)
    # pcie_ep (5.0) + io_cpu (10.0) + m_cpu (5.0) = at least 20 ns
    floor_ns = 20.0
    assert trace["total_ns"] >= floor_ns
|
||||||
|
|
||||||
|
|
||||||
|
# ── 7. concurrent requests ─────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_engine_concurrent_requests():
    """Two writes submitted back-to-back both complete and carry traces."""
    engine = _engine()
    requests = [
        MemoryWriteMsg(
            correlation_id="c0",
            request_id=request_id,
            dst_sip=0,
            dst_cube=0,
            dst_pe=pe,
            dst_pa=_hbm_pa(pe_id=pe),
            nbytes=4096,
            pattern="zero",
        )
        for request_id, pe in (("r9a", 0), ("r9b", 1))
    ]

    # Submit everything before waiting on anything.
    handles = [engine.submit(request) for request in requests]
    for handle in handles:
        engine.wait(handle)
    results = [engine.get_completion(handle) for handle in handles]

    for completion, _trace in results:
        assert completion.ok is True
    for _completion, trace in results:
        assert trace["total_ns"] > 0
|
||||||
|
|
||||||
|
|
||||||
|
# ── 8. kernel launch ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_engine_kernel_launch_simplified():
    """KernelLaunch returns latency > 0.

    A tiny load/store kernel is registered as "gemm" and launched over two
    shards (PE 0 and PE 1). The registry is cleared in ``finally`` so a
    failing assertion cannot leave "gemm" registered for later tests.
    """
    from kernbench.triton_emu.registry import clear_registry, register_kernel

    def gemm_kernel(a_ptr, tl):
        a = tl.load(a_ptr, shape=(4, 4), dtype="f16")
        tl.store(a_ptr, a)

    clear_registry()
    try:
        register_kernel("gemm", gemm_kernel)

        engine = _engine()
        shard0 = TensorArgShard(
            sip=0, cube=0, pe=0,
            pa=_hbm_pa(pe_id=0), nbytes=4096, offset_bytes=0,
        )
        shard1 = TensorArgShard(
            sip=0, cube=0, pe=1,
            pa=_hbm_pa(pe_id=1), nbytes=4096, offset_bytes=4096,
        )
        msg = KernelLaunchMsg(
            correlation_id="c0", request_id="r10",
            kernel_ref=KernelRef(name="gemm", kind="builtin"),
            args=(TensorArg(shards=(shard0, shard1)),),
        )
        h = engine.submit(msg)
        engine.wait(h)
        comp, trace = engine.get_completion(h)
        assert comp.ok is True
        assert trace["total_ns"] > 0
    finally:
        # Always undo the global registration, even when an assert fails.
        clear_registry()
|
||||||
|
|
||||||
|
|
||||||
|
# ── 9. deterministic ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_engine_deterministic():
    """Running one request on two fresh engines yields the same ``total_ns``."""
    request = MemoryWriteMsg(
        correlation_id="c0",
        request_id="r11",
        dst_sip=0,
        dst_cube=0,
        dst_pe=0,
        dst_pa=_hbm_pa(),
        nbytes=4096,
        pattern="zero",
    )

    def run_on_fresh_engine():
        # Build a new engine, push the shared request through it, return its trace.
        sim = _engine()
        handle = sim.submit(request)
        sim.wait(handle)
        _, trace = sim.get_completion(handle)
        return trace

    first_trace = run_on_fresh_engine()
    second_trace = run_on_fresh_engine()

    assert first_trace["total_ns"] == second_trace["total_ns"]
|
||||||
|
|
||||||
|
|
||||||
|
# ── 10. DMA capacity serializes concurrent writes ──────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_dma_capacity_serializes_concurrent():
    """Concurrent DMA writes into one cube must contend at DMA capacity=1.

    A single isolated write provides the baseline latency. Two writes to
    different PEs of the same cube are then submitted together on a second
    engine; the slower of the two must take longer than the baseline
    (ADR-0014 D4, ADR-0015 D5).
    """

    def make_write(request_id, pe):
        # Every write targets sip0/cube0 and differs only in id and PE.
        return MemoryWriteMsg(
            correlation_id="c0",
            request_id=request_id,
            dst_sip=0,
            dst_cube=0,
            dst_pe=pe,
            dst_pa=_hbm_pa(sip=0, cube=0, pe_id=pe),
            nbytes=4096,
            pattern="zero",
            target_pe=pe,
        )

    # Baseline: one write alone on its own engine.
    baseline_engine = _engine()
    baseline_handle = baseline_engine.submit(make_write("single", 0))
    baseline_engine.wait(baseline_handle)
    _, baseline_trace = baseline_engine.get_completion(baseline_handle)
    single_ns = baseline_trace["total_ns"]

    # Contended: two writes to the same cube, both submitted before any wait.
    shared_engine = _engine()
    write_a = make_write("conc-a", 0)
    write_b = make_write("conc-b", 1)
    handle_a = shared_engine.submit(write_a)
    handle_b = shared_engine.submit(write_b)
    shared_engine.wait(handle_a)
    shared_engine.wait(handle_b)
    _, trace_a = shared_engine.get_completion(handle_a)
    _, trace_b = shared_engine.get_completion(handle_b)

    # The slower of the pair must exceed the uncontended baseline.
    max_ns = max(trace_a["total_ns"], trace_b["total_ns"])
    assert max_ns > single_ns, (
        f"concurrent max ({max_ns:.2f}ns) must > single ({single_ns:.2f}ns) "
        f"due to DMA capacity=1 contention"
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ── 11. formula latency lower bound ──────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_formula_latency_lower_bound():
    """``_formula_latency`` must not exceed the simulated latency (ADR-0015 D7).

    The formula is evaluated on the route from ``sip0.cube0.pe0`` to the node
    that the PE-1 HBM address resolves to, then compared with the
    ``total_ns`` of a simulated read of that address.
    """
    from kernbench.policy.address.phyaddr import PhysAddr
    from kernbench.policy.routing.router import AddressResolver, PathRouter
    from kernbench.topology.builder import load_topology as build_graph

    topo = build_graph(TOPOLOGY_PATH)
    engine = GraphEngine(topo)
    addr_resolver = AddressResolver(topo)
    path_router = PathRouter(topo)

    target_pa = _hbm_pa(sip=0, cube=0, pe_id=1)
    route = path_router.find_path(
        "sip0.cube0.pe0",
        addr_resolver.resolve(PhysAddr.decode(target_pa)),
    )
    formula = engine._formula_latency(route, 4096)

    # Simulate the read whose latency the formula is supposed to bound.
    read_req = MemoryReadMsg(
        correlation_id="c0",
        request_id="formula-lb",
        src_sip=0,
        src_cube=0,
        src_pe=0,
        src_pa=target_pa,
        nbytes=4096,
        target_pe=1,
    )
    handle = engine.submit(read_req)
    engine.wait(handle)
    _, sim_trace = engine.get_completion(handle)
    actual = sim_trace["total_ns"]

    assert formula <= actual, (
        f"formula ({formula:.2f}) must <= actual ({actual:.2f})"
    )
    assert formula > 0, "formula must be > 0"
|
||||||
|
|
||||||
|
|
||||||
|
def test_formula_latency_exact_no_contention():
    """Without contention the formula should match the simulated PE DMA latency.

    A lone PE DMA request has no fan-out or aggregation, so the formula value
    and the simulated ``total_ns`` are expected to agree to within 0.01 ns.
    """
    from kernbench.policy.address.phyaddr import PhysAddr
    from kernbench.policy.routing.router import AddressResolver, PathRouter
    from kernbench.runtime_api.kernel import PeDmaMsg
    from kernbench.topology.builder import load_topology as build_graph

    topo = build_graph(TOPOLOGY_PATH)
    engine = GraphEngine(topo)
    addr_resolver = AddressResolver(topo)
    path_router = PathRouter(topo)

    target_pa = _hbm_pa(sip=0, cube=0, pe_id=0)
    route = path_router.find_path(
        "sip0.cube0.pe0",
        addr_resolver.resolve(PhysAddr.decode(target_pa)),
    )
    formula = engine._formula_latency(route, 4096)

    dma_req = PeDmaMsg(
        correlation_id="c0",
        request_id="formula-exact",
        src_sip=0,
        src_cube=0,
        src_pe=0,
        dst_pa=target_pa,
        nbytes=4096,
    )
    handle = engine.submit(dma_req)
    engine.wait(handle)
    _, sim_trace = engine.get_completion(handle)
    actual = sim_trace["total_ns"]

    # Single request on an idle engine: the two numbers should coincide.
    assert abs(formula - actual) < 0.01, (
        f"formula ({formula:.4f}) ≈ actual ({actual:.4f}) expected with no contention"
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ── 12. remote cube access succeeds with higher latency ────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_engine_remote_cube_latency_higher():
    """Reading a distant cube's HBM must be strictly slower than a local read.

    Each read runs on its own engine so the two cannot contend. cube15 (the
    far corner of the 4x4 mesh) needs several UCIe + NOC hops from the IO
    chiplet, whereas cube0 is directly connected.
    """

    def make_read(request_id, cube):
        # Same source PE for both reads; only the addressed cube changes.
        return MemoryReadMsg(
            correlation_id="c0",
            request_id=request_id,
            src_sip=0,
            src_cube=0,
            src_pe=0,
            src_pa=_hbm_pa(sip=0, cube=cube, pe_id=0),
            nbytes=4096,
        )

    near_engine = _engine()
    far_engine = _engine()
    near_read = make_read("r14a", 0)
    far_read = make_read("r14b", 15)

    near_handle = near_engine.submit(near_read)
    near_engine.wait(near_handle)
    _, t_local = near_engine.get_completion(near_handle)

    far_handle = far_engine.submit(far_read)
    far_engine.wait(far_handle)
    far_completion, t_remote = far_engine.get_completion(far_handle)

    assert far_completion.ok is True
    assert t_remote is not None and t_local is not None
    assert t_remote["total_ns"] > t_local["total_ns"], (
        f"remote cube {t_remote['total_ns']:.2f} must > local {t_local['total_ns']:.2f}"
    )
|
||||||
|
|
||||||
|
|
||||||
@@ -0,0 +1,269 @@
|
|||||||
|
"""Phase A component infrastructure tests (ADR-0015).
|
||||||
|
|
||||||
|
Verifies:
|
||||||
|
- TransitComponent, IoCpuComponent apply overhead_ns via run()
|
||||||
|
- HbmCtrlComponent and SramComponent act as terminal nodes (succeed done)
|
||||||
|
- MCpuComponent forwards when not terminal; completes when terminal + no ctx
|
||||||
|
- ComponentRegistry resolves impl strings to correct concrete classes
|
||||||
|
- GraphEngine passes ComponentContext to every component
|
||||||
|
- ComponentContext.router and .resolver are correctly populated
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import simpy
|
||||||
|
|
||||||
|
from kernbench.components.base import ComponentBase, ComponentRegistry
|
||||||
|
from kernbench.components.context import ComponentContext
|
||||||
|
from kernbench.components.impls import (
|
||||||
|
HbmCtrlComponent,
|
||||||
|
IoCpuComponent,
|
||||||
|
MCpuComponent,
|
||||||
|
PcieEpComponent,
|
||||||
|
SramComponent,
|
||||||
|
TransitComponent,
|
||||||
|
)
|
||||||
|
from kernbench.sim_engine.engine import GraphEngine
|
||||||
|
from kernbench.sim_engine.transaction import Transaction
|
||||||
|
from kernbench.topology.builder import load_topology
|
||||||
|
from kernbench.topology.types import Node
|
||||||
|
|
||||||
|
TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
|
||||||
|
|
||||||
|
|
||||||
|
def _node(impl: str, attrs: dict | None = None) -> Node:
    """Build a throwaway ``Node`` with id ``"test.node"`` for the given impl.

    A missing (or empty) ``attrs`` is replaced by a fresh empty dict.
    """
    node_attrs = attrs if attrs else {}
    return Node(
        id="test.node",
        kind="test",
        impl=impl,
        attrs=node_attrs,
        pos_mm=None,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _run_worker(comp: ComponentBase, env: simpy.Environment, txn: Transaction) -> None:
    """Drive ``comp`` with a single transaction until ``txn.done`` fires.

    Attaches one input store under the port name ``"src"``, starts the
    component, schedules the injection of ``txn`` and runs the environment
    until the transaction's ``done`` event.
    """
    inbox: simpy.Store = simpy.Store(env)
    comp.in_ports["src"] = inbox
    comp.start(env)

    feeder = _inject(inbox, txn)
    env.process(feeder)
    env.run(until=txn.done)
|
||||||
|
|
||||||
|
|
||||||
|
def _inject(store: simpy.Store, txn: Transaction):
|
||||||
|
yield store.put(txn)
|
||||||
|
|
||||||
|
|
||||||
|
# ── 1. run() latency: TransitComponent ───────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_transit_component_run_overhead_ns():
    """After ``TransitComponent.run()`` the clock reads exactly ``overhead_ns``."""
    transit = TransitComponent(_node("forwarding_v1", {"overhead_ns": 7.5}))
    env = simpy.Environment()

    def drive():
        yield from transit.run(env, nbytes=1024)

    env.process(drive())
    env.run()

    assert env.now == pytest.approx(7.5)
|
||||||
|
|
||||||
|
|
||||||
|
def test_transit_component_run_zero_overhead_ns():
    """With ``overhead_ns=0`` the run finishes without advancing the clock."""
    transit = TransitComponent(_node("noc_v1", {"overhead_ns": 0.0}))
    env = simpy.Environment()
    finished = []

    def drive():
        yield from transit.run(env, nbytes=512)
        # Only reached once run() has fully completed.
        finished.append(True)

    env.process(drive())
    env.run()

    assert finished == [True]
    assert env.now == pytest.approx(0.0)
|
||||||
|
|
||||||
|
|
||||||
|
# ── 2. run() latency: IoCpuComponent ────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_io_cpu_component_run_overhead_ns():
    """After ``IoCpuComponent.run()`` the clock reads exactly ``overhead_ns``."""
    io_cpu = IoCpuComponent(_node("io_cpu_v1", {"overhead_ns": 10.0}))
    env = simpy.Environment()

    def drive():
        yield from io_cpu.run(env, nbytes=2048)

    env.process(drive())
    env.run()

    assert env.now == pytest.approx(10.0)
|
||||||
|
|
||||||
|
|
||||||
|
# ── 3. Terminal: HbmCtrlComponent succeeds done ──────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_hbm_ctrl_terminal_succeeds_done():
    """As the last hop on the path, HbmCtrlComponent triggers ``txn.done``."""
    hbm = HbmCtrlComponent(_node("hbm_ctrl_v1", {"overhead_ns": 0.0, "capacity": 1}))
    env = simpy.Environment()
    finished = env.event()
    # Single-node path at step 0: this component is the terminal hop.
    txn = Transaction(
        request=None,
        path=["test.node"],
        step=0,
        nbytes=256,
        done=finished,
    )

    _run_worker(hbm, env, txn)

    assert finished.triggered
|
||||||
|
|
||||||
|
|
||||||
|
def test_hbm_ctrl_resource_serializes_requests():
    """Two requests through a capacity=1 HbmCtrlComponent run back to back.

    With ``overhead_ns=5.0`` the second transaction is expected to finish at
    t=10, i.e. after the first one's 5 ns plus its own 5 ns.
    """
    hbm = HbmCtrlComponent(_node("hbm_ctrl_v1", {"overhead_ns": 5.0, "capacity": 1}))
    env = simpy.Environment()
    inbox: simpy.Store = simpy.Store(env)
    hbm.in_ports["src"] = inbox
    hbm.start(env)

    first_done = env.event()
    second_done = env.event()
    first_txn = Transaction(
        request=None, path=["test.node"], step=0, nbytes=0, done=first_done,
    )
    second_txn = Transaction(
        request=None, path=["test.node"], step=0, nbytes=0, done=second_done,
    )

    def feed():
        yield inbox.put(first_txn)
        yield inbox.put(second_txn)

    env.process(feed())
    env.run(until=second_done)

    # Both must be done; with serialization: t=5 + t=10
    assert first_done.triggered
    assert second_done.triggered
    assert env.now == pytest.approx(10.0)
|
||||||
|
|
||||||
|
|
||||||
|
# ── 4. Terminal: SramComponent succeeds done ─────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_sram_terminal_succeeds_done():
    """As the last hop, SramComponent triggers ``txn.done`` at t=overhead_ns."""
    sram = SramComponent(_node("sram_v1", {"overhead_ns": 2.0}))
    env = simpy.Environment()
    finished = env.event()
    txn = Transaction(
        request=None,
        path=["test.node"],
        step=0,
        nbytes=512,
        done=finished,
    )

    _run_worker(sram, env, txn)

    assert finished.triggered
    assert env.now == pytest.approx(2.0)
|
||||||
|
|
||||||
|
|
||||||
|
# ── 5. MCpuComponent: forward when not terminal ──────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_m_cpu_forwards_when_not_terminal():
    """A mid-path MCpuComponent hands the Transaction on to the next hop.

    The transaction must appear exactly once on the ``"next"`` output port
    with ``step`` advanced from 1 to 2, at t=overhead_ns (5.0).
    """
    m_cpu = MCpuComponent(_node("m_cpu_v1", {"overhead_ns": 5.0}))
    env = simpy.Environment()

    # One inbound and one outbound store for the path [src, test.node, next].
    inbox: simpy.Store = simpy.Store(env)
    outbox: simpy.Store = simpy.Store(env)
    m_cpu.in_ports["src"] = inbox
    m_cpu.out_ports["next"] = outbox
    m_cpu.start(env)

    finished = env.event()
    txn = Transaction(
        request=None,
        path=["src", "test.node", "next"],
        step=1,  # positioned at test.node, so the next hop is "next"
        nbytes=128,
        done=finished,
    )

    forwarded: list[Any] = []

    def downstream():
        # Stand in for the next hop: record the arrival and finish the txn.
        arrived = yield outbox.get()
        forwarded.append(arrived)
        arrived.done.succeed()

    env.process(downstream())

    def feed():
        yield inbox.put(txn)

    env.process(feed())
    env.run(until=finished)

    assert len(forwarded) == 1
    assert forwarded[0].step == 2  # advanced
    assert env.now == pytest.approx(5.0)
|
||||||
|
|
||||||
|
|
||||||
|
# ── 6. MCpuComponent: terminal with no ctx just completes ────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_m_cpu_terminal_no_ctx_completes():
    """Built with ``ctx=None``, a terminal-hop MCpuComponent triggers ``txn.done``."""
    m_cpu = MCpuComponent(_node("m_cpu_v1", {"overhead_ns": 0.0}), ctx=None)
    env = simpy.Environment()
    finished = env.event()
    txn = Transaction(
        request=None,
        path=["test.node"],
        step=0,
        nbytes=64,
        done=finished,
    )

    _run_worker(m_cpu, env, txn)

    assert finished.triggered
|
||||||
|
|
||||||
|
|
||||||
|
# ── 7. ComponentRegistry resolves impl strings ───────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("impl,expected_cls", [
    # These four impl names all share the TransitComponent class.
    *(
        (transit_impl, TransitComponent)
        for transit_impl in ("forwarding_v1", "noc_v1", "ucie_v1", "xbar_v1")
    ),
    ("pcie_ep_v1", PcieEpComponent),
    ("io_cpu_v1", IoCpuComponent),
    ("m_cpu_v1", MCpuComponent),
    ("hbm_ctrl_v1", HbmCtrlComponent),
    ("sram_v1", SramComponent),
])
def test_registry_resolves_impl(impl, expected_cls):
    """``ComponentRegistry.create()`` yields an instance of the class mapped to ``impl``."""
    created = ComponentRegistry.create(_node(impl, {"overhead_ns": 0.0}))
    assert isinstance(created, expected_cls)
|
||||||
|
|
||||||
|
|
||||||
|
# ── 8. GraphEngine passes ComponentContext to components ─────────────
|
||||||
|
|
||||||
|
|
||||||
|
def test_engine_passes_ctx_to_components():
    """Every component built by GraphEngine carries a ``ComponentContext``."""
    engine = GraphEngine(load_topology(TOPOLOGY_PATH))
    for node_id in engine._components:
        comp = engine._components[node_id]
        assert comp.ctx is not None, f"{node_id}: ctx is None"
        assert isinstance(comp.ctx, ComponentContext), f"{node_id}: ctx wrong type"
|
||||||
|
|
||||||
|
|
||||||
|
def test_engine_ctx_router_and_resolver_populated():
    """The context exposes a ``PathRouter`` as ``router`` and an ``AddressResolver`` as ``resolver``."""
    from kernbench.policy.routing.router import AddressResolver, PathRouter

    engine = GraphEngine(load_topology(TOPOLOGY_PATH))

    # Only the first component in the engine's mapping is spot-checked.
    sample = next(iter(engine._components.values()))
    ctx = sample.ctx
    assert isinstance(ctx.router, PathRouter)
    assert isinstance(ctx.resolver, AddressResolver)
|
||||||