687c98086d
Filename + lifecycle:
- ADRs renamed to ADR-NNNN-<cat>-title.md with 8 short (3-4 letter) category
prefixes (dev / mem / lat / prog / algo / par / api / ver). Numbers stay immutable.
- ADR Lifecycle split into 3 folders, documented in CLAUDE.md Part 2:
docs/adr/ (Accepted), docs/adr-proposed/ (Proposed/Stub/Draft),
docs/adr-history/ (Superseded/Merged). Status field gains "Draft" for
retroactive docs pending verification.
Merges (one ADR per topic, no change-history annotations):
- ADR-0017 absorbs ADR-0019 (Cube NOC + per-PE HBM connectivity, 10 D-items)
- ADR-0014 absorbs ADR-0021 (PE pipeline execution model, 8 D-items incl.
TileToken self-routing and multi-op composite epilogue scope)
- ADR-0023 absorbs docs/ipcq-dma-codesign-hw.md as a new "HW Realization
Notes (Informative)" section (D16-D23 + Open HW Questions). codesign-hw.md
deleted; ADR-0019/0021 moved to adr-history with a one-line stub status.
Retroactive documentation (G4 closures, code-verified):
- ADR-0037 forwarding component (TransitComponent: first-flit overhead,
serial worker, path-based routing, single impl/multiple names)
- ADR-0036 IO_CPU component (target_start_ns global barrier stamping,
per-cube fan-out, response aggregation)
- ADR-0035 M_CPU & M_CPU.DMA component (3 fan-out paths, DMA Resources,
target_start_ns passthrough)
- ADR-0034 HBM controller internal design (per-PC state, address-based
selection, flit-aware per-flit commit, async finalize, command-only
fallback path)
Content updates:
- ADR-0010 expanded to full CLI surface (run/probe/web), retitled
"Command Line Interface and Execution Semantics"
- ADR-0007 D2 rewritten to current state; ADR-0015 supersession notes pruned
- ADR-0005 wrapped in Decision header with D1-D5; ADR-0022 metadata
block replaced with standard Status header
- ADR-0024 trimmed to rank=SIP launcher essentials (D1-D4);
ADR-0027 cleaned of supersession history
- ADR-0033 D6 cleanup: address-based PC selection moved out of future-work
(now documented in ADR-0034 D3); related D1/D3 wording realigned
- Cross-references back-filled in 5 ADRs (G3 gaps closed)
Onboarding docs split:
- docs/onboarding/ created
- moved: hw-architecture-overview.md, latency-model.md, di-presentation.md,
ccl-author-guide{,.en}.md
- references updated in README, ADR-0023{,.en}, src/kernbench/ccl/__init__.py
Source / test / yaml: ADR-NNNN cross-references in docstrings and YAML
comments updated after the merges (ADR-0021->0014 D6, ADR-0019->0017 D8).
No behavior change.
Tooling:
- tools/verify_adr_lang_pairs.py + tests/test_verify_adr_lang_pairs.py
(ADR EN/KO pair invariant checker)
- .claude/commands/report.md tracked (/report slash command)
- .gitignore: allow .claude/commands/*.md while keeping settings files ignored
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
225 lines
7.9 KiB
Python
225 lines
7.9 KiB
Python
"""Tests for ADR-0017 D4/D7 per-PE HBM partitioning.
|
|
|
|
ADR-0017 D4/D7 specifies:

- Each PE owns 8 of the cube's 64 pseudo-channels (PE_X → PCs 8X..8X+7).
- HBM CTRL is split per-PE: ``hbm_ctrl.pe{X}`` is reachable ONLY through
  PE_X's attaching router. Accessing PE_Y's slice from PE_X requires
  mesh routing to r_Y_attach before entering hbm_ctrl.pe{Y}.

These tests enforce that property without weakening assertions.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from kernbench.policy.address.phyaddr import PhysAddr
|
|
from kernbench.policy.routing.router import AddressResolver, PathRouter
|
|
from kernbench.runtime_api.kernel import PeDmaMsg
|
|
from kernbench.sim_engine.engine import GraphEngine
|
|
from kernbench.topology.builder import load_topology
|
|
|
|
# Topology description consumed by every test in this module: the
# ``topology.yaml`` file that sits in the parent of this file's directory.
TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
|
|
|
|
|
|
def _graph():
    """Load the topology from ``TOPOLOGY_PATH`` and return the resulting graph.

    Every call goes through ``load_topology`` again; nothing is cached here.
    """
    topology = load_topology(TOPOLOGY_PATH)
    return topology
|
|
|
|
|
|
def _slice_bytes(spec: dict) -> int:
    """Return the size of one HBM slice, computed from the cube memory map.

    Args:
        spec: Topology spec; ``spec["cube"]["memory_map"]`` must provide
            ``hbm_total_gb_per_cube`` and ``hbm_slices_per_cube``.

    Returns:
        ``hbm_total_gb_per_cube * 2**30`` floor-divided by
        ``hbm_slices_per_cube``.
    """
    memory_map = spec["cube"]["memory_map"]
    # Scale the per-cube capacity by 2**30 before splitting it into slices.
    cube_bytes = memory_map["hbm_total_gb_per_cube"] * (1 << 30)
    return cube_bytes // memory_map["hbm_slices_per_cube"]
|
|
|
|
|
|
def _hbm_pa(*, sip: int, cube: int, pe_id: int, offset: int, spec: dict) -> int:
    """Build the encoded physical address of a location in a PE's HBM slice.

    Args:
        sip: Forwarded to ``PhysAddr.pe_hbm_addr`` as ``sip_id``.
        cube: Forwarded as ``die_id``.
        pe_id: Forwarded as ``pe_id``.
        offset: Forwarded as ``pe_local_hbm_offset``.
        spec: Topology spec used to derive ``slice_size_bytes`` via
            ``_slice_bytes``.

    Returns:
        The result of ``.encode()`` on the constructed ``PhysAddr``.
    """
    address = PhysAddr.pe_hbm_addr(
        sip_id=sip,
        die_id=cube,
        pe_id=pe_id,
        pe_local_hbm_offset=offset,
        slice_size_bytes=_slice_bytes(spec),
    )
    return address.encode()
|
|
|
|
|
|
# PE index → name of the cube-mesh router that PE attaches to.
# Mapping derived from topology.yaml + cube_mesh attach info.
PE_ATTACH_ROUTER = {
    0: "r0c0",
    1: "r0c1",
    2: "r1c4",
    3: "r1c5",
    4: "r4c0",
    5: "r4c1",
    6: "r5c4",
    7: "r5c5",
}
|
|
|
|
|
|
# ── 1. Topology: 8 per-PE HBM CTRL nodes per cube ────────────────────
|
|
|
|
|
|
def test_topology_has_8_hbm_ctrl_per_cube():
    """Check that sip0.cube0 exposes one HBM CTRL node per PE.

    For PE 0..7 the node ``sip0.cube0.hbm_ctrl.pe{N}`` must exist and its
    ``num_pcs`` attribute must equal 8. The legacy cube-wide node
    ``sip0.cube0.hbm_ctrl`` must not be present.
    """
    nodes = _graph().nodes

    for pe_idx in range(8):
        ctrl_id = f"sip0.cube0.hbm_ctrl.pe{pe_idx}"
        assert ctrl_id in nodes, (
            f"Expected per-PE HBM CTRL node {ctrl_id!r} (ADR-0017 D4)"
        )
        attrs = nodes[ctrl_id].attrs
        # A missing attribute counts as 0 and therefore fails the check.
        assert int(attrs.get("num_pcs", 0)) == 8, (
            f"{ctrl_id} must have num_pcs=8; got {attrs.get('num_pcs')}"
        )

    # The cube-wide single controller must be gone.
    legacy_id = "sip0.cube0.hbm_ctrl"
    assert legacy_id not in nodes, (
        "Cube-wide single sip0.cube0.hbm_ctrl must not exist; only "
        "per-PE hbm_ctrl.pe{X} (ADR-0017 D4)"
    )
|
|
|
|
|
|
# ── 2. Each per-PE HBM CTRL connects ONLY to its PE's attaching router ─
|
|
|
|
|
|
def test_per_pe_hbm_ctrl_connects_only_to_owning_router():
    """Check that each ``hbm_ctrl.pe{X}`` is wired only to its owning router.

    For every PE, the set of distinct edges ending at the controller must
    originate solely from the router listed in ``PE_ATTACH_ROUTER``, and the
    distinct edges leaving the controller must lead solely back to it.
    """
    graph = _graph()
    # Distinct (src, dst) pairs in first-seen order; repeated pairs collapse
    # into a single entry.
    links = list(dict.fromkeys((edge.src, edge.dst) for edge in graph.edges))

    for pe in range(8):
        ctrl_id = f"sip0.cube0.hbm_ctrl.pe{pe}"
        owner = f"sip0.cube0.{PE_ATTACH_ROUTER[pe]}"

        # Sources of edges into the controller / targets of edges out of it.
        sources = [src for src, dst in links if dst == ctrl_id]
        targets = [dst for src, dst in links if src == ctrl_id]

        assert sources == [owner], (
            f"{ctrl_id} must have a single incoming edge from {owner}; got {sources}"
        )
        assert targets == [owner], (
            f"{ctrl_id} must have a single outgoing edge to {owner}; got {targets}"
        )
|
|
|
|
|
|
# ── 3. Resolver: PA pe_id → correct hbm_ctrl.pe{X} ───────────────────
|
|
|
|
|
|
@pytest.mark.parametrize("pe_id", range(8))
def test_resolver_maps_pe_id_to_correct_hbm_ctrl(pe_id):
    """Check that an HBM PA resolves to the controller of the PE it targets.

    The address is built with ``_hbm_pa`` for sip 0 / cube 0 / ``pe_id`` at
    offset 0x1000, decoded back into a ``PhysAddr`` and handed to
    ``AddressResolver.resolve``; the result must be
    ``sip0.cube0.hbm_ctrl.pe{pe_id}``.
    """
    graph = _graph()
    resolver = AddressResolver(graph)

    encoded = _hbm_pa(sip=0, cube=0, pe_id=pe_id, offset=0x1000, spec=graph.spec)
    resolved = resolver.resolve(PhysAddr.decode(encoded))

    expected = f"sip0.cube0.hbm_ctrl.pe{pe_id}"
    assert resolved == expected, (
        f"PA with pe_id={pe_id} must resolve to hbm_ctrl.pe{pe_id}; got {resolved!r}"
    )
|
|
|
|
|
|
# ── 4. Path: PE_X → PE_X_slice is single mesh hop ────────────────────
|
|
|
|
|
|
def test_pe0_to_pe0_slice_is_single_mesh_hop():
    """Check that PE0 reaches its OWN HBM slice through a single router.

    The path from ``sip0.cube0.pe0`` to ``sip0.cube0.hbm_ctrl.pe0`` found by
    ``PathRouter.find_path`` must be exactly
    ``pe0.pe_dma → r0c0 → hbm_ctrl.pe0``.
    """
    router = PathRouter(_graph())
    dst = "sip0.cube0.hbm_ctrl.pe0"
    path = router.find_path("sip0.cube0.pe0", dst)

    # The full path is pinned, so any extra mesh hop fails the test.
    expected = [
        "sip0.cube0.pe0.pe_dma",
        "sip0.cube0.r0c0",
        "sip0.cube0.hbm_ctrl.pe0",
    ]
    assert path == expected, (
        f"pe0 → pe0_slice path must be {expected}; got {path}"
    )
|
|
|
|
|
|
# ── 5. Path: PE_X → PE_Y_slice traverses mesh through r_Y_attach ─────
|
|
|
|
|
|
def test_pe0_to_pe7_slice_traverses_mesh_to_r5c5():
    """Check that PE0 reaches PE7's HBM slice via PE7's attaching router.

    The path from ``sip0.cube0.pe0`` to ``sip0.cube0.hbm_ctrl.pe7`` must
    start at PE0's ``pe_dma``, end with ``r5c5 → hbm_ctrl.pe7`` and contain
    at least 5 nodes in total.
    """
    path = PathRouter(_graph()).find_path(
        "sip0.cube0.pe0", "sip0.cube0.hbm_ctrl.pe7"
    )

    assert path[0] == "sip0.cube0.pe0.pe_dma"

    tail = path[-2:]
    assert tail == ["sip0.cube0.r5c5", "sip0.cube0.hbm_ctrl.pe7"], (
        f"Last 2 path nodes must be r5c5 → hbm_ctrl.pe7; got {tail}"
    )

    # Multi-hop mesh between r0c0 and r5c5.
    assert 5 <= len(path), f"Cross-PE path must traverse mesh; got {len(path)} nodes"
|
|
|
|
|
|
# ── 6. End-to-end: cross-PE latency > local PE latency ───────────────
|
|
|
|
|
|
def test_pe_dma_cross_pe_slower_than_local():
    """Check that a 16 KB DMA from PE0 to PE7's slice is slower than to PE0's own.

    Each transfer is submitted as a ``PeDmaMsg`` to a newly built
    ``GraphEngine`` and its ``total_ns`` is read from the completion trace.
    The cross-PE time must exceed the local time by more than 5% (the
    additional mesh hops add per-router overhead and consume per-link wire BW).
    """
    spec = _graph().spec
    transfer_bytes = 16384

    def _measure_ns(target_pe: int) -> float:
        """Run one PE0 → ``target_pe`` slice transfer and return its total_ns."""
        # A new engine over a newly loaded graph is built for every run.
        sim = GraphEngine(_graph())
        request = PeDmaMsg(
            correlation_id="per-pe-hbm",
            request_id=f"to-pe{target_pe}",
            src_sip=0,
            src_cube=0,
            src_pe=0,
            dst_pa=_hbm_pa(sip=0, cube=0, pe_id=target_pe, offset=0x1000, spec=spec),
            nbytes=transfer_bytes,
        )
        handle = sim.submit(request)
        sim.wait(handle)
        _, trace = sim.get_completion(handle)
        return float(trace["total_ns"])

    local_ns = _measure_ns(0)
    cross_ns = _measure_ns(7)

    assert cross_ns > local_ns * 1.05, (
        f"Cross-PE HBM access (pe0 → pe7_slice) must take measurably more "
        f"time than local (pe0 → pe0_slice). local={local_ns:.2f}ns, "
        f"cross={cross_ns:.2f}ns, ratio={cross_ns/local_ns:.3f} (expected > 1.05)"
    )
|
|
|
|
|
|
# ── 7. Probe CLI monotonicity (existing case names) ──────────────────
|
|
|
|
|
|
def test_probe_cli_intra_cube_cases_are_monotonic():
    """Check that intra-cube DMA latency grows with mesh distance.

    Mirrors the probe case names
    pe-local-hbm < pe-same-half-hbm < pe-cross-half-hbm, but drives
    ``GraphEngine`` directly instead of invoking a CLI: a 32 KB ``PeDmaMsg``
    is sent from PE0 to the slices of PE0, PE1 and PE4, and the resulting
    ``total_ns`` values must be strictly increasing.

    Per ADR-0017 D7, same-half (pe0→pe1) is 1 mesh hop further than
    local, and cross-half (pe0→pe4) is several hops further.
    """
    spec = _graph().spec
    transfer_bytes = 32768

    def _measure_ns(target_pe: int) -> float:
        """Run one PE0 → ``target_pe`` slice transfer and return its total_ns."""
        # A new engine over a newly loaded graph is built for every run.
        sim = GraphEngine(_graph())
        request = PeDmaMsg(
            correlation_id="probe",
            request_id=f"to-pe{target_pe}",
            src_sip=0,
            src_cube=0,
            src_pe=0,
            dst_pa=_hbm_pa(sip=0, cube=0, pe_id=target_pe, offset=0x1000, spec=spec),
            nbytes=transfer_bytes,
        )
        handle = sim.submit(request)
        sim.wait(handle)
        _, trace = sim.get_completion(handle)
        return float(trace["total_ns"])

    local = _measure_ns(0)       # pe-local-hbm
    same_half = _measure_ns(1)   # pe-same-half-hbm (adjacent: r0c0 → r0c1)
    cross_half = _measure_ns(4)  # pe-cross-half-hbm (r0c0 → r4c0)

    assert local < same_half < cross_half, (
        "intra-cube DMA must be monotonic with mesh distance. "
        f"local={local:.2f}, same_half={same_half:.2f}, "
        f"cross_half={cross_half:.2f}"
    )
|