687c98086d
Filename + lifecycle:
- ADRs renamed to ADR-NNNN-<cat>-title.md with 8 short (3-4 letter) category
prefixes (dev / mem / lat / prog / algo / par / api / ver). Numbers stay immutable.
- ADR Lifecycle split into 3 folders, documented in CLAUDE.md Part 2:
docs/adr/ (Accepted), docs/adr-proposed/ (Proposed/Stub/Draft),
docs/adr-history/ (Superseded/Merged). Status field gains "Draft" for
retroactive docs pending verification.
Merges (one ADR per topic, no change-history annotations):
- ADR-0017 absorbs ADR-0019 (Cube NOC + per-PE HBM connectivity, 10 D-items)
- ADR-0014 absorbs ADR-0021 (PE pipeline execution model, 8 D-items incl.
TileToken self-routing and multi-op composite epilogue scope)
- ADR-0023 absorbs docs/ipcq-dma-codesign-hw.md as new "HW Realization
Notes (Informative)" section (D16-D23 + Open HW Questions).
ipcq-dma-codesign-hw.md deleted; ADR-0019 and ADR-0021 moved to
docs/adr-history/ with a one-line stub status.
Retroactive documentation (G4 closures, code-verified):
- ADR-0037 forwarding component (TransitComponent: first-flit overhead,
serial worker, path-based routing, single impl/multiple names)
- ADR-0036 IO_CPU component (target_start_ns global barrier stamping,
per-cube fan-out, response aggregation)
- ADR-0035 M_CPU & M_CPU.DMA component (3 fan-out paths, DMA Resources,
target_start_ns passthrough)
- ADR-0034 HBM controller internal design (per-PC state, address-based
selection, flit-aware per-flit commit, async finalize, command-only
fallback path)
Content updates:
- ADR-0010 expanded to full CLI surface (run/probe/web), retitled
"Command Line Interface and Execution Semantics"
- ADR-0007 D2 rewritten to current state; ADR-0015 supersession notes pruned
- ADR-0005 wrapped in Decision header with D1-D5; ADR-0022 metadata
block replaced with standard Status header
- ADR-0024 trimmed to rank=SIP launcher essentials (D1-D4);
ADR-0027 cleaned of supersession history
- ADR-0033 D6 cleanup: address-based PC selection moved out of future-work
(now documented in ADR-0034 D3); related D1/D3 wording realigned
- Cross-references back-filled in 5 ADRs (G3 gaps closed)
Onboarding docs split:
- docs/onboarding/ created
- moved: hw-architecture-overview.md, latency-model.md, di-presentation.md,
ccl-author-guide{,.en}.md
- references updated in README, ADR-0023{,.en}, src/kernbench/ccl/__init__.py
Source / test / yaml: ADR-NNNN cross-references in docstrings and YAML
comments updated after the merges (ADR-0021->0014 D6, ADR-0019->0017 D8).
No behavior change.
Tooling:
- tools/verify_adr_lang_pairs.py + tests/test_verify_adr_lang_pairs.py
(ADR EN/KO pair invariant checker)
- .claude/commands/report.md tracked (/report slash command)
- .gitignore: allow .claude/commands/*.md while keeping settings files ignored
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
140 lines
6.1 KiB
YAML
140 lines
6.1 KiB
YAML
# System level: SIP count/topology, the switch component, and the
# IO-endpoint-to-switch link.
# NOTE(review): nesting was reconstructed from a copy that had lost its
# indentation — confirm against the original file.
system:
  ns_per_mm: 0.01  # wire propagation delay: 10 ps/mm (on-chip silicon)

  sips:
    count: 2
    topology: ring_1d

  components:
    switch: { kind: switch, impl: builtin.switch, attrs: { overhead_ns: 5.0 } }

  links:
    io_ep_to_switch:
      kind: pcie
      bw_gbs_per_ep: 768.0
      distance_mm: 20.0

# SIP level: the cube mesh dimensions, the IO chiplet (its components, internal
# links, UCIe parameters and placed instances), and the inter-cube mesh links.
# NOTE(review): nesting was reconstructed from a copy that had lost its
# indentation — confirm against the original file.
sip:
  cube_mesh: { w: 4, h: 4 }

  iochiplet:
    components:
      pcie_ep: { kind: pcie_ep, impl: builtin.pcie_ep, attrs: { overhead_ns: 5.0 } }
      io_cpu: { kind: io_cpu, impl: builtin.io_cpu, attrs: { overhead_ns: 10.0 } }
      io_noc: { kind: io_noc, impl: builtin.forwarding, attrs: { overhead_ns: 0.0 } }
    links:
      pcie_ep_to_noc_bw_gbs: 256.0
      pcie_ep_to_noc_mm: 1.0
      io_cpu_to_noc_bw_gbs: 256.0
      io_cpu_to_noc_mm: 0.5
    ucie:
      overhead_ns: 8.0
      n_connections: 4
      per_connection_bw_gbs: 128.0  # 4 × 128 = 512 GB/s = PHY BW
      noc_to_ucie_mm: 0.5
    instances:
      - id: io0
        # 'N' is quoted: a bare N is a boolean (false) under YAML 1.1 rules.
        place: { side: 'N', offset_norm: 0.5 }
        ucie: { phy_bw_gbs: 512.0, phys: [P0, P1, P2, P3] }
        # One port per PHY, attached to the cubes at xy = [0..3, 0].
        cube_ports:
          - { cube: { xy: [0, 0] }, cube_side: 'N', phy: P0, distance_mm: 2.0 }
          - { cube: { xy: [1, 0] }, cube_side: 'N', phy: P1, distance_mm: 2.0 }
          - { cube: { xy: [2, 0] }, cube_side: 'N', phy: P2, distance_mm: 2.0 }
          - { cube: { xy: [3, 0] }, cube_side: 'N', phy: P3, distance_mm: 2.0 }

  links:
    inter_cube_mesh:
      bw_gbs_per_ucie_phy: 512.0
      distance_mm_across_seam: 1.0
  # NOTE(review): `routing` is placed directly under `sip` here; it may instead
  # belong under `links` — confirm against the original file.
  routing: { algo: xy }

# Cube level: physical geometry, PE layout, the per-PE component/link template,
# the HBM memory map, cube-wide components and their placement, UCIe ports,
# and the router-mesh links.
# NOTE(review): nesting was reconstructed from a copy that had lost its
# indentation — confirm against the original file.
cube:
  geometry:
    cube_mm: { w: 17.0, h: 14.0 }
    hbm_mm: { w: 9.0, h: 5.0 }
    ucie_mm: { size: 2.0 }

  pe_layout:
    corners: [NW, NE, SW, SE]  # N corners → top PE rows; S corners → bottom PE rows
    pe_per_corner: 2  # total PEs per cube: 4 * 2 = 8

  # Component set instantiated per PE, and the links between those components.
  pe_template:
    components:
      pe_cpu: { kind: pe_cpu, impl: builtin.pe_cpu, attrs: { overhead_ns: 2.0 } }
      pe_scheduler: { kind: pe_scheduler, impl: builtin.pe_scheduler, attrs: { overhead_ns: 1.0 } }
      pe_dma: { kind: pe_dma, impl: builtin.pe_dma, attrs: { rd_engines: 1, wr_engines: 1 } }
      # pe_gemm and pe_math name the same shared_resource (accel_slot).
      pe_gemm: { kind: pe_gemm, impl: builtin.pe_gemm, attrs: { overhead_ns: 0.0, shared_resource: accel_slot, peak_tflops_f16: 8.0 } }
      pe_math: { kind: pe_math, impl: builtin.pe_math, attrs: { overhead_ns: 0.0, shared_resource: accel_slot } }
      pe_fetch_store: { kind: pe_fetch_store, impl: builtin.pe_fetch_store, attrs: { overhead_ns: 0.0 } }
      pe_mmu: { kind: pe_mmu, impl: builtin.pe_mmu, attrs: { tlb_overhead_ns: 0.5, page_size: 4096 } }
      pe_tcm: { kind: pe_tcm, impl: builtin.pe_tcm, attrs: { size_mb: 16, read_bw_gbs: 512.0, write_bw_gbs: 512.0, kernel_scratch_mb: 1 } }
      pe_ipcq: { kind: pe_ipcq, impl: builtin.pe_ipcq, attrs: { overhead_ns: 0.0 } }
    links:
      pe_cpu_to_scheduler_mm: 0.5
      scheduler_to_dma_mm: 0.5
      scheduler_to_gemm_mm: 0.5
      scheduler_to_math_mm: 0.5
      scheduler_to_fetch_store_mm: 0.5
      dma_to_tcm_bw_gbs: 512.0
      dma_to_tcm_mm: 0.5
      dma_to_fetch_store_mm: 0.0  # DMA → fetch_store chaining (ADR-0014 D6)
      fetch_store_to_tcm_bw_gbs: 512.0
      fetch_store_to_tcm_mm: 0.0
      fetch_store_to_gemm_mm: 0.0  # fetch → GEMM chaining (ADR-0014 D6)
      fetch_store_to_math_mm: 0.0  # fetch → MATH chaining (ADR-0014 D6)
      gemm_to_fetch_store_mm: 0.0  # GEMM → store chaining (ADR-0014 D6)
      gemm_to_math_mm: 0.0  # GEMM → MATH epilogue chaining (ADR-0014 D6)
      math_to_fetch_store_mm: 0.0  # MATH → store chaining (ADR-0014 D6)
      fetch_store_to_dma_mm: 0.0  # store → DMA writeback chaining (ADR-0014 D6)
      gemm_to_tcm_bw_gbs: 512.0
      gemm_to_tcm_mm: 0.5
      math_to_tcm_bw_gbs: 512.0
      math_to_tcm_mm: 0.5
      cpu_to_ipcq_mm: 0.5  # PE_CPU → PE_IPCQ (ADR-0023)
      ipcq_to_dma_mm: 0.0  # PE_IPCQ → PE_DMA token forwarding (ADR-0023)
      dma_to_ipcq_mm: 0.0  # PE_DMA → PE_IPCQ metadata arrival (ADR-0023)

  memory_map:
    hbm_total_gb_per_cube: 48
    hbm_slices_per_cube: 8
    # NOTE(review): hbm_pseudo_channels × hbm_channel_bw_gbs = 64 × 32.0
    # = 2048 GB/s, which does not match the 1024.0 below — confirm which
    # figure is intended.
    hbm_total_bw_gbs: 1024.0
    hbm_mapping_mode: n_to_one  # one_to_one | n_to_one (ADR-0017 D8)
    hbm_pseudo_channels: 64  # total pseudo channels per cube
    hbm_channels_per_pe: 8  # = pseudo_channels / pes_per_cube
    hbm_channel_bw_gbs: 32.0  # per-channel bandwidth (GB/s)

  components:
    noc_router: { kind: noc_router, impl: builtin.forwarding, attrs: { overhead_ns: 2.0 } }
    m_cpu: { kind: m_cpu, impl: builtin.m_cpu, attrs: { overhead_ns: 5.0 } }
    hbm_ctrl: { kind: hbm_ctrl, impl: builtin.hbm_ctrl, attrs: { capacity: 1, efficiency: 1.0, num_pcs: 8, burst_bytes: 256, switch_penalty_ns: 0.0 } }
    sram: { kind: sram, impl: builtin.sram, attrs: { size_mb: 32, overhead_ns: 2.0 } }

  # Physical placement of non-PE components (mm coordinates)
  placement:
    m_cpu: { pos_mm: [7.5, 3.0] }  # top center, below UCIe-N
    sram: { pos_mm: [1.5, 9.0] }  # left side, below HBM zone

  ucie:
    decompose: true
    # Port names are quoted: a bare N is a boolean (false) under YAML 1.1 rules.
    ports: ['N', 'S', 'E', 'W']
    overhead_ns: 8.0
    n_connections: 4  # independent NOC↔UCIe connections per port
    per_connection_bw_gbs: 128.0  # BW per connection; 4 × 128 = 512 GB/s = UCIe PHY BW

  links:
    # Router mesh links (ADR-0017 D5)
    router_link_bw_gbs: 256.0  # inter-router XY mesh link BW
    router_overhead_ns: 2.0  # per-router switching overhead
    pe_to_router_bw_gbs: 256.0  # PE_DMA ↔ router (= N × channel_bw)
    hbm_to_router_bw_gbs: 256.0  # HBM_CTRL ↔ router (= N × channel_bw)
    sram_to_router_bw_gbs: 128.0  # SRAM ↔ router
    m_cpu_to_router_mm: 0.0  # M_CPU ↔ router distance
    pe_dma_to_noc_bw_gbs: 256.0  # PE → router BW (= HBM slice BW, no bottleneck)
    noc_to_pe_cpu_mm: 0.0  # router → PE_CPU distance (command path)

# Visualization: which views to emit, and which SIP ids and cubes to include.
# NOTE(review): nesting was reconstructed from a copy that had lost its
# indentation — confirm against the original file.
visualization:
  emit_views: [system, sip, cube]
  sip_ids: [0]
  cubes: [0, 9, 15]