From 81cc32c46b16208928ab0d5a6eb633e393c179c4 Mon Sep 17 00:00:00 2001 From: Yangwook Kang Date: Mon, 27 Apr 2026 15:52:29 -0700 Subject: [PATCH] ADR-0001 Rev 2: 51-bit PhysAddr layout with concrete sub-unit tables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove rack_id (4 bits), rename sip_seg→die_id, shift fields to enable 42-bit local_offset (4 TB per die). Define PE_LOCAL/MCPU_LOCAL/CUBE_SRAM sub-unit tables for AHBM dies and IOCPU sub-unit table for IOCHIPLET dies (1 TB window). Supersedes ADR-0031. Also fixes latent VA/PA confusion in pe_dma pipeline DMA path where virtual addresses were decoded as physical addresses without MMU translation — previously masked by coincidental bit-position alignment. 529 passed (+6 recovered), 10 pre-existing failures unchanged. Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/adr/ADR-0001-physaddr-layout.md | 352 ++++++++++++++--- ...ADR-0031-physaddr-pe-resource-extension.md | 6 +- src/kernbench/cli/probe.py | 2 +- src/kernbench/components/builtin/io_cpu.py | 4 +- src/kernbench/components/builtin/pe_dma.py | 13 +- .../components/legacy/builtin/io_cpu.py | 4 +- src/kernbench/policy/address/allocator.py | 9 +- src/kernbench/policy/address/phyaddr.py | 372 +++++++++++++----- src/kernbench/policy/routing/router.py | 10 +- src/kernbench/runtime_api/context.py | 2 +- src/kernbench/sim_engine/event_log.py | 6 +- tests/test_adr0026_dppolicy_intra_device.py | 2 +- tests/test_bw_occupancy.py | 2 +- tests/test_component_registry.py | 2 +- tests/test_e2e_data.py | 2 +- tests/test_e2e_pipeline.py | 2 +- tests/test_engine.py | 4 +- tests/test_iochiplet_noc_d2h.py | 2 +- tests/test_noc_mesh.py | 2 +- tests/test_pe_components.py | 2 +- tests/test_phyaddr.py | 244 +++++++++--- tests/test_probe.py | 2 +- tests/test_routing.py | 30 +- tests/test_tensor.py | 2 +- tests/test_tensor_free.py | 4 +- tests/test_va_integration.py | 2 +- tests/test_va_offset.py | 2 +- 27 files changed, 814 insertions(+), 272 deletions(-) diff --git a/docs/adr/ADR-0001-physaddr-layout.md b/docs/adr/ADR-0001-physaddr-layout.md index 9158f7a..5f33108 100644 --- a/docs/adr/ADR-0001-physaddr-layout.md +++ b/docs/adr/ADR-0001-physaddr-layout.md @@ -1,25 +1,39 @@ -# ADR-0001: PhysAddr Layout & Address Decoding Contract +# ADR-0001: 51-bit Physical Address Layout & Decoding Contract ## Status -Accepted +Accepted (Revision 2 — 2026-04-27: concrete bit layout, rack_id removal, +Tray->SIP / SIP->DIE renaming, PE/MCPU/IOCPU sub-unit tables. +Supersedes ADR-0031.) ## Date -2026-02-27 +2026-04-27 (original: 2026-02-27) ## Context -KernBench Graph Latency Simulator must route requests deterministically and compute end-to-end latency strictly by graph traversal. -To model local vs remote traffic (same/different SIP, same/different CUBE, optional PE-group), requests need a stable, parsable address/location scheme that: +KernBench requires a stable, parsable physical address scheme that: -- can be decoded into routing domains (SIP/CUBE/HBM/PE-resource, etc.) +- can be decoded into routing domains (SIP / die / HBM / PE-resource / IOCPU) - remains topology-agnostic (no hardcoded counts) -- supports swappable policy and DI-first components without leaking topology assumptions into node implementations +- supports swappable policy and DI-first components +- covers multiple SIPs, AHBM dies, and IO chiplet dies in a unified space + +### History + +- Original ADR-0001 defined a 51-bit layout with `rack_id(4) + sip_id(4) + + sip_seg(5) + local_offset(38)`. `rack_id` was never used in practice. +- ADR-0031 (stub) requested PE-resource range partition but was never + implemented. + +Revision 2 removes `rack_id`, renames `sip_seg -> die_id`, and provides +concrete sub-unit tables for PE, MCPU, CUBE_SRAM, and IOCPU resources. +ADR-0031 is superseded. ## Decision -We define a **PhysAddr value object** and an **address decoding contract** that converts an integer address into routing domains. +We define a **PhysAddr value object** and an **address decoding contract** +that converts an integer address into routing domains. ### D1. PhysAddr is an immutable value object @@ -27,82 +41,322 @@ We define a **PhysAddr value object** and an **address decoding contract** that - Any allocator returns a **fully specified PhysAddr** (not partial metadata). - No global state may be required to interpret a PhysAddr. -### D2. PhysAddr fields (logical contract) +### D2. 51-bit Physical Address Layout -PhysAddr must be able to represent at least: +A 51-bit physical address is adopted. -- `rack_id` (optional but reserved for scale-out) -- `sip_id` (device / SIP domain) -- `sip_seg` (SIP-level segment/window selection, e.g., cube window) -- `local_offset` (offset within the chosen segment/window) +#### 2.1 Top-Level Address Map -Decoded/derived fields may include (optional): +```text +[50:47] sip_id (4) -- 16 SIPs +[46:42] die_id (5) -- 32 dies per SIP +[41: 0] local_offset (42) -- 4 TB per die +``` -- `cube_id` -- `kind` (e.g., HBM vs PE-resource vs raw) -- `unit_type` / `pe_id` (if PE-level addressing is modeled) +```text +50 47 46 42 41 0 ++---------+----------+-------------------------+ +| sip_id | die_id | local_offset | ++---------+----------+-------------------------+ +``` -**Important:** The exact bit allocation may evolve, but the *semantic fields above* must remain decodable without hidden assumptions. +#### 2.2 die_id Allocation -### D3. Decoding is deterministic and policy-compatible +| die_id | Meaning | +|--------|---------| +| 0..15 | AHBM dies | +| 16..20 | IOCHIPLET dies | +| 21..31 | Reserved | -- Decoding must deterministically map an integer address to: - - destination SIP domain (`sip_id`) - - destination sub-domain (`cube_id` if applicable) - - destination target kind (HBM/PE-resource/other) -- Decoding must not depend on runtime topology sizes; it may depend on **explicit topology parameters** provided through configuration (e.g., segment size, slice size), and those parameters must live in the topology/config layer (not in random components). +#### 2.3 AHBM Die Layout -### D4. Topology-derived constants live in the topology layer +Only lower 256 GB of the 4 TB die-local window is assigned. -Constants such as segment sizes (e.g., HBM slice size / window size) are derived from topology configuration (YAML/JSON/dict) and are provided to the decoder via DI/config. -They must not be hardcoded in node implementations. +```text +[41:38] MBZ (4) +[37] addr_space (1) -- 0 = local resource, 1 = HBM memory +[36: 0] sub-address (37) +``` + +| addr_space | Meaning | +|------------|---------| +| 0 | Local resource | +| 1 | HBM memory | + +##### 2.3.1 HBM Window (addr_space = 1) + +```text +[36:0] hbm_offset (37) -- 128 GB decode window +``` + +The architectural decode window is fixed at 128 GB. Implemented capacity +may be smaller depending on SKU/topology (see D4). + +##### 2.3.2 Resource Window (addr_space = 0) + +```text +[36:34] resource_kind (3) +[33: 0] kind_local (34) -- 16 GB per kind +``` + +| resource_kind | Meaning | +|---------------|---------| +| 000 | PE_LOCAL | +| 001 | MCPU_LOCAL | +| 010 | CUBE_SRAM | +| 011..111 | Reserved | + +Each kind gets a 16 GB decode region. + +##### 2.3.3 PE_LOCAL (resource_kind = 000) + +```text +[33] MBZ (1) +[32:29] pe_id (4) -- 0..15 +[28:25] pe_sub_unit (4) +[24: 0] sub_offset (25) -- 32 MB per slot +``` + +16 PEs x 16 sub-unit slots x 32 MB = 8 GB active decode. + +| pe_sub_unit | Name | Budget | +|-------------|------|--------| +| 0 | PE_CPU_DTCM | 8 KB | +| 1 | MATH_ENGINE_DTCM | 8 KB | +| 2 | IPCQ | 256 KB | +| 3 | PE_CPU_SFR | 16 KB | +| 4 | MATH_ENGINE_SFR | 16 KB | +| 5 | DMA_ENGINE_SFR | 192 KB | +| 6 | PE_TCM | 2 MB | +| 7..15 | Reserved | -- | + +##### 2.3.4 MCPU_LOCAL (resource_kind = 001) + +```text +[33:30] MBZ (4) +[29:25] mcpu_sub_unit (5) +[24: 0] sub_offset (25) -- 32 MB per slot +``` + +1 GB active decode. + +| mcpu_sub_unit | Name | Budget | +|---------------|------|--------| +| 0 | MCPU_ITCM | 512 KB | +| 1 | MCPU_DTCM | 512 KB | +| 2 | IPCQ | 256 KB | +| 3 | MCPU_SFR | 8 KB | +| 4 | MCPU_DMA_SFR | 16 KB | +| 5 | MCPU_SRAM | 10 MB | +| 6..31 | Reserved | -- | + +##### 2.3.5 CUBE_SRAM (resource_kind = 010) + +```text +[33:25] MBZ (9) +[24: 0] sram_offset (25) -- flat 32 MB +``` + +#### 2.4 IOCHIPLET Die Layout + +Only lower 1 TB of the 4 TB die-local window is assigned. + +```text +[41:40] MBZ (2) +[39: 0] chiplet_offset (40) -- 1 TB +``` + +Region split by address range: + +| Range | Meaning | Decode condition | +|-------|---------|------------------| +| [0, 2 GB) | IOCPU resource | chiplet_offset < 0x8000_0000 | +| [2 GB, 1 TB) | UAL | chiplet_offset >= 0x8000_0000 | + +##### 2.4.1 IOCPU Region + +```text +[30:27] iocpu_sub_unit (4) +[26: 0] sub_offset (27) -- 128 MB per slot +``` + +16 x 128 MB slots. 2 GB active decode. + +| iocpu_sub_unit | Name | Budget | +|----------------|------|--------| +| 0 | IOCPU_ITCM | 512 KB | +| 1 | IOCPU_DTCM | 512 KB | +| 2 | IPCQ | 2 MB | +| 3 | IOCPU_SFR | 8 KB | +| 4 | IO_DMA_SFR | 16 KB | +| 5 | IO_SRAM | 64 MB | +| 6..15 | Reserved | -- | + +##### 2.4.2 UAL Region + +Sub-layout TBD (separate ADR). + +#### 2.5 Addressing Rules + +1. MBZ bits must be zero. An address with non-zero MBZ bits is + **architecturally invalid**. Implementation may raise a decode fault + or return an error -- behavior is not prescribed by this ADR. +2. Fixed slot sizes are chosen for simple hardware decode; actual + implemented capacity may be smaller than the slot. +3. Access beyond a sub-unit's implemented budget within a slot is + **architecturally invalid** (same policy as MBZ). + +### D3. Bitfield decoding is deterministic + +Given an integer address, field extraction (`sip_id`, `die_id`, `kind`, +`sub_unit`, `offset`) is purely positional. No runtime state is required. +Decoding deterministically maps an integer address to destination domains: +`sip_id`, `die_id`, target kind (HBM / PE_LOCAL / MCPU_LOCAL / CUBE_SRAM / +IOCPU / UAL). + +### D4. Capacity validation may depend on topology config + +Whether a decoded address falls within **implemented capacity** (e.g., +HBM 96 GB on a specific SKU) is checked against topology parameters +provided via DI/config. Decode itself (D3) never consults topology -- +only validation does. These parameters must live in the topology/config +layer, not in node implementations. ### D5. Routing consumes decoded domains, not raw bits Routing policy uses decoded domains: -- `src` location (sip/cube/pe or node_id) +- `src` location (sip / die / pe or node_id) - `dst` domains derived from PhysAddr decoding - `size_bytes` for size-aware link latency -Routing must not inspect raw bit-fields directly except inside the decoding module. + +Routing must not inspect raw bit-fields directly except inside the +decoding module. ## Alternatives Considered -1) **Use raw integers everywhere, decode ad-hoc in routing** +1. **Keep `rack_id` (4 bits)**: Rejected -- never used in practice, + consumes 4 bits that enable die-local expansion to 42 bits + (IOCHIPLET 1 TB). -- Rejected: leads to duplicated logic, inconsistent routing, and hidden assumptions embedded in multiple components. +2. **Uniform 256 GB per die**: Rejected -- IOCHIPLET UAL requires ~1 TB. + Freed rack_id bits enable 42-bit local_offset. -1) **Hardcode topology sizes (SIP/CUBE/PE counts) into decoding** +3. **Variable-width die windows (AHBM 256 GB, CHIPLET 1 TB via multi-seg + spanning)**: Rejected -- complicates D3 (deterministic decoding). + Uniform 4 TB window with MBZ padding is simpler. -- Rejected: violates SPEC (R3) and breaks swappability and configuration-driven topologies. +4. **Use raw integers everywhere, decode ad-hoc in routing**: Rejected -- + leads to duplicated logic, inconsistent routing, and hidden + assumptions. -1) **Put decoding inside memory controllers or routers** +5. **Hardcode topology sizes (SIP/CUBE/PE counts) into decoding**: + Rejected -- violates SPEC R3 and breaks swappability. -- Rejected: leaks policy into components and undermines DI-first, swappable implementations (SPEC R4). +6. **Put decoding inside memory controllers or routers**: Rejected -- + leaks policy into components, violates SPEC R4 / D5. ## Consequences ### Positive -- Deterministic routing domains enable clear test invariants for local vs remote paths (SPEC R1, R5). -- Keeps topology variability (SPEC R3) while preserving consistent semantics. -- DI-first: decoder can be swapped or extended without changing components or tests (SPEC R4). +- Simple hierarchical decoder: SIP -> die -> kind -> sub-unit. +- Clean separation of memory (HBM) vs local resource (PE/MCPU/SRAM/IOCPU). +- Deterministic routing domains enable clear test invariants (SPEC R1, R5). +- Expandable: 11 reserved die_id slots, reserved resource_kind / sub-unit + slots, reserved MBZ bits. +- DI-first: decoder can be swapped without changing components (SPEC R4). -### Tradeoffs / Costs +### Tradeoffs -- Requires explicit configuration for any topology-derived sizes. -- Introduces a single “blessed” decoding module that must remain stable and well-tested. +- Sparse address holes due to power-of-2 slot alignment. +- Large reserved/MBZ regions (intentional for future extension). +- Requires explicit configuration for topology-derived sizes (D4). +- Introduces a single "blessed" decoding module that must remain stable + and well-tested. + +## Supersedes + +- **ADR-0031 (PhysAddr PE-Resource Extension)**: stub status. The + PE_LOCAL / MCPU_LOCAL / CUBE_SRAM sub-unit tables in D2.3.3-D2.3.5 + fulfill ADR-0031's stated goals. ## Implementation Notes (Non-normative) -- Recommended module boundary: - - `src/kernbench/policy/address/phyaddr.py` +- Recommended module: `src/kernbench/policy/address/phyaddr.py` +- Tests should cover: encode/decode round-trip per kind, MBZ enforcement, + die_id dispatch (AHBM / IOCHIPLET / reserved), sub-unit boundary + values, backward compatibility of factory APIs. +- Factory methods: `hbm_addr`, `pe_hbm_addr`, `pe_tcm_addr`, + `cube_sram_addr` retain signatures (minus `rack_id`); `cube_id` + parameter renamed to `die_id`. +- New factories: `pe_resource_addr`, `mcpu_resource_addr`, + `iocpu_resource_addr`, `ual_addr`. -- Tests should cover: - - deterministic decoding - - local vs remote classification from decoded fields - - invariants: “allocator returns full PhysAddr”, “decoding requires no global state” +## Appendix A. Address Examples + +### A.1 AHBM HBM access + +sip=2, die=5, HBM offset=0x1000 + +```text +sip_id = 2 -> [50:47] = 0b0010 +die_id = 5 -> [46:42] = 0b00101 +addr_space = 1 -> [37] = 1 (HBM) +hbm_offset = 0x1000 -> [36:0] + +51-bit addr = (2 << 47) | (5 << 42) | (1 << 37) | 0x1000 +``` + +### A.2 AHBM PE_LOCAL -- PE3 PE_TCM, offset=0x400 + +```text +sip_id = 0 -> [50:47] = 0 +die_id = 0 -> [46:42] = 0 +addr_space = 0 -> [37] = 0 +resource_kind = 0 -> [36:34] = 000 (PE_LOCAL) +pe_id = 3 -> [32:29] = 0011 +pe_sub_unit = 6 -> [28:25] = 0110 (PE_TCM) +sub_offset = 0x400 -> [24:0] + +local_offset = (0 << 34) | (3 << 29) | (6 << 25) | 0x400 +``` + +### A.3 AHBM MCPU_LOCAL -- MCPU_SRAM, offset=0x0 + +```text +sip_id = 1 -> [50:47] = 0001 +die_id = 3 -> [46:42] = 00011 +addr_space = 0 -> [37] = 0 +resource_kind = 1 -> [36:34] = 001 (MCPU_LOCAL) +mcpu_sub_unit = 5 -> [29:25] = 00101 (MCPU_SRAM) +sub_offset = 0 -> [24:0] = 0 + +local_offset = (1 << 34) | (5 << 25) +``` + +### A.4 IOCHIPLET -- IOCPU IPCQ, offset=0x20000 + +```text +sip_id = 1 -> [50:47] = 0001 +die_id = 17 -> [46:42] = 10001 (IOCHIPLET[1]) +iocpu_sub_unit = 2 -> [30:27] = 0010 (IPCQ) +sub_offset = 0x20000 -> [26:0] + +chiplet_offset = (2 << 27) | 0x20000 + (< 0x8000_0000 -> IOCPU region) +``` + +### A.5 IOCHIPLET -- UAL region, offset=4 GB + +```text +sip_id = 0 -> [50:47] = 0 +die_id = 16 -> [46:42] = 10000 (IOCHIPLET[0]) +chiplet_offset = 0x1_0000_0000 (4 GB >= 2 GB -> UAL region) +``` ## Links -- SPEC.md: R1 (routing), R3 (configurable topology), R4 (DI-first), R5 (multi-domain comm) +- SPEC.md: R1 (routing), R3 (configurable topology), R4 (DI-first), + R5 (multi-domain comm) +- ADR-0031: Superseded diff --git a/docs/adr/ADR-0031-physaddr-pe-resource-extension.md b/docs/adr/ADR-0031-physaddr-pe-resource-extension.md index 8633b2c..e8bb39e 100644 --- a/docs/adr/ADR-0031-physaddr-pe-resource-extension.md +++ b/docs/adr/ADR-0031-physaddr-pe-resource-extension.md @@ -2,7 +2,11 @@ ## Status -Stub (Blocker for ADR-0030 — specific range allocations TBD) +Superseded by ADR-0001 (Revision 2, 2026-04-27). +PE_LOCAL / MCPU_LOCAL / CUBE_SRAM sub-unit tables are now defined in +ADR-0001 D2.3.3-D2.3.5. + +Previous status: Stub (Blocker for ADR-0030 — specific range allocations TBD) ## Context diff --git a/src/kernbench/cli/probe.py b/src/kernbench/cli/probe.py index 1333c8c..dc0e6f5 100644 --- a/src/kernbench/cli/probe.py +++ b/src/kernbench/cli/probe.py @@ -23,7 +23,7 @@ def _hbm_pa(sip: int, cube: int, pe_id: int, spec: dict) -> int: mm = spec["cube"]["memory_map"] slice_bytes = mm["hbm_total_gb_per_cube"] * (1 << 30) // mm["hbm_slices_per_cube"] pa = PhysAddr.pe_hbm_addr( - rack_id=0, sip_id=sip, cube_id=cube, pe_id=pe_id, + sip_id=sip, die_id=cube, pe_id=pe_id, pe_local_hbm_offset=0x1000, slice_size_bytes=slice_bytes, ) return pa.encode() diff --git a/src/kernbench/components/builtin/io_cpu.py b/src/kernbench/components/builtin/io_cpu.py index 4c21945..e5102dd 100644 --- a/src/kernbench/components/builtin/io_cpu.py +++ b/src/kernbench/components/builtin/io_cpu.py @@ -220,10 +220,10 @@ class IoCpuComponent(ComponentBase): return [] def _cube_from_pa(self, pa_val: int, fallback: int) -> int: - """Extract cube_id from a physical address, with fallback.""" + """Extract die_id from a physical address, with fallback.""" from kernbench.policy.address.phyaddr import PhysAddr try: - return PhysAddr.decode(pa_val).cube_id + return PhysAddr.decode(pa_val).die_id except Exception: return fallback diff --git a/src/kernbench/components/builtin/pe_dma.py b/src/kernbench/components/builtin/pe_dma.py index 84988c3..04c6129 100644 --- a/src/kernbench/components/builtin/pe_dma.py +++ b/src/kernbench/components/builtin/pe_dma.py @@ -302,7 +302,16 @@ class PeDmaComponent(PeEngineBase): dma_res = self._dma_write if is_write else self._dma_read assert dma_res is not None - pa = PhysAddr.decode(addr) + # Translate VA → PA via MMU (same logic as non-pipeline path) + target_pa = addr + if self._mmu is not None: + from kernbench.policy.address.pe_mmu import PageFault + try: + target_pa = self._mmu.translate(addr) + except PageFault: + target_pa = addr # fallback: treat as PA directly + + pa = PhysAddr.decode(target_pa) dst_node = self.ctx.resolver.resolve(pa) path = self.ctx.router.find_path(self._pe_prefix, dst_node) drain_ns = self.ctx.compute_drain_ns(path, nbytes) @@ -314,7 +323,7 @@ class PeDmaComponent(PeEngineBase): correlation_id="pipeline", request_id=f"tile_{token.tile_id}", src_sip=0, src_cube=0, src_pe=0, - dst_pa=addr, nbytes=nbytes, + dst_pa=target_pa, nbytes=nbytes, is_write=is_write, ) sub_txn = Transaction( diff --git a/src/kernbench/components/legacy/builtin/io_cpu.py b/src/kernbench/components/legacy/builtin/io_cpu.py index 2583c62..1ece161 100644 --- a/src/kernbench/components/legacy/builtin/io_cpu.py +++ b/src/kernbench/components/legacy/builtin/io_cpu.py @@ -207,10 +207,10 @@ class IoCpuComponent(ComponentBase): return [] def _cube_from_pa(self, pa_val: int, fallback: int) -> int: - """Extract cube_id from a physical address, with fallback.""" + """Extract die_id from a physical address, with fallback.""" from kernbench.policy.address.phyaddr import PhysAddr try: - return PhysAddr.decode(pa_val).cube_id + return PhysAddr.decode(pa_val).die_id except Exception: return fallback diff --git a/src/kernbench/policy/address/allocator.py b/src/kernbench/policy/address/allocator.py index 068bc2d..4879964 100644 --- a/src/kernbench/policy/address/allocator.py +++ b/src/kernbench/policy/address/allocator.py @@ -89,11 +89,10 @@ class _FreeList: class PEMemAllocator: def __init__( - self, rack_id: int, sip_id: int, cube_id: int, pe_id: int, cfg: AddressConfig, + self, sip_id: int, die_id: int, pe_id: int, cfg: AddressConfig, ) -> None: - self._rack_id = rack_id self._sip_id = sip_id - self._cube_id = cube_id + self._die_id = die_id self._pe_id = pe_id self._cfg = cfg self._hbm = _FreeList(cfg.hbm_slice_bytes) @@ -108,7 +107,7 @@ class PEMemAllocator: f"available {self._cfg.hbm_slice_bytes - self._hbm.used}" ) return PhysAddr.pe_hbm_addr( - rack_id=self._rack_id, sip_id=self._sip_id, cube_id=self._cube_id, + sip_id=self._sip_id, die_id=self._die_id, pe_id=self._pe_id, pe_local_hbm_offset=offset, slice_size_bytes=self._cfg.hbm_slice_bytes, ) @@ -128,7 +127,7 @@ class PEMemAllocator: f"available {self._cfg.tcm_allocatable_bytes - self._tcm.used}" ) return PhysAddr.pe_tcm_addr( - rack_id=self._rack_id, sip_id=self._sip_id, cube_id=self._cube_id, + sip_id=self._sip_id, die_id=self._die_id, pe_id=self._pe_id, tcm_offset=offset, ) diff --git a/src/kernbench/policy/address/phyaddr.py b/src/kernbench/policy/address/phyaddr.py index b19e3cc..5abf625 100644 --- a/src/kernbench/policy/address/phyaddr.py +++ b/src/kernbench/policy/address/phyaddr.py @@ -6,6 +6,47 @@ from typing import Literal MAX_51 = (1 << 51) - 1 +# ── Layout constants (ADR-0001 Rev 2) ──────────────────────────────── +# [50:47] sip_id (4) +# [46:42] die_id (5) +# [41: 0] local_offset (42) +_SIP_SHIFT = 47 +_DIE_SHIFT = 42 +_LOCAL_BITS = 42 +_LOCAL_MASK = (1 << _LOCAL_BITS) - 1 + +# AHBM die: [41:38] MBZ, [37] addr_space, [36:0] sub-address +_AHBM_SEL_BIT = 37 +_AHBM_LOCAL_USED = 38 # bits actually meaningful for AHBM + +# Resource window: [36:34] resource_kind, [33:0] kind_local +_RES_KIND_SHIFT = 34 +_RES_KIND_MASK = 0x7 + +# PE_LOCAL: [32:29] pe_id, [28:25] pe_sub_unit, [24:0] sub_offset +_PE_ID_SHIFT = 29 +_PE_SUB_SHIFT = 25 +_PE_SUB_OFFSET_BITS = 25 + +# MCPU_LOCAL: [29:25] mcpu_sub_unit, [24:0] sub_offset +_MCPU_SUB_SHIFT = 25 + +# CUBE_SRAM: [24:0] sram_offset +_SRAM_OFFSET_BITS = 25 + +# IOCHIPLET: [41:40] MBZ, [39:0] chiplet_offset +_CHIPLET_LOCAL_BITS = 40 +_IOCPU_BOUNDARY = 1 << 31 # 2 GB + +# IOCPU: [30:27] iocpu_sub_unit, [26:0] sub_offset +_IOCPU_SUB_SHIFT = 27 +_IOCPU_SUB_OFFSET_BITS = 27 + +# die_id ranges +_AHBM_DIE_MAX = 15 +_CHIPLET_DIE_MIN = 16 +_CHIPLET_DIE_MAX = 20 + class PhysAddrError(Exception): pass @@ -22,163 +63,278 @@ def _chk_max(name: str, v: int, maxv: int) -> None: class UnitType(IntEnum): - PE = 0 - MCPU = 1 - SRAM = 2 + """resource_kind values for AHBM resource window.""" + PE = 0 # PE_LOCAL + MCPU = 1 # MCPU_LOCAL + SRAM = 2 # CUBE_SRAM + + +class PESubUnit(IntEnum): + PE_CPU_DTCM = 0 + MATH_ENGINE_DTCM = 1 + IPCQ = 2 + PE_CPU_SFR = 3 + MATH_ENGINE_SFR = 4 + DMA_ENGINE_SFR = 5 + PE_TCM = 6 + + +class MCPUSubUnit(IntEnum): + MCPU_ITCM = 0 + MCPU_DTCM = 1 + IPCQ = 2 + MCPU_SFR = 3 + MCPU_DMA_SFR = 4 + MCPU_SRAM = 5 + + +class IOCPUSubUnit(IntEnum): + IOCPU_ITCM = 0 + IOCPU_DTCM = 1 + IPCQ = 2 + IOCPU_SFR = 3 + IO_DMA_SFR = 4 + IO_SRAM = 5 @dataclass(frozen=True) class PhysAddr: - """ - 51-bit physical address value object. + """51-bit physical address value object (ADR-0001 Rev 2). Layout: - [50:47] rack_id (4) - [46:43] sip_id (4) - [42:38] sip_seg (5) # cube_id - [37:0] local_offset (38) => each segment is 256GB - - local_offset: - [37] selector: 1 = HBM window (128GB reserved), 0 = PE resource window + [50:47] sip_id (4) -- 16 SIPs + [46:42] die_id (5) -- 0..15 AHBM, 16..20 IOCHIPLET + [41: 0] local_offset (42) -- 4 TB per die """ - rack_id: int sip_id: int - sip_seg: int + die_id: int local_offset: int - kind: Literal["hbm", "pe_resource", "raw"] = "raw" - cube_id: int = 0 + kind: Literal["hbm", "pe_resource", "iocpu", "ual", "raw"] = "raw" unit_type: UnitType = UnitType.PE pe_id: int = 0 - ext: int = 0 + pe_sub_unit: int = 0 sub_offset: int = 0 hbm_offset: int = 0 + iocpu_sub_unit: int = 0 + chiplet_offset: int = 0 + mcpu_sub_unit: int = 0 - HBM_WINDOW_BYTES = 1 << 37 # 128GB + HBM_WINDOW_BYTES = 1 << 37 # 128 GB + + # ── encode / decode ────────────────────────────────────────────── def encode(self) -> int: - _chk_range("rack_id", self.rack_id, 4) _chk_range("sip_id", self.sip_id, 4) - _chk_range("sip_seg", self.sip_seg, 5) - _chk_range("local_offset", self.local_offset, 38) - addr = (self.rack_id << 47) | (self.sip_id << 43) | (self.sip_seg << 38) | self.local_offset - if not (0 <= addr <= MAX_51): - raise PhysAddrError("address exceeds 51-bit space") + _chk_range("die_id", self.die_id, 5) + _chk_range("local_offset", self.local_offset, _LOCAL_BITS) + # MBZ enforcement + if self.die_id <= _AHBM_DIE_MAX: + mbz_top = (self.local_offset >> _AHBM_LOCAL_USED) & 0xF + if mbz_top != 0: + raise PhysAddrError("AHBM local_offset bits [41:38] must be zero") + elif _CHIPLET_DIE_MIN <= self.die_id <= _CHIPLET_DIE_MAX: + mbz_top = (self.local_offset >> _CHIPLET_LOCAL_BITS) & 0x3 + if mbz_top != 0: + raise PhysAddrError("IOCHIPLET local_offset bits [41:40] must be zero") + addr = (self.sip_id << _SIP_SHIFT) | (self.die_id << _DIE_SHIFT) | self.local_offset return addr @staticmethod def decode(addr: int) -> PhysAddr: if not (0 <= addr <= MAX_51): raise PhysAddrError("addr must be a 51-bit value") - rack = (addr >> 47) & 0xF - sip_id = (addr >> 43) & 0xF - sip_seg = (addr >> 38) & 0x1F - off = addr & ((1 << 38) - 1) - cube_id = sip_seg - sel = (off >> 37) & 0x1 - if sel == 1: - hbm_offset = int(off & ((1 << 37) - 1)) - return PhysAddr( - rack_id=rack, - sip_id=sip_id, - sip_seg=sip_seg, - local_offset=off, - kind="hbm", - cube_id=cube_id, - hbm_offset=hbm_offset, - ) - # PE resource decode - raw_ut = int((off >> 34) & 0x7) - try: - unit_type = UnitType(raw_ut) - except ValueError: - raise PhysAddrError(f"unknown unit_type: {raw_ut}") from None - pe_id = int((off >> 30) & 0xF) - ext = int((off >> 29) & 0x1) - sub_offset = int(off & ((1 << 29) - 1)) - return PhysAddr( - rack_id=rack, - sip_id=sip_id, - sip_seg=sip_seg, - local_offset=off, - kind="pe_resource", - cube_id=cube_id, - unit_type=unit_type, - pe_id=pe_id, - ext=ext, - sub_offset=sub_offset, - hbm_offset=0, - ) + sip_id = (addr >> _SIP_SHIFT) & 0xF + die_id = (addr >> _DIE_SHIFT) & 0x1F + local_offset = addr & _LOCAL_MASK + + if die_id <= _AHBM_DIE_MAX: + return PhysAddr._decode_ahbm(sip_id, die_id, local_offset) + elif _CHIPLET_DIE_MIN <= die_id <= _CHIPLET_DIE_MAX: + return PhysAddr._decode_chiplet(sip_id, die_id, local_offset) + else: + raise PhysAddrError(f"die_id {die_id} is reserved (21..31)") @staticmethod - def hbm_addr(*, rack_id: int, sip_id: int, cube_id: int, hbm_offset: int) -> PhysAddr: - _chk_max("cube_id", cube_id, 31) - _chk_range("hbm_offset", hbm_offset, 37) - sip_seg = cube_id - local_offset = (1 << 37) | int(hbm_offset) + def _decode_ahbm(sip_id: int, die_id: int, local_offset: int) -> PhysAddr: + sel = (local_offset >> _AHBM_SEL_BIT) & 0x1 + if sel == 1: + hbm_offset = int(local_offset & ((1 << _AHBM_SEL_BIT) - 1)) + return PhysAddr( + sip_id=sip_id, die_id=die_id, local_offset=local_offset, + kind="hbm", hbm_offset=hbm_offset, + ) + # Resource window + res_kind = int((local_offset >> _RES_KIND_SHIFT) & _RES_KIND_MASK) + try: + unit_type = UnitType(res_kind) + except ValueError: + raise PhysAddrError(f"unknown resource_kind: {res_kind}") from None + + if unit_type == UnitType.PE: + pe_id = int((local_offset >> _PE_ID_SHIFT) & 0xF) + pe_sub = int((local_offset >> _PE_SUB_SHIFT) & 0xF) + sub_off = int(local_offset & ((1 << _PE_SUB_OFFSET_BITS) - 1)) + return PhysAddr( + sip_id=sip_id, die_id=die_id, local_offset=local_offset, + kind="pe_resource", unit_type=unit_type, + pe_id=pe_id, pe_sub_unit=pe_sub, sub_offset=sub_off, + ) + elif unit_type == UnitType.MCPU: + mcpu_sub = int((local_offset >> _MCPU_SUB_SHIFT) & 0x1F) + sub_off = int(local_offset & ((1 << _PE_SUB_OFFSET_BITS) - 1)) + return PhysAddr( + sip_id=sip_id, die_id=die_id, local_offset=local_offset, + kind="pe_resource", unit_type=unit_type, + mcpu_sub_unit=mcpu_sub, sub_offset=sub_off, + ) + else: # SRAM + sub_off = int(local_offset & ((1 << _SRAM_OFFSET_BITS) - 1)) + return PhysAddr( + sip_id=sip_id, die_id=die_id, local_offset=local_offset, + kind="pe_resource", unit_type=unit_type, + sub_offset=sub_off, + ) + + @staticmethod + def _decode_chiplet(sip_id: int, die_id: int, local_offset: int) -> PhysAddr: + chip_off = local_offset & ((1 << _CHIPLET_LOCAL_BITS) - 1) + if chip_off < _IOCPU_BOUNDARY: + iocpu_sub = int((chip_off >> _IOCPU_SUB_SHIFT) & 0xF) + sub_off = int(chip_off & ((1 << _IOCPU_SUB_OFFSET_BITS) - 1)) + return PhysAddr( + sip_id=sip_id, die_id=die_id, local_offset=local_offset, + kind="iocpu", chiplet_offset=chip_off, + iocpu_sub_unit=iocpu_sub, sub_offset=sub_off, + ) + else: + return PhysAddr( + sip_id=sip_id, die_id=die_id, local_offset=local_offset, + kind="ual", chiplet_offset=chip_off, + ) + + # ── AHBM factory methods ──────────────────────────────────────── + + @staticmethod + def hbm_addr(*, sip_id: int, die_id: int, hbm_offset: int) -> PhysAddr: + _chk_max("die_id", die_id, _AHBM_DIE_MAX) + _chk_range("hbm_offset", hbm_offset, _AHBM_SEL_BIT) + local_offset = (1 << _AHBM_SEL_BIT) | int(hbm_offset) return PhysAddr( - rack_id=rack_id, - sip_id=sip_id, - sip_seg=sip_seg, - local_offset=local_offset, - kind="hbm", - cube_id=cube_id, - hbm_offset=int(hbm_offset), + sip_id=sip_id, die_id=die_id, local_offset=local_offset, + kind="hbm", hbm_offset=int(hbm_offset), ) @staticmethod def pe_hbm_addr( - *, - rack_id: int, - sip_id: int, - cube_id: int, - pe_id: int, - pe_local_hbm_offset: int, - slice_size_bytes: int, + *, sip_id: int, die_id: int, + pe_id: int, pe_local_hbm_offset: int, slice_size_bytes: int, ) -> PhysAddr: - _chk_max("cube_id", cube_id, 31) + _chk_max("die_id", die_id, _AHBM_DIE_MAX) _chk_range("pe_id", pe_id, 4) if not (0 <= pe_local_hbm_offset < slice_size_bytes): raise PhysAddrError("pe_local_hbm_offset out of PE local slice range") hbm_offset = int(pe_id) * int(slice_size_bytes) + int(pe_local_hbm_offset) if not (0 <= hbm_offset < PhysAddr.HBM_WINDOW_BYTES): raise PhysAddrError("HBM offset exceeds reserved 128GB window") - return PhysAddr.hbm_addr( - rack_id=rack_id, sip_id=sip_id, cube_id=cube_id, hbm_offset=hbm_offset - ) + return PhysAddr.hbm_addr(sip_id=sip_id, die_id=die_id, hbm_offset=hbm_offset) @staticmethod def hbm_pe_id(hbm_offset: int, slice_size_bytes: int) -> int: return hbm_offset // slice_size_bytes @staticmethod - def cube_sram_addr( - *, rack_id: int, sip_id: int, cube_id: int, sram_offset: int, + def pe_tcm_addr( + *, sip_id: int, die_id: int, pe_id: int, tcm_offset: int, ) -> PhysAddr: - _chk_max("cube_id", cube_id, 31) - _chk_range("sram_offset", sram_offset, 29) - sip_seg = cube_id - local_offset = (UnitType.SRAM << 34) | sram_offset - return PhysAddr( - rack_id=rack_id, sip_id=sip_id, sip_seg=sip_seg, - local_offset=local_offset, - kind="pe_resource", cube_id=cube_id, - unit_type=UnitType.SRAM, sub_offset=sram_offset, + return PhysAddr.pe_resource_addr( + sip_id=sip_id, die_id=die_id, pe_id=pe_id, + pe_sub_unit=PESubUnit.PE_TCM, sub_offset=tcm_offset, ) @staticmethod - def pe_tcm_addr( - *, rack_id: int, sip_id: int, cube_id: int, pe_id: int, tcm_offset: int, + def pe_resource_addr( + *, sip_id: int, die_id: int, pe_id: int, + pe_sub_unit: int, sub_offset: int, ) -> PhysAddr: - _chk_max("cube_id", cube_id, 31) + _chk_max("die_id", die_id, _AHBM_DIE_MAX) _chk_range("pe_id", pe_id, 4) - _chk_range("tcm_offset", tcm_offset, 29) - sip_seg = cube_id - local_offset = (UnitType.PE << 34) | (pe_id << 30) | tcm_offset - return PhysAddr( - rack_id=rack_id, sip_id=sip_id, sip_seg=sip_seg, - local_offset=local_offset, - kind="pe_resource", cube_id=cube_id, - unit_type=UnitType.PE, pe_id=pe_id, sub_offset=tcm_offset, + _chk_range("pe_sub_unit", pe_sub_unit, 4) + _chk_range("sub_offset", sub_offset, _PE_SUB_OFFSET_BITS) + local_offset = ( + (UnitType.PE << _RES_KIND_SHIFT) + | (pe_id << _PE_ID_SHIFT) + | (pe_sub_unit << _PE_SUB_SHIFT) + | sub_offset + ) + return PhysAddr( + sip_id=sip_id, die_id=die_id, local_offset=local_offset, + kind="pe_resource", unit_type=UnitType.PE, + pe_id=pe_id, pe_sub_unit=pe_sub_unit, sub_offset=sub_offset, + ) + + @staticmethod + def cube_sram_addr( + *, sip_id: int, die_id: int, sram_offset: int, + ) -> PhysAddr: + _chk_max("die_id", die_id, _AHBM_DIE_MAX) + _chk_range("sram_offset", sram_offset, _SRAM_OFFSET_BITS) + local_offset = (UnitType.SRAM << _RES_KIND_SHIFT) | sram_offset + return PhysAddr( + sip_id=sip_id, die_id=die_id, local_offset=local_offset, + kind="pe_resource", unit_type=UnitType.SRAM, sub_offset=sram_offset, + ) + + @staticmethod + def mcpu_resource_addr( + *, sip_id: int, die_id: int, mcpu_sub_unit: int, sub_offset: int, + ) -> PhysAddr: + _chk_max("die_id", die_id, _AHBM_DIE_MAX) + _chk_range("mcpu_sub_unit", mcpu_sub_unit, 5) + _chk_range("sub_offset", sub_offset, _PE_SUB_OFFSET_BITS) + local_offset = ( + (UnitType.MCPU << _RES_KIND_SHIFT) + | (mcpu_sub_unit << _MCPU_SUB_SHIFT) + | sub_offset + ) + return PhysAddr( + sip_id=sip_id, die_id=die_id, local_offset=local_offset, + kind="pe_resource", unit_type=UnitType.MCPU, + mcpu_sub_unit=mcpu_sub_unit, sub_offset=sub_offset, + ) + + # ── IOCHIPLET factory methods ──────────────────────────────────── + + @staticmethod + def iocpu_resource_addr( + *, sip_id: int, die_id: int, iocpu_sub_unit: int, sub_offset: int, + ) -> PhysAddr: + _chk_max("die_id", die_id, _CHIPLET_DIE_MAX) + if die_id < _CHIPLET_DIE_MIN: + raise PhysAddrError( + f"die_id {die_id} is not an IOCHIPLET " + f"(must be {_CHIPLET_DIE_MIN}..{_CHIPLET_DIE_MAX})" + ) + _chk_range("iocpu_sub_unit", iocpu_sub_unit, 4) + _chk_range("sub_offset", sub_offset, _IOCPU_SUB_OFFSET_BITS) + chiplet_offset = (iocpu_sub_unit << _IOCPU_SUB_SHIFT) | sub_offset + if chiplet_offset >= _IOCPU_BOUNDARY: + raise PhysAddrError("IOCPU region overflow (must be < 2 GB)") + return PhysAddr( + sip_id=sip_id, die_id=die_id, local_offset=chiplet_offset, + kind="iocpu", chiplet_offset=chiplet_offset, + iocpu_sub_unit=iocpu_sub_unit, sub_offset=sub_offset, + ) + + @staticmethod + def ual_addr(*, sip_id: int, die_id: int, ual_offset: int) -> PhysAddr: + _chk_max("die_id", die_id, _CHIPLET_DIE_MAX) + if die_id < _CHIPLET_DIE_MIN: + raise PhysAddrError(f"die_id {die_id} is not an IOCHIPLET") + chiplet_offset = _IOCPU_BOUNDARY + ual_offset + _chk_range("chiplet_offset", chiplet_offset, _CHIPLET_LOCAL_BITS) + return PhysAddr( + sip_id=sip_id, die_id=die_id, local_offset=chiplet_offset, + kind="ual", chiplet_offset=chiplet_offset, ) diff --git a/src/kernbench/policy/routing/router.py b/src/kernbench/policy/routing/router.py index ce3d21e..9ec044c 100644 --- a/src/kernbench/policy/routing/router.py +++ b/src/kernbench/policy/routing/router.py @@ -27,16 +27,16 @@ class AddressResolver: def resolve(self, addr: PhysAddr) -> str: s = addr.sip_id - c = addr.cube_id + d = addr.die_id if addr.kind == "hbm": - node_id = f"sip{s}.cube{c}.hbm_ctrl" + node_id = f"sip{s}.cube{d}.hbm_ctrl" elif addr.kind == "pe_resource": if addr.unit_type == UnitType.PE: - node_id = f"sip{s}.cube{c}.pe{addr.pe_id}.pe_tcm" + node_id = f"sip{s}.cube{d}.pe{addr.pe_id}.pe_tcm" elif addr.unit_type == UnitType.SRAM: - node_id = f"sip{s}.cube{c}.sram" + node_id = f"sip{s}.cube{d}.sram" elif addr.unit_type == UnitType.MCPU: - node_id = f"sip{s}.cube{c}.m_cpu" + node_id = f"sip{s}.cube{d}.m_cpu" else: raise RoutingError(f"unsupported unit_type: {addr.unit_type}") else: diff --git a/src/kernbench/runtime_api/context.py b/src/kernbench/runtime_api/context.py index aaa861a..d339875 100644 --- a/src/kernbench/runtime_api/context.py +++ b/src/kernbench/runtime_api/context.py @@ -385,7 +385,7 @@ class RuntimeContext: for cube_id in range(cubes_per_sip): for pe_id in range(pes_per_cube): self._allocators[(sip_id, cube_id, pe_id)] = PEMemAllocator( - rack_id=0, sip_id=sip_id, cube_id=cube_id, pe_id=pe_id, cfg=cfg, + sip_id=sip_id, die_id=cube_id, pe_id=pe_id, cfg=cfg, ) # Initialize VA allocator (MMU mappings are installed via fabric MmuMapMsg) diff --git a/src/kernbench/sim_engine/event_log.py b/src/kernbench/sim_engine/event_log.py index b053084..1e96392 100644 --- a/src/kernbench/sim_engine/event_log.py +++ b/src/kernbench/sim_engine/event_log.py @@ -212,7 +212,7 @@ def _generate_probe_h2d(graph, edge_map) -> list[dict]: t_offset = 0.0 for rid, (name, cube, hops) in enumerate(cases): pa = PhysAddr.pe_hbm_addr( - rack_id=0, sip_id=0, cube_id=cube, pe_id=0, + sip_id=0, die_id=cube, pe_id=0, pe_local_hbm_offset=0x1000, slice_size_bytes=slice_bytes, ) dst_node = resolver.resolve(pa) @@ -256,7 +256,7 @@ def _generate_probe_d2h(graph, edge_map) -> list[dict]: t_offset = 0.0 for rid, (name, cube, hops) in enumerate(cases): pa = PhysAddr.pe_hbm_addr( - rack_id=0, sip_id=0, cube_id=cube, pe_id=0, + sip_id=0, die_id=cube, pe_id=0, pe_local_hbm_offset=0x1000, slice_size_bytes=slice_bytes, ) dst_node = resolver.resolve(pa) @@ -310,7 +310,7 @@ def _generate_probe_pe_dma(graph, edge_map) -> list[dict]: t_offset = 0.0 for rid, (name, sip, src_cube, src_pe, dst_cube, dst_pe) in enumerate(cases): pa = PhysAddr.pe_hbm_addr( - rack_id=0, sip_id=sip, cube_id=dst_cube, pe_id=dst_pe, + sip_id=sip, die_id=dst_cube, pe_id=dst_pe, pe_local_hbm_offset=0x1000, slice_size_bytes=slice_bytes, ) dst_node = resolver.resolve(pa) diff --git a/tests/test_adr0026_dppolicy_intra_device.py b/tests/test_adr0026_dppolicy_intra_device.py index 1fcf096..1277585 100644 --- a/tests/test_adr0026_dppolicy_intra_device.py +++ b/tests/test_adr0026_dppolicy_intra_device.py @@ -149,7 +149,7 @@ def _make_tuple_allocators( ) -> dict[tuple[int, int, int], PEMemAllocator]: return { (s, c, p): PEMemAllocator( - rack_id=0, sip_id=s, cube_id=c, pe_id=p, cfg=_CFG, + sip_id=s, die_id=c, pe_id=p, cfg=_CFG, ) for s in range(num_sips) for c in range(num_cubes) diff --git a/tests/test_bw_occupancy.py b/tests/test_bw_occupancy.py index 15b7f33..3575fbb 100644 --- a/tests/test_bw_occupancy.py +++ b/tests/test_bw_occupancy.py @@ -23,7 +23,7 @@ def _engine(): def _hbm_pa(sip: int = 0, cube: int = 0, pe_id: int = 0) -> int: slice_bytes = 48 * (1 << 30) // 8 pa = PhysAddr.pe_hbm_addr( - rack_id=0, sip_id=sip, cube_id=cube, pe_id=pe_id, + sip_id=sip, die_id=cube, pe_id=pe_id, pe_local_hbm_offset=0x1000, slice_size_bytes=slice_bytes, ) return pa.encode() diff --git a/tests/test_component_registry.py b/tests/test_component_registry.py index 055bb6d..e6d3803 100644 --- a/tests/test_component_registry.py +++ b/tests/test_component_registry.py @@ -30,7 +30,7 @@ def _graph(): def _hbm_pa(pe_id: int = 0) -> int: slice_bytes = 48 * (1 << 30) // 8 pa = PhysAddr.pe_hbm_addr( - rack_id=0, sip_id=0, cube_id=0, pe_id=pe_id, + sip_id=0, die_id=0, pe_id=pe_id, pe_local_hbm_offset=0x1000, slice_size_bytes=slice_bytes, ) return pa.encode() diff --git a/tests/test_e2e_data.py b/tests/test_e2e_data.py index 6420ef5..a645488 100644 --- a/tests/test_e2e_data.py +++ b/tests/test_e2e_data.py @@ -50,7 +50,7 @@ def _hbm_pa(sip: int = 0, cube: int = 0, pe_id: int = 0) -> int: from kernbench.policy.address.phyaddr import PhysAddr slice_bytes = 48 * (1 << 30) // 8 pa = PhysAddr.pe_hbm_addr( - rack_id=0, sip_id=sip, cube_id=cube, pe_id=pe_id, + sip_id=sip, die_id=cube, pe_id=pe_id, pe_local_hbm_offset=0x1000, slice_size_bytes=slice_bytes, ) return pa.encode() diff --git a/tests/test_e2e_pipeline.py b/tests/test_e2e_pipeline.py index c6b24ed..aa35685 100644 --- a/tests/test_e2e_pipeline.py +++ b/tests/test_e2e_pipeline.py @@ -31,7 +31,7 @@ def _hbm_pa(sip=0, cube=0, pe_id=0): from kernbench.policy.address.phyaddr import PhysAddr slice_bytes = 48 * (1 << 30) // 8 pa = PhysAddr.pe_hbm_addr( - rack_id=0, sip_id=sip, cube_id=cube, pe_id=pe_id, + sip_id=sip, die_id=cube, pe_id=pe_id, pe_local_hbm_offset=0x1000, slice_size_bytes=slice_bytes, ) return pa.encode() diff --git a/tests/test_engine.py b/tests/test_engine.py index c2c9849..2465bf8 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -29,7 +29,7 @@ def _hbm_pa(sip: int = 0, cube: int = 0, pe_id: int = 0) -> int: # 48 GB / 8 slices = 6 GB per slice slice_bytes = 48 * (1 << 30) // 8 pa = PhysAddr.pe_hbm_addr( - rack_id=0, sip_id=sip, cube_id=cube, pe_id=pe_id, + sip_id=sip, die_id=cube, pe_id=pe_id, pe_local_hbm_offset=0x1000, slice_size_bytes=slice_bytes, ) return pa.encode() @@ -37,7 +37,7 @@ def _hbm_pa(sip: int = 0, cube: int = 0, pe_id: int = 0) -> int: def _sram_pa(sip: int = 0, cube: int = 0) -> int: """Create an SRAM physical address.""" - pa = PhysAddr.cube_sram_addr(rack_id=0, sip_id=sip, cube_id=cube, sram_offset=0x800) + pa = PhysAddr.cube_sram_addr(sip_id=sip, die_id=cube, sram_offset=0x800) return pa.encode() diff --git a/tests/test_iochiplet_noc_d2h.py b/tests/test_iochiplet_noc_d2h.py index d03d941..1edf344 100644 --- a/tests/test_iochiplet_noc_d2h.py +++ b/tests/test_iochiplet_noc_d2h.py @@ -36,7 +36,7 @@ def _engine(): def _hbm_pa(sip: int = 0, cube: int = 0, pe_id: int = 0) -> int: slice_bytes = 48 * (1 << 30) // 8 pa = PhysAddr.pe_hbm_addr( - rack_id=0, sip_id=sip, cube_id=cube, pe_id=pe_id, + sip_id=sip, die_id=cube, pe_id=pe_id, pe_local_hbm_offset=0x1000, slice_size_bytes=slice_bytes, ) return pa.encode() diff --git a/tests/test_noc_mesh.py b/tests/test_noc_mesh.py index 92e8280..6c3f295 100644 --- a/tests/test_noc_mesh.py +++ b/tests/test_noc_mesh.py @@ -38,7 +38,7 @@ def _engine(): def _hbm_pa(sip=0, cube=0, pe_id=0): slice_bytes = 48 * (1 << 30) // 8 pa = PhysAddr.pe_hbm_addr( - rack_id=0, sip_id=sip, cube_id=cube, pe_id=pe_id, + sip_id=sip, die_id=cube, pe_id=pe_id, pe_local_hbm_offset=0x1000, slice_size_bytes=slice_bytes, ) return pa.encode() diff --git a/tests/test_pe_components.py b/tests/test_pe_components.py index eceb27b..99cb024 100644 --- a/tests/test_pe_components.py +++ b/tests/test_pe_components.py @@ -53,7 +53,7 @@ def _engine(): def _hbm_pa(sip: int = 0, cube: int = 0, pe_id: int = 0) -> int: slice_bytes = 48 * (1 << 30) // 8 pa = PhysAddr.pe_hbm_addr( - rack_id=0, sip_id=sip, cube_id=cube, pe_id=pe_id, + sip_id=sip, die_id=cube, pe_id=pe_id, pe_local_hbm_offset=0x1000, slice_size_bytes=slice_bytes, ) return pa.encode() diff --git a/tests/test_phyaddr.py b/tests/test_phyaddr.py index 2cbb252..c071f2a 100644 --- a/tests/test_phyaddr.py +++ b/tests/test_phyaddr.py @@ -1,7 +1,10 @@ import pytest from kernbench.policy.address.allocator import AddressConfig, AllocationError, PEMemAllocator -from kernbench.policy.address.phyaddr import PhysAddr, PhysAddrError, UnitType +from kernbench.policy.address.phyaddr import ( + PhysAddr, PhysAddrError, UnitType, + PESubUnit, MCPUSubUnit, IOCPUSubUnit, +) _MB = 1 << 20 _GB = 1 << 30 @@ -23,13 +26,11 @@ _CFG = AddressConfig( def test_physaddr_immutable(): - pa = PhysAddr.hbm_addr(rack_id=0, sip_id=0, cube_id=0, hbm_offset=0) + pa = PhysAddr.hbm_addr(sip_id=0, die_id=0, hbm_offset=0) with pytest.raises(AttributeError): - pa.rack_id = 1 # type: ignore[misc] - # hashable - {pa} - # comparable - pa2 = PhysAddr.hbm_addr(rack_id=0, sip_id=0, cube_id=0, hbm_offset=0) + pa.sip_id = 1 # type: ignore[misc] + {pa} # hashable + pa2 = PhysAddr.hbm_addr(sip_id=0, die_id=0, hbm_offset=0) assert pa == pa2 @@ -37,120 +38,133 @@ def test_physaddr_immutable(): def test_hbm_encode_decode_roundtrip(): - pa = PhysAddr.hbm_addr(rack_id=2, sip_id=3, cube_id=5, hbm_offset=0x1000) + pa = PhysAddr.hbm_addr(sip_id=3, die_id=5, hbm_offset=0x1000) raw = pa.encode() dec = PhysAddr.decode(raw) - assert dec.rack_id == 2 assert dec.sip_id == 3 - assert dec.cube_id == 5 + assert dec.die_id == 5 assert dec.kind == "hbm" assert dec.hbm_offset == 0x1000 -# ── PE resource encode/decode roundtrip ───────────────────────────── +# ── PE resource encode/decode roundtrip (new layout) ─────────────── def test_pe_resource_encode_decode_roundtrip(): - pa = PhysAddr( - rack_id=1, sip_id=2, sip_seg=7, local_offset=0, - kind="pe_resource", cube_id=7, - unit_type=UnitType.PE, pe_id=3, ext=1, sub_offset=0xFF, + pa = PhysAddr.pe_resource_addr( + sip_id=2, die_id=7, pe_id=3, + pe_sub_unit=PESubUnit.PE_TCM, sub_offset=0xFF, ) - # manually build local_offset matching bit layout - local_offset = (UnitType.PE << 34) | (3 << 30) | (1 << 29) | 0xFF - pa2 = PhysAddr( - rack_id=1, sip_id=2, sip_seg=7, local_offset=local_offset, - kind="pe_resource", cube_id=7, - unit_type=UnitType.PE, pe_id=3, ext=1, sub_offset=0xFF, - ) - raw = pa2.encode() + raw = pa.encode() dec = PhysAddr.decode(raw) assert dec.kind == "pe_resource" assert dec.unit_type == UnitType.PE assert dec.pe_id == 3 - assert dec.ext == 1 + assert dec.pe_sub_unit == PESubUnit.PE_TCM assert dec.sub_offset == 0xFF + assert dec.die_id == 7 + assert dec.sip_id == 2 + + +def test_pe_resource_all_sub_units(): + """Each PE sub-unit roundtrips correctly.""" + for su in PESubUnit: + pa = PhysAddr.pe_resource_addr( + sip_id=0, die_id=0, pe_id=0, + pe_sub_unit=su, sub_offset=42, + ) + dec = PhysAddr.decode(pa.encode()) + assert dec.pe_sub_unit == su + assert dec.sub_offset == 42 # ── pe_hbm_addr factory ──────────────────────────────────────────── def test_pe_hbm_addr_factory(): - SLICE = 6 * (1 << 30) # 6 GB per PE slice + SLICE = 6 * _GB pa = PhysAddr.pe_hbm_addr( - rack_id=0, sip_id=0, cube_id=0, + sip_id=0, die_id=0, pe_id=2, pe_local_hbm_offset=1024, slice_size_bytes=SLICE, ) assert pa.kind == "hbm" - assert pa.cube_id == 0 + assert pa.die_id == 0 assert pa.hbm_offset == 2 * SLICE + 1024 def test_pe_hbm_addr_overflow(): - SLICE = 6 * (1 << 30) + SLICE = 6 * _GB with pytest.raises(PhysAddrError, match="pe_local_hbm_offset"): PhysAddr.pe_hbm_addr( - rack_id=0, sip_id=0, cube_id=0, + sip_id=0, die_id=0, pe_id=0, pe_local_hbm_offset=SLICE, slice_size_bytes=SLICE, ) -# ── Invalid unit_type decode (fix #1) ────────────────────────────── +# ── Invalid resource_kind decode ────────────────────────────────── -def test_invalid_unit_type_raises(): - # Craft a PE-resource address with unit_type=7 (invalid) - local_offset = (7 << 34) | (0 << 30) | 0 - pa_raw = PhysAddr( - rack_id=0, sip_id=0, sip_seg=0, local_offset=local_offset, - ) +def test_invalid_resource_kind_raises(): + # resource_kind=7 (invalid), addr_space=0 + local_offset = (7 << 34) | 0 + pa_raw = PhysAddr(sip_id=0, die_id=0, local_offset=local_offset) raw = pa_raw.encode() - with pytest.raises(PhysAddrError, match="unit_type"): + with pytest.raises(PhysAddrError, match="resource_kind"): PhysAddr.decode(raw) -# ── hbm_pe_id utility (fix #3) ───────────────────────────────────── +# ── hbm_pe_id utility ───────────────────────────────────────────── def test_hbm_pe_id_utility(): - SLICE = 6 * (1 << 30) # 6 GB + SLICE = 6 * _GB pa = PhysAddr.pe_hbm_addr( - rack_id=0, sip_id=0, cube_id=0, + sip_id=0, die_id=0, pe_id=5, pe_local_hbm_offset=256, slice_size_bytes=SLICE, ) assert PhysAddr.hbm_pe_id(pa.hbm_offset, SLICE) == 5 -# ── UnitType.SRAM exists (fix #5) ────────────────────────────────── +# ── UnitType / sub-unit enums ────────────────────────────────────── def test_sram_unit_type_exists(): assert UnitType.SRAM == 2 +def test_pe_sub_unit_enum(): + assert PESubUnit.PE_TCM == 6 + assert PESubUnit.IPCQ == 2 + + +def test_mcpu_sub_unit_enum(): + assert MCPUSubUnit.MCPU_SRAM == 5 + + +def test_iocpu_sub_unit_enum(): + assert IOCPUSubUnit.IO_SRAM == 5 + + # ── cube_sram_addr factory + roundtrip ────────────────────────────── def test_cube_sram_addr_roundtrip(): - pa = PhysAddr.cube_sram_addr( - rack_id=0, sip_id=1, cube_id=3, sram_offset=0x800, - ) + pa = PhysAddr.cube_sram_addr(sip_id=1, die_id=3, sram_offset=0x800) assert pa.kind == "pe_resource" assert pa.unit_type == UnitType.SRAM - assert pa.cube_id == 3 + assert pa.die_id == 3 assert pa.sub_offset == 0x800 - # encode → decode roundtrip dec = PhysAddr.decode(pa.encode()) assert dec.unit_type == UnitType.SRAM - assert dec.cube_id == 3 + assert dec.die_id == 3 assert dec.sub_offset == 0x800 def test_cube_sram_addr_range_check(): with pytest.raises(PhysAddrError): PhysAddr.cube_sram_addr( - rack_id=0, sip_id=0, cube_id=0, - sram_offset=(1 << 29), # exceeds 29-bit sub_offset + sip_id=0, die_id=0, + sram_offset=(1 << 25), # exceeds 25-bit sub_offset ) @@ -158,29 +172,137 @@ def test_cube_sram_addr_range_check(): def test_pe_tcm_addr_roundtrip(): - pa = PhysAddr.pe_tcm_addr( - rack_id=0, sip_id=0, cube_id=2, pe_id=7, tcm_offset=0x400, - ) + pa = PhysAddr.pe_tcm_addr(sip_id=0, die_id=2, pe_id=7, tcm_offset=0x400) assert pa.kind == "pe_resource" assert pa.unit_type == UnitType.PE assert pa.pe_id == 7 - assert pa.cube_id == 2 + assert pa.die_id == 2 + assert pa.pe_sub_unit == PESubUnit.PE_TCM assert pa.sub_offset == 0x400 - # encode → decode roundtrip dec = PhysAddr.decode(pa.encode()) assert dec.unit_type == UnitType.PE assert dec.pe_id == 7 + assert dec.pe_sub_unit == PESubUnit.PE_TCM assert dec.sub_offset == 0x400 def test_pe_tcm_addr_range_check(): with pytest.raises(PhysAddrError): PhysAddr.pe_tcm_addr( - rack_id=0, sip_id=0, cube_id=0, pe_id=0, - tcm_offset=(1 << 29), # exceeds 29-bit sub_offset + sip_id=0, die_id=0, pe_id=0, + tcm_offset=(1 << 25), # exceeds 25-bit sub_offset ) +# ── MCPU resource factory + roundtrip ────────────────────────────── + + +def test_mcpu_resource_roundtrip(): + pa = PhysAddr.mcpu_resource_addr( + sip_id=0, die_id=1, + mcpu_sub_unit=MCPUSubUnit.MCPU_SRAM, sub_offset=0x100, + ) + assert pa.kind == "pe_resource" + assert pa.unit_type == UnitType.MCPU + assert pa.mcpu_sub_unit == MCPUSubUnit.MCPU_SRAM + assert pa.sub_offset == 0x100 + dec = PhysAddr.decode(pa.encode()) + assert dec.unit_type == UnitType.MCPU + assert dec.mcpu_sub_unit == MCPUSubUnit.MCPU_SRAM + assert dec.sub_offset == 0x100 + + +# ── IOCHIPLET: IOCPU factory + roundtrip ──────────────────────────── + + +def test_iocpu_resource_roundtrip(): + pa = PhysAddr.iocpu_resource_addr( + sip_id=1, die_id=17, + iocpu_sub_unit=IOCPUSubUnit.IPCQ, sub_offset=0x20000, + ) + assert pa.kind == "iocpu" + assert pa.iocpu_sub_unit == IOCPUSubUnit.IPCQ + assert pa.sub_offset == 0x20000 + dec = PhysAddr.decode(pa.encode()) + assert dec.kind == "iocpu" + assert dec.iocpu_sub_unit == IOCPUSubUnit.IPCQ + assert dec.sub_offset == 0x20000 + assert dec.die_id == 17 + + +def test_iocpu_die_range_check(): + with pytest.raises(PhysAddrError, match="IOCHIPLET"): + PhysAddr.iocpu_resource_addr( + sip_id=0, die_id=5, # not a chiplet die + iocpu_sub_unit=0, sub_offset=0, + ) + + +# ── IOCHIPLET: UAL factory + roundtrip ────────────────────────────── + + +def test_ual_addr_roundtrip(): + pa = PhysAddr.ual_addr(sip_id=0, die_id=16, ual_offset=0x1000) + assert pa.kind == "ual" + dec = PhysAddr.decode(pa.encode()) + assert dec.kind == "ual" + assert dec.die_id == 16 + assert dec.chiplet_offset >= (1 << 31) # >= 2 GB boundary + + +# ── die_id dispatch ──────────────────────────────────────────────── + + +def test_die_id_ahbm_range(): + for die in [0, 15]: + pa = PhysAddr.hbm_addr(sip_id=0, die_id=die, hbm_offset=0) + dec = PhysAddr.decode(pa.encode()) + assert dec.kind == "hbm" + assert dec.die_id == die + + +def test_die_id_chiplet_range(): + for die in [16, 20]: + pa = PhysAddr.iocpu_resource_addr( + sip_id=0, die_id=die, + iocpu_sub_unit=0, sub_offset=0, + ) + dec = PhysAddr.decode(pa.encode()) + assert dec.kind == "iocpu" + assert dec.die_id == die + + +def test_die_id_reserved_raises(): + raw = (0 << 47) | (21 << 42) | 0 # die_id=21 (reserved) + with pytest.raises(PhysAddrError, match="reserved"): + PhysAddr.decode(raw) + + +# ── Boundary values ──────────────────────────────────────────────── + + +def test_sip_boundary(): + pa = PhysAddr.hbm_addr(sip_id=15, die_id=0, hbm_offset=0) + dec = PhysAddr.decode(pa.encode()) + assert dec.sip_id == 15 + + +def test_mbz_enforcement_ahbm(): + """AHBM local_offset bits [41:38] must be zero.""" + local_offset = (1 << 38) | (1 << 37) # MBZ bit set + HBM + pa = PhysAddr(sip_id=0, die_id=0, local_offset=local_offset) + with pytest.raises(PhysAddrError, match="bits \\[41:38\\]"): + pa.encode() + + +def test_mbz_enforcement_chiplet(): + """IOCHIPLET local_offset bits [41:40] must be zero.""" + local_offset = (1 << 40) | 0 # MBZ bit set + pa = PhysAddr(sip_id=0, die_id=16, local_offset=local_offset) + with pytest.raises(PhysAddrError, match="bits \\[41:40\\]"): + pa.encode() + + # ── AddressConfig ─────────────────────────────────────────────────── @@ -193,7 +315,7 @@ def test_address_config_derived_sizes(): def _make_alloc(pe_id: int = 0) -> PEMemAllocator: - return PEMemAllocator(rack_id=0, sip_id=0, cube_id=0, pe_id=pe_id, cfg=_CFG) + return PEMemAllocator(sip_id=0, die_id=0, pe_id=pe_id, cfg=_CFG) def test_allocator_hbm_basic(): @@ -201,8 +323,7 @@ def test_allocator_hbm_basic(): pa = a.alloc_hbm(4096) assert pa.kind == "hbm" assert pa.sip_id == 0 - assert pa.cube_id == 0 - # hbm_offset should be pe3's slice start + assert pa.die_id == 0 assert pa.hbm_offset == 3 * 6 * _GB @@ -210,8 +331,8 @@ def test_allocator_hbm_sequential(): a = _make_alloc() pa1 = a.alloc_hbm(1024) pa2 = a.alloc_hbm(2048) - assert pa1.hbm_offset == 0 # pe0 slice start + 0 - assert pa2.hbm_offset == 1024 # pe0 slice start + 1024 + assert pa1.hbm_offset == 0 + assert pa2.hbm_offset == 1024 def test_allocator_hbm_overflow(): @@ -235,7 +356,6 @@ def test_allocator_tcm_basic(): def test_allocator_tcm_respects_reserved(): a = _make_alloc() - # allocatable = 12 MB, should succeed a.alloc_tcm(12 * _MB) assert a.tcm_used == 12 * _MB assert a.tcm_total == 12 * _MB diff --git a/tests/test_probe.py b/tests/test_probe.py index 59e73fc..0893e18 100644 --- a/tests/test_probe.py +++ b/tests/test_probe.py @@ -21,7 +21,7 @@ def _engine(): def _hbm_pa(sip: int = 0, cube: int = 0, pe_id: int = 0) -> int: slice_bytes = 48 * (1 << 30) // 8 pa = PhysAddr.pe_hbm_addr( - rack_id=0, sip_id=sip, cube_id=cube, pe_id=pe_id, + sip_id=sip, die_id=cube, pe_id=pe_id, pe_local_hbm_offset=0x1000, slice_size_bytes=slice_bytes, ) return pa.encode() diff --git a/tests/test_routing.py b/tests/test_routing.py index 474a337..da1d1c4 100644 --- a/tests/test_routing.py +++ b/tests/test_routing.py @@ -20,7 +20,7 @@ def test_resolve_hbm_addr(): """HBM address -> sip{S}.cube{C}.hbm_ctrl (single controller per cube).""" g = _graph() resolver = AddressResolver(g) - pa = PhysAddr.hbm_addr(rack_id=0, sip_id=0, cube_id=3, hbm_offset=0x1000) + pa = PhysAddr.hbm_addr(sip_id=0, die_id=3, hbm_offset=0x1000) assert resolver.resolve(pa) == "sip0.cube3.hbm_ctrl" @@ -28,33 +28,33 @@ def test_resolve_hbm_addr_high_offset(): """HBM address with large offset still resolves to same hbm_ctrl.""" g = _graph() resolver = AddressResolver(g) - pa = PhysAddr.hbm_addr(rack_id=0, sip_id=0, cube_id=0, hbm_offset=0x600000000) + pa = PhysAddr.hbm_addr(sip_id=0, die_id=0, hbm_offset=0x600000000) assert resolver.resolve(pa) == "sip0.cube0.hbm_ctrl" def test_resolve_pe_tcm_addr(): - """PE TCM address → sip{S}.cube{C}.pe{P}.pe_tcm""" + """PE TCM address -> sip{S}.cube{C}.pe{P}.pe_tcm""" g = _graph() resolver = AddressResolver(g) - pa = PhysAddr.pe_tcm_addr(rack_id=0, sip_id=1, cube_id=5, pe_id=7, tcm_offset=0x400) + pa = PhysAddr.pe_tcm_addr(sip_id=1, die_id=5, pe_id=7, tcm_offset=0x400) assert resolver.resolve(pa) == "sip1.cube5.pe7.pe_tcm" def test_resolve_sram_addr(): - """SRAM address → sip{S}.cube{C}.sram""" + """SRAM address -> sip{S}.cube{C}.sram""" g = _graph() resolver = AddressResolver(g) - pa = PhysAddr.cube_sram_addr(rack_id=0, sip_id=0, cube_id=10, sram_offset=0x800) + pa = PhysAddr.cube_sram_addr(sip_id=0, die_id=10, sram_offset=0x800) assert resolver.resolve(pa) == "sip0.cube10.sram" def test_resolve_mcpu_addr(): - """MCPU pe_resource address → sip{S}.cube{C}.m_cpu""" + """MCPU pe_resource address -> sip{S}.cube{C}.m_cpu""" g = _graph() resolver = AddressResolver(g) - pa = PhysAddr( - rack_id=0, sip_id=0, sip_seg=2, local_offset=(UnitType.MCPU << 34), - kind="pe_resource", cube_id=2, unit_type=UnitType.MCPU, + pa = PhysAddr.mcpu_resource_addr( + sip_id=0, die_id=2, + mcpu_sub_unit=0, sub_offset=0, ) assert resolver.resolve(pa) == "sip0.cube2.m_cpu" @@ -64,7 +64,7 @@ def test_resolve_nonexistent_node(): g = _graph() resolver = AddressResolver(g) # sip_id=15 doesn't exist in the 2-SIP topology - pa = PhysAddr.hbm_addr(rack_id=0, sip_id=15, cube_id=0, hbm_offset=0) + pa = PhysAddr.hbm_addr(sip_id=15, die_id=0, hbm_offset=0) with pytest.raises(RoutingError): resolver.resolve(pa) @@ -73,7 +73,7 @@ def test_resolve_nonexistent_node(): def test_path_local_hbm(): - """PE0 -> hbm_ctrl: pe_dma → router → hbm_ctrl (through router mesh).""" + """PE0 -> hbm_ctrl: pe_dma -> router -> hbm_ctrl (through router mesh).""" g = _graph() router = PathRouter(g) path = router.find_path("sip0.cube0.pe0", "sip0.cube0.hbm_ctrl") @@ -107,7 +107,7 @@ def test_all_pe_hbm_equidistant(): """All PEs in a cube have equal routing distance to hbm_ctrl. With n_to_one mapping and high routing weight on HBM edges, - all PE→hbm_ctrl paths have the same accumulated distance. + all PE->hbm_ctrl paths have the same accumulated distance. """ g = _graph() router = PathRouter(g) @@ -151,7 +151,7 @@ def test_path_remote_cube_hbm(): def test_path_sram_via_router_mesh(): - """PE → SRAM must go through router mesh nodes.""" + """PE -> SRAM must go through router mesh nodes.""" g = _graph() router = PathRouter(g) path = router.find_path("sip0.cube0.pe0", "sip0.cube0.sram") @@ -168,7 +168,7 @@ def test_path_sram_via_router_mesh(): def test_path_local_tcm(): - """PE0 → own TCM is PE-internal, not via router mesh.""" + """PE0 -> own TCM is PE-internal, not via router mesh.""" g = _graph() router = PathRouter(g) path = router.find_path("sip0.cube0.pe0", "sip0.cube0.pe0.pe_tcm") diff --git a/tests/test_tensor.py b/tests/test_tensor.py index 7a8b568..fc2de0e 100644 --- a/tests/test_tensor.py +++ b/tests/test_tensor.py @@ -44,7 +44,7 @@ _CFG = AddressConfig( def _make_allocators(num_pe: int = 8) -> dict[tuple[int, int, int], PEMemAllocator]: return { - (0, 0, i): PEMemAllocator(rack_id=0, sip_id=0, cube_id=0, pe_id=i, cfg=_CFG) + (0, 0, i): PEMemAllocator(sip_id=0, die_id=0, pe_id=i, cfg=_CFG) for i in range(num_pe) } diff --git a/tests/test_tensor_free.py b/tests/test_tensor_free.py index 20d9913..ca46fa9 100644 --- a/tests/test_tensor_free.py +++ b/tests/test_tensor_free.py @@ -55,7 +55,7 @@ def _make_ctx(): def test_allocator_free_hbm_reclaims_space(): """free_hbm returns HBM space; subsequent alloc can reuse it.""" - a = PEMemAllocator(rack_id=0, sip_id=0, cube_id=0, pe_id=0, cfg=_CFG) + a = PEMemAllocator(sip_id=0, die_id=0, pe_id=0, cfg=_CFG) pa1 = a.alloc_hbm(4096) used_after_alloc = a.hbm_used a.free_hbm(pa1, 4096) @@ -66,7 +66,7 @@ def test_allocator_free_hbm_reclaims_space(): def test_allocator_free_tcm_reclaims_space(): """free_tcm returns TCM space.""" - a = PEMemAllocator(rack_id=0, sip_id=0, cube_id=0, pe_id=0, cfg=_CFG) + a = PEMemAllocator(sip_id=0, die_id=0, pe_id=0, cfg=_CFG) pa1 = a.alloc_tcm(256) used_after_alloc = a.tcm_used a.free_tcm(pa1, 256) diff --git a/tests/test_va_integration.py b/tests/test_va_integration.py index 2998bde..6c5ecbe 100644 --- a/tests/test_va_integration.py +++ b/tests/test_va_integration.py @@ -39,7 +39,7 @@ _CFG = AddressConfig( def _make_allocators(num_pe: int = 8) -> dict[tuple[int, int, int], PEMemAllocator]: return { - (0, 0, i): PEMemAllocator(rack_id=0, sip_id=0, cube_id=0, pe_id=i, cfg=_CFG) + (0, 0, i): PEMemAllocator(sip_id=0, die_id=0, pe_id=i, cfg=_CFG) for i in range(num_pe) } diff --git a/tests/test_va_offset.py b/tests/test_va_offset.py index d7f71d7..9ea2eb3 100644 --- a/tests/test_va_offset.py +++ b/tests/test_va_offset.py @@ -70,7 +70,7 @@ def _make_standalone(shape, num_pe=NUM_PE): sram_bytes_per_cube=32 * _MB, ) allocators = { - (0, 0, i): PEMemAllocator(rack_id=0, sip_id=0, cube_id=0, pe_id=i, cfg=cfg) + (0, 0, i): PEMemAllocator(sip_id=0, die_id=0, pe_id=i, cfg=cfg) for i in range(num_pe) } va_alloc = VirtualAllocator(va_base=0x1_0000_0000, va_size=64 * _GB, page_size=4096)