PE-to-PE latency test + supporting fixes
Adds tests/test_pe_to_pe_latency.py: a sweep that measures PE-to-PE transfer latency for five hop types (intra-cube horizontal/vertical, inter-cube horizontal/vertical, inter-SIP) across data sizes from 128 B to 10 KB, on both the IPCQ (tl.send/tl.recv) and raw-DMA (tl.load + tl.store) paths. It emits per-hop PNG plots, an overview PNG, and a CSV summary into tests/pe2pe_latency_plots/.

Latency is reported as max(pe_exec_ns) across participating PEs, read from engine.get_completion(), so the measurement captures the SRC/DST PE's kernel body time rather than the full launch + response-aggregation envelope.

Two simulator fixes were needed to make this measurement meaningful:

- PeMMU now stores a list of (start, end, pa) sub-regions per page rather than a single PA. DPPolicy layouts with shards smaller than page_size (e.g. 128 B payloads with 4 KB pages) used to silently overwrite each other through last-write-wins, causing DMAs intended for cube0 to physically route to cube3 and inflating latency by ~170 ns per DMA at small sizes. STOPGAP: real MMUs don't support sub-page regions; the long-term fix is either a smaller MMU page size or DPPolicy validation that refuses sub-page shards.

- M_CPU's per-PE metrics aggregation (pe_exec_ns, dma_ns, compute_ns) now max-merges against the existing value in result_data rather than overwriting it. Multi-cube workloads share one result_data dict via IO_CPU fanout; the previous overwrite caused whichever cube's M_CPU finished last to clobber the others' values, so multi-cube pe_exec_ns was racy and frequently 0. The same fix is applied in legacy/builtin/m_cpu.py.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -204,16 +204,21 @@ class MCpuComponent(ComponentBase):
|
||||
yield all_done
|
||||
del self._parent_txns[request.request_id]
|
||||
|
||||
# Aggregate PE-internal metrics (max across PEs)
|
||||
# Aggregate PE-internal metrics (max across PEs and across cubes).
|
||||
# Multiple M_CPUs share the same result_data dict via IO_CPU fanout;
|
||||
# merge against the existing value so cubes don't clobber each other.
|
||||
pe_exec_values = [st.result_data.get("pe_exec_ns", 0.0) for st in sub_txns]
|
||||
if pe_exec_values:
|
||||
txn.result_data["pe_exec_ns"] = max(pe_exec_values)
|
||||
cur = txn.result_data.get("pe_exec_ns", 0.0) or 0.0
|
||||
txn.result_data["pe_exec_ns"] = max(cur, max(pe_exec_values))
|
||||
dma_values = [st.result_data.get("dma_ns", 0.0) for st in sub_txns]
|
||||
if dma_values:
|
||||
txn.result_data["dma_ns"] = max(dma_values)
|
||||
cur = txn.result_data.get("dma_ns", 0.0) or 0.0
|
||||
txn.result_data["dma_ns"] = max(cur, max(dma_values))
|
||||
compute_values = [st.result_data.get("compute_ns", 0.0) for st in sub_txns]
|
||||
if compute_values:
|
||||
txn.result_data["compute_ns"] = max(compute_values)
|
||||
cur = txn.result_data.get("compute_ns", 0.0) or 0.0
|
||||
txn.result_data["compute_ns"] = max(cur, max(compute_values))
|
||||
|
||||
# Send aggregate response on reverse command path back to IO_CPU
|
||||
reverse_path = list(reversed(txn.path))
|
||||
|
||||
@@ -204,16 +204,21 @@ class MCpuComponent(ComponentBase):
|
||||
yield all_done
|
||||
del self._parent_txns[request.request_id]
|
||||
|
||||
# Aggregate PE-internal metrics (max across PEs)
|
||||
# Aggregate PE-internal metrics (max across PEs and across cubes).
|
||||
# Multiple M_CPUs share the same result_data dict via IO_CPU fanout;
|
||||
# merge against the existing value so cubes don't clobber each other.
|
||||
pe_exec_values = [st.result_data.get("pe_exec_ns", 0.0) for st in sub_txns]
|
||||
if pe_exec_values:
|
||||
txn.result_data["pe_exec_ns"] = max(pe_exec_values)
|
||||
cur = txn.result_data.get("pe_exec_ns", 0.0) or 0.0
|
||||
txn.result_data["pe_exec_ns"] = max(cur, max(pe_exec_values))
|
||||
dma_values = [st.result_data.get("dma_ns", 0.0) for st in sub_txns]
|
||||
if dma_values:
|
||||
txn.result_data["dma_ns"] = max(dma_values)
|
||||
cur = txn.result_data.get("dma_ns", 0.0) or 0.0
|
||||
txn.result_data["dma_ns"] = max(cur, max(dma_values))
|
||||
compute_values = [st.result_data.get("compute_ns", 0.0) for st in sub_txns]
|
||||
if compute_values:
|
||||
txn.result_data["compute_ns"] = max(compute_values)
|
||||
cur = txn.result_data.get("compute_ns", 0.0) or 0.0
|
||||
txn.result_data["compute_ns"] = max(cur, max(compute_values))
|
||||
|
||||
# Send aggregate response on reverse command path back to IO_CPU
|
||||
reverse_path = list(reversed(txn.path))
|
||||
|
||||
@@ -19,7 +19,14 @@ class PageFault(Exception):
|
||||
|
||||
|
||||
class PeMMU:
|
||||
"""Per-PE MMU with page-aligned VA→PA translation table.
|
||||
"""Per-PE MMU with sub-page-capable VA→PA translation table.
|
||||
|
||||
Each page-table entry is a list of (start_in_page, end_in_page,
|
||||
pa_at_offset_zero) regions. This is a SIMULATOR STOPGAP — real MMUs
|
||||
store one PA per page-table entry. Sub-page regions exist here so
|
||||
DPPolicy layouts that shard below page granularity (e.g. 128 B
|
||||
payloads with 4 KB pages) don't silently mis-route through last-
|
||||
write-wins overwrites. Memory note: project_mmu_subpage_stopgap.md.
|
||||
|
||||
Args:
|
||||
page_size: Page size in bytes (default 2 MB).
|
||||
@@ -34,7 +41,11 @@ class PeMMU:
|
||||
self._page_size = page_size
|
||||
self._page_shift = (page_size - 1).bit_length()
|
||||
self._page_mask = page_size - 1
|
||||
self._table: dict[int, int] = {} # va_page_number → pa_page_base
|
||||
# vpn → list of (start_in_page, end_in_page, pa_at_offset_zero).
|
||||
# pa_at_offset_zero is the PA that offset 0 of the page would map
|
||||
# to under this region — i.e. translate(off) = pa_at_offset_zero
|
||||
# + off when start <= off < end.
|
||||
self._table: dict[int, list[tuple[int, int, int]]] = {}
|
||||
self._overhead_ns = overhead_ns
|
||||
|
||||
@property
|
||||
@@ -46,21 +57,67 @@ class PeMMU:
|
||||
return len(self._table)
|
||||
|
||||
def map(self, va: int, pa: int, size: int) -> None:
|
||||
"""Register VA→PA mapping for a contiguous range."""
|
||||
for off in range(0, size, self._page_size):
|
||||
vpn = (va + off) >> self._page_shift
|
||||
self._table[vpn] = pa + off
|
||||
"""Register VA→PA mapping for a contiguous range.
|
||||
|
||||
Sub-page-aware: a single page can hold multiple disjoint regions,
|
||||
each pointing to a different PA. Later map() calls APPEND a new
|
||||
region; on overlap with an existing region, the new region wins
|
||||
for the overlapping offsets (translate iterates in reverse so the
|
||||
last write takes precedence — matches legacy single-PA behavior
|
||||
when a full page is re-mapped).
|
||||
"""
|
||||
end_va = va + size
|
||||
cur = va
|
||||
while cur < end_va:
|
||||
vpn = cur >> self._page_shift
|
||||
page_base_va = vpn << self._page_shift
|
||||
page_end_va = page_base_va + self._page_size
|
||||
region_start = cur - page_base_va
|
||||
region_end = min(end_va, page_end_va) - page_base_va
|
||||
# PA seen at offset 0 of page if this region's mapping covered it
|
||||
pa_at_offset_zero = pa + (cur - va) - region_start
|
||||
self._table.setdefault(vpn, []).append(
|
||||
(region_start, region_end, pa_at_offset_zero)
|
||||
)
|
||||
cur = page_base_va + region_end
|
||||
|
||||
def unmap(self, va: int, size: int) -> None:
|
||||
"""Remove VA mapping for a contiguous range."""
|
||||
for off in range(0, size, self._page_size):
|
||||
vpn = (va + off) >> self._page_shift
|
||||
self._table.pop(vpn, None)
|
||||
"""Remove VA mapping for a contiguous range.
|
||||
|
||||
Drops any region whose extent is contained within the unmapped
|
||||
range. Partial overlaps (region straddles the range boundary)
|
||||
are left in place — caller is expected to unmap on the same
|
||||
boundaries it mapped on.
|
||||
"""
|
||||
end_va = va + size
|
||||
cur = va
|
||||
while cur < end_va:
|
||||
vpn = cur >> self._page_shift
|
||||
page_base_va = vpn << self._page_shift
|
||||
page_end_va = page_base_va + self._page_size
|
||||
unmap_start = cur - page_base_va
|
||||
unmap_end = min(end_va, page_end_va) - page_base_va
|
||||
regions = self._table.get(vpn)
|
||||
if regions is not None:
|
||||
kept = [
|
||||
r for r in regions
|
||||
if not (r[0] >= unmap_start and r[1] <= unmap_end)
|
||||
]
|
||||
if kept:
|
||||
self._table[vpn] = kept
|
||||
else:
|
||||
del self._table[vpn]
|
||||
cur = page_base_va + unmap_end
|
||||
|
||||
def translate(self, va: int) -> int:
|
||||
"""Translate VA to PA. Raises PageFault if unmapped."""
|
||||
vpn = va >> self._page_shift
|
||||
pa_page_base = self._table.get(vpn)
|
||||
if pa_page_base is None:
|
||||
regions = self._table.get(vpn)
|
||||
if regions is None:
|
||||
raise PageFault(va)
|
||||
return pa_page_base + (va & self._page_mask)
|
||||
offset = va & self._page_mask
|
||||
# Iterate latest-first so newer map() calls win on overlap
|
||||
for start, end, pa_at_offset_zero in reversed(regions):
|
||||
if start <= offset < end:
|
||||
return pa_at_offset_zero + offset
|
||||
raise PageFault(va)
|
||||
|
||||
Reference in New Issue
Block a user