PE-to-PE latency test + supporting fixes

Adds tests/test_pe_to_pe_latency.py: a sweep that measures PE-to-PE transfer latency for five hop types (intra-cube horizontal/vertical, inter-cube horizontal/vertical, inter-SIP) across data sizes from 128 B to 10 KB, on both the IPCQ (tl.send/tl.recv) and raw-DMA (tl.load + tl.store) paths. Emits per-hop PNG plots, an overview PNG, and a CSV summary into tests/pe2pe_latency_plots/.

Latency is reported as max(pe_exec_ns) across the participating PEs, read from engine.get_completion(), so the measurement captures the SRC/DST PEs' kernel-body time rather than the full launch + response-aggregation envelope.

Two simulator fixes were needed to make this measurement meaningful:

- PeMMU now stores a list of (start, end, pa) sub-regions per page rather than a single PA. DPPolicy layouts with shards smaller than page_size (e.g. 128 B payloads with 4 KB pages) used to silently overwrite each other through last-write-wins, causing DMAs intended for cube0 to be physically routed to cube3 and inflating latency by ~170 ns per DMA at small sizes. STOPGAP: real MMUs don't support sub-page regions; the long-term fix is either a smaller MMU page size or DPPolicy validation that refuses sub-page shards.

- M_CPU's per-PE metrics aggregation (pe_exec_ns, dma_ns, compute_ns) now max-merges against the existing value in result_data rather than overwriting it. Multi-cube workloads share one result_data dict via IO_CPU fanout; the previous overwrite let whichever cube's M_CPU finished last clobber the others' values, so multi-cube pe_exec_ns was racy and frequently 0. The same fix is applied in legacy/builtin/m_cpu.py.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 21:04:31 -07:00
parent 1d8b9401e5
commit 6918e6e906
4 changed files with 446 additions and 21 deletions
@@ -204,16 +204,21 @@ class MCpuComponent(ComponentBase):
        yield all_done
        del self._parent_txns[request.request_id]

-        # Aggregate PE-internal metrics (max across PEs)
+        # Aggregate PE-internal metrics (max across PEs and across cubes).
+        # Multiple M_CPUs share the same result_data dict via IO_CPU fanout;
+        # merge against the existing value so cubes don't clobber each other.
        pe_exec_values = [st.result_data.get("pe_exec_ns", 0.0) for st in sub_txns]
        if pe_exec_values:
-            txn.result_data["pe_exec_ns"] = max(pe_exec_values)
+            cur = txn.result_data.get("pe_exec_ns", 0.0) or 0.0
+            txn.result_data["pe_exec_ns"] = max(cur, max(pe_exec_values))
        dma_values = [st.result_data.get("dma_ns", 0.0) for st in sub_txns]
        if dma_values:
-            txn.result_data["dma_ns"] = max(dma_values)
+            cur = txn.result_data.get("dma_ns", 0.0) or 0.0
+            txn.result_data["dma_ns"] = max(cur, max(dma_values))
        compute_values = [st.result_data.get("compute_ns", 0.0) for st in sub_txns]
        if compute_values:
-            txn.result_data["compute_ns"] = max(compute_values)
+            cur = txn.result_data.get("compute_ns", 0.0) or 0.0
+            txn.result_data["compute_ns"] = max(cur, max(compute_values))

        # Send aggregate response on reverse command path back to IO_CPU
        reverse_path = list(reversed(txn.path))
@@ -204,16 +204,21 @@ class MCpuComponent(ComponentBase):
        yield all_done
        del self._parent_txns[request.request_id]

-        # Aggregate PE-internal metrics (max across PEs)
+        # Aggregate PE-internal metrics (max across PEs and across cubes).
+        # Multiple M_CPUs share the same result_data dict via IO_CPU fanout;
+        # merge against the existing value so cubes don't clobber each other.
        pe_exec_values = [st.result_data.get("pe_exec_ns", 0.0) for st in sub_txns]
        if pe_exec_values:
-            txn.result_data["pe_exec_ns"] = max(pe_exec_values)
+            cur = txn.result_data.get("pe_exec_ns", 0.0) or 0.0
+            txn.result_data["pe_exec_ns"] = max(cur, max(pe_exec_values))
        dma_values = [st.result_data.get("dma_ns", 0.0) for st in sub_txns]
        if dma_values:
-            txn.result_data["dma_ns"] = max(dma_values)
+            cur = txn.result_data.get("dma_ns", 0.0) or 0.0
+            txn.result_data["dma_ns"] = max(cur, max(dma_values))
        compute_values = [st.result_data.get("compute_ns", 0.0) for st in sub_txns]
        if compute_values:
-            txn.result_data["compute_ns"] = max(compute_values)
+            cur = txn.result_data.get("compute_ns", 0.0) or 0.0
+            txn.result_data["compute_ns"] = max(cur, max(compute_values))

        # Send aggregate response on reverse command path back to IO_CPU
        reverse_path = list(reversed(txn.path))
@@ -19,7 +19,14 @@ class PageFault(Exception):


 class PeMMU:
-    """Per-PE MMU with page-aligned VA→PA translation table.
+    """Per-PE MMU with sub-page-capable VA→PA translation table.
+
+    Each page-table entry is a list of (start_in_page, end_in_page,
+    pa_at_offset_zero) regions. This is a SIMULATOR STOPGAP — real MMUs
+    store one PA per page-table entry. Sub-page regions exist here so
+    DPPolicy layouts that shard below page granularity (e.g. 128 B
+    payloads with 4 KB pages) don't silently mis-route through last-
+    write-wins overwrites. Memory note: project_mmu_subpage_stopgap.md.

    Args:
        page_size: Page size in bytes (default 2 MB).
@@ -34,7 +41,11 @@ class PeMMU:
        self._page_size = page_size
        self._page_shift = (page_size - 1).bit_length()
        self._page_mask = page_size - 1
-        self._table: dict[int, int] = {}  # va_page_number → pa_page_base
+        # vpn → list of (start_in_page, end_in_page, pa_at_offset_zero).
+        # pa_at_offset_zero is the PA that offset 0 of the page would map
+        # to under this region — i.e. translate(off) = pa_at_offset_zero
+        # + off when start <= off < end.
+        self._table: dict[int, list[tuple[int, int, int]]] = {}
        self._overhead_ns = overhead_ns

    @property
@@ -46,21 +57,67 @@ class PeMMU:
        return len(self._table)

    def map(self, va: int, pa: int, size: int) -> None:
-        """Register VA→PA mapping for a contiguous range."""
-        for off in range(0, size, self._page_size):
-            vpn = (va + off) >> self._page_shift
-            self._table[vpn] = pa + off
+        """Register VA→PA mapping for a contiguous range.
+
+        Sub-page-aware: a single page can hold multiple disjoint regions,
+        each pointing to a different PA. Later map() calls APPEND a new
+        region; on overlap with an existing region, the new region wins
+        for the overlapping offsets (translate iterates in reverse so the
+        last write takes precedence — matches legacy single-PA behavior
+        when a full page is re-mapped).
+        """
+        end_va = va + size
+        cur = va
+        while cur < end_va:
+            vpn = cur >> self._page_shift
+            page_base_va = vpn << self._page_shift
+            page_end_va = page_base_va + self._page_size
+            region_start = cur - page_base_va
+            region_end = min(end_va, page_end_va) - page_base_va
+            # PA seen at offset 0 of page if this region's mapping covered it
+            pa_at_offset_zero = pa + (cur - va) - region_start
+            self._table.setdefault(vpn, []).append(
+                (region_start, region_end, pa_at_offset_zero)
+            )
+            cur = page_base_va + region_end

    def unmap(self, va: int, size: int) -> None:
-        """Remove VA mapping for a contiguous range."""
-        for off in range(0, size, self._page_size):
-            vpn = (va + off) >> self._page_shift
-            self._table.pop(vpn, None)
+        """Remove VA mapping for a contiguous range.
+
+        Drops any region whose extent is contained within the unmapped
+        range. Partial overlaps (region straddles the range boundary)
+        are left in place — caller is expected to unmap on the same
+        boundaries it mapped on.
+        """
+        end_va = va + size
+        cur = va
+        while cur < end_va:
+            vpn = cur >> self._page_shift
+            page_base_va = vpn << self._page_shift
+            page_end_va = page_base_va + self._page_size
+            unmap_start = cur - page_base_va
+            unmap_end = min(end_va, page_end_va) - page_base_va
+            regions = self._table.get(vpn)
+            if regions is not None:
+                kept = [
+                    r for r in regions
+                    if not (r[0] >= unmap_start and r[1] <= unmap_end)
+                ]
+                if kept:
+                    self._table[vpn] = kept
+                else:
+                    del self._table[vpn]
+            cur = page_base_va + unmap_end

    def translate(self, va: int) -> int:
        """Translate VA to PA. Raises PageFault if unmapped."""
        vpn = va >> self._page_shift
-        pa_page_base = self._table.get(vpn)
-        if pa_page_base is None:
+        regions = self._table.get(vpn)
+        if regions is None:
            raise PageFault(va)
-        return pa_page_base + (va & self._page_mask)
+        offset = va & self._page_mask
+        # Iterate latest-first so newer map() calls win on overlap
+        for start, end, pa_at_offset_zero in reversed(regions):
+            if start <= offset < end:
+                return pa_at_offset_zero + offset
+        raise PageFault(va)