Implement ADR-0021: PE pipeline refactor with token self-routing

Steps 1-2: Back up existing code
- builtin/ → builtin_legacy/ (unchanged backup)
- custom/pe_accel/ → custom/pe_accel_legacy/ (unchanged backup)

Steps 3-4: New pipeline types and tiling
- pe_types.py: StageType, Stage, TilePlan, PipelinePlan, PipelineContext, TileToken
- tiling.py: generate_gemm_plan, generate_math_plan (ported from pe_accel)

Step 5: Component implementations (ADR-0021 D4-D6)
- PE_SCHEDULER: _feed_loop (singleton FIFO feeder) + plan generation
- PE_FETCH_STORE: new component — TCM ↔ Register File
- PE_GEMM: TileToken pipeline + legacy PeInternalTxn dual-mode
- PE_MATH: TileToken pipeline + legacy dual-mode
- PE_DMA: TileToken pipeline + legacy + fabric Transaction triple-mode
- PE_TCM: TcmRequest handler with dual-channel BW serialization

Step 6: Infrastructure
- topology.yaml: pe_fetch_store component + chaining edges
- components.yaml: pe_fetch_store_v1 registration
- builder.py: PE_COMP_OFFSETS, _add_pe_internal_edges, PE view positions
- Tests: node/edge counts, PE component sets updated

All components handle both TileToken (pipeline) and PeInternalTxn (legacy).
Token self-routing: each component reads the next stage from token.plan and chains to it via out_port.
366 tests passing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-08 23:35:31 -07:00
parent 161132cdcb
commit b6eb97c49a
40 changed files with 4055 additions and 214 deletions
+52 -1
View File
@@ -20,6 +20,7 @@ _PE_COMP_OFFSETS = {
"pe_cpu": (-0.3, 0.0),
"pe_scheduler": (-0.15, 0.0),
"pe_dma": (0.0, -0.15),
"pe_fetch_store": (0.15, 0.0),
"pe_gemm": (0.0, 0.0),
"pe_math": (0.0, 0.15),
"pe_mmu": (0.15, -0.15),
@@ -637,12 +638,13 @@ def _instantiate_cube(
def _add_pe_internal_edges(edges: list[Edge], pp: str, pe_links: dict) -> None:
"""Add PE-internal edges for a single PE instance."""
"""Add PE-internal edges for a single PE instance (ADR-0021)."""
edges.append(Edge(
src=f"{pp}.pe_cpu", dst=f"{pp}.pe_scheduler",
distance_mm=pe_links["pe_cpu_to_scheduler_mm"],
kind="pe_internal",
))
# Scheduler → engines (initial dispatch)
for eng, key in [("pe_dma", "scheduler_to_dma_mm"),
("pe_gemm", "scheduler_to_gemm_mm"),
("pe_math", "scheduler_to_math_mm")]:
@@ -651,6 +653,15 @@ def _add_pe_internal_edges(edges: list[Edge], pp: str, pe_links: dict) -> None:
distance_mm=pe_links[key],
kind="pe_internal",
))
# Scheduler → fetch_store (initial dispatch)
if "scheduler_to_fetch_store_mm" in pe_links:
edges.append(Edge(
src=f"{pp}.pe_scheduler", dst=f"{pp}.pe_fetch_store",
distance_mm=pe_links["scheduler_to_fetch_store_mm"],
kind="pe_internal",
))
# Engine → TCM (legacy BW edges)
for eng, mm_key, bw_key in [("pe_dma", "dma_to_tcm_mm", "dma_to_tcm_bw_gbs"),
("pe_gemm", "gemm_to_tcm_mm", "gemm_to_tcm_bw_gbs"),
("pe_math", "math_to_tcm_mm", "math_to_tcm_bw_gbs")]:
@@ -661,6 +672,32 @@ def _add_pe_internal_edges(edges: list[Edge], pp: str, pe_links: dict) -> None:
kind="pe_internal",
))
# Fetch/Store → TCM (ADR-0021 D5)
if "fetch_store_to_tcm_mm" in pe_links:
edges.append(Edge(
src=f"{pp}.pe_fetch_store", dst=f"{pp}.pe_tcm",
distance_mm=pe_links["fetch_store_to_tcm_mm"],
bw_gbs=pe_links.get("fetch_store_to_tcm_bw_gbs", 512.0),
kind="pe_internal",
))
# Chaining edges (ADR-0021 D4 — token self-routing)
chaining = [
("pe_dma", "pe_fetch_store", "dma_to_fetch_store_mm"),
("pe_fetch_store", "pe_gemm", "fetch_store_to_gemm_mm"),
("pe_fetch_store", "pe_math", "fetch_store_to_math_mm"),
("pe_gemm", "pe_fetch_store", "gemm_to_fetch_store_mm"),
("pe_math", "pe_fetch_store", "math_to_fetch_store_mm"),
("pe_fetch_store", "pe_dma", "fetch_store_to_dma_mm"),
]
for src_eng, dst_eng, mm_key in chaining:
if mm_key in pe_links:
edges.append(Edge(
src=f"{pp}.{src_eng}", dst=f"{pp}.{dst_eng}",
distance_mm=pe_links[mm_key],
kind="pe_internal",
))
# ── Inter-cube / IO / system edges ──────────────────────────────────
@@ -1071,6 +1108,7 @@ def _build_pe_view(spec: dict) -> ViewGraph:
"pe_cpu": (1.5, 4.0),
"pe_scheduler": (4.0, 4.0),
"pe_dma": (7.0, 1.5),
"pe_fetch_store": (8.5, 4.0),
"pe_gemm": (7.0, 4.0),
"pe_math": (7.0, 6.5),
"pe_mmu": (4.0, 1.5),
@@ -1101,6 +1139,12 @@ def _build_pe_view(spec: dict) -> ViewGraph:
distance_mm=pe_links[key],
kind="pe_internal",
))
if "scheduler_to_fetch_store_mm" in pe_links:
view_edges.append(Edge(
src="pe_scheduler", dst="pe_fetch_store",
distance_mm=pe_links["scheduler_to_fetch_store_mm"],
kind="pe_internal",
))
for eng, mm_key, bw_key in [("pe_dma", "dma_to_tcm_mm", "dma_to_tcm_bw_gbs"),
("pe_gemm", "gemm_to_tcm_mm", "gemm_to_tcm_bw_gbs"),
("pe_math", "math_to_tcm_mm", "math_to_tcm_bw_gbs")]:
@@ -1110,6 +1154,13 @@ def _build_pe_view(spec: dict) -> ViewGraph:
bw_gbs=pe_links[bw_key],
kind="pe_internal",
))
if "fetch_store_to_tcm_mm" in pe_links:
view_edges.append(Edge(
src="pe_fetch_store", dst="pe_tcm",
distance_mm=pe_links["fetch_store_to_tcm_mm"],
bw_gbs=pe_links.get("fetch_store_to_tcm_bw_gbs", 512.0),
kind="pe_internal",
))
return ViewGraph(
name="pe", nodes=nodes, edges=view_edges,