Implement ADR-0021: PE pipeline refactor with token self-routing

Steps 1-2: Back up existing code
- builtin/ → builtin_legacy/ (unchanged backup)
- custom/pe_accel/ → custom/pe_accel_legacy/ (unchanged backup)

Steps 3-4: New pipeline types and tiling
- pe_types.py: StageType, Stage, TilePlan, PipelinePlan, PipelineContext, TileToken
- tiling.py: generate_gemm_plan, generate_math_plan (ported from pe_accel)

Step 5: Component implementations (ADR-0021 D4-D6)
- PE_SCHEDULER: _feed_loop (singleton FIFO feeder) + plan generation
- PE_FETCH_STORE: new component — TCM ↔ Register File
- PE_GEMM: TileToken pipeline + legacy PeInternalTxn dual-mode
- PE_MATH: TileToken pipeline + legacy dual-mode
- PE_DMA: TileToken pipeline + legacy + fabric Transaction triple-mode
- PE_TCM: TcmRequest handler with dual-channel BW serialization

Step 6: Infrastructure
- topology.yaml: pe_fetch_store component + chaining edges
- components.yaml: pe_fetch_store_v1 registration
- builder.py: PE_COMP_OFFSETS, _add_pe_internal_edges, PE view positions
- Tests: node/edge counts, PE component sets updated

All components handle both TileToken (pipeline) and PeInternalTxn (legacy).
Token self-routing: components read next stage from token.plan, chain via out_port.
366 tests passing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-08 23:35:31 -07:00
parent 161132cdcb
commit b6eb97c49a
40 changed files with 4055 additions and 214 deletions
@@ -105,6 +105,73 @@ class PeDmaComponent(PeEngineBase):
yield sub_done
pe_txn.done.succeed()
def _worker(self, env: simpy.Environment) -> Generator:
    """Dispatch every inbox message to the handler matching its type.

    ``TileToken`` messages take the pipeline path, ``PeInternalTxn``
    messages take the legacy hooked handler, and any other message is
    forwarded as a fabric transaction.  Each message is handled in its own
    SimPy process, so the loop returns to the inbox without waiting for
    the handler to finish.

    Args:
        env: SimPy environment used to spawn the per-message processes.

    Yields:
        The inbox ``get()`` event for the next message.
    """
    from kernbench.common.pe_commands import PeInternalTxn
    from kernbench.components.builtin.pe_types import TileToken

    while True:
        incoming: Any = yield self._inbox.get()

        # Pick the handler first; TileToken is checked before PeInternalTxn.
        if isinstance(incoming, TileToken):
            handler = self._pipeline_process
        elif isinstance(incoming, PeInternalTxn):
            handler = self._handle_with_hooks
        else:
            handler = self._forward_txn

        env.process(handler(env, incoming))
def _pipeline_process(self, env: simpy.Environment, token: Any) -> Generator:
    """Run one DMA pipeline stage for ``token``, then route it onward.

    The token's current stage type selects the direction: ``DMA_WRITE``
    reads ``params["dst_addr"]`` and uses the write channel; any other
    stage type reads ``params["src_addr"]`` and uses the read channel.
    When there are bytes to move and a context is attached, the matching
    DMA channel is held while a sub-transaction is sent along the routed
    path and its completion event is awaited.  Afterwards the token is
    advanced: it is put on the next stage's output port, or the tile is
    reported complete when no stage is left.

    Args:
        env: SimPy environment used to create the completion event.
        token: Pipeline token; must provide ``params``, ``current_stage``,
            ``tile_id``, ``advance()`` and ``pipeline_ctx``.

    Yields:
        SimPy events: the DMA channel request, output-port puts and the
        sub-transaction's completion event.
    """
    from kernbench.components.builtin.pe_types import StageType
    from kernbench.policy.address.phyaddr import PhysAddr
    from kernbench.runtime_api.kernel import PeDmaMsg

    self._on_process_start(env, token)

    params = token.params
    is_write = token.current_stage.stage_type == StageType.DMA_WRITE
    addr = params.get("dst_addr" if is_write else "src_addr", 0)
    nbytes = params.get("nbytes", 0)

    # Zero-byte stages (or a component without a context) skip the transfer
    # but still run the end hook and the self-routing below.
    if nbytes > 0 and self.ctx:
        dma_res = self._dma_write if is_write else self._dma_read
        assert dma_res is not None

        # Resolve the physical address to a destination node and route to it.
        pa = PhysAddr.decode(addr)
        dst_node = self.ctx.resolver.resolve(pa)
        path = self.ctx.router.find_path(self._pe_prefix, dst_node)
        drain_ns = self.ctx.compute_drain_ns(path, nbytes)

        # The channel stays held until the transfer has completed.
        with dma_res.request() as req:
            yield req
            sub_done = env.event()
            # NOTE(review): source coordinates are fixed to 0 here — confirm
            # that downstream consumers do not rely on them.
            sub_request = PeDmaMsg(
                correlation_id="pipeline",
                request_id=f"tile_{token.tile_id}",
                src_sip=0, src_cube=0, src_pe=0,
                dst_pa=addr, nbytes=nbytes,
                is_write=is_write,
            )
            sub_txn = Transaction(
                request=sub_request, path=path, step=0,
                nbytes=nbytes, done=sub_done, drain_ns=drain_ns,
            )
            if len(path) > 1:
                yield self.out_ports[path[1]].put(sub_txn.advance())
                # Wait only when the sub-transaction was actually sent;
                # otherwise nothing would ever trigger ``sub_done``.
                yield sub_done

    self._on_process_end(env, token)

    # Self-routing: hand the token to its next stage, or finish the tile.
    next_stage = token.advance()
    if next_stage is not None:
        yield self.out_ports[next_stage.component].put(token)
    else:
        token.pipeline_ctx.complete_tile()
def _forward_txn(self, env: simpy.Environment, txn: Any) -> Generator:
"""Handle external Transaction (PeDmaMsg probe, M_CPU DMA) with channel acquisition."""
# Response transactions bypass DMA channel (no outbound resource needed)