533e699299
Add hardware design document (docs/ipcq-dma-codesign-hw.md) covering PE_IPCQ high-level architecture, simulator verification, proposed HW implementation, and alternatives analysis. Include D2 block diagrams for baseline and proposed PE architectures. Fix IPCQ slot-memory bandwidth parameters to match topology.yaml: SRAM 128→512 GB/s (intrinsic BW, NoC-bottlenecked at 128), HBM 32→256 GB/s (was per-channel, now per-PE aggregate). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
158 lines
3.1 KiB
Plaintext
158 lines
3.1 KiB
Plaintext
# Global layout: the diagram flows from left to right.
direction: right

# PE container: every block inside one processing element, followed by the
# connections between those blocks.
pe: PE {
  style.fill: "#f8f9fa"
  style.stroke: "#495057"
  style.border-radius: 8

  # Control block.
  cpu: PE_CPU (control) {
    style.fill: "#bbdefb"
    style.stroke: "#1565c0"
    style.border-radius: 4
  }

  # Dispatch block; shares the control block's blue palette.
  sched: PE_SCHED (dispatch) {
    style.fill: "#bbdefb"
    style.stroke: "#1565c0"
    style.border-radius: 4
  }

  # Wrapper labelled "(added)": drawn with a dashed, thicker outline so it
  # stands apart from the solid-outlined blocks around it.
  ipcq_added: IPCQ (added) {
    style.fill: "#e1f5fe"
    style.stroke: "#0277bd"
    style.stroke-dash: 5
    style.stroke-width: 2
    style.border-radius: 6

    ipcq: PE_IPCQ (control plane) {
      style.fill: "#bbdefb"
      style.stroke: "#1565c0"
      style.border-radius: 4
    }
  }

  # DMA block; it is the endpoint of both the compute-port and IPCQ-port
  # edges declared further down.
  dma: PE_DMA (single FIFO inbox) {
    style.fill: "#fff3e0"
    style.stroke: "#e65100"
    style.border-radius: 6
  }

  # Fetch/store block (green palette, shared with the GEMM and MATH engines).
  fs: PE_FETCH_STORE {
    style.fill: "#c8e6c9"
    style.stroke: "#2e7d32"
    style.border-radius: 4
  }

  # TCM with a nested, dashed IPCQ slot region.
  tcm: TCM (16MB SRAM) {
    style.fill: "#fce4ec"
    style.stroke: "#c62828"
    style.border-radius: 6

    ipcq_slot: IPCQ Slot Region {
      style.stroke-dash: 5
      style.fill: "#ffcdd2"
      style.stroke: "#c62828"
      style.border-radius: 3
    }
  }

  # Compute engines.
  gemm: GEMM engine {
    style.fill: "#c8e6c9"
    style.stroke: "#2e7d32"
    style.border-radius: 4
  }

  math: MATH engine {
    style.fill: "#c8e6c9"
    style.stroke: "#2e7d32"
    style.border-radius: 4
  }

  # Port through which the PE connects to the NoC router outside this container.
  fport: Fabric Port {
    style.fill: "#ffe0b2"
    style.stroke: "#e65100"
    style.border-radius: 4
  }

  # Control → dispatch
  cpu -> sched: cmd dispatch
  cpu -> ipcq_added.ipcq: IpcqRequest

  # Compute pipeline
  sched -> dma: TileToken\n(compute port)
  dma -> fs: TileToken
  dma <-> tcm: DMA_READ/WRITE\n(HBM ↔ TCM)
  fs <-> tcm: fetch/store\n(TCM ↔ reg)
  fs -> gemm: TileToken
  fs -> math: TileToken
  gemm -> fs: TileToken
  math -> fs: TileToken

  # IPCQ data path — outbound
  # (edge uses the same blue stroke as the PE_IPCQ block)
  ipcq_added.ipcq -> dma: IpcqDmaToken\n(IPCQ port) {style.stroke: "#1565c0"}

  # IPCQ data path — inbound (MetaArrival: DMA → IPCQ)
  dma -> ipcq_added.ipcq: IpcqMetaArrival {style.stroke: "#1565c0"}

  # Credit return (dashed)
  # A second IPCQ → DMA edge, kept separate from the IpcqDmaToken edge above
  # and distinguished by its purple dashed stroke.
  ipcq_added.ipcq -> dma: IpcqCreditMetadata\n(NoC latency charged) {
    style.stroke: "#7b1fa2"
    style.stroke-dash: 5
  }

  # DMA ↔ Fabric
  dma <-> fport
}

# ── NoC Router + attached resources ──

# Router node; the top-level connections at the end of the file link it to the
# PE fabric port, HBM, SRAM, other PEs and other cubes/SIPs.
noc: NoC Router {
  style.fill: "#f3e5f5"
  style.stroke: "#6a1b9a"
  style.border-radius: 6
}

# Local HBM with a nested, dashed IPCQ slot region.
hbm: Local HBM {
  style.fill: "#e8eaf6"
  style.stroke: "#283593"
  style.border-radius: 6

  ipcq_slot_hbm: IPCQ Slot Region {
    style.stroke-dash: 5
    style.fill: "#c5cae9"
    style.stroke: "#283593"
    style.border-radius: 3
  }
}

# Cube SRAM with a nested, dashed IPCQ slot region.
sram: Cube SRAM {
  style.fill: "#e0f7fa"
  style.stroke: "#00695c"
  style.border-radius: 6

  ipcq_slot_sram: IPCQ Slot Region {
    style.stroke-dash: 5
    style.fill: "#b2dfdb"
    style.stroke: "#00695c"
    style.border-radius: 3
  }
}

# Placeholder node for the remaining PEs attached to the NoC router.
other_pe: Other PEs {
  style.fill: "#ede7f6"
  style.stroke: "#6a1b9a"
  style.border-radius: 6
}

# Placeholder node for other cubes / SIPs attached to the NoC router.
other_cube: Other Cubes / SIPs {
  style.fill: "#ede7f6"
  style.stroke: "#6a1b9a"
  style.border-radius: 6
}

# Top-level wiring: the NoC router is the hub, and every link to it is
# bidirectional.
pe.fport <-> noc
noc <-> hbm
noc <-> sram
noc <-> other_pe
noc <-> other_cube