533e699299
Add hardware design document (docs/ipcq-dma-codesign-hw.md) covering PE_IPCQ high-level architecture, simulator verification, proposed HW implementation, and alternatives analysis. Include D2 block diagrams for baseline and proposed PE architectures. Fix IPCQ slot-memory bandwidth parameters to match topology.yaml: SRAM 128→512 GB/s (intrinsic BW, NoC-bottlenecked at 128), HBM 32→256 GB/s (was per-channel, now per-PE aggregate). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
167 lines
3.3 KiB
Plaintext
167 lines
3.3 KiB
Plaintext
# Lay the diagram out left-to-right.
direction: right

# PE container: every block inside one PE, plus the connections between them.
# Labels carry "(NEW)" / "(MOD)" tags; the blocks tagged NEW and the reserved
# IPCQ slot region are additionally drawn with a thicker outline
# (style.stroke-width: 2).
pe: PE {
  style.fill: "#f8f9fa"
  style.stroke: "#495057"
  style.border-radius: 8

  cpu: PE_CPU (control) {
    style.fill: "#bbdefb"
    style.stroke: "#1565c0"
    style.border-radius: 4
  }

  sched: PE_SCHED (dispatch) {
    style.fill: "#bbdefb"
    style.stroke: "#1565c0"
    style.border-radius: 4
  }

  # IPCQ controller, split into a pointer-management group and a sideband group.
  ipcq: IPCQ Controller (NEW) {
    style.fill: "#e1f5fe"
    style.stroke: "#0277bd"
    style.border-radius: 6
    style.stroke-width: 2

    ptrmgmt: Pointer Mgmt {
      style.fill: "#b3e5fc"
      style.stroke: "#0277bd"
      style.border-radius: 4

      qprf: QPair Reg File
      bp: Backpressure
      sag: Slot Addr Gen
    }

    sideband: Sideband {
      style.fill: "#b3e5fc"
      style.stroke: "#0277bd"
      style.border-radius: 4

      metax: Meta Extractor
      crinj: Credit Injector
      crrcv: Credit Receiver
    }
  }

  # DMA block: two input ports that both feed the WRR arbiter.
  dma: PE_DMA (MOD) {
    style.fill: "#fff3e0"
    style.stroke: "#e65100"
    style.border-radius: 6

    compute_port: compute port {
      style.fill: "#ffe0b2"
      style.stroke: "#e65100"
      style.border-radius: 4
    }
    ipcq_port: IPCQ port {
      style.fill: "#ffe0b2"
      style.stroke: "#e65100"
      style.border-radius: 4
    }
    wrr: WRR Arbiter (NEW) {
      style.fill: "#ffcc80"
      style.stroke: "#e65100"
      style.border-radius: 4
      style.stroke-width: 2
    }

    compute_port -> wrr
    ipcq_port -> wrr
  }

  fs: PE_FETCH_STORE {
    style.fill: "#c8e6c9"
    style.stroke: "#2e7d32"
    style.border-radius: 4
  }

  # TCM, shown as a kernel working region and a reserved IPCQ slot region.
  tcm: TCM (16MB SRAM) {
    style.fill: "#fce4ec"
    style.stroke: "#c62828"
    style.border-radius: 6

    work: Kernel Working Memory {
      style.fill: "#f8bbd0"
      style.stroke: "#c62828"
      style.border-radius: 4
    }
    slot: IPCQ Slot Region (rsv) {
      style.fill: "#f48fb1"
      style.stroke: "#c62828"
      style.border-radius: 4
      style.stroke-width: 2
    }
  }

  gemm: GEMM engine {
    style.fill: "#c8e6c9"
    style.stroke: "#2e7d32"
    style.border-radius: 4
  }

  math: MATH engine {
    style.fill: "#c8e6c9"
    style.stroke: "#2e7d32"
    style.border-radius: 4
  }

  fport: Fabric Port {
    style.fill: "#ffe0b2"
    style.stroke: "#e65100"
    style.border-radius: 4
  }

  # Control
  cpu -> sched: cmd dispatch
  cpu -> ipcq: MMIO

  # Compute pipeline
  sched -> dma.compute_port: TileToken
  dma -> fs: TileToken
  dma <-> tcm.work: DMA_READ/WRITE\n(HBM ↔ TCM)
  fs <-> tcm.work: fetch/store\n(TCM ↔ reg)
  fs -> gemm: TileToken
  fs -> math: TileToken
  gemm -> fs: TileToken
  math -> fs: TileToken

  # IPCQ data path (drawn in the IPCQ blue, "#0277bd")
  ipcq -> dma.ipcq_port: IpcqDmaToken {style.stroke: "#0277bd"}
  dma -> ipcq.sideband.metax: IpcqMetaArrival {style.stroke: "#0277bd"}

  # IPCQ slot R/W (blue, dashed)
  dma <-> tcm.slot: slot read/write {
    style.stroke: "#0277bd"
    style.stroke-dash: 3
  }

  # Credit via fabric port (purple, dashed)
  ipcq.sideband.crinj -> fport: credit out (16B) {
    style.stroke: "#7b1fa2"
    style.stroke-dash: 5
  }
  fport -> ipcq.sideband.crrcv: credit in (16B) {
    style.stroke: "#7b1fa2"
    style.stroke-dash: 5
  }

  # DMA ↔ Fabric
  dma.wrr <-> fport
}

# NoC router shape, drawn outside the PE container.
noc: NoC Router {
  style.fill: "#f3e5f5"
  style.stroke: "#6a1b9a"
  style.border-radius: 6
}

# Single shape standing in for everything beyond this PE's router.
ext: Other PEs / Cubes / SIPs {
  style.fill: "#ede7f6"
  style.stroke: "#6a1b9a"
  style.border-radius: 6
}

# Off-PE connectivity: the PE's fabric port connects to the NoC router, which
# connects to the external shape. Both links are bidirectional.
pe.fport <-> noc
noc <-> ext