Add reverse path response latency for PE DMA and PE_CPU→M_CPU
Model fabric response hop latency for PE-internal operations:
- HBM_CTRL sends a PeDmaMsg response on the reverse path instead of a direct done signal
- PE_CPU sends a ResponseMsg via NOC→M_CPU on kernel completion
- Add NOC→PE_DMA and PE_CPU→NOC edges in the topology builder
- Make HBM BW test assertions dynamic, based on topology efficiency

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
+16
-5
@@ -143,6 +143,14 @@ def _graph():
|
||||
return load_topology(TOPOLOGY_PATH)
|
||||
|
||||
|
||||
def _hbm_effective_bw() -> float:
    """Return the effective HBM bandwidth implied by the topology spec.

    The result is the cube's ``xbar_to_hbm_bw_gbs`` link bandwidth multiplied
    by the ``efficiency`` attribute of the ``hbm_ctrl`` component. When the
    component has no ``attrs`` mapping, or the mapping has no ``efficiency``
    entry, the factor falls back to 1.0.
    """
    cube_spec = _graph().spec["cube"]
    link_bw = cube_spec["links"]["xbar_to_hbm_bw_gbs"]
    hbm_attrs = cube_spec["components"]["hbm_ctrl"].get("attrs", {})
    return link_bw * hbm_attrs.get("efficiency", 1.0)
|
||||
|
||||
|
||||
def _pe_dma_latency(src_cube: int, src_pe: int, dst_pe: int) -> float:
|
||||
engine = _engine()
|
||||
msg = PeDmaMsg(
|
||||
@@ -190,15 +198,17 @@ def test_pe_dma_local_completes():
|
||||
|
||||
|
||||
def test_pe_dma_local_bottleneck_hbm():
    """PE DMA pe0→slice0 (local): bottleneck = HBM effective BW."""
    bn = _pe_dma_bottleneck(src_cube=0, src_pe=0, dst_pe=0)
    # The expectation is derived from the topology spec instead of the old
    # hard-coded 204.8, so the test follows changes to the link bandwidth or
    # the HBM efficiency factor.
    expected = _hbm_effective_bw()
    assert bn == expected, f"Local PE DMA bottleneck {bn}, expected {expected}"
|
||||
|
||||
|
||||
def test_pe_dma_same_half_bottleneck_hbm():
    """PE DMA pe0→slice1 (same half via xbar_top): bottleneck = HBM effective BW."""
    bn = _pe_dma_bottleneck(src_cube=0, src_pe=0, dst_pe=1)
    # The expectation is derived from the topology spec instead of the old
    # hard-coded 204.8, so the test follows changes to the link bandwidth or
    # the HBM efficiency factor.
    expected = _hbm_effective_bw()
    assert bn == expected, f"Same-half PE DMA bottleneck {bn}, expected {expected}"
|
||||
|
||||
|
||||
def test_pe_dma_deterministic():
|
||||
@@ -311,12 +321,13 @@ def test_d2h_latency_gte_h2d():
|
||||
|
||||
|
||||
def test_hbm_efficiency_applied():
    """HBM edge BW should reflect efficiency factor from topology spec."""
    graph = _graph()
    # Index edges by (src, dst) so the specific xbar→HBM edge can be looked up.
    edge_map = {(e.src, e.dst): e for e in graph.edges}
    e = edge_map.get(("sip0.cube0.xbar_top", "sip0.cube0.hbm_ctrl.slice0"))
    assert e is not None, "xbar_top -> hbm_ctrl.slice0 edge missing"
    # Compare against the spec-derived bandwidth rather than a literal 204.8,
    # which was only valid for one particular bandwidth/efficiency pair.
    expected = _hbm_effective_bw()
    assert e.bw_gbs == expected, f"HBM edge BW {e.bw_gbs}, expected {expected}"
|
||||
|
||||
|
||||
# ── 11. Sweep saturation ──────────────────────────────────────
|
||||
|
||||
@@ -29,19 +29,19 @@ def test_full_graph_node_count():
|
||||
|
||||
def test_full_graph_edge_count():
    """Check the total edge count of the full topology graph.

    The expected total is tallied in the comments below; it includes the
    reverse-path noc→pe_dma and pe_cpu→noc edges.
    """
    g = _graph()
    # Per cube: 184
    #   PE-internal: 56
    #   PE_DMA→noc: 8, noc→pe_dma: 8, noc→pe_cpu: 8, pe_cpu→noc: 8
    #   xbar_top→hbm{0..3}: 4+4=8, xbar_bot→hbm{4..7}: 4+4=8
    #   noc↔xbar_top: 2, noc↔xbar_bot: 2
    #   xbar_top↔bridge.left: 2, bridge.left↔xbar_bot: 2
    #   xbar_top↔bridge.right: 2, bridge.right↔xbar_bot: 2
    #   ucie: 64, m_cpu↔noc: 2, noc↔sram: 2
    #   Total: 56+8+8+8+8+8+8+2+2+2+2+2+2+64+2+2 = 184
    # IO edges per SIP: 77
    # Per SIP: 16*184 + 48 inter-cube + 77 IO = 3069
    # Total: 2 * 3069 = 6138
    assert len(g.edges) == 6138
|
||||
|
||||
|
||||
# ── Full graph: specific nodes exist ─────────────────────────────────
|
||||
|
||||
Reference in New Issue
Block a user