Add tl.recv_no_consume diagnostic API for apples-to-apples pe2pe plot
The pe2pe overview compared IPCQ (tl.send + tl.recv) against raw DMA (tl.load + tl.store), but DMA is one-sided — DST never reads — while tl.recv pays a slot-read on DST. The comparison was unfair: IPCQ looked slower partly because it does more work. Adds tl.recv_no_consume() — a separate, diagnostic-only entry point that blocks for slot arrival but skips the slot-read (and bank-hop) charge on DST. Production tl.recv is unchanged (no `consume` kwarg on the public API), so the diagnostic flag can never accidentally leak into real workloads. Updates test_pe_to_pe_latency to call tl.recv_no_consume so the overview.png shows IPCQ no-consume vs raw DMA on equal footing. Also fixes PLOT_DIR back to docs/diagrams/pe2pe_latency_plots/ (was lost in a merge). Adds scripts/replot_pe2pe.py for label-only re-renders without re-measuring. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,7 +1,12 @@
|
||||
"""PE-to-PE latency sweep across hop types and data sizes.
|
||||
|
||||
Compares IPCQ send/recv vs raw-DMA (tl.load + tl.store) latency for four
|
||||
hop types:
|
||||
hop types. The IPCQ path uses ``tl.recv_no_consume(...)`` so that DST
|
||||
does not pay the slot-read latency — apples-to-apples with the DMA
|
||||
path, which is a one-sided write that has no read on DST.
|
||||
|
||||
``tl.recv_no_consume`` is a DIAGNOSTIC-only entry point that exists
|
||||
solely to draw this graph; production kernels use ``tl.recv``.
|
||||
|
||||
H1 Intra-cube horizontal pe0 → pe1
|
||||
H2 Intra-cube vertical pe0 → pe4
|
||||
@@ -28,7 +33,9 @@ from kernbench.sim_engine.engine import GraphEngine
|
||||
from kernbench.topology.builder import resolve_topology
|
||||
|
||||
TOPOLOGY_PATH = Path(__file__).parent.parent / "topology.yaml"
|
||||
PLOT_DIR = Path(__file__).parent / "pe2pe_latency_plots"
|
||||
PLOT_DIR = (
|
||||
Path(__file__).parent.parent / "docs" / "diagrams" / "pe2pe_latency_plots"
|
||||
)
|
||||
|
||||
SIZES = [128, 256, 384, 512, 768, 1024, 2048, 4096, 8192, 10240]
|
||||
|
||||
@@ -101,7 +108,12 @@ def _measure_ipcq(hop: Hop, nbytes: int) -> float:
|
||||
data = tl.load(t_ptr, shape=(n_elem,), dtype="f16")
|
||||
tl.send(dir=send_dir, src=data)
|
||||
elif cube_id == dst_cube and pe_id == dst_pe:
|
||||
tl.recv(dir=recv_dir, shape=(n_elem,), dtype="f16")
|
||||
# tl.recv_no_consume: DST blocks until bytes land in
|
||||
# slot but skips slot-read latency. Apples-to-apples
|
||||
# with the raw-DMA path below, which has no DST read.
|
||||
# Diagnostic-only — production kernels use tl.recv.
|
||||
tl.recv_no_consume(dir=recv_dir,
|
||||
shape=(n_elem,), dtype="f16")
|
||||
|
||||
tensors = []
|
||||
for s in sorted({src_sip, dst_sip}):
|
||||
@@ -238,7 +250,8 @@ def _plot_per_hop(records, hop: Hop, path: Path) -> None:
|
||||
ax.plot(
|
||||
[r["size_bytes"] for r in ipcq],
|
||||
[r["total_ns"] for r in ipcq],
|
||||
marker="o", label="IPCQ (send/recv)", color="tab:blue",
|
||||
marker="o", label="IPCQ no-consume (send/recv, no slot read)",
|
||||
color="tab:blue",
|
||||
)
|
||||
if raw:
|
||||
ax.plot(
|
||||
@@ -275,13 +288,13 @@ def _plot_overview(records, path: Path) -> None:
|
||||
ax.plot(
|
||||
[r["size_bytes"] for r in ipcq],
|
||||
[r["total_ns"] for r in ipcq],
|
||||
marker="o", label="IPCQ", color="tab:blue",
|
||||
marker="o", label="IPCQ no-consume", color="tab:blue",
|
||||
)
|
||||
if raw:
|
||||
ax.plot(
|
||||
[r["size_bytes"] for r in raw],
|
||||
[r["total_ns"] for r in raw],
|
||||
marker="s", label="Raw", color="tab:orange",
|
||||
marker="s", label="Raw DMA", color="tab:orange",
|
||||
)
|
||||
ax.set_title(hop.label, fontsize=10)
|
||||
ax.set_xlabel("bytes")
|
||||
@@ -291,7 +304,7 @@ def _plot_overview(records, path: Path) -> None:
|
||||
for j in range(len(HOPS), len(axes)):
|
||||
axes[j].axis("off")
|
||||
fig.suptitle(
|
||||
"PE-to-PE latency: IPCQ vs raw DMA",
|
||||
"PE-to-PE latency: IPCQ no-consume vs raw DMA",
|
||||
fontsize=14,
|
||||
)
|
||||
fig.tight_layout()
|
||||
@@ -307,7 +320,8 @@ def test_pe_to_pe_latency_sweep():
|
||||
|
||||
for hop in HOPS:
|
||||
for size in SIZES:
|
||||
# IPCQ path
|
||||
# IPCQ path uses tl.recv_no_consume — apples-to-apples
|
||||
# with the raw-DMA path, which has no DST read either.
|
||||
ipcq_ns = _measure_ipcq(hop, size)
|
||||
records.append({
|
||||
"hop": hop.id, "label": hop.label,
|
||||
|
||||
Reference in New Issue
Block a user