From 81ce55571d84fdd61f17881315989c28837c9f88 Mon Sep 17 00:00:00 2001
From: Yangwook Kang <ywkang80@gmail.com>
Date: Thu, 9 Apr 2026 00:16:24 -0700
Subject: [PATCH] Rename impl names: add builtin. prefix for clear provenance
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- components.yaml: all builtin impls use builtin.xxx naming
- topology.yaml: all impl references updated to builtin.xxx
- builder.py: hardcoded ucie impl → builtin.ucie
- Tests: all impl string references updated

Convention: builtin.<name> for built-in, custom.<name> for user-defined.
382 tests passing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 components.yaml                   | 45 +++++++++++++++++--------------
 src/kernbench/topology/builder.py | 12 ++++-----
 tests/test_component_registry.py  | 14 +++++-----
 tests/test_mmu_component.py       |  8 +++---
 tests/test_pe_components.py       | 26 +++++++++---------
 tests/test_phase_a_components.py  | 32 +++++++++++-----------
 topology.yaml                     | 32 +++++++++++-----------
 7 files changed, 87 insertions(+), 82 deletions(-)
diff --git a/components.yaml b/components.yaml
index ad39d6d..f3ab639 100644
--- a/components.yaml
+++ b/components.yaml
@@ -2,6 +2,10 @@
 # Maps impl names (used in topology.yaml) to Python class paths.
 # Format: impl_name: module.path:ClassName
 #
+# Naming convention:
+#   builtin.<name>  — built-in implementations
+#   custom.<name>   — user-defined implementations
+#
 # ── Adding custom components ──────────────────────────────────────────
 #
 # 1. Create your implementation in:
@@ -10,41 +14,42 @@
 #    Your class must inherit from ComponentBase (or PeEngineBase for PE engines).
 #
 # 2. Register it below under "Custom" with a unique impl name:
-#      my_pe_cpu_v2: kernbench.components.custom.my_pe_cpu:MyPeCpuComponent
+#      custom.my_pe_cpu: kernbench.components.custom.my_pe_cpu:MyPeCpuComponent
 #
 # 3. Reference it in topology.yaml:
-#      pe_cpu: { kind: pe_cpu, impl: my_pe_cpu_v2, attrs: { ... } }
+#      pe_cpu: { kind: pe_cpu, impl: custom.my_pe_cpu, attrs: { ... } }
 #
 # 4. Add unit tests in:
 #      tests/custom/test_<your_component>.py
 #
 # External packages also work — use the full module path:
-#      fast_gemm_v1: my_team.accel.fast_gemm:FastGemmComponent
+#      custom.fast_gemm: my_team.accel.fast_gemm:FastGemmComponent
 # ──────────────────────────────────────────────────────────────────────
 
 components:
   # Infrastructure
-  forwarding_v1:  kernbench.components.builtin.forwarding:TransitComponent
-  switch_v1:      kernbench.components.builtin.forwarding:TransitComponent
-  noc_v1:         kernbench.components.builtin.forwarding:TransitComponent
-  ucie_v1:        kernbench.components.builtin.forwarding:TransitComponent
+  builtin.forwarding:  kernbench.components.builtin.forwarding:TransitComponent
+  builtin.switch:      kernbench.components.builtin.forwarding:TransitComponent
+  builtin.noc:         kernbench.components.builtin.forwarding:TransitComponent
+  builtin.ucie:        kernbench.components.builtin.forwarding:TransitComponent
+
   # IO / Host interface
-  pcie_ep_v1:     kernbench.components.builtin.pcie_ep:PcieEpComponent
-  io_cpu_v1:      kernbench.components.builtin.io_cpu:IoCpuComponent
+  builtin.pcie_ep:     kernbench.components.builtin.pcie_ep:PcieEpComponent
+  builtin.io_cpu:      kernbench.components.builtin.io_cpu:IoCpuComponent
 
   # Cube-level
-  m_cpu_v1:       kernbench.components.builtin.m_cpu:MCpuComponent
-  hbm_ctrl_v1:    kernbench.components.builtin.hbm_ctrl:HbmCtrlComponent
-  sram_v1:        kernbench.components.builtin.sram:SramComponent
+  builtin.m_cpu:       kernbench.components.builtin.m_cpu:MCpuComponent
+  builtin.hbm_ctrl:    kernbench.components.builtin.hbm_ctrl:HbmCtrlComponent
+  builtin.sram:        kernbench.components.builtin.sram:SramComponent
 
   # PE-level
-  pe_cpu_v1:         kernbench.components.builtin.pe_cpu:PeCpuComponent
-  pe_scheduler_v1:   kernbench.components.builtin.pe_scheduler:PeSchedulerComponent
-  pe_dma_v1:         kernbench.components.builtin.pe_dma:PeDmaComponent
-  pe_gemm_v1:        kernbench.components.builtin.pe_gemm:PeGemmComponent
-  pe_math_v1:        kernbench.components.builtin.pe_math:PeMathComponent
-  pe_fetch_store_v1: kernbench.components.builtin.pe_fetch_store:PeFetchStoreComponent
-  pe_mmu_v1:         kernbench.components.builtin.pe_mmu:PeMmuComponent
-  pe_tcm_v1:         kernbench.components.builtin.pe_tcm:PeTcmComponent
+  builtin.pe_cpu:         kernbench.components.builtin.pe_cpu:PeCpuComponent
+  builtin.pe_scheduler:   kernbench.components.builtin.pe_scheduler:PeSchedulerComponent
+  builtin.pe_dma:         kernbench.components.builtin.pe_dma:PeDmaComponent
+  builtin.pe_gemm:        kernbench.components.builtin.pe_gemm:PeGemmComponent
+  builtin.pe_math:        kernbench.components.builtin.pe_math:PeMathComponent
+  builtin.pe_fetch_store: kernbench.components.builtin.pe_fetch_store:PeFetchStoreComponent
+  builtin.pe_mmu:         kernbench.components.builtin.pe_mmu:PeMmuComponent
+  builtin.pe_tcm:         kernbench.components.builtin.pe_tcm:PeTcmComponent
 
   # Custom — add your implementations here
diff --git a/src/kernbench/topology/builder.py b/src/kernbench/topology/builder.py
index c525f4f..ae18f3c 100644
--- a/src/kernbench/topology/builder.py
+++ b/src/kernbench/topology/builder.py
@@ -277,7 +277,7 @@ def _instantiate_io_chiplets(
         for phy in inst["ucie"]["phys"]:
             phy_id = f"{prefix}.ucie-{phy}"
             nodes[phy_id] = Node(
-                id=phy_id, kind="io_ucie", impl="ucie_v1",
+                id=phy_id, kind="io_ucie", impl="builtin.ucie",
                 attrs={"overhead_ns": io_ucie_ns},
                 pos_mm=(cx, noc_y), label=f"IO UCIe-{phy}",
             )
@@ -285,7 +285,7 @@ def _instantiate_io_chiplets(
             for ci in range(io_n_conn):
                 conn_id = f"{phy_id}.conn{ci}"
                 nodes[conn_id] = Node(
-                    id=conn_id, kind="io_ucie_conn", impl="ucie_v1",
+                    id=conn_id, kind="io_ucie_conn", impl="builtin.ucie",
                     attrs={"overhead_ns": 0.0},
                     pos_mm=(cx, noc_y), label=f"IO UCIe-{phy} C{ci}",
                 )
@@ -379,14 +379,14 @@ def _instantiate_cube(
         pid = f"{cp}.ucie-{port}"
         lx, ly = local_pos[f"ucie-{port}"]
         nodes[pid] = Node(
-            id=pid, kind="ucie_port", impl="ucie_v1",
+            id=pid, kind="ucie_port", impl="builtin.ucie",
             attrs={"overhead_ns": ucie_ns}, pos_mm=(ox + lx, oy + ly),
             label=f"UCIe-{port}",
         )
         for ci in range(ucie_n_conn):
             conn_id = f"{cp}.ucie-{port}.conn{ci}"
             nodes[conn_id] = Node(
-                id=conn_id, kind="ucie_conn", impl="ucie_v1",
+                id=conn_id, kind="ucie_conn", impl="builtin.ucie",
                 attrs={"overhead_ns": 0.0},
                 pos_mm=(ox + lx, oy + ly),
                 label=f"UCIe-{port} C{ci}",
@@ -937,13 +937,13 @@ def _build_cube_view(spec: dict) -> ViewGraph:
         pid = f"ucie-{port}"
         lx, ly = local_pos[pid]
         nodes[pid] = Node(
-            id=pid, kind="ucie_port", impl="ucie_v1",
+            id=pid, kind="ucie_port", impl="builtin.ucie",
             attrs={}, pos_mm=(lx, ly), label=f"UCIe-{port}",
         )
         for ci in range(ucie_n_conn):
             conn_id = f"ucie-{port}.conn{ci}"
             nodes[conn_id] = Node(
-                id=conn_id, kind="ucie_conn", impl="ucie_v1",
+                id=conn_id, kind="ucie_conn", impl="builtin.ucie",
                 attrs={"overhead_ns": 0.0}, pos_mm=(lx, ly),
                 label=f"UCIe-{port} C{ci}",
             )
diff --git a/tests/test_component_registry.py b/tests/test_component_registry.py
index 4e344a5..055bb6d 100644
--- a/tests/test_component_registry.py
+++ b/tests/test_component_registry.py
@@ -55,7 +55,7 @@ def test_registry_unknown_impl_raises_error():
 
 def test_transit_component_yields_overhead_ns():
     """TransitComponent.run() yields exactly node.attrs['overhead_ns'] ns."""
-    node = _node("forwarding_v1", overhead_ns=3.0)
+    node = _node("builtin.forwarding", overhead_ns=3.0)
     comp = TransitComponent(node)
     env = simpy.Environment()
 
@@ -69,7 +69,7 @@ def test_transit_component_yields_overhead_ns():
 
 def test_transit_component_zero_overhead_ns():
     """TransitComponent with overhead_ns=0 still yields (no infinite loop)."""
-    node = _node("noc_v1", overhead_ns=0.0)
+    node = _node("builtin.noc", overhead_ns=0.0)
     comp = TransitComponent(node)
     env = simpy.Environment()
 
@@ -100,7 +100,7 @@ def test_engine_component_override_is_called():
 
     SpyXbar.calls = 0
     graph = _graph()
-    engine = GraphEngine(graph, component_overrides={"forwarding_v1": SpyXbar})
+    engine = GraphEngine(graph, component_overrides={"builtin.forwarding": SpyXbar})
     msg = MemoryReadMsg(
         correlation_id="c", request_id="r",
         src_sip=0, src_cube=0, src_pe=0,
@@ -108,7 +108,7 @@ def test_engine_component_override_is_called():
     )
     h = engine.submit(msg)
     engine.wait(h)
-    # Path passes through router nodes (impl=forwarding_v1)
+    # Path passes through router nodes (impl=builtin.forwarding)
     assert SpyXbar.calls > 0
 
 
@@ -140,7 +140,7 @@ def test_engine_component_model_latency():
 
 
 def test_engine_override_is_scoped_to_impl():
-    """forwarding_v1 override (ZeroRouter, no overhead) reduces total_ns.
+    """builtin.forwarding override (ZeroRouter, no overhead) reduces total_ns.
 
     Router nodes have overhead_ns=2.0. Replacing with zero-latency impl
     removes router overhead from the path.
@@ -152,7 +152,7 @@ def test_engine_override_is_scoped_to_impl():
 
     graph = _graph()
     engine_default = GraphEngine(graph)
-    engine_override = GraphEngine(graph, component_overrides={"forwarding_v1": ZeroRouter})
+    engine_override = GraphEngine(graph, component_overrides={"builtin.forwarding": ZeroRouter})
 
     msg = MemoryReadMsg(
         correlation_id="c", request_id="r",
@@ -168,5 +168,5 @@ def test_engine_override_is_scoped_to_impl():
     engine_override.wait(h_o)
     _, t_override = engine_override.get_completion(h_o)
 
-    # ZeroRouter removes overhead from all forwarding_v1 nodes in path.
+    # ZeroRouter removes overhead from all builtin.forwarding nodes in path.
     assert t_override["total_ns"] < t_default["total_ns"]
diff --git a/tests/test_mmu_component.py b/tests/test_mmu_component.py
index b4ec8ed..8ab98d5 100644
--- a/tests/test_mmu_component.py
+++ b/tests/test_mmu_component.py
@@ -71,7 +71,7 @@ def test_mmu_unmap_msg_fields():
 
 
 def test_pe_mmu_registry():
-    """pe_mmu_v1 impl resolves in ComponentRegistry."""
+    """builtin.pe_mmu impl resolves in ComponentRegistry."""
     from kernbench.components.base import ComponentRegistry
     from kernbench.components.builtin.pe_mmu import PeMmuComponent
     from kernbench.topology.types import Node
@@ -79,7 +79,7 @@ def test_pe_mmu_registry():
     node = Node(
         id="sip0.cube0.pe0.pe_mmu",
         kind="pe_mmu",
-        impl="pe_mmu_v1",
+        impl="builtin.pe_mmu",
         pos_mm=None,
         attrs={"tlb_overhead_ns": 0.5},
     )
@@ -101,7 +101,7 @@ def test_pe_mmu_processes_map_msg():
     node = Node(
         id="sip0.cube0.pe0.pe_mmu",
         kind="pe_mmu",
-        impl="pe_mmu_v1",
+        impl="builtin.pe_mmu",
         pos_mm=None,
         attrs={"tlb_overhead_ns": 0.5, "page_size": 4096},
     )
@@ -158,7 +158,7 @@ def test_pe_dma_translates_va():
     node = Node(
         id="sip0.cube0.pe0.pe_dma",
         kind="pe_dma",
-        impl="pe_dma_v1",
+        impl="builtin.pe_dma",
         pos_mm=None,
         attrs={"rd_engines": 1, "wr_engines": 1},
     )
diff --git a/tests/test_pe_components.py b/tests/test_pe_components.py
index f6d7685..eceb27b 100644
--- a/tests/test_pe_components.py
+++ b/tests/test_pe_components.py
@@ -65,15 +65,15 @@ def _hbm_pa(sip: int = 0, cube: int = 0, pe_id: int = 0) -> int:
 def test_pe_registry_resolves_all():
     """All 6 PE component impl strings must resolve to their specific classes."""
     expected = {
-        "pe_cpu_v1": PeCpuComponent,
-        "pe_scheduler_v1": PeSchedulerComponent,
-        "pe_dma_v1": PeDmaComponent,
-        "pe_gemm_v1": PeGemmComponent,
-        "pe_math_v1": PeMathComponent,
-        "pe_tcm_v1": PeTcmComponent,
+        "builtin.pe_cpu": PeCpuComponent,
+        "builtin.pe_scheduler": PeSchedulerComponent,
+        "builtin.pe_dma": PeDmaComponent,
+        "builtin.pe_gemm": PeGemmComponent,
+        "builtin.pe_math": PeMathComponent,
+        "builtin.pe_tcm": PeTcmComponent,
     }
     for impl, cls in expected.items():
-        node = Node(id=f"test.{impl}", kind=impl.replace("_v1", ""),
+        node = Node(id=f"test.{impl}", kind=impl.split(".", 1)[-1],  # drop namespace prefix
                      impl=impl, pos_mm=None, attrs={})
         comp = ComponentRegistry.create(node)
         assert isinstance(comp, cls), f"{impl} resolved to {type(comp)}, expected {cls}"
@@ -90,7 +90,7 @@ def test_pe_dma_dual_channel_concurrent():
     """
     env = simpy.Environment()
     node = Node(id="sip0.cube0.pe0.pe_dma", kind="pe_dma",
-                impl="pe_dma_v1", pos_mm=None,
+                impl="builtin.pe_dma", pos_mm=None,
                 attrs={"rd_engines": 1, "wr_engines": 1})
     comp = PeDmaComponent(node)
 
@@ -151,7 +151,7 @@ def test_pe_dma_same_channel_serializes():
     """Two READ operations on the same PE_DMA must serialize (capacity=1)."""
     env = simpy.Environment()
     node = Node(id="sip0.cube0.pe0.pe_dma", kind="pe_dma",
-                impl="pe_dma_v1", pos_mm=None,
+                impl="builtin.pe_dma", pos_mm=None,
                 attrs={"rd_engines": 1, "wr_engines": 1})
     comp = PeDmaComponent(node)
 
@@ -219,11 +219,11 @@ def test_pe_accel_shared_slot():
 
     pe_prefix = "sip0.cube0.pe0"
     gemm_node = Node(
-        id=f"{pe_prefix}.pe_gemm", kind="pe_gemm", impl="pe_gemm_v1",
+        id=f"{pe_prefix}.pe_gemm", kind="pe_gemm", impl="builtin.pe_gemm",
         pos_mm=None, attrs={"overhead_ns": 10.0, "shared_resource": "accel_slot"},
     )
     math_node = Node(
-        id=f"{pe_prefix}.pe_math", kind="pe_math", impl="pe_math_v1",
+        id=f"{pe_prefix}.pe_math", kind="pe_math", impl="builtin.pe_math",
         pos_mm=None, attrs={"overhead_ns": 10.0, "shared_resource": "accel_slot"},
     )
     gemm = PeGemmComponent(gemm_node, ctx)
@@ -302,7 +302,7 @@ def test_pe_gemm_handles_pe_internal_txn():
 
     pe_prefix = "sip0.cube0.pe0"
     gemm_node = Node(
-        id=f"{pe_prefix}.pe_gemm", kind="pe_gemm", impl="pe_gemm_v1",
+        id=f"{pe_prefix}.pe_gemm", kind="pe_gemm", impl="builtin.pe_gemm",
         pos_mm=None, attrs={"overhead_ns": 5.0, "shared_resource": "accel_slot"},
     )
     gemm = PeGemmComponent(gemm_node, ctx)
@@ -343,7 +343,7 @@ def test_pe_math_handles_pe_internal_txn():
 
     pe_prefix = "sip0.cube0.pe0"
     math_node = Node(
-        id=f"{pe_prefix}.pe_math", kind="pe_math", impl="pe_math_v1",
+        id=f"{pe_prefix}.pe_math", kind="pe_math", impl="builtin.pe_math",
         pos_mm=None, attrs={"overhead_ns": 3.0, "shared_resource": "accel_slot"},
     )
     math_comp = PeMathComponent(math_node, ctx)
diff --git a/tests/test_phase_a_components.py b/tests/test_phase_a_components.py
index 466cb73..0330943 100644
--- a/tests/test_phase_a_components.py
+++ b/tests/test_phase_a_components.py
@@ -57,7 +57,7 @@ def _inject(store: simpy.Store, txn: Transaction):
 
 def test_transit_component_run_overhead_ns():
     """TransitComponent.run() yields exactly overhead_ns."""
-    node = _node("forwarding_v1", {"overhead_ns": 7.5})
+    node = _node("builtin.forwarding", {"overhead_ns": 7.5})
     comp = TransitComponent(node)
     env = simpy.Environment()
 
@@ -71,7 +71,7 @@ def test_transit_component_run_overhead_ns():
 
 def test_transit_component_run_zero_overhead_ns():
     """TransitComponent.run() with overhead_ns=0 completes immediately."""
-    node = _node("noc_v1", {"overhead_ns": 0.0})
+    node = _node("builtin.noc", {"overhead_ns": 0.0})
     comp = TransitComponent(node)
     env = simpy.Environment()
     done = []
@@ -91,7 +91,7 @@ def test_transit_component_run_zero_overhead_ns():
 
 def test_io_cpu_component_run_overhead_ns():
     """IoCpuComponent.run() yields exactly overhead_ns."""
-    node = _node("io_cpu_v1", {"overhead_ns": 10.0})
+    node = _node("builtin.io_cpu", {"overhead_ns": 10.0})
     comp = IoCpuComponent(node)
     env = simpy.Environment()
 
@@ -108,7 +108,7 @@ def test_io_cpu_component_run_overhead_ns():
 
 def test_hbm_ctrl_terminal_succeeds_done():
     """HbmCtrlComponent is a terminal node: succeeds txn.done after run()."""
-    node = _node("hbm_ctrl_v1", {"overhead_ns": 0.0, "capacity": 1})
+    node = _node("builtin.hbm_ctrl", {"overhead_ns": 0.0, "capacity": 1})
     comp = HbmCtrlComponent(node)
     env = simpy.Environment()
     done_event = env.event()
@@ -121,7 +121,7 @@ def test_hbm_ctrl_terminal_succeeds_done():
 
 def test_hbm_ctrl_resource_serializes_requests():
     """HbmCtrlComponent with capacity=1 serializes concurrent requests."""
-    node = _node("hbm_ctrl_v1", {"overhead_ns": 5.0, "capacity": 1})
+    node = _node("builtin.hbm_ctrl", {"overhead_ns": 5.0, "capacity": 1})
     comp = HbmCtrlComponent(node)
     env = simpy.Environment()
     in_store: simpy.Store = simpy.Store(env)
@@ -151,7 +151,7 @@ def test_hbm_ctrl_resource_serializes_requests():
 
 def test_sram_terminal_succeeds_done():
     """SramComponent is a terminal node: succeeds txn.done after run()."""
-    node = _node("sram_v1", {"overhead_ns": 2.0})
+    node = _node("builtin.sram", {"overhead_ns": 2.0})
     comp = SramComponent(node)
     env = simpy.Environment()
     done_event = env.event()
@@ -168,7 +168,7 @@ def test_sram_terminal_succeeds_done():
 
 def test_m_cpu_forwards_when_not_terminal():
     """MCpuComponent forwards Transaction to next hop when not terminal."""
-    node = _node("m_cpu_v1", {"overhead_ns": 5.0})
+    node = _node("builtin.m_cpu", {"overhead_ns": 5.0})
     comp = MCpuComponent(node)
     env = simpy.Environment()
 
@@ -213,7 +213,7 @@ def test_m_cpu_forwards_when_not_terminal():
 
 def test_m_cpu_terminal_no_ctx_completes():
     """MCpuComponent without ctx completes txn.done when it is the terminal hop."""
-    node = _node("m_cpu_v1", {"overhead_ns": 0.0})
+    node = _node("builtin.m_cpu", {"overhead_ns": 0.0})
     comp = MCpuComponent(node, ctx=None)
     env = simpy.Environment()
     done_event = env.event()
@@ -228,14 +228,14 @@ def test_m_cpu_terminal_no_ctx_completes():
 
 
 @pytest.mark.parametrize("impl,expected_cls", [
-    ("forwarding_v1", TransitComponent),
-    ("noc_v1",        TransitComponent),
-    ("ucie_v1",       TransitComponent),
-    ("pcie_ep_v1",    PcieEpComponent),
-    ("io_cpu_v1",     IoCpuComponent),
-    ("m_cpu_v1",      MCpuComponent),
-    ("hbm_ctrl_v1",   HbmCtrlComponent),
-    ("sram_v1",       SramComponent),
+    ("builtin.forwarding", TransitComponent),
+    ("builtin.noc",        TransitComponent),
+    ("builtin.ucie",       TransitComponent),
+    ("builtin.pcie_ep",    PcieEpComponent),
+    ("builtin.io_cpu",     IoCpuComponent),
+    ("builtin.m_cpu",      MCpuComponent),
+    ("builtin.hbm_ctrl",   HbmCtrlComponent),
+    ("builtin.sram",       SramComponent),
 ])
 def test_registry_resolves_impl(impl, expected_cls):
     """ComponentRegistry.create() returns the correct concrete class for each impl."""
diff --git a/topology.yaml b/topology.yaml
index fedde91..ad936cc 100644
--- a/topology.yaml
+++ b/topology.yaml
@@ -6,7 +6,7 @@ system:
     count: 2
 
   components:
-    switch: { kind: switch, impl: switch_v1, attrs: { overhead_ns: 5.0 } }
+    switch: { kind: switch, impl: builtin.switch, attrs: { overhead_ns: 5.0 } }
 
   links:
     io_ep_to_switch:
@@ -19,9 +19,9 @@ sip:
 
   iochiplet:
     components:
-      pcie_ep: { kind: pcie_ep, impl: pcie_ep_v1, attrs: { overhead_ns: 5.0 } }
-      io_cpu:  { kind: io_cpu,  impl: io_cpu_v1,  attrs: { overhead_ns: 10.0 } }
-      io_noc:  { kind: io_noc,  impl: forwarding_v1, attrs: { overhead_ns: 0.0 } }
+      pcie_ep: { kind: pcie_ep, impl: builtin.pcie_ep, attrs: { overhead_ns: 5.0 } }
+      io_cpu:  { kind: io_cpu,  impl: builtin.io_cpu,  attrs: { overhead_ns: 10.0 } }
+      io_noc:  { kind: io_noc,  impl: builtin.forwarding, attrs: { overhead_ns: 0.0 } }
     links:
       pcie_ep_to_noc_bw_gbs: 256.0
       pcie_ep_to_noc_mm: 1.0
@@ -60,14 +60,14 @@ cube:
 
   pe_template:
     components:
-      pe_cpu:       { kind: pe_cpu,       impl: pe_cpu_v1,       attrs: { overhead_ns: 2.0 } }
-      pe_scheduler: { kind: pe_scheduler, impl: pe_scheduler_v1, attrs: { overhead_ns: 1.0 } }
-      pe_dma:       { kind: pe_dma,       impl: pe_dma_v1,       attrs: { rd_engines: 1, wr_engines: 1 } }
-      pe_gemm:        { kind: pe_gemm,        impl: pe_gemm_v1,        attrs: { overhead_ns: 0.0, shared_resource: accel_slot, peak_tflops_f16: 8.0 } }
-      pe_math:        { kind: pe_math,        impl: pe_math_v1,        attrs: { overhead_ns: 0.0, shared_resource: accel_slot } }
-      pe_fetch_store: { kind: pe_fetch_store, impl: pe_fetch_store_v1, attrs: { overhead_ns: 0.0 } }
-      pe_mmu:         { kind: pe_mmu,         impl: pe_mmu_v1,         attrs: { tlb_overhead_ns: 0.5, page_size: 4096 } }
-      pe_tcm:         { kind: pe_tcm,         impl: pe_tcm_v1,         attrs: { size_mb: 16, read_bw_gbs: 512.0, write_bw_gbs: 512.0 } }
+      pe_cpu:       { kind: pe_cpu,       impl: builtin.pe_cpu,       attrs: { overhead_ns: 2.0 } }
+      pe_scheduler: { kind: pe_scheduler, impl: builtin.pe_scheduler, attrs: { overhead_ns: 1.0 } }
+      pe_dma:       { kind: pe_dma,       impl: builtin.pe_dma,       attrs: { rd_engines: 1, wr_engines: 1 } }
+      pe_gemm:        { kind: pe_gemm,        impl: builtin.pe_gemm,        attrs: { overhead_ns: 0.0, shared_resource: accel_slot, peak_tflops_f16: 8.0 } }
+      pe_math:        { kind: pe_math,        impl: builtin.pe_math,        attrs: { overhead_ns: 0.0, shared_resource: accel_slot } }
+      pe_fetch_store: { kind: pe_fetch_store, impl: builtin.pe_fetch_store, attrs: { overhead_ns: 0.0 } }
+      pe_mmu:         { kind: pe_mmu,         impl: builtin.pe_mmu,         attrs: { tlb_overhead_ns: 0.5, page_size: 4096 } }
+      pe_tcm:         { kind: pe_tcm,         impl: builtin.pe_tcm,         attrs: { size_mb: 16, read_bw_gbs: 512.0, write_bw_gbs: 512.0 } }
     links:
       pe_cpu_to_scheduler_mm:  0.5
       scheduler_to_dma_mm:     0.5
@@ -99,10 +99,10 @@ cube:
     hbm_channel_bw_gbs: 32.0          # per-channel bandwidth (GB/s)
 
   components:
-    noc_router: { kind: noc_router, impl: forwarding_v1, attrs: { overhead_ns: 2.0 } }
-    m_cpu:      { kind: m_cpu,   impl: m_cpu_v1,   attrs: { overhead_ns: 5.0 } }
-    hbm_ctrl:   { kind: hbm_ctrl, impl: hbm_ctrl_v1, attrs: { capacity: 1, efficiency: 1.0 } }
-    sram:       { kind: sram,     impl: sram_v1,     attrs: { size_mb: 32, overhead_ns: 2.0 } }
+    noc_router: { kind: noc_router, impl: builtin.forwarding, attrs: { overhead_ns: 2.0 } }
+    m_cpu:      { kind: m_cpu,   impl: builtin.m_cpu,   attrs: { overhead_ns: 5.0 } }
+    hbm_ctrl:   { kind: hbm_ctrl, impl: builtin.hbm_ctrl, attrs: { capacity: 1, efficiency: 1.0 } }
+    sram:       { kind: sram,     impl: builtin.sram,     attrs: { size_mb: 32, overhead_ns: 2.0 } }
 
   # Physical placement of non-PE components (mm coordinates)
   placement: