diff --git a/cutedsl_loader/cutedsl b/cutedsl_loader/cutedsl
deleted file mode 120000
index 0a2ab70e..00000000
--- a/cutedsl_loader/cutedsl
+++ /dev/null
@@ -1 +0,0 @@
-/root/dsv4-nvfp4-workspace/kernel/cutedsl
\ No newline at end of file
diff --git a/cutedsl/__init__.py b/dsv4/__init__.py
similarity index 100%
rename from cutedsl/__init__.py
rename to dsv4/__init__.py
diff --git a/cutedsl/kernel/__init__.py b/dsv4/cache/__init__.py
similarity index 100%
rename from cutedsl/kernel/__init__.py
rename to dsv4/cache/__init__.py
diff --git a/dsv4/cache/block_table.py b/dsv4/cache/block_table.py
new file mode 100644
index 00000000..b48028f8
--- /dev/null
+++ b/dsv4/cache/block_table.py
@@ -0,0 +1,2 @@
+"""Block table for paged KV cache."""
+# TODO: Phase 3
diff --git a/dsv4/cache/paged_cache.py b/dsv4/cache/paged_cache.py
new file mode 100644
index 00000000..fce4b419
--- /dev/null
+++ b/dsv4/cache/paged_cache.py
@@ -0,0 +1,2 @@
+"""Paged KV cache."""
+# TODO: Phase 3
diff --git a/dsv4/cache/state_cache.py b/dsv4/cache/state_cache.py
new file mode 100644
index 00000000..e19485ba
--- /dev/null
+++ b/dsv4/cache/state_cache.py
@@ -0,0 +1,2 @@
+"""State cache for KV."""
+# TODO: Phase 3
diff --git a/cutedsl/kernel/blockscaled_gemm/__init__.py b/dsv4/kernels/__init__.py
similarity index 100%
rename from cutedsl/kernel/blockscaled_gemm/__init__.py
rename to dsv4/kernels/__init__.py
diff --git a/cutedsl/kernel/moe/__init__.py b/dsv4/kernels/attention/__init__.py
similarity index 100%
rename from cutedsl/kernel/moe/__init__.py
rename to dsv4/kernels/attention/__init__.py
diff --git a/dsv4/kernels/attention/fmha.py b/dsv4/kernels/attention/fmha.py
new file mode 100644
index 00000000..263ef354
--- /dev/null
+++ b/dsv4/kernels/attention/fmha.py
@@ -0,0 +1,2 @@
+"""FMHA kernel: QK -> online softmax -> PV (CuTeDSL, Stage B+). To be extracted from test_fmha_v3.py."""
+# TODO: Extract FmhaV3 kernel class here
diff --git a/dsv4/kernels/compressor/__init__.py b/dsv4/kernels/compressor/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/cutedsl/csa_hca_compressor.py b/dsv4/kernels/compressor/csa_hca.py
similarity index 100%
rename from cutedsl/csa_hca_compressor.py
rename to dsv4/kernels/compressor/csa_hca.py
diff --git a/dsv4/kernels/cuda/__init__.py b/dsv4/kernels/cuda/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/cutedsl/kernels/deinterleave_quantize.cu b/dsv4/kernels/cuda/deinterleave_quantize.cu
similarity index 100%
rename from cutedsl/kernels/deinterleave_quantize.cu
rename to dsv4/kernels/cuda/deinterleave_quantize.cu
diff --git a/cutedsl/kernels/sparse_topk_metadata.cu b/dsv4/kernels/cuda/sparse_topk_metadata.cu
similarity index 100%
rename from cutedsl/kernels/sparse_topk_metadata.cu
rename to dsv4/kernels/cuda/sparse_topk_metadata.cu
diff --git a/cutedsl/fp8_bf16.py b/dsv4/kernels/decode/_NOTES_fp8_bf16.md
similarity index 99%
rename from cutedsl/fp8_bf16.py
rename to dsv4/kernels/decode/_NOTES_fp8_bf16.md
index 828ab66a..ac373f82 100644
--- a/cutedsl/fp8_bf16.py
+++ b/dsv4/kernels/decode/_NOTES_fp8_bf16.md
@@ -1,4 +1,4 @@
-"""
+
 FP8 E4M3 -> BF16 conversion for CuTeDSL on Blackwell (SM100+).
 
 STATUS: NOT USABLE INSIDE CUTE KERNELS.
@@ -23,4 +23,4 @@ or when we can properly construct vector<4xf8E4M3FN> inside kernel code,
 we can fuse the dequant into the attention kernel. The PTX instruction
 exists (cvt.rn.bf16x2.e4m3x2), but CuTeDSL's AST preprocessor currently
 prevents us from injecting the necessary MLIR ops.
-"""
+
diff --git a/dsv4/kernels/decode/__init__.py b/dsv4/kernels/decode/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/dsv4/kernels/gemm/__init__.py b/dsv4/kernels/gemm/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/cutedsl/kernel/blockscaled_gemm/dense_blockscaled_gemm_persistent.py b/dsv4/kernels/gemm/dense.py
similarity index 100%
rename from cutedsl/kernel/blockscaled_gemm/dense_blockscaled_gemm_persistent.py
rename to dsv4/kernels/gemm/dense.py
diff --git a/cutedsl/kernel/moe/fused_swiglu_grouped_mm.py b/dsv4/kernels/gemm/fused_swiglu.py
similarity index 99%
rename from cutedsl/kernel/moe/fused_swiglu_grouped_mm.py
rename to dsv4/kernels/gemm/fused_swiglu.py
index 0866f694..bb0e467c 100644
--- a/cutedsl/kernel/moe/fused_swiglu_grouped_mm.py
+++ b/dsv4/kernels/gemm/fused_swiglu.py
@@ -60,15 +60,15 @@ if __name__ == "__main__":
     current_dir = os.path.dirname(os.path.abspath(__file__))
     sys.path.insert(0, os.path.join(current_dir, "../../.."))
 
-from cutedsl.kernel.moe.moe_utils import (
+from dsv4.kernels.gemm.utils import (
     MoEScaledGroupedGemmTensormapConstructor,
 )
-from cutedsl.kernel.moe.moe_persistent_scheduler import (
+from dsv4.kernels.gemm.scheduler import (
     MoEStaticSchedulerParams,
     MoEStaticPersistentTileScheduler,
     MoEWorkTileInfo,
 )
-from cutedsl.kernel.moe.moe_sched_extension import ScaledGroupedMmSchedExtension
+from dsv4.kernels.gemm.sched_extension import ScaledGroupedMmSchedExtension
 import cutlass.utils.blackwell_helpers as sm100_utils
 import cutlass.utils.blockscaled_layout as blockscaled_utils
 from cutlass.utils.gemm.sm100 import (
@@ -3665,7 +3665,7 @@ class ScaledGroupedGemmTester:
         if _examples_root not in sys.path:
             sys.path.insert(0, _examples_root)
 
-        from cutedsl.kernel.blockscaled_gemm.dense_blockscaled_gemm_persistent import (
+        from dsv4.kernels.gemm.dense import (
             Sm100BlockScaledPersistentDenseGemmKernel,
         )
         from cutlass.cute.nvgpu import OperandMajorMode
diff --git a/cutedsl/kernel/moe/torch_scaled_grouped_mm.py b/dsv4/kernels/gemm/grouped.py
similarity index 99%
rename from cutedsl/kernel/moe/torch_scaled_grouped_mm.py
rename to dsv4/kernels/gemm/grouped.py
index 570a2a1a..c1f7c1d8 100644
--- a/cutedsl/kernel/moe/torch_scaled_grouped_mm.py
+++ b/dsv4/kernels/gemm/grouped.py
@@ -60,15 +60,15 @@ if __name__ == "__main__":
     current_dir = os.path.dirname(os.path.abspath(__file__))
     sys.path.insert(0, os.path.join(current_dir, "../../.."))
 
-from cutedsl.kernel.moe.moe_utils import (
+from dsv4.kernels.gemm.utils import (
     MoEScaledGroupedGemmTensormapConstructor,
 )
-from cutedsl.kernel.moe.moe_persistent_scheduler import (
+from dsv4.kernels.gemm.scheduler import (
     MoEStaticSchedulerParams,
     MoEStaticPersistentTileScheduler,
     MoEWorkTileInfo,
 )
-from cutedsl.kernel.moe.moe_sched_extension import ScaledGroupedMmSchedExtension
+from dsv4.kernels.gemm.sched_extension import ScaledGroupedMmSchedExtension
 import cutlass.utils.blackwell_helpers as sm100_utils
 import cutlass.utils.blockscaled_layout as blockscaled_utils
 from cutlass.utils.gemm.sm100 import (
@@ -3608,7 +3608,7 @@ class ScaledGroupedGemmTester:
         if _examples_root not in sys.path:
             sys.path.insert(0, _examples_root)
 
-        from cutedsl.kernel.blockscaled_gemm.dense_blockscaled_gemm_persistent import (
+        from dsv4.kernels.gemm.dense import (
             Sm100BlockScaledPersistentDenseGemmKernel,
         )
         from cutlass.cute.nvgpu import OperandMajorMode
diff --git a/cutedsl/kernel/moe/moe_sched_extension.py b/dsv4/kernels/gemm/sched_extension.py
similarity index 99%
rename from cutedsl/kernel/moe/moe_sched_extension.py
rename to dsv4/kernels/gemm/sched_extension.py
index ce05431b..82ff2ceb 100644
--- a/cutedsl/kernel/moe/moe_sched_extension.py
+++ b/dsv4/kernels/gemm/sched_extension.py
@@ -73,14 +73,14 @@ from cutlass.cutlass_dsl import Int32
 from dataclasses import dataclass
 
 from cutlass.utils.blockscaled_layout import tile_atom_to_shape_SF
-from cutedsl.kernel.moe.moe_utils import (
+from dsv4.kernels.gemm.utils import (
     OnlineTensormapDescCreator,
     tensormap_ptr_for_copy,
     compute_expert_token_range,
     rewrite_tensor_shape,
     prefetch_tma_descriptor,
 )
-from cutedsl.kernel.moe.moe_persistent_scheduler import MoEWorkTileInfo
+from dsv4.kernels.gemm.scheduler import MoEWorkTileInfo
 
 
 @dataclass(frozen=True)
diff --git a/cutedsl/kernel/moe/moe_persistent_scheduler.py b/dsv4/kernels/gemm/scheduler.py
similarity index 100%
rename from cutedsl/kernel/moe/moe_persistent_scheduler.py
rename to dsv4/kernels/gemm/scheduler.py
diff --git a/cutedsl/kernel/moe/moe_utils.py b/dsv4/kernels/gemm/utils.py
similarity index 100%
rename from cutedsl/kernel/moe/moe_utils.py
rename to dsv4/kernels/gemm/utils.py
diff --git a/dsv4/layers/__init__.py b/dsv4/layers/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/dsv4/layers/attention.py b/dsv4/layers/attention.py
new file mode 100644
index 00000000..5ed0a464
--- /dev/null
+++ b/dsv4/layers/attention.py
@@ -0,0 +1,2 @@
+"""DSV4 attention sub-block."""
+# TODO: Phase 3+4
diff --git a/dsv4/layers/embedding.py b/dsv4/layers/embedding.py
new file mode 100644
index 00000000..4af9f9b6
--- /dev/null
+++ b/dsv4/layers/embedding.py
@@ -0,0 +1,2 @@
+"""Token embedding + mHC init wrapper."""
+# TODO: Implement
diff --git a/dsv4/layers/ffn.py b/dsv4/layers/ffn.py
new file mode 100644
index 00000000..8c4e53e2
--- /dev/null
+++ b/dsv4/layers/ffn.py
@@ -0,0 +1,2 @@
+"""FFN: router + MoE + shared expert."""
+# TODO: Phase 2
diff --git a/cutedsl/wo_a_grouped_linear.py b/dsv4/layers/grouped_linear.py
similarity index 96%
rename from cutedsl/wo_a_grouped_linear.py
rename to dsv4/layers/grouped_linear.py
index c90a102a..1142f5b2 100644
--- a/cutedsl/wo_a_grouped_linear.py
+++ b/dsv4/layers/grouped_linear.py
@@ -14,22 +14,26 @@ CUDA-graph-compatible: all buffers pre-allocated, no CPU-GPU syncs.
 
 import torch
 
-from cutedsl.bridge import (
+from dsv4.ops.quantize import (
     quantize_activation_nvfp4,
     quantize_weight_to_nvfp4,
+)
+from dsv4.ops.layouts import (
     make_b_k_major,
     assemble_scales_2d_side,
     assemble_scales_3d_side,
+)
+from dsv4.ops.gemm_runner import (
     run_nvfp4_grouped_gemm,
 )
-from cutedsl.kernel.moe.torch_scaled_grouped_mm import (
+from dsv4.ops.layouts import (
     ceil_div as cutedsl_ceil_div,
     pad_and_swizzle_single,
 )
-from cutedsl.custom_ops import register_runner, nvfp4_linear_gemm
+from dsv4.ops.custom_ops import register_runner, nvfp4_linear_gemm
 
 
-class CuTeDSLNvfp4WoA:
+class Nvfp4GroupedLinear:
     """Grouped NVFP4 linear for wo_a (o-projection first half).
 
     Handles the "bhr,hdr->bhd" einsum pattern:
@@ -181,7 +185,9 @@ class CuTeDSLNvfp4WoA:
         # Reshape to grouped format, then flatten to 2D for quantization
         o_grouped = o_sample.reshape(-1, self.n_local_groups, self.group_in_features)
         # We need a single gs for all groups — use the overall amax
-        from cutedsl.bridge import quantize_to_nvfp4
+        from dsv4.ops.quantize import (
+            quantize_to_nvfp4,
+        )
         o_flat = o_sample.reshape(-1, o_sample.shape[-1])  # (tokens, n_local_heads * head_dim) — not right
         # Actually, for grouped GEMM, each group's activation is (tokens, group_in_features)
         # The global scale should be computed per-group, but for simplicity use one scale
@@ -256,7 +262,9 @@ class CuTeDSLNvfp4WoA:
         # Assemble A-side scales for all groups
         # The grouped GEMM expects scales for all groups assembled together
         # For 2Dx3D scenario, scale_a is assembled from per-group scale tensors
-        from cutedsl.bridge import assemble_scales_2d_side
+        from dsv4.ops.layouts import (
+            assemble_scales_2d_side,
+        )
         scale_a = assemble_scales_2d_side(all_x_sf)
 
         # Expert offsets: cumulative [padded_T, 2*padded_T, ..., n_groups*padded_T]
diff --git a/cutedsl/nvfp4_linear.py b/dsv4/layers/linear.py
similarity index 95%
rename from cutedsl/nvfp4_linear.py
rename to dsv4/layers/linear.py
index 3795c083..9d7e9d0a 100644
--- a/cutedsl/nvfp4_linear.py
+++ b/dsv4/layers/linear.py
@@ -8,21 +8,25 @@ CUDA-graph-compatible: all buffers pre-allocated, no CPU-GPU syncs.
 
 import torch
 
-from cutedsl.bridge import (
+from dsv4.ops.quantize import (
     quantize_activation_nvfp4,
     quantize_to_nvfp4,
+)
+from dsv4.ops.layouts import (
     make_b_k_major,
     assemble_scales_3d_side,
+)
+from dsv4.ops.gemm_runner import (
     run_nvfp4_grouped_gemm,
 )
-from cutedsl.kernel.moe.torch_scaled_grouped_mm import (
+from dsv4.kernels.gemm.grouped import (
     ceil_div as cutedsl_ceil_div,
     pad_and_swizzle_single,
 )
-from cutedsl.custom_ops import register_runner, nvfp4_linear_gemm
+from dsv4.ops.custom_ops import register_runner, nvfp4_linear_gemm
 
 
-class CuTeDSLNvfp4Linear:
+class Nvfp4Linear:
     """Single NVFP4 GEMM using CuTeDSL (num_groups=1).
 
     Handles any (K, N) weight matrix in NVFP4 format.
@@ -76,7 +80,6 @@ class CuTeDSLNvfp4Linear:
 
         # Eagerly JIT-compile the GEMM kernel for this (K, N) shape.
         # Uses num_groups=1 since this is a single linear layer.
-        # from cutedsl.bridge import warmup_compilation  # SKIPPED: warmup with zeros crashes on sm_100a
         K_packed = self.in_features // 2
         N_packed = self.out_features // 2
         # warmup_compilation(1, K_packed, N_packed, self.device)  # Lazy compile on first real forward
diff --git a/cutedsl/mhc_inference_layer.py b/dsv4/layers/mhc.py
similarity index 100%
rename from cutedsl/mhc_inference_layer.py
rename to dsv4/layers/mhc.py
diff --git a/cutedsl/runner.py b/dsv4/layers/moe.py
similarity index 95%
rename from cutedsl/runner.py
rename to dsv4/layers/moe.py
index bbd0bd2b..7fc38fa1 100644
--- a/cutedsl/runner.py
+++ b/dsv4/layers/moe.py
@@ -15,26 +15,30 @@ processes max_slots = budget * top_k rows; padding rows are zeros.
 """
 import torch
 
-from cutedsl.bridge import (
+from dsv4.ops.quantize import (
     quantize_activation_nvfp4,
     quantize_weight_to_nvfp4,
     quantize_to_nvfp4,
+)
+from dsv4.ops.layouts import (
     make_b_k_major,
     assemble_scales_3d_side,
     interleave_l1_weights,
     deinterleave_l1_weights,
+)
+from dsv4.ops.gemm_runner import (
     run_nvfp4_grouped_gemm,
     run_fused_swiglu_grouped_gemm,
     warmup_fused_swiglu_compilation,
 )
-from cutedsl.kernel.moe.torch_scaled_grouped_mm import (
+from dsv4.ops.layouts import (
     ceil_div as cutedsl_ceil_div,
     pad_and_swizzle_single,
 )
-from cutedsl.custom_ops import register_runner, nvfp4_moe_gemm
+from dsv4.ops.custom_ops import register_runner, nvfp4_moe_gemm
 
 
-class CuTeDSLMoERunner:
+class Nvfp4MoE:
     """Manages NVFP4 MoE execution via the CuTeDSL kernel.
     
     CUDA-graph-compatible: all buffers pre-allocated, no CPU-GPU syncs,
@@ -127,15 +131,15 @@ class CuTeDSLMoERunner:
         
         # Initialize shared buffers dict (if not already)
         device_key = str(self.device)
-        if not hasattr(CuTeDSLMoERunner, '_shared_padded_bufs'):
-            CuTeDSLMoERunner._shared_padded_bufs = {}
-        if device_key not in CuTeDSLMoERunner._shared_padded_bufs:
-            CuTeDSLMoERunner._shared_padded_bufs[device_key] = {}
+        if not hasattr(Nvfp4MoE, '_shared_padded_bufs'):
+            Nvfp4MoE._shared_padded_bufs = {}
+        if device_key not in Nvfp4MoE._shared_padded_bufs:
+            Nvfp4MoE._shared_padded_bufs[device_key] = {}
         
         # Padded x_sf buffers: SHARED across all runners (not per-layer)
         max_sf_rows = self.num_experts * self._max_chunks_per_expert * 128
-        if 'xsf_l1' not in CuTeDSLMoERunner._shared_padded_bufs[device_key]:
-            CuTeDSLMoERunner._shared_padded_bufs[device_key].update({
+        if 'xsf_l1' not in Nvfp4MoE._shared_padded_bufs[device_key]:
+            Nvfp4MoE._shared_padded_bufs[device_key].update({
                 'xsf_l1': torch.zeros(
                     max_sf_rows, padded_cols_l1, dtype=torch.float16, device=self.device
                 ).to(torch.float8_e4m3fn),
@@ -146,9 +150,9 @@ class CuTeDSLMoERunner:
                     self.max_num_tokens, self.hidden_size, dtype=torch.bfloat16, device=self.device
                 ),
             })
-        self._padded_x_sf_buf_l1 = CuTeDSLMoERunner._shared_padded_bufs[device_key]['xsf_l1']
-        self._padded_x_sf_buf_l2 = CuTeDSLMoERunner._shared_padded_bufs[device_key]['xsf_l2']
-        self._output_buf = CuTeDSLMoERunner._shared_padded_bufs[device_key]['output']
+        self._padded_x_sf_buf_l1 = Nvfp4MoE._shared_padded_bufs[device_key]['xsf_l1']
+        self._padded_x_sf_buf_l2 = Nvfp4MoE._shared_padded_bufs[device_key]['xsf_l2']
+        self._output_buf = Nvfp4MoE._shared_padded_bufs[device_key]['output']
         
         # Pre-allocated global_scale_a buffers (filled via .fill_(), no torch.full during capture)
         self._l1_gsa_buf = torch.zeros(self.num_experts, dtype=torch.float32, device=self.device)
@@ -162,8 +166,8 @@ class CuTeDSLMoERunner:
         # Padded hidden/activated: SHARED across all runners (not per-layer)
         max_rows_per_expert = self._max_chunks_per_expert * 128
         padded_max_slots = self.num_experts * max_rows_per_expert
-        if 'hidden' not in CuTeDSLMoERunner._shared_padded_bufs[device_key]:
-            CuTeDSLMoERunner._shared_padded_bufs[device_key].update({
+        if 'hidden' not in Nvfp4MoE._shared_padded_bufs[device_key]:
+            Nvfp4MoE._shared_padded_bufs[device_key].update({
                 'hidden': torch.zeros(
                     padded_max_slots, self.hidden_size, dtype=torch.bfloat16, device=self.device
                 ),
@@ -177,7 +181,7 @@ class CuTeDSLMoERunner:
                     padded_max_slots, self.intermediate_size // 2, dtype=torch.uint8, device=self.device
                 ).view(torch.float4_e2m1fn_x2),
             })
-        self._shared_bufs = CuTeDSLMoERunner._shared_padded_bufs[device_key]
+        self._shared_bufs = Nvfp4MoE._shared_padded_bufs[device_key]
         
         # Padded expert offsets buffer: [0, max_rows, 2*max_rows, ...] (fixed)
         self._padded_expert_offsets_buf = torch.zeros(
@@ -237,7 +241,7 @@ class CuTeDSLMoERunner:
             # assemble_scales_3d_side expects (K_sf, N) per expert and transposes
             # to (N, K_sf) internally. But our scales are already (N, K_sf) from
             # the checkpoint! Skip the transpose by calling the assembly directly.
-            from cutedsl.kernel.moe.torch_scaled_grouped_mm import (
+            from dsv4.ops.layouts import (
                 assemble_raw_scales_2d3d_3d_side,
             )
             self._l1_scale_b = assemble_raw_scales_2d3d_3d_side(l1_sf_list)
@@ -285,7 +289,13 @@ class CuTeDSLMoERunner:
         # This triggers cute.compile once per shape, caching the compiled
         # kernel + workspace. Subsequent run() calls hit the cache.
         # MUST happen before model forward pass to avoid OOM from lazy JIT.
-        from cutedsl.bridge import warmup_compilation, warmup_fused_swiglu_compilation, ceil_div as bridge_ceil_div
+        from dsv4.ops.layouts import (
+            ceil_div as bridge_ceil_div,
+        )
+        from dsv4.ops.gemm_runner import (
+            warmup_compilation,
+            warmup_fused_swiglu_compilation,
+        )
         K_packed = self.hidden_size // 2
         N_packed_l1 = (2 * self.intermediate_size) // 2  # gate+up combined
         N_packed_l2 = self.hidden_size // 2  # down
diff --git a/dsv4/layers/norm.py b/dsv4/layers/norm.py
new file mode 100644
index 00000000..04c95489
--- /dev/null
+++ b/dsv4/layers/norm.py
@@ -0,0 +1,2 @@
+"""RMSNorm placeholder."""
+# TODO: Implement RMSNorm
diff --git a/dsv4/layers/router.py b/dsv4/layers/router.py
new file mode 100644
index 00000000..0897501d
--- /dev/null
+++ b/dsv4/layers/router.py
@@ -0,0 +1,2 @@
+"""Router: sqrt(softplus) + topk + aux-free bias + hash routing."""
+# TODO: Phase 2
diff --git a/cutedsl/shared_expert_pipeline.py b/dsv4/layers/shared_expert.py
similarity index 98%
rename from cutedsl/shared_expert_pipeline.py
rename to dsv4/layers/shared_expert.py
index ce9aa4be..5d51cbba 100644
--- a/cutedsl/shared_expert_pipeline.py
+++ b/dsv4/layers/shared_expert.py
@@ -20,14 +20,18 @@ no dynamic shapes. Padding rows are zeros that contribute nothing to GEMM output
 
 import torch
 
-from cutedsl.bridge import (
+from dsv4.ops.quantize import (
     quantize_activation_nvfp4,
     quantize_to_nvfp4,
+)
+from dsv4.ops.layouts import (
     make_b_k_major,
     assemble_scales_3d_side,
+)
+from dsv4.ops.gemm_runner import (
     run_nvfp4_grouped_gemm,
 )
-from cutedsl.kernel.moe.torch_scaled_grouped_mm import (
+from dsv4.kernels.gemm.grouped import (
     ceil_div as cutedsl_ceil_div,
     pad_and_swizzle_single,
 )
@@ -40,7 +44,7 @@ class _SharedExpertApply(torch.autograd.Function):
         return runner._run_impl(hidden_states)
 
 
-class CuTeDSLSharedExpertRunner:
+class Nvfp4SharedExpert:
     """NVFP4 shared expert runner using CuTeDSL GEMM (num_groups=1).
 
     CUDA-graph-compatible: all buffers pre-allocated, no CPU-GPU syncs.
diff --git a/dsv4/loader/__init__.py b/dsv4/loader/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/dsv4/loader/hf_checkpoint.py b/dsv4/loader/hf_checkpoint.py
new file mode 100644
index 00000000..eb49186f
--- /dev/null
+++ b/dsv4/loader/hf_checkpoint.py
@@ -0,0 +1,2 @@
+"""HuggingFace checkpoint reader."""
+# TODO
diff --git a/dsv4/loader/layout_convert.py b/dsv4/loader/layout_convert.py
new file mode 100644
index 00000000..a4d10adb
--- /dev/null
+++ b/dsv4/loader/layout_convert.py
@@ -0,0 +1,2 @@
+"""Checkpoint layout conversion."""
+# TODO
diff --git a/dsv4/model/__init__.py b/dsv4/model/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/dsv4/model/config.py b/dsv4/model/config.py
new file mode 100644
index 00000000..db67e083
--- /dev/null
+++ b/dsv4/model/config.py
@@ -0,0 +1,2 @@
+"""DSV4Config (Flash + Pro)."""
+# TODO: Phase 1
diff --git a/dsv4/model/dsv4.py b/dsv4/model/dsv4.py
new file mode 100644
index 00000000..948a3564
--- /dev/null
+++ b/dsv4/model/dsv4.py
@@ -0,0 +1,2 @@
+"""Full DSV4 model."""
+# TODO: Phase 1
diff --git a/dsv4/model/layer.py b/dsv4/model/layer.py
new file mode 100644
index 00000000..be929a3c
--- /dev/null
+++ b/dsv4/model/layer.py
@@ -0,0 +1,2 @@
+"""Single transformer layer."""
+# TODO: Phase 1
diff --git a/dsv4/model/mtp.py b/dsv4/model/mtp.py
new file mode 100644
index 00000000..cc75722b
--- /dev/null
+++ b/dsv4/model/mtp.py
@@ -0,0 +1,2 @@
+"""Multi-token prediction."""
+# TODO
diff --git a/dsv4/model/sampler.py b/dsv4/model/sampler.py
new file mode 100644
index 00000000..06ce8038
--- /dev/null
+++ b/dsv4/model/sampler.py
@@ -0,0 +1,2 @@
+"""Token sampler."""
+# TODO
diff --git a/dsv4/ops/__init__.py b/dsv4/ops/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/cutedsl/custom_ops.py b/dsv4/ops/custom_ops.py
similarity index 100%
rename from cutedsl/custom_ops.py
rename to dsv4/ops/custom_ops.py
diff --git a/cutedsl/native_sparse_decode.py b/dsv4/ops/decode_sparse.py
similarity index 100%
rename from cutedsl/native_sparse_decode.py
rename to dsv4/ops/decode_sparse.py
diff --git a/cutedsl/native_swa_decode.py b/dsv4/ops/decode_swa.py
similarity index 100%
rename from cutedsl/native_swa_decode.py
rename to dsv4/ops/decode_swa.py
diff --git a/cutedsl/bridge.py b/dsv4/ops/gemm_runner.py
similarity index 55%
rename from cutedsl/bridge.py
rename to dsv4/ops/gemm_runner.py
index b15a7ff3..a29a357c 100644
--- a/cutedsl/bridge.py
+++ b/dsv4/ops/gemm_runner.py
@@ -1,13 +1,4 @@
-"""
-Bridge layer for the CuTeDSL NVFP4 MoE kernel.
-
-Handles tensor layout conversion from our pipeline's format to what
-the ScaledGroupedGemmKernel expects:
-- BF16 → NVFP4 quantization (float4_e2m1fn_x2)
-- Scale factor assembly (padding + swizzle)
-- B tensor K-major stride conversion
-- Expert offset computation
-"""
+"""NVFP4 GEMM runner: warmup, compile, and execute grouped/fused GEMMs."""
 import math
 import torch
 import cutlass
@@ -15,18 +6,24 @@ import cutlass.cute as cute
 import cutlass.torch as cutlass_torch
 import cutlass.utils as utils
 
-from cutedsl.kernel.moe.torch_scaled_grouped_mm import (
-    ScaledGroupedGemmKernel,
-    pad_and_swizzle_single,
-    assemble_raw_scales_2d3d_2d_side,
-    assemble_raw_scales_2d3d_3d_side,
-    cat_byte_reinterpretable_tensors,
-    stack_byte_reinterpretable_tensors,
+from dsv4.kernels.gemm.grouped import ScaledGroupedGemmKernel
+from dsv4.kernels.gemm.fused_swiglu import FusedSwiGLUScaledGroupedGemmKernel
+from dsv4.ops.quantize import (
+    quantize_activation_nvfp4,
+    quantize_weight_to_nvfp4,
+    quantize_to_nvfp4,
+    deinterleave_quantize_nvfp4_cuda,
+)
+from dsv4.ops.layouts import (
+    interleave_l1_weights,
+    deinterleave_l1_weights,
+    assemble_scales_2d_side,
+    assemble_scales_3d_side,
+    make_b_k_major,
+    compute_expert_offsets,
+    ceil_div,
+    round_up,
 )
-
-# ── Constants ──────────────────────────────────────────────────────────
-
-E2M1_MAGNITUDES = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0]
 
 # Cache compiled kernels + pre-allocated workspace by cache_key
 # Each entry: {'compiled': callable, 'workspace': Tensor, 'workspace_size': int}
@@ -42,326 +39,6 @@ E2M1_MAGNITUDES = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0]
 #   Caching them would hold stale references to tensors that get freed.
 _compiled_kernel_cache = {}
 
-# Cached LUT for E2M1 quantization (created once per device, cudagraph-safe)
-_NVFP4_STEP_LUT_CACHE = {}
-def _get_step_to_idx_lut(device):
-    """Get or create the E2M1 step-to-index LUT for the given device.
-    
-    Cached per device to avoid CPU->CUDA copies during cudagraph capture.
-    Must be pre-populated during warmup (before torch.compile/cudagraph capture)
-    so the lock is never entered on the compiled path.
-    """
-    # Fast path: already cached — no lock needed (torch.compile-safe)
-    if device in _NVFP4_STEP_LUT_CACHE:
-        return _NVFP4_STEP_LUT_CACHE[device]
-    # Slow path: first call, create the LUT
-    lut = torch.as_tensor(
-        [0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 7, 7],
-        dtype=torch.int8, device=device,
-    )
-    _NVFP4_STEP_LUT_CACHE[device] = lut
-    return lut
-SF_VEC_SIZE = 16  # NVFP4 block size
-
-
-def ceil_div(a, b):
-    return (a + b - 1) // b
-
-
-def round_up(a, b):
-    return ceil_div(a, b) * b
-
-
-# ── Quantization ──────────────────────────────────────────────────────
-
-def quantize_to_nvfp4(x_bf16, block_size=SF_VEC_SIZE):
-    """Quantize BF16 tensor to NVFP4.
-    
-    Args:
-        x_bf16: (..., D) BF16 tensor
-    
-    Returns:
-        x_fp4: (..., D//2) float4_e2m1fn_x2 — native PyTorch FP4
-        x_sf: (..., D//16) float8_e4m3fn — block scales
-        global_scale: float32 scalar
-    """
-    x_f32 = x_bf16.float()
-    amax = x_f32.abs().max().clamp(min=1e-8).float()
-    global_scale = amax / (6.0 * 448.0)
-    x_norm = x_f32 / global_scale
-
-    last_dim = x_norm.shape[-1]
-    n_blocks = ceil_div(last_dim, block_size)
-
-    if last_dim % block_size != 0:
-        pad_size = n_blocks * block_size - last_dim
-        x_norm = torch.nn.functional.pad(x_norm, (0, pad_size))
-
-    x_reshaped = x_norm.reshape(*x_norm.shape[:-1], n_blocks, block_size)
-    block_amax = x_reshaped.abs().amax(dim=-1)
-    # Detect zero blocks and underflow blocks (amax > 0 but too small for FP8).
-    # Smallest positive FP8 e4m3fn is 2^-9 ≈ 1.95e-3. If amax/6 < this,
-    # the block scale underflows to 0, and dividing x by the clamped 1e-8
-    # inflates values into nonzero FP4 buckets — producing wrong results.
-    zero_block = block_amax < (6.0 * 2.0 ** -9)  # < ~0.0117
-    # Zero out x for zero/underflow blocks before division.
-    # This ensures x_scaled = 0 → FP4 nibbles = 0.
-    x_reshaped = torch.where(zero_block.unsqueeze(-1),
-                              torch.zeros_like(x_reshaped), x_reshaped)
-    block_amax = block_amax.clamp(min=1e-8)
-    block_scale = (block_amax / 6.0).to(torch.float8_e4m3fn)
-    # Force zero/underflow blocks: FP8 scale = 0 (exact zero).
-    block_scale = torch.where(zero_block, torch.zeros_like(block_scale), block_scale)
-
-    # Nearest E2M1
-    block_sf_expanded = block_scale.float().unsqueeze(-1)
-    x_scaled = x_reshaped / block_sf_expanded.clamp(min=1e-8)
-
-    signs = torch.sign(x_scaled)
-    abs_scaled = x_scaled.abs().clamp(max=6.0)
-    
-    half_steps = (abs_scaled * 2.0).round().clamp(0, 12).to(torch.int8)
-    step_to_idx = _get_step_to_idx_lut(x_bf16.device)
-    indices = step_to_idx[half_steps.long()]
-
-    nibbles = torch.where(signs < 0, indices + 8, indices).to(torch.uint8)
-    even = nibbles[..., ::2]
-    odd = nibbles[..., 1::2]
-    packed = (odd << 4) | even
-
-    packed_shape = list(x_bf16.shape)
-    packed_shape[-1] = last_dim // 2
-    x_fp4 = packed.view(torch.float4_e2m1fn_x2).reshape(packed_shape)
-
-    sf_shape = list(x_bf16.shape[:-1]) + [n_blocks]
-    block_scale = block_scale.reshape(sf_shape)
-
-    return x_fp4, block_scale, global_scale
-
-
-def quantize_activation_nvfp4(x_bf16, global_scale, block_size=SF_VEC_SIZE):
-    """Quantize BF16 activation tensor to NVFP4 (cudagraph-safe).
-
-    Unlike quantize_to_nvfp4(), this takes a pre-computed global_scale
-    instead of computing it via .max() (which forces CPU-GPU sync).
-    All operations are pure GPU with no CPU-GPU syncs.
-
-    Args:
-        x_bf16: (..., D) BF16 tensor
-        global_scale: float32 scalar (pre-computed, NOT from .max())
-        block_size: NVFP4 block size
-    
-    Returns:
-        x_fp4: (..., D//2) float4_e2m1fn_x2
-        x_sf: (..., D//16) float8_e4m3fn
-    """
-    x_f32 = x_bf16.float()
-    x_norm = x_f32 / global_scale
-
-    last_dim = x_norm.shape[-1]
-    n_blocks = ceil_div(last_dim, block_size)
-
-    if last_dim % block_size != 0:
-        pad_size = n_blocks * block_size - last_dim
-        x_norm = torch.nn.functional.pad(x_norm, (0, pad_size))
-
-    x_reshaped = x_norm.reshape(*x_norm.shape[:-1], n_blocks, block_size)
-    block_amax = x_reshaped.abs().amax(dim=-1)
-    # Detect zero blocks and underflow blocks (same threshold as quantize_to_nvfp4).
-    zero_block = block_amax < (6.0 * 2.0 ** -9)
-    x_reshaped = torch.where(zero_block.unsqueeze(-1),
-                              torch.zeros_like(x_reshaped), x_reshaped)
-    block_amax = block_amax.clamp(min=1e-8)
-    block_scale = (block_amax / 6.0).to(torch.float8_e4m3fn)
-    block_scale = torch.where(zero_block, torch.zeros_like(block_scale), block_scale)
-
-    block_sf_expanded = block_scale.float().unsqueeze(-1)
-    x_scaled = x_reshaped / block_sf_expanded.clamp(min=1e-8)
-    signs = torch.sign(x_scaled)
-    abs_scaled = x_scaled.abs().clamp(max=6.0)
-    
-    half_steps = (abs_scaled * 2.0).round().clamp(0, 12).to(torch.int8)
-    step_to_idx = _get_step_to_idx_lut(x_bf16.device)
-    indices = step_to_idx[half_steps.long()]
-
-    nibbles = torch.where(signs < 0, indices + 8, indices).to(torch.uint8)
-    even = nibbles[..., ::2]
-    odd = nibbles[..., 1::2]
-    packed = (odd << 4) | even
-
-    packed_shape = list(x_bf16.shape)
-    packed_shape[-1] = last_dim // 2
-    x_fp4 = packed.view(torch.float4_e2m1fn_x2).reshape(packed_shape)
-
-    sf_shape = list(x_bf16.shape[:-1]) + [n_blocks]
-    block_scale = block_scale.reshape(sf_shape)
-
-    return x_fp4, block_scale
-
-
-def quantize_weight_to_nvfp4(w_bf16, block_size=SF_VEC_SIZE):
-    """Quantize BF16 weight matrix to NVFP4.
-    
-    The weight is (K, N) where K is the input dim (packed dimension).
-    Block scales are computed along K (dim 0).
-    
-    Args:
-        w_bf16: (K, N) BF16 weight matrix
-    
-    Returns:
-        w_fp4: (K//2, N) float4_e2m1fn_x2 — K is the packed dim
-        w_sf: (K//16, N) float8_e4m3fn — block scales along K
-        global_scale: float32 scalar
-    """
-    K, N = w_bf16.shape
-    w_f32 = w_bf16.float()
-    amax = w_f32.abs().max().clamp(min=1e-8).float()
-    global_scale = amax / (6.0 * 448.0)
-    w_norm = w_f32 / global_scale
-
-    k_blocks = ceil_div(K, block_size)
-    if K % block_size != 0:
-        w_norm = torch.nn.functional.pad(w_norm, (0, 0, 0, k_blocks * block_size - K))
-
-    w_reshaped = w_norm.reshape(k_blocks, block_size, N)
-    w_block_amax = w_reshaped.abs().amax(dim=1)
-    # Detect zero blocks and underflow blocks (same threshold).
-    zero_block = w_block_amax < (6.0 * 2.0 ** -9)
-    w_reshaped = torch.where(zero_block.unsqueeze(1),
-                              torch.zeros_like(w_reshaped), w_reshaped)
-    w_block_amax = w_block_amax.clamp(min=1e-8)
-    w_sf = (w_block_amax / 6.0).to(torch.float8_e4m3fn)
-    w_sf = torch.where(zero_block, torch.zeros_like(w_sf), w_sf)
-
-    w_block_sf = w_sf.float().unsqueeze(1)
-    w_scaled = w_reshaped / w_block_sf.clamp(min=1e-8)
-
-    signs = torch.sign(w_scaled)
-    abs_scaled = w_scaled.abs().clamp(max=6.0)
-    
-    half_steps = (abs_scaled * 2.0).round().clamp(0, 12).to(torch.int8)
-    step_to_idx = _get_step_to_idx_lut(w_bf16.device)
-    indices = step_to_idx[half_steps.long()]
-    nibbles = torch.where(signs < 0, indices + 8, indices).to(torch.uint8)
-
-    even = nibbles[:, ::2, :]
-    odd = nibbles[:, 1::2, :]
-    packed = (odd << 4) | even
-
-    w_fp4 = packed.reshape(K // 2, N).view(torch.float4_e2m1fn_x2)
-    return w_fp4, w_sf, global_scale
-
-
-# ── Scale Factor Assembly ─────────────────────────────────────────────
-
-def interleave_l1_weights(w_ekn, granularity_bf16=8):
-    """Interleave gate/up weights at granularity 8 in BF16 (4 in FP4).
-    
-    The fused SwiGLU epilogue requires gate/up pairs to be adjacent in the
-    MMA accumulator. With interleaved weights, the MMA tile produces
-    gate[i*8..i*8+7] and up[i*8..i*8+7] next to each other in registers,
-    enabling a single-register SwiGLU without SMEM round-trips.
-    
-    Before:  [gate_0..gate_N/2-1 | up_0..up_N/2-1]
-    After:   [gate_0..gate_7, up_0..up_7, gate_8..gate_15, up_8..up_15, ...]
-    
-    The interleave operates along the N dimension, where each column = 1 BF16
-    (FP4 packing is along K, not N). So g = granularity_bf16 directly.
-    
-    Args:
-        w_ekn: (E, K_packed, N_packed) FP4 weight tensor in K-major layout
-                N_packed = 2*intermediate/2 = intermediate (gate+up fused)
-        granularity_bf16: interleave group size in BF16 elements (default 8)
-    
-    Returns:
-        (E, K_packed, N_packed) FP4 weight tensor with interleaved gate/up
-    """
-    E, K, N = w_ekn.shape
-    N_half = N // 2  # gate and up each have N/2 FP4 columns
-    g = granularity_bf16  # N-axis interleave: each N-col = 1 BF16 col (packing is along K)
-    
-    gate = w_ekn[:, :, :N_half].reshape(E, K, N_half // g, g)
-    up = w_ekn[:, :, N_half:].reshape(E, K, N_half // g, g)
-    return torch.stack([gate, up], dim=3).reshape(E, K, N)
-
-
-def deinterleave_l1_weights(w_ekn, granularity_bf16=8):
-    """De-interleave gate/up weights (inverse of interleave_l1_weights).
-    
-    Used for testing/verification only.
-    """
-    g = granularity_bf16  # N-axis: each N-col = 1 BF16 col
-    E, K, N = w_ekn.shape
-    w_reshaped = w_ekn.reshape(E, K, N // (2 * g), 2, g)
-    gate = w_reshaped[:, :, :, 0, :].reshape(E, K, N // 2)
-    up = w_reshaped[:, :, :, 1, :].reshape(E, K, N // 2)
-    return torch.cat([gate, up], dim=2)
-
-
-def assemble_scales_2d_side(raw_scales):
-    """Assemble activation scale factors for the 2Dx3D scenario.
-    
-    Args:
-        raw_scales: list of (M_e, K_sf) float8_e4m3fn tensors, one per expert
-    
-    Returns:
-        Assembled and swizzled scale tensor
-    """
-    return assemble_raw_scales_2d3d_2d_side(raw_scales)
-
-
-def assemble_scales_3d_side(raw_scales):
-    """Assemble weight scale factors for the 2Dx3D scenario.
-    
-    Args:
-        raw_scales: list of (K_sf, N) float8_e4m3fn tensors, one per expert
-        NOTE: These will be transposed to (N, K_sf) before swizzling,
-        since the kernel expects N as the non-K dimension.
-    
-    Returns:
-        Assembled and swizzled scale tensor
-    """
-    # Kernel expects (N, K_sf) — transpose before swizzling
-    transposed = [sf.T.contiguous() for sf in raw_scales]
-    return assemble_raw_scales_2d3d_3d_side(transposed)
-
-
-# ── Tensor Layout Conversion ──────────────────────────────────────────
-
-def make_b_k_major(b_tensor):
-    """Convert B tensor from N-major to K-major layout.
-    
-    The kernel expects B with stride (E*K*N, 1, K) — K is contiguous.
-    torch.stack produces stride (E*K*N, N, 1) — N is contiguous.
-    
-    Args:
-        b_tensor: (experts, K_packed, N_packed) float4_e2m1fn_x2, N-major
-    
-    Returns:
-        Same shape, K-major strides
-    """
-    return b_tensor.permute(0, 2, 1).contiguous().permute(0, 2, 1)
-
-
-def compute_expert_offsets(tokens_per_expert, num_experts, device="cuda"):
-    """Compute cumulative token offsets for the grouped GEMM.
-    
-    Args:
-        tokens_per_expert: list of int, one per expert
-    
-    Returns:
-        offs: (num_experts,) int32 — cumulative sum
-    """
-    offs = torch.tensor(
-        [sum(tokens_per_expert[:e+1]) for e in range(num_experts)],
-        dtype=torch.int32, device=device,
-    )
-    return offs
-
-
-# ── Kernel Launch ─────────────────────────────────────────────────────
-
-
 def warmup_compilation(num_experts, K_packed, N_packed, device,
                        mma_tiler_mn=(128, 128), cluster_shape_mn=(1, 1)):
     """Eagerly JIT-compile the GEMM kernel for a specific shape.
@@ -589,10 +266,7 @@ def run_nvfp4_grouped_gemm(
 
 # ── Fused SwiGLU GEMM (Stage 1: SiLU in registers, BF16 output) ──────
 
-# Cache for fused kernel (separate from standard GEMM cache)
 _fused_kernel_cache = {}
-
-
 def warmup_fused_swiglu_compilation(num_experts, K_packed, N_packed, device,
                                      swiglu_limit=0.0,
                                      mma_tiler_mn=(128, 128),
@@ -602,7 +276,7 @@ def warmup_fused_swiglu_compilation(num_experts, K_packed, N_packed, device,
     Must be called during model initialization. See warmup_compilation()
     for the standard GEMM equivalent.
     """
-    from cutedsl.kernel.moe.fused_swiglu_grouped_mm import FusedSwiGLUScaledGroupedGemmKernel
+    from dsv4.kernels.gemm.fused_swiglu import FusedSwiGLUScaledGroupedGemmKernel
     
     cache_key = ('fused', num_experts, str(device), mma_tiler_mn, cluster_shape_mn,
                  K_packed, N_packed, swiglu_limit)
@@ -697,7 +371,7 @@ def run_fused_swiglu_grouped_gemm(
     Stage 1: SiLU is applied to the full accumulator in registers,
     then written as BF16 to C. Gate/up pairing is not yet implemented.
     """
-    from cutedsl.kernel.moe.fused_swiglu_grouped_mm import FusedSwiGLUScaledGroupedGemmKernel
+    from dsv4.kernels.gemm.fused_swiglu import FusedSwiGLUScaledGroupedGemmKernel
     
     num_experts = mat_b.shape[0]
     n_dim = mat_b.shape[2]
@@ -789,28 +463,3 @@ def run_fused_swiglu_grouped_gemm(
 
 
 
-def deinterleave_quantize_nvfp4_cuda(fused_bf16, intermediate, global_scale, granularity=8):
-    """De-interleave + quantize fused SwiGLU output using a custom CUDA kernel.
-    
-    Single kernel launch, no Python loop. 4x faster than the Python path.
-    
-    Args:
-        fused_bf16: (M, 2*intermediate) BF16 — fused L1 output with interleaved gate/up
-        intermediate: intermediate dimension (e.g., 3072)
-        global_scale: pre-computed global scale for quantization
-        granularity: interleave granularity in BF16 columns (default 8)
-    
-    Returns:
-        x_fp4: (M, intermediate//2) float4_e2m1fn_x2 — quantized SwiGLU
-        x_sf: (M, intermediate//16) float8_e4m3fn — block scales
-    """
-    from torch.utils.cpp_extension import load
-    import os
-    kernel_dir = os.path.join(os.path.dirname(__file__), "kernels")
-    mod = load(
-        name="deinterleave_quantize_nvfp4",
-        sources=[os.path.join(kernel_dir, "deinterleave_quantize.cu")],
-        extra_cuda_cflags=["-gencode=arch=compute_100a,code=sm_100a"],
-        verbose=False,
-    )
-    return mod.deinterleave_quantize_nvfp4(fused_bf16, intermediate, granularity, global_scale)
diff --git a/dsv4/ops/layouts.py b/dsv4/ops/layouts.py
new file mode 100644
index 00000000..e092ca0e
--- /dev/null
+++ b/dsv4/ops/layouts.py
@@ -0,0 +1,123 @@
+"""Tensor layout helpers: scale swizzle, gate/up interleave, K-major, offsets."""
+import torch
+
+from dsv4.kernels.gemm.grouped import (
+    pad_and_swizzle_single,
+    assemble_raw_scales_2d3d_2d_side,
+    assemble_raw_scales_2d3d_3d_side,
+)
+
+def ceil_div(a, b):
+    return (a + b - 1) // b  # ceiling of a / b; assumes integers with b > 0
+
+
+def round_up(a, b):
+    return ceil_div(a, b) * b  # smallest multiple of b that is >= a (assumes integers, b > 0)
+
+def interleave_l1_weights(w_ekn, granularity_bf16=8):
+    """Interleave gate/up column groups along the last (N) axis.
+    
+    The fused SwiGLU epilogue requires gate/up pairs to be adjacent in the
+    MMA accumulator. With interleaved weights, the MMA tile produces
+    gate[i*8..i*8+7] and up[i*8..i*8+7] next to each other in registers,
+    enabling a single-register SwiGLU without SMEM round-trips.
+    
+    Before:  [gate_0..gate_N/2-1 | up_0..up_N/2-1]
+    After:   [gate_0..gate_7, up_0..up_7, gate_8..gate_15, up_8..up_15, ...]
+    
+    The interleave operates along the N dimension, where each column = 1 BF16
+    (FP4 packing is along K, not N). So g = granularity_bf16 directly.
+    
+    Args:
+        w_ekn: 3-D (E, K, N) tensor; the first N/2 columns are gate, the last N/2 are up.
+                N must be even and N/2 divisible by granularity_bf16 (the reshapes need it).
+        granularity_bf16: interleave group size in columns along N (default 8)
+    
+    Returns:
+        New (E, K, N) tensor with gate/up interleaved in groups of granularity_bf16
+    """
+    E, K, N = w_ekn.shape
+    N_half = N // 2  # gate and up each occupy N/2 columns
+    g = granularity_bf16  # N-axis interleave: each N-col = 1 BF16 col (packing is along K)
+    
+    gate = w_ekn[:, :, :N_half].reshape(E, K, N_half // g, g)
+    up = w_ekn[:, :, N_half:].reshape(E, K, N_half // g, g)
+    return torch.stack([gate, up], dim=3).reshape(E, K, N)
+
+
+def deinterleave_l1_weights(w_ekn, granularity_bf16=8):
+    """De-interleave gate/up weights (inverse of interleave_l1_weights).
+    
+    Turns [g gate cols, g up cols, ...] along N back into [gate | up]; for testing/verification only.
+    """
+    g = granularity_bf16  # N-axis: each N-col = 1 BF16 col
+    E, K, N = w_ekn.shape
+    w_reshaped = w_ekn.reshape(E, K, N // (2 * g), 2, g)  # (E, K, group, gate/up, g)
+    gate = w_reshaped[:, :, :, 0, :].reshape(E, K, N // 2)
+    up = w_reshaped[:, :, :, 1, :].reshape(E, K, N // 2)
+    return torch.cat([gate, up], dim=2)
+
+
+def assemble_scales_2d_side(raw_scales):
+    """Assemble activation scale factors for the 2Dx3D scenario.
+    
+    Args:
+        raw_scales: list of (M_e, K_sf) float8_e4m3fn tensors, one per expert
+    
+    Returns:
+        Result of assemble_raw_scales_2d3d_2d_side(raw_scales), passed through unchanged
+    """
+    return assemble_raw_scales_2d3d_2d_side(raw_scales)
+
+
+def assemble_scales_3d_side(raw_scales):
+    """Assemble weight scale factors for the 2Dx3D scenario.
+    
+    Args:
+        raw_scales: list of (K_sf, N) float8_e4m3fn tensors, one per expert
+        NOTE: Each one is transposed to (N, K_sf) before swizzling,
+        since the kernel expects N as the non-K dimension.
+    
+    Returns:
+        Result of assemble_raw_scales_2d3d_3d_side applied to the transposed list
+    """
+    # .T is only a view; .contiguous() materializes the (N, K_sf) layout in memory
+    transposed = [sf.T.contiguous() for sf in raw_scales]
+    return assemble_raw_scales_2d3d_3d_side(transposed)
+
+
+# ── Tensor Layout Conversion ──────────────────────────────────────────
+
+def make_b_k_major(b_tensor):
+    """Convert B tensor from N-major to K-major layout.
+    
+    For shape (E, K, N) the result has strides (K*N, 1, K) — K is contiguous.
+    A default-contiguous input has strides (K*N, N, 1) — N is contiguous.
+    
+    Args:
+        b_tensor: (experts, K_packed, N_packed) float4_e2m1fn_x2, N-major
+    
+    Returns:
+        Same shape, K-major strides (data is copied unless the input is already K-major)
+    """
+    return b_tensor.permute(0, 2, 1).contiguous().permute(0, 2, 1)
+
+
+def compute_expert_offsets(tokens_per_expert, num_experts, device="cuda"):
+    """Compute cumulative token offsets for the grouped GEMM.
+    
+    Args:
+        tokens_per_expert: list of int, one per expert
+    
+    Returns:
+        offs: (num_experts,) int32 — inclusive cumulative sum, offs[e] = sum(tokens_per_expert[:e+1])
+    """
+    offs = torch.tensor(
+        [sum(tokens_per_expert[:e+1]) for e in range(num_experts)],
+        dtype=torch.int32, device=device,
+    )
+    return offs
+
+
+# ── Kernel launch helpers live in dsv4/ops/gemm_runner.py ─────────────
+
diff --git a/dsv4/ops/quantize.py b/dsv4/ops/quantize.py
new file mode 100644
index 00000000..ecdf6544
--- /dev/null
+++ b/dsv4/ops/quantize.py
@@ -0,0 +1,253 @@
+"""NVFP4 quantization: BF16 <-> NVFP4 conversion, scale factor computation."""
+import math
+import torch
+import cutlass
+import cutlass.cute as cute
+import cutlass.torch as cutlass_torch
+import cutlass.utils as utils
+
+from dsv4.kernels.gemm.grouped import (
+    cat_byte_reinterpretable_tensors,
+    stack_byte_reinterpretable_tensors,
+)
+
+E2M1_MAGNITUDES = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0]
+
+# Cache compiled kernels + pre-allocated workspace by cache_key
+# Each entry: {'compiled': callable, 'workspace': Tensor, 'workspace_size': int}
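+# NOTE(review): no such cache is defined in this module; presumably this describes the GEMM runner's cache.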
+# Key design decisions (Bug #1 fix):
+# - cute.compile does NOT corrupt GPU memory (verified 2026-05-20 on B200).
+#   The original _needs_token_refill hack was a misdiagnosis. The real bug
+#   was elsewhere (likely OOB write or weight loading).
+# - Workspace is pre-allocated per cache entry during warmup_compilation()
+#   and reused on subsequent calls. No torch.full() in the hot path.
+# - CuTe tensor wrappers (from_dlpack + mark_layout_dynamic) are cheap
+#   metadata wrappers. We re-create them per call from real tensors.
+#   Caching them would hold stale references to tensors that get freed.
+
+# Cached LUT for E2M1 quantization (created once per device, cudagraph-safe)
+_NVFP4_STEP_LUT_CACHE = {}
+def _get_step_to_idx_lut(device):
+    """Get or create the E2M1 step-to-index LUT for the given device.
+    
+    The LUT is stored in _NVFP4_STEP_LUT_CACHE keyed by device. Call once per
+    device during warmup (before torch.compile/cudagraph capture) so the
+    tensor-creating slow path below never runs on the compiled path.
+    """
+    # Fast path: already cached — plain dict lookup (no locking is used here)
+    if device in _NVFP4_STEP_LUT_CACHE:
+        return _NVFP4_STEP_LUT_CACHE[device]
+    # Slow path: first call; entry h (= round(2*|x|), 0..12) is the E2M1 magnitude index
+    lut = torch.as_tensor(
+        [0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 7, 7],
+        dtype=torch.int8, device=device,
+    )
+    _NVFP4_STEP_LUT_CACHE[device] = lut
+    return lut
+SF_VEC_SIZE = 16  # NVFP4 block size
+
+def quantize_to_nvfp4(x_bf16, block_size=SF_VEC_SIZE):
+    """Quantize BF16 tensor to NVFP4.
+    
+    Args:
+        x_bf16: (..., D) BF16 tensor
+        block_size: number of last-dim elements sharing one block scale (default SF_VEC_SIZE)
+    Returns:
+        x_fp4: (..., D//2) float4_e2m1fn_x2 — native PyTorch FP4
+        x_sf: (..., ceil(D/block_size)) float8_e4m3fn — block scales
+        global_scale: 0-dim float32 tensor, amax / (6 * 448)
+    """
+    x_f32 = x_bf16.float()
+    amax = x_f32.abs().max().clamp(min=1e-8).float()
+    global_scale = amax / (6.0 * 448.0)
+    x_norm = x_f32 / global_scale
+
+    last_dim = x_norm.shape[-1]
+    n_blocks = (last_dim + block_size - 1) // block_size  # inline ceil; ceil_div is not in scope here
+
+    if last_dim % block_size != 0:
+        pad_size = n_blocks * block_size - last_dim
+        x_norm = torch.nn.functional.pad(x_norm, (0, pad_size))
+
+    x_reshaped = x_norm.reshape(*x_norm.shape[:-1], n_blocks, block_size)
+    block_amax = x_reshaped.abs().amax(dim=-1)
+    # Detect zero blocks and underflow blocks (amax > 0 but too small for FP8).
+    # Smallest positive FP8 e4m3fn is 2^-9 ≈ 1.95e-3. If amax/6 < this,
+    # the block scale underflows to 0, and dividing x by the clamped 1e-8
+    # inflates values into nonzero FP4 buckets — producing wrong results.
+    zero_block = block_amax < (6.0 * 2.0 ** -9)  # < ~0.0117
+    # Zero out x for zero/underflow blocks before division.
+    # This ensures x_scaled = 0 → FP4 nibbles = 0.
+    x_reshaped = torch.where(zero_block.unsqueeze(-1),
+                              torch.zeros_like(x_reshaped), x_reshaped)
+    block_amax = block_amax.clamp(min=1e-8)
+    block_scale = (block_amax / 6.0).to(torch.float8_e4m3fn)
+    # Force zero/underflow blocks: FP8 scale = 0 (exact zero).
+    block_scale = torch.where(zero_block, torch.zeros_like(block_scale), block_scale)
+
+    # Nearest E2M1
+    block_sf_expanded = block_scale.float().unsqueeze(-1)
+    x_scaled = x_reshaped / block_sf_expanded.clamp(min=1e-8)
+
+    signs = torch.sign(x_scaled)
+    abs_scaled = x_scaled.abs().clamp(max=6.0)
+    
+    half_steps = (abs_scaled * 2.0).round().clamp(0, 12).to(torch.int8)
+    step_to_idx = _get_step_to_idx_lut(x_bf16.device)
+    indices = step_to_idx[half_steps.long()]
+
+    nibbles = torch.where(signs < 0, indices + 8, indices).to(torch.uint8)
+    even = nibbles[..., ::2]
+    odd = nibbles[..., 1::2]
+    packed = (odd << 4) | even
+
+    packed_shape = list(x_bf16.shape)
+    packed_shape[-1] = last_dim // 2
+    x_fp4 = packed.view(torch.float4_e2m1fn_x2).reshape(packed_shape)
+
+    sf_shape = list(x_bf16.shape[:-1]) + [n_blocks]
+    block_scale = block_scale.reshape(sf_shape)
+
+    return x_fp4, block_scale, global_scale
+
+
+def quantize_activation_nvfp4(x_bf16, global_scale, block_size=SF_VEC_SIZE):
+    """Quantize BF16 activation tensor to NVFP4 (cudagraph-safe).
+
+    Unlike quantize_to_nvfp4(), this takes a pre-computed global_scale
+    instead of computing it via .max() (which forces CPU-GPU sync).
+    All operations are pure GPU with no CPU-GPU syncs.
+
+    Args:
+        x_bf16: (..., D) BF16 tensor
+        global_scale: float32 scalar (pre-computed, NOT from .max())
+        block_size: NVFP4 block size (last-dim elements per block scale)
+    
+    Returns:
+        x_fp4: (..., D//2) float4_e2m1fn_x2
+        x_sf: (..., ceil(D/block_size)) float8_e4m3fn
+    """
+    x_f32 = x_bf16.float()
+    x_norm = x_f32 / global_scale
+
+    last_dim = x_norm.shape[-1]
+    n_blocks = (last_dim + block_size - 1) // block_size  # inline ceil; ceil_div is not in scope here
+
+    if last_dim % block_size != 0:
+        pad_size = n_blocks * block_size - last_dim
+        x_norm = torch.nn.functional.pad(x_norm, (0, pad_size))
+
+    x_reshaped = x_norm.reshape(*x_norm.shape[:-1], n_blocks, block_size)
+    block_amax = x_reshaped.abs().amax(dim=-1)
+    # Detect zero blocks and underflow blocks (same threshold as quantize_to_nvfp4).
+    zero_block = block_amax < (6.0 * 2.0 ** -9)
+    x_reshaped = torch.where(zero_block.unsqueeze(-1),
+                              torch.zeros_like(x_reshaped), x_reshaped)
+    block_amax = block_amax.clamp(min=1e-8)
+    block_scale = (block_amax / 6.0).to(torch.float8_e4m3fn)
+    block_scale = torch.where(zero_block, torch.zeros_like(block_scale), block_scale)
+
+    block_sf_expanded = block_scale.float().unsqueeze(-1)
+    x_scaled = x_reshaped / block_sf_expanded.clamp(min=1e-8)
+    signs = torch.sign(x_scaled)
+    abs_scaled = x_scaled.abs().clamp(max=6.0)
+    
+    half_steps = (abs_scaled * 2.0).round().clamp(0, 12).to(torch.int8)
+    step_to_idx = _get_step_to_idx_lut(x_bf16.device)
+    indices = step_to_idx[half_steps.long()]
+
+    nibbles = torch.where(signs < 0, indices + 8, indices).to(torch.uint8)
+    even = nibbles[..., ::2]
+    odd = nibbles[..., 1::2]
+    packed = (odd << 4) | even
+
+    packed_shape = list(x_bf16.shape)
+    packed_shape[-1] = last_dim // 2
+    x_fp4 = packed.view(torch.float4_e2m1fn_x2).reshape(packed_shape)
+
+    sf_shape = list(x_bf16.shape[:-1]) + [n_blocks]
+    block_scale = block_scale.reshape(sf_shape)
+
+    return x_fp4, block_scale
+
+
+def quantize_weight_to_nvfp4(w_bf16, block_size=SF_VEC_SIZE):
+    """Quantize BF16 weight matrix to NVFP4.
+    
+    The weight is (K, N) where K is the input dim (packed dimension).
+    Block scales are computed along K (dim 0).
+    
+    Args:
+        w_bf16: (K, N) BF16 weight matrix
+        block_size: number of K elements sharing one block scale (default SF_VEC_SIZE)
+    Returns:
+        w_fp4: (K//2, N) float4_e2m1fn_x2 — K is the packed dim
+        w_sf: (ceil(K/block_size), N) float8_e4m3fn — block scales along K
+        global_scale: 0-dim float32 tensor, amax / (6 * 448)
+    """
+    K, N = w_bf16.shape
+    w_f32 = w_bf16.float()
+    amax = w_f32.abs().max().clamp(min=1e-8).float()
+    global_scale = amax / (6.0 * 448.0)
+    w_norm = w_f32 / global_scale
+
+    k_blocks = (K + block_size - 1) // block_size  # inline ceil; ceil_div is not in scope here
+    if K % block_size != 0:
+        w_norm = torch.nn.functional.pad(w_norm, (0, 0, 0, k_blocks * block_size - K))
+
+    w_reshaped = w_norm.reshape(k_blocks, block_size, N)
+    w_block_amax = w_reshaped.abs().amax(dim=1)
+    # Detect zero blocks and underflow blocks (same threshold).
+    zero_block = w_block_amax < (6.0 * 2.0 ** -9)
+    w_reshaped = torch.where(zero_block.unsqueeze(1),
+                              torch.zeros_like(w_reshaped), w_reshaped)
+    w_block_amax = w_block_amax.clamp(min=1e-8)
+    w_sf = (w_block_amax / 6.0).to(torch.float8_e4m3fn)
+    w_sf = torch.where(zero_block, torch.zeros_like(w_sf), w_sf)
+
+    w_block_sf = w_sf.float().unsqueeze(1)
+    w_scaled = w_reshaped / w_block_sf.clamp(min=1e-8)
+
+    signs = torch.sign(w_scaled)
+    abs_scaled = w_scaled.abs().clamp(max=6.0)
+    
+    half_steps = (abs_scaled * 2.0).round().clamp(0, 12).to(torch.int8)
+    step_to_idx = _get_step_to_idx_lut(w_bf16.device)
+    indices = step_to_idx[half_steps.long()]
+    nibbles = torch.where(signs < 0, indices + 8, indices).to(torch.uint8)
+
+    even = nibbles[:, ::2, :]
+    odd = nibbles[:, 1::2, :]
+    packed = (odd << 4) | even
+
+    w_fp4 = packed.reshape(K // 2, N).view(torch.float4_e2m1fn_x2)
+    return w_fp4, w_sf, global_scale
+
+
+# ── Fused De-interleave + Quantize (CUDA extension) ───────────────────
+def deinterleave_quantize_nvfp4_cuda(fused_bf16, intermediate, global_scale, granularity=8):
+    """De-interleave + quantize fused SwiGLU output using a custom CUDA kernel.
+    
+    Single kernel launch, no Python loop. 4x faster than the Python path.
+    NOTE(review): load() runs on every call; sources resolve to <this file's dir>/kernels — confirm the .cu is there.
+    Args:
+        fused_bf16: (M, 2*intermediate) BF16 — fused L1 output with interleaved gate/up
+        intermediate: intermediate dimension (e.g., 3072)
+        global_scale: pre-computed global scale for quantization
+        granularity: interleave granularity in BF16 columns (default 8)
+    
+    Returns:
+        x_fp4: (M, intermediate//2) float4_e2m1fn_x2 — quantized SwiGLU
+        x_sf: (M, intermediate//16) float8_e4m3fn — block scales
+    """
+    from torch.utils.cpp_extension import load
+    import os
+    kernel_dir = os.path.join(os.path.dirname(__file__), "kernels")
+    mod = load(
+        name="deinterleave_quantize_nvfp4",
+        sources=[os.path.join(kernel_dir, "deinterleave_quantize.cu")],
+        extra_cuda_cflags=["-gencode=arch=compute_100a,code=sm_100a"],
+        verbose=False,
+    )
+    # The extension takes (…, intermediate, granularity, global_scale): order differs from this signature
+    return mod.deinterleave_quantize_nvfp4(fused_bf16, intermediate, granularity, global_scale)
diff --git a/cutedsl/inverse_rope.py b/dsv4/ops/rope.py
similarity index 100%
rename from cutedsl/inverse_rope.py
rename to dsv4/ops/rope.py
diff --git a/cutedsl/sparse_topk_metadata.py b/dsv4/ops/topk.py
similarity index 100%
rename from cutedsl/sparse_topk_metadata.py
rename to dsv4/ops/topk.py
diff --git a/dsv4/reference/__init__.py b/dsv4/reference/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/cutedsl/blackwell_attention.py b/dsv4/reference/attention.py
similarity index 100%
rename from cutedsl/blackwell_attention.py
rename to dsv4/reference/attention.py
diff --git a/cutedsl/csa_hca_compressor_PYTORCH_EXAMPLE.py b/dsv4/reference/compressor.py
similarity index 100%
rename from cutedsl/csa_hca_compressor_PYTORCH_EXAMPLE.py
rename to dsv4/reference/compressor.py
diff --git a/cutedsl/csa_attention.py b/dsv4/reference/csa_attention.py
similarity index 100%
rename from cutedsl/csa_attention.py
rename to dsv4/reference/csa_attention.py
diff --git a/cutedsl/moe_pipeline.py b/dsv4/reference/moe_pipeline.py
similarity index 97%
rename from cutedsl/moe_pipeline.py
rename to dsv4/reference/moe_pipeline.py
index 3fcef365..076300ea 100644
--- a/cutedsl/moe_pipeline.py
+++ b/dsv4/reference/moe_pipeline.py
@@ -14,15 +14,19 @@ block scales in float8_e4m3fn, global scales in float32.
 """
 import torch
 
-from cutedsl.bridge import (
+from dsv4.ops.quantize import (
     quantize_to_nvfp4,
     quantize_weight_to_nvfp4,
+)
+from dsv4.ops.layouts import (
     assemble_scales_2d_side,
     assemble_scales_3d_side,
     make_b_k_major,
     compute_expert_offsets,
     interleave_l1_weights,
     deinterleave_l1_weights,
+)
+from dsv4.ops.gemm_runner import (
     run_nvfp4_grouped_gemm,
     run_fused_swiglu_grouped_gemm,
     warmup_fused_swiglu_compilation,
@@ -198,7 +202,7 @@ def run_nvfp4_moe(
         sf_ekn = sf.unsqueeze(0)              # (1, K_sf, N)
         sf_ekn = interleave_l1_weights(sf_ekn) # interleaved along N
         l1_sf_il.append(sf_ekn[0].T.contiguous())  # (N, K_sf) for assembly
-    from cutedsl.kernel.moe.torch_scaled_grouped_mm import assemble_raw_scales_2d3d_3d_side as _assemble_3d
+    from dsv4.kernels.gemm.grouped import assemble_raw_scales_2d3d_3d_side as _assemble_3d
     l1_scale_b = _assemble_3d(l1_sf_il)
     
     # Global scales: alpha = igs * weight_gs for each expert
@@ -347,7 +351,7 @@ def run_nvfp4_moe_fused(
         sf_ekn = sf.unsqueeze(0)
         sf_ekn = interleave_l1_weights(sf_ekn)
         l1_sf_il.append(sf_ekn[0].T.contiguous())
-    from cutedsl.kernel.moe.torch_scaled_grouped_mm import assemble_raw_scales_2d3d_3d_side as _assemble_3d
+    from dsv4.kernels.gemm.grouped import assemble_raw_scales_2d3d_3d_side as _assemble_3d
     l1_scale_b = _assemble_3d(l1_sf_il)
 
     l1_global_scale_a = torch.tensor([x_igs] * num_experts, dtype=torch.float32, device=device)
@@ -368,7 +372,10 @@ def run_nvfp4_moe_fused(
     intermediate_size = l1_fused_out.shape[1] // 2
     # Use pre-computed L2 activation gs, or compute from amax (fallback)
     l2_gs = l2_activation_gs if l2_activation_gs is not None else l1_fused_out.abs().amax().float().item() / 2688.0
-    from cutedsl.bridge import deinterleave_quantize_nvfp4_cuda, quantize_activation_nvfp4
+    from dsv4.ops.quantize import (
+        deinterleave_quantize_nvfp4_cuda,
+        quantize_activation_nvfp4,
+    )
     l2_x_fp4, l2_x_sf = deinterleave_quantize_nvfp4_cuda(l1_fused_out, intermediate_size, l2_gs)
     # Skip the separate L2 quantize step below — we already have FP4+SF
     # Set activated to None to signal we already quantized
diff --git a/pyproject.toml b/pyproject.toml
index 13ae227a..40044755 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,7 +3,7 @@ requires = ["setuptools>=68.0", "wheel"]
 build-backend = "setuptools.build_meta"
 
 [project]
-name = "nvfp4-megamoe-kernel"
+name = "dsv4-inference"
 version = "0.1.0"
 description = "NVFP4 Mega MoE kernel for DeepSeek-V4-Pro on Blackwell (TileLang)"
 requires-python = ">=3.10"
@@ -13,3 +13,4 @@ dependencies = [
 
 [tool.setuptools.packages.find]
 where = ["."]
+include = ["dsv4*"]
diff --git a/tests/debug_output.py b/tests/archive/debug_output.py
similarity index 97%
rename from tests/debug_output.py
rename to tests/archive/debug_output.py
index 17cc0453..f7bef38b 100644
--- a/tests/debug_output.py
+++ b/tests/archive/debug_output.py
@@ -6,7 +6,7 @@ sys.path.insert(0, '/root/nvfp4-megamoe-kernel/cutedsl')
 sys.path.insert(0, '/root/nvfp4-megamoe-kernel/vllm')
 
 from cutedsl.reference.moe_pipeline import moe_pipeline
-from vllm.nvfp4_cutedsl import CuTeDSLMoERunner
+from vllm.nvfp4_cutedsl import Nvfp4MoE
 
 torch.cuda.set_device(0)
 
@@ -33,7 +33,7 @@ ref_out = moe_pipeline(
 print(f"Reference output: amax={ref_out.amax().item():.4f} mean={ref_out.mean().item():.4f}")
 
 # Run runner with warmup gs
-runner = CuTeDSLMoERunner(
+runner = Nvfp4MoE(
     num_experts=3, hidden_size=256, intermediate_size=512,
     max_num_tokens=4, top_k=2, device='cuda'
 )
diff --git a/tests/debug_stages.py b/tests/archive/debug_stages.py
similarity index 100%
rename from tests/debug_stages.py
rename to tests/archive/debug_stages.py
diff --git a/tests/diag_layouts.py b/tests/archive/diag_layouts.py
similarity index 100%
rename from tests/diag_layouts.py
rename to tests/archive/diag_layouts.py
diff --git a/tests/diag_tmem.py b/tests/archive/diag_tmem.py
similarity index 100%
rename from tests/diag_tmem.py
rename to tests/archive/diag_tmem.py
diff --git a/tests/stage_b_debug5.py b/tests/archive/stage_b_debug5.py
similarity index 100%
rename from tests/stage_b_debug5.py
rename to tests/archive/stage_b_debug5.py
diff --git a/tests/test_128_128_fmha_v.py b/tests/archive/test_128_128_fmha_v.py
similarity index 100%
rename from tests/test_128_128_fmha_v.py
rename to tests/archive/test_128_128_fmha_v.py
diff --git a/tests/test_128_16_bigP.py b/tests/archive/test_128_16_bigP.py
similarity index 100%
rename from tests/test_128_16_bigP.py
rename to tests/archive/test_128_16_bigP.py
diff --git a/tests/test_128_16_debug.py b/tests/archive/test_128_16_debug.py
similarity index 100%
rename from tests/test_128_16_debug.py
rename to tests/archive/test_128_16_debug.py
diff --git a/tests/test_128_16_debug2.py b/tests/archive/test_128_16_debug2.py
similarity index 100%
rename from tests/test_128_16_debug2.py
rename to tests/archive/test_128_16_debug2.py
diff --git a/tests/test_128_16_debug3.py b/tests/archive/test_128_16_debug3.py
similarity index 100%
rename from tests/test_128_16_debug3.py
rename to tests/archive/test_128_16_debug3.py
diff --git a/tests/test_128_16_fp16.py b/tests/archive/test_128_16_fp16.py
similarity index 100%
rename from tests/test_128_16_fp16.py
rename to tests/archive/test_128_16_fp16.py
diff --git a/tests/test_128_16_full.py b/tests/archive/test_128_16_full.py
similarity index 100%
rename from tests/test_128_16_full.py
rename to tests/archive/test_128_16_full.py
diff --git a/tests/test_128_16_minimal.py b/tests/archive/test_128_16_minimal.py
similarity index 100%
rename from tests/test_128_16_minimal.py
rename to tests/archive/test_128_16_minimal.py
diff --git a/tests/test_128_16_nogC.py b/tests/archive/test_128_16_nogC.py
similarity index 100%
rename from tests/test_128_16_nogC.py
rename to tests/archive/test_128_16_nogC.py
diff --git a/tests/test_128_16_nopack.py b/tests/archive/test_128_16_nopack.py
similarity index 100%
rename from tests/test_128_16_nopack.py
rename to tests/archive/test_128_16_nopack.py
diff --git a/tests/test_128_16_nosoftmax.py b/tests/archive/test_128_16_nosoftmax.py
similarity index 100%
rename from tests/test_128_16_nosoftmax.py
rename to tests/archive/test_128_16_nosoftmax.py
diff --git a/tests/test_128_16_pAtS.py b/tests/archive/test_128_16_pAtS.py
similarity index 100%
rename from tests/test_128_16_pAtS.py
rename to tests/archive/test_128_16_pAtS.py
diff --git a/tests/test_128_16_pvlayout.py b/tests/archive/test_128_16_pvlayout.py
similarity index 100%
rename from tests/test_128_16_pvlayout.py
rename to tests/archive/test_128_16_pvlayout.py
diff --git a/tests/test_128_16_pvpack.py b/tests/archive/test_128_16_pvpack.py
similarity index 100%
rename from tests/test_128_16_pvpack.py
rename to tests/archive/test_128_16_pvpack.py
diff --git a/tests/test_128_16_pvwrite.py b/tests/archive/test_128_16_pvwrite.py
similarity index 100%
rename from tests/test_128_16_pvwrite.py
rename to tests/archive/test_128_16_pvwrite.py
diff --git a/tests/test_128_16_qkread.py b/tests/archive/test_128_16_qkread.py
similarity index 100%
rename from tests/test_128_16_qkread.py
rename to tests/archive/test_128_16_qkread.py
diff --git a/tests/test_128_16_smem.py b/tests/archive/test_128_16_smem.py
similarity index 100%
rename from tests/test_128_16_smem.py
rename to tests/archive/test_128_16_smem.py
diff --git a/tests/test_128_16_stepA.py b/tests/archive/test_128_16_stepA.py
similarity index 100%
rename from tests/test_128_16_stepA.py
rename to tests/archive/test_128_16_stepA.py
diff --git a/tests/test_128_16_stepB.py b/tests/archive/test_128_16_stepB.py
similarity index 100%
rename from tests/test_128_16_stepB.py
rename to tests/archive/test_128_16_stepB.py
diff --git a/tests/test_128_16_stepC.py b/tests/archive/test_128_16_stepC.py
similarity index 100%
rename from tests/test_128_16_stepC.py
rename to tests/archive/test_128_16_stepC.py
diff --git a/tests/test_128_16_stepD.py b/tests/archive/test_128_16_stepD.py
similarity index 100%
rename from tests/test_128_16_stepD.py
rename to tests/archive/test_128_16_stepD.py
diff --git a/tests/test_128_16_stepE.py b/tests/archive/test_128_16_stepE.py
similarity index 100%
rename from tests/test_128_16_stepE.py
rename to tests/archive/test_128_16_stepE.py
diff --git a/tests/test_128_16_tiler.py b/tests/archive/test_128_16_tiler.py
similarity index 100%
rename from tests/test_128_16_tiler.py
rename to tests/archive/test_128_16_tiler.py
diff --git a/tests/test_128_16_v8.py b/tests/archive/test_128_16_v8.py
similarity index 100%
rename from tests/test_128_16_v8.py
rename to tests/archive/test_128_16_v8.py
diff --git a/tests/test_128_16_zeropad.py b/tests/archive/test_128_16_zeropad.py
similarity index 100%
rename from tests/test_128_16_zeropad.py
rename to tests/archive/test_128_16_zeropad.py
diff --git a/tests/test_128_32_ctafix.py b/tests/archive/test_128_32_ctafix.py
similarity index 100%
rename from tests/test_128_32_ctafix.py
rename to tests/archive/test_128_32_ctafix.py
diff --git a/tests/test_128_32_ctafix2.py b/tests/archive/test_128_32_ctafix2.py
similarity index 100%
rename from tests/test_128_32_ctafix2.py
rename to tests/archive/test_128_32_ctafix2.py
diff --git a/tests/test_128_32_native.py b/tests/archive/test_128_32_native.py
similarity index 100%
rename from tests/test_128_32_native.py
rename to tests/archive/test_128_32_native.py
diff --git a/tests/test_128_32_vdiag.py b/tests/archive/test_128_32_vdiag.py
similarity index 100%
rename from tests/test_128_32_vdiag.py
rename to tests/archive/test_128_32_vdiag.py
diff --git a/tests/test_128_32_zeropad.py b/tests/archive/test_128_32_zeropad.py
similarity index 100%
rename from tests/test_128_32_zeropad.py
rename to tests/archive/test_128_32_zeropad.py
diff --git a/tests/test_afrag_roundtrip.py b/tests/archive/test_afrag_roundtrip.py
similarity index 100%
rename from tests/test_afrag_roundtrip.py
rename to tests/archive/test_afrag_roundtrip.py
diff --git a/tests/test_attention.py b/tests/archive/test_attention.py
similarity index 98%
rename from tests/test_attention.py
rename to tests/archive/test_attention.py
index 841f0950..5faacf93 100644
--- a/tests/test_attention.py
+++ b/tests/archive/test_attention.py
@@ -51,14 +51,14 @@ def dequant_nvfp4(packed_uint8, scale_e4m3, global_scale):
 def test_projection(name, weight, weight_sf, weight_gs, hidden_states, in_features, out_features):
     """Test a single NVFP4 projection."""
     sys.path.insert(0, "/root/nvfp4-megamoe-kernel")
-    from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear
+    from dsv4.layers.linear import Nvfp4Linear
 
     # Convert weight to CuTeDSL format: (out, in_packed) uint8 → (in_packed, out) float4
     fp4 = [weight.view(torch.float4_e2m1fn_x2).permute(1, 0).contiguous()]
     sf = [weight_sf.permute(1, 0).contiguous()]
     gs = [weight_gs]
 
-    runner = CuTeDSLNvfp4Linear(
+    runner = Nvfp4Linear(
         in_features=in_features,
         out_features=out_features,
         max_num_tokens=8192,
diff --git a/tests/test_attention_path_b200.py b/tests/archive/test_attention_path_b200.py
similarity index 98%
rename from tests/test_attention_path_b200.py
rename to tests/archive/test_attention_path_b200.py
index 91528d5d..334a513c 100644
--- a/tests/test_attention_path_b200.py
+++ b/tests/archive/test_attention_path_b200.py
@@ -55,7 +55,7 @@ def rms(x, w, eps=1e-6):
     return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype)
 
 def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None):
-    from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear
+    from dsv4.layers.linear import Nvfp4Linear
     fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous()
     s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf
     s = s.permute(1,0).contiguous()
@@ -66,7 +66,7 @@ def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None):
             s32[:sp] *= g1/gs; s32[sp:] *= g2/gs; s = s32.to(torch.float8_e4m3fn)
     else:
         gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item()
-    r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
+    r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
     r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs]
     r.finalize_weights(); r._ensure_initialized()
     return r
diff --git a/tests/test_b_afrag2.py b/tests/archive/test_b_afrag2.py
similarity index 100%
rename from tests/test_b_afrag2.py
rename to tests/archive/test_b_afrag2.py
diff --git a/tests/test_b_layout.py b/tests/archive/test_b_layout.py
similarity index 100%
rename from tests/test_b_layout.py
rename to tests/archive/test_b_layout.py
diff --git a/tests/test_bf16_elemwise.py b/tests/archive/test_bf16_elemwise.py
similarity index 100%
rename from tests/test_bf16_elemwise.py
rename to tests/archive/test_bf16_elemwise.py
diff --git a/tests/test_bf16_pack.py b/tests/archive/test_bf16_pack.py
similarity index 100%
rename from tests/test_bf16_pack.py
rename to tests/archive/test_bf16_pack.py
diff --git a/tests/test_bf16_recast_full.py b/tests/archive/test_bf16_recast_full.py
similarity index 100%
rename from tests/test_bf16_recast_full.py
rename to tests/archive/test_bf16_recast_full.py
diff --git a/tests/test_bf16_recast_simple.py b/tests/archive/test_bf16_recast_simple.py
similarity index 100%
rename from tests/test_bf16_recast_simple.py
rename to tests/archive/test_bf16_recast_simple.py
diff --git a/tests/test_blackwell_attn_b200.py b/tests/archive/test_blackwell_attn_b200.py
similarity index 98%
rename from tests/test_blackwell_attn_b200.py
rename to tests/archive/test_blackwell_attn_b200.py
index 110e5b1e..41394760 100644
--- a/tests/test_blackwell_attn_b200.py
+++ b/tests/archive/test_blackwell_attn_b200.py
@@ -50,7 +50,7 @@ def rms(x, w, eps=1e-6):
     return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype)
 
 def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None):
-    from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear
+    from dsv4.layers.linear import Nvfp4Linear
     fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous()
     s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf
     s = s.permute(1,0).contiguous()
@@ -61,7 +61,7 @@ def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None):
             s32[:sp] *= g1/gs; s32[sp:] *= g2/gs; s = s32.to(torch.float8_e4m3fn)
     else:
         gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item()
-    r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
+    r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
     r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs]
     r.finalize_weights(); r._ensure_initialized()
     return r
@@ -82,7 +82,7 @@ def build_cos_sin(max_pos=4096, rope_dim=ROPE):
 
 def test_blackwell_attention(layer_id, compress_ratio):
     """Test the full blackwell attention pipeline for a specific layer."""
-    from cutedsl.blackwell_attention import (
+    from dsv4.reference.attention import (
         apply_gptj_rope, apply_inv_gptj_rope,
         blackwell_attention_forward,
         kv_quantize_fp8, kv_dequantize_fp8,
diff --git a/tests/test_csa_attention_b200.py b/tests/archive/test_csa_attention_b200.py
similarity index 97%
rename from tests/test_csa_attention_b200.py
rename to tests/archive/test_csa_attention_b200.py
index 8b4e27f8..6f4f6935 100644
--- a/tests/test_csa_attention_b200.py
+++ b/tests/archive/test_csa_attention_b200.py
@@ -52,7 +52,7 @@ def rms(x, w, eps=1e-6):
     return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype)
 
 def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None):
-    from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear
+    from dsv4.layers.linear import Nvfp4Linear
     fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous()
     s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf
     s = s.permute(1,0).contiguous()
@@ -63,7 +63,7 @@ def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None):
             s32[:sp] *= g1/gs; s32[sp:] *= g2/gs; s = s32.to(torch.float8_e4m3fn)
     else:
         gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item()
-    r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
+    r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
     r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs]
     r.finalize_weights(); r._ensure_initialized()
     return r
@@ -169,7 +169,7 @@ def main():
         # For this test, we use kv_n directly as the KV for attention
 
         # ── Step 6: FULL ATTENTION (PyTorch SDPA, works on Blackwell) ──
-        from cutedsl.csa_attention import full_attention_reference
+        from dsv4.reference.csa_attention import full_attention_reference
 
         o_attn = full_attention_reference(q_rope, kv_n, scale=SCALE)
         print(f"  Attention output: amax={o_attn.amax():.4f} NaN={torch.isnan(o_attn).any()}")
diff --git a/tests/test_csa_sparse_attn_b200.py b/tests/archive/test_csa_sparse_attn_b200.py
similarity index 99%
rename from tests/test_csa_sparse_attn_b200.py
rename to tests/archive/test_csa_sparse_attn_b200.py
index 5c75e229..d7afd83f 100644
--- a/tests/test_csa_sparse_attn_b200.py
+++ b/tests/archive/test_csa_sparse_attn_b200.py
@@ -42,7 +42,7 @@ def rms(x, w, eps=1e-6):
     return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype)
 
 def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None):
-    from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear
+    from dsv4.layers.linear import Nvfp4Linear
     fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous()
     s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf
     s = s.permute(1,0).contiguous()
@@ -53,7 +53,7 @@ def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None):
             s32[:sp] *= g1/gs; s32[sp:] *= g2/gs; s = s32.to(torch.float8_e4m3fn)
     else:
         gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item()
-    r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
+    r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
     r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs]
     r.finalize_weights(); r._ensure_initialized()
     return r
diff --git a/tests/test_decode_attention_b200.py b/tests/archive/test_decode_attention_b200.py
similarity index 99%
rename from tests/test_decode_attention_b200.py
rename to tests/archive/test_decode_attention_b200.py
index c5d4da1f..a1339423 100644
--- a/tests/test_decode_attention_b200.py
+++ b/tests/archive/test_decode_attention_b200.py
@@ -61,7 +61,7 @@ def rms(x, w, eps=1e-6):
     return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype)
 
 def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None):
-    from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear
+    from dsv4.layers.linear import Nvfp4Linear
     fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous()
     s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf
     s = s.permute(1,0).contiguous()
@@ -72,7 +72,7 @@ def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None):
             s32[:sp] *= g1/gs; s32[sp:] *= g2/gs; s = s32.to(torch.float8_e4m3fn)
     else:
         gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item()
-    r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
+    r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
     r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs]
     r.finalize_weights(); r._ensure_initialized()
     return r
diff --git a/tests/test_decode_pipeline.py b/tests/archive/test_decode_pipeline.py
similarity index 98%
rename from tests/test_decode_pipeline.py
rename to tests/archive/test_decode_pipeline.py
index f562b127..459c84ef 100644
--- a/tests/test_decode_pipeline.py
+++ b/tests/archive/test_decode_pipeline.py
@@ -24,7 +24,7 @@ from vllm.model_executor.layers.csa_attention import (
     apply_gptj_rope,
     apply_inv_gptj_rope,
 )
-from cutedsl.native_swa_decode import native_swa_decode_attention
+from dsv4.ops.decode_swa import native_swa_decode_attention
 
 torch.manual_seed(42)
 torch.cuda.set_device(0)
diff --git a/tests/test_decode_vs_prefill_b200.py b/tests/archive/test_decode_vs_prefill_b200.py
similarity index 98%
rename from tests/test_decode_vs_prefill_b200.py
rename to tests/archive/test_decode_vs_prefill_b200.py
index 2d236d33..eec978ec 100644
--- a/tests/test_decode_vs_prefill_b200.py
+++ b/tests/archive/test_decode_vs_prefill_b200.py
@@ -43,7 +43,7 @@ def rms(x, w, eps=1e-6):
     return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype)
 
 def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None):
-    from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear
+    from dsv4.layers.linear import Nvfp4Linear
     fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous()
     s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf
     s = s.permute(1,0).contiguous()
@@ -54,7 +54,7 @@ def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None):
             s32[:sp] *= g1/gs; s32[sp:] *= g2/gs; s = s32.to(torch.float8_e4m3fn)
     else:
         gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item()
-    r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
+    r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
     r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs]
     r.finalize_weights(); r._ensure_initialized()
     return r
diff --git a/tests/test_diag_layout.py b/tests/archive/test_diag_layout.py
similarity index 100%
rename from tests/test_diag_layout.py
rename to tests/archive/test_diag_layout.py
diff --git a/tests/test_diag_permute.py b/tests/archive/test_diag_permute.py
similarity index 100%
rename from tests/test_diag_permute.py
rename to tests/archive/test_diag_permute.py
diff --git a/tests/test_diag_smem_layout.py b/tests/archive/test_diag_smem_layout.py
similarity index 100%
rename from tests/test_diag_smem_layout.py
rename to tests/archive/test_diag_smem_layout.py
diff --git a/tests/test_diag_v_mma128.py b/tests/archive/test_diag_v_mma128.py
similarity index 100%
rename from tests/test_diag_v_mma128.py
rename to tests/archive/test_diag_v_mma128.py
diff --git a/tests/test_diag_v_ones.py b/tests/archive/test_diag_v_ones.py
similarity index 100%
rename from tests/test_diag_v_ones.py
rename to tests/archive/test_diag_v_ones.py
diff --git a/tests/test_diag_v_truncid.py b/tests/archive/test_diag_v_truncid.py
similarity index 100%
rename from tests/test_diag_v_truncid.py
rename to tests/archive/test_diag_v_truncid.py
diff --git a/tests/test_e2e_decode_b200.py b/tests/archive/test_e2e_decode_b200.py
similarity index 99%
rename from tests/test_e2e_decode_b200.py
rename to tests/archive/test_e2e_decode_b200.py
index 79d96e68..f5096c8d 100644
--- a/tests/test_e2e_decode_b200.py
+++ b/tests/archive/test_e2e_decode_b200.py
@@ -45,7 +45,7 @@ def rms(x, w, eps=1e-6):
     return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype)
 
 def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None):
-    from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear
+    from dsv4.layers.linear import Nvfp4Linear
     fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous()
     s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf
     s = s.permute(1,0).contiguous()
@@ -56,7 +56,7 @@ def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None):
             s32[:sp] *= g1/gs; s32[sp:] *= g2/gs; s = s32.to(torch.float8_e4m3fn)
     else:
         gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item()
-    r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
+    r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
     r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs]
     r.finalize_weights(); r._ensure_initialized()
     return r
diff --git a/tests/test_error_pattern.py b/tests/archive/test_error_pattern.py
similarity index 100%
rename from tests/test_error_pattern.py
rename to tests/archive/test_error_pattern.py
diff --git a/tests/test_fmha_pipeline.py b/tests/archive/test_fmha_pipeline.py
similarity index 100%
rename from tests/test_fmha_pipeline.py
rename to tests/archive/test_fmha_pipeline.py
diff --git a/tests/test_fmha_v1.py b/tests/archive/test_fmha_v1.py
similarity index 100%
rename from tests/test_fmha_v1.py
rename to tests/archive/test_fmha_v1.py
diff --git a/tests/test_fmha_v2.py b/tests/archive/test_fmha_v2.py
similarity index 100%
rename from tests/test_fmha_v2.py
rename to tests/archive/test_fmha_v2.py
diff --git a/tests/test_fmha_v2_fixed.py b/tests/archive/test_fmha_v2_fixed.py
similarity index 100%
rename from tests/test_fmha_v2_fixed.py
rename to tests/archive/test_fmha_v2_fixed.py
diff --git a/tests/test_fmha_v3_debug.py b/tests/archive/test_fmha_v3_debug.py
similarity index 100%
rename from tests/test_fmha_v3_debug.py
rename to tests/archive/test_fmha_v3_debug.py
diff --git a/tests/test_full_layer_b200.py b/tests/archive/test_full_layer_b200.py
similarity index 97%
rename from tests/test_full_layer_b200.py
rename to tests/archive/test_full_layer_b200.py
index 858021d2..c4db4920 100644
--- a/tests/test_full_layer_b200.py
+++ b/tests/archive/test_full_layer_b200.py
@@ -57,7 +57,7 @@ def rms(x, w, eps=1e-6):
     return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype)
 
 def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None):
-    from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear
+    from dsv4.layers.linear import Nvfp4Linear
     fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous()
     s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf
     s = s.permute(1,0).contiguous()
@@ -68,7 +68,7 @@ def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None):
             s32[:sp] *= g1/gs; s32[sp:] *= g2/gs; s = s32.to(torch.float8_e4m3fn)
     else:
         gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item()
-    r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
+    r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
     r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs]
     r.finalize_weights(); r._ensure_initialized()
     return r
@@ -195,7 +195,7 @@ def main():
 
     # ── Shared expert ─────────────────────────────────────────────────
     print("\n--- Shared Expert: CuTeDSL vs BF16 ---")
-    from cutedsl.shared_expert_pipeline import CuTeDSLSharedExpertRunner
+    from dsv4.layers.shared_expert import Nvfp4SharedExpert
 
     sgw = G(f"{m}.shared_experts.gate_proj.weight"); sgsf = G(f"{m}.shared_experts.gate_proj.weight_scale")
     sggs = G(f"{m}.shared_experts.gate_proj.weight_scale_2").item()
@@ -211,7 +211,7 @@ def main():
         s32 = sgu_sf.float(); s32[:si] *= sggs/smgs; s32[si:] *= sugs/smgs
         sgu_sf = s32.to(torch.float8_e4m3fn)
 
-    ser = CuTeDSLSharedExpertRunner(hidden_size=H, intermediate_size=si, max_num_tokens=8192,
+    ser = Nvfp4SharedExpert(hidden_size=H, intermediate_size=si, max_num_tokens=8192,
                                      device=DEV, swiglu_limit=SL)
     ser.l1_fp4 = [sgu_w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous()]
     ser.l1_sf = [sgu_sf.permute(1,0).contiguous()]; ser.l1_gs = [smgs]
diff --git a/tests/test_full_layer_nan_b200.py b/tests/archive/test_full_layer_nan_b200.py
similarity index 98%
rename from tests/test_full_layer_nan_b200.py
rename to tests/archive/test_full_layer_nan_b200.py
index b52bc7b6..87aa5c67 100644
--- a/tests/test_full_layer_nan_b200.py
+++ b/tests/archive/test_full_layer_nan_b200.py
@@ -39,12 +39,12 @@ def rms(x, w, eps=1e-6):
     return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype)
 
 def make_runner(w, sf, gs_t, inf, outf):
-    from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear
+    from dsv4.layers.linear import Nvfp4Linear
     fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous()
     s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf
     s = s.permute(1,0).contiguous()
     gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item()
-    r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
+    r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
     r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs]
     r.finalize_weights(); r._ensure_initialized()
     return r
@@ -102,7 +102,7 @@ def causal_prefill_attention(q, kv, scale):
 
 def test_full_layer(layer_id, num_tokens=8, num_moe_experts=16):
     """Test a complete transformer layer with attention + MoE."""
-    from cutedsl.runner import CuTeDSLMoERunner
+    from dsv4.layers.moe import Nvfp4MoE
     
     torch.cuda.set_device(0)
     torch.manual_seed(42)
@@ -162,7 +162,7 @@ def test_full_layer(layer_id, num_tokens=8, num_moe_experts=16):
     # Free per-expert lists
     del gate_ws, gate_sfs, gate_gss, up_ws, up_sfs, up_gss, down_ws, down_sfs, down_gss
     
-    moe_runner = CuTeDSLMoERunner(
+    moe_runner = Nvfp4MoE(
         num_experts=num_moe_experts,
         hidden_size=H,
         intermediate_size=INTERMEDIATE,
diff --git a/tests/test_full_model_b200.py b/tests/archive/test_full_model_b200.py
similarity index 98%
rename from tests/test_full_model_b200.py
rename to tests/archive/test_full_model_b200.py
index 3d0aaed7..be4141f5 100644
--- a/tests/test_full_model_b200.py
+++ b/tests/archive/test_full_model_b200.py
@@ -56,7 +56,7 @@ def rms(x, w, eps=1e-6):
     return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype)
 
 def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None):
-    from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear
+    from dsv4.layers.linear import Nvfp4Linear
     fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous()
     s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf
     s = s.permute(1,0).contiguous()
@@ -67,7 +67,7 @@ def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None):
             s32[:sp] *= g1/gs; s32[sp:] *= g2/gs; s = s32.to(torch.float8_e4m3fn)
     else:
         gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item()
-    r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
+    r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
     r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs]
     r.finalize_weights(); r._ensure_initialized()
     return r
@@ -124,7 +124,7 @@ def bf16_causal_attention(q, kv, scale):
 
 def make_moe_runner(layer_id, wm, model_path):
     """Create CuTeDSL MoE runner for a layer."""
-    from cutedsl.runner import CuTeDSLMoERunner
+    from dsv4.layers.moe import Nvfp4MoE
     
     p = f"model.layers.{layer_id}.mlp"
     G = lambda k: P(k, wm, model_path).to(DEV)
@@ -147,7 +147,7 @@ def make_moe_runner(layer_id, wm, model_path):
     l2_sf = w2_sf.to(torch.float8_e4m3fn).permute(1,0).contiguous() if w2_sf.dtype != torch.float8_e4m3fn else w2_sf.permute(1,0).contiguous()
     
     intermediate_size = 3072  # per expert
-    runner = CuTeDSLMoERunner(
+    runner = Nvfp4MoE(
         num_experts=NUM_EXPERTS,
         hidden_size=H,
         intermediate_size=intermediate_size,
diff --git a/tests/test_inspect_types.py b/tests/archive/test_inspect_types.py
similarity index 100%
rename from tests/test_inspect_types.py
rename to tests/archive/test_inspect_types.py
diff --git a/tests/test_inv_rope.py b/tests/archive/test_inv_rope.py
similarity index 100%
rename from tests/test_inv_rope.py
rename to tests/archive/test_inv_rope.py
diff --git a/tests/test_kv_cache_b200.py b/tests/archive/test_kv_cache_b200.py
similarity index 98%
rename from tests/test_kv_cache_b200.py
rename to tests/archive/test_kv_cache_b200.py
index 29625338..c827d4ff 100644
--- a/tests/test_kv_cache_b200.py
+++ b/tests/archive/test_kv_cache_b200.py
@@ -62,7 +62,7 @@ def rms(x, w, eps=1e-6):
     return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype)
 
 def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None):
-    from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear
+    from dsv4.layers.linear import Nvfp4Linear
     fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous()
     s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf
     s = s.permute(1,0).contiguous()
@@ -73,7 +73,7 @@ def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None):
             s32[:sp] *= g1/gs; s32[sp:] *= g2/gs; s = s32.to(torch.float8_e4m3fn)
     else:
         gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item()
-    r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
+    r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
     r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs]
     r.finalize_weights(); r._ensure_initialized()
     return r
@@ -135,7 +135,9 @@ def kv_quantize_nvfp4(kv_bf16):
     kv_bf16: (T, HD) BF16
     Returns: (T, HD//2) fp4, (T, HD//16) sf, scalar gs
     """
-    from cutedsl.bridge import quantize_to_nvfp4
+    from dsv4.ops.quantize import (
+        quantize_to_nvfp4,
+    )
     return quantize_to_nvfp4(kv_bf16)
 
 
diff --git a/tests/test_layout_compare.py b/tests/archive/test_layout_compare.py
similarity index 100%
rename from tests/test_layout_compare.py
rename to tests/archive/test_layout_compare.py
diff --git a/tests/test_mma_si_only.py b/tests/archive/test_mma_si_only.py
similarity index 100%
rename from tests/test_mma_si_only.py
rename to tests/archive/test_mma_si_only.py
diff --git a/tests/test_mma_si_pv.py b/tests/archive/test_mma_si_pv.py
similarity index 100%
rename from tests/test_mma_si_pv.py
rename to tests/archive/test_mma_si_pv.py
diff --git a/tests/test_model_forward_b200.py b/tests/archive/test_model_forward_b200.py
similarity index 95%
rename from tests/test_model_forward_b200.py
rename to tests/archive/test_model_forward_b200.py
index 92313977..05a62d23 100644
--- a/tests/test_model_forward_b200.py
+++ b/tests/archive/test_model_forward_b200.py
@@ -50,7 +50,7 @@ def rms(x, w, eps=1e-6):
     return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype)
 
 def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None):
-    from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear
+    from dsv4.layers.linear import Nvfp4Linear
     fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous()
     s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf
     s = s.permute(1,0).contiguous()
@@ -61,7 +61,7 @@ def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None):
             s32[:sp] *= g1/gs; s32[sp:] *= g2/gs; s = s32.to(torch.float8_e4m3fn)
     else:
         gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item()
-    r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
+    r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
     r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs]
     r.finalize_weights(); r._ensure_initialized()
     return r
@@ -80,15 +80,17 @@ def main():
 
     # ── INSPECT: How does CuTeDSL runner.run() use gs? ────────────────
     print("\n--- INSPECTING CuTeDSL runner internals ---")
-    from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear
-    from cutedsl.bridge import quantize_activation_nvfp4
+    from dsv4.layers.linear import Nvfp4Linear
+    from dsv4.ops.quantize import (
+        quantize_activation_nvfp4,
+    )
 
     print("\n  quantize_activation_nvfp4 signature:")
     sig = inspect.signature(quantize_activation_nvfp4)
     print(f"    {sig}")
 
-    print("\n  CuTeDSLNvfp4Linear._run_impl source (key lines):")
-    src = inspect.getsource(CuTeDSLNvfp4Linear._run_impl)
+    print("\n  Nvfp4Linear._run_impl source (key lines):")
+    src = inspect.getsource(Nvfp4Linear._run_impl)
     for i, line in enumerate(src.split('\n')):
         stripped = line.strip()
         if any(kw in stripped for kw in ['global_scale', '_activation', 'quantize', 'return', 'def ']):
diff --git a/tests/test_moe_nan_b200.py b/tests/archive/test_moe_nan_b200.py
similarity index 98%
rename from tests/test_moe_nan_b200.py
rename to tests/archive/test_moe_nan_b200.py
index d25431a1..c2a5b8aa 100644
--- a/tests/test_moe_nan_b200.py
+++ b/tests/archive/test_moe_nan_b200.py
@@ -47,12 +47,12 @@ def rms(x, w, eps=1e-6):
     return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype)
 
 def make_runner(w, sf, gs_t, inf, outf):
-    from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear
+    from dsv4.layers.linear import Nvfp4Linear
     fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous()
     s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf
     s = s.permute(1,0).contiguous()
     gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item()
-    r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
+    r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
     r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs]
     r.finalize_weights(); r._ensure_initialized()
     return r
diff --git a/tests/test_moe_runner_nan_b200.py b/tests/archive/test_moe_runner_nan_b200.py
similarity index 94%
rename from tests/test_moe_runner_nan_b200.py
rename to tests/archive/test_moe_runner_nan_b200.py
index 9f279bcc..ae447a9b 100644
--- a/tests/test_moe_runner_nan_b200.py
+++ b/tests/archive/test_moe_runner_nan_b200.py
@@ -2,7 +2,7 @@
 """
 DeepSeek-V4 MoE Runner NaN Test
 
-Tests the CuTeDSLMoERunner (grouped GEMM path) with real weights.
+Tests the Nvfp4MoE (grouped GEMM path) with real weights.
 The single-expert tests pass — this test exercises the FULL MoE runner
 with routing, padding, grouped GEMM, and combine.
 
@@ -39,7 +39,7 @@ def rms(x, w, eps=1e-6):
 
 
 def pack_expert_weights(wm, G, layer_id=2, num_local_experts=16):
-    """Pack per-expert weights into stacked format for CuTeDSLMoERunner.
+    """Pack per-expert weights into stacked format for Nvfp4MoE.
     Only loads the first num_local_experts to fit in memory.
     """
     m = f"model.layers.{layer_id}.mlp"
@@ -77,7 +77,7 @@ def pack_expert_weights(wm, G, layer_id=2, num_local_experts=16):
     # Actually w13 = stacked gate+up, w2 = down
     # But our runner expects separate L1 (gate+up) and L2 (down)
     # The w13 format is (E, 2*intermediate, hidden//2) with gate and up interleaved
-    # For CuTeDSLMoERunner, we stack gate and up side-by-side
+    # For Nvfp4MoE, we stack gate and up side-by-side
     
     # Stack gate and up into w13 format: (E, 2*intermediate, hidden//2)
     w13_w = torch.cat([torch.stack(gate_ws), torch.stack(up_ws)], dim=1)  # (E, 6144, 3584)
@@ -92,8 +92,8 @@ def pack_expert_weights(wm, G, layer_id=2, num_local_experts=16):
 
 
 def test_moe_runner(layer_id=2):
-    """Test the CuTeDSLMoERunner with real weights."""
-    from cutedsl.runner import CuTeDSLMoERunner
+    """Test the Nvfp4MoE with real weights."""
+    from dsv4.layers.moe import Nvfp4MoE
     
     torch.cuda.set_device(0)
     torch.manual_seed(42)
@@ -118,7 +118,7 @@ def test_moe_runner(layer_id=2):
     intermediate_size = INTERMEDIATE  # 3072
     hidden_size = H  # 7168
     
-    runner = CuTeDSLMoERunner(
+    runner = Nvfp4MoE(
         num_experts=num_local_experts,
         hidden_size=hidden_size,
         intermediate_size=intermediate_size,
@@ -178,7 +178,7 @@ def test_moe_runner(layer_id=2):
 def main():
     print("=" * 70)
     print("  DeepSeek-V4 MoE Runner NaN Test")
-    print("  Tests CuTeDSLMoERunner (grouped GEMM) with real weights")
+    print("  Tests Nvfp4MoE (grouped GEMM) with real weights")
     print("=" * 70)
     
     test_moe_runner(layer_id=2)
diff --git a/tests/test_multilayer.py b/tests/archive/test_multilayer.py
similarity index 98%
rename from tests/test_multilayer.py
rename to tests/archive/test_multilayer.py
index e75e206e..829256ff 100644
--- a/tests/test_multilayer.py
+++ b/tests/archive/test_multilayer.py
@@ -58,8 +58,11 @@ def main():
     topk_weights = torch.ones(NUM_TOKENS, TOP_K, dtype=torch.float32, device=DEVICE) / TOP_K
     
     # Setup runner
-    from vllm.nvfp4_cutedsl import CuTeDSLMoERunner
-    from cutedsl.bridge import assemble_scales_3d_side, make_b_k_major
+    from vllm.nvfp4_cutedsl import Nvfp4MoE
+    from dsv4.ops.layouts import (
+        assemble_scales_3d_side,
+        make_b_k_major,
+    )
     
     l1_fp4, l1_sf, l1_gs_list = [], [], []
     l2_fp4, l2_sf, l2_gs_list = [], [], []
@@ -91,7 +94,7 @@ def main():
             l2_sf.append(torch.ones(INTERMEDIATE_SIZE//16, HIDDEN_SIZE, dtype=torch.float8_e4m3fn, device=DEVICE))
             l2_gs_list.append(1.0)
     
-    runner = CuTeDSLMoERunner(
+    runner = Nvfp4MoE(
         num_experts=NUM_EXPERTS, hidden_size=HIDDEN_SIZE,
         intermediate_size=INTERMEDIATE_SIZE, max_num_tokens=NUM_TOKENS,
         top_k=TOP_K, device=DEVICE,
diff --git a/tests/test_nvfp4_attention_b200.py b/tests/archive/test_nvfp4_attention_b200.py
similarity index 96%
rename from tests/test_nvfp4_attention_b200.py
rename to tests/archive/test_nvfp4_attention_b200.py
index c0bdf170..0fed2f2d 100644
--- a/tests/test_nvfp4_attention_b200.py
+++ b/tests/archive/test_nvfp4_attention_b200.py
@@ -48,7 +48,7 @@ def rms(x, w, eps=1e-6):
     return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype)
 
 def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None):
-    from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear
+    from dsv4.layers.linear import Nvfp4Linear
     fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous()
     s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf
     s = s.permute(1,0).contiguous()
@@ -59,7 +59,7 @@ def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None):
             s32[:sp] *= g1/gs; s32[sp:] *= g2/gs; s = s32.to(torch.float8_e4m3fn)
     else:
         gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item()
-    r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
+    r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
     r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs]
     r.finalize_weights(); r._ensure_initialized()
     return r
@@ -108,8 +108,11 @@ def nvfp4_qk_attention(q, kv, scale):
     This is a standard GEMM that CuTeDSL can handle.
     We quantize Q as the "activation" and K^T as the "weight".
     """
-    from cutedsl.bridge import quantize_to_nvfp4, quantize_activation_nvfp4
-    from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear
+    from dsv4.ops.quantize import (
+        quantize_to_nvfp4,
+        quantize_activation_nvfp4,
+    )
+    from dsv4.layers.linear import Nvfp4Linear
     
     T, NH, HD = q.shape
     device = q.device
@@ -123,7 +126,7 @@ def nvfp4_qk_attention(q, kv, scale):
     kv_T = kv.T.contiguous()  # (HD, T)
     w_fp4, w_sf, w_gs = quantize_to_nvfp4(kv_T)  # (HD//2, T), (HD//16, T), scalar
     
-    # Use CuTeDSLNvfp4Linear runner for Q×K^T GEMM
+    # Use Nvfp4Linear runner for Q×K^T GEMM
     # in_features=HD, out_features=T
     # Q is "activation" side, K^T is "weight" side
     M = T * NH
@@ -131,7 +134,7 @@ def nvfp4_qk_attention(q, kv, scale):
     N = T
     
     # Create runner for this specific (M, K, N) combination
-    runner = CuTeDSLNvfp4Linear(
+    runner = Nvfp4Linear(
         in_features=K, out_features=N, max_num_tokens=M, device=str(device)
     )
     
diff --git a/tests/test_nvfp4_attn_gemm_b200.py b/tests/archive/test_nvfp4_attn_gemm_b200.py
similarity index 96%
rename from tests/test_nvfp4_attn_gemm_b200.py
rename to tests/archive/test_nvfp4_attn_gemm_b200.py
index ebfac828..a47b6263 100644
--- a/tests/test_nvfp4_attn_gemm_b200.py
+++ b/tests/archive/test_nvfp4_attn_gemm_b200.py
@@ -64,7 +64,7 @@ def rms(x, w, eps=1e-6):
     return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype)
 
 def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None):
-    from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear
+    from dsv4.layers.linear import Nvfp4Linear
     fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous()
     s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf
     s = s.permute(1,0).contiguous()
@@ -75,7 +75,7 @@ def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None):
             s32[:sp] *= g1/gs; s32[sp:] *= g2/gs; s = s32.to(torch.float8_e4m3fn)
     else:
         gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item()
-    r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
+    r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
     r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs]
     r.finalize_weights(); r._ensure_initialized()
     return r
@@ -163,7 +163,7 @@ class NVFP4Attention:
         Returns:
             (T, NH, HD) attention output
         """
-        from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear
+        from dsv4.layers.linear import Nvfp4Linear
         
         T, NH, HD = q_bf16.shape
         device = q_bf16.device
@@ -175,13 +175,13 @@ class NVFP4Attention:
         # Q is "activation" (T*NH, HD), K^T is "weight" (T, HD)
         # GEMM: (T*NH, HD) × (HD, T) → (T*NH, T)
         # 
-        # We use CuTeDSLNvfp4Linear with in_features=HD, out_features=T
+        # We use Nvfp4Linear with in_features=HD, out_features=T
         # Q is the "hidden_states", K (kv) is the "weight" matrix
         
         # Create or get cached runner
         cache_key = (T, HD, NH)
         if self._runner is None or getattr(self, '_cache_key', None) != cache_key:
-            runner = CuTeDSLNvfp4Linear(
+            runner = Nvfp4Linear(
                 in_features=HD,
                 out_features=T,
                 max_num_tokens=T * NH,
@@ -191,15 +191,17 @@ class NVFP4Attention:
             # Set K as the weight: kv (T, HD) → treat as weight (N=T, K=HD)
             # quantize_to_nvfp4 quantizes along last dim (D=HD) as activation
             # For weight, we need (K, N) layout — but kv is (T, HD) = (N, K)
-            # CuTeDSLNvfp4Linear expects weight in (N, K//2) after permute
+            # Nvfp4Linear expects weight in (N, K//2) after permute
             
-            from cutedsl.bridge import quantize_to_nvfp4
+            from dsv4.ops.quantize import (
+                quantize_to_nvfp4,
+            )
             # Quantize KV as a 2D tensor: (T, HD)
             # quantize_to_nvfp4 works on last dim (D=HD), returns:
             #   (T, HD//2) fp4, (T, HD//16) sf, scalar gs
             kv_fp4, kv_sf, kv_gs = quantize_to_nvfp4(kv_bf16)
             
-            # For CuTeDSLNvfp4Linear, weight is (N, K_packed) = (T, HD//2)
+            # For Nvfp4Linear, weight is (N, K_packed) = (T, HD//2)
             # Our kv_fp4 is already (T, HD//2) — perfect!
             # sf needs to be (N, K_sf) = (T, HD//16) — already correct
             
diff --git a/tests/test_nvfp4_mapper.py b/tests/archive/test_nvfp4_mapper.py
similarity index 100%
rename from tests/test_nvfp4_mapper.py
rename to tests/archive/test_nvfp4_mapper.py
diff --git a/tests/test_o_projection.py b/tests/archive/test_o_projection.py
similarity index 100%
rename from tests/test_o_projection.py
rename to tests/archive/test_o_projection.py
diff --git a/tests/test_o_projection_b200.py b/tests/archive/test_o_projection_b200.py
similarity index 99%
rename from tests/test_o_projection_b200.py
rename to tests/archive/test_o_projection_b200.py
index dc51e85c..f65bcd83 100644
--- a/tests/test_o_projection_b200.py
+++ b/tests/archive/test_o_projection_b200.py
@@ -163,7 +163,7 @@ def dequant_nvfp4(packed_uint8, scale_e4m3, global_scale):
 def test_wo_b_nvfp4(z, wo_b_weight, wo_b_sf, wo_b_gs):
     """Test wo_b NVFP4 GEMM against BF16 reference."""
     sys.path.insert(0, "/root/nvfp4-megamoe-kernel")
-    from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear
+    from dsv4.layers.linear import Nvfp4Linear
 
     in_features = wo_b_weight.shape[1] * 2
     out_features = wo_b_weight.shape[0]
@@ -173,7 +173,7 @@ def test_wo_b_nvfp4(z, wo_b_weight, wo_b_sf, wo_b_gs):
     sf = [wo_b_sf.permute(1, 0).contiguous()]
     gs = [wo_b_gs]
     
-    runner = CuTeDSLNvfp4Linear(
+    runner = Nvfp4Linear(
         in_features=in_features,
         out_features=out_features,
         max_num_tokens=8192,
diff --git a/tests/test_packing_diag.py b/tests/archive/test_packing_diag.py
similarity index 100%
rename from tests/test_packing_diag.py
rename to tests/archive/test_packing_diag.py
diff --git a/tests/test_pair_swap.py b/tests/archive/test_pair_swap.py
similarity index 100%
rename from tests/test_pair_swap.py
rename to tests/archive/test_pair_swap.py
diff --git a/tests/test_pair_swap2.py b/tests/archive/test_pair_swap2.py
similarity index 100%
rename from tests/test_pair_swap2.py
rename to tests/archive/test_pair_swap2.py
diff --git a/tests/test_pipeline_real_weights.py b/tests/archive/test_pipeline_real_weights.py
similarity index 97%
rename from tests/test_pipeline_real_weights.py
rename to tests/archive/test_pipeline_real_weights.py
index cf2ce97c..517db7cb 100644
--- a/tests/test_pipeline_real_weights.py
+++ b/tests/archive/test_pipeline_real_weights.py
@@ -91,8 +91,11 @@ def main():
     print(f"BF16 ref: amax={ref_out.amax().item():.4f}")
     
     # CuTeDSL runner
-    from vllm.nvfp4_cutedsl import CuTeDSLMoERunner
-    from cutedsl.bridge import assemble_scales_3d_side, make_b_k_major
+    from vllm.nvfp4_cutedsl import Nvfp4MoE
+    from dsv4.ops.layouts import (
+        assemble_scales_3d_side,
+        make_b_k_major,
+    )
     
     l1_fp4, l1_sf, l1_gs = [], [], []
     l2_fp4, l2_sf, l2_gs = [], [], []
@@ -125,7 +128,7 @@ def main():
             l2_sf.append(torch.ones(INTERMEDIATE_SIZE//16, HIDDEN_SIZE, dtype=torch.float8_e4m3fn, device=DEVICE))
             l2_gs.append(1.0)
     
-    runner = CuTeDSLMoERunner(
+    runner = Nvfp4MoE(
         num_experts=NUM_EXPERTS, hidden_size=HIDDEN_SIZE,
         intermediate_size=INTERMEDIATE_SIZE, max_num_tokens=MAX_NUM_TOKENS,
         top_k=TOP_K, device=DEVICE,
diff --git a/tests/test_pv64.py b/tests/archive/test_pv64.py
similarity index 100%
rename from tests/test_pv64.py
rename to tests/archive/test_pv64.py
diff --git a/tests/test_pv64_fmha_v.py b/tests/archive/test_pv64_fmha_v.py
similarity index 100%
rename from tests/test_pv64_fmha_v.py
rename to tests/archive/test_pv64_fmha_v.py
diff --git a/tests/test_pv64_kmajor_v.py b/tests/archive/test_pv64_kmajor_v.py
similarity index 100%
rename from tests/test_pv64_kmajor_v.py
rename to tests/archive/test_pv64_kmajor_v.py
diff --git a/tests/test_pv64_no_softmax.py b/tests/archive/test_pv64_no_softmax.py
similarity index 100%
rename from tests/test_pv64_no_softmax.py
rename to tests/archive/test_pv64_no_softmax.py
diff --git a/tests/test_pv64_nosoftmax_fmha_v.py b/tests/archive/test_pv64_nosoftmax_fmha_v.py
similarity index 100%
rename from tests/test_pv64_nosoftmax_fmha_v.py
rename to tests/archive/test_pv64_nosoftmax_fmha_v.py
diff --git a/tests/test_pv_diag.py b/tests/archive/test_pv_diag.py
similarity index 100%
rename from tests/test_pv_diag.py
rename to tests/archive/test_pv_diag.py
diff --git a/tests/test_pv_mma_mn_major.py b/tests/archive/test_pv_mma_mn_major.py
similarity index 100%
rename from tests/test_pv_mma_mn_major.py
rename to tests/archive/test_pv_mma_mn_major.py
diff --git a/tests/test_quick_rand.py b/tests/archive/test_quick_rand.py
similarity index 100%
rename from tests/test_quick_rand.py
rename to tests/archive/test_quick_rand.py
diff --git a/tests/test_recast_minimal.py b/tests/archive/test_recast_minimal.py
similarity index 100%
rename from tests/test_recast_minimal.py
rename to tests/archive/test_recast_minimal.py
diff --git a/tests/test_rope_kv_b200.py b/tests/archive/test_rope_kv_b200.py
similarity index 96%
rename from tests/test_rope_kv_b200.py
rename to tests/archive/test_rope_kv_b200.py
index a2e5a73d..ceb53710 100644
--- a/tests/test_rope_kv_b200.py
+++ b/tests/archive/test_rope_kv_b200.py
@@ -39,12 +39,12 @@ def rms(x, w, eps=1e-6):
     return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype)
 
 def make_runner(w, sf, gs_t, inf, outf):
-    from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear
+    from dsv4.layers.linear import Nvfp4Linear
     fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous()
     s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf
     s = s.permute(1,0).contiguous()
     gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item()
-    r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
+    r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
     r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs]
     r.finalize_weights(); r._ensure_initialized()
     return r
@@ -132,7 +132,7 @@ with torch.no_grad():
     print(f"  Output: amax={o_with_rope.amax():.4f} NaN={torch.isnan(o_with_rope).any()}")
     
     # Test 3: Full pipeline
-    from cutedsl.csa_attention import apply_inv_gptj_rope
+    from dsv4.reference.csa_attention import apply_inv_gptj_rope
     o_inv = apply_inv_gptj_rope(o_with_rope, positions, cos_sin, NOPE, ROPE)
     o_grouped = o_inv.view(NT, OG, HPG * HD).permute(1, 0, 2)
     woa_3d = woa.view(OG, OL, HPG * HD)
diff --git a/tests/test_runner_vs_pipeline.py b/tests/archive/test_runner_vs_pipeline.py
similarity index 91%
rename from tests/test_runner_vs_pipeline.py
rename to tests/archive/test_runner_vs_pipeline.py
index 8ae00ad6..26f6d25a 100644
--- a/tests/test_runner_vs_pipeline.py
+++ b/tests/archive/test_runner_vs_pipeline.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 """
-Test A: Compare moe_pipeline output vs CuTeDSLMoERunner output.
+Test A: Compare moe_pipeline output vs Nvfp4MoE output.
 
 Uses the same weights and inputs. If they differ, the runner is broken.
 Runs on the B200 host (not inside Docker):
@@ -13,9 +13,17 @@ from safetensors import safe_open
 REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.insert(0, REPO_ROOT)
 
-from cutedsl.moe_pipeline import run_nvfp4_moe
-from vllm.nvfp4_cutedsl import CuTeDSLMoERunner
-from cutedsl.bridge import quantize_to_nvfp4, quantize_weight_to_nvfp4, make_b_k_major, assemble_scales_3d_side, compute_expert_offsets
+from dsv4.reference.moe_pipeline import run_nvfp4_moe
+from vllm.nvfp4_cutedsl import Nvfp4MoE
+from dsv4.ops.quantize import (
+    quantize_to_nvfp4,
+    quantize_weight_to_nvfp4,
+)
+from dsv4.ops.layouts import (
+    make_b_k_major,
+    assemble_scales_3d_side,
+    compute_expert_offsets,
+)
 
 MODEL_DIR = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4"
 DEVICE = "cuda"
@@ -126,9 +134,9 @@ def main():
     )
     print(f"    Pipeline: amax={pipeline_out.abs().max():.4f}, mean={pipeline_out.float().mean():.6f}")
 
-    # ── Path 2: CuTeDSLMoERunner with checkpoint input_scale (what vLLM uses) ──
-    print("\n  Running CuTeDSLMoERunner (checkpoint gs)...")
-    runner = CuTeDSLMoERunner(num_experts, hidden_size, intermediate_size, device=DEVICE)
+    # ── Path 2: Nvfp4MoE with checkpoint input_scale (what vLLM uses) ──
+    print("\n  Running Nvfp4MoE (checkpoint gs)...")
+    runner = Nvfp4MoE(num_experts, hidden_size, intermediate_size, device=DEVICE)
     runner.prepare_weights_direct(
         [w.clone() for w in weights['l1_fp4']],
         [w.clone() for w in weights['l1_sf']],
@@ -157,12 +165,12 @@ def main():
     ).item()
     print(f"    Cosine vs pipeline: {cos_ckpt:.6f}")
 
-    # ── Path 3: CuTeDSLMoERunner with dynamic gs ──
-    print("\n  Running CuTeDSLMoERunner (dynamic gs)...")
+    # ── Path 3: Nvfp4MoE with dynamic gs ──
+    print("\n  Running Nvfp4MoE (dynamic gs)...")
     # We can't use quantize_to_nvfp4 in the runner (cudagraph), but we can
     # compute the gs from the input and set it before calling run
     x_igs = (hidden_states.abs().max().item()) / (6.0 * 448.0)
-    runner2 = CuTeDSLMoERunner(num_experts, hidden_size, intermediate_size, device=DEVICE)
+    runner2 = Nvfp4MoE(num_experts, hidden_size, intermediate_size, device=DEVICE)
     runner2.prepare_weights_direct(
         [w.clone() for w in weights['l1_fp4']],
         [w.clone() for w in weights['l1_sf']],
diff --git a/tests/test_scale_assembly.py b/tests/archive/test_scale_assembly.py
similarity index 93%
rename from tests/test_scale_assembly.py
rename to tests/archive/test_scale_assembly.py
index 9623ad4e..7be030a7 100644
--- a/tests/test_scale_assembly.py
+++ b/tests/archive/test_scale_assembly.py
@@ -14,9 +14,14 @@ import os, sys, torch
 REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.insert(0, REPO_ROOT)
 
-from cutedsl.bridge import quantize_to_nvfp4, assemble_scales_2d_side
-from cutedsl.kernel.moe.torch_scaled_grouped_mm import pad_and_swizzle_single, ceil_div
-from vllm.nvfp4_cutedsl import CuTeDSLMoERunner
+from dsv4.ops.quantize import (
+    quantize_to_nvfp4,
+)
+from dsv4.ops.layouts import (
+    assemble_scales_2d_side,
+)
+from dsv4.kernels.gemm.grouped import pad_and_swizzle_single, ceil_div
+from vllm.nvfp4_cutedsl import Nvfp4MoE
 
 
 def test_scale_assembly():
@@ -27,7 +32,7 @@ def test_scale_assembly():
     intermediate_size = 3072
 
     # Create a runner just to use its _assemble_scales_cudagraph_safe
-    runner = CuTeDSLMoERunner(num_experts, hidden_size, intermediate_size, device=DEVICE)
+    runner = Nvfp4MoE(num_experts, hidden_size, intermediate_size, device=DEVICE)
     # Trigger _ensure_stacked and buffer allocation with dummy weights
     def rand_fp4(*shape):
         return torch.randint(0, 256, shape, dtype=torch.uint8, device=DEVICE).view(torch.float4_e2m1fn_x2)
diff --git a/tests/test_scale_debug.py b/tests/archive/test_scale_debug.py
similarity index 90%
rename from tests/test_scale_debug.py
rename to tests/archive/test_scale_debug.py
index 05738d58..794f3285 100644
--- a/tests/test_scale_debug.py
+++ b/tests/archive/test_scale_debug.py
@@ -3,15 +3,20 @@
 import os, sys, torch
 REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.insert(0, REPO_ROOT)
-from cutedsl.bridge import quantize_to_nvfp4, assemble_scales_2d_side
-from cutedsl.kernel.moe.torch_scaled_grouped_mm import pad_and_swizzle_single, ceil_div
-from vllm.nvfp4_cutedsl import CuTeDSLMoERunner
+from dsv4.ops.quantize import (
+    quantize_to_nvfp4,
+)
+from dsv4.ops.layouts import (
+    assemble_scales_2d_side,
+)
+from dsv4.kernels.gemm.grouped import pad_and_swizzle_single, ceil_div
+from vllm.nvfp4_cutedsl import Nvfp4MoE
 
 DEVICE = "cuda"
 num_experts = 3
 hidden_size = 7168
 
-runner = CuTeDSLMoERunner(num_experts, hidden_size, 3072, device=DEVICE)
+runner = Nvfp4MoE(num_experts, hidden_size, 3072, device=DEVICE)
 def rand_fp4(*shape):
     return torch.randint(0, 256, shape, dtype=torch.uint8, device=DEVICE).view(torch.float4_e2m1fn_x2)
 def rand_sf(*shape):
diff --git a/tests/test_shared_expert.py b/tests/archive/test_shared_expert.py
similarity index 97%
rename from tests/test_shared_expert.py
rename to tests/archive/test_shared_expert.py
index 1b12db0d..bf6856ab 100644
--- a/tests/test_shared_expert.py
+++ b/tests/archive/test_shared_expert.py
@@ -1,6 +1,6 @@
 """Standalone test: Shared expert using CuTeDSL dedicated runner.
 
-Tests the CuTeDSLSharedExpertRunner for the shared expert path.
+Tests the Nvfp4SharedExpert for the shared expert path.
 Compares against BF16 dequantized reference.
 
 Usage: python3 test_shared_expert.py
@@ -55,7 +55,7 @@ def main():
     torch.manual_seed(42)
 
     sys.path.insert(0, "/root/nvfp4-megamoe-kernel")
-    from cutedsl.shared_expert_pipeline import CuTeDSLSharedExpertRunner
+    from dsv4.layers.shared_expert import Nvfp4SharedExpert
 
     with open(os.path.join(MODEL_PATH, "model.safetensors.index.json")) as f:
         wm = json.load(f)["weight_map"]
@@ -101,7 +101,7 @@ def main():
     l2_sf = [down_sf.permute(1, 0).contiguous()]
 
     # Create runner
-    runner = CuTeDSLSharedExpertRunner(
+    runner = Nvfp4SharedExpert(
         hidden_size=HIDDEN_SIZE,
         intermediate_size=INTERMEDIATE_SIZE,
         max_num_tokens=8192,
diff --git a/tests/test_silu_step1.py b/tests/archive/test_silu_step1.py
similarity index 97%
rename from tests/test_silu_step1.py
rename to tests/archive/test_silu_step1.py
index 9f54eca6..8b11b35c 100644
--- a/tests/test_silu_step1.py
+++ b/tests/archive/test_silu_step1.py
@@ -13,12 +13,16 @@ import torch
 import sys
 sys.path.insert(0, '/root/dsv4-nvfp4-workspace/kernel')
 
-from cutedsl.bridge import (
+from dsv4.ops.quantize import (
     quantize_weight_to_nvfp4,
     quantize_activation_nvfp4,
+)
+from dsv4.ops.layouts import (
     make_b_k_major,
     assemble_scales_2d_side,
     assemble_scales_3d_side,
+)
+from dsv4.ops.gemm_runner import (
     run_nvfp4_grouped_gemm,
     warmup_compilation,
 )
diff --git a/tests/test_softmax_only.py b/tests/archive/test_softmax_only.py
similarity index 100%
rename from tests/test_softmax_only.py
rename to tests/archive/test_softmax_only.py
diff --git a/tests/test_softmax_store_debug.py b/tests/archive/test_softmax_store_debug.py
similarity index 100%
rename from tests/test_softmax_store_debug.py
rename to tests/archive/test_softmax_store_debug.py
diff --git a/tests/test_sparse_attn_b200.py b/tests/archive/test_sparse_attn_b200.py
similarity index 98%
rename from tests/test_sparse_attn_b200.py
rename to tests/archive/test_sparse_attn_b200.py
index 9cc15148..da7adc42 100644
--- a/tests/test_sparse_attn_b200.py
+++ b/tests/archive/test_sparse_attn_b200.py
@@ -54,7 +54,7 @@ def rms(x, w, eps=1e-6):
     return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype)
 
 def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None):
-    from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear
+    from dsv4.layers.linear import Nvfp4Linear
     fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous()
     s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf
     s = s.permute(1,0).contiguous()
@@ -65,7 +65,7 @@ def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None):
             s32[:sp] *= g1/gs; s32[sp:] *= g2/gs; s = s32.to(torch.float8_e4m3fn)
     else:
         gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item()
-    r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
+    r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
     r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs]
     r.finalize_weights(); r._ensure_initialized()
     return r
diff --git a/tests/test_sparse_decode.py b/tests/archive/test_sparse_decode.py
similarity index 97%
rename from tests/test_sparse_decode.py
rename to tests/archive/test_sparse_decode.py
index eb1d6d18..31964e4e 100644
--- a/tests/test_sparse_decode.py
+++ b/tests/archive/test_sparse_decode.py
@@ -1,6 +1,6 @@
 import sys, torch, torch.nn.functional as F
 sys.path.insert(0, "/root/dsv4-nvfp4-workspace/kernel")
-from cutedsl.native_sparse_decode import native_sparse_decode_attention
+from dsv4.ops.decode_sparse import native_sparse_decode_attention
 
 torch.manual_seed(42)
 torch.cuda.set_device(0)
diff --git a/tests/test_stage_a_copy.py b/tests/archive/test_stage_a_copy.py
similarity index 100%
rename from tests/test_stage_a_copy.py
rename to tests/archive/test_stage_a_copy.py
diff --git a/tests/test_stage_a_minimal.py b/tests/archive/test_stage_a_minimal.py
similarity index 100%
rename from tests/test_stage_a_minimal.py
rename to tests/archive/test_stage_a_minimal.py
diff --git a/tests/test_stage_a_pv_created.py b/tests/archive/test_stage_a_pv_created.py
similarity index 100%
rename from tests/test_stage_a_pv_created.py
rename to tests/archive/test_stage_a_pv_created.py
diff --git a/tests/test_stage_a_pv_param.py b/tests/archive/test_stage_a_pv_param.py
similarity index 100%
rename from tests/test_stage_a_pv_param.py
rename to tests/archive/test_stage_a_pv_param.py
diff --git a/tests/test_stage_a_qk.py b/tests/archive/test_stage_a_qk.py
similarity index 100%
rename from tests/test_stage_a_qk.py
rename to tests/archive/test_stage_a_qk.py
diff --git a/tests/test_stage_a_v2.py b/tests/archive/test_stage_a_v2.py
similarity index 100%
rename from tests/test_stage_a_v2.py
rename to tests/archive/test_stage_a_v2.py
diff --git a/tests/test_stage_a_with_pv_mma.py b/tests/archive/test_stage_a_with_pv_mma.py
similarity index 100%
rename from tests/test_stage_a_with_pv_mma.py
rename to tests/archive/test_stage_a_with_pv_mma.py
diff --git a/tests/test_stage_b_afrag.py b/tests/archive/test_stage_b_afrag.py
similarity index 100%
rename from tests/test_stage_b_afrag.py
rename to tests/archive/test_stage_b_afrag.py
diff --git a/tests/test_stage_b_afrag2.py b/tests/archive/test_stage_b_afrag2.py
similarity index 100%
rename from tests/test_stage_b_afrag2.py
rename to tests/archive/test_stage_b_afrag2.py
diff --git a/tests/test_stage_b_debug.py b/tests/archive/test_stage_b_debug.py
similarity index 100%
rename from tests/test_stage_b_debug.py
rename to tests/archive/test_stage_b_debug.py
diff --git a/tests/test_stage_b_debug2.py b/tests/archive/test_stage_b_debug2.py
similarity index 100%
rename from tests/test_stage_b_debug2.py
rename to tests/archive/test_stage_b_debug2.py
diff --git a/tests/test_stage_b_debug3.py b/tests/archive/test_stage_b_debug3.py
similarity index 100%
rename from tests/test_stage_b_debug3.py
rename to tests/archive/test_stage_b_debug3.py
diff --git a/tests/test_stage_b_debug4.py b/tests/archive/test_stage_b_debug4.py
similarity index 100%
rename from tests/test_stage_b_debug4.py
rename to tests/archive/test_stage_b_debug4.py
diff --git a/tests/test_stage_b_diag.py b/tests/archive/test_stage_b_diag.py
similarity index 100%
rename from tests/test_stage_b_diag.py
rename to tests/archive/test_stage_b_diag.py
diff --git a/tests/test_stage_b_final.py b/tests/archive/test_stage_b_final.py
similarity index 100%
rename from tests/test_stage_b_final.py
rename to tests/archive/test_stage_b_final.py
diff --git a/tests/test_stage_b_identity.py b/tests/archive/test_stage_b_identity.py
similarity index 100%
rename from tests/test_stage_b_identity.py
rename to tests/archive/test_stage_b_identity.py
diff --git a/tests/test_stage_b_minimal.py b/tests/archive/test_stage_b_minimal.py
similarity index 100%
rename from tests/test_stage_b_minimal.py
rename to tests/archive/test_stage_b_minimal.py
diff --git a/tests/test_stage_b_ntile_v1.py b/tests/archive/test_stage_b_ntile_v1.py
similarity index 100%
rename from tests/test_stage_b_ntile_v1.py
rename to tests/archive/test_stage_b_ntile_v1.py
diff --git a/tests/test_stage_b_ntile_v3.py b/tests/archive/test_stage_b_ntile_v3.py
similarity index 100%
rename from tests/test_stage_b_ntile_v3.py
rename to tests/archive/test_stage_b_ntile_v3.py
diff --git a/tests/test_stage_b_ntile_v5.py b/tests/archive/test_stage_b_ntile_v5.py
similarity index 100%
rename from tests/test_stage_b_ntile_v5.py
rename to tests/archive/test_stage_b_ntile_v5.py
diff --git a/tests/test_stage_b_ntile_v6.py b/tests/archive/test_stage_b_ntile_v6.py
similarity index 100%
rename from tests/test_stage_b_ntile_v6.py
rename to tests/archive/test_stage_b_ntile_v6.py
diff --git a/tests/test_stage_b_ntile_v7.py b/tests/archive/test_stage_b_ntile_v7.py
similarity index 100%
rename from tests/test_stage_b_ntile_v7.py
rename to tests/archive/test_stage_b_ntile_v7.py
diff --git a/tests/test_stage_b_ntile_v8.py b/tests/archive/test_stage_b_ntile_v8.py
similarity index 100%
rename from tests/test_stage_b_ntile_v8.py
rename to tests/archive/test_stage_b_ntile_v8.py
diff --git a/tests/test_stage_b_pipeline_only.py b/tests/archive/test_stage_b_pipeline_only.py
similarity index 100%
rename from tests/test_stage_b_pipeline_only.py
rename to tests/archive/test_stage_b_pipeline_only.py
diff --git a/tests/test_stage_b_v1.py b/tests/archive/test_stage_b_v1.py
similarity index 100%
rename from tests/test_stage_b_v1.py
rename to tests/archive/test_stage_b_v1.py
diff --git a/tests/test_stage_b_v10.py b/tests/archive/test_stage_b_v10.py
similarity index 100%
rename from tests/test_stage_b_v10.py
rename to tests/archive/test_stage_b_v10.py
diff --git a/tests/test_stage_b_v11.py b/tests/archive/test_stage_b_v11.py
similarity index 100%
rename from tests/test_stage_b_v11.py
rename to tests/archive/test_stage_b_v11.py
diff --git a/tests/test_stage_b_v11b.py b/tests/archive/test_stage_b_v11b.py
similarity index 100%
rename from tests/test_stage_b_v11b.py
rename to tests/archive/test_stage_b_v11b.py
diff --git a/tests/test_stage_b_v12.py b/tests/archive/test_stage_b_v12.py
similarity index 100%
rename from tests/test_stage_b_v12.py
rename to tests/archive/test_stage_b_v12.py
diff --git a/tests/test_stage_b_v13.py b/tests/archive/test_stage_b_v13.py
similarity index 100%
rename from tests/test_stage_b_v13.py
rename to tests/archive/test_stage_b_v13.py
diff --git a/tests/test_stage_b_v14.py b/tests/archive/test_stage_b_v14.py
similarity index 100%
rename from tests/test_stage_b_v14.py
rename to tests/archive/test_stage_b_v14.py
diff --git a/tests/test_stage_b_v16.py b/tests/archive/test_stage_b_v16.py
similarity index 100%
rename from tests/test_stage_b_v16.py
rename to tests/archive/test_stage_b_v16.py
diff --git a/tests/test_stage_b_v17.py b/tests/archive/test_stage_b_v17.py
similarity index 100%
rename from tests/test_stage_b_v17.py
rename to tests/archive/test_stage_b_v17.py
diff --git a/tests/test_stage_b_v18.py b/tests/archive/test_stage_b_v18.py
similarity index 100%
rename from tests/test_stage_b_v18.py
rename to tests/archive/test_stage_b_v18.py
diff --git a/tests/test_stage_b_v19.py b/tests/archive/test_stage_b_v19.py
similarity index 100%
rename from tests/test_stage_b_v19.py
rename to tests/archive/test_stage_b_v19.py
diff --git a/tests/test_stage_b_v2.py b/tests/archive/test_stage_b_v2.py
similarity index 100%
rename from tests/test_stage_b_v2.py
rename to tests/archive/test_stage_b_v2.py
diff --git a/tests/test_stage_b_v20.py b/tests/archive/test_stage_b_v20.py
similarity index 100%
rename from tests/test_stage_b_v20.py
rename to tests/archive/test_stage_b_v20.py
diff --git a/tests/test_stage_b_v22.py b/tests/archive/test_stage_b_v22.py
similarity index 100%
rename from tests/test_stage_b_v22.py
rename to tests/archive/test_stage_b_v22.py
diff --git a/tests/test_stage_b_v22_bug1fix.py b/tests/archive/test_stage_b_v22_bug1fix.py
similarity index 100%
rename from tests/test_stage_b_v22_bug1fix.py
rename to tests/archive/test_stage_b_v22_bug1fix.py
diff --git a/tests/test_stage_b_v23.py b/tests/archive/test_stage_b_v23.py
similarity index 100%
rename from tests/test_stage_b_v23.py
rename to tests/archive/test_stage_b_v23.py
diff --git a/tests/test_stage_b_v24.py b/tests/archive/test_stage_b_v24.py
similarity index 100%
rename from tests/test_stage_b_v24.py
rename to tests/archive/test_stage_b_v24.py
diff --git a/tests/test_stage_b_v25.py b/tests/archive/test_stage_b_v25.py
similarity index 100%
rename from tests/test_stage_b_v25.py
rename to tests/archive/test_stage_b_v25.py
diff --git a/tests/test_stage_b_v26.py b/tests/archive/test_stage_b_v26.py
similarity index 100%
rename from tests/test_stage_b_v26.py
rename to tests/archive/test_stage_b_v26.py
diff --git a/tests/test_stage_b_v27.py b/tests/archive/test_stage_b_v27.py
similarity index 100%
rename from tests/test_stage_b_v27.py
rename to tests/archive/test_stage_b_v27.py
diff --git a/tests/test_stage_b_v28.py b/tests/archive/test_stage_b_v28.py
similarity index 100%
rename from tests/test_stage_b_v28.py
rename to tests/archive/test_stage_b_v28.py
diff --git a/tests/test_stage_b_v29.py b/tests/archive/test_stage_b_v29.py
similarity index 100%
rename from tests/test_stage_b_v29.py
rename to tests/archive/test_stage_b_v29.py
diff --git a/tests/test_stage_b_v3.py b/tests/archive/test_stage_b_v3.py
similarity index 100%
rename from tests/test_stage_b_v3.py
rename to tests/archive/test_stage_b_v3.py
diff --git a/tests/test_stage_b_v30.py b/tests/archive/test_stage_b_v30.py
similarity index 100%
rename from tests/test_stage_b_v30.py
rename to tests/archive/test_stage_b_v30.py
diff --git a/tests/test_stage_b_v4.py b/tests/archive/test_stage_b_v4.py
similarity index 100%
rename from tests/test_stage_b_v4.py
rename to tests/archive/test_stage_b_v4.py
diff --git a/tests/test_stage_b_v5.py b/tests/archive/test_stage_b_v5.py
similarity index 100%
rename from tests/test_stage_b_v5.py
rename to tests/archive/test_stage_b_v5.py
diff --git a/tests/test_stage_b_v6.py b/tests/archive/test_stage_b_v6.py
similarity index 100%
rename from tests/test_stage_b_v6.py
rename to tests/archive/test_stage_b_v6.py
diff --git a/tests/test_stage_b_v7.py b/tests/archive/test_stage_b_v7.py
similarity index 100%
rename from tests/test_stage_b_v7.py
rename to tests/archive/test_stage_b_v7.py
diff --git a/tests/test_stage_b_v7_rep128.py b/tests/archive/test_stage_b_v7_rep128.py
similarity index 100%
rename from tests/test_stage_b_v7_rep128.py
rename to tests/archive/test_stage_b_v7_rep128.py
diff --git a/tests/test_stage_b_v7_rep16.py b/tests/archive/test_stage_b_v7_rep16.py
similarity index 100%
rename from tests/test_stage_b_v7_rep16.py
rename to tests/archive/test_stage_b_v7_rep16.py
diff --git a/tests/test_stage_b_v7_rep64.py b/tests/archive/test_stage_b_v7_rep64.py
similarity index 100%
rename from tests/test_stage_b_v7_rep64.py
rename to tests/archive/test_stage_b_v7_rep64.py
diff --git a/tests/test_stage_b_v7_rep8.py b/tests/archive/test_stage_b_v7_rep8.py
similarity index 100%
rename from tests/test_stage_b_v7_rep8.py
rename to tests/archive/test_stage_b_v7_rep8.py
diff --git a/tests/test_stage_b_v8.py b/tests/archive/test_stage_b_v8.py
similarity index 100%
rename from tests/test_stage_b_v8.py
rename to tests/archive/test_stage_b_v8.py
diff --git a/tests/test_stage_b_v8b.py b/tests/archive/test_stage_b_v8b.py
similarity index 100%
rename from tests/test_stage_b_v8b.py
rename to tests/archive/test_stage_b_v8b.py
diff --git a/tests/test_stage_b_v9.py b/tests/archive/test_stage_b_v9.py
similarity index 100%
rename from tests/test_stage_b_v9.py
rename to tests/archive/test_stage_b_v9.py
diff --git a/tests/test_step2_subtile.py b/tests/archive/test_step2_subtile.py
similarity index 97%
rename from tests/test_step2_subtile.py
rename to tests/archive/test_step2_subtile.py
index 9fe5d435..2c76e1f5 100644
--- a/tests/test_step2_subtile.py
+++ b/tests/archive/test_step2_subtile.py
@@ -11,12 +11,16 @@ import torch
 import sys
 sys.path.insert(0, '/root/dsv4-nvfp4-workspace/kernel')
 
-from cutedsl.bridge import (
+from dsv4.ops.quantize import (
     quantize_weight_to_nvfp4,
     quantize_activation_nvfp4,
+)
+from dsv4.ops.layouts import (
     make_b_k_major,
     assemble_scales_2d_side,
     assemble_scales_3d_side,
+)
+from dsv4.ops.gemm_runner import (
     run_nvfp4_grouped_gemm,
     run_fused_swiglu_grouped_gemm,
     warmup_compilation,
diff --git a/tests/test_step2_subtile_v2.py b/tests/archive/test_step2_subtile_v2.py
similarity index 97%
rename from tests/test_step2_subtile_v2.py
rename to tests/archive/test_step2_subtile_v2.py
index 3c40c26d..a91015a6 100644
--- a/tests/test_step2_subtile_v2.py
+++ b/tests/archive/test_step2_subtile_v2.py
@@ -12,12 +12,16 @@ import torch
 import sys
 sys.path.insert(0, '/root/dsv4-nvfp4-workspace/kernel')
 
-from cutedsl.bridge import (
+from dsv4.ops.quantize import (
     quantize_weight_to_nvfp4,
     quantize_activation_nvfp4,
+)
+from dsv4.ops.layouts import (
     make_b_k_major,
     assemble_scales_2d_side,
     assemble_scales_3d_side,
+)
+from dsv4.ops.gemm_runner import (
     run_nvfp4_grouped_gemm,
     run_fused_swiglu_grouped_gemm,
     warmup_compilation,
diff --git a/tests/test_store_verify.py b/tests/archive/test_store_verify.py
similarity index 100%
rename from tests/test_store_verify.py
rename to tests/archive/test_store_verify.py
diff --git a/tests/test_store_verify2.py b/tests/archive/test_store_verify2.py
similarity index 100%
rename from tests/test_store_verify2.py
rename to tests/archive/test_store_verify2.py
diff --git a/tests/test_tmem_addressing.py b/tests/archive/test_tmem_addressing.py
similarity index 100%
rename from tests/test_tmem_addressing.py
rename to tests/archive/test_tmem_addressing.py
diff --git a/tests/test_tmem_col2.py b/tests/archive/test_tmem_col2.py
similarity index 100%
rename from tests/test_tmem_col2.py
rename to tests/archive/test_tmem_col2.py
diff --git a/tests/test_tmem_col3.py b/tests/archive/test_tmem_col3.py
similarity index 100%
rename from tests/test_tmem_col3.py
rename to tests/archive/test_tmem_col3.py
diff --git a/tests/test_tmem_col4.py b/tests/archive/test_tmem_col4.py
similarity index 100%
rename from tests/test_tmem_col4.py
rename to tests/archive/test_tmem_col4.py
diff --git a/tests/test_tmem_col5.py b/tests/archive/test_tmem_col5.py
similarity index 100%
rename from tests/test_tmem_col5.py
rename to tests/archive/test_tmem_col5.py
diff --git a/tests/test_tmem_col5_16.py b/tests/archive/test_tmem_col5_16.py
similarity index 100%
rename from tests/test_tmem_col5_16.py
rename to tests/archive/test_tmem_col5_16.py
diff --git a/tests/test_tmem_col5_32.py b/tests/archive/test_tmem_col5_32.py
similarity index 100%
rename from tests/test_tmem_col5_32.py
rename to tests/archive/test_tmem_col5_32.py
diff --git a/tests/test_tmem_col_offset.py b/tests/archive/test_tmem_col_offset.py
similarity index 100%
rename from tests/test_tmem_col_offset.py
rename to tests/archive/test_tmem_col_offset.py
diff --git a/tests/test_tmem_copy_roundtrip.py b/tests/archive/test_tmem_copy_roundtrip.py
similarity index 100%
rename from tests/test_tmem_copy_roundtrip.py
rename to tests/archive/test_tmem_copy_roundtrip.py
diff --git a/tests/test_tmem_debug.py b/tests/archive/test_tmem_debug.py
similarity index 100%
rename from tests/test_tmem_debug.py
rename to tests/archive/test_tmem_debug.py
diff --git a/tests/test_tmem_debug2.py b/tests/archive/test_tmem_debug2.py
similarity index 100%
rename from tests/test_tmem_debug2.py
rename to tests/archive/test_tmem_debug2.py
diff --git a/tests/test_tmem_fp32_roundtrip.py b/tests/archive/test_tmem_fp32_roundtrip.py
similarity index 100%
rename from tests/test_tmem_fp32_roundtrip.py
rename to tests/archive/test_tmem_fp32_roundtrip.py
diff --git a/tests/test_tmem_layout_diag.py b/tests/archive/test_tmem_layout_diag.py
similarity index 100%
rename from tests/test_tmem_layout_diag.py
rename to tests/archive/test_tmem_layout_diag.py
diff --git a/tests/test_tmem_pure_fp32.py b/tests/archive/test_tmem_pure_fp32.py
similarity index 100%
rename from tests/test_tmem_pure_fp32.py
rename to tests/archive/test_tmem_pure_fp32.py
diff --git a/tests/test_uniform_fp4.py b/tests/archive/test_uniform_fp4.py
similarity index 100%
rename from tests/test_uniform_fp4.py
rename to tests/archive/test_uniform_fp4.py
diff --git a/tests/test_v28c_noepi.py b/tests/archive/test_v28c_noepi.py
similarity index 100%
rename from tests/test_v28c_noepi.py
rename to tests/archive/test_v28c_noepi.py
diff --git a/tests/test_v4_attention_b200.py b/tests/archive/test_v4_attention_b200.py
similarity index 98%
rename from tests/test_v4_attention_b200.py
rename to tests/archive/test_v4_attention_b200.py
index f3b453b7..a4024679 100644
--- a/tests/test_v4_attention_b200.py
+++ b/tests/archive/test_v4_attention_b200.py
@@ -57,7 +57,7 @@ def rms(x, w, eps=1e-6):
     return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype)
 
 def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None):
-    from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear
+    from dsv4.layers.linear import Nvfp4Linear
     fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous()
     s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf
     s = s.permute(1,0).contiguous()
@@ -68,7 +68,7 @@ def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None):
             s32[:sp] *= g1/gs; s32[sp:] *= g2/gs; s = s32.to(torch.float8_e4m3fn)
     else:
         gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item()
-    r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
+    r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
     r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs]
     r.finalize_weights(); r._ensure_initialized()
     return r
diff --git a/tests/test_v_mode_fix.py b/tests/archive/test_v_mode_fix.py
similarity index 100%
rename from tests/test_v_mode_fix.py
rename to tests/archive/test_v_mode_fix.py
diff --git a/tests/test_vllm_codepaths_b200.py b/tests/archive/test_vllm_codepaths_b200.py
similarity index 98%
rename from tests/test_vllm_codepaths_b200.py
rename to tests/archive/test_vllm_codepaths_b200.py
index 346d3aa1..505a193e 100644
--- a/tests/test_vllm_codepaths_b200.py
+++ b/tests/archive/test_vllm_codepaths_b200.py
@@ -34,12 +34,12 @@ def rms(x, w, eps=1e-6):
     return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype)
 
 def make_runner(w, sf, gs_t, inf, outf):
-    from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear
+    from dsv4.layers.linear import Nvfp4Linear
     fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous()
     s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf
     s = s.permute(1,0).contiguous()
     gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item()
-    r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
+    r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device))
     r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs]
     r.finalize_weights(); r._ensure_initialized()
     return r
@@ -97,7 +97,7 @@ def causal_prefill_attention(q, kv, scale):
 
 def main():
     """Test the exact csa_attention.py code paths used in the container."""
-    from cutedsl.blackwell_attention import (
+    from dsv4.reference.attention import (
         apply_gptj_rope,
         apply_inv_gptj_rope,
     )
diff --git a/tests/test_vsmem_diag.py b/tests/archive/test_vsmem_diag.py
similarity index 100%
rename from tests/test_vsmem_diag.py
rename to tests/archive/test_vsmem_diag.py
diff --git a/tests/test_warmup_gs.py b/tests/archive/test_warmup_gs.py
similarity index 93%
rename from tests/test_warmup_gs.py
rename to tests/archive/test_warmup_gs.py
index b46a7a6b..a4517d53 100644
--- a/tests/test_warmup_gs.py
+++ b/tests/archive/test_warmup_gs.py
@@ -14,13 +14,21 @@ Usage (on B200):
 import torch, sys, os, json
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-from cutedsl.bridge import (
-    quantize_to_nvfp4, quantize_activation_nvfp4,
-    make_b_k_major, assemble_scales_2d_side, assemble_scales_3d_side,
-    run_nvfp4_grouped_gemm, compute_expert_offsets,
+from dsv4.ops.quantize import (
+    quantize_to_nvfp4,
+    quantize_activation_nvfp4,
 )
-from cutedsl.moe_pipeline import run_nvfp4_moe
-from vllm.nvfp4_cutedsl import CuTeDSLMoERunner
+from dsv4.ops.layouts import (
+    make_b_k_major,
+    assemble_scales_2d_side,
+    assemble_scales_3d_side,
+    compute_expert_offsets,
+)
+from dsv4.ops.gemm_runner import (
+    run_nvfp4_grouped_gemm,
+)
+from dsv4.reference.moe_pipeline import run_nvfp4_moe
+from vllm.nvfp4_cutedsl import Nvfp4MoE
 from safetensors import safe_open
 
 MODEL_DIR = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4"
@@ -148,7 +156,7 @@ def main():
 
     # ── Test 1: Runner with warmup gs (no safety margin) ──
     print("\n--- Test 1: Warmup gs, no safety margin ---")
-    runner = CuTeDSLMoERunner(num_experts, hidden_size, intermediate_size, device=DEVICE)
+    runner = Nvfp4MoE(num_experts, hidden_size, intermediate_size, device=DEVICE)
     runner.prepare_weights_direct(
         [w.clone() for w in l1_fp4], [w.clone() for w in l1_sf], list(l1_gs),
         [w.clone() for w in l2_fp4], [w.clone() for w in l2_sf], list(l2_gs),
@@ -166,7 +174,7 @@ def main():
 
     # ── Test 2: Runner with warmup gs + safety margins ──
     for safety in [1.0, 1.1, 1.2, 1.5, 2.0]:
-        runner2 = CuTeDSLMoERunner(num_experts, hidden_size, intermediate_size, device=DEVICE)
+        runner2 = Nvfp4MoE(num_experts, hidden_size, intermediate_size, device=DEVICE)
         runner2.prepare_weights_direct(
             [w.clone() for w in l1_fp4], [w.clone() for w in l1_sf], list(l1_gs),
             [w.clone() for w in l2_fp4], [w.clone() for w in l2_sf], list(l2_gs),
diff --git a/tests/test_wo_a.py b/tests/archive/test_wo_a.py
similarity index 97%
rename from tests/test_wo_a.py
rename to tests/archive/test_wo_a.py
index 2fb21062..71f086d9 100644
--- a/tests/test_wo_a.py
+++ b/tests/archive/test_wo_a.py
@@ -18,8 +18,8 @@ import torch.nn.functional as F
 # Add repo root to path
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-from cutedsl.inverse_rope import inverse_rope_bf16
-from cutedsl.wo_a_grouped_linear import CuTeDSLNvfp4WoA
+from dsv4.ops.rope import inverse_rope_bf16
+from dsv4.layers.grouped_linear import Nvfp4GroupedLinear
 
 DEVICE = "cuda:0"
 
@@ -112,7 +112,7 @@ def test_wo_a_grouped_linear():
         z_ref[:, g, :] = o_grouped[:, g, :] @ wo_a_weight[g]
 
     # CuTeDSL NVFP4 runner
-    runner = CuTeDSLNvfp4WoA(
+    runner = Nvfp4GroupedLinear(
         n_local_groups=N_LOCAL_GROUPS,
         heads_per_group=HEADS_PER_GROUP,
         head_dim=HEAD_DIM,
diff --git a/tests/test_wo_a_bmm.py b/tests/archive/test_wo_a_bmm.py
similarity index 100%
rename from tests/test_wo_a_bmm.py
rename to tests/archive/test_wo_a_bmm.py
diff --git a/tests/e2e/__init__.py b/tests/e2e/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/cudagraph_test.py b/tests/unit/cudagraph_test.py
similarity index 98%
rename from tests/cudagraph_test.py
rename to tests/unit/cudagraph_test.py
index 3d3fc55c..2cc2497d 100644
--- a/tests/cudagraph_test.py
+++ b/tests/unit/cudagraph_test.py
@@ -17,7 +17,7 @@ import contextlib
 REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.insert(0, REPO_ROOT)
 
-from vllm.nvfp4_cutedsl import CuTeDSLMoERunner
+from vllm.nvfp4_cutedsl import Nvfp4MoE
 
 
 class CUDASyncDetector:
@@ -103,7 +103,7 @@ class CUDASyncDetector:
 
 def make_dummy_runner(num_experts=32, hidden_size=7168, intermediate_size=3072, device="cuda"):
     """Create a CuTeDSL runner with dummy weights for testing."""
-    runner = CuTeDSLMoERunner(num_experts, hidden_size, intermediate_size, device=device)
+    runner = Nvfp4MoE(num_experts, hidden_size, intermediate_size, device=device)
     
     # Create minimal dummy weights
     # Create minimal dummy weights (uint8 → view as float4)
diff --git a/tests/layertest.py b/tests/unit/layertest.py
similarity index 99%
rename from tests/layertest.py
rename to tests/unit/layertest.py
index 7ecd3a3d..eef37d18 100644
--- a/tests/layertest.py
+++ b/tests/unit/layertest.py
@@ -15,7 +15,7 @@ from safetensors import safe_open
 REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.insert(0, REPO_ROOT)
 
-from cutedsl.moe_pipeline import (
+from dsv4.reference.moe_pipeline import (
     run_nvfp4_moe,
     run_nvfp4_moe_fused,
 )
diff --git a/tests/test_128_128_vdiag.py b/tests/unit/test_128_128_vdiag.py
similarity index 100%
rename from tests/test_128_128_vdiag.py
rename to tests/unit/test_128_128_vdiag.py
diff --git a/tests/test_compile_custom_op.py b/tests/unit/test_compile_custom_op.py
similarity index 97%
rename from tests/test_compile_custom_op.py
rename to tests/unit/test_compile_custom_op.py
index d4580c38..46b033d1 100644
--- a/tests/test_compile_custom_op.py
+++ b/tests/unit/test_compile_custom_op.py
@@ -21,8 +21,8 @@ from safetensors import safe_open
 REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.insert(0, REPO_ROOT)
 
-from cutedsl.runner import CuTeDSLMoERunner
-from cutedsl.custom_ops import register_runner, nvfp4_moe_gemm
+from dsv4.layers.moe import Nvfp4MoE
+from dsv4.ops.custom_ops import register_runner, nvfp4_moe_gemm
 
 NVFP4_MODEL_DIR = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4"
 DEVICE = "cuda"
@@ -62,7 +62,10 @@ def load_layer_tensors(model_dir, layer_idx):
 
 
 def prepare_nvfp4_weights_direct(nvfp4_tensors, layer_idx, expert_indices, intermediate_size):
-    from cutedsl.bridge import quantize_activation_nvfp4, quantize_weight_to_nvfp4
+    from dsv4.ops.quantize import (
+        quantize_activation_nvfp4,
+        quantize_weight_to_nvfp4,
+    )
     l1_fp4, l1_sf, l1_gs = [], [], []
     l2_fp4, l2_sf, l2_gs = [], [], []
     
@@ -117,7 +120,7 @@ def main():
     weights = prepare_nvfp4_weights_direct(nvfp4_tensors, 0, expert_indices, intermediate_size)
 
     # Create runner
-    runner = CuTeDSLMoERunner(
+    runner = Nvfp4MoE(
         num_experts=len(expert_indices),
         hidden_size=hidden_size,
         intermediate_size=intermediate_size,
diff --git a/tests/test_custom_op.py b/tests/unit/test_custom_op.py
similarity index 94%
rename from tests/test_custom_op.py
rename to tests/unit/test_custom_op.py
index 0e0f1f7e..fac8b60b 100644
--- a/tests/test_custom_op.py
+++ b/tests/unit/test_custom_op.py
@@ -19,7 +19,7 @@ sys.path.insert(0, REPO_ROOT)
 
 def test_custom_op_registered():
     """Verify nvfp4::linear_gemm and nvfp4::moe_gemm are registered."""
-    from cutedsl.custom_ops import nvfp4_linear_gemm, nvfp4_moe_gemm
+    from dsv4.ops.custom_ops import nvfp4_linear_gemm, nvfp4_moe_gemm
     
     # Check they exist as custom ops
     assert hasattr(nvfp4_linear_gemm, '_name')
@@ -29,7 +29,7 @@ def test_custom_op_registered():
 
 def test_runner_registry():
     """Test the runner registry."""
-    from cutedsl.custom_ops import register_runner, get_runner
+    from dsv4.ops.custom_ops import register_runner, get_runner
     
     class FakeRunner:
         def _run_impl(self, x):
@@ -46,7 +46,7 @@ def test_runner_registry():
 
 def test_fake_tensor_shape_inference():
     """Test that FakeTensor impl returns correct shapes."""
-    from cutedsl.custom_ops import nvfp4_linear_gemm, nvfp4_moe_gemm
+    from dsv4.ops.custom_ops import nvfp4_linear_gemm, nvfp4_moe_gemm
     
     # linear_gemm fake impl
     x_fake = torch.empty(4, 7168, dtype=torch.bfloat16, device='meta')
@@ -73,7 +73,7 @@ def test_torch_compile_skips_custom_op():
     If torch.compile correctly treats it as opaque, it won't call it during
     compilation — only the fake impl runs.
     """
-    from cutedsl.custom_ops import register_runner, nvfp4_linear_gemm
+    from dsv4.ops.custom_ops import register_runner, nvfp4_linear_gemm
     
     class ExplodingRunner:
         """Runner that explodes if _run_impl is ever called."""
diff --git a/tests/test_cutedsl.py b/tests/unit/test_cutedsl.py
similarity index 99%
rename from tests/test_cutedsl.py
rename to tests/unit/test_cutedsl.py
index c110e2d8..64101b45 100644
--- a/tests/test_cutedsl.py
+++ b/tests/unit/test_cutedsl.py
@@ -24,7 +24,7 @@ import cutlass.torch as cutlass_torch
 import cutlass.utils as utils
 import cutlass.utils.blockscaled_layout as blockscaled_utils
 
-from cutedsl.kernel.moe.torch_scaled_grouped_mm import (
+from dsv4.kernels.gemm.grouped import (
     ScaledGroupedGemmKernel,
     pad_and_swizzle_single,
     assemble_raw_scales_2d3d_2d_side,
diff --git a/tests/test_fmha_v3.py b/tests/unit/test_fmha_v3.py
similarity index 100%
rename from tests/test_fmha_v3.py
rename to tests/unit/test_fmha_v3.py
diff --git a/tests/test_fmha_v3_softmax.py b/tests/unit/test_fmha_v3_softmax.py
similarity index 100%
rename from tests/test_fmha_v3_softmax.py
rename to tests/unit/test_fmha_v3_softmax.py
diff --git a/tests/test_fp4_roundtrip.py b/tests/unit/test_fp4_roundtrip.py
similarity index 98%
rename from tests/test_fp4_roundtrip.py
rename to tests/unit/test_fp4_roundtrip.py
index c7b00870..4a787983 100644
--- a/tests/test_fp4_roundtrip.py
+++ b/tests/unit/test_fp4_roundtrip.py
@@ -86,7 +86,9 @@ def test_roundtrip():
     
     # Step 2: Re-quantize BF16 → FP4 using our convention
     sys.path.insert(0, '/root/dsv4-nvfp4-workspace/kernel')
-    from cutedsl.bridge import quantize_weight_to_nvfp4
+    from dsv4.ops.quantize import (
+        quantize_weight_to_nvfp4,
+    )
     
     # quantize_weight_to_nvfp4 expects (K, N) where K is the packed dim
     # Our gate is (3072, 7168) in BF16, so K=3072, N=7168
diff --git a/tests/test_fused_step1.py b/tests/unit/test_fused_step1.py
similarity index 96%
rename from tests/test_fused_step1.py
rename to tests/unit/test_fused_step1.py
index 5c7d513d..4b83fb26 100644
--- a/tests/test_fused_step1.py
+++ b/tests/unit/test_fused_step1.py
@@ -7,12 +7,16 @@ import torch
 import sys
 sys.path.insert(0, '/root/dsv4-nvfp4-workspace/kernel')
 
-from cutedsl.bridge import (
+from dsv4.ops.quantize import (
     quantize_weight_to_nvfp4,
     quantize_activation_nvfp4,
+)
+from dsv4.ops.layouts import (
     make_b_k_major,
     assemble_scales_2d_side,
     assemble_scales_3d_side,
+)
+from dsv4.ops.gemm_runner import (
     run_nvfp4_grouped_gemm,
     run_fused_swiglu_grouped_gemm,
     warmup_compilation,
diff --git a/tests/test_interleave.py b/tests/unit/test_interleave.py
similarity index 98%
rename from tests/test_interleave.py
rename to tests/unit/test_interleave.py
index b1985b39..a217f75f 100644
--- a/tests/test_interleave.py
+++ b/tests/unit/test_interleave.py
@@ -9,15 +9,19 @@ import torch
 import sys
 sys.path.insert(0, '/root/dsv4-nvfp4-workspace/kernel')
 
-from cutedsl.bridge import (
+from dsv4.ops.quantize import (
     quantize_to_nvfp4,
     quantize_activation_nvfp4,
     quantize_weight_to_nvfp4,
+)
+from dsv4.ops.layouts import (
     interleave_l1_weights,
     deinterleave_l1_weights,
     make_b_k_major,
     assemble_scales_2d_side,
     assemble_scales_3d_side,
+)
+from dsv4.ops.gemm_runner import (
     run_nvfp4_grouped_gemm,
 )
 
diff --git a/tests/test_interleave_gemm.py b/tests/unit/test_interleave_gemm.py
similarity index 98%
rename from tests/test_interleave_gemm.py
rename to tests/unit/test_interleave_gemm.py
index 4e5ae193..d131f73a 100644
--- a/tests/test_interleave_gemm.py
+++ b/tests/unit/test_interleave_gemm.py
@@ -8,13 +8,17 @@ import torch
 import sys
 sys.path.insert(0, '/root/dsv4-nvfp4-workspace/kernel')
 
-from cutedsl.bridge import (
+from dsv4.ops.quantize import (
     quantize_weight_to_nvfp4,
     quantize_activation_nvfp4,
+)
+from dsv4.ops.layouts import (
     interleave_l1_weights,
     make_b_k_major,
     assemble_scales_2d_side,
     assemble_scales_3d_side,
+)
+from dsv4.ops.gemm_runner import (
     run_nvfp4_grouped_gemm,
     warmup_compilation,
 )
diff --git a/tests/test_pv64_with_softmax.py b/tests/unit/test_pv64_with_softmax.py
similarity index 100%
rename from tests/test_pv64_with_softmax.py
rename to tests/unit/test_pv64_with_softmax.py
diff --git a/tests/test_qk_softmax.py b/tests/unit/test_qk_softmax.py
similarity index 100%
rename from tests/test_qk_softmax.py
rename to tests/unit/test_qk_softmax.py
diff --git a/tests/test_qkonly.py b/tests/unit/test_qkonly.py
similarity index 100%
rename from tests/test_qkonly.py
rename to tests/unit/test_qkonly.py
diff --git a/reference/blockscaled_layout.py b/vendored/blockscaled_layout.py
similarity index 100%
rename from reference/blockscaled_layout.py
rename to vendored/blockscaled_layout.py
diff --git a/reference/dense_blockscaled_gemm_persistent.py b/vendored/dense_blockscaled_gemm_persistent.py
similarity index 100%
rename from reference/dense_blockscaled_gemm_persistent.py
rename to vendored/dense_blockscaled_gemm_persistent.py
diff --git a/reference/grouped_blockscaled_gemm.py b/vendored/grouped_blockscaled_gemm.py
similarity index 100%
rename from reference/grouped_blockscaled_gemm.py
rename to vendored/grouped_blockscaled_gemm.py
diff --git a/reference/moe_moe_persistent_scheduler.py b/vendored/moe_moe_persistent_scheduler.py
similarity index 100%
rename from reference/moe_moe_persistent_scheduler.py
rename to vendored/moe_moe_persistent_scheduler.py
diff --git a/reference/moe_moe_sched_extension.py b/vendored/moe_moe_sched_extension.py
similarity index 100%
rename from reference/moe_moe_sched_extension.py
rename to vendored/moe_moe_sched_extension.py
diff --git a/reference/moe_moe_utils.py b/vendored/moe_moe_utils.py
similarity index 100%
rename from reference/moe_moe_utils.py
rename to vendored/moe_moe_utils.py
diff --git a/reference/moe_torch_grouped_mm.py b/vendored/moe_torch_grouped_mm.py
similarity index 100%
rename from reference/moe_torch_grouped_mm.py
rename to vendored/moe_torch_grouped_mm.py
diff --git a/reference/moe_torch_scaled_grouped_mm.py b/vendored/moe_torch_scaled_grouped_mm.py
similarity index 100%
rename from reference/moe_torch_scaled_grouped_mm.py
rename to vendored/moe_torch_scaled_grouped_mm.py