diff --git a/cutedsl_loader/cutedsl b/cutedsl_loader/cutedsl deleted file mode 120000 index 0a2ab70e..00000000 --- a/cutedsl_loader/cutedsl +++ /dev/null @@ -1 +0,0 @@ -/root/dsv4-nvfp4-workspace/kernel/cutedsl \ No newline at end of file diff --git a/cutedsl/__init__.py b/dsv4/__init__.py similarity index 100% rename from cutedsl/__init__.py rename to dsv4/__init__.py diff --git a/cutedsl/kernel/__init__.py b/dsv4/cache/__init__.py similarity index 100% rename from cutedsl/kernel/__init__.py rename to dsv4/cache/__init__.py diff --git a/dsv4/cache/block_table.py b/dsv4/cache/block_table.py new file mode 100644 index 00000000..b48028f8 --- /dev/null +++ b/dsv4/cache/block_table.py @@ -0,0 +1,2 @@ +"""Block table for paged KV cache.""" +# TODO: Phase 3 diff --git a/dsv4/cache/paged_cache.py b/dsv4/cache/paged_cache.py new file mode 100644 index 00000000..fce4b419 --- /dev/null +++ b/dsv4/cache/paged_cache.py @@ -0,0 +1,2 @@ +"""Paged KV cache.""" +# TODO: Phase 3 diff --git a/dsv4/cache/state_cache.py b/dsv4/cache/state_cache.py new file mode 100644 index 00000000..e19485ba --- /dev/null +++ b/dsv4/cache/state_cache.py @@ -0,0 +1,2 @@ +"""State cache for KV.""" +# TODO: Phase 3 diff --git a/cutedsl/kernel/blockscaled_gemm/__init__.py b/dsv4/kernels/__init__.py similarity index 100% rename from cutedsl/kernel/blockscaled_gemm/__init__.py rename to dsv4/kernels/__init__.py diff --git a/cutedsl/kernel/moe/__init__.py b/dsv4/kernels/attention/__init__.py similarity index 100% rename from cutedsl/kernel/moe/__init__.py rename to dsv4/kernels/attention/__init__.py diff --git a/dsv4/kernels/attention/fmha.py b/dsv4/kernels/attention/fmha.py new file mode 100644 index 00000000..263ef354 --- /dev/null +++ b/dsv4/kernels/attention/fmha.py @@ -0,0 +1,2 @@ +"""FMHA kernel: QK -> online softmax -> PV (CuTeDSL, Stage B+). 
Extracted from test_fmha_v3.py.""" +# TODO: Extract FmhaV3 kernel class here diff --git a/dsv4/kernels/compressor/__init__.py b/dsv4/kernels/compressor/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/cutedsl/csa_hca_compressor.py b/dsv4/kernels/compressor/csa_hca.py similarity index 100% rename from cutedsl/csa_hca_compressor.py rename to dsv4/kernels/compressor/csa_hca.py diff --git a/dsv4/kernels/cuda/__init__.py b/dsv4/kernels/cuda/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/cutedsl/kernels/deinterleave_quantize.cu b/dsv4/kernels/cuda/deinterleave_quantize.cu similarity index 100% rename from cutedsl/kernels/deinterleave_quantize.cu rename to dsv4/kernels/cuda/deinterleave_quantize.cu diff --git a/cutedsl/kernels/sparse_topk_metadata.cu b/dsv4/kernels/cuda/sparse_topk_metadata.cu similarity index 100% rename from cutedsl/kernels/sparse_topk_metadata.cu rename to dsv4/kernels/cuda/sparse_topk_metadata.cu diff --git a/cutedsl/fp8_bf16.py b/dsv4/kernels/decode/_NOTES_fp8_bf16.md similarity index 99% rename from cutedsl/fp8_bf16.py rename to dsv4/kernels/decode/_NOTES_fp8_bf16.md index 828ab66a..ac373f82 100644 --- a/cutedsl/fp8_bf16.py +++ b/dsv4/kernels/decode/_NOTES_fp8_bf16.md @@ -1,4 +1,4 @@ -""" + FP8 E4M3 -> BF16 conversion for CuTeDSL on Blackwell (SM100+). STATUS: NOT USABLE INSIDE CUTE KERNELS. @@ -23,4 +23,4 @@ or when we can properly construct vector<4xf8E4M3FN> inside kernel code, we can fuse the dequant into the attention kernel. The PTX instruction exists (cvt.rn.bf16x2.e4m3x2), but CuTeDSL's AST preprocessor currently prevents us from injecting the necessary MLIR ops. 
-""" + diff --git a/dsv4/kernels/decode/__init__.py b/dsv4/kernels/decode/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/dsv4/kernels/gemm/__init__.py b/dsv4/kernels/gemm/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/cutedsl/kernel/blockscaled_gemm/dense_blockscaled_gemm_persistent.py b/dsv4/kernels/gemm/dense.py similarity index 100% rename from cutedsl/kernel/blockscaled_gemm/dense_blockscaled_gemm_persistent.py rename to dsv4/kernels/gemm/dense.py diff --git a/cutedsl/kernel/moe/fused_swiglu_grouped_mm.py b/dsv4/kernels/gemm/fused_swiglu.py similarity index 99% rename from cutedsl/kernel/moe/fused_swiglu_grouped_mm.py rename to dsv4/kernels/gemm/fused_swiglu.py index 0866f694..bb0e467c 100644 --- a/cutedsl/kernel/moe/fused_swiglu_grouped_mm.py +++ b/dsv4/kernels/gemm/fused_swiglu.py @@ -60,15 +60,15 @@ if __name__ == "__main__": current_dir = os.path.dirname(os.path.abspath(__file__)) sys.path.insert(0, os.path.join(current_dir, "../../..")) -from cutedsl.kernel.moe.moe_utils import ( +from dsv4.kernels.gemm.utils import ( MoEScaledGroupedGemmTensormapConstructor, ) -from cutedsl.kernel.moe.moe_persistent_scheduler import ( +from dsv4.kernels.gemm.scheduler import ( MoEStaticSchedulerParams, MoEStaticPersistentTileScheduler, MoEWorkTileInfo, ) -from cutedsl.kernel.moe.moe_sched_extension import ScaledGroupedMmSchedExtension +from dsv4.kernels.gemm.sched_extension import ScaledGroupedMmSchedExtension import cutlass.utils.blackwell_helpers as sm100_utils import cutlass.utils.blockscaled_layout as blockscaled_utils from cutlass.utils.gemm.sm100 import ( @@ -3665,7 +3665,7 @@ class ScaledGroupedGemmTester: if _examples_root not in sys.path: sys.path.insert(0, _examples_root) - from cutedsl.kernel.blockscaled_gemm.dense_blockscaled_gemm_persistent import ( + from dsv4.kernels.gemm.dense import ( Sm100BlockScaledPersistentDenseGemmKernel, ) from cutlass.cute.nvgpu import OperandMajorMode diff --git 
a/cutedsl/kernel/moe/torch_scaled_grouped_mm.py b/dsv4/kernels/gemm/grouped.py similarity index 99% rename from cutedsl/kernel/moe/torch_scaled_grouped_mm.py rename to dsv4/kernels/gemm/grouped.py index 570a2a1a..c1f7c1d8 100644 --- a/cutedsl/kernel/moe/torch_scaled_grouped_mm.py +++ b/dsv4/kernels/gemm/grouped.py @@ -60,15 +60,15 @@ if __name__ == "__main__": current_dir = os.path.dirname(os.path.abspath(__file__)) sys.path.insert(0, os.path.join(current_dir, "../../..")) -from cutedsl.kernel.moe.moe_utils import ( +from dsv4.kernels.gemm.utils import ( MoEScaledGroupedGemmTensormapConstructor, ) -from cutedsl.kernel.moe.moe_persistent_scheduler import ( +from dsv4.kernels.gemm.scheduler import ( MoEStaticSchedulerParams, MoEStaticPersistentTileScheduler, MoEWorkTileInfo, ) -from cutedsl.kernel.moe.moe_sched_extension import ScaledGroupedMmSchedExtension +from dsv4.kernels.gemm.sched_extension import ScaledGroupedMmSchedExtension import cutlass.utils.blackwell_helpers as sm100_utils import cutlass.utils.blockscaled_layout as blockscaled_utils from cutlass.utils.gemm.sm100 import ( @@ -3608,7 +3608,7 @@ class ScaledGroupedGemmTester: if _examples_root not in sys.path: sys.path.insert(0, _examples_root) - from cutedsl.kernel.blockscaled_gemm.dense_blockscaled_gemm_persistent import ( + from dsv4.kernels.gemm.dense import ( Sm100BlockScaledPersistentDenseGemmKernel, ) from cutlass.cute.nvgpu import OperandMajorMode diff --git a/cutedsl/kernel/moe/moe_sched_extension.py b/dsv4/kernels/gemm/sched_extension.py similarity index 99% rename from cutedsl/kernel/moe/moe_sched_extension.py rename to dsv4/kernels/gemm/sched_extension.py index ce05431b..82ff2ceb 100644 --- a/cutedsl/kernel/moe/moe_sched_extension.py +++ b/dsv4/kernels/gemm/sched_extension.py @@ -73,14 +73,14 @@ from cutlass.cutlass_dsl import Int32 from dataclasses import dataclass from cutlass.utils.blockscaled_layout import tile_atom_to_shape_SF -from cutedsl.kernel.moe.moe_utils import ( +from 
dsv4.kernels.gemm.utils import ( OnlineTensormapDescCreator, tensormap_ptr_for_copy, compute_expert_token_range, rewrite_tensor_shape, prefetch_tma_descriptor, ) -from cutedsl.kernel.moe.moe_persistent_scheduler import MoEWorkTileInfo +from dsv4.kernels.gemm.scheduler import MoEWorkTileInfo @dataclass(frozen=True) diff --git a/cutedsl/kernel/moe/moe_persistent_scheduler.py b/dsv4/kernels/gemm/scheduler.py similarity index 100% rename from cutedsl/kernel/moe/moe_persistent_scheduler.py rename to dsv4/kernels/gemm/scheduler.py diff --git a/cutedsl/kernel/moe/moe_utils.py b/dsv4/kernels/gemm/utils.py similarity index 100% rename from cutedsl/kernel/moe/moe_utils.py rename to dsv4/kernels/gemm/utils.py diff --git a/dsv4/layers/__init__.py b/dsv4/layers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/dsv4/layers/attention.py b/dsv4/layers/attention.py new file mode 100644 index 00000000..5ed0a464 --- /dev/null +++ b/dsv4/layers/attention.py @@ -0,0 +1,2 @@ +"""DSV4 attention sub-block.""" +# TODO: Phase 3+4 diff --git a/dsv4/layers/embedding.py b/dsv4/layers/embedding.py new file mode 100644 index 00000000..4af9f9b6 --- /dev/null +++ b/dsv4/layers/embedding.py @@ -0,0 +1,2 @@ +"""Token embedding + mHC init wrapper.""" +# TODO: Implement diff --git a/dsv4/layers/ffn.py b/dsv4/layers/ffn.py new file mode 100644 index 00000000..8c4e53e2 --- /dev/null +++ b/dsv4/layers/ffn.py @@ -0,0 +1,2 @@ +"""FFN: router + MoE + shared expert.""" +# TODO: Phase 2 diff --git a/cutedsl/wo_a_grouped_linear.py b/dsv4/layers/grouped_linear.py similarity index 96% rename from cutedsl/wo_a_grouped_linear.py rename to dsv4/layers/grouped_linear.py index c90a102a..1142f5b2 100644 --- a/cutedsl/wo_a_grouped_linear.py +++ b/dsv4/layers/grouped_linear.py @@ -14,22 +14,26 @@ CUDA-graph-compatible: all buffers pre-allocated, no CPU-GPU syncs. 
import torch -from cutedsl.bridge import ( +from dsv4.ops.quantize import ( quantize_activation_nvfp4, quantize_weight_to_nvfp4, +) +from dsv4.ops.layouts import ( make_b_k_major, assemble_scales_2d_side, assemble_scales_3d_side, +) +from dsv4.ops.gemm_runner import ( run_nvfp4_grouped_gemm, ) -from cutedsl.kernel.moe.torch_scaled_grouped_mm import ( +from dsv4.ops.layouts import ( ceil_div as cutedsl_ceil_div, pad_and_swizzle_single, ) -from cutedsl.custom_ops import register_runner, nvfp4_linear_gemm +from dsv4.ops.custom_ops import register_runner, nvfp4_linear_gemm -class CuTeDSLNvfp4WoA: +class Nvfp4GroupedLinear: """Grouped NVFP4 linear for wo_a (o-projection first half). Handles the "bhr,hdr->bhd" einsum pattern: @@ -181,7 +185,9 @@ class CuTeDSLNvfp4WoA: # Reshape to grouped format, then flatten to 2D for quantization o_grouped = o_sample.reshape(-1, self.n_local_groups, self.group_in_features) # We need a single gs for all groups — use the overall amax - from cutedsl.bridge import quantize_to_nvfp4 + from dsv4.ops.quantize import ( + quantize_to_nvfp4, + ) o_flat = o_sample.reshape(-1, o_sample.shape[-1]) # (tokens, n_local_heads * head_dim) — not right # Actually, for grouped GEMM, each group's activation is (tokens, group_in_features) # The global scale should be computed per-group, but for simplicity use one scale @@ -256,7 +262,9 @@ class CuTeDSLNvfp4WoA: # Assemble A-side scales for all groups # The grouped GEMM expects scales for all groups assembled together # For 2Dx3D scenario, scale_a is assembled from per-group scale tensors - from cutedsl.bridge import assemble_scales_2d_side + from dsv4.ops.layouts import ( + assemble_scales_2d_side, + ) scale_a = assemble_scales_2d_side(all_x_sf) # Expert offsets: cumulative [padded_T, 2*padded_T, ..., n_groups*padded_T] diff --git a/cutedsl/nvfp4_linear.py b/dsv4/layers/linear.py similarity index 95% rename from cutedsl/nvfp4_linear.py rename to dsv4/layers/linear.py index 3795c083..9d7e9d0a 100644 --- 
a/cutedsl/nvfp4_linear.py +++ b/dsv4/layers/linear.py @@ -8,21 +8,25 @@ CUDA-graph-compatible: all buffers pre-allocated, no CPU-GPU syncs. import torch -from cutedsl.bridge import ( +from dsv4.ops.quantize import ( quantize_activation_nvfp4, quantize_to_nvfp4, +) +from dsv4.ops.layouts import ( make_b_k_major, assemble_scales_3d_side, +) +from dsv4.ops.gemm_runner import ( run_nvfp4_grouped_gemm, ) -from cutedsl.kernel.moe.torch_scaled_grouped_mm import ( +from dsv4.kernels.gemm.grouped import ( ceil_div as cutedsl_ceil_div, pad_and_swizzle_single, ) -from cutedsl.custom_ops import register_runner, nvfp4_linear_gemm +from dsv4.ops.custom_ops import register_runner, nvfp4_linear_gemm -class CuTeDSLNvfp4Linear: +class Nvfp4Linear: """Single NVFP4 GEMM using CuTeDSL (num_groups=1). Handles any (K, N) weight matrix in NVFP4 format. @@ -76,7 +80,6 @@ class CuTeDSLNvfp4Linear: # Eagerly JIT-compile the GEMM kernel for this (K, N) shape. # Uses num_groups=1 since this is a single linear layer. - # from cutedsl.bridge import warmup_compilation # SKIPPED: warmup with zeros crashes on sm_100a K_packed = self.in_features // 2 N_packed = self.out_features // 2 # warmup_compilation(1, K_packed, N_packed, self.device) # Lazy compile on first real forward diff --git a/cutedsl/mhc_inference_layer.py b/dsv4/layers/mhc.py similarity index 100% rename from cutedsl/mhc_inference_layer.py rename to dsv4/layers/mhc.py diff --git a/cutedsl/runner.py b/dsv4/layers/moe.py similarity index 95% rename from cutedsl/runner.py rename to dsv4/layers/moe.py index bbd0bd2b..7fc38fa1 100644 --- a/cutedsl/runner.py +++ b/dsv4/layers/moe.py @@ -15,26 +15,30 @@ processes max_slots = budget * top_k rows; padding rows are zeros. 
""" import torch -from cutedsl.bridge import ( +from dsv4.ops.quantize import ( quantize_activation_nvfp4, quantize_weight_to_nvfp4, quantize_to_nvfp4, +) +from dsv4.ops.layouts import ( make_b_k_major, assemble_scales_3d_side, interleave_l1_weights, deinterleave_l1_weights, +) +from dsv4.ops.gemm_runner import ( run_nvfp4_grouped_gemm, run_fused_swiglu_grouped_gemm, warmup_fused_swiglu_compilation, ) -from cutedsl.kernel.moe.torch_scaled_grouped_mm import ( +from dsv4.ops.layouts import ( ceil_div as cutedsl_ceil_div, pad_and_swizzle_single, ) -from cutedsl.custom_ops import register_runner, nvfp4_moe_gemm +from dsv4.ops.custom_ops import register_runner, nvfp4_moe_gemm -class CuTeDSLMoERunner: +class Nvfp4MoE: """Manages NVFP4 MoE execution via the CuTeDSL kernel. CUDA-graph-compatible: all buffers pre-allocated, no CPU-GPU syncs, @@ -127,15 +131,15 @@ class CuTeDSLMoERunner: # Initialize shared buffers dict (if not already) device_key = str(self.device) - if not hasattr(CuTeDSLMoERunner, '_shared_padded_bufs'): - CuTeDSLMoERunner._shared_padded_bufs = {} - if device_key not in CuTeDSLMoERunner._shared_padded_bufs: - CuTeDSLMoERunner._shared_padded_bufs[device_key] = {} + if not hasattr(Nvfp4MoE, '_shared_padded_bufs'): + Nvfp4MoE._shared_padded_bufs = {} + if device_key not in Nvfp4MoE._shared_padded_bufs: + Nvfp4MoE._shared_padded_bufs[device_key] = {} # Padded x_sf buffers: SHARED across all runners (not per-layer) max_sf_rows = self.num_experts * self._max_chunks_per_expert * 128 - if 'xsf_l1' not in CuTeDSLMoERunner._shared_padded_bufs[device_key]: - CuTeDSLMoERunner._shared_padded_bufs[device_key].update({ + if 'xsf_l1' not in Nvfp4MoE._shared_padded_bufs[device_key]: + Nvfp4MoE._shared_padded_bufs[device_key].update({ 'xsf_l1': torch.zeros( max_sf_rows, padded_cols_l1, dtype=torch.float16, device=self.device ).to(torch.float8_e4m3fn), @@ -146,9 +150,9 @@ class CuTeDSLMoERunner: self.max_num_tokens, self.hidden_size, dtype=torch.bfloat16, device=self.device 
), }) - self._padded_x_sf_buf_l1 = CuTeDSLMoERunner._shared_padded_bufs[device_key]['xsf_l1'] - self._padded_x_sf_buf_l2 = CuTeDSLMoERunner._shared_padded_bufs[device_key]['xsf_l2'] - self._output_buf = CuTeDSLMoERunner._shared_padded_bufs[device_key]['output'] + self._padded_x_sf_buf_l1 = Nvfp4MoE._shared_padded_bufs[device_key]['xsf_l1'] + self._padded_x_sf_buf_l2 = Nvfp4MoE._shared_padded_bufs[device_key]['xsf_l2'] + self._output_buf = Nvfp4MoE._shared_padded_bufs[device_key]['output'] # Pre-allocated global_scale_a buffers (filled via .fill_(), no torch.full during capture) self._l1_gsa_buf = torch.zeros(self.num_experts, dtype=torch.float32, device=self.device) @@ -162,8 +166,8 @@ class CuTeDSLMoERunner: # Padded hidden/activated: SHARED across all runners (not per-layer) max_rows_per_expert = self._max_chunks_per_expert * 128 padded_max_slots = self.num_experts * max_rows_per_expert - if 'hidden' not in CuTeDSLMoERunner._shared_padded_bufs[device_key]: - CuTeDSLMoERunner._shared_padded_bufs[device_key].update({ + if 'hidden' not in Nvfp4MoE._shared_padded_bufs[device_key]: + Nvfp4MoE._shared_padded_bufs[device_key].update({ 'hidden': torch.zeros( padded_max_slots, self.hidden_size, dtype=torch.bfloat16, device=self.device ), @@ -177,7 +181,7 @@ class CuTeDSLMoERunner: padded_max_slots, self.intermediate_size // 2, dtype=torch.uint8, device=self.device ).view(torch.float4_e2m1fn_x2), }) - self._shared_bufs = CuTeDSLMoERunner._shared_padded_bufs[device_key] + self._shared_bufs = Nvfp4MoE._shared_padded_bufs[device_key] # Padded expert offsets buffer: [0, max_rows, 2*max_rows, ...] (fixed) self._padded_expert_offsets_buf = torch.zeros( @@ -237,7 +241,7 @@ class CuTeDSLMoERunner: # assemble_scales_3d_side expects (K_sf, N) per expert and transposes # to (N, K_sf) internally. But our scales are already (N, K_sf) from # the checkpoint! Skip the transpose by calling the assembly directly. 
- from cutedsl.kernel.moe.torch_scaled_grouped_mm import ( + from dsv4.ops.layouts import ( assemble_raw_scales_2d3d_3d_side, ) self._l1_scale_b = assemble_raw_scales_2d3d_3d_side(l1_sf_list) @@ -285,7 +289,13 @@ class CuTeDSLMoERunner: # This triggers cute.compile once per shape, caching the compiled # kernel + workspace. Subsequent run() calls hit the cache. # MUST happen before model forward pass to avoid OOM from lazy JIT. - from cutedsl.bridge import warmup_compilation, warmup_fused_swiglu_compilation, ceil_div as bridge_ceil_div + from dsv4.ops.layouts import ( + ceil_div as bridge_ceil_div, + ) + from dsv4.ops.gemm_runner import ( + warmup_compilation, + warmup_fused_swiglu_compilation, + ) K_packed = self.hidden_size // 2 N_packed_l1 = (2 * self.intermediate_size) // 2 # gate+up combined N_packed_l2 = self.hidden_size // 2 # down diff --git a/dsv4/layers/norm.py b/dsv4/layers/norm.py new file mode 100644 index 00000000..04c95489 --- /dev/null +++ b/dsv4/layers/norm.py @@ -0,0 +1,2 @@ +"""RMSNorm placeholder.""" +# TODO: Implement RMSNorm diff --git a/dsv4/layers/router.py b/dsv4/layers/router.py new file mode 100644 index 00000000..0897501d --- /dev/null +++ b/dsv4/layers/router.py @@ -0,0 +1,2 @@ +"""Router: sqrt(softplus) + topk + aux-free bias + hash routing.""" +# TODO: Phase 2 diff --git a/cutedsl/shared_expert_pipeline.py b/dsv4/layers/shared_expert.py similarity index 98% rename from cutedsl/shared_expert_pipeline.py rename to dsv4/layers/shared_expert.py index ce9aa4be..5d51cbba 100644 --- a/cutedsl/shared_expert_pipeline.py +++ b/dsv4/layers/shared_expert.py @@ -20,14 +20,18 @@ no dynamic shapes. 
Padding rows are zeros that contribute nothing to GEMM output import torch -from cutedsl.bridge import ( +from dsv4.ops.quantize import ( quantize_activation_nvfp4, quantize_to_nvfp4, +) +from dsv4.ops.layouts import ( make_b_k_major, assemble_scales_3d_side, +) +from dsv4.ops.gemm_runner import ( run_nvfp4_grouped_gemm, ) -from cutedsl.kernel.moe.torch_scaled_grouped_mm import ( +from dsv4.kernels.gemm.grouped import ( ceil_div as cutedsl_ceil_div, pad_and_swizzle_single, ) @@ -40,7 +44,7 @@ class _SharedExpertApply(torch.autograd.Function): return runner._run_impl(hidden_states) -class CuTeDSLSharedExpertRunner: +class Nvfp4SharedExpert: """NVFP4 shared expert runner using CuTeDSL GEMM (num_groups=1). CUDA-graph-compatible: all buffers pre-allocated, no CPU-GPU syncs. diff --git a/dsv4/loader/__init__.py b/dsv4/loader/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/dsv4/loader/hf_checkpoint.py b/dsv4/loader/hf_checkpoint.py new file mode 100644 index 00000000..eb49186f --- /dev/null +++ b/dsv4/loader/hf_checkpoint.py @@ -0,0 +1,2 @@ +"""HuggingFace checkpoint reader.""" +# TODO diff --git a/dsv4/loader/layout_convert.py b/dsv4/loader/layout_convert.py new file mode 100644 index 00000000..a4d10adb --- /dev/null +++ b/dsv4/loader/layout_convert.py @@ -0,0 +1,2 @@ +"""Checkpoint layout conversion.""" +# TODO diff --git a/dsv4/model/__init__.py b/dsv4/model/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/dsv4/model/config.py b/dsv4/model/config.py new file mode 100644 index 00000000..db67e083 --- /dev/null +++ b/dsv4/model/config.py @@ -0,0 +1,2 @@ +"""DSV4Config (Flash + Pro).""" +# TODO: Phase 1 diff --git a/dsv4/model/dsv4.py b/dsv4/model/dsv4.py new file mode 100644 index 00000000..948a3564 --- /dev/null +++ b/dsv4/model/dsv4.py @@ -0,0 +1,2 @@ +"""Full DSV4 model.""" +# TODO: Phase 1 diff --git a/dsv4/model/layer.py b/dsv4/model/layer.py new file mode 100644 index 00000000..be929a3c --- /dev/null +++ 
b/dsv4/model/layer.py @@ -0,0 +1,2 @@ +"""Single transformer layer.""" +# TODO: Phase 1 diff --git a/dsv4/model/mtp.py b/dsv4/model/mtp.py new file mode 100644 index 00000000..cc75722b --- /dev/null +++ b/dsv4/model/mtp.py @@ -0,0 +1,2 @@ +"""Multi-token prediction.""" +# TODO diff --git a/dsv4/model/sampler.py b/dsv4/model/sampler.py new file mode 100644 index 00000000..06ce8038 --- /dev/null +++ b/dsv4/model/sampler.py @@ -0,0 +1,2 @@ +"""Token sampler.""" +# TODO diff --git a/dsv4/ops/__init__.py b/dsv4/ops/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/cutedsl/custom_ops.py b/dsv4/ops/custom_ops.py similarity index 100% rename from cutedsl/custom_ops.py rename to dsv4/ops/custom_ops.py diff --git a/cutedsl/native_sparse_decode.py b/dsv4/ops/decode_sparse.py similarity index 100% rename from cutedsl/native_sparse_decode.py rename to dsv4/ops/decode_sparse.py diff --git a/cutedsl/native_swa_decode.py b/dsv4/ops/decode_swa.py similarity index 100% rename from cutedsl/native_swa_decode.py rename to dsv4/ops/decode_swa.py diff --git a/cutedsl/bridge.py b/dsv4/ops/gemm_runner.py similarity index 55% rename from cutedsl/bridge.py rename to dsv4/ops/gemm_runner.py index b15a7ff3..a29a357c 100644 --- a/cutedsl/bridge.py +++ b/dsv4/ops/gemm_runner.py @@ -1,13 +1,4 @@ -""" -Bridge layer for the CuTeDSL NVFP4 MoE kernel. 
- -Handles tensor layout conversion from our pipeline's format to what -the ScaledGroupedGemmKernel expects: -- BF16 → NVFP4 quantization (float4_e2m1fn_x2) -- Scale factor assembly (padding + swizzle) -- B tensor K-major stride conversion -- Expert offset computation -""" +"""NVFP4 GEMM runner: warmup, compile, and execute grouped/fused GEMMs.""" import math import torch import cutlass @@ -15,18 +6,24 @@ import cutlass.cute as cute import cutlass.torch as cutlass_torch import cutlass.utils as utils -from cutedsl.kernel.moe.torch_scaled_grouped_mm import ( - ScaledGroupedGemmKernel, - pad_and_swizzle_single, - assemble_raw_scales_2d3d_2d_side, - assemble_raw_scales_2d3d_3d_side, - cat_byte_reinterpretable_tensors, - stack_byte_reinterpretable_tensors, +from dsv4.kernels.gemm.grouped import ScaledGroupedGemmKernel +from dsv4.kernels.gemm.fused_swiglu import FusedSwiGLUScaledGroupedGemmKernel +from dsv4.ops.quantize import ( + quantize_activation_nvfp4, + quantize_weight_to_nvfp4, + quantize_to_nvfp4, + deinterleave_quantize_nvfp4_cuda, +) +from dsv4.ops.layouts import ( + interleave_l1_weights, + deinterleave_l1_weights, + assemble_scales_2d_side, + assemble_scales_3d_side, + make_b_k_major, + compute_expert_offsets, + ceil_div, + round_up, ) - -# ── Constants ────────────────────────────────────────────────────────── - -E2M1_MAGNITUDES = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0] # Cache compiled kernels + pre-allocated workspace by cache_key # Each entry: {'compiled': callable, 'workspace': Tensor, 'workspace_size': int} @@ -42,326 +39,6 @@ E2M1_MAGNITUDES = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0] # Caching them would hold stale references to tensors that get freed. _compiled_kernel_cache = {} -# Cached LUT for E2M1 quantization (created once per device, cudagraph-safe) -_NVFP4_STEP_LUT_CACHE = {} -def _get_step_to_idx_lut(device): - """Get or create the E2M1 step-to-index LUT for the given device. 
- - Cached per device to avoid CPU->CUDA copies during cudagraph capture. - Must be pre-populated during warmup (before torch.compile/cudagraph capture) - so the lock is never entered on the compiled path. - """ - # Fast path: already cached — no lock needed (torch.compile-safe) - if device in _NVFP4_STEP_LUT_CACHE: - return _NVFP4_STEP_LUT_CACHE[device] - # Slow path: first call, create the LUT - lut = torch.as_tensor( - [0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 7, 7], - dtype=torch.int8, device=device, - ) - _NVFP4_STEP_LUT_CACHE[device] = lut - return lut -SF_VEC_SIZE = 16 # NVFP4 block size - - -def ceil_div(a, b): - return (a + b - 1) // b - - -def round_up(a, b): - return ceil_div(a, b) * b - - -# ── Quantization ────────────────────────────────────────────────────── - -def quantize_to_nvfp4(x_bf16, block_size=SF_VEC_SIZE): - """Quantize BF16 tensor to NVFP4. - - Args: - x_bf16: (..., D) BF16 tensor - - Returns: - x_fp4: (..., D//2) float4_e2m1fn_x2 — native PyTorch FP4 - x_sf: (..., D//16) float8_e4m3fn — block scales - global_scale: float32 scalar - """ - x_f32 = x_bf16.float() - amax = x_f32.abs().max().clamp(min=1e-8).float() - global_scale = amax / (6.0 * 448.0) - x_norm = x_f32 / global_scale - - last_dim = x_norm.shape[-1] - n_blocks = ceil_div(last_dim, block_size) - - if last_dim % block_size != 0: - pad_size = n_blocks * block_size - last_dim - x_norm = torch.nn.functional.pad(x_norm, (0, pad_size)) - - x_reshaped = x_norm.reshape(*x_norm.shape[:-1], n_blocks, block_size) - block_amax = x_reshaped.abs().amax(dim=-1) - # Detect zero blocks and underflow blocks (amax > 0 but too small for FP8). - # Smallest positive FP8 e4m3fn is 2^-9 ≈ 1.95e-3. If amax/6 < this, - # the block scale underflows to 0, and dividing x by the clamped 1e-8 - # inflates values into nonzero FP4 buckets — producing wrong results. - zero_block = block_amax < (6.0 * 2.0 ** -9) # < ~0.0117 - # Zero out x for zero/underflow blocks before division. 
- # This ensures x_scaled = 0 → FP4 nibbles = 0. - x_reshaped = torch.where(zero_block.unsqueeze(-1), - torch.zeros_like(x_reshaped), x_reshaped) - block_amax = block_amax.clamp(min=1e-8) - block_scale = (block_amax / 6.0).to(torch.float8_e4m3fn) - # Force zero/underflow blocks: FP8 scale = 0 (exact zero). - block_scale = torch.where(zero_block, torch.zeros_like(block_scale), block_scale) - - # Nearest E2M1 - block_sf_expanded = block_scale.float().unsqueeze(-1) - x_scaled = x_reshaped / block_sf_expanded.clamp(min=1e-8) - - signs = torch.sign(x_scaled) - abs_scaled = x_scaled.abs().clamp(max=6.0) - - half_steps = (abs_scaled * 2.0).round().clamp(0, 12).to(torch.int8) - step_to_idx = _get_step_to_idx_lut(x_bf16.device) - indices = step_to_idx[half_steps.long()] - - nibbles = torch.where(signs < 0, indices + 8, indices).to(torch.uint8) - even = nibbles[..., ::2] - odd = nibbles[..., 1::2] - packed = (odd << 4) | even - - packed_shape = list(x_bf16.shape) - packed_shape[-1] = last_dim // 2 - x_fp4 = packed.view(torch.float4_e2m1fn_x2).reshape(packed_shape) - - sf_shape = list(x_bf16.shape[:-1]) + [n_blocks] - block_scale = block_scale.reshape(sf_shape) - - return x_fp4, block_scale, global_scale - - -def quantize_activation_nvfp4(x_bf16, global_scale, block_size=SF_VEC_SIZE): - """Quantize BF16 activation tensor to NVFP4 (cudagraph-safe). - - Unlike quantize_to_nvfp4(), this takes a pre-computed global_scale - instead of computing it via .max() (which forces CPU-GPU sync). - All operations are pure GPU with no CPU-GPU syncs. 
- - Args: - x_bf16: (..., D) BF16 tensor - global_scale: float32 scalar (pre-computed, NOT from .max()) - block_size: NVFP4 block size - - Returns: - x_fp4: (..., D//2) float4_e2m1fn_x2 - x_sf: (..., D//16) float8_e4m3fn - """ - x_f32 = x_bf16.float() - x_norm = x_f32 / global_scale - - last_dim = x_norm.shape[-1] - n_blocks = ceil_div(last_dim, block_size) - - if last_dim % block_size != 0: - pad_size = n_blocks * block_size - last_dim - x_norm = torch.nn.functional.pad(x_norm, (0, pad_size)) - - x_reshaped = x_norm.reshape(*x_norm.shape[:-1], n_blocks, block_size) - block_amax = x_reshaped.abs().amax(dim=-1) - # Detect zero blocks and underflow blocks (same threshold as quantize_to_nvfp4). - zero_block = block_amax < (6.0 * 2.0 ** -9) - x_reshaped = torch.where(zero_block.unsqueeze(-1), - torch.zeros_like(x_reshaped), x_reshaped) - block_amax = block_amax.clamp(min=1e-8) - block_scale = (block_amax / 6.0).to(torch.float8_e4m3fn) - block_scale = torch.where(zero_block, torch.zeros_like(block_scale), block_scale) - - block_sf_expanded = block_scale.float().unsqueeze(-1) - x_scaled = x_reshaped / block_sf_expanded.clamp(min=1e-8) - signs = torch.sign(x_scaled) - abs_scaled = x_scaled.abs().clamp(max=6.0) - - half_steps = (abs_scaled * 2.0).round().clamp(0, 12).to(torch.int8) - step_to_idx = _get_step_to_idx_lut(x_bf16.device) - indices = step_to_idx[half_steps.long()] - - nibbles = torch.where(signs < 0, indices + 8, indices).to(torch.uint8) - even = nibbles[..., ::2] - odd = nibbles[..., 1::2] - packed = (odd << 4) | even - - packed_shape = list(x_bf16.shape) - packed_shape[-1] = last_dim // 2 - x_fp4 = packed.view(torch.float4_e2m1fn_x2).reshape(packed_shape) - - sf_shape = list(x_bf16.shape[:-1]) + [n_blocks] - block_scale = block_scale.reshape(sf_shape) - - return x_fp4, block_scale - - -def quantize_weight_to_nvfp4(w_bf16, block_size=SF_VEC_SIZE): - """Quantize BF16 weight matrix to NVFP4. - - The weight is (K, N) where K is the input dim (packed dimension). 
- Block scales are computed along K (dim 0). - - Args: - w_bf16: (K, N) BF16 weight matrix - - Returns: - w_fp4: (K//2, N) float4_e2m1fn_x2 — K is the packed dim - w_sf: (K//16, N) float8_e4m3fn — block scales along K - global_scale: float32 scalar - """ - K, N = w_bf16.shape - w_f32 = w_bf16.float() - amax = w_f32.abs().max().clamp(min=1e-8).float() - global_scale = amax / (6.0 * 448.0) - w_norm = w_f32 / global_scale - - k_blocks = ceil_div(K, block_size) - if K % block_size != 0: - w_norm = torch.nn.functional.pad(w_norm, (0, 0, 0, k_blocks * block_size - K)) - - w_reshaped = w_norm.reshape(k_blocks, block_size, N) - w_block_amax = w_reshaped.abs().amax(dim=1) - # Detect zero blocks and underflow blocks (same threshold). - zero_block = w_block_amax < (6.0 * 2.0 ** -9) - w_reshaped = torch.where(zero_block.unsqueeze(1), - torch.zeros_like(w_reshaped), w_reshaped) - w_block_amax = w_block_amax.clamp(min=1e-8) - w_sf = (w_block_amax / 6.0).to(torch.float8_e4m3fn) - w_sf = torch.where(zero_block, torch.zeros_like(w_sf), w_sf) - - w_block_sf = w_sf.float().unsqueeze(1) - w_scaled = w_reshaped / w_block_sf.clamp(min=1e-8) - - signs = torch.sign(w_scaled) - abs_scaled = w_scaled.abs().clamp(max=6.0) - - half_steps = (abs_scaled * 2.0).round().clamp(0, 12).to(torch.int8) - step_to_idx = _get_step_to_idx_lut(w_bf16.device) - indices = step_to_idx[half_steps.long()] - nibbles = torch.where(signs < 0, indices + 8, indices).to(torch.uint8) - - even = nibbles[:, ::2, :] - odd = nibbles[:, 1::2, :] - packed = (odd << 4) | even - - w_fp4 = packed.reshape(K // 2, N).view(torch.float4_e2m1fn_x2) - return w_fp4, w_sf, global_scale - - -# ── Scale Factor Assembly ───────────────────────────────────────────── - -def interleave_l1_weights(w_ekn, granularity_bf16=8): - """Interleave gate/up weights at granularity 8 in BF16 (4 in FP4). - - The fused SwiGLU epilogue requires gate/up pairs to be adjacent in the - MMA accumulator. 
With interleaved weights, the MMA tile produces - gate[i*8..i*8+7] and up[i*8..i*8+7] next to each other in registers, - enabling a single-register SwiGLU without SMEM round-trips. - - Before: [gate_0..gate_N/2-1 | up_0..up_N/2-1] - After: [gate_0..gate_7, up_0..up_7, gate_8..gate_15, up_8..up_15, ...] - - The interleave operates along the N dimension, where each column = 1 BF16 - (FP4 packing is along K, not N). So g = granularity_bf16 directly. - - Args: - w_ekn: (E, K_packed, N_packed) FP4 weight tensor in K-major layout - N_packed = 2*intermediate/2 = intermediate (gate+up fused) - granularity_bf16: interleave group size in BF16 elements (default 8) - - Returns: - (E, K_packed, N_packed) FP4 weight tensor with interleaved gate/up - """ - E, K, N = w_ekn.shape - N_half = N // 2 # gate and up each have N/2 FP4 columns - g = granularity_bf16 # N-axis interleave: each N-col = 1 BF16 col (packing is along K) - - gate = w_ekn[:, :, :N_half].reshape(E, K, N_half // g, g) - up = w_ekn[:, :, N_half:].reshape(E, K, N_half // g, g) - return torch.stack([gate, up], dim=3).reshape(E, K, N) - - -def deinterleave_l1_weights(w_ekn, granularity_bf16=8): - """De-interleave gate/up weights (inverse of interleave_l1_weights). - - Used for testing/verification only. - """ - g = granularity_bf16 # N-axis: each N-col = 1 BF16 col - E, K, N = w_ekn.shape - w_reshaped = w_ekn.reshape(E, K, N // (2 * g), 2, g) - gate = w_reshaped[:, :, :, 0, :].reshape(E, K, N // 2) - up = w_reshaped[:, :, :, 1, :].reshape(E, K, N // 2) - return torch.cat([gate, up], dim=2) - - -def assemble_scales_2d_side(raw_scales): - """Assemble activation scale factors for the 2Dx3D scenario. - - Args: - raw_scales: list of (M_e, K_sf) float8_e4m3fn tensors, one per expert - - Returns: - Assembled and swizzled scale tensor - """ - return assemble_raw_scales_2d3d_2d_side(raw_scales) - - -def assemble_scales_3d_side(raw_scales): - """Assemble weight scale factors for the 2Dx3D scenario. 
- - Args: - raw_scales: list of (K_sf, N) float8_e4m3fn tensors, one per expert - NOTE: These will be transposed to (N, K_sf) before swizzling, - since the kernel expects N as the non-K dimension. - - Returns: - Assembled and swizzled scale tensor - """ - # Kernel expects (N, K_sf) — transpose before swizzling - transposed = [sf.T.contiguous() for sf in raw_scales] - return assemble_raw_scales_2d3d_3d_side(transposed) - - -# ── Tensor Layout Conversion ────────────────────────────────────────── - -def make_b_k_major(b_tensor): - """Convert B tensor from N-major to K-major layout. - - The kernel expects B with stride (E*K*N, 1, K) — K is contiguous. - torch.stack produces stride (E*K*N, N, 1) — N is contiguous. - - Args: - b_tensor: (experts, K_packed, N_packed) float4_e2m1fn_x2, N-major - - Returns: - Same shape, K-major strides - """ - return b_tensor.permute(0, 2, 1).contiguous().permute(0, 2, 1) - - -def compute_expert_offsets(tokens_per_expert, num_experts, device="cuda"): - """Compute cumulative token offsets for the grouped GEMM. - - Args: - tokens_per_expert: list of int, one per expert - - Returns: - offs: (num_experts,) int32 — cumulative sum - """ - offs = torch.tensor( - [sum(tokens_per_expert[:e+1]) for e in range(num_experts)], - dtype=torch.int32, device=device, - ) - return offs - - -# ── Kernel Launch ───────────────────────────────────────────────────── - - def warmup_compilation(num_experts, K_packed, N_packed, device, mma_tiler_mn=(128, 128), cluster_shape_mn=(1, 1)): """Eagerly JIT-compile the GEMM kernel for a specific shape. 
@@ -589,10 +266,7 @@ def run_nvfp4_grouped_gemm( # ── Fused SwiGLU GEMM (Stage 1: SiLU in registers, BF16 output) ────── -# Cache for fused kernel (separate from standard GEMM cache) _fused_kernel_cache = {} - - def warmup_fused_swiglu_compilation(num_experts, K_packed, N_packed, device, swiglu_limit=0.0, mma_tiler_mn=(128, 128), @@ -602,7 +276,7 @@ def warmup_fused_swiglu_compilation(num_experts, K_packed, N_packed, device, Must be called during model initialization. See warmup_compilation() for the standard GEMM equivalent. """ - from cutedsl.kernel.moe.fused_swiglu_grouped_mm import FusedSwiGLUScaledGroupedGemmKernel + from dsv4.kernels.gemm.fused_swiglu import FusedSwiGLUScaledGroupedGemmKernel cache_key = ('fused', num_experts, str(device), mma_tiler_mn, cluster_shape_mn, K_packed, N_packed, swiglu_limit) @@ -697,7 +371,7 @@ def run_fused_swiglu_grouped_gemm( Stage 1: SiLU is applied to the full accumulator in registers, then written as BF16 to C. Gate/up pairing is not yet implemented. """ - from cutedsl.kernel.moe.fused_swiglu_grouped_mm import FusedSwiGLUScaledGroupedGemmKernel + from dsv4.kernels.gemm.fused_swiglu import FusedSwiGLUScaledGroupedGemmKernel num_experts = mat_b.shape[0] n_dim = mat_b.shape[2] @@ -789,28 +463,3 @@ def run_fused_swiglu_grouped_gemm( -def deinterleave_quantize_nvfp4_cuda(fused_bf16, intermediate, global_scale, granularity=8): - """De-interleave + quantize fused SwiGLU output using a custom CUDA kernel. - - Single kernel launch, no Python loop. 4x faster than the Python path. 
- - Args: - fused_bf16: (M, 2*intermediate) BF16 — fused L1 output with interleaved gate/up - intermediate: intermediate dimension (e.g., 3072) - global_scale: pre-computed global scale for quantization - granularity: interleave granularity in BF16 columns (default 8) - - Returns: - x_fp4: (M, intermediate//2) float4_e2m1fn_x2 — quantized SwiGLU - x_sf: (M, intermediate//16) float8_e4m3fn — block scales - """ - from torch.utils.cpp_extension import load - import os - kernel_dir = os.path.join(os.path.dirname(__file__), "kernels") - mod = load( - name="deinterleave_quantize_nvfp4", - sources=[os.path.join(kernel_dir, "deinterleave_quantize.cu")], - extra_cuda_cflags=["-gencode=arch=compute_100a,code=sm_100a"], - verbose=False, - ) - return mod.deinterleave_quantize_nvfp4(fused_bf16, intermediate, granularity, global_scale) diff --git a/dsv4/ops/layouts.py b/dsv4/ops/layouts.py new file mode 100644 index 00000000..e092ca0e --- /dev/null +++ b/dsv4/ops/layouts.py @@ -0,0 +1,123 @@ +"""Tensor layout helpers: scale swizzle, gate/up interleave, K-major, offsets.""" +import torch + +from dsv4.kernels.gemm.grouped import ( + pad_and_swizzle_single, + assemble_raw_scales_2d3d_2d_side, + assemble_raw_scales_2d3d_3d_side, +) + +def ceil_div(a, b): + return (a + b - 1) // b + + +def round_up(a, b): + return ceil_div(a, b) * b + +def interleave_l1_weights(w_ekn, granularity_bf16=8): + """Interleave gate/up weights at granularity 8 in BF16 (4 in FP4). + + The fused SwiGLU epilogue requires gate/up pairs to be adjacent in the + MMA accumulator. With interleaved weights, the MMA tile produces + gate[i*8..i*8+7] and up[i*8..i*8+7] next to each other in registers, + enabling a single-register SwiGLU without SMEM round-trips. + + Before: [gate_0..gate_N/2-1 | up_0..up_N/2-1] + After: [gate_0..gate_7, up_0..up_7, gate_8..gate_15, up_8..up_15, ...] + + The interleave operates along the N dimension, where each column = 1 BF16 + (FP4 packing is along K, not N). 
So g = granularity_bf16 directly. + + Args: + w_ekn: (E, K_packed, N_packed) FP4 weight tensor in K-major layout + N_packed = 2*intermediate/2 = intermediate (gate+up fused) + granularity_bf16: interleave group size in BF16 elements (default 8) + + Returns: + (E, K_packed, N_packed) FP4 weight tensor with interleaved gate/up + """ + E, K, N = w_ekn.shape + N_half = N // 2 # gate and up each have N/2 FP4 columns + g = granularity_bf16 # N-axis interleave: each N-col = 1 BF16 col (packing is along K) + + gate = w_ekn[:, :, :N_half].reshape(E, K, N_half // g, g) + up = w_ekn[:, :, N_half:].reshape(E, K, N_half // g, g) + return torch.stack([gate, up], dim=3).reshape(E, K, N) + + +def deinterleave_l1_weights(w_ekn, granularity_bf16=8): + """De-interleave gate/up weights (inverse of interleave_l1_weights). + + Used for testing/verification only. + """ + g = granularity_bf16 # N-axis: each N-col = 1 BF16 col + E, K, N = w_ekn.shape + w_reshaped = w_ekn.reshape(E, K, N // (2 * g), 2, g) + gate = w_reshaped[:, :, :, 0, :].reshape(E, K, N // 2) + up = w_reshaped[:, :, :, 1, :].reshape(E, K, N // 2) + return torch.cat([gate, up], dim=2) + + +def assemble_scales_2d_side(raw_scales): + """Assemble activation scale factors for the 2Dx3D scenario. + + Args: + raw_scales: list of (M_e, K_sf) float8_e4m3fn tensors, one per expert + + Returns: + Assembled and swizzled scale tensor + """ + return assemble_raw_scales_2d3d_2d_side(raw_scales) + + +def assemble_scales_3d_side(raw_scales): + """Assemble weight scale factors for the 2Dx3D scenario. + + Args: + raw_scales: list of (K_sf, N) float8_e4m3fn tensors, one per expert + NOTE: These will be transposed to (N, K_sf) before swizzling, + since the kernel expects N as the non-K dimension. 
+ + Returns: + Assembled and swizzled scale tensor + """ + # Kernel expects (N, K_sf) — transpose before swizzling + transposed = [sf.T.contiguous() for sf in raw_scales] + return assemble_raw_scales_2d3d_3d_side(transposed) + + +# ── Tensor Layout Conversion ────────────────────────────────────────── + +def make_b_k_major(b_tensor): + """Convert B tensor from N-major to K-major layout. + + The kernel expects B with stride (E*K*N, 1, K) — K is contiguous. + torch.stack produces stride (E*K*N, N, 1) — N is contiguous. + + Args: + b_tensor: (experts, K_packed, N_packed) float4_e2m1fn_x2, N-major + + Returns: + Same shape, K-major strides + """ + return b_tensor.permute(0, 2, 1).contiguous().permute(0, 2, 1) + + +def compute_expert_offsets(tokens_per_expert, num_experts, device="cuda"): + """Compute cumulative token offsets for the grouped GEMM. + + Args: + tokens_per_expert: list of int, one per expert + + Returns: + offs: (num_experts,) int32 — cumulative sum + """ + offs = torch.tensor( + [sum(tokens_per_expert[:e+1]) for e in range(num_experts)], + dtype=torch.int32, device=device, + ) + return offs + + +# ── Kernel Launch ───────────────────────────────────────────────────── + diff --git a/dsv4/ops/quantize.py b/dsv4/ops/quantize.py new file mode 100644 index 00000000..ecdf6544 --- /dev/null +++ b/dsv4/ops/quantize.py @@ -0,0 +1,253 @@ +"""NVFP4 quantization: BF16 <-> NVFP4 conversion, scale factor computation.""" +import math +import torch +import cutlass +import cutlass.cute as cute +import cutlass.torch as cutlass_torch +import cutlass.utils as utils + +from dsv4.kernels.gemm.grouped import ( + cat_byte_reinterpretable_tensors, + stack_byte_reinterpretable_tensors, +) + +E2M1_MAGNITUDES = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0] + +# Cache compiled kernels + pre-allocated workspace by cache_key +# Each entry: {'compiled': callable, 'workspace': Tensor, 'workspace_size': int} +# +# Key design decisions (Bug #1 fix): +# - cute.compile does NOT corrupt GPU memory 
(verified 2026-05-20 on B200). +# The original _needs_token_refill hack was a misdiagnosis. The real bug +# was elsewhere (likely OOB write or weight loading). +# - Workspace is pre-allocated per cache entry during warmup_compilation() +# and reused on subsequent calls. No torch.full() in the hot path. +# - CuTe tensor wrappers (from_dlpack + mark_layout_dynamic) are cheap +# metadata wrappers. We re-create them per call from real tensors. +# Caching them would hold stale references to tensors that get freed. + +# Cached LUT for E2M1 quantization (created once per device, cudagraph-safe) +_NVFP4_STEP_LUT_CACHE = {} +def _get_step_to_idx_lut(device): + """Get or create the E2M1 step-to-index LUT for the given device. + + Cached per device to avoid CPU->CUDA copies during cudagraph capture. + Must be pre-populated during warmup (before torch.compile/cudagraph capture) + so the lock is never entered on the compiled path. + """ + # Fast path: already cached — no lock needed (torch.compile-safe) + if device in _NVFP4_STEP_LUT_CACHE: + return _NVFP4_STEP_LUT_CACHE[device] + # Slow path: first call, create the LUT + lut = torch.as_tensor( + [0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 7, 7], + dtype=torch.int8, device=device, + ) + _NVFP4_STEP_LUT_CACHE[device] = lut + return lut +SF_VEC_SIZE = 16 # NVFP4 block size + +def quantize_to_nvfp4(x_bf16, block_size=SF_VEC_SIZE): + """Quantize BF16 tensor to NVFP4. 
+ + Args: + x_bf16: (..., D) BF16 tensor + + Returns: + x_fp4: (..., D//2) float4_e2m1fn_x2 — native PyTorch FP4 + x_sf: (..., D//16) float8_e4m3fn — block scales + global_scale: float32 scalar + """ + x_f32 = x_bf16.float() + amax = x_f32.abs().max().clamp(min=1e-8).float() + global_scale = amax / (6.0 * 448.0) + x_norm = x_f32 / global_scale + + last_dim = x_norm.shape[-1] + n_blocks = ceil_div(last_dim, block_size) + + if last_dim % block_size != 0: + pad_size = n_blocks * block_size - last_dim + x_norm = torch.nn.functional.pad(x_norm, (0, pad_size)) + + x_reshaped = x_norm.reshape(*x_norm.shape[:-1], n_blocks, block_size) + block_amax = x_reshaped.abs().amax(dim=-1) + # Detect zero blocks and underflow blocks (amax > 0 but too small for FP8). + # Smallest positive FP8 e4m3fn is 2^-9 ≈ 1.95e-3. If amax/6 < this, + # the block scale underflows to 0, and dividing x by the clamped 1e-8 + # inflates values into nonzero FP4 buckets — producing wrong results. + zero_block = block_amax < (6.0 * 2.0 ** -9) # < ~0.0117 + # Zero out x for zero/underflow blocks before division. + # This ensures x_scaled = 0 → FP4 nibbles = 0. + x_reshaped = torch.where(zero_block.unsqueeze(-1), + torch.zeros_like(x_reshaped), x_reshaped) + block_amax = block_amax.clamp(min=1e-8) + block_scale = (block_amax / 6.0).to(torch.float8_e4m3fn) + # Force zero/underflow blocks: FP8 scale = 0 (exact zero). 
+ block_scale = torch.where(zero_block, torch.zeros_like(block_scale), block_scale) + + # Nearest E2M1 + block_sf_expanded = block_scale.float().unsqueeze(-1) + x_scaled = x_reshaped / block_sf_expanded.clamp(min=1e-8) + + signs = torch.sign(x_scaled) + abs_scaled = x_scaled.abs().clamp(max=6.0) + + half_steps = (abs_scaled * 2.0).round().clamp(0, 12).to(torch.int8) + step_to_idx = _get_step_to_idx_lut(x_bf16.device) + indices = step_to_idx[half_steps.long()] + + nibbles = torch.where(signs < 0, indices + 8, indices).to(torch.uint8) + even = nibbles[..., ::2] + odd = nibbles[..., 1::2] + packed = (odd << 4) | even + + packed_shape = list(x_bf16.shape) + packed_shape[-1] = last_dim // 2 + x_fp4 = packed.view(torch.float4_e2m1fn_x2).reshape(packed_shape) + + sf_shape = list(x_bf16.shape[:-1]) + [n_blocks] + block_scale = block_scale.reshape(sf_shape) + + return x_fp4, block_scale, global_scale + + +def quantize_activation_nvfp4(x_bf16, global_scale, block_size=SF_VEC_SIZE): + """Quantize BF16 activation tensor to NVFP4 (cudagraph-safe). + + Unlike quantize_to_nvfp4(), this takes a pre-computed global_scale + instead of computing it via .max() (which forces CPU-GPU sync). + All operations are pure GPU with no CPU-GPU syncs. + + Args: + x_bf16: (..., D) BF16 tensor + global_scale: float32 scalar (pre-computed, NOT from .max()) + block_size: NVFP4 block size + + Returns: + x_fp4: (..., D//2) float4_e2m1fn_x2 + x_sf: (..., D//16) float8_e4m3fn + """ + x_f32 = x_bf16.float() + x_norm = x_f32 / global_scale + + last_dim = x_norm.shape[-1] + n_blocks = ceil_div(last_dim, block_size) + + if last_dim % block_size != 0: + pad_size = n_blocks * block_size - last_dim + x_norm = torch.nn.functional.pad(x_norm, (0, pad_size)) + + x_reshaped = x_norm.reshape(*x_norm.shape[:-1], n_blocks, block_size) + block_amax = x_reshaped.abs().amax(dim=-1) + # Detect zero blocks and underflow blocks (same threshold as quantize_to_nvfp4). 
+ zero_block = block_amax < (6.0 * 2.0 ** -9) + x_reshaped = torch.where(zero_block.unsqueeze(-1), + torch.zeros_like(x_reshaped), x_reshaped) + block_amax = block_amax.clamp(min=1e-8) + block_scale = (block_amax / 6.0).to(torch.float8_e4m3fn) + block_scale = torch.where(zero_block, torch.zeros_like(block_scale), block_scale) + + block_sf_expanded = block_scale.float().unsqueeze(-1) + x_scaled = x_reshaped / block_sf_expanded.clamp(min=1e-8) + signs = torch.sign(x_scaled) + abs_scaled = x_scaled.abs().clamp(max=6.0) + + half_steps = (abs_scaled * 2.0).round().clamp(0, 12).to(torch.int8) + step_to_idx = _get_step_to_idx_lut(x_bf16.device) + indices = step_to_idx[half_steps.long()] + + nibbles = torch.where(signs < 0, indices + 8, indices).to(torch.uint8) + even = nibbles[..., ::2] + odd = nibbles[..., 1::2] + packed = (odd << 4) | even + + packed_shape = list(x_bf16.shape) + packed_shape[-1] = last_dim // 2 + x_fp4 = packed.view(torch.float4_e2m1fn_x2).reshape(packed_shape) + + sf_shape = list(x_bf16.shape[:-1]) + [n_blocks] + block_scale = block_scale.reshape(sf_shape) + + return x_fp4, block_scale + + +def quantize_weight_to_nvfp4(w_bf16, block_size=SF_VEC_SIZE): + """Quantize BF16 weight matrix to NVFP4. + + The weight is (K, N) where K is the input dim (packed dimension). + Block scales are computed along K (dim 0). 
+ + Args: + w_bf16: (K, N) BF16 weight matrix + + Returns: + w_fp4: (K//2, N) float4_e2m1fn_x2 — K is the packed dim + w_sf: (K//16, N) float8_e4m3fn — block scales along K + global_scale: float32 scalar + """ + K, N = w_bf16.shape + w_f32 = w_bf16.float() + amax = w_f32.abs().max().clamp(min=1e-8).float() + global_scale = amax / (6.0 * 448.0) + w_norm = w_f32 / global_scale + + k_blocks = ceil_div(K, block_size) + if K % block_size != 0: + w_norm = torch.nn.functional.pad(w_norm, (0, 0, 0, k_blocks * block_size - K)) + + w_reshaped = w_norm.reshape(k_blocks, block_size, N) + w_block_amax = w_reshaped.abs().amax(dim=1) + # Detect zero blocks and underflow blocks (same threshold). + zero_block = w_block_amax < (6.0 * 2.0 ** -9) + w_reshaped = torch.where(zero_block.unsqueeze(1), + torch.zeros_like(w_reshaped), w_reshaped) + w_block_amax = w_block_amax.clamp(min=1e-8) + w_sf = (w_block_amax / 6.0).to(torch.float8_e4m3fn) + w_sf = torch.where(zero_block, torch.zeros_like(w_sf), w_sf) + + w_block_sf = w_sf.float().unsqueeze(1) + w_scaled = w_reshaped / w_block_sf.clamp(min=1e-8) + + signs = torch.sign(w_scaled) + abs_scaled = w_scaled.abs().clamp(max=6.0) + + half_steps = (abs_scaled * 2.0).round().clamp(0, 12).to(torch.int8) + step_to_idx = _get_step_to_idx_lut(w_bf16.device) + indices = step_to_idx[half_steps.long()] + nibbles = torch.where(signs < 0, indices + 8, indices).to(torch.uint8) + + even = nibbles[:, ::2, :] + odd = nibbles[:, 1::2, :] + packed = (odd << 4) | even + + w_fp4 = packed.reshape(K // 2, N).view(torch.float4_e2m1fn_x2) + return w_fp4, w_sf, global_scale + + +# ── Scale Factor Assembly ───────────────────────────────────────────── +def deinterleave_quantize_nvfp4_cuda(fused_bf16, intermediate, global_scale, granularity=8): + """De-interleave + quantize fused SwiGLU output using a custom CUDA kernel. + + Single kernel launch, no Python loop. 4x faster than the Python path. 
+ + Args: + fused_bf16: (M, 2*intermediate) BF16 — fused L1 output with interleaved gate/up + intermediate: intermediate dimension (e.g., 3072) + global_scale: pre-computed global scale for quantization + granularity: interleave granularity in BF16 columns (default 8) + + Returns: + x_fp4: (M, intermediate//2) float4_e2m1fn_x2 — quantized SwiGLU + x_sf: (M, intermediate//16) float8_e4m3fn — block scales + """ + from torch.utils.cpp_extension import load + import os + kernel_dir = os.path.join(os.path.dirname(__file__), "kernels") + mod = load( + name="deinterleave_quantize_nvfp4", + sources=[os.path.join(kernel_dir, "deinterleave_quantize.cu")], + extra_cuda_cflags=["-gencode=arch=compute_100a,code=sm_100a"], + verbose=False, + ) + return mod.deinterleave_quantize_nvfp4(fused_bf16, intermediate, granularity, global_scale) diff --git a/cutedsl/inverse_rope.py b/dsv4/ops/rope.py similarity index 100% rename from cutedsl/inverse_rope.py rename to dsv4/ops/rope.py diff --git a/cutedsl/sparse_topk_metadata.py b/dsv4/ops/topk.py similarity index 100% rename from cutedsl/sparse_topk_metadata.py rename to dsv4/ops/topk.py diff --git a/dsv4/reference/__init__.py b/dsv4/reference/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/cutedsl/blackwell_attention.py b/dsv4/reference/attention.py similarity index 100% rename from cutedsl/blackwell_attention.py rename to dsv4/reference/attention.py diff --git a/cutedsl/csa_hca_compressor_PYTORCH_EXAMPLE.py b/dsv4/reference/compressor.py similarity index 100% rename from cutedsl/csa_hca_compressor_PYTORCH_EXAMPLE.py rename to dsv4/reference/compressor.py diff --git a/cutedsl/csa_attention.py b/dsv4/reference/csa_attention.py similarity index 100% rename from cutedsl/csa_attention.py rename to dsv4/reference/csa_attention.py diff --git a/cutedsl/moe_pipeline.py b/dsv4/reference/moe_pipeline.py similarity index 97% rename from cutedsl/moe_pipeline.py rename to dsv4/reference/moe_pipeline.py index 3fcef365..076300ea 
100644 --- a/cutedsl/moe_pipeline.py +++ b/dsv4/reference/moe_pipeline.py @@ -14,15 +14,19 @@ block scales in float8_e4m3fn, global scales in float32. """ import torch -from cutedsl.bridge import ( +from dsv4.ops.quantize import ( quantize_to_nvfp4, quantize_weight_to_nvfp4, +) +from dsv4.ops.layouts import ( assemble_scales_2d_side, assemble_scales_3d_side, make_b_k_major, compute_expert_offsets, interleave_l1_weights, deinterleave_l1_weights, +) +from dsv4.ops.gemm_runner import ( run_nvfp4_grouped_gemm, run_fused_swiglu_grouped_gemm, warmup_fused_swiglu_compilation, @@ -198,7 +202,7 @@ def run_nvfp4_moe( sf_ekn = sf.unsqueeze(0) # (1, K_sf, N) sf_ekn = interleave_l1_weights(sf_ekn) # interleaved along N l1_sf_il.append(sf_ekn[0].T.contiguous()) # (N, K_sf) for assembly - from cutedsl.kernel.moe.torch_scaled_grouped_mm import assemble_raw_scales_2d3d_3d_side as _assemble_3d + from dsv4.kernels.gemm.grouped import assemble_raw_scales_2d3d_3d_side as _assemble_3d l1_scale_b = _assemble_3d(l1_sf_il) # Global scales: alpha = igs * weight_gs for each expert @@ -347,7 +351,7 @@ def run_nvfp4_moe_fused( sf_ekn = sf.unsqueeze(0) sf_ekn = interleave_l1_weights(sf_ekn) l1_sf_il.append(sf_ekn[0].T.contiguous()) - from cutedsl.kernel.moe.torch_scaled_grouped_mm import assemble_raw_scales_2d3d_3d_side as _assemble_3d + from dsv4.kernels.gemm.grouped import assemble_raw_scales_2d3d_3d_side as _assemble_3d l1_scale_b = _assemble_3d(l1_sf_il) l1_global_scale_a = torch.tensor([x_igs] * num_experts, dtype=torch.float32, device=device) @@ -368,7 +372,10 @@ def run_nvfp4_moe_fused( intermediate_size = l1_fused_out.shape[1] // 2 # Use pre-computed L2 activation gs, or compute from amax (fallback) l2_gs = l2_activation_gs if l2_activation_gs is not None else l1_fused_out.abs().amax().float().item() / 2688.0 - from cutedsl.bridge import deinterleave_quantize_nvfp4_cuda, quantize_activation_nvfp4 + from dsv4.ops.quantize import ( + deinterleave_quantize_nvfp4_cuda, + 
quantize_activation_nvfp4, + ) l2_x_fp4, l2_x_sf = deinterleave_quantize_nvfp4_cuda(l1_fused_out, intermediate_size, l2_gs) # Skip the separate L2 quantize step below — we already have FP4+SF # Set activated to None to signal we already quantized diff --git a/pyproject.toml b/pyproject.toml index 13ae227a..40044755 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ requires = ["setuptools>=68.0", "wheel"] build-backend = "setuptools.build_meta" [project] -name = "nvfp4-megamoe-kernel" +name = "dsv4-inference" version = "0.1.0" description = "NVFP4 Mega MoE kernel for DeepSeek-V4-Pro on Blackwell (TileLang)" requires-python = ">=3.10" @@ -13,3 +13,4 @@ dependencies = [ [tool.setuptools.packages.find] where = ["."] +include = ["dsv4*"] diff --git a/tests/debug_output.py b/tests/archive/debug_output.py similarity index 97% rename from tests/debug_output.py rename to tests/archive/debug_output.py index 17cc0453..f7bef38b 100644 --- a/tests/debug_output.py +++ b/tests/archive/debug_output.py @@ -6,7 +6,7 @@ sys.path.insert(0, '/root/nvfp4-megamoe-kernel/cutedsl') sys.path.insert(0, '/root/nvfp4-megamoe-kernel/vllm') from cutedsl.reference.moe_pipeline import moe_pipeline -from vllm.nvfp4_cutedsl import CuTeDSLMoERunner +from vllm.nvfp4_cutedsl import Nvfp4MoE torch.cuda.set_device(0) @@ -33,7 +33,7 @@ ref_out = moe_pipeline( print(f"Reference output: amax={ref_out.amax().item():.4f} mean={ref_out.mean().item():.4f}") # Run runner with warmup gs -runner = CuTeDSLMoERunner( +runner = Nvfp4MoE( num_experts=3, hidden_size=256, intermediate_size=512, max_num_tokens=4, top_k=2, device='cuda' ) diff --git a/tests/debug_stages.py b/tests/archive/debug_stages.py similarity index 100% rename from tests/debug_stages.py rename to tests/archive/debug_stages.py diff --git a/tests/diag_layouts.py b/tests/archive/diag_layouts.py similarity index 100% rename from tests/diag_layouts.py rename to tests/archive/diag_layouts.py diff --git a/tests/diag_tmem.py 
b/tests/archive/diag_tmem.py similarity index 100% rename from tests/diag_tmem.py rename to tests/archive/diag_tmem.py diff --git a/tests/stage_b_debug5.py b/tests/archive/stage_b_debug5.py similarity index 100% rename from tests/stage_b_debug5.py rename to tests/archive/stage_b_debug5.py diff --git a/tests/test_128_128_fmha_v.py b/tests/archive/test_128_128_fmha_v.py similarity index 100% rename from tests/test_128_128_fmha_v.py rename to tests/archive/test_128_128_fmha_v.py diff --git a/tests/test_128_16_bigP.py b/tests/archive/test_128_16_bigP.py similarity index 100% rename from tests/test_128_16_bigP.py rename to tests/archive/test_128_16_bigP.py diff --git a/tests/test_128_16_debug.py b/tests/archive/test_128_16_debug.py similarity index 100% rename from tests/test_128_16_debug.py rename to tests/archive/test_128_16_debug.py diff --git a/tests/test_128_16_debug2.py b/tests/archive/test_128_16_debug2.py similarity index 100% rename from tests/test_128_16_debug2.py rename to tests/archive/test_128_16_debug2.py diff --git a/tests/test_128_16_debug3.py b/tests/archive/test_128_16_debug3.py similarity index 100% rename from tests/test_128_16_debug3.py rename to tests/archive/test_128_16_debug3.py diff --git a/tests/test_128_16_fp16.py b/tests/archive/test_128_16_fp16.py similarity index 100% rename from tests/test_128_16_fp16.py rename to tests/archive/test_128_16_fp16.py diff --git a/tests/test_128_16_full.py b/tests/archive/test_128_16_full.py similarity index 100% rename from tests/test_128_16_full.py rename to tests/archive/test_128_16_full.py diff --git a/tests/test_128_16_minimal.py b/tests/archive/test_128_16_minimal.py similarity index 100% rename from tests/test_128_16_minimal.py rename to tests/archive/test_128_16_minimal.py diff --git a/tests/test_128_16_nogC.py b/tests/archive/test_128_16_nogC.py similarity index 100% rename from tests/test_128_16_nogC.py rename to tests/archive/test_128_16_nogC.py diff --git a/tests/test_128_16_nopack.py 
b/tests/archive/test_128_16_nopack.py similarity index 100% rename from tests/test_128_16_nopack.py rename to tests/archive/test_128_16_nopack.py diff --git a/tests/test_128_16_nosoftmax.py b/tests/archive/test_128_16_nosoftmax.py similarity index 100% rename from tests/test_128_16_nosoftmax.py rename to tests/archive/test_128_16_nosoftmax.py diff --git a/tests/test_128_16_pAtS.py b/tests/archive/test_128_16_pAtS.py similarity index 100% rename from tests/test_128_16_pAtS.py rename to tests/archive/test_128_16_pAtS.py diff --git a/tests/test_128_16_pvlayout.py b/tests/archive/test_128_16_pvlayout.py similarity index 100% rename from tests/test_128_16_pvlayout.py rename to tests/archive/test_128_16_pvlayout.py diff --git a/tests/test_128_16_pvpack.py b/tests/archive/test_128_16_pvpack.py similarity index 100% rename from tests/test_128_16_pvpack.py rename to tests/archive/test_128_16_pvpack.py diff --git a/tests/test_128_16_pvwrite.py b/tests/archive/test_128_16_pvwrite.py similarity index 100% rename from tests/test_128_16_pvwrite.py rename to tests/archive/test_128_16_pvwrite.py diff --git a/tests/test_128_16_qkread.py b/tests/archive/test_128_16_qkread.py similarity index 100% rename from tests/test_128_16_qkread.py rename to tests/archive/test_128_16_qkread.py diff --git a/tests/test_128_16_smem.py b/tests/archive/test_128_16_smem.py similarity index 100% rename from tests/test_128_16_smem.py rename to tests/archive/test_128_16_smem.py diff --git a/tests/test_128_16_stepA.py b/tests/archive/test_128_16_stepA.py similarity index 100% rename from tests/test_128_16_stepA.py rename to tests/archive/test_128_16_stepA.py diff --git a/tests/test_128_16_stepB.py b/tests/archive/test_128_16_stepB.py similarity index 100% rename from tests/test_128_16_stepB.py rename to tests/archive/test_128_16_stepB.py diff --git a/tests/test_128_16_stepC.py b/tests/archive/test_128_16_stepC.py similarity index 100% rename from tests/test_128_16_stepC.py rename to 
tests/archive/test_128_16_stepC.py diff --git a/tests/test_128_16_stepD.py b/tests/archive/test_128_16_stepD.py similarity index 100% rename from tests/test_128_16_stepD.py rename to tests/archive/test_128_16_stepD.py diff --git a/tests/test_128_16_stepE.py b/tests/archive/test_128_16_stepE.py similarity index 100% rename from tests/test_128_16_stepE.py rename to tests/archive/test_128_16_stepE.py diff --git a/tests/test_128_16_tiler.py b/tests/archive/test_128_16_tiler.py similarity index 100% rename from tests/test_128_16_tiler.py rename to tests/archive/test_128_16_tiler.py diff --git a/tests/test_128_16_v8.py b/tests/archive/test_128_16_v8.py similarity index 100% rename from tests/test_128_16_v8.py rename to tests/archive/test_128_16_v8.py diff --git a/tests/test_128_16_zeropad.py b/tests/archive/test_128_16_zeropad.py similarity index 100% rename from tests/test_128_16_zeropad.py rename to tests/archive/test_128_16_zeropad.py diff --git a/tests/test_128_32_ctafix.py b/tests/archive/test_128_32_ctafix.py similarity index 100% rename from tests/test_128_32_ctafix.py rename to tests/archive/test_128_32_ctafix.py diff --git a/tests/test_128_32_ctafix2.py b/tests/archive/test_128_32_ctafix2.py similarity index 100% rename from tests/test_128_32_ctafix2.py rename to tests/archive/test_128_32_ctafix2.py diff --git a/tests/test_128_32_native.py b/tests/archive/test_128_32_native.py similarity index 100% rename from tests/test_128_32_native.py rename to tests/archive/test_128_32_native.py diff --git a/tests/test_128_32_vdiag.py b/tests/archive/test_128_32_vdiag.py similarity index 100% rename from tests/test_128_32_vdiag.py rename to tests/archive/test_128_32_vdiag.py diff --git a/tests/test_128_32_zeropad.py b/tests/archive/test_128_32_zeropad.py similarity index 100% rename from tests/test_128_32_zeropad.py rename to tests/archive/test_128_32_zeropad.py diff --git a/tests/test_afrag_roundtrip.py b/tests/archive/test_afrag_roundtrip.py similarity index 100% rename 
from tests/test_afrag_roundtrip.py rename to tests/archive/test_afrag_roundtrip.py diff --git a/tests/test_attention.py b/tests/archive/test_attention.py similarity index 98% rename from tests/test_attention.py rename to tests/archive/test_attention.py index 841f0950..5faacf93 100644 --- a/tests/test_attention.py +++ b/tests/archive/test_attention.py @@ -51,14 +51,14 @@ def dequant_nvfp4(packed_uint8, scale_e4m3, global_scale): def test_projection(name, weight, weight_sf, weight_gs, hidden_states, in_features, out_features): """Test a single NVFP4 projection.""" sys.path.insert(0, "/root/nvfp4-megamoe-kernel") - from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear + from dsv4.layers.linear import Nvfp4Linear # Convert weight to CuTeDSL format: (out, in_packed) uint8 → (in_packed, out) float4 fp4 = [weight.view(torch.float4_e2m1fn_x2).permute(1, 0).contiguous()] sf = [weight_sf.permute(1, 0).contiguous()] gs = [weight_gs] - runner = CuTeDSLNvfp4Linear( + runner = Nvfp4Linear( in_features=in_features, out_features=out_features, max_num_tokens=8192, diff --git a/tests/test_attention_path_b200.py b/tests/archive/test_attention_path_b200.py similarity index 98% rename from tests/test_attention_path_b200.py rename to tests/archive/test_attention_path_b200.py index 91528d5d..334a513c 100644 --- a/tests/test_attention_path_b200.py +++ b/tests/archive/test_attention_path_b200.py @@ -55,7 +55,7 @@ def rms(x, w, eps=1e-6): return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype) def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None): - from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear + from dsv4.layers.linear import Nvfp4Linear fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous() s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf s = s.permute(1,0).contiguous() @@ -66,7 +66,7 @@ def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None): s32[:sp] *= g1/gs; s32[sp:] *= g2/gs; s = s32.to(torch.float8_e4m3fn) else: gs = 
gs_t.max().item() if gs_t.numel() > 1 else gs_t.item() - r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) + r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs] r.finalize_weights(); r._ensure_initialized() return r diff --git a/tests/test_b_afrag2.py b/tests/archive/test_b_afrag2.py similarity index 100% rename from tests/test_b_afrag2.py rename to tests/archive/test_b_afrag2.py diff --git a/tests/test_b_layout.py b/tests/archive/test_b_layout.py similarity index 100% rename from tests/test_b_layout.py rename to tests/archive/test_b_layout.py diff --git a/tests/test_bf16_elemwise.py b/tests/archive/test_bf16_elemwise.py similarity index 100% rename from tests/test_bf16_elemwise.py rename to tests/archive/test_bf16_elemwise.py diff --git a/tests/test_bf16_pack.py b/tests/archive/test_bf16_pack.py similarity index 100% rename from tests/test_bf16_pack.py rename to tests/archive/test_bf16_pack.py diff --git a/tests/test_bf16_recast_full.py b/tests/archive/test_bf16_recast_full.py similarity index 100% rename from tests/test_bf16_recast_full.py rename to tests/archive/test_bf16_recast_full.py diff --git a/tests/test_bf16_recast_simple.py b/tests/archive/test_bf16_recast_simple.py similarity index 100% rename from tests/test_bf16_recast_simple.py rename to tests/archive/test_bf16_recast_simple.py diff --git a/tests/test_blackwell_attn_b200.py b/tests/archive/test_blackwell_attn_b200.py similarity index 98% rename from tests/test_blackwell_attn_b200.py rename to tests/archive/test_blackwell_attn_b200.py index 110e5b1e..41394760 100644 --- a/tests/test_blackwell_attn_b200.py +++ b/tests/archive/test_blackwell_attn_b200.py @@ -50,7 +50,7 @@ def rms(x, w, eps=1e-6): return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype) def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None): - from cutedsl.nvfp4_linear import 
CuTeDSLNvfp4Linear + from dsv4.layers.linear import Nvfp4Linear fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous() s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf s = s.permute(1,0).contiguous() @@ -61,7 +61,7 @@ def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None): s32[:sp] *= g1/gs; s32[sp:] *= g2/gs; s = s32.to(torch.float8_e4m3fn) else: gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item() - r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) + r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs] r.finalize_weights(); r._ensure_initialized() return r @@ -82,7 +82,7 @@ def build_cos_sin(max_pos=4096, rope_dim=ROPE): def test_blackwell_attention(layer_id, compress_ratio): """Test the full blackwell attention pipeline for a specific layer.""" - from cutedsl.blackwell_attention import ( + from dsv4.reference.attention import ( apply_gptj_rope, apply_inv_gptj_rope, blackwell_attention_forward, kv_quantize_fp8, kv_dequantize_fp8, diff --git a/tests/test_csa_attention_b200.py b/tests/archive/test_csa_attention_b200.py similarity index 97% rename from tests/test_csa_attention_b200.py rename to tests/archive/test_csa_attention_b200.py index 8b4e27f8..6f4f6935 100644 --- a/tests/test_csa_attention_b200.py +++ b/tests/archive/test_csa_attention_b200.py @@ -52,7 +52,7 @@ def rms(x, w, eps=1e-6): return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype) def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None): - from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear + from dsv4.layers.linear import Nvfp4Linear fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous() s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf s = s.permute(1,0).contiguous() @@ -63,7 +63,7 @@ def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None): s32[:sp] *= g1/gs; s32[sp:] 
*= g2/gs; s = s32.to(torch.float8_e4m3fn) else: gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item() - r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) + r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs] r.finalize_weights(); r._ensure_initialized() return r @@ -169,7 +169,7 @@ def main(): # For this test, we use kv_n directly as the KV for attention # ── Step 6: FULL ATTENTION (PyTorch SDPA, works on Blackwell) ── - from cutedsl.csa_attention import full_attention_reference + from dsv4.reference.csa_attention import full_attention_reference o_attn = full_attention_reference(q_rope, kv_n, scale=SCALE) print(f" Attention output: amax={o_attn.amax():.4f} NaN={torch.isnan(o_attn).any()}") diff --git a/tests/test_csa_sparse_attn_b200.py b/tests/archive/test_csa_sparse_attn_b200.py similarity index 99% rename from tests/test_csa_sparse_attn_b200.py rename to tests/archive/test_csa_sparse_attn_b200.py index 5c75e229..d7afd83f 100644 --- a/tests/test_csa_sparse_attn_b200.py +++ b/tests/archive/test_csa_sparse_attn_b200.py @@ -42,7 +42,7 @@ def rms(x, w, eps=1e-6): return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype) def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None): - from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear + from dsv4.layers.linear import Nvfp4Linear fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous() s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf s = s.permute(1,0).contiguous() @@ -53,7 +53,7 @@ def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None): s32[:sp] *= g1/gs; s32[sp:] *= g2/gs; s = s32.to(torch.float8_e4m3fn) else: gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item() - r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) + r = Nvfp4Linear(in_features=inf, out_features=outf, 
max_num_tokens=8192, device=str(w.device)) r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs] r.finalize_weights(); r._ensure_initialized() return r diff --git a/tests/test_decode_attention_b200.py b/tests/archive/test_decode_attention_b200.py similarity index 99% rename from tests/test_decode_attention_b200.py rename to tests/archive/test_decode_attention_b200.py index c5d4da1f..a1339423 100644 --- a/tests/test_decode_attention_b200.py +++ b/tests/archive/test_decode_attention_b200.py @@ -61,7 +61,7 @@ def rms(x, w, eps=1e-6): return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype) def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None): - from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear + from dsv4.layers.linear import Nvfp4Linear fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous() s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf s = s.permute(1,0).contiguous() @@ -72,7 +72,7 @@ def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None): s32[:sp] *= g1/gs; s32[sp:] *= g2/gs; s = s32.to(torch.float8_e4m3fn) else: gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item() - r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) + r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs] r.finalize_weights(); r._ensure_initialized() return r diff --git a/tests/test_decode_pipeline.py b/tests/archive/test_decode_pipeline.py similarity index 98% rename from tests/test_decode_pipeline.py rename to tests/archive/test_decode_pipeline.py index f562b127..459c84ef 100644 --- a/tests/test_decode_pipeline.py +++ b/tests/archive/test_decode_pipeline.py @@ -24,7 +24,7 @@ from vllm.model_executor.layers.csa_attention import ( apply_gptj_rope, apply_inv_gptj_rope, ) -from cutedsl.native_swa_decode import native_swa_decode_attention +from dsv4.ops.decode_swa import native_swa_decode_attention torch.manual_seed(42) 
torch.cuda.set_device(0) diff --git a/tests/test_decode_vs_prefill_b200.py b/tests/archive/test_decode_vs_prefill_b200.py similarity index 98% rename from tests/test_decode_vs_prefill_b200.py rename to tests/archive/test_decode_vs_prefill_b200.py index 2d236d33..eec978ec 100644 --- a/tests/test_decode_vs_prefill_b200.py +++ b/tests/archive/test_decode_vs_prefill_b200.py @@ -43,7 +43,7 @@ def rms(x, w, eps=1e-6): return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype) def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None): - from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear + from dsv4.layers.linear import Nvfp4Linear fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous() s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf s = s.permute(1,0).contiguous() @@ -54,7 +54,7 @@ def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None): s32[:sp] *= g1/gs; s32[sp:] *= g2/gs; s = s32.to(torch.float8_e4m3fn) else: gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item() - r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) + r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs] r.finalize_weights(); r._ensure_initialized() return r diff --git a/tests/test_diag_layout.py b/tests/archive/test_diag_layout.py similarity index 100% rename from tests/test_diag_layout.py rename to tests/archive/test_diag_layout.py diff --git a/tests/test_diag_permute.py b/tests/archive/test_diag_permute.py similarity index 100% rename from tests/test_diag_permute.py rename to tests/archive/test_diag_permute.py diff --git a/tests/test_diag_smem_layout.py b/tests/archive/test_diag_smem_layout.py similarity index 100% rename from tests/test_diag_smem_layout.py rename to tests/archive/test_diag_smem_layout.py diff --git a/tests/test_diag_v_mma128.py b/tests/archive/test_diag_v_mma128.py similarity index 100% rename from 
tests/test_diag_v_mma128.py rename to tests/archive/test_diag_v_mma128.py diff --git a/tests/test_diag_v_ones.py b/tests/archive/test_diag_v_ones.py similarity index 100% rename from tests/test_diag_v_ones.py rename to tests/archive/test_diag_v_ones.py diff --git a/tests/test_diag_v_truncid.py b/tests/archive/test_diag_v_truncid.py similarity index 100% rename from tests/test_diag_v_truncid.py rename to tests/archive/test_diag_v_truncid.py diff --git a/tests/test_e2e_decode_b200.py b/tests/archive/test_e2e_decode_b200.py similarity index 99% rename from tests/test_e2e_decode_b200.py rename to tests/archive/test_e2e_decode_b200.py index 79d96e68..f5096c8d 100644 --- a/tests/test_e2e_decode_b200.py +++ b/tests/archive/test_e2e_decode_b200.py @@ -45,7 +45,7 @@ def rms(x, w, eps=1e-6): return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype) def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None): - from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear + from dsv4.layers.linear import Nvfp4Linear fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous() s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf s = s.permute(1,0).contiguous() @@ -56,7 +56,7 @@ def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None): s32[:sp] *= g1/gs; s32[sp:] *= g2/gs; s = s32.to(torch.float8_e4m3fn) else: gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item() - r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) + r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs] r.finalize_weights(); r._ensure_initialized() return r diff --git a/tests/test_error_pattern.py b/tests/archive/test_error_pattern.py similarity index 100% rename from tests/test_error_pattern.py rename to tests/archive/test_error_pattern.py diff --git a/tests/test_fmha_pipeline.py b/tests/archive/test_fmha_pipeline.py similarity index 100% rename 
from tests/test_fmha_pipeline.py rename to tests/archive/test_fmha_pipeline.py diff --git a/tests/test_fmha_v1.py b/tests/archive/test_fmha_v1.py similarity index 100% rename from tests/test_fmha_v1.py rename to tests/archive/test_fmha_v1.py diff --git a/tests/test_fmha_v2.py b/tests/archive/test_fmha_v2.py similarity index 100% rename from tests/test_fmha_v2.py rename to tests/archive/test_fmha_v2.py diff --git a/tests/test_fmha_v2_fixed.py b/tests/archive/test_fmha_v2_fixed.py similarity index 100% rename from tests/test_fmha_v2_fixed.py rename to tests/archive/test_fmha_v2_fixed.py diff --git a/tests/test_fmha_v3_debug.py b/tests/archive/test_fmha_v3_debug.py similarity index 100% rename from tests/test_fmha_v3_debug.py rename to tests/archive/test_fmha_v3_debug.py diff --git a/tests/test_full_layer_b200.py b/tests/archive/test_full_layer_b200.py similarity index 97% rename from tests/test_full_layer_b200.py rename to tests/archive/test_full_layer_b200.py index 858021d2..c4db4920 100644 --- a/tests/test_full_layer_b200.py +++ b/tests/archive/test_full_layer_b200.py @@ -57,7 +57,7 @@ def rms(x, w, eps=1e-6): return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype) def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None): - from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear + from dsv4.layers.linear import Nvfp4Linear fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous() s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf s = s.permute(1,0).contiguous() @@ -68,7 +68,7 @@ def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None): s32[:sp] *= g1/gs; s32[sp:] *= g2/gs; s = s32.to(torch.float8_e4m3fn) else: gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item() - r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) + r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs] 
r.finalize_weights(); r._ensure_initialized() return r @@ -195,7 +195,7 @@ def main(): # ── Shared expert ───────────────────────────────────────────────── print("\n--- Shared Expert: CuTeDSL vs BF16 ---") - from cutedsl.shared_expert_pipeline import CuTeDSLSharedExpertRunner + from dsv4.layers.shared_expert import Nvfp4SharedExpert sgw = G(f"{m}.shared_experts.gate_proj.weight"); sgsf = G(f"{m}.shared_experts.gate_proj.weight_scale") sggs = G(f"{m}.shared_experts.gate_proj.weight_scale_2").item() @@ -211,7 +211,7 @@ def main(): s32 = sgu_sf.float(); s32[:si] *= sggs/smgs; s32[si:] *= sugs/smgs sgu_sf = s32.to(torch.float8_e4m3fn) - ser = CuTeDSLSharedExpertRunner(hidden_size=H, intermediate_size=si, max_num_tokens=8192, + ser = Nvfp4SharedExpert(hidden_size=H, intermediate_size=si, max_num_tokens=8192, device=DEV, swiglu_limit=SL) ser.l1_fp4 = [sgu_w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous()] ser.l1_sf = [sgu_sf.permute(1,0).contiguous()]; ser.l1_gs = [smgs] diff --git a/tests/test_full_layer_nan_b200.py b/tests/archive/test_full_layer_nan_b200.py similarity index 98% rename from tests/test_full_layer_nan_b200.py rename to tests/archive/test_full_layer_nan_b200.py index b52bc7b6..87aa5c67 100644 --- a/tests/test_full_layer_nan_b200.py +++ b/tests/archive/test_full_layer_nan_b200.py @@ -39,12 +39,12 @@ def rms(x, w, eps=1e-6): return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype) def make_runner(w, sf, gs_t, inf, outf): - from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear + from dsv4.layers.linear import Nvfp4Linear fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous() s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf s = s.permute(1,0).contiguous() gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item() - r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) + r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) 
r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs] r.finalize_weights(); r._ensure_initialized() return r @@ -102,7 +102,7 @@ def causal_prefill_attention(q, kv, scale): def test_full_layer(layer_id, num_tokens=8, num_moe_experts=16): """Test a complete transformer layer with attention + MoE.""" - from cutedsl.runner import CuTeDSLMoERunner + from dsv4.layers.moe import Nvfp4MoE torch.cuda.set_device(0) torch.manual_seed(42) @@ -162,7 +162,7 @@ def test_full_layer(layer_id, num_tokens=8, num_moe_experts=16): # Free per-expert lists del gate_ws, gate_sfs, gate_gss, up_ws, up_sfs, up_gss, down_ws, down_sfs, down_gss - moe_runner = CuTeDSLMoERunner( + moe_runner = Nvfp4MoE( num_experts=num_moe_experts, hidden_size=H, intermediate_size=INTERMEDIATE, diff --git a/tests/test_full_model_b200.py b/tests/archive/test_full_model_b200.py similarity index 98% rename from tests/test_full_model_b200.py rename to tests/archive/test_full_model_b200.py index 3d0aaed7..be4141f5 100644 --- a/tests/test_full_model_b200.py +++ b/tests/archive/test_full_model_b200.py @@ -56,7 +56,7 @@ def rms(x, w, eps=1e-6): return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype) def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None): - from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear + from dsv4.layers.linear import Nvfp4Linear fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous() s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf s = s.permute(1,0).contiguous() @@ -67,7 +67,7 @@ def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None): s32[:sp] *= g1/gs; s32[sp:] *= g2/gs; s = s32.to(torch.float8_e4m3fn) else: gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item() - r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) + r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs] r.finalize_weights(); r._ensure_initialized() return 
r @@ -124,7 +124,7 @@ def bf16_causal_attention(q, kv, scale): def make_moe_runner(layer_id, wm, model_path): """Create CuTeDSL MoE runner for a layer.""" - from cutedsl.runner import CuTeDSLMoERunner + from dsv4.layers.moe import Nvfp4MoE p = f"model.layers.{layer_id}.mlp" G = lambda k: P(k, wm, model_path).to(DEV) @@ -147,7 +147,7 @@ def make_moe_runner(layer_id, wm, model_path): l2_sf = w2_sf.to(torch.float8_e4m3fn).permute(1,0).contiguous() if w2_sf.dtype != torch.float8_e4m3fn else w2_sf.permute(1,0).contiguous() intermediate_size = 3072 # per expert - runner = CuTeDSLMoERunner( + runner = Nvfp4MoE( num_experts=NUM_EXPERTS, hidden_size=H, intermediate_size=intermediate_size, diff --git a/tests/test_inspect_types.py b/tests/archive/test_inspect_types.py similarity index 100% rename from tests/test_inspect_types.py rename to tests/archive/test_inspect_types.py diff --git a/tests/test_inv_rope.py b/tests/archive/test_inv_rope.py similarity index 100% rename from tests/test_inv_rope.py rename to tests/archive/test_inv_rope.py diff --git a/tests/test_kv_cache_b200.py b/tests/archive/test_kv_cache_b200.py similarity index 98% rename from tests/test_kv_cache_b200.py rename to tests/archive/test_kv_cache_b200.py index 29625338..c827d4ff 100644 --- a/tests/test_kv_cache_b200.py +++ b/tests/archive/test_kv_cache_b200.py @@ -62,7 +62,7 @@ def rms(x, w, eps=1e-6): return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype) def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None): - from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear + from dsv4.layers.linear import Nvfp4Linear fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous() s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf s = s.permute(1,0).contiguous() @@ -73,7 +73,7 @@ def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None): s32[:sp] *= g1/gs; s32[sp:] *= g2/gs; s = s32.to(torch.float8_e4m3fn) else: gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item() - r = 
CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) + r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs] r.finalize_weights(); r._ensure_initialized() return r @@ -135,7 +135,9 @@ def kv_quantize_nvfp4(kv_bf16): kv_bf16: (T, HD) BF16 Returns: (T, HD//2) fp4, (T, HD//16) sf, scalar gs """ - from cutedsl.bridge import quantize_to_nvfp4 +from dsv4.ops.quantize import ( + quantize_to_nvfp4, +) return quantize_to_nvfp4(kv_bf16) diff --git a/tests/test_layout_compare.py b/tests/archive/test_layout_compare.py similarity index 100% rename from tests/test_layout_compare.py rename to tests/archive/test_layout_compare.py diff --git a/tests/test_mma_si_only.py b/tests/archive/test_mma_si_only.py similarity index 100% rename from tests/test_mma_si_only.py rename to tests/archive/test_mma_si_only.py diff --git a/tests/test_mma_si_pv.py b/tests/archive/test_mma_si_pv.py similarity index 100% rename from tests/test_mma_si_pv.py rename to tests/archive/test_mma_si_pv.py diff --git a/tests/test_model_forward_b200.py b/tests/archive/test_model_forward_b200.py similarity index 95% rename from tests/test_model_forward_b200.py rename to tests/archive/test_model_forward_b200.py index 92313977..05a62d23 100644 --- a/tests/test_model_forward_b200.py +++ b/tests/archive/test_model_forward_b200.py @@ -50,7 +50,7 @@ def rms(x, w, eps=1e-6): return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype) def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None): - from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear + from dsv4.layers.linear import Nvfp4Linear fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous() s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf s = s.permute(1,0).contiguous() @@ -61,7 +61,7 @@ def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None): s32[:sp] *= g1/gs; s32[sp:] *= g2/gs; s = 
s32.to(torch.float8_e4m3fn) else: gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item() - r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) + r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs] r.finalize_weights(); r._ensure_initialized() return r @@ -80,15 +80,17 @@ def main(): # ── INSPECT: How does CuTeDSL runner.run() use gs? ──────────────── print("\n--- INSPECTING CuTeDSL runner internals ---") - from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear - from cutedsl.bridge import quantize_activation_nvfp4 + from dsv4.layers.linear import Nvfp4Linear +from dsv4.ops.quantize import ( + quantize_activation_nvfp4, +) print("\n quantize_activation_nvfp4 signature:") sig = inspect.signature(quantize_activation_nvfp4) print(f" {sig}") - print("\n CuTeDSLNvfp4Linear._run_impl source (key lines):") - src = inspect.getsource(CuTeDSLNvfp4Linear._run_impl) + print("\n Nvfp4Linear._run_impl source (key lines):") + src = inspect.getsource(Nvfp4Linear._run_impl) for i, line in enumerate(src.split('\n')): stripped = line.strip() if any(kw in stripped for kw in ['global_scale', '_activation', 'quantize', 'return', 'def ']): diff --git a/tests/test_moe_nan_b200.py b/tests/archive/test_moe_nan_b200.py similarity index 98% rename from tests/test_moe_nan_b200.py rename to tests/archive/test_moe_nan_b200.py index d25431a1..c2a5b8aa 100644 --- a/tests/test_moe_nan_b200.py +++ b/tests/archive/test_moe_nan_b200.py @@ -47,12 +47,12 @@ def rms(x, w, eps=1e-6): return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype) def make_runner(w, sf, gs_t, inf, outf): - from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear + from dsv4.layers.linear import Nvfp4Linear fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous() s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf s = s.permute(1,0).contiguous() gs = gs_t.max().item() 
if gs_t.numel() > 1 else gs_t.item() - r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) + r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs] r.finalize_weights(); r._ensure_initialized() return r diff --git a/tests/test_moe_runner_nan_b200.py b/tests/archive/test_moe_runner_nan_b200.py similarity index 94% rename from tests/test_moe_runner_nan_b200.py rename to tests/archive/test_moe_runner_nan_b200.py index 9f279bcc..ae447a9b 100644 --- a/tests/test_moe_runner_nan_b200.py +++ b/tests/archive/test_moe_runner_nan_b200.py @@ -2,7 +2,7 @@ """ DeepSeek-V4 MoE Runner NaN Test -Tests the CuTeDSLMoERunner (grouped GEMM path) with real weights. +Tests the Nvfp4MoE (grouped GEMM path) with real weights. The single-expert tests pass — this test exercises the FULL MoE runner with routing, padding, grouped GEMM, and combine. @@ -39,7 +39,7 @@ def rms(x, w, eps=1e-6): def pack_expert_weights(wm, G, layer_id=2, num_local_experts=16): - """Pack per-expert weights into stacked format for CuTeDSLMoERunner. + """Pack per-expert weights into stacked format for Nvfp4MoE. Only loads the first num_local_experts to fit in memory. 
""" m = f"model.layers.{layer_id}.mlp" @@ -77,7 +77,7 @@ def pack_expert_weights(wm, G, layer_id=2, num_local_experts=16): # Actually w13 = stacked gate+up, w2 = down # But our runner expects separate L1 (gate+up) and L2 (down) # The w13 format is (E, 2*intermediate, hidden//2) with gate and up interleaved - # For CuTeDSLMoERunner, we stack gate and up side-by-side + # For Nvfp4MoE, we stack gate and up side-by-side # Stack gate and up into w13 format: (E, 2*intermediate, hidden//2) w13_w = torch.cat([torch.stack(gate_ws), torch.stack(up_ws)], dim=1) # (E, 6144, 3584) @@ -92,8 +92,8 @@ def pack_expert_weights(wm, G, layer_id=2, num_local_experts=16): def test_moe_runner(layer_id=2): - """Test the CuTeDSLMoERunner with real weights.""" - from cutedsl.runner import CuTeDSLMoERunner + """Test the Nvfp4MoE with real weights.""" + from dsv4.layers.moe import Nvfp4MoE torch.cuda.set_device(0) torch.manual_seed(42) @@ -118,7 +118,7 @@ def test_moe_runner(layer_id=2): intermediate_size = INTERMEDIATE # 3072 hidden_size = H # 7168 - runner = CuTeDSLMoERunner( + runner = Nvfp4MoE( num_experts=num_local_experts, hidden_size=hidden_size, intermediate_size=intermediate_size, @@ -178,7 +178,7 @@ def test_moe_runner(layer_id=2): def main(): print("=" * 70) print(" DeepSeek-V4 MoE Runner NaN Test") - print(" Tests CuTeDSLMoERunner (grouped GEMM) with real weights") + print(" Tests Nvfp4MoE (grouped GEMM) with real weights") print("=" * 70) test_moe_runner(layer_id=2) diff --git a/tests/test_multilayer.py b/tests/archive/test_multilayer.py similarity index 98% rename from tests/test_multilayer.py rename to tests/archive/test_multilayer.py index e75e206e..829256ff 100644 --- a/tests/test_multilayer.py +++ b/tests/archive/test_multilayer.py @@ -58,8 +58,11 @@ def main(): topk_weights = torch.ones(NUM_TOKENS, TOP_K, dtype=torch.float32, device=DEVICE) / TOP_K # Setup runner - from vllm.nvfp4_cutedsl import CuTeDSLMoERunner - from cutedsl.bridge import assemble_scales_3d_side, 
make_b_k_major + from vllm.nvfp4_cutedsl import Nvfp4MoE +from dsv4.ops.layouts import ( + assemble_scales_3d_side, + make_b_k_major, +) l1_fp4, l1_sf, l1_gs_list = [], [], [] l2_fp4, l2_sf, l2_gs_list = [], [], [] @@ -91,7 +94,7 @@ def main(): l2_sf.append(torch.ones(INTERMEDIATE_SIZE//16, HIDDEN_SIZE, dtype=torch.float8_e4m3fn, device=DEVICE)) l2_gs_list.append(1.0) - runner = CuTeDSLMoERunner( + runner = Nvfp4MoE( num_experts=NUM_EXPERTS, hidden_size=HIDDEN_SIZE, intermediate_size=INTERMEDIATE_SIZE, max_num_tokens=NUM_TOKENS, top_k=TOP_K, device=DEVICE, diff --git a/tests/test_nvfp4_attention_b200.py b/tests/archive/test_nvfp4_attention_b200.py similarity index 96% rename from tests/test_nvfp4_attention_b200.py rename to tests/archive/test_nvfp4_attention_b200.py index c0bdf170..0fed2f2d 100644 --- a/tests/test_nvfp4_attention_b200.py +++ b/tests/archive/test_nvfp4_attention_b200.py @@ -48,7 +48,7 @@ def rms(x, w, eps=1e-6): return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype) def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None): - from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear + from dsv4.layers.linear import Nvfp4Linear fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous() s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf s = s.permute(1,0).contiguous() @@ -59,7 +59,7 @@ def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None): s32[:sp] *= g1/gs; s32[sp:] *= g2/gs; s = s32.to(torch.float8_e4m3fn) else: gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item() - r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) + r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs] r.finalize_weights(); r._ensure_initialized() return r @@ -108,8 +108,11 @@ def nvfp4_qk_attention(q, kv, scale): This is a standard GEMM that CuTeDSL can handle. 
We quantize Q as the "activation" and K^T as the "weight". """ - from cutedsl.bridge import quantize_to_nvfp4, quantize_activation_nvfp4 - from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear +from dsv4.ops.quantize import ( + quantize_to_nvfp4, + quantize_activation_nvfp4, +) + from dsv4.layers.linear import Nvfp4Linear T, NH, HD = q.shape device = q.device @@ -123,7 +126,7 @@ def nvfp4_qk_attention(q, kv, scale): kv_T = kv.T.contiguous() # (HD, T) w_fp4, w_sf, w_gs = quantize_to_nvfp4(kv_T) # (HD//2, T), (HD//16, T), scalar - # Use CuTeDSLNvfp4Linear runner for Q×K^T GEMM + # Use Nvfp4Linear runner for Q×K^T GEMM # in_features=HD, out_features=T # Q is "activation" side, K^T is "weight" side M = T * NH @@ -131,7 +134,7 @@ def nvfp4_qk_attention(q, kv, scale): N = T # Create runner for this specific (M, K, N) combination - runner = CuTeDSLNvfp4Linear( + runner = Nvfp4Linear( in_features=K, out_features=N, max_num_tokens=M, device=str(device) ) diff --git a/tests/test_nvfp4_attn_gemm_b200.py b/tests/archive/test_nvfp4_attn_gemm_b200.py similarity index 96% rename from tests/test_nvfp4_attn_gemm_b200.py rename to tests/archive/test_nvfp4_attn_gemm_b200.py index ebfac828..a47b6263 100644 --- a/tests/test_nvfp4_attn_gemm_b200.py +++ b/tests/archive/test_nvfp4_attn_gemm_b200.py @@ -64,7 +64,7 @@ def rms(x, w, eps=1e-6): return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype) def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None): - from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear + from dsv4.layers.linear import Nvfp4Linear fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous() s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf s = s.permute(1,0).contiguous() @@ -75,7 +75,7 @@ def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None): s32[:sp] *= g1/gs; s32[sp:] *= g2/gs; s = s32.to(torch.float8_e4m3fn) else: gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item() - r = CuTeDSLNvfp4Linear(in_features=inf, 
out_features=outf, max_num_tokens=8192, device=str(w.device)) + r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs] r.finalize_weights(); r._ensure_initialized() return r @@ -163,7 +163,7 @@ class NVFP4Attention: Returns: (T, NH, HD) attention output """ - from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear + from dsv4.layers.linear import Nvfp4Linear T, NH, HD = q_bf16.shape device = q_bf16.device @@ -175,13 +175,13 @@ class NVFP4Attention: # Q is "activation" (T*NH, HD), K^T is "weight" (T, HD) # GEMM: (T*NH, HD) × (HD, T) → (T*NH, T) # - # We use CuTeDSLNvfp4Linear with in_features=HD, out_features=T + # We use Nvfp4Linear with in_features=HD, out_features=T # Q is the "hidden_states", K (kv) is the "weight" matrix # Create or get cached runner cache_key = (T, HD, NH) if self._runner is None or getattr(self, '_cache_key', None) != cache_key: - runner = CuTeDSLNvfp4Linear( + runner = Nvfp4Linear( in_features=HD, out_features=T, max_num_tokens=T * NH, @@ -191,15 +191,17 @@ class NVFP4Attention: # Set K as the weight: kv (T, HD) → treat as weight (N=T, K=HD) # quantize_to_nvfp4 quantizes along last dim (D=HD) as activation # For weight, we need (K, N) layout — but kv is (T, HD) = (N, K) - # CuTeDSLNvfp4Linear expects weight in (N, K//2) after permute + # Nvfp4Linear expects weight in (N, K//2) after permute - from cutedsl.bridge import quantize_to_nvfp4 +from dsv4.ops.quantize import ( + quantize_to_nvfp4, +) # Quantize KV as a 2D tensor: (T, HD) # quantize_to_nvfp4 works on last dim (D=HD), returns: # (T, HD//2) fp4, (T, HD//16) sf, scalar gs kv_fp4, kv_sf, kv_gs = quantize_to_nvfp4(kv_bf16) - # For CuTeDSLNvfp4Linear, weight is (N, K_packed) = (T, HD//2) + # For Nvfp4Linear, weight is (N, K_packed) = (T, HD//2) # Our kv_fp4 is already (T, HD//2) — perfect! 
# sf needs to be (N, K_sf) = (T, HD//16) — already correct diff --git a/tests/test_nvfp4_mapper.py b/tests/archive/test_nvfp4_mapper.py similarity index 100% rename from tests/test_nvfp4_mapper.py rename to tests/archive/test_nvfp4_mapper.py diff --git a/tests/test_o_projection.py b/tests/archive/test_o_projection.py similarity index 100% rename from tests/test_o_projection.py rename to tests/archive/test_o_projection.py diff --git a/tests/test_o_projection_b200.py b/tests/archive/test_o_projection_b200.py similarity index 99% rename from tests/test_o_projection_b200.py rename to tests/archive/test_o_projection_b200.py index dc51e85c..f65bcd83 100644 --- a/tests/test_o_projection_b200.py +++ b/tests/archive/test_o_projection_b200.py @@ -163,7 +163,7 @@ def dequant_nvfp4(packed_uint8, scale_e4m3, global_scale): def test_wo_b_nvfp4(z, wo_b_weight, wo_b_sf, wo_b_gs): """Test wo_b NVFP4 GEMM against BF16 reference.""" sys.path.insert(0, "/root/nvfp4-megamoe-kernel") - from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear + from dsv4.layers.linear import Nvfp4Linear in_features = wo_b_weight.shape[1] * 2 out_features = wo_b_weight.shape[0] @@ -173,7 +173,7 @@ def test_wo_b_nvfp4(z, wo_b_weight, wo_b_sf, wo_b_gs): sf = [wo_b_sf.permute(1, 0).contiguous()] gs = [wo_b_gs] - runner = CuTeDSLNvfp4Linear( + runner = Nvfp4Linear( in_features=in_features, out_features=out_features, max_num_tokens=8192, diff --git a/tests/test_packing_diag.py b/tests/archive/test_packing_diag.py similarity index 100% rename from tests/test_packing_diag.py rename to tests/archive/test_packing_diag.py diff --git a/tests/test_pair_swap.py b/tests/archive/test_pair_swap.py similarity index 100% rename from tests/test_pair_swap.py rename to tests/archive/test_pair_swap.py diff --git a/tests/test_pair_swap2.py b/tests/archive/test_pair_swap2.py similarity index 100% rename from tests/test_pair_swap2.py rename to tests/archive/test_pair_swap2.py diff --git a/tests/test_pipeline_real_weights.py 
b/tests/archive/test_pipeline_real_weights.py similarity index 97% rename from tests/test_pipeline_real_weights.py rename to tests/archive/test_pipeline_real_weights.py index cf2ce97c..517db7cb 100644 --- a/tests/test_pipeline_real_weights.py +++ b/tests/archive/test_pipeline_real_weights.py @@ -91,8 +91,11 @@ def main(): print(f"BF16 ref: amax={ref_out.amax().item():.4f}") # CuTeDSL runner - from vllm.nvfp4_cutedsl import CuTeDSLMoERunner - from cutedsl.bridge import assemble_scales_3d_side, make_b_k_major + from vllm.nvfp4_cutedsl import Nvfp4MoE +from dsv4.ops.layouts import ( + assemble_scales_3d_side, + make_b_k_major, +) l1_fp4, l1_sf, l1_gs = [], [], [] l2_fp4, l2_sf, l2_gs = [], [], [] @@ -125,7 +128,7 @@ def main(): l2_sf.append(torch.ones(INTERMEDIATE_SIZE//16, HIDDEN_SIZE, dtype=torch.float8_e4m3fn, device=DEVICE)) l2_gs.append(1.0) - runner = CuTeDSLMoERunner( + runner = Nvfp4MoE( num_experts=NUM_EXPERTS, hidden_size=HIDDEN_SIZE, intermediate_size=INTERMEDIATE_SIZE, max_num_tokens=MAX_NUM_TOKENS, top_k=TOP_K, device=DEVICE, diff --git a/tests/test_pv64.py b/tests/archive/test_pv64.py similarity index 100% rename from tests/test_pv64.py rename to tests/archive/test_pv64.py diff --git a/tests/test_pv64_fmha_v.py b/tests/archive/test_pv64_fmha_v.py similarity index 100% rename from tests/test_pv64_fmha_v.py rename to tests/archive/test_pv64_fmha_v.py diff --git a/tests/test_pv64_kmajor_v.py b/tests/archive/test_pv64_kmajor_v.py similarity index 100% rename from tests/test_pv64_kmajor_v.py rename to tests/archive/test_pv64_kmajor_v.py diff --git a/tests/test_pv64_no_softmax.py b/tests/archive/test_pv64_no_softmax.py similarity index 100% rename from tests/test_pv64_no_softmax.py rename to tests/archive/test_pv64_no_softmax.py diff --git a/tests/test_pv64_nosoftmax_fmha_v.py b/tests/archive/test_pv64_nosoftmax_fmha_v.py similarity index 100% rename from tests/test_pv64_nosoftmax_fmha_v.py rename to tests/archive/test_pv64_nosoftmax_fmha_v.py diff --git 
a/tests/test_pv_diag.py b/tests/archive/test_pv_diag.py similarity index 100% rename from tests/test_pv_diag.py rename to tests/archive/test_pv_diag.py diff --git a/tests/test_pv_mma_mn_major.py b/tests/archive/test_pv_mma_mn_major.py similarity index 100% rename from tests/test_pv_mma_mn_major.py rename to tests/archive/test_pv_mma_mn_major.py diff --git a/tests/test_quick_rand.py b/tests/archive/test_quick_rand.py similarity index 100% rename from tests/test_quick_rand.py rename to tests/archive/test_quick_rand.py diff --git a/tests/test_recast_minimal.py b/tests/archive/test_recast_minimal.py similarity index 100% rename from tests/test_recast_minimal.py rename to tests/archive/test_recast_minimal.py diff --git a/tests/test_rope_kv_b200.py b/tests/archive/test_rope_kv_b200.py similarity index 96% rename from tests/test_rope_kv_b200.py rename to tests/archive/test_rope_kv_b200.py index a2e5a73d..ceb53710 100644 --- a/tests/test_rope_kv_b200.py +++ b/tests/archive/test_rope_kv_b200.py @@ -39,12 +39,12 @@ def rms(x, w, eps=1e-6): return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype) def make_runner(w, sf, gs_t, inf, outf): - from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear + from dsv4.layers.linear import Nvfp4Linear fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous() s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf s = s.permute(1,0).contiguous() gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item() - r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) + r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs] r.finalize_weights(); r._ensure_initialized() return r @@ -132,7 +132,7 @@ with torch.no_grad(): print(f" Output: amax={o_with_rope.amax():.4f} NaN={torch.isnan(o_with_rope).any()}") # Test 3: Full pipeline - from cutedsl.csa_attention import apply_inv_gptj_rope + from 
dsv4.reference.csa_attention import apply_inv_gptj_rope o_inv = apply_inv_gptj_rope(o_with_rope, positions, cos_sin, NOPE, ROPE) o_grouped = o_inv.view(NT, OG, HPG * HD).permute(1, 0, 2) woa_3d = woa.view(OG, OL, HPG * HD) diff --git a/tests/test_runner_vs_pipeline.py b/tests/archive/test_runner_vs_pipeline.py similarity index 91% rename from tests/test_runner_vs_pipeline.py rename to tests/archive/test_runner_vs_pipeline.py index 8ae00ad6..26f6d25a 100644 --- a/tests/test_runner_vs_pipeline.py +++ b/tests/archive/test_runner_vs_pipeline.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 """ -Test A: Compare moe_pipeline output vs CuTeDSLMoERunner output. +Test A: Compare moe_pipeline output vs Nvfp4MoE output. Uses the same weights and inputs. If they differ, the runner is broken. Runs on the B200 host (not inside Docker): @@ -13,9 +13,17 @@ from safetensors import safe_open REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, REPO_ROOT) -from cutedsl.moe_pipeline import run_nvfp4_moe -from vllm.nvfp4_cutedsl import CuTeDSLMoERunner -from cutedsl.bridge import quantize_to_nvfp4, quantize_weight_to_nvfp4, make_b_k_major, assemble_scales_3d_side, compute_expert_offsets +from dsv4.reference.moe_pipeline import run_nvfp4_moe +from vllm.nvfp4_cutedsl import Nvfp4MoE +from dsv4.ops.quantize import ( + quantize_to_nvfp4, + quantize_weight_to_nvfp4, +) +from dsv4.ops.layouts import ( + make_b_k_major, + assemble_scales_3d_side, + compute_expert_offsets, +) MODEL_DIR = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4" DEVICE = "cuda" @@ -126,9 +134,9 @@ def main(): ) print(f" Pipeline: amax={pipeline_out.abs().max():.4f}, mean={pipeline_out.float().mean():.6f}") - # ── Path 2: CuTeDSLMoERunner with checkpoint input_scale (what vLLM uses) ── - print("\n Running CuTeDSLMoERunner (checkpoint gs)...") - runner = CuTeDSLMoERunner(num_experts, hidden_size, intermediate_size, device=DEVICE) + # ── Path 2: Nvfp4MoE with checkpoint input_scale (what vLLM 
uses) ── + print("\n Running Nvfp4MoE (checkpoint gs)...") + runner = Nvfp4MoE(num_experts, hidden_size, intermediate_size, device=DEVICE) runner.prepare_weights_direct( [w.clone() for w in weights['l1_fp4']], [w.clone() for w in weights['l1_sf']], @@ -157,12 +165,12 @@ def main(): ).item() print(f" Cosine vs pipeline: {cos_ckpt:.6f}") - # ── Path 3: CuTeDSLMoERunner with dynamic gs ── - print("\n Running CuTeDSLMoERunner (dynamic gs)...") + # ── Path 3: Nvfp4MoE with dynamic gs ── + print("\n Running Nvfp4MoE (dynamic gs)...") # We can't use quantize_to_nvfp4 in the runner (cudagraph), but we can # compute the gs from the input and set it before calling run x_igs = (hidden_states.abs().max().item()) / (6.0 * 448.0) - runner2 = CuTeDSLMoERunner(num_experts, hidden_size, intermediate_size, device=DEVICE) + runner2 = Nvfp4MoE(num_experts, hidden_size, intermediate_size, device=DEVICE) runner2.prepare_weights_direct( [w.clone() for w in weights['l1_fp4']], [w.clone() for w in weights['l1_sf']], diff --git a/tests/test_scale_assembly.py b/tests/archive/test_scale_assembly.py similarity index 93% rename from tests/test_scale_assembly.py rename to tests/archive/test_scale_assembly.py index 9623ad4e..7be030a7 100644 --- a/tests/test_scale_assembly.py +++ b/tests/archive/test_scale_assembly.py @@ -14,9 +14,14 @@ import os, sys, torch REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, REPO_ROOT) -from cutedsl.bridge import quantize_to_nvfp4, assemble_scales_2d_side -from cutedsl.kernel.moe.torch_scaled_grouped_mm import pad_and_swizzle_single, ceil_div -from vllm.nvfp4_cutedsl import CuTeDSLMoERunner +from dsv4.ops.quantize import ( + quantize_to_nvfp4, +) +from dsv4.ops.layouts import ( + assemble_scales_2d_side, +) +from dsv4.kernels.gemm.grouped import pad_and_swizzle_single, ceil_div +from vllm.nvfp4_cutedsl import Nvfp4MoE def test_scale_assembly(): @@ -27,7 +32,7 @@ def test_scale_assembly(): intermediate_size = 3072 # Create a 
runner just to use its _assemble_scales_cudagraph_safe - runner = CuTeDSLMoERunner(num_experts, hidden_size, intermediate_size, device=DEVICE) + runner = Nvfp4MoE(num_experts, hidden_size, intermediate_size, device=DEVICE) # Trigger _ensure_stacked and buffer allocation with dummy weights def rand_fp4(*shape): return torch.randint(0, 256, shape, dtype=torch.uint8, device=DEVICE).view(torch.float4_e2m1fn_x2) diff --git a/tests/test_scale_debug.py b/tests/archive/test_scale_debug.py similarity index 90% rename from tests/test_scale_debug.py rename to tests/archive/test_scale_debug.py index 05738d58..794f3285 100644 --- a/tests/test_scale_debug.py +++ b/tests/archive/test_scale_debug.py @@ -3,15 +3,20 @@ import os, sys, torch REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, REPO_ROOT) -from cutedsl.bridge import quantize_to_nvfp4, assemble_scales_2d_side -from cutedsl.kernel.moe.torch_scaled_grouped_mm import pad_and_swizzle_single, ceil_div -from vllm.nvfp4_cutedsl import CuTeDSLMoERunner +from dsv4.ops.quantize import ( + quantize_to_nvfp4, +) +from dsv4.ops.layouts import ( + assemble_scales_2d_side, +) +from dsv4.kernels.gemm.grouped import pad_and_swizzle_single, ceil_div +from vllm.nvfp4_cutedsl import Nvfp4MoE DEVICE = "cuda" num_experts = 3 hidden_size = 7168 -runner = CuTeDSLMoERunner(num_experts, hidden_size, 3072, device=DEVICE) +runner = Nvfp4MoE(num_experts, hidden_size, 3072, device=DEVICE) def rand_fp4(*shape): return torch.randint(0, 256, shape, dtype=torch.uint8, device=DEVICE).view(torch.float4_e2m1fn_x2) def rand_sf(*shape): diff --git a/tests/test_shared_expert.py b/tests/archive/test_shared_expert.py similarity index 97% rename from tests/test_shared_expert.py rename to tests/archive/test_shared_expert.py index 1b12db0d..bf6856ab 100644 --- a/tests/test_shared_expert.py +++ b/tests/archive/test_shared_expert.py @@ -1,6 +1,6 @@ """Standalone test: Shared expert using CuTeDSL dedicated runner. 
-Tests the CuTeDSLSharedExpertRunner for the shared expert path. +Tests the Nvfp4SharedExpert for the shared expert path. Compares against BF16 dequantized reference. Usage: python3 test_shared_expert.py @@ -55,7 +55,7 @@ def main(): torch.manual_seed(42) sys.path.insert(0, "/root/nvfp4-megamoe-kernel") - from cutedsl.shared_expert_pipeline import CuTeDSLSharedExpertRunner + from dsv4.layers.shared_expert import Nvfp4SharedExpert with open(os.path.join(MODEL_PATH, "model.safetensors.index.json")) as f: wm = json.load(f)["weight_map"] @@ -101,7 +101,7 @@ def main(): l2_sf = [down_sf.permute(1, 0).contiguous()] # Create runner - runner = CuTeDSLSharedExpertRunner( + runner = Nvfp4SharedExpert( hidden_size=HIDDEN_SIZE, intermediate_size=INTERMEDIATE_SIZE, max_num_tokens=8192, diff --git a/tests/test_silu_step1.py b/tests/archive/test_silu_step1.py similarity index 97% rename from tests/test_silu_step1.py rename to tests/archive/test_silu_step1.py index 9f54eca6..8b11b35c 100644 --- a/tests/test_silu_step1.py +++ b/tests/archive/test_silu_step1.py @@ -13,12 +13,16 @@ import torch import sys sys.path.insert(0, '/root/dsv4-nvfp4-workspace/kernel') -from cutedsl.bridge import ( +from dsv4.ops.quantize import ( quantize_weight_to_nvfp4, quantize_activation_nvfp4, +) +from dsv4.ops.layouts import ( make_b_k_major, assemble_scales_2d_side, assemble_scales_3d_side, +) +from dsv4.ops.gemm_runner import ( run_nvfp4_grouped_gemm, warmup_compilation, ) diff --git a/tests/test_softmax_only.py b/tests/archive/test_softmax_only.py similarity index 100% rename from tests/test_softmax_only.py rename to tests/archive/test_softmax_only.py diff --git a/tests/test_softmax_store_debug.py b/tests/archive/test_softmax_store_debug.py similarity index 100% rename from tests/test_softmax_store_debug.py rename to tests/archive/test_softmax_store_debug.py diff --git a/tests/test_sparse_attn_b200.py b/tests/archive/test_sparse_attn_b200.py similarity index 98% rename from 
tests/test_sparse_attn_b200.py rename to tests/archive/test_sparse_attn_b200.py index 9cc15148..da7adc42 100644 --- a/tests/test_sparse_attn_b200.py +++ b/tests/archive/test_sparse_attn_b200.py @@ -54,7 +54,7 @@ def rms(x, w, eps=1e-6): return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype) def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None): - from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear + from dsv4.layers.linear import Nvfp4Linear fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous() s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf s = s.permute(1,0).contiguous() @@ -65,7 +65,7 @@ def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None): s32[:sp] *= g1/gs; s32[sp:] *= g2/gs; s = s32.to(torch.float8_e4m3fn) else: gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item() - r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) + r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs] r.finalize_weights(); r._ensure_initialized() return r diff --git a/tests/test_sparse_decode.py b/tests/archive/test_sparse_decode.py similarity index 97% rename from tests/test_sparse_decode.py rename to tests/archive/test_sparse_decode.py index eb1d6d18..31964e4e 100644 --- a/tests/test_sparse_decode.py +++ b/tests/archive/test_sparse_decode.py @@ -1,6 +1,6 @@ import sys, torch, torch.nn.functional as F sys.path.insert(0, "/root/dsv4-nvfp4-workspace/kernel") -from cutedsl.native_sparse_decode import native_sparse_decode_attention +from dsv4.ops.decode_sparse import native_sparse_decode_attention torch.manual_seed(42) torch.cuda.set_device(0) diff --git a/tests/test_stage_a_copy.py b/tests/archive/test_stage_a_copy.py similarity index 100% rename from tests/test_stage_a_copy.py rename to tests/archive/test_stage_a_copy.py diff --git a/tests/test_stage_a_minimal.py 
b/tests/archive/test_stage_a_minimal.py similarity index 100% rename from tests/test_stage_a_minimal.py rename to tests/archive/test_stage_a_minimal.py diff --git a/tests/test_stage_a_pv_created.py b/tests/archive/test_stage_a_pv_created.py similarity index 100% rename from tests/test_stage_a_pv_created.py rename to tests/archive/test_stage_a_pv_created.py diff --git a/tests/test_stage_a_pv_param.py b/tests/archive/test_stage_a_pv_param.py similarity index 100% rename from tests/test_stage_a_pv_param.py rename to tests/archive/test_stage_a_pv_param.py diff --git a/tests/test_stage_a_qk.py b/tests/archive/test_stage_a_qk.py similarity index 100% rename from tests/test_stage_a_qk.py rename to tests/archive/test_stage_a_qk.py diff --git a/tests/test_stage_a_v2.py b/tests/archive/test_stage_a_v2.py similarity index 100% rename from tests/test_stage_a_v2.py rename to tests/archive/test_stage_a_v2.py diff --git a/tests/test_stage_a_with_pv_mma.py b/tests/archive/test_stage_a_with_pv_mma.py similarity index 100% rename from tests/test_stage_a_with_pv_mma.py rename to tests/archive/test_stage_a_with_pv_mma.py diff --git a/tests/test_stage_b_afrag.py b/tests/archive/test_stage_b_afrag.py similarity index 100% rename from tests/test_stage_b_afrag.py rename to tests/archive/test_stage_b_afrag.py diff --git a/tests/test_stage_b_afrag2.py b/tests/archive/test_stage_b_afrag2.py similarity index 100% rename from tests/test_stage_b_afrag2.py rename to tests/archive/test_stage_b_afrag2.py diff --git a/tests/test_stage_b_debug.py b/tests/archive/test_stage_b_debug.py similarity index 100% rename from tests/test_stage_b_debug.py rename to tests/archive/test_stage_b_debug.py diff --git a/tests/test_stage_b_debug2.py b/tests/archive/test_stage_b_debug2.py similarity index 100% rename from tests/test_stage_b_debug2.py rename to tests/archive/test_stage_b_debug2.py diff --git a/tests/test_stage_b_debug3.py b/tests/archive/test_stage_b_debug3.py similarity index 100% rename from 
tests/test_stage_b_debug3.py rename to tests/archive/test_stage_b_debug3.py diff --git a/tests/test_stage_b_debug4.py b/tests/archive/test_stage_b_debug4.py similarity index 100% rename from tests/test_stage_b_debug4.py rename to tests/archive/test_stage_b_debug4.py diff --git a/tests/test_stage_b_diag.py b/tests/archive/test_stage_b_diag.py similarity index 100% rename from tests/test_stage_b_diag.py rename to tests/archive/test_stage_b_diag.py diff --git a/tests/test_stage_b_final.py b/tests/archive/test_stage_b_final.py similarity index 100% rename from tests/test_stage_b_final.py rename to tests/archive/test_stage_b_final.py diff --git a/tests/test_stage_b_identity.py b/tests/archive/test_stage_b_identity.py similarity index 100% rename from tests/test_stage_b_identity.py rename to tests/archive/test_stage_b_identity.py diff --git a/tests/test_stage_b_minimal.py b/tests/archive/test_stage_b_minimal.py similarity index 100% rename from tests/test_stage_b_minimal.py rename to tests/archive/test_stage_b_minimal.py diff --git a/tests/test_stage_b_ntile_v1.py b/tests/archive/test_stage_b_ntile_v1.py similarity index 100% rename from tests/test_stage_b_ntile_v1.py rename to tests/archive/test_stage_b_ntile_v1.py diff --git a/tests/test_stage_b_ntile_v3.py b/tests/archive/test_stage_b_ntile_v3.py similarity index 100% rename from tests/test_stage_b_ntile_v3.py rename to tests/archive/test_stage_b_ntile_v3.py diff --git a/tests/test_stage_b_ntile_v5.py b/tests/archive/test_stage_b_ntile_v5.py similarity index 100% rename from tests/test_stage_b_ntile_v5.py rename to tests/archive/test_stage_b_ntile_v5.py diff --git a/tests/test_stage_b_ntile_v6.py b/tests/archive/test_stage_b_ntile_v6.py similarity index 100% rename from tests/test_stage_b_ntile_v6.py rename to tests/archive/test_stage_b_ntile_v6.py diff --git a/tests/test_stage_b_ntile_v7.py b/tests/archive/test_stage_b_ntile_v7.py similarity index 100% rename from tests/test_stage_b_ntile_v7.py rename to 
tests/archive/test_stage_b_ntile_v7.py diff --git a/tests/test_stage_b_ntile_v8.py b/tests/archive/test_stage_b_ntile_v8.py similarity index 100% rename from tests/test_stage_b_ntile_v8.py rename to tests/archive/test_stage_b_ntile_v8.py diff --git a/tests/test_stage_b_pipeline_only.py b/tests/archive/test_stage_b_pipeline_only.py similarity index 100% rename from tests/test_stage_b_pipeline_only.py rename to tests/archive/test_stage_b_pipeline_only.py diff --git a/tests/test_stage_b_v1.py b/tests/archive/test_stage_b_v1.py similarity index 100% rename from tests/test_stage_b_v1.py rename to tests/archive/test_stage_b_v1.py diff --git a/tests/test_stage_b_v10.py b/tests/archive/test_stage_b_v10.py similarity index 100% rename from tests/test_stage_b_v10.py rename to tests/archive/test_stage_b_v10.py diff --git a/tests/test_stage_b_v11.py b/tests/archive/test_stage_b_v11.py similarity index 100% rename from tests/test_stage_b_v11.py rename to tests/archive/test_stage_b_v11.py diff --git a/tests/test_stage_b_v11b.py b/tests/archive/test_stage_b_v11b.py similarity index 100% rename from tests/test_stage_b_v11b.py rename to tests/archive/test_stage_b_v11b.py diff --git a/tests/test_stage_b_v12.py b/tests/archive/test_stage_b_v12.py similarity index 100% rename from tests/test_stage_b_v12.py rename to tests/archive/test_stage_b_v12.py diff --git a/tests/test_stage_b_v13.py b/tests/archive/test_stage_b_v13.py similarity index 100% rename from tests/test_stage_b_v13.py rename to tests/archive/test_stage_b_v13.py diff --git a/tests/test_stage_b_v14.py b/tests/archive/test_stage_b_v14.py similarity index 100% rename from tests/test_stage_b_v14.py rename to tests/archive/test_stage_b_v14.py diff --git a/tests/test_stage_b_v16.py b/tests/archive/test_stage_b_v16.py similarity index 100% rename from tests/test_stage_b_v16.py rename to tests/archive/test_stage_b_v16.py diff --git a/tests/test_stage_b_v17.py b/tests/archive/test_stage_b_v17.py similarity index 100% rename from 
tests/test_stage_b_v17.py rename to tests/archive/test_stage_b_v17.py diff --git a/tests/test_stage_b_v18.py b/tests/archive/test_stage_b_v18.py similarity index 100% rename from tests/test_stage_b_v18.py rename to tests/archive/test_stage_b_v18.py diff --git a/tests/test_stage_b_v19.py b/tests/archive/test_stage_b_v19.py similarity index 100% rename from tests/test_stage_b_v19.py rename to tests/archive/test_stage_b_v19.py diff --git a/tests/test_stage_b_v2.py b/tests/archive/test_stage_b_v2.py similarity index 100% rename from tests/test_stage_b_v2.py rename to tests/archive/test_stage_b_v2.py diff --git a/tests/test_stage_b_v20.py b/tests/archive/test_stage_b_v20.py similarity index 100% rename from tests/test_stage_b_v20.py rename to tests/archive/test_stage_b_v20.py diff --git a/tests/test_stage_b_v22.py b/tests/archive/test_stage_b_v22.py similarity index 100% rename from tests/test_stage_b_v22.py rename to tests/archive/test_stage_b_v22.py diff --git a/tests/test_stage_b_v22_bug1fix.py b/tests/archive/test_stage_b_v22_bug1fix.py similarity index 100% rename from tests/test_stage_b_v22_bug1fix.py rename to tests/archive/test_stage_b_v22_bug1fix.py diff --git a/tests/test_stage_b_v23.py b/tests/archive/test_stage_b_v23.py similarity index 100% rename from tests/test_stage_b_v23.py rename to tests/archive/test_stage_b_v23.py diff --git a/tests/test_stage_b_v24.py b/tests/archive/test_stage_b_v24.py similarity index 100% rename from tests/test_stage_b_v24.py rename to tests/archive/test_stage_b_v24.py diff --git a/tests/test_stage_b_v25.py b/tests/archive/test_stage_b_v25.py similarity index 100% rename from tests/test_stage_b_v25.py rename to tests/archive/test_stage_b_v25.py diff --git a/tests/test_stage_b_v26.py b/tests/archive/test_stage_b_v26.py similarity index 100% rename from tests/test_stage_b_v26.py rename to tests/archive/test_stage_b_v26.py diff --git a/tests/test_stage_b_v27.py b/tests/archive/test_stage_b_v27.py similarity index 100% rename from 
tests/test_stage_b_v27.py rename to tests/archive/test_stage_b_v27.py diff --git a/tests/test_stage_b_v28.py b/tests/archive/test_stage_b_v28.py similarity index 100% rename from tests/test_stage_b_v28.py rename to tests/archive/test_stage_b_v28.py diff --git a/tests/test_stage_b_v29.py b/tests/archive/test_stage_b_v29.py similarity index 100% rename from tests/test_stage_b_v29.py rename to tests/archive/test_stage_b_v29.py diff --git a/tests/test_stage_b_v3.py b/tests/archive/test_stage_b_v3.py similarity index 100% rename from tests/test_stage_b_v3.py rename to tests/archive/test_stage_b_v3.py diff --git a/tests/test_stage_b_v30.py b/tests/archive/test_stage_b_v30.py similarity index 100% rename from tests/test_stage_b_v30.py rename to tests/archive/test_stage_b_v30.py diff --git a/tests/test_stage_b_v4.py b/tests/archive/test_stage_b_v4.py similarity index 100% rename from tests/test_stage_b_v4.py rename to tests/archive/test_stage_b_v4.py diff --git a/tests/test_stage_b_v5.py b/tests/archive/test_stage_b_v5.py similarity index 100% rename from tests/test_stage_b_v5.py rename to tests/archive/test_stage_b_v5.py diff --git a/tests/test_stage_b_v6.py b/tests/archive/test_stage_b_v6.py similarity index 100% rename from tests/test_stage_b_v6.py rename to tests/archive/test_stage_b_v6.py diff --git a/tests/test_stage_b_v7.py b/tests/archive/test_stage_b_v7.py similarity index 100% rename from tests/test_stage_b_v7.py rename to tests/archive/test_stage_b_v7.py diff --git a/tests/test_stage_b_v7_rep128.py b/tests/archive/test_stage_b_v7_rep128.py similarity index 100% rename from tests/test_stage_b_v7_rep128.py rename to tests/archive/test_stage_b_v7_rep128.py diff --git a/tests/test_stage_b_v7_rep16.py b/tests/archive/test_stage_b_v7_rep16.py similarity index 100% rename from tests/test_stage_b_v7_rep16.py rename to tests/archive/test_stage_b_v7_rep16.py diff --git a/tests/test_stage_b_v7_rep64.py b/tests/archive/test_stage_b_v7_rep64.py similarity index 100% rename 
from tests/test_stage_b_v7_rep64.py rename to tests/archive/test_stage_b_v7_rep64.py diff --git a/tests/test_stage_b_v7_rep8.py b/tests/archive/test_stage_b_v7_rep8.py similarity index 100% rename from tests/test_stage_b_v7_rep8.py rename to tests/archive/test_stage_b_v7_rep8.py diff --git a/tests/test_stage_b_v8.py b/tests/archive/test_stage_b_v8.py similarity index 100% rename from tests/test_stage_b_v8.py rename to tests/archive/test_stage_b_v8.py diff --git a/tests/test_stage_b_v8b.py b/tests/archive/test_stage_b_v8b.py similarity index 100% rename from tests/test_stage_b_v8b.py rename to tests/archive/test_stage_b_v8b.py diff --git a/tests/test_stage_b_v9.py b/tests/archive/test_stage_b_v9.py similarity index 100% rename from tests/test_stage_b_v9.py rename to tests/archive/test_stage_b_v9.py diff --git a/tests/test_step2_subtile.py b/tests/archive/test_step2_subtile.py similarity index 97% rename from tests/test_step2_subtile.py rename to tests/archive/test_step2_subtile.py index 9fe5d435..2c76e1f5 100644 --- a/tests/test_step2_subtile.py +++ b/tests/archive/test_step2_subtile.py @@ -11,12 +11,16 @@ import torch import sys sys.path.insert(0, '/root/dsv4-nvfp4-workspace/kernel') -from cutedsl.bridge import ( +from dsv4.ops.quantize import ( quantize_weight_to_nvfp4, quantize_activation_nvfp4, +) +from dsv4.ops.layouts import ( make_b_k_major, assemble_scales_2d_side, assemble_scales_3d_side, +) +from dsv4.ops.gemm_runner import ( run_nvfp4_grouped_gemm, run_fused_swiglu_grouped_gemm, warmup_compilation, diff --git a/tests/test_step2_subtile_v2.py b/tests/archive/test_step2_subtile_v2.py similarity index 97% rename from tests/test_step2_subtile_v2.py rename to tests/archive/test_step2_subtile_v2.py index 3c40c26d..a91015a6 100644 --- a/tests/test_step2_subtile_v2.py +++ b/tests/archive/test_step2_subtile_v2.py @@ -12,12 +12,16 @@ import torch import sys sys.path.insert(0, '/root/dsv4-nvfp4-workspace/kernel') -from cutedsl.bridge import ( +from dsv4.ops.quantize 
import ( quantize_weight_to_nvfp4, quantize_activation_nvfp4, +) +from dsv4.ops.layouts import ( make_b_k_major, assemble_scales_2d_side, assemble_scales_3d_side, +) +from dsv4.ops.gemm_runner import ( run_nvfp4_grouped_gemm, run_fused_swiglu_grouped_gemm, warmup_compilation, diff --git a/tests/test_store_verify.py b/tests/archive/test_store_verify.py similarity index 100% rename from tests/test_store_verify.py rename to tests/archive/test_store_verify.py diff --git a/tests/test_store_verify2.py b/tests/archive/test_store_verify2.py similarity index 100% rename from tests/test_store_verify2.py rename to tests/archive/test_store_verify2.py diff --git a/tests/test_tmem_addressing.py b/tests/archive/test_tmem_addressing.py similarity index 100% rename from tests/test_tmem_addressing.py rename to tests/archive/test_tmem_addressing.py diff --git a/tests/test_tmem_col2.py b/tests/archive/test_tmem_col2.py similarity index 100% rename from tests/test_tmem_col2.py rename to tests/archive/test_tmem_col2.py diff --git a/tests/test_tmem_col3.py b/tests/archive/test_tmem_col3.py similarity index 100% rename from tests/test_tmem_col3.py rename to tests/archive/test_tmem_col3.py diff --git a/tests/test_tmem_col4.py b/tests/archive/test_tmem_col4.py similarity index 100% rename from tests/test_tmem_col4.py rename to tests/archive/test_tmem_col4.py diff --git a/tests/test_tmem_col5.py b/tests/archive/test_tmem_col5.py similarity index 100% rename from tests/test_tmem_col5.py rename to tests/archive/test_tmem_col5.py diff --git a/tests/test_tmem_col5_16.py b/tests/archive/test_tmem_col5_16.py similarity index 100% rename from tests/test_tmem_col5_16.py rename to tests/archive/test_tmem_col5_16.py diff --git a/tests/test_tmem_col5_32.py b/tests/archive/test_tmem_col5_32.py similarity index 100% rename from tests/test_tmem_col5_32.py rename to tests/archive/test_tmem_col5_32.py diff --git a/tests/test_tmem_col_offset.py b/tests/archive/test_tmem_col_offset.py similarity index 100% 
rename from tests/test_tmem_col_offset.py rename to tests/archive/test_tmem_col_offset.py diff --git a/tests/test_tmem_copy_roundtrip.py b/tests/archive/test_tmem_copy_roundtrip.py similarity index 100% rename from tests/test_tmem_copy_roundtrip.py rename to tests/archive/test_tmem_copy_roundtrip.py diff --git a/tests/test_tmem_debug.py b/tests/archive/test_tmem_debug.py similarity index 100% rename from tests/test_tmem_debug.py rename to tests/archive/test_tmem_debug.py diff --git a/tests/test_tmem_debug2.py b/tests/archive/test_tmem_debug2.py similarity index 100% rename from tests/test_tmem_debug2.py rename to tests/archive/test_tmem_debug2.py diff --git a/tests/test_tmem_fp32_roundtrip.py b/tests/archive/test_tmem_fp32_roundtrip.py similarity index 100% rename from tests/test_tmem_fp32_roundtrip.py rename to tests/archive/test_tmem_fp32_roundtrip.py diff --git a/tests/test_tmem_layout_diag.py b/tests/archive/test_tmem_layout_diag.py similarity index 100% rename from tests/test_tmem_layout_diag.py rename to tests/archive/test_tmem_layout_diag.py diff --git a/tests/test_tmem_pure_fp32.py b/tests/archive/test_tmem_pure_fp32.py similarity index 100% rename from tests/test_tmem_pure_fp32.py rename to tests/archive/test_tmem_pure_fp32.py diff --git a/tests/test_uniform_fp4.py b/tests/archive/test_uniform_fp4.py similarity index 100% rename from tests/test_uniform_fp4.py rename to tests/archive/test_uniform_fp4.py diff --git a/tests/test_v28c_noepi.py b/tests/archive/test_v28c_noepi.py similarity index 100% rename from tests/test_v28c_noepi.py rename to tests/archive/test_v28c_noepi.py diff --git a/tests/test_v4_attention_b200.py b/tests/archive/test_v4_attention_b200.py similarity index 98% rename from tests/test_v4_attention_b200.py rename to tests/archive/test_v4_attention_b200.py index f3b453b7..a4024679 100644 --- a/tests/test_v4_attention_b200.py +++ b/tests/archive/test_v4_attention_b200.py @@ -57,7 +57,7 @@ def rms(x, w, eps=1e-6): return (w.float() * (x * 
torch.rsqrt(v+eps)).float()).to(x.dtype) def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None): - from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear + from dsv4.layers.linear import Nvfp4Linear fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous() s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf s = s.permute(1,0).contiguous() @@ -68,7 +68,7 @@ def make_runner(w, sf, gs_t, inf, outf, fused=False, lw=None): s32[:sp] *= g1/gs; s32[sp:] *= g2/gs; s = s32.to(torch.float8_e4m3fn) else: gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item() - r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) + r = Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs] r.finalize_weights(); r._ensure_initialized() return r diff --git a/tests/test_v_mode_fix.py b/tests/archive/test_v_mode_fix.py similarity index 100% rename from tests/test_v_mode_fix.py rename to tests/archive/test_v_mode_fix.py diff --git a/tests/test_vllm_codepaths_b200.py b/tests/archive/test_vllm_codepaths_b200.py similarity index 98% rename from tests/test_vllm_codepaths_b200.py rename to tests/archive/test_vllm_codepaths_b200.py index 346d3aa1..505a193e 100644 --- a/tests/test_vllm_codepaths_b200.py +++ b/tests/archive/test_vllm_codepaths_b200.py @@ -34,12 +34,12 @@ def rms(x, w, eps=1e-6): return (w.float() * (x * torch.rsqrt(v+eps)).float()).to(x.dtype) def make_runner(w, sf, gs_t, inf, outf): - from cutedsl.nvfp4_linear import CuTeDSLNvfp4Linear + from dsv4.layers.linear import Nvfp4Linear fp4 = w.view(torch.float4_e2m1fn_x2).permute(1,0).contiguous() s = sf.to(torch.float8_e4m3fn) if sf.dtype != torch.float8_e4m3fn else sf s = s.permute(1,0).contiguous() gs = gs_t.max().item() if gs_t.numel() > 1 else gs_t.item() - r = CuTeDSLNvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) + r = 
Nvfp4Linear(in_features=inf, out_features=outf, max_num_tokens=8192, device=str(w.device)) r.fp4 = [fp4]; r.sf = [s]; r.gs = [gs] r.finalize_weights(); r._ensure_initialized() return r @@ -97,7 +97,7 @@ def causal_prefill_attention(q, kv, scale): def main(): """Test the exact csa_attention.py code paths used in the container.""" - from cutedsl.blackwell_attention import ( + from dsv4.reference.attention import ( apply_gptj_rope, apply_inv_gptj_rope, ) diff --git a/tests/test_vsmem_diag.py b/tests/archive/test_vsmem_diag.py similarity index 100% rename from tests/test_vsmem_diag.py rename to tests/archive/test_vsmem_diag.py diff --git a/tests/test_warmup_gs.py b/tests/archive/test_warmup_gs.py similarity index 93% rename from tests/test_warmup_gs.py rename to tests/archive/test_warmup_gs.py index b46a7a6b..a4517d53 100644 --- a/tests/test_warmup_gs.py +++ b/tests/archive/test_warmup_gs.py @@ -14,13 +14,21 @@ Usage (on B200): import torch, sys, os, json sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from cutedsl.bridge import ( - quantize_to_nvfp4, quantize_activation_nvfp4, - make_b_k_major, assemble_scales_2d_side, assemble_scales_3d_side, - run_nvfp4_grouped_gemm, compute_expert_offsets, +from dsv4.ops.quantize import ( + quantize_to_nvfp4, + quantize_activation_nvfp4, ) -from cutedsl.moe_pipeline import run_nvfp4_moe -from vllm.nvfp4_cutedsl import CuTeDSLMoERunner +from dsv4.ops.layouts import ( + make_b_k_major, + assemble_scales_2d_side, + assemble_scales_3d_side, + compute_expert_offsets, +) +from dsv4.ops.gemm_runner import ( + run_nvfp4_grouped_gemm, +) +from dsv4.reference.moe_pipeline import run_nvfp4_moe +from vllm.nvfp4_cutedsl import Nvfp4MoE from safetensors import safe_open MODEL_DIR = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4" @@ -148,7 +156,7 @@ def main(): # ── Test 1: Runner with warmup gs (no safety margin) ── print("\n--- Test 1: Warmup gs, no safety margin ---") - runner = CuTeDSLMoERunner(num_experts, 
hidden_size, intermediate_size, device=DEVICE) + runner = Nvfp4MoE(num_experts, hidden_size, intermediate_size, device=DEVICE) runner.prepare_weights_direct( [w.clone() for w in l1_fp4], [w.clone() for w in l1_sf], list(l1_gs), [w.clone() for w in l2_fp4], [w.clone() for w in l2_sf], list(l2_gs), @@ -166,7 +174,7 @@ def main(): # ── Test 2: Runner with warmup gs + safety margins ── for safety in [1.0, 1.1, 1.2, 1.5, 2.0]: - runner2 = CuTeDSLMoERunner(num_experts, hidden_size, intermediate_size, device=DEVICE) + runner2 = Nvfp4MoE(num_experts, hidden_size, intermediate_size, device=DEVICE) runner2.prepare_weights_direct( [w.clone() for w in l1_fp4], [w.clone() for w in l1_sf], list(l1_gs), [w.clone() for w in l2_fp4], [w.clone() for w in l2_sf], list(l2_gs), diff --git a/tests/test_wo_a.py b/tests/archive/test_wo_a.py similarity index 97% rename from tests/test_wo_a.py rename to tests/archive/test_wo_a.py index 2fb21062..71f086d9 100644 --- a/tests/test_wo_a.py +++ b/tests/archive/test_wo_a.py @@ -18,8 +18,8 @@ import torch.nn.functional as F # Add repo root to path sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from cutedsl.inverse_rope import inverse_rope_bf16 -from cutedsl.wo_a_grouped_linear import CuTeDSLNvfp4WoA +from dsv4.ops.rope import inverse_rope_bf16 +from dsv4.layers.grouped_linear import Nvfp4GroupedLinear DEVICE = "cuda:0" @@ -112,7 +112,7 @@ def test_wo_a_grouped_linear(): z_ref[:, g, :] = o_grouped[:, g, :] @ wo_a_weight[g] # CuTeDSL NVFP4 runner - runner = CuTeDSLNvfp4WoA( + runner = Nvfp4GroupedLinear( n_local_groups=N_LOCAL_GROUPS, heads_per_group=HEADS_PER_GROUP, head_dim=HEAD_DIM, diff --git a/tests/test_wo_a_bmm.py b/tests/archive/test_wo_a_bmm.py similarity index 100% rename from tests/test_wo_a_bmm.py rename to tests/archive/test_wo_a_bmm.py diff --git a/tests/e2e/__init__.py b/tests/e2e/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/integration/__init__.py 
b/tests/integration/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/cudagraph_test.py b/tests/unit/cudagraph_test.py similarity index 98% rename from tests/cudagraph_test.py rename to tests/unit/cudagraph_test.py index 3d3fc55c..2cc2497d 100644 --- a/tests/cudagraph_test.py +++ b/tests/unit/cudagraph_test.py @@ -17,7 +17,7 @@ import contextlib REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, REPO_ROOT) -from vllm.nvfp4_cutedsl import CuTeDSLMoERunner +from vllm.nvfp4_cutedsl import Nvfp4MoE class CUDASyncDetector: @@ -103,7 +103,7 @@ class CUDASyncDetector: def make_dummy_runner(num_experts=32, hidden_size=7168, intermediate_size=3072, device="cuda"): """Create a CuTeDSL runner with dummy weights for testing.""" - runner = CuTeDSLMoERunner(num_experts, hidden_size, intermediate_size, device=device) + runner = Nvfp4MoE(num_experts, hidden_size, intermediate_size, device=device) # Create minimal dummy weights # Create minimal dummy weights (uint8 → view as float4) diff --git a/tests/layertest.py b/tests/unit/layertest.py similarity index 99% rename from tests/layertest.py rename to tests/unit/layertest.py index 7ecd3a3d..eef37d18 100644 --- a/tests/layertest.py +++ b/tests/unit/layertest.py @@ -15,7 +15,7 @@ from safetensors import safe_open REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, REPO_ROOT) -from cutedsl.moe_pipeline import ( +from dsv4.reference.moe_pipeline import ( run_nvfp4_moe, run_nvfp4_moe_fused, ) diff --git a/tests/test_128_128_vdiag.py b/tests/unit/test_128_128_vdiag.py similarity index 100% rename from tests/test_128_128_vdiag.py rename to tests/unit/test_128_128_vdiag.py diff --git a/tests/test_compile_custom_op.py b/tests/unit/test_compile_custom_op.py similarity index 97% rename from tests/test_compile_custom_op.py rename to 
tests/unit/test_compile_custom_op.py index d4580c38..46b033d1 100644 --- a/tests/test_compile_custom_op.py +++ b/tests/unit/test_compile_custom_op.py @@ -21,8 +21,8 @@ from safetensors import safe_open REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, REPO_ROOT) -from cutedsl.runner import CuTeDSLMoERunner -from cutedsl.custom_ops import register_runner, nvfp4_moe_gemm +from dsv4.layers.moe import Nvfp4MoE +from dsv4.ops.custom_ops import register_runner, nvfp4_moe_gemm NVFP4_MODEL_DIR = "/root/nvidia-meeting/DeepSeek-V4-Pro-NVFP4" DEVICE = "cuda" @@ -62,7 +62,10 @@ def load_layer_tensors(model_dir, layer_idx): def prepare_nvfp4_weights_direct(nvfp4_tensors, layer_idx, expert_indices, intermediate_size): - from cutedsl.bridge import quantize_activation_nvfp4, quantize_weight_to_nvfp4 + from dsv4.ops.quantize import ( + quantize_activation_nvfp4, + quantize_weight_to_nvfp4, + ) l1_fp4, l1_sf, l1_gs = [], [], [] l2_fp4, l2_sf, l2_gs = [], [], [] @@ -117,7 +120,7 @@ def main(): weights = prepare_nvfp4_weights_direct(nvfp4_tensors, 0, expert_indices, intermediate_size) # Create runner - runner = CuTeDSLMoERunner( + runner = Nvfp4MoE( num_experts=len(expert_indices), hidden_size=hidden_size, intermediate_size=intermediate_size, diff --git a/tests/test_custom_op.py b/tests/unit/test_custom_op.py similarity index 94% rename from tests/test_custom_op.py rename to tests/unit/test_custom_op.py index 0e0f1f7e..fac8b60b 100644 --- a/tests/test_custom_op.py +++ b/tests/unit/test_custom_op.py @@ -19,7 +19,7 @@ sys.path.insert(0, REPO_ROOT) def test_custom_op_registered(): """Verify nvfp4::linear_gemm and nvfp4::moe_gemm are registered.""" - from cutedsl.custom_ops import nvfp4_linear_gemm, nvfp4_moe_gemm + from dsv4.ops.custom_ops import nvfp4_linear_gemm, nvfp4_moe_gemm # Check they exist as custom ops assert hasattr(nvfp4_linear_gemm, '_name') @@ -29,7 +29,7 @@ def test_custom_op_registered(): def test_runner_registry(): """Test the
runner registry.""" - from cutedsl.custom_ops import register_runner, get_runner + from dsv4.ops.custom_ops import register_runner, get_runner class FakeRunner: def _run_impl(self, x): @@ -46,7 +46,7 @@ def test_runner_registry(): def test_fake_tensor_shape_inference(): """Test that FakeTensor impl returns correct shapes.""" - from cutedsl.custom_ops import nvfp4_linear_gemm, nvfp4_moe_gemm + from dsv4.ops.custom_ops import nvfp4_linear_gemm, nvfp4_moe_gemm # linear_gemm fake impl x_fake = torch.empty(4, 7168, dtype=torch.bfloat16, device='meta') @@ -73,7 +73,7 @@ def test_torch_compile_skips_custom_op(): If torch.compile correctly treats it as opaque, it won't call it during compilation — only the fake impl runs. """ - from cutedsl.custom_ops import register_runner, nvfp4_linear_gemm + from dsv4.ops.custom_ops import register_runner, nvfp4_linear_gemm class ExplodingRunner: """Runner that explodes if _run_impl is ever called.""" diff --git a/tests/test_cutedsl.py b/tests/unit/test_cutedsl.py similarity index 99% rename from tests/test_cutedsl.py rename to tests/unit/test_cutedsl.py index c110e2d8..64101b45 100644 --- a/tests/test_cutedsl.py +++ b/tests/unit/test_cutedsl.py @@ -24,7 +24,7 @@ import cutlass.torch as cutlass_torch import cutlass.utils as utils import cutlass.utils.blockscaled_layout as blockscaled_utils -from cutedsl.kernel.moe.torch_scaled_grouped_mm import ( +from dsv4.kernels.gemm.grouped import ( ScaledGroupedGemmKernel, pad_and_swizzle_single, assemble_raw_scales_2d3d_2d_side, diff --git a/tests/test_fmha_v3.py b/tests/unit/test_fmha_v3.py similarity index 100% rename from tests/test_fmha_v3.py rename to tests/unit/test_fmha_v3.py diff --git a/tests/test_fmha_v3_softmax.py b/tests/unit/test_fmha_v3_softmax.py similarity index 100% rename from tests/test_fmha_v3_softmax.py rename to tests/unit/test_fmha_v3_softmax.py diff --git a/tests/test_fp4_roundtrip.py b/tests/unit/test_fp4_roundtrip.py similarity index 98% rename from 
tests/test_fp4_roundtrip.py rename to tests/unit/test_fp4_roundtrip.py index c7b00870..4a787983 100644 --- a/tests/test_fp4_roundtrip.py +++ b/tests/unit/test_fp4_roundtrip.py @@ -86,7 +86,9 @@ def test_roundtrip(): # Step 2: Re-quantize BF16 → FP4 using our convention sys.path.insert(0, '/root/dsv4-nvfp4-workspace/kernel') - from cutedsl.bridge import quantize_weight_to_nvfp4 + from dsv4.ops.quantize import ( + quantize_weight_to_nvfp4, + ) # quantize_weight_to_nvfp4 expects (K, N) where K is the packed dim # Our gate is (3072, 7168) in BF16, so K=3072, N=7168 diff --git a/tests/test_fused_step1.py b/tests/unit/test_fused_step1.py similarity index 96% rename from tests/test_fused_step1.py rename to tests/unit/test_fused_step1.py index 5c7d513d..4b83fb26 100644 --- a/tests/test_fused_step1.py +++ b/tests/unit/test_fused_step1.py @@ -7,12 +7,16 @@ import torch import sys sys.path.insert(0, '/root/dsv4-nvfp4-workspace/kernel') -from cutedsl.bridge import ( +from dsv4.ops.quantize import ( quantize_weight_to_nvfp4, quantize_activation_nvfp4, +) +from dsv4.ops.layouts import ( make_b_k_major, assemble_scales_2d_side, assemble_scales_3d_side, +) +from dsv4.ops.gemm_runner import ( run_nvfp4_grouped_gemm, run_fused_swiglu_grouped_gemm, warmup_compilation, diff --git a/tests/test_interleave.py b/tests/unit/test_interleave.py similarity index 98% rename from tests/test_interleave.py rename to tests/unit/test_interleave.py index b1985b39..a217f75f 100644 --- a/tests/test_interleave.py +++ b/tests/unit/test_interleave.py @@ -9,15 +9,19 @@ import torch import sys sys.path.insert(0, '/root/dsv4-nvfp4-workspace/kernel') -from cutedsl.bridge import ( +from dsv4.ops.quantize import ( quantize_to_nvfp4, quantize_activation_nvfp4, quantize_weight_to_nvfp4, +) +from dsv4.ops.layouts import ( interleave_l1_weights, deinterleave_l1_weights, make_b_k_major, assemble_scales_2d_side, assemble_scales_3d_side, +) +from dsv4.ops.gemm_runner import ( run_nvfp4_grouped_gemm, ) diff
--git a/tests/test_interleave_gemm.py b/tests/unit/test_interleave_gemm.py similarity index 98% rename from tests/test_interleave_gemm.py rename to tests/unit/test_interleave_gemm.py index 4e5ae193..d131f73a 100644 --- a/tests/test_interleave_gemm.py +++ b/tests/unit/test_interleave_gemm.py @@ -8,13 +8,17 @@ import torch import sys sys.path.insert(0, '/root/dsv4-nvfp4-workspace/kernel') -from cutedsl.bridge import ( +from dsv4.ops.quantize import ( quantize_weight_to_nvfp4, quantize_activation_nvfp4, +) +from dsv4.ops.layouts import ( interleave_l1_weights, make_b_k_major, assemble_scales_2d_side, assemble_scales_3d_side, +) +from dsv4.ops.gemm_runner import ( run_nvfp4_grouped_gemm, warmup_compilation, ) diff --git a/tests/test_pv64_with_softmax.py b/tests/unit/test_pv64_with_softmax.py similarity index 100% rename from tests/test_pv64_with_softmax.py rename to tests/unit/test_pv64_with_softmax.py diff --git a/tests/test_qk_softmax.py b/tests/unit/test_qk_softmax.py similarity index 100% rename from tests/test_qk_softmax.py rename to tests/unit/test_qk_softmax.py diff --git a/tests/test_qkonly.py b/tests/unit/test_qkonly.py similarity index 100% rename from tests/test_qkonly.py rename to tests/unit/test_qkonly.py diff --git a/reference/blockscaled_layout.py b/vendored/blockscaled_layout.py similarity index 100% rename from reference/blockscaled_layout.py rename to vendored/blockscaled_layout.py diff --git a/reference/dense_blockscaled_gemm_persistent.py b/vendored/dense_blockscaled_gemm_persistent.py similarity index 100% rename from reference/dense_blockscaled_gemm_persistent.py rename to vendored/dense_blockscaled_gemm_persistent.py diff --git a/reference/grouped_blockscaled_gemm.py b/vendored/grouped_blockscaled_gemm.py similarity index 100% rename from reference/grouped_blockscaled_gemm.py rename to vendored/grouped_blockscaled_gemm.py diff --git a/reference/moe_moe_persistent_scheduler.py b/vendored/moe_moe_persistent_scheduler.py similarity index 100% rename 
from reference/moe_moe_persistent_scheduler.py rename to vendored/moe_moe_persistent_scheduler.py diff --git a/reference/moe_moe_sched_extension.py b/vendored/moe_moe_sched_extension.py similarity index 100% rename from reference/moe_moe_sched_extension.py rename to vendored/moe_moe_sched_extension.py diff --git a/reference/moe_moe_utils.py b/vendored/moe_moe_utils.py similarity index 100% rename from reference/moe_moe_utils.py rename to vendored/moe_moe_utils.py diff --git a/reference/moe_torch_grouped_mm.py b/vendored/moe_torch_grouped_mm.py similarity index 100% rename from reference/moe_torch_grouped_mm.py rename to vendored/moe_torch_grouped_mm.py diff --git a/reference/moe_torch_scaled_grouped_mm.py b/vendored/moe_torch_scaled_grouped_mm.py similarity index 100% rename from reference/moe_torch_scaled_grouped_mm.py rename to vendored/moe_torch_scaled_grouped_mm.py