[Hardware/NVIDIA/Kernel] Enable nvidia/DeepSeek-R1-FP4 Model (#16362)

2025-05-09 16:24:41 -07:00
parent 3b602cdea7
commit 0c0fdae84f
16 changed files with 1994 additions and 112 deletions
--- a/csrc/quantization/fp4/nvfp4_quant_entry.cu
+++ b/csrc/quantization/fp4/nvfp4_quant_entry.cu
@@ -23,10 +23,32 @@ void scaled_fp4_quant_sm100a(torch::Tensor const& output,
                             torch::Tensor const& input_sf);
 #endif

+#if defined ENABLE_NVFP4 && ENABLE_NVFP4
+void scaled_fp4_experts_quant_sm100a(
+    torch::Tensor& output, torch::Tensor& output_scale,
+    torch::Tensor const& input, torch::Tensor const& input_global_scale,
+    torch::Tensor const& input_offset_by_experts,
+    torch::Tensor const& output_scale_offset_by_experts);
+#endif
+
 void scaled_fp4_quant(torch::Tensor& output, torch::Tensor const& input,
                      torch::Tensor& output_sf, torch::Tensor const& input_sf) {
 #if defined ENABLE_NVFP4 && ENABLE_NVFP4
  return scaled_fp4_quant_sm100a(output, input, output_sf, input_sf);
 #endif
-  TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 quantization");
+  TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled nvfp4 quantization kernel");
+}
+
+void scaled_fp4_experts_quant(
+    torch::Tensor& output, torch::Tensor& output_scale,
+    torch::Tensor const& input, torch::Tensor const& input_global_scale,
+    torch::Tensor const& input_offset_by_experts,
+    torch::Tensor const& output_scale_offset_by_experts) {
+#if defined ENABLE_NVFP4 && ENABLE_NVFP4
+  return scaled_fp4_experts_quant_sm100a(
+      output, output_scale, input, input_global_scale, input_offset_by_experts,
+      output_scale_offset_by_experts);
+#endif
+  TORCH_CHECK_NOT_IMPLEMENTED(false,
+                              "No compiled nvfp4 experts quantization kernel");
 }