[Kernel][Quantization] Integrate block-quantized CUTLASS kernels for DeepSeekV3 (#12587)
Integrates the block-quantized kernels introduced in https://github.com/vllm-project/vllm/pull/11868 for use in linear layers.

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
parent 145c2ff648
commit eb5741ad42
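For context: the blockwise scheme used by DeepSeekV3 scales activations in per-token groups of 128 input channels and weights in 128x128 blocks, which is exactly the [1, 128] / [128, 128] group-shape pair enforced in the first hunk below. A minimal standalone sketch (illustrative C++, not vLLM code; the GEMM extents are made up) of the scale-tensor shapes this implies:

// Illustrative sketch only: expected scale-tensor extents for blockwise FP8,
// assuming a_scale_group_shape == [1, 128] and b_scale_group_shape == [128, 128].
#include <cstdint>
#include <cstdio>

constexpr int64_t ceil_div(int64_t a, int64_t b) { return (a + b - 1) / b; }

int main() {
  const int64_t M = 4096, N = 4096, K = 7168;  // hypothetical GEMM extents
  // a: [M, K] fp8 activations -> a_scales: [M, K/128] (one scale per token per group)
  std::printf("a_scales: [%lld, %lld]\n", (long long)M, (long long)ceil_div(K, 128));
  // b: [K, N] fp8 weights -> b_scales: [K/128, N/128] (one scale per 128x128 block)
  std::printf("b_scales: [%lld, %lld]\n", (long long)ceil_div(K, 128),
              (long long)ceil_div(N, 128));
  return 0;
}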
@@ -58,7 +58,13 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
     vllm::cutlass_scaled_mm_blockwise_sm90_fp8(c, a, b, a_scales, b_scales);
   } else {
-    TORCH_CHECK(false, "Unsupported scale group shapes for CUTLASS 3.x GEMM");
+    TORCH_CHECK(false,
+                "Unsupported scale group shapes for CUTLASS 3.x GEMM.\n "
+                "a_scale_group_shape must be [1, 128], got: [",
+                a_scale_group_shape[0], ", ", a_scale_group_shape[1],
+                "]\n"
+                "b_scale_group_shape must be [128, 128], got: [",
+                b_scale_group_shape[0], ", ", b_scale_group_shape[1], "]");
   }
 }
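The dispatch above keys on scale "group shapes", which can be read as operand extent divided by scale extent in each dimension. A hedged sketch (the group_shape helper is hypothetical, not the vLLM source) of how the [1, 128] / [128, 128] pair falls out of the tensor sizes:

// Hypothetical helper: recover a scale group shape from operand and scale
// extents, assuming each scale covers one contiguous group of elements.
#include <array>
#include <cstdint>
#include <cstdio>

static std::array<int64_t, 2> group_shape(std::array<int64_t, 2> operand,
                                          std::array<int64_t, 2> scales) {
  return {operand[0] / scales[0], operand[1] / scales[1]};
}

int main() {
  // a: [4096, 7168] with a_scales [4096, 56] -> group shape [1, 128]
  auto a = group_shape({4096, 7168}, {4096, 56});
  // b: [7168, 4096] with b_scales [56, 32]   -> group shape [128, 128]
  auto b = group_shape({7168, 4096}, {56, 32});
  std::printf("a: [%lld, %lld]  b: [%lld, %lld]\n", (long long)a[0],
              (long long)a[1], (long long)b[0], (long long)b[1]);
  return 0;
}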
@@ -81,6 +81,19 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) {
   return false;
 }
 
+bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability) {
+  // CUTLASS block-quantized FP8 kernels need at least CUDA 12.0
+  // and at least SM90 (Hopper)
+
+#if defined CUDA_VERSION
+  if (cuda_device_capability >= 90) {
+    return CUDA_VERSION >= 12000;
+  }
+#endif
+
+  return false;
+}
+
 void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
                        torch::Tensor const& b, torch::Tensor const& a_scales,
                        torch::Tensor const& b_scales,
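The new cutlass_scaled_mm_supports_block_fp8 query mirrors the existing cutlass_scaled_mm_supports_fp8 gate but additionally requires CUDA 12.0 and SM90. A self-contained sketch of the same logic (CUDA_VERSION is stubbed here as an assumption so the snippet compiles without the CUDA headers):

// Standalone restatement of the capability gate added above.
#include <cstdint>
#include <cstdio>

#ifndef CUDA_VERSION
#define CUDA_VERSION 12040  // assumption: stand-in for a CUDA 12.4 build
#endif

static bool supports_block_fp8(int64_t cuda_device_capability) {
  if (cuda_device_capability >= 90) {  // SM90 == Hopper
    return CUDA_VERSION >= 12000;      // 12000 encodes CUDA 12.0
  }
  return false;
}

int main() {
  const int64_t caps[] = {80, 89, 90};  // Ampere, Ada, Hopper
  for (int64_t cc : caps) {
    std::printf("sm%lld: %s\n", (long long)cc,
                supports_block_fp8(cc) ? "supported" : "unsupported");
  }
  return 0;
}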
@@ -212,4 +225,4 @@ void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a,
       "No compiled cutlass_scaled_mm_azp for a compute capability less than "
       "CUDA device capability: ",
       version_num);
-}
+}