fix cutlass_3x_gemm_fp8_blockwise on sm103a (#32224)

Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
Co-authored-by: Pavani Majety <pmajety@nvidia.com>
2026-02-02 11:47:46 -08:00
parent 0130223bd9
commit 089cd4f002
7 changed files with 129 additions and 82 deletions
--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm.cuh
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm.cuh
@@ -141,8 +141,8 @@ struct cutlass_3x_gemm_sm100 {
              sizeof(typename CollectiveEpilogue::SharedStorage))>,
          KernelSchedule>::CollectiveOp;

-  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
-      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>;
+  using GemmKernel = enable_sm100f_only<cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>>;
 };

 template <typename ElementAB_, typename ElementD_,
@@ -202,8 +202,8 @@ struct cutlass_3x_gemm_sm120 {
              sizeof(typename CollectiveEpilogue::SharedStorage))>,
          KernelSchedule>::CollectiveOp;

-  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
-      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>;
+  using GemmKernel = enable_sm120_only<cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>>;
 };

 }  // namespace vllm
--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh
@@ -123,7 +123,7 @@ struct cutlass_3x_gemm_fp8_blockwise {
          MainloopScheduler
      >::CollectiveOp>;

-  using KernelType = enable_sm100_only<cutlass::gemm::kernel::GemmUniversal<
+  using KernelType = enable_sm100f_only<cutlass::gemm::kernel::GemmUniversal<
      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue>>;

  struct GemmKernel : public KernelType {};
--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8_dispatch.cuh
@@ -90,8 +90,8 @@ struct cutlass_3x_gemm_sm100_fp8 {
  // -----------------------------------------------------------
  // Kernel definition
  // -----------------------------------------------------------
-  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
-      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>;
+  using GemmKernel = enable_sm100f_only<cutlass::gemm::kernel::GemmUniversal<
+      Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue, void>>;
 };

 template <typename InType, typename OutType, bool EnableBias>