From bc32444b238d2ec3726f599cf3fc67dbaf51a6c6 Mon Sep 17 00:00:00 2001
From: Vel <110626982+Code4me2@users.noreply.github.com>
Date: Fri, 6 Feb 2026 20:28:01 -0800
Subject: [PATCH] [Kernel] Add enable_sm120_family for SM121 (DGX Spark)
 CUTLASS support (#33517)

Signed-off-by: code4me2 <velvetmoon222999@gmail.com>
---
 csrc/cutlass_extensions/common.hpp                    | 11 +++++++++++
 .../c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh    |  3 ++-
 2 files changed, 13 insertions(+), 1 deletion(-)
diff --git a/csrc/cutlass_extensions/common.hpp b/csrc/cutlass_extensions/common.hpp
index 1d5280139..91c215071 100644
--- a/csrc/cutlass_extensions/common.hpp
+++ b/csrc/cutlass_extensions/common.hpp
@@ -152,3 +152,14 @@ struct enable_sm120_only : Kernel {
 #endif
   }
 };
+
+// Forwards to Kernel only on SM12x: SM120 (RTX 5090), SM121 (DGX Spark GB10)
+template <typename Kernel>
+struct enable_sm120_family : Kernel {
+  template <typename... Args>
+  CUTLASS_DEVICE void operator()(Args&&... args) {
+#if defined __CUDA_ARCH__ && (__CUDA_ARCH__ >= 1200 && __CUDA_ARCH__ < 1300)
+    Kernel::operator()(std::forward<Args>(args)...);
+#endif  // otherwise the body is empty (no-op)
+  }
+};
diff --git a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh
index 811741aee..f255b27a1 100644
--- a/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh
+++ b/csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8_dispatch.cuh
@@ -103,7 +103,8 @@ struct cutlass_3x_gemm_fp8_blockwise {
           MainloopScheduler
       >::CollectiveOp;
 
-  using KernelType = enable_sm120_only<cutlass::gemm::kernel::GemmUniversal<
+  // SM12x-family guard: covers both SM120 (RTX 5090) and SM121 (DGX Spark)
+  using KernelType = enable_sm120_family<cutlass::gemm::kernel::GemmUniversal<
       Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue>>;
 
   struct GemmKernel : public KernelType {};