[NVIDIA] Fix DGX Spark logic (#38126)

Signed-off-by: johnnynunez <johnnynuca14@gmail.com> Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Signed-off-by: Nick Hill <nickhill123@gmail.com> Signed-off-by: Mark McLoughlin <markmc@redhat.com> Signed-off-by: Andreas Karatzas <akaratza@amd.com> Signed-off-by: Matthew Wong <Matthew.Wong2@amd.com> Signed-off-by: Sathish Sanjeevi <sathish.krishnan.p.s@gmail.com> Signed-off-by: guillaume_guy <guillaume.guy@airbnb.com> Signed-off-by: Guillaume Guy <guillaume.c.guy@gmail.com> Co-authored-by: Yongye Zhu <zyy1102000@gmail.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk> Co-authored-by: Nick Hill <nickhill123@gmail.com> Co-authored-by: Mark McLoughlin <markmc@redhat.com> Co-authored-by: Andreas Karatzas <akaratza@amd.com> Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu> Co-authored-by: Matthew Wong <Matthew.Wong2@amd.com> Co-authored-by: Sathish Sanjeevi <SKPsanjeevi@users.noreply.github.com> Co-authored-by: Guillaume Guy <guillaume.c.guy@gmail.com> Co-authored-by: guillaume_guy <guillaume.guy@airbnb.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2026-03-27 23:26:07 +01:00
parent 384e4d5f48
commit 97d19197bc
3 changed files with 47 additions and 12 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -363,7 +363,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # - sm80 doesn't support fp8 computation
  # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
  # so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and SM12x (12.0, e.g. RTX 50x0; 12.1, e.g. DGX Spark)
-  cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0;12.1" "${CUDA_ARCHS}")
  # marlin arches for other files
  cuda_archs_loose_intersection(MARLIN_OTHER_ARCHS "7.5;8.0+PTX" "${CUDA_ARCHS}")

@@ -523,12 +523,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  endif()


-  # The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
+  # The cutlass_scaled_mm kernels for Blackwell SM12x (c3x, i.e. CUTLASS 3.x) require
  # CUDA 12.8 or later
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
    cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0f" "${CUDA_ARCHS}")
  else()
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a;12.1a" "${CUDA_ARCHS}")
  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
    set(SRCS
@@ -616,12 +616,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    endif()
  endif()

-  # The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require
+  # The nvfp4_scaled_mm_sm120 kernels for Blackwell SM12x require
  # CUDA 12.8 or later
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
    cuda_archs_loose_intersection(FP4_ARCHS "12.0f" "${CUDA_ARCHS}")
  else()
-    cuda_archs_loose_intersection(FP4_ARCHS "12.0a" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(FP4_ARCHS "12.0a;12.1a" "${CUDA_ARCHS}")
  endif()
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
    set(SRCS
@@ -1050,7 +1050,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # - sm80 doesn't support fp8 computation
  # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SAAS instruction
  # so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and SM12x (12.0, e.g. RTX 50x0; 12.1, e.g. DGX Spark)
-  cuda_archs_loose_intersection(MARLIN_MOE_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(MARLIN_MOE_FP8_ARCHS "8.9;12.0;12.1" "${CUDA_ARCHS}")
  # moe marlin arches for other files
  cuda_archs_loose_intersection(MARLIN_MOE_OTHER_ARCHS "7.5;8.0+PTX" "${CUDA_ARCHS}")
  if (MARLIN_MOE_OTHER_ARCHS)