diff --git a/CMakeLists.txt b/CMakeLists.txt
index cf59f18eb..ff57473cf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -309,7 +309,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
 
   # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
-  set(CUTLASS_REVISION "v4.2.1")
+  set(CUTLASS_REVISION "v4.4.2")
 
   # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
   if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
diff --git a/csrc/quantization/fp4/nvfp4_quant_entry.cu b/csrc/quantization/fp4/nvfp4_quant_entry.cu
index 8b5a1fd22..8729b01e4 100644
--- a/csrc/quantization/fp4/nvfp4_quant_entry.cu
+++ b/csrc/quantization/fp4/nvfp4_quant_entry.cu
@@ -16,6 +16,7 @@
 
 #include <torch/all.h>
 
+#include "cutlass_extensions/common.hpp"
 #include "nvfp4_utils.cuh"
 
 #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
@@ -53,12 +54,27 @@ void silu_and_mul_scaled_fp4_experts_quant_sm1xxa(
     torch::Tensor const& output_scale_offset_by_experts);
 #endif
 
+static bool nvfp4_quant_sm_supported() {  // runtime SM vs. compiled-in kernels
+  const int32_t sm = get_sm_version_num();  // likely major*10 + minor; confirm
+#if defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100
+  if (sm >= 100 && sm < 120) return true;  // SM100 build accepts [100, 120)
+#endif
+#if defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120
+  if (sm >= 120 && sm < 130) return true;  // SM120 build accepts [120, 130)
+#endif
+  return false;  // no enabled build flag matches this SM version
+}
+
 void scaled_fp4_quant_out(torch::Tensor const& input,
                           torch::Tensor const& input_sf,
                           bool is_sf_swizzled_layout, torch::Tensor& output,
                           torch::Tensor& output_sf) {
 #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
     (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
+  TORCH_CHECK(nvfp4_quant_sm_supported(),
+              "No compiled nvfp4 quantization kernel for SM ",
+              get_sm_version_num(),
+              ". Recompile with the appropriate CUDA arch.");
   return scaled_fp4_quant_sm1xxa(output, input, output_sf, input_sf,
                                  is_sf_swizzled_layout);
 #endif
@@ -100,6 +116,10 @@ void scaled_fp4_experts_quant(
     torch::Tensor const& output_scale_offset_by_experts) {
 #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
     (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
+  TORCH_CHECK(nvfp4_quant_sm_supported(),
+              "No compiled nvfp4 experts quantization kernel for SM ",
+              get_sm_version_num(),
+              ". Recompile with the appropriate CUDA arch.");
   return scaled_fp4_experts_quant_sm1xxa(
       output, output_scale, input, input_global_scale, input_offset_by_experts,
       output_scale_offset_by_experts);
@@ -112,6 +132,10 @@ void silu_and_mul_nvfp4_quant(torch::Tensor& output, torch::Tensor& output_sf,
                               torch::Tensor& input, torch::Tensor& input_sf) {
 #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
     (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
+  TORCH_CHECK(nvfp4_quant_sm_supported(),
+              "No compiled silu_and_mul nvfp4 quantization kernel for SM ",
+              get_sm_version_num(),
+              ". Recompile with the appropriate CUDA arch.");
   return silu_and_mul_nvfp4_quant_sm1xxa(output, output_sf, input, input_sf);
 #endif
   TORCH_CHECK_NOT_IMPLEMENTED(
@@ -125,6 +149,11 @@ void silu_and_mul_scaled_fp4_experts_quant(
     torch::Tensor const& output_scale_offset_by_experts) {
 #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \
     (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120)
+  TORCH_CHECK(nvfp4_quant_sm_supported(),
+              "No compiled silu_and_mul nvfp4 experts quantization kernel "
+              "for SM ",
+              get_sm_version_num(),
+              ". Recompile with the appropriate CUDA arch.");
   return silu_and_mul_scaled_fp4_experts_quant_sm1xxa(
       output, output_scale, input, input_global_scale, input_offset_by_experts,
       output_scale_offset_by_experts);
diff --git a/csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu b/csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu
index d9c4d24d8..4985bf1eb 100644
--- a/csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu
+++ b/csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu
@@ -63,5 +63,17 @@ void cutlass_scaled_fp4_mm(torch::Tensor& D, const torch::Tensor& A,
 bool cutlass_scaled_mm_supports_fp4(int64_t cuda_device_capability) {
   int runtimeVersion;
   cudaRuntimeGetVersion(&runtimeVersion);
-  return cuda_device_capability >= 100 && runtimeVersion >= 12080;
+  if (runtimeVersion < 12080) return false;  // presumably CUDA 12.8; confirm
+  // Only report support when the SM-specific kernel was actually compiled in,
+  // so the Python-side backend selector does not choose CUTLASS and then hit
+  // TORCH_CHECK_NOT_IMPLEMENTED (or worse, fall through to Marlin).
+#if defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100
+  if (cuda_device_capability >= 100 && cuda_device_capability < 120)
+    return true;  // SM100 build accepts capabilities in [100, 120)
+#endif
+#if defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120
+  if (cuda_device_capability >= 120 && cuda_device_capability < 130)
+    return true;  // SM120 build accepts capabilities in [120, 130)
+#endif
+  return false;  // capability not covered by any enabled build flag
 }
diff --git a/csrc/quantization/machete/machete_mainloop.cuh b/csrc/quantization/machete/machete_mainloop.cuh
index 9f02f4f17..a0d1ded88 100644
--- a/csrc/quantization/machete/machete_mainloop.cuh
+++ b/csrc/quantization/machete/machete_mainloop.cuh
@@ -154,6 +154,7 @@ struct MacheteCollectiveMma {
   struct DispatchPolicy {
     constexpr static int Stages = PipelineStages;
     using ClusterShape = ClusterShape_MNK;
+    using ArchTag = arch::Sm90;
     using Schedule = KernelScheduleType;
   };
 
diff --git a/docker/Dockerfile b/docker/Dockerfile
index b26b82eb5..2d63b39dc 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -590,7 +590,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 # Install FlashInfer JIT cache (requires CUDA-version-specific index URL)
 # https://docs.flashinfer.ai/installation.html
 # From versions.json: .flashinfer.version
-ARG FLASHINFER_VERSION=0.6.6
+# 0.6.7: CUTLASS 4.4.2 bump, fixes TMA grouped GEMM on SM12x (flashinfer#2798)
+# TODO: bump to 0.6.8 once it is released, to pick up NVFP4/MXFP4 grouped
+#   GEMMs on SM120/SM121 (RTX 50 / DGX Spark) from flashinfer#2738
+ARG FLASHINFER_VERSION=0.6.7
 RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system flashinfer-jit-cache==${FLASHINFER_VERSION} \
         --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
diff --git a/docker/Dockerfile.nightly_torch b/docker/Dockerfile.nightly_torch
index 5c424980e..045a09a42 100644
--- a/docker/Dockerfile.nightly_torch
+++ b/docker/Dockerfile.nightly_torch
@@ -217,13 +217,16 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.
 
 
 # build flashinfer for torch nightly from source around 10 mins
-# release version: v0.6.6
+# release version: v0.6.7
+# 0.6.7: CUTLASS 4.4.2 bump, fixes TMA grouped GEMM on SM12x (flashinfer#2798)
+# TODO: bump to 0.6.8 once it is released, to pick up NVFP4/MXFP4 grouped
+#   GEMMs on SM120/SM121 (RTX 50 / DGX Spark) from flashinfer#2738
 # todo(elainewy): cache flashinfer build result for faster build
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/uv \
     echo "git clone flashinfer..." \
-    && git clone --depth 1 --branch v0.6.6 --recursive https://github.com/flashinfer-ai/flashinfer.git \
+    && git clone --depth 1 --branch v0.6.7 --recursive https://github.com/flashinfer-ai/flashinfer.git \
     && cd flashinfer \
     && git submodule update --init --recursive \
     && echo "finish git clone flashinfer..." \
diff --git a/docker/versions.json b/docker/versions.json
index 582d1bd54..51be33954 100644
--- a/docker/versions.json
+++ b/docker/versions.json
@@ -68,7 +68,7 @@
       "default": "true"
     },
     "FLASHINFER_VERSION": {
-      "default": "0.6.6"
+      "default": "0.6.7"
     },
     "GDRCOPY_CUDA_VERSION": {
       "default": "12.8"
diff --git a/requirements/cuda.txt b/requirements/cuda.txt
index fe566db35..6d7f9693f 100644
--- a/requirements/cuda.txt
+++ b/requirements/cuda.txt
@@ -9,8 +9,8 @@ torchaudio==2.10.0
 # These must be updated alongside torch
 torchvision==0.25.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 # FlashInfer should be updated together with the Dockerfile
-flashinfer-python==0.6.6
-flashinfer-cubin==0.6.6
+flashinfer-python==0.6.7
+flashinfer-cubin==0.6.7
 # Cap nvidia-cudnn-frontend (transitive dep of flashinfer) due to
 # breaking changes in 1.19.0
 nvidia-cudnn-frontend>=1.13.0,<1.19.0
diff --git a/tests/kernels/moe/test_unquantized_backend_selection.py b/tests/kernels/moe/test_unquantized_backend_selection.py
index 1d9e1d685..48f1a4b94 100644
--- a/tests/kernels/moe/test_unquantized_backend_selection.py
+++ b/tests/kernels/moe/test_unquantized_backend_selection.py
@@ -57,7 +57,6 @@ def test_select_default_backend_by_platform(
         moe_config = make_dummy_moe_config()
         selected_backend = select_unquantized_moe_backend(
             moe_config=moe_config,
-            use_ep=False,
             use_dp=False,
         )
 
@@ -90,7 +89,6 @@ def test_select_rocm_aiter_backend(mock_aiter_enabled, mock_has_flashinfer):
         moe_config = make_dummy_moe_config()
         selected_backend = select_unquantized_moe_backend(
             moe_config=moe_config,
-            use_ep=False,
             use_dp=False,
         )
 
@@ -129,7 +127,6 @@ def test_select_cuda_flashinfer_trtllm_backend(
 
         selected_backend = select_unquantized_moe_backend(
             moe_config=moe_config,
-            use_ep=True,
             use_dp=False,
         )
 
@@ -171,7 +168,6 @@ def test_select_cuda_flashinfer_cutlass_backend(
 
         selected_backend = select_unquantized_moe_backend(
             moe_config=moe_config,
-            use_ep=True,  # CUTLASS requires EP
             use_dp=False,  # CUTLASS doesn't support DP
         )
 
diff --git a/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py b/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py
index c0a7dfc49..4cb12a8c1 100644
--- a/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py
+++ b/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py
@@ -406,6 +406,11 @@ class TrtLlmFp8ExpertsMonolithic(TrtLlmFp8ExpertsBase, mk.FusedMoEExpertsMonolit
         if self.routing_method_type == RoutingMethodType.DeepSeekV3:
             router_logits = router_logits.to(torch.float32)
 
+        # Currently FlashInfer requires a bfloat16 routing bias.
+        # https://github.com/flashinfer-ai/flashinfer/issues/2909
+        if e_score_correction_bias is not None:
+            e_score_correction_bias = e_score_correction_bias.to(torch.bfloat16)
+
         out = flashinfer.fused_moe.trtllm_fp8_per_tensor_scale_moe(
             routing_logits=router_logits,
             routing_bias=e_score_correction_bias,
diff --git a/vllm/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py b/vllm/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py
index b47391c41..81b778c8f 100644
--- a/vllm/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py
+++ b/vllm/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py
@@ -5,6 +5,7 @@ import flashinfer
 import torch
 
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.activation import MoEActivation
 from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEConfig,
@@ -27,6 +28,8 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import has_flashinfer_trtllm_fused_moe
 
+logger = init_logger(__name__)
+
 
 class TrtLlmNvFp4ExpertsBase:
     """
@@ -315,6 +318,11 @@ class TrtLlmNvFp4ExpertsMonolithic(
             else router_logits
         )
 
+        # Currently FlashInfer requires a bfloat16 routing bias.
+        # https://github.com/flashinfer-ai/flashinfer/issues/2909
+        if e_score_correction_bias is not None:
+            e_score_correction_bias = e_score_correction_bias.to(torch.bfloat16)
+
         # Invoke kernel.
         return flashinfer.fused_moe.trtllm_fp4_block_scale_moe(
             routing_logits=router_logits,
diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
index 91f7a83f6..26409804c 100644
--- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
@@ -361,7 +361,7 @@ class FlashInferExperts(mk.FusedMoEExpertsModular):
             fc1_expert_weights = w1
             fc2_expert_weights = w2
         else:
-            quant_scales = None
+            quant_scales = []
             a1q_scale = None
             fc1_expert_weights = w1
             fc2_expert_weights = w2
diff --git a/vllm/model_executor/layers/fused_moe/oracle/unquantized.py b/vllm/model_executor/layers/fused_moe/oracle/unquantized.py
index 9c31da10d..84f4df010 100644
--- a/vllm/model_executor/layers/fused_moe/oracle/unquantized.py
+++ b/vllm/model_executor/layers/fused_moe/oracle/unquantized.py
@@ -70,7 +70,6 @@ def map_unquantized_backend(runner_backend: MoEBackend) -> UnquantizedMoeBackend
 
 def select_unquantized_moe_backend(
     moe_config: FusedMoEConfig,
-    use_ep: bool,
     use_dp: bool,
 ) -> UnquantizedMoeBackend:
     """
@@ -96,7 +95,6 @@ def select_unquantized_moe_backend(
     # FlashInfer CUTLASS MoE is only supported on Hopper and later GPUS
     flashinfer_cutlass_available = (
         has_flashinfer_cutlass_fused_moe()
-        and use_ep
         and (not use_dp)
         and current_platform.has_device_capability(90)
     )
@@ -161,9 +159,9 @@ def select_unquantized_moe_backend(
                     "to enable it for better performance.",
                     scope="local",
                 )
-            elif use_ep and (not use_dp):
+            elif not use_dp and flashinfer_cutlass_available:
                 logger.info_once(
-                    "FlashInfer MoE is available for EP"
+                    "FlashInfer CUTLASS MoE is available"
                     " but not enabled, consider setting"
                     " VLLM_USE_FLASHINFER_MOE_FP16=1 to enable it.",
                     scope="local",
diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
index 38b552b02..cf1afcb29 100644
--- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
+++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
@@ -61,7 +61,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         super().__init__(moe)
         self.unquantized_backend = select_unquantized_moe_backend(
             moe_config=self.moe,
-            use_ep=self.moe.moe_parallel_config.use_ep,
             use_dp=self.moe.moe_parallel_config.dp_size > 1,
         )
 
diff --git a/vllm/model_executor/layers/quantization/utils/nvfp4_utils.py b/vllm/model_executor/layers/quantization/utils/nvfp4_utils.py
index bcb4769e4..f21f2ef23 100644
--- a/vllm/model_executor/layers/quantization/utils/nvfp4_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/nvfp4_utils.py
@@ -55,8 +55,16 @@ def select_nvfp4_linear_backend() -> NvFp4LinearBackend:
     elif envs.VLLM_USE_NVFP4_CT_EMULATIONS:
         backend = NvFp4LinearBackend.EMULATION
     elif envs.VLLM_NVFP4_GEMM_BACKEND is None:
-        # Auto-select best available backend
-        if current_platform.has_device_capability(100) and has_flashinfer():
+        # Auto-select best available backend.
+        # cutlass_fp4_supported() checks that the vLLM NVFP4 kernels (both
+        # quantization and GEMM) were compiled for the current SM version.
+        # FlashInfer backends still rely on the vLLM quantization kernels,
+        # so we gate them on the same check.
+        if (
+            cutlass_fp4_supported()
+            and current_platform.has_device_capability(100)
+            and has_flashinfer()
+        ):
             backend = NvFp4LinearBackend.FLASHINFER_CUTLASS
         elif cutlass_fp4_supported():
             backend = NvFp4LinearBackend.VLLM_CUTLASS
@@ -72,6 +80,10 @@ def select_nvfp4_linear_backend() -> NvFp4LinearBackend:
         NvFp4LinearBackend.FLASHINFER_CUDNN,
     ):
         assert has_flashinfer(), f"FlashInfer is required for {backend}"
+        assert cutlass_fp4_supported(), (
+            f"{backend} requires vLLM NVFP4 quantization kernels compiled "
+            f"for the current GPU (SM {current_platform.get_device_capability()})"
+        )
     elif backend == NvFp4LinearBackend.VLLM_CUTLASS:
         assert cutlass_fp4_supported(), f"Cutlass is required for {backend}"
     elif backend == NvFp4LinearBackend.MARLIN: