diff --git a/CMakeLists.txt b/CMakeLists.txt index cf59f18eb..ff57473cf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -309,7 +309,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building. - set(CUTLASS_REVISION "v4.2.1") + set(CUTLASS_REVISION "v4.4.2") # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR}) diff --git a/csrc/quantization/fp4/nvfp4_quant_entry.cu b/csrc/quantization/fp4/nvfp4_quant_entry.cu index 8b5a1fd22..8729b01e4 100644 --- a/csrc/quantization/fp4/nvfp4_quant_entry.cu +++ b/csrc/quantization/fp4/nvfp4_quant_entry.cu @@ -16,6 +16,7 @@ #include +#include "cutlass_extensions/common.hpp" #include "nvfp4_utils.cuh" #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \ @@ -53,12 +54,27 @@ void silu_and_mul_scaled_fp4_experts_quant_sm1xxa( torch::Tensor const& output_scale_offset_by_experts); #endif +static bool nvfp4_quant_sm_supported() { + const int32_t sm = get_sm_version_num(); +#if defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100 + if (sm >= 100 && sm < 120) return true; +#endif +#if defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120 + if (sm >= 120 && sm < 130) return true; +#endif + return false; +} + void scaled_fp4_quant_out(torch::Tensor const& input, torch::Tensor const& input_sf, bool is_sf_swizzled_layout, torch::Tensor& output, torch::Tensor& output_sf) { #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \ (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120) + TORCH_CHECK(nvfp4_quant_sm_supported(), + "No compiled nvfp4 quantization kernel for SM ", + get_sm_version_num(), + ". Recompile with the appropriate CUDA arch."); return scaled_fp4_quant_sm1xxa(output, input, output_sf, input_sf, is_sf_swizzled_layout); #endif @@ -100,6 +116,10 @@ void scaled_fp4_experts_quant( torch::Tensor const& output_scale_offset_by_experts) { #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \ (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120) + TORCH_CHECK(nvfp4_quant_sm_supported(), + "No compiled nvfp4 experts quantization kernel for SM ", + get_sm_version_num(), + ". Recompile with the appropriate CUDA arch."); return scaled_fp4_experts_quant_sm1xxa( output, output_scale, input, input_global_scale, input_offset_by_experts, output_scale_offset_by_experts); @@ -112,6 +132,10 @@ void silu_and_mul_nvfp4_quant(torch::Tensor& output, torch::Tensor& output_sf, torch::Tensor& input, torch::Tensor& input_sf) { #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \ (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120) + TORCH_CHECK(nvfp4_quant_sm_supported(), + "No compiled silu_and_mul nvfp4 quantization kernel for SM ", + get_sm_version_num(), + ". Recompile with the appropriate CUDA arch."); return silu_and_mul_nvfp4_quant_sm1xxa(output, output_sf, input, input_sf); #endif TORCH_CHECK_NOT_IMPLEMENTED( @@ -125,6 +149,11 @@ void silu_and_mul_scaled_fp4_experts_quant( torch::Tensor const& output_scale_offset_by_experts) { #if (defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100) || \ (defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120) + TORCH_CHECK(nvfp4_quant_sm_supported(), + "No compiled silu_and_mul nvfp4 experts quantization kernel " + "for SM ", + get_sm_version_num(), + ". Recompile with the appropriate CUDA arch."); return silu_and_mul_scaled_fp4_experts_quant_sm1xxa( output, output_scale, input, input_global_scale, input_offset_by_experts, output_scale_offset_by_experts); diff --git a/csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu b/csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu index d9c4d24d8..4985bf1eb 100644 --- a/csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu +++ b/csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu @@ -63,5 +63,17 @@ void cutlass_scaled_fp4_mm(torch::Tensor& D, const torch::Tensor& A, bool cutlass_scaled_mm_supports_fp4(int64_t cuda_device_capability) { int runtimeVersion; cudaRuntimeGetVersion(&runtimeVersion); - return cuda_device_capability >= 100 && runtimeVersion >= 12080; + if (runtimeVersion < 12080) return false; + // Only report support when the SM-specific kernel was actually compiled in, + // so the Python-side backend selector does not choose CUTLASS and then hit + // TORCH_CHECK_NOT_IMPLEMENTED (or worse, fall through to Marlin). +#if defined(ENABLE_NVFP4_SM100) && ENABLE_NVFP4_SM100 + if (cuda_device_capability >= 100 && cuda_device_capability < 120) + return true; +#endif +#if defined(ENABLE_NVFP4_SM120) && ENABLE_NVFP4_SM120 + if (cuda_device_capability >= 120 && cuda_device_capability < 130) + return true; +#endif + return false; } diff --git a/csrc/quantization/machete/machete_mainloop.cuh b/csrc/quantization/machete/machete_mainloop.cuh index 9f02f4f17..a0d1ded88 100644 --- a/csrc/quantization/machete/machete_mainloop.cuh +++ b/csrc/quantization/machete/machete_mainloop.cuh @@ -154,6 +154,7 @@ struct MacheteCollectiveMma { struct DispatchPolicy { constexpr static int Stages = PipelineStages; using ClusterShape = ClusterShape_MNK; + using ArchTag = arch::Sm90; using Schedule = KernelScheduleType; }; diff --git a/docker/Dockerfile b/docker/Dockerfile index b26b82eb5..2d63b39dc 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -590,7 +590,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # Install FlashInfer JIT cache (requires CUDA-version-specific index URL) # https://docs.flashinfer.ai/installation.html # From versions.json: .flashinfer.version -ARG FLASHINFER_VERSION=0.6.6 +# 0.6.7: CUTLASS 4.4.2 bump, fixes TMA grouped GEMM on SM12x (flashinfer#2798) +# TODO: bump to 0.6.8 when released for NVFP4/MXFP4 group GEMMs on +# SM120/SM121 (RTX 50 / DGX Spark) via flashinfer#2738 +ARG FLASHINFER_VERSION=0.6.7 RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install --system flashinfer-jit-cache==${FLASHINFER_VERSION} \ --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \ diff --git a/docker/Dockerfile.nightly_torch b/docker/Dockerfile.nightly_torch index 5c424980e..045a09a42 100644 --- a/docker/Dockerfile.nightly_torch +++ b/docker/Dockerfile.nightly_torch @@ -217,13 +217,16 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2. # build flashinfer for torch nightly from source around 10 mins -# release version: v0.6.6 +# release version: v0.6.7 +# 0.6.7: CUTLASS 4.4.2 bump, fixes TMA grouped GEMM on SM12x (flashinfer#2798) +# TODO: bump to 0.6.8 when released for NVFP4/MXFP4 group GEMMs on +# SM120/SM121 (RTX 50 / DGX Spark) via flashinfer#2738 # todo(elainewy): cache flashinfer build result for faster build ENV CCACHE_DIR=/root/.cache/ccache RUN --mount=type=cache,target=/root/.cache/ccache \ --mount=type=cache,target=/root/.cache/uv \ echo "git clone flashinfer..." \ - && git clone --depth 1 --branch v0.6.6 --recursive https://github.com/flashinfer-ai/flashinfer.git \ + && git clone --depth 1 --branch v0.6.7 --recursive https://github.com/flashinfer-ai/flashinfer.git \ && cd flashinfer \ && git submodule update --init --recursive \ && echo "finish git clone flashinfer..." \ diff --git a/docker/versions.json b/docker/versions.json index 582d1bd54..51be33954 100644 --- a/docker/versions.json +++ b/docker/versions.json @@ -68,7 +68,7 @@ "default": "true" }, "FLASHINFER_VERSION": { - "default": "0.6.6" + "default": "0.6.7" }, "GDRCOPY_CUDA_VERSION": { "default": "12.8" diff --git a/requirements/cuda.txt b/requirements/cuda.txt index fe566db35..6d7f9693f 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -9,8 +9,8 @@ torchaudio==2.10.0 # These must be updated alongside torch torchvision==0.25.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version # FlashInfer should be updated together with the Dockerfile -flashinfer-python==0.6.6 -flashinfer-cubin==0.6.6 +flashinfer-python==0.6.7 +flashinfer-cubin==0.6.7 # Cap nvidia-cudnn-frontend (transitive dep of flashinfer) due to # breaking changes in 1.19.0 nvidia-cudnn-frontend>=1.13.0,<1.19.0 diff --git a/tests/kernels/moe/test_unquantized_backend_selection.py b/tests/kernels/moe/test_unquantized_backend_selection.py index 1d9e1d685..48f1a4b94 100644 --- a/tests/kernels/moe/test_unquantized_backend_selection.py +++ b/tests/kernels/moe/test_unquantized_backend_selection.py @@ -57,7 +57,6 @@ def test_select_default_backend_by_platform( moe_config = make_dummy_moe_config() selected_backend = select_unquantized_moe_backend( moe_config=moe_config, - use_ep=False, use_dp=False, ) @@ -90,7 +89,6 @@ def test_select_rocm_aiter_backend(mock_aiter_enabled, mock_has_flashinfer): moe_config = make_dummy_moe_config() selected_backend = select_unquantized_moe_backend( moe_config=moe_config, - use_ep=False, use_dp=False, ) @@ -129,7 +127,6 @@ def test_select_cuda_flashinfer_trtllm_backend( selected_backend = select_unquantized_moe_backend( moe_config=moe_config, - use_ep=True, use_dp=False, ) @@ -171,7 +168,6 @@ def test_select_cuda_flashinfer_cutlass_backend( selected_backend = select_unquantized_moe_backend( moe_config=moe_config, - use_ep=True, # CUTLASS requires EP use_dp=False, # CUTLASS doesn't support DP ) diff --git a/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py b/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py index c0a7dfc49..4cb12a8c1 100644 --- a/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py +++ b/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py @@ -406,6 +406,11 @@ class TrtLlmFp8ExpertsMonolithic(TrtLlmFp8ExpertsBase, mk.FusedMoEExpertsMonolit if self.routing_method_type == RoutingMethodType.DeepSeekV3: router_logits = router_logits.to(torch.float32) + # Currently FI requires bfloat16 routing bias. + # https://github.com/flashinfer-ai/flashinfer/issues/2909 + if e_score_correction_bias is not None: + e_score_correction_bias = e_score_correction_bias.to(torch.bfloat16) + out = flashinfer.fused_moe.trtllm_fp8_per_tensor_scale_moe( routing_logits=router_logits, routing_bias=e_score_correction_bias, diff --git a/vllm/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py b/vllm/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py index b47391c41..81b778c8f 100644 --- a/vllm/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py +++ b/vllm/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py @@ -5,6 +5,7 @@ import flashinfer import torch import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, @@ -27,6 +28,8 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.platforms import current_platform from vllm.utils.flashinfer import has_flashinfer_trtllm_fused_moe +logger = init_logger(__name__) + class TrtLlmNvFp4ExpertsBase: """ @@ -315,6 +318,11 @@ class TrtLlmNvFp4ExpertsMonolithic( else router_logits ) + # Currently FI requires bfloat16 routing bias. + # https://github.com/flashinfer-ai/flashinfer/issues/2909 + if e_score_correction_bias is not None: + e_score_correction_bias = e_score_correction_bias.to(torch.bfloat16) + # Invoke kernel. return flashinfer.fused_moe.trtllm_fp4_block_scale_moe( routing_logits=router_logits, diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py index 91f7a83f6..26409804c 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py @@ -361,7 +361,7 @@ class FlashInferExperts(mk.FusedMoEExpertsModular): fc1_expert_weights = w1 fc2_expert_weights = w2 else: - quant_scales = None + quant_scales = [] a1q_scale = None fc1_expert_weights = w1 fc2_expert_weights = w2 diff --git a/vllm/model_executor/layers/fused_moe/oracle/unquantized.py b/vllm/model_executor/layers/fused_moe/oracle/unquantized.py index 9c31da10d..84f4df010 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/unquantized.py +++ b/vllm/model_executor/layers/fused_moe/oracle/unquantized.py @@ -70,7 +70,6 @@ def map_unquantized_backend(runner_backend: MoEBackend) -> UnquantizedMoeBackend def select_unquantized_moe_backend( moe_config: FusedMoEConfig, - use_ep: bool, use_dp: bool, ) -> UnquantizedMoeBackend: """ @@ -96,7 +95,6 @@ def select_unquantized_moe_backend( # FlashInfer CUTLASS MoE is only supported on Hopper and later GPUS flashinfer_cutlass_available = ( has_flashinfer_cutlass_fused_moe() - and use_ep and (not use_dp) and current_platform.has_device_capability(90) ) @@ -161,9 +159,9 @@ def select_unquantized_moe_backend( "to enable it for better performance.", scope="local", ) - elif use_ep and (not use_dp): + elif not use_dp and flashinfer_cutlass_available: logger.info_once( - "FlashInfer MoE is available for EP" + "FlashInfer CUTLASS MoE is available" " but not enabled, consider setting" " VLLM_USE_FLASHINFER_MOE_FP16=1 to enable it.", scope="local", diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py index 38b552b02..cf1afcb29 100644 --- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py @@ -61,7 +61,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): super().__init__(moe) self.unquantized_backend = select_unquantized_moe_backend( moe_config=self.moe, - use_ep=self.moe.moe_parallel_config.use_ep, use_dp=self.moe.moe_parallel_config.dp_size > 1, ) diff --git a/vllm/model_executor/layers/quantization/utils/nvfp4_utils.py b/vllm/model_executor/layers/quantization/utils/nvfp4_utils.py index bcb4769e4..f21f2ef23 100644 --- a/vllm/model_executor/layers/quantization/utils/nvfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/nvfp4_utils.py @@ -55,8 +55,16 @@ def select_nvfp4_linear_backend() -> NvFp4LinearBackend: elif envs.VLLM_USE_NVFP4_CT_EMULATIONS: backend = NvFp4LinearBackend.EMULATION elif envs.VLLM_NVFP4_GEMM_BACKEND is None: - # Auto-select best available backend - if current_platform.has_device_capability(100) and has_flashinfer(): + # Auto-select best available backend. + # cutlass_fp4_supported() checks that the vLLM NVFP4 kernels (both + # quantization and GEMM) were compiled for the current SM version. + # FlashInfer backends still rely on the vLLM quantization kernels, + # so we gate them on the same check. + if ( + cutlass_fp4_supported() + and current_platform.has_device_capability(100) + and has_flashinfer() + ): backend = NvFp4LinearBackend.FLASHINFER_CUTLASS elif cutlass_fp4_supported(): backend = NvFp4LinearBackend.VLLM_CUTLASS @@ -72,6 +80,10 @@ def select_nvfp4_linear_backend() -> NvFp4LinearBackend: NvFp4LinearBackend.FLASHINFER_CUDNN, ): assert has_flashinfer(), f"FlashInfer is required for {backend}" + assert cutlass_fp4_supported(), ( + f"{backend} requires vLLM NVFP4 quantization kernels compiled " + f"for the current GPU (SM {current_platform.get_device_capability()})" + ) elif backend == NvFp4LinearBackend.VLLM_CUTLASS: assert cutlass_fp4_supported(), f"Cutlass is required for {backend}" elif backend == NvFp4LinearBackend.MARLIN: