diff --git a/vllm/model_executor/layers/fused_moe/experts/flashinfer_cutedsl_moe.py b/vllm/model_executor/layers/fused_moe/experts/flashinfer_cutedsl_moe.py
index 5805a4dd5..a1db26619 100644
--- a/vllm/model_executor/layers/fused_moe/experts/flashinfer_cutedsl_moe.py
+++ b/vllm/model_executor/layers/fused_moe/experts/flashinfer_cutedsl_moe.py
@@ -23,6 +23,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import (
     flashinfer_cutedsl_grouped_gemm_nt_masked,
+    has_flashinfer_cutedsl_grouped_gemm_nt_masked,
     scaled_fp4_grouped_quantize,
     silu_and_mul_scaled_nvfp4_experts_quantize,
 )
@@ -60,7 +61,11 @@ class FlashInferCuteDSLExperts(mk.FusedMoEExpertsModular):
     @staticmethod
     def _supports_current_device() -> bool:
         p = current_platform
-        return p.is_cuda() and p.is_device_capability_family(100)
+        return (
+            p.is_cuda()
+            and p.is_device_capability_family(100)
+            and has_flashinfer_cutedsl_grouped_gemm_nt_masked()
+        )
 
     @staticmethod
     def _supports_no_act_and_mul() -> bool:
diff --git a/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py b/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py
index f57a05dc6..671435a88 100644
--- a/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py
+++ b/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py
@@ -27,6 +27,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
     kMxfp8Static,
 )
 from vllm.platforms import current_platform
+from vllm.utils.flashinfer import has_flashinfer_trtllm_fused_moe
 
 logger = init_logger(__name__)
 
@@ -61,8 +62,11 @@ class TrtLlmFp8ExpertsBase:
     def _supports_current_device() -> bool:
         """Supports only Blackwell-family GPUs."""
         p = current_platform
-        # Add check flashinfer trtllm is available
-        return p.is_cuda() and p.is_device_capability_family(100)
+        return (
+            p.is_cuda()
+            and p.is_device_capability_family(100)
+            and has_flashinfer_trtllm_fused_moe()
+        )
 
     @staticmethod
     def _supports_no_act_and_mul() -> bool:
diff --git a/vllm/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py b/vllm/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py
index 1df50197c..7960bdf44 100644
--- a/vllm/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py
+++ b/vllm/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py
@@ -24,6 +24,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
     kNvfp4Static,
 )
 from vllm.platforms import current_platform
+from vllm.utils.flashinfer import has_flashinfer_trtllm_fused_moe
 
 
 class TrtLlmNvFp4ExpertsBase:
@@ -80,7 +81,11 @@ class TrtLlmNvFp4ExpertsBase:
     def _supports_current_device() -> bool:
         """Supports only Blackwell-family GPUs."""
         p = current_platform
-        return p.is_cuda() and p.is_device_capability_family(100)
+        return (
+            p.is_cuda()
+            and p.is_device_capability_family(100)
+            and has_flashinfer_trtllm_fused_moe()
+        )
 
     @staticmethod
     def _supports_no_act_and_mul() -> bool:
diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py
index fed44d04f..0db05851b 100644
--- a/vllm/utils/flashinfer.py
+++ b/vllm/utils/flashinfer.py
@@ -244,7 +244,7 @@ def has_flashinfer_cutedsl_grouped_gemm_nt_masked() -> bool:
     required_functions = [
         ("flashinfer.cute_dsl.blockscaled_gemm", "grouped_gemm_nt_masked"),
         ("flashinfer", "scaled_fp4_grouped_quantize"),
-        ("flashinfer", "silu_and_scaled_nvfp4_experts_quantize"),
+        ("flashinfer", "silu_and_mul_scaled_nvfp4_experts_quantize"),
     ]
 
     for module_name, attr_name in required_functions: