diff --git a/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py b/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py index febb3b2ef..183324420 100644 --- a/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py +++ b/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py @@ -240,12 +240,11 @@ class TrtLlmFp8Experts(mk.FusedMoEExpertsMonolithic): ) -> torch.Tensor: # Delay import for non-CUDA. import flashinfer - from flashinfer.fused_moe.core import ActivationType # Confirm supported activation function. assert activation in [MoEActivation.SILU, MoEActivation.RELU2_NO_MUL] - activation_type = ActivationType(activation_to_flashinfer_int(activation)) + activation_type = activation_to_flashinfer_int(activation) # Confirm Llama-4 routing is proper. if self.routing_method_type == RoutingMethodType.Llama4: diff --git a/vllm/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py b/vllm/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py index 502671766..174c581b3 100644 --- a/vllm/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py +++ b/vllm/model_executor/layers/fused_moe/experts/trtllm_nvfp4_moe.py @@ -323,4 +323,5 @@ class TrtLlmNvFp4ExpertsMonolithic( routed_scaling_factor=routed_scaling_factor, routing_method_type=self.routing_method_type, do_finalize=True, + activation_type=activation_to_flashinfer_int(activation), )[0]