diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
index d86896e54..910c83877 100644
--- a/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
+++ b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py
@@ -309,6 +309,10 @@ def fi_trtllm_fp8_per_tensor_moe(
     from vllm.utils.flashinfer import flashinfer_trtllm_fp8_per_tensor_scale_moe
 
+    # The DeepSeekV3 routing method requires float32 router logits.
+    if routing_method_type == RoutingMethodType.DeepSeekV3:
+        routing_logits = routing_logits.to(torch.float32)
+
     return flashinfer_trtllm_fp8_per_tensor_scale_moe(
         routing_logits=routing_logits,
         routing_bias=routing_bias,
diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py
index 06141013c..d51becac7 100644
--- a/vllm/model_executor/models/nemotron_h.py
+++ b/vllm/model_executor/models/nemotron_h.py
@@ -148,12 +148,10 @@ class NemotronHMoE(nn.Module):
         self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe
 
-        router_logits_dtype = torch.float32
         self.gate = ReplicatedLinear(
             config.hidden_size,
             config.n_routed_experts,
             bias=False,
-            params_dtype=router_logits_dtype,
             quant_config=None,
             prefix=f"{prefix}.gate",
         )
@@ -232,7 +230,6 @@ class NemotronHMoE(nn.Module):
             enable_eplb=self.enable_eplb,
             num_redundant_experts=self.n_redundant_experts,
             is_sequence_parallel=self.is_sequence_parallel,
-            router_logits_dtype=router_logits_dtype,
             routed_input_transform=self.fc1_latent_proj,
         )
@@ -244,7 +241,7 @@ class NemotronHMoE(nn.Module):
             hidden_states = sequence_parallel_chunk(hidden_states)
 
         # router_logits: (num_tokens, n_experts)
-        router_logits, _ = self.gate(hidden_states.to(dtype=torch.float32))
+        router_logits, _ = self.gate(hidden_states)
 
         # SharedFusedMoE handles:
         #   - shared experts (with original hidden_states)
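
The net effect of the two hunks above is that the NemotronHMoE gate now runs in the model's compute dtype, and the float32 cast is applied only inside the FlashInfer TRT-LLM path when DeepSeekV3 routing is selected. The following minimal sketch illustrates that dtype handling outside of vLLM; the RoutingMethodType enum and cast_logits_for_routing helper here are stand-ins for illustration, not the library's API.

# Minimal sketch (not vLLM code): cast router logits to float32 only for
# DeepSeekV3-style routing, leaving other routing methods in the model dtype.
from enum import Enum

import torch


class RoutingMethodType(Enum):  # stand-in for the real enum
    DeepSeekV3 = 0
    Renormalize = 1


def cast_logits_for_routing(
    routing_logits: torch.Tensor,
    routing_method_type: RoutingMethodType,
) -> torch.Tensor:
    """Return float32 logits for DeepSeekV3 routing, otherwise pass through."""
    if routing_method_type == RoutingMethodType.DeepSeekV3:
        return routing_logits.to(torch.float32)
    return routing_logits


if __name__ == "__main__":
    hidden = torch.randn(4, 8, dtype=torch.bfloat16)
    gate = torch.nn.Linear(8, 16, bias=False, dtype=torch.bfloat16)
    logits = gate(hidden)  # bf16, like the NemotronHMoE gate after this change
    ds_logits = cast_logits_for_routing(logits, RoutingMethodType.DeepSeekV3)
    other_logits = cast_logits_for_routing(logits, RoutingMethodType.Renormalize)
    print(ds_logits.dtype, other_logits.dtype)  # torch.float32 torch.bfloat16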