diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py
index d51becac7..06141013c 100644
--- a/vllm/model_executor/models/nemotron_h.py
+++ b/vllm/model_executor/models/nemotron_h.py
@@ -148,10 +148,12 @@ class NemotronHMoE(nn.Module):
         self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe
 
+        router_logits_dtype = torch.float32
         self.gate = ReplicatedLinear(
             config.hidden_size,
             config.n_routed_experts,
             bias=False,
+            params_dtype=router_logits_dtype,
             quant_config=None,
             prefix=f"{prefix}.gate",
         )
@@ -230,6 +232,7 @@ class NemotronHMoE(nn.Module):
             enable_eplb=self.enable_eplb,
             num_redundant_experts=self.n_redundant_experts,
             is_sequence_parallel=self.is_sequence_parallel,
+            router_logits_dtype=router_logits_dtype,
             routed_input_transform=self.fc1_latent_proj,
         )
 
@@ -241,7 +244,7 @@ class NemotronHMoE(nn.Module):
             hidden_states = sequence_parallel_chunk(hidden_states)
 
         # router_logits: (num_tokens, n_experts)
-        router_logits, _ = self.gate(hidden_states)
+        router_logits, _ = self.gate(hidden_states.to(dtype=torch.float32))
 
         # SharedFusedMoE handles:
         # - shared experts (with original hidden_states)
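
For context, a minimal standalone sketch of the pattern this diff introduces (not vLLM code; the gate module, shapes, and top-k value below are illustrative assumptions): the router gate's weights are created in float32 and the bf16 hidden states are cast to float32 before computing router logits, so softmax and expert selection are not perturbed by low-precision rounding.

import torch
import torch.nn as nn

# Illustrative sizes only; the real values come from the model config.
hidden_size, n_routed_experts, num_tokens = 64, 8, 4

torch.manual_seed(0)
# Router gate kept in float32, mirroring params_dtype=router_logits_dtype above.
gate = nn.Linear(hidden_size, n_routed_experts, bias=False, dtype=torch.float32)

# Model activations are typically bf16; cast to fp32 before the router gate,
# mirroring self.gate(hidden_states.to(dtype=torch.float32)) in the diff.
hidden_states = torch.randn(num_tokens, hidden_size, dtype=torch.bfloat16)
router_logits = gate(hidden_states.to(dtype=torch.float32))

# Softmax over experts and top-k selection now happen in float32.
routing_weights = torch.softmax(router_logits, dim=-1)
topk_weights, topk_ids = torch.topk(routing_weights, k=2, dim=-1)
print(router_logits.dtype, topk_ids.shape)  # torch.float32 torch.Size([4, 2])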