From 3eff45d793daa976a21d0df5954cf6cc6723335f Mon Sep 17 00:00:00 2001
From: roikoren755 <26850796+roikoren755@users.noreply.github.com>
Date: Thu, 19 Feb 2026 19:47:05 +0200
Subject: [PATCH] Revert "[NemotronH] Do not force router to run in fp32 (#34582)" (#34808)

Signed-off-by: Roi Koren
Co-authored-by: Michael Goin
---
 vllm/model_executor/models/nemotron_h.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py
index d51becac7..06141013c 100644
--- a/vllm/model_executor/models/nemotron_h.py
+++ b/vllm/model_executor/models/nemotron_h.py
@@ -148,10 +148,12 @@ class NemotronHMoE(nn.Module):
         self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe
 
+        router_logits_dtype = torch.float32
         self.gate = ReplicatedLinear(
             config.hidden_size,
             config.n_routed_experts,
             bias=False,
+            params_dtype=router_logits_dtype,
             quant_config=None,
             prefix=f"{prefix}.gate",
         )
 
@@ -230,6 +232,7 @@ class NemotronHMoE(nn.Module):
             enable_eplb=self.enable_eplb,
             num_redundant_experts=self.n_redundant_experts,
             is_sequence_parallel=self.is_sequence_parallel,
+            router_logits_dtype=router_logits_dtype,
             routed_input_transform=self.fc1_latent_proj,
         )
 
@@ -241,7 +244,7 @@ class NemotronHMoE(nn.Module):
             hidden_states = sequence_parallel_chunk(hidden_states)
 
         # router_logits: (num_tokens, n_experts)
-        router_logits, _ = self.gate(hidden_states)
+        router_logits, _ = self.gate(hidden_states.to(dtype=torch.float32))
 
         # SharedFusedMoE handles:
         # - shared experts (with original hidden_states)