[Bugfix] Fix GLM-4 MoE router logits dtype for data parallel chunking (#31055)
Signed-off-by: ReinforcedKnowledge <reinforced.knowledge@gmail.com>
This commit is contained in:
@@ -197,6 +197,7 @@ class Glm4MoE(nn.Module):
             e_score_correction_bias=self.gate.e_score_correction_bias,
             enable_eplb=self.enable_eplb,
             num_redundant_experts=self.n_redundant_experts,
+            router_logits_dtype=torch.float32,
         )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
Reference in New Issue
Block a user