Bugfix: Pass router logits dtype in nemotron shared experts (#32669)
Signed-off-by: Amir Klein <203507526+amirkl94@users.noreply.github.com>
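The gate in NemotronHMoE hard-coded params_dtype=torch.float32, but that dtype was not passed on to the fused MoE layer. This change introduces router_logits_dtype, uses it for the gate's params_dtype, and forwards it to the MoE layer so the shared-experts path sees the dtype the router logits are actually computed in.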
@@ -145,11 +145,12 @@ class NemotronHMoE(nn.Module):
         self.is_sequence_parallel = parallel_config.use_sequence_parallel_moe
 
+        router_logits_dtype = torch.float32
         self.gate = ReplicatedLinear(
             config.hidden_size,
             config.n_routed_experts,
             bias=False,
-            params_dtype=torch.float32,
+            params_dtype=router_logits_dtype,
             quant_config=None,
             prefix=f"{prefix}.gate",
         )
@@ -209,6 +210,7 @@ class NemotronHMoE(nn.Module):
             enable_eplb=self.enable_eplb,
             num_redundant_experts=self.n_redundant_experts,
             is_sequence_parallel=self.is_sequence_parallel,
+            router_logits_dtype=router_logits_dtype,
         )
 
         if self.use_latent_moe:
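As context, a minimal standalone sketch (illustrative sizes and names, not the vLLM implementation) of why the router-logit dtype is kept in float32 and shared between the gate and the routing step:

    import torch

    # Minimal standalone sketch (hypothetical sizes, not the vLLM code):
    # the gate projects hidden states to per-expert logits, and keeping the
    # logits in float32 avoids precision loss in the softmax/top-k routing
    # even when the rest of the model runs in bfloat16.
    hidden_size, n_routed_experts, top_k = 16, 8, 2

    hidden_states = torch.randn(4, hidden_size, dtype=torch.bfloat16)
    router_logits_dtype = torch.float32
    gate = torch.nn.Linear(hidden_size, n_routed_experts, bias=False,
                           dtype=router_logits_dtype)

    # Compute router logits in the gate's dtype; the fix makes this explicit
    # by handing the same dtype to both the gate and the downstream MoE layer.
    router_logits = gate(hidden_states.to(router_logits_dtype))
    routing_weights = torch.softmax(router_logits, dim=-1)
    topk_weights, topk_ids = torch.topk(routing_weights, k=top_k, dim=-1)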