[Misc] Fix Qwen2-MoE shared_expert_gate (#31339)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
@@ -111,7 +111,7 @@ class Qwen2MoeMLP(nn.Module):
         out, _ = self.down_proj(out)
 
         if self.expert_gate is not None:
-            out = F.sigmoid(self.expert_gate(x)) * out
+            out = F.sigmoid(self.expert_gate(x)[0]) * out
 
         return out
 
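Why the extra [0]: unlike torch.nn.Linear, vLLM's linear layers (including the ReplicatedLinear introduced in the next hunk) return an (output, output_bias) tuple from forward(), so the gate logits must be unpacked before the sigmoid. Below is a minimal sketch of that convention; TupleLinear is a hypothetical stand-in so the snippet runs without vLLM installed.

import torch
import torch.nn as nn
import torch.nn.functional as F


class TupleLinear(nn.Module):
    """Hypothetical stand-in for vLLM's (output, output_bias) convention."""

    def __init__(self, in_features: int, out_features: int) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.randn(out_features, in_features))

    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, None]:
        # bias=False in the model, so the second element is None
        return F.linear(x, self.weight), None


expert_gate = TupleLinear(16, 1)
x = torch.randn(4, 16)
out = torch.randn(4, 16)

# Without [0], sigmoid would be applied to a tuple and raise a TypeError;
# indexing extracts the gate logits first, exactly as the fixed line does.
out = F.sigmoid(expert_gate(x)[0]) * out
print(out.shape)  # torch.Size([4, 16])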
@@ -140,7 +140,13 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
             prefix=f"{prefix}.gate",
         )
 
-        self.shared_expert_gate = torch.nn.Linear(config.hidden_size, 1, bias=False)
+        self.shared_expert_gate = ReplicatedLinear(
+            config.hidden_size,
+            1,
+            bias=False,
+            quant_config=None,
+            prefix=f"{prefix}.shared_expert_gate",
+        )
 
         if config.shared_expert_intermediate_size > 0:
             self.shared_expert = Qwen2MoeMLP(
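For reference, a hedged sketch of constructing and calling the new gate on its own, assuming vLLM is installed and that ReplicatedLinear lives at its usual path vllm.model_executor.layers.linear; the prefix string and hidden_size value are made-up examples, and depending on the vLLM version, building layers outside a running engine may need extra setup.

import torch
from vllm.model_executor.layers.linear import ReplicatedLinear

hidden_size = 32  # stand-in for config.hidden_size

# Same arguments as the diff above; the prefix here is hypothetical.
shared_expert_gate = ReplicatedLinear(
    hidden_size,
    1,
    bias=False,
    quant_config=None,
    prefix="model.layers.0.mlp.shared_expert_gate",
)

x = torch.randn(2, hidden_size)
logits, _ = shared_expert_gate(x)  # (output, output_bias) tuple
print(logits.shape)  # torch.Size([2, 1])

Swapping torch.nn.Linear for ReplicatedLinear keeps the gate on vLLM's weight-loading path, so checkpoint weights map onto the prefix-derived parameter name; the cost is the tuple return that the first hunk unpacks with [0].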