diff --git a/vllm/lora/request.py b/vllm/lora/request.py index 538d51031..2811fee1d 100644 --- a/vllm/lora/request.py +++ b/vllm/lora/request.py @@ -20,7 +20,6 @@ class LoRARequest( lora_name: str lora_int_id: int lora_path: str = "" - long_lora_max_len: int | None = None base_model_name: str | None = msgspec.field(default=None) tensorizer_config_dict: dict | None = None diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 2750f1864..82837b77e 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -111,7 +111,7 @@ class Qwen2MoeMLP(nn.Module): out, _ = self.down_proj(out) if self.expert_gate is not None: - out = F.sigmoid(self.expert_gate(x)) * out + out = F.sigmoid(self.expert_gate(x)[0]) * out return out @@ -140,7 +140,13 @@ class Qwen2MoeSparseMoeBlock(nn.Module): prefix=f"{prefix}.gate", ) - self.shared_expert_gate = torch.nn.Linear(config.hidden_size, 1, bias=False) + self.shared_expert_gate = ReplicatedLinear( + config.hidden_size, + 1, + bias=False, + quant_config=None, + prefix=f"{prefix}.shared_expert_gate", + ) if config.shared_expert_intermediate_size > 0: self.shared_expert = Qwen2MoeMLP(