Support LoRA and GPTQModel for PLaMo 2/3 (#31322)
Signed-off-by: Shinichi Hemmi <50256998+Alnusjaponica@users.noreply.github.com>
This commit is contained in:
@@ -438,8 +438,8 @@ th {
|
|||||||
| `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | ✅︎ | ✅︎ |
|
| `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | ✅︎ | ✅︎ |
|
||||||
| `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ |
|
| `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ |
|
||||||
| `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | | ✅︎ |
|
| `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | | ✅︎ |
|
||||||
| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | ✅︎ |
|
| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | ✅︎ | ✅︎ |
|
||||||
| `Plamo3ForCausalLM` | PLaMo3 | `pfnet/plamo-3-nict-2b-base`, `pfnet/plamo-3-nict-8b-base`, etc. | | ✅︎ |
|
| `Plamo3ForCausalLM` | PLaMo3 | `pfnet/plamo-3-nict-2b-base`, `pfnet/plamo-3-nict-8b-base`, etc. | ✅︎ | ✅︎ |
|
||||||
| `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ |
|
| `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ |
|
||||||
| `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | ✅︎ | ✅︎ |
|
| `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | ✅︎ | ✅︎ |
|
||||||
| `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | ✅︎ | ✅︎ |
|
| `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | ✅︎ | ✅︎ |
|
||||||
|
|||||||
@@ -50,8 +50,14 @@ from vllm.model_executor.model_loader.weight_utils import (
|
|||||||
default_weight_loader,
|
default_weight_loader,
|
||||||
sharded_weight_loader,
|
sharded_weight_loader,
|
||||||
)
|
)
|
||||||
from vllm.model_executor.models.interfaces import HasInnerState, IsHybrid, SupportsPP
|
from vllm.model_executor.models.interfaces import (
|
||||||
|
HasInnerState,
|
||||||
|
IsHybrid,
|
||||||
|
SupportsLoRA,
|
||||||
|
SupportsPP,
|
||||||
|
)
|
||||||
from vllm.model_executor.models.utils import (
|
from vllm.model_executor.models.utils import (
|
||||||
|
AutoWeightsLoader,
|
||||||
is_pp_missing_parameter,
|
is_pp_missing_parameter,
|
||||||
make_empty_intermediate_tensors_factory,
|
make_empty_intermediate_tensors_factory,
|
||||||
make_layers,
|
make_layers,
|
||||||
@@ -105,6 +111,7 @@ class Plamo2MambaMixer(MambaBase, CustomOp):
|
|||||||
self.cache_config = vllm_config.cache_config
|
self.cache_config = vllm_config.cache_config
|
||||||
self.model_config = vllm_config.model_config
|
self.model_config = vllm_config.model_config
|
||||||
self.quant_config = vllm_config.quant_config
|
self.quant_config = vllm_config.quant_config
|
||||||
|
self.is_lora_enabled = bool(vllm_config.lora_config)
|
||||||
self.hidden_size = self.config.hidden_size
|
self.hidden_size = self.config.hidden_size
|
||||||
self.ssm_state_size = self.config.mamba_d_state
|
self.ssm_state_size = self.config.mamba_d_state
|
||||||
self.conv_kernel_size = self.config.mamba_d_conv
|
self.conv_kernel_size = self.config.mamba_d_conv
|
||||||
@@ -202,6 +209,10 @@ class Plamo2MambaMixer(MambaBase, CustomOp):
|
|||||||
self.prefix = prefix
|
self.prefix = prefix
|
||||||
|
|
||||||
def _project_ssm_parameters(self, hidden_states):
|
def _project_ssm_parameters(self, hidden_states):
|
||||||
|
if self.is_lora_enabled:
|
||||||
|
# The LoRA kernel requires a contiguous input tensor.
|
||||||
|
ssm_parameters = self.bcdt_proj(hidden_states.contiguous())
|
||||||
|
else:
|
||||||
ssm_parameters = self.bcdt_proj(hidden_states)
|
ssm_parameters = self.bcdt_proj(hidden_states)
|
||||||
B, C, time_step = torch.split(
|
B, C, time_step = torch.split(
|
||||||
ssm_parameters,
|
ssm_parameters,
|
||||||
@@ -780,13 +791,13 @@ class Plamo2Model(torch.nn.Module):
|
|||||||
return hidden_states
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
class Plamo2ForCausalLM(torch.nn.Module, HasInnerState, SupportsPP, IsHybrid):
|
class Plamo2ForCausalLM(
|
||||||
|
torch.nn.Module, HasInnerState, SupportsLoRA, SupportsPP, IsHybrid
|
||||||
|
):
|
||||||
packed_modules_mapping = {
|
packed_modules_mapping = {
|
||||||
"qkv_proj": [
|
"qkv_proj": ["qkv_proj"],
|
||||||
"q_proj",
|
"gate_up_proj": ["gate_up_proj"],
|
||||||
"k_proj",
|
"in_proj": ["in_proj"],
|
||||||
"v_proj",
|
|
||||||
],
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
|
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
|
||||||
@@ -892,6 +903,12 @@ class Plamo2ForCausalLM(torch.nn.Module, HasInnerState, SupportsPP, IsHybrid):
|
|||||||
if name == "lm_head.weight" and self.config.tie_word_embeddings:
|
if name == "lm_head.weight" and self.config.tie_word_embeddings:
|
||||||
assert "lm_head.weight" not in params_dict
|
assert "lm_head.weight" not in params_dict
|
||||||
continue
|
continue
|
||||||
|
# Same workaround as AutoWeightsLoader for GPTQModel
|
||||||
|
if any(
|
||||||
|
substr in name
|
||||||
|
for substr in AutoWeightsLoader.ROTARY_EMBEDS_UNUSED_WEIGHTS
|
||||||
|
):
|
||||||
|
continue
|
||||||
|
|
||||||
# Update the weight names to be compatible with the vllm version
|
# Update the weight names to be compatible with the vllm version
|
||||||
# of the model.
|
# of the model.
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ from vllm.model_executor.model_loader.weight_utils import (
|
|||||||
composed_weight_loader,
|
composed_weight_loader,
|
||||||
default_weight_loader,
|
default_weight_loader,
|
||||||
)
|
)
|
||||||
from vllm.model_executor.models.interfaces import SupportsPP
|
from vllm.model_executor.models.interfaces import SupportsLoRA, SupportsPP
|
||||||
from vllm.model_executor.models.utils import (
|
from vllm.model_executor.models.utils import (
|
||||||
AutoWeightsLoader,
|
AutoWeightsLoader,
|
||||||
extract_layer_index,
|
extract_layer_index,
|
||||||
@@ -369,13 +369,10 @@ class Plamo3Model(nn.Module):
|
|||||||
return hidden_states
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
class Plamo3ForCausalLM(nn.Module, SupportsPP):
|
class Plamo3ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
|
||||||
packed_modules_mapping = {
|
packed_modules_mapping = {
|
||||||
"qkv_proj": [
|
"qkv_proj": ["qkv_proj"],
|
||||||
"q_proj",
|
"gate_up_proj": ["gate_up_proj"],
|
||||||
"k_proj",
|
|
||||||
"v_proj",
|
|
||||||
],
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
|
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
|
||||||
|
|||||||
Reference in New Issue
Block a user