Enable more models to perform inference with LoRA (#3382)

Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
This commit is contained in:
Jee Li
2024-03-26 09:09:31 +08:00
committed by GitHub
parent dfeb2ecc3a
commit 8af890a865
10 changed files with 401 additions and 44 deletions

View File

@@ -9,6 +9,7 @@ from torch import nn
from torch.nn import LayerNorm
from vllm.attention import Attention, AttentionMetadata
from vllm.config import LoRAConfig
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (LinearMethodBase,
@@ -317,11 +318,25 @@ class ChatGLMModel(nn.Module):
class ChatGLMForCausalLM(nn.Module):
packed_modules_mapping = {
"query_key_value": ["query_key_value"],
"dense_h_to_4h": ["dense_h_to_4h"]
}
# LoRA specific attributes
supported_lora_modules = [
"query_key_value",
"dense",
"dense_h_to_4h",
"dense_4h_to_h",
]
embedding_modules = {}
embedding_padding_modules = []
def __init__(
self,
config: ChatGLMConfig,
linear_method: Optional[LinearMethodBase] = None,
lora_config: Optional[LoRAConfig] = None,
):
super().__init__()
self.config: ChatGLMConfig = config