Enable more models to perform inference with LoRA (#3382)

Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
This commit is contained in:
Jee Li
2024-03-26 09:09:31 +08:00
committed by GitHub
parent dfeb2ecc3a
commit 8af890a865
10 changed files with 401 additions and 44 deletions

View File

@@ -9,6 +9,7 @@ from torch import nn
from torch.nn import LayerNorm
from vllm.attention import Attention, AttentionMetadata
from vllm.config import LoRAConfig
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (LinearMethodBase,
@@ -317,11 +318,25 @@ class ChatGLMModel(nn.Module):
class ChatGLMForCausalLM(nn.Module):
packed_modules_mapping = {
"query_key_value": ["query_key_value"],
"dense_h_to_4h": ["dense_h_to_4h"]
}
# LoRA specific attributes
supported_lora_modules = [
"query_key_value",
"dense",
"dense_h_to_4h",
"dense_4h_to_h",
]
embedding_modules = {}
embedding_padding_modules = []
def __init__(
self,
config: ChatGLMConfig,
linear_method: Optional[LinearMethodBase] = None,
lora_config: Optional[LoRAConfig] = None,
):
super().__init__()
self.config: ChatGLMConfig = config