[Model] Support GGUF models newly added in transformers 4.46.0 (#9685)

Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Author:    Isotr0py
Date:      2025-01-13 08:13:44 +08:00
Committed: GitHub
Parent:    9597a095f2
Commit:    d14e98d924

7 changed files with 162 additions and 87 deletions


@@ -198,7 +198,10 @@ class GPT2Model(nn.Module):
         assert not config.scale_attn_by_inverse_layer_idx
         assert not config.reorder_and_upcast_attn
         self.embed_dim = config.hidden_size
-        self.wte = VocabParallelEmbedding(config.vocab_size, self.embed_dim)
+        self.wte = VocabParallelEmbedding(config.vocab_size,
+                                          self.embed_dim,
+                                          quant_config=quant_config,
+                                          prefix=f"{prefix}.wte")
         self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
         self.start_layer, self.end_layer, self.h = make_layers(
             config.num_hidden_layers,
@@ -259,7 +262,9 @@ class GPT2LMHeadModel(nn.Module, SupportsPP):
             self.lm_head = self.transformer.wte
         else:
             self.lm_head = ParallelLMHead(self.config.vocab_size,
-                                          self.config.hidden_size)
+                                          self.config.hidden_size,
+                                          quant_config=quant_config,
+                                          prefix=f"{prefix}.lm_head")
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
@@ -304,7 +309,7 @@ class GPT2LMHeadModel(nn.Module, SupportsPP):
         params_dict = dict(self.named_parameters(remove_duplicate=False))
         loaded_params: Set[str] = set()
         for name, loaded_weight in weights:
-            if "lm_head.weight" in name:
+            if name.startswith("lm_head"):
                 # GPT-2 ties the weights of the embedding layer and the final
                 # linear layer.
                 continue
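
With quant_config and a layer prefix threaded into GPT-2's embedding and LM head, a GGUF-quantized GPT-2 checkpoint can be served directly. A minimal usage sketch, not part of this commit: the local GGUF filename below is hypothetical, and an explicit tokenizer is passed because GGUF files do not bundle a full Hugging Face tokenizer config.

    from vllm import LLM, SamplingParams

    # Hypothetical local GGUF file for a GPT-2 model; point this at a real
    # quantized checkpoint (e.g. one produced by llama.cpp's convert scripts).
    llm = LLM(
        model="./gpt2.Q4_K_M.gguf",
        tokenizer="openai-community/gpt2",  # GGUF lacks an HF tokenizer config
    )

    params = SamplingParams(temperature=0.8, max_tokens=32)
    outputs = llm.generate(["Hello, my name is"], params)
    print(outputs[0].outputs[0].text)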