[Core] Refactor GGUF parameters packing and forwarding (#8859)

2024-10-07 18:01:46 +08:00
parent 4f95ffee6f
commit f19da64871
4 changed files with 64 additions and 62 deletions
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -512,7 +512,7 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
                quant_config=quant_config,
            )
            if config.tie_word_embeddings:
-                self.lm_head.weight = self.model.embed_tokens.weight
+                self.lm_head = self.model.embed_tokens

            logit_scale = getattr(config, "logit_scale", 1.0)
            self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,