[Core] Support loading GGUF model (#5191)

Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-08-06 07:54:23 +08:00
parent ef527be06c
commit 360bd67cf0
29 changed files with 4970 additions and 21 deletions
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -140,6 +140,7 @@ class LlamaAttention(nn.Module):
            quant_config=quant_config,
            prefix=f"{prefix}.qkv_proj",
        )
+
        self.o_proj = RowParallelLinear(
            input_size=self.total_num_heads * self.head_dim,
            output_size=hidden_size,
@@ -148,12 +149,17 @@ class LlamaAttention(nn.Module):
            prefix=f"{prefix}.o_proj",
        )

+        is_neox_style = True
+        if quant_config is not None and quant_config.get_name() == "gguf":
+            is_neox_style = False
+
        self.rotary_emb = get_rope(
            self.head_dim,
            rotary_dim=self.head_dim,
            max_position=max_position_embeddings,
            base=rope_theta,
            rope_scaling=rope_scaling,
+            is_neox_style=is_neox_style,
        )
        self.attn = Attention(self.num_heads,
                              self.head_dim,
@@ -279,6 +285,7 @@ class LlamaModel(nn.Module):
                self.vocab_size,
                config.hidden_size,
                org_num_embeddings=config.vocab_size,
+                quant_config=quant_config,
            )
        else:
            self.embed_tokens = PPMissingLayer()