[Core] Support loading GGUF model (#5191)

Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-08-06 07:54:23 +08:00
parent ef527be06c
commit 360bd67cf0
29 changed files with 4970 additions and 21 deletions
--- a/tests/quantization/test_lm_head.py
+++ b/tests/quantization/test_lm_head.py
@@ -7,11 +7,12 @@ from typing import Tuple
 import pytest
 import torch

-from vllm.model_executor.layers.linear import UnquantizedLinearMethod
 from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
 from vllm.model_executor.layers.quantization.gptq_marlin import (
    GPTQMarlinLinearMethod)
 from vllm.model_executor.layers.quantization.marlin import MarlinLinearMethod
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    UnquantizedEmbeddingMethod)

 PROMPT = "On the surface of Mars, we found"

@@ -37,7 +38,8 @@ def test_lm_head(
            lm_head_layer.linear_method,
            (GPTQLinearMethod, GPTQMarlinLinearMethod, MarlinLinearMethod))
    else:
-        assert isinstance(lm_head_layer.linear_method, UnquantizedLinearMethod)
+        assert isinstance(lm_head_layer.linear_method,
+                          UnquantizedEmbeddingMethod)

    print(
        vllm_model.generate_greedy(prompts=["Hello my name is"],