[Core] Support loading GGUF model (#5191)

Co-authored-by: Michael Goin <michael@neuralmagic.com>
This commit is contained in:
Isotr0py
2024-08-06 07:54:23 +08:00
committed by GitHub
parent ef527be06c
commit 360bd67cf0
29 changed files with 4970 additions and 21 deletions

View File

@@ -7,11 +7,12 @@ from typing import Tuple
import pytest
import torch
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
from vllm.model_executor.layers.quantization.gptq_marlin import (
GPTQMarlinLinearMethod)
from vllm.model_executor.layers.quantization.marlin import MarlinLinearMethod
from vllm.model_executor.layers.vocab_parallel_embedding import (
UnquantizedEmbeddingMethod)
PROMPT = "On the surface of Mars, we found"
@@ -37,7 +38,8 @@ def test_lm_head(
lm_head_layer.linear_method,
(GPTQLinearMethod, GPTQMarlinLinearMethod, MarlinLinearMethod))
else:
assert isinstance(lm_head_layer.linear_method, UnquantizedLinearMethod)
assert isinstance(lm_head_layer.linear_method,
UnquantizedEmbeddingMethod)
print(
vllm_model.generate_greedy(prompts=["Hello my name is"],