[Kernel/Quant] Remove the original marlin format and qqq (#23204)
Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
@@ -11,7 +11,6 @@ import torch
|
||||
from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
|
||||
from vllm.model_executor.layers.quantization.gptq_marlin import (
|
||||
GPTQMarlinLinearMethod)
|
||||
from vllm.model_executor.layers.quantization.marlin import MarlinLinearMethod
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
||||
UnquantizedEmbeddingMethod)
|
||||
|
||||
@@ -19,9 +18,7 @@ PROMPT = "On the surface of Mars, we found"
|
||||
|
||||
MODELS_QUANT = [
|
||||
("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head", True),
|
||||
("ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024", False),
|
||||
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", False),
|
||||
("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", False)
|
||||
]
|
||||
|
||||
|
||||
@@ -41,8 +38,7 @@ def test_lm_head(
|
||||
lm_head_layer = model.lm_head
|
||||
if lm_head_quantized:
|
||||
assert isinstance(lm_head_layer.quant_method,
|
||||
(GPTQLinearMethod, GPTQMarlinLinearMethod,
|
||||
MarlinLinearMethod))
|
||||
(GPTQLinearMethod, GPTQMarlinLinearMethod))
|
||||
else:
|
||||
assert isinstance(lm_head_layer.quant_method,
|
||||
UnquantizedEmbeddingMethod)
|
||||
|
||||
Reference in New Issue
Block a user