[Kernel/Quant] Remove the original marlin format and qqq (#23204)

Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
Michael Goin
2025-08-20 15:13:36 -04:00
committed by GitHub
parent ebe56a0064
commit 0cdbf5e61c
26 changed files with 92 additions and 3698 deletions

View File

@@ -11,7 +11,6 @@ import torch
from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
from vllm.model_executor.layers.quantization.gptq_marlin import (
GPTQMarlinLinearMethod)
from vllm.model_executor.layers.quantization.marlin import MarlinLinearMethod
from vllm.model_executor.layers.vocab_parallel_embedding import (
UnquantizedEmbeddingMethod)
@@ -19,9 +18,7 @@ PROMPT = "On the surface of Mars, we found"
MODELS_QUANT = [
("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head", True),
("ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024", False),
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", False),
("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", False)
]
@@ -41,8 +38,7 @@ def test_lm_head(
lm_head_layer = model.lm_head
if lm_head_quantized:
assert isinstance(lm_head_layer.quant_method,
(GPTQLinearMethod, GPTQMarlinLinearMethod,
MarlinLinearMethod))
(GPTQLinearMethod, GPTQMarlinLinearMethod))
else:
assert isinstance(lm_head_layer.quant_method,
UnquantizedEmbeddingMethod)