[Kernel/Quant] Remove the original marlin format and qqq (#23204)

Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-20 15:13:36 -04:00
parent ebe56a0064
commit 0cdbf5e61c
26 changed files with 92 additions and 3698 deletions
--- a/tests/quantization/test_lm_head.py
+++ b/tests/quantization/test_lm_head.py
@@ -11,7 +11,6 @@ import torch
 from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
 from vllm.model_executor.layers.quantization.gptq_marlin import (
    GPTQMarlinLinearMethod)
-from vllm.model_executor.layers.quantization.marlin import MarlinLinearMethod
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    UnquantizedEmbeddingMethod)

@@ -19,9 +18,7 @@ PROMPT = "On the surface of Mars, we found"

 MODELS_QUANT = [
    ("ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head", True),
-    ("ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024", False),
    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", False),
-    ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", False)
 ]


@@ -41,8 +38,7 @@ def test_lm_head(
            lm_head_layer = model.lm_head
            if lm_head_quantized:
                assert isinstance(lm_head_layer.quant_method,
-                                  (GPTQLinearMethod, GPTQMarlinLinearMethod,
-                                   MarlinLinearMethod))
+                                  (GPTQLinearMethod, GPTQMarlinLinearMethod))
            else:
                assert isinstance(lm_head_layer.quant_method,
                                  UnquantizedEmbeddingMethod)