[Kernel/Quant] Remove the original marlin format and qqq (#23204)

Signed-off-by: mgoin <mgoin64@gmail.com>
Author: Michael Goin
Date: 2025-08-20 15:13:36 -04:00
Committed by: GitHub
Commit: 0cdbf5e61c (parent: ebe56a0064)
26 changed files with 92 additions and 3698 deletions

@@ -1112,9 +1112,9 @@ class ModelConfig:
     def _verify_quantization(self) -> None:
         supported_quantization = me_quant.QUANTIZATION_METHODS
         optimized_quantization_methods = [
-            "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin",
-            "awq_marlin", "fbgemm_fp8", "compressed-tensors", "experts_int8",
-            "quark", "modelopt_fp4", "bitblas", "gptq_bitblas", "inc"
+            "fp8", "modelopt", "gptq_marlin_24", "gptq_marlin", "awq_marlin",
+            "fbgemm_fp8", "compressed-tensors", "experts_int8", "quark",
+            "modelopt_fp4", "bitblas", "gptq_bitblas", "inc"
         ]
         if self.quantization is not None:
             self.quantization = cast(me_quant.QuantizationMethods,
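For context, optimized_quantization_methods is only consumed by a warning for methods that lack an optimized kernel path, so dropping "marlin" here goes hand in hand with removing the standalone Marlin format from QUANTIZATION_METHODS entirely. A minimal sketch of how such a check uses the list (the helper name and exact log message are assumptions, not the verbatim vLLM source):

import logging

logger = logging.getLogger(__name__)

def warn_if_unoptimized(quantization: str,
                        optimized_methods: list[str]) -> None:
    # Hypothetical helper mirroring the tail of _verify_quantization:
    # unlisted methods still run, they are just expected to be slower.
    if quantization not in optimized_methods:
        logger.warning(
            "%s quantization is not fully optimized yet. The speed can "
            "be slower than non-quantized models.", quantization)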
@@ -1137,7 +1137,6 @@ class ModelConfig:
         # `override_quantization_method` method) must be checked in order
         # of preference (this is particularly important for GPTQ).
         overrides = [
-            "marlin",
             "bitblas",
             "gptq_marlin_24",
             "gptq_marlin",