[Quantization] Deprecate Long Tail of Schemes (#31688)

Signed-off-by: Robert Shaw <robshaw@redhat.com>
Signed-off-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
This commit is contained in:
Robert Shaw
2026-01-08 19:07:45 -05:00
committed by GitHub
parent d62cfe546d
commit 5825bbc1f7
8 changed files with 61 additions and 5 deletions

View File

@@ -62,7 +62,10 @@ def models_list(*, all: bool = True, keywords: list[str] | None = None):
TEST_MODELS.append(
(
"alexm-nm/tinyllama-24-marlin24-4bit-g128",
{"quantization": "gptq_marlin_24"},
{
"quantization": "gptq_marlin_24",
"allow_deprecated_quantization": True,
},
)
)

View File

@@ -63,7 +63,10 @@ def test_models(
num_logprobs: int,
) -> None:
with vllm_runner(
model_pair.model_marlin, dtype=dtype, quantization="gptq_marlin_24"
model_pair.model_marlin,
dtype=dtype,
quantization="gptq_marlin_24",
allow_deprecated_quantization=True,
) as marlin_24_model:
marlin_24_outputs = marlin_24_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs

View File

@@ -26,7 +26,9 @@ MODELS = [
)
@pytest.mark.parametrize("model", MODELS)
def test_auto_round(vllm_runner, model):
with vllm_runner(model, enforce_eager=True) as llm:
with vllm_runner(
model, enforce_eager=True, allow_deprecated_quantization=True
) as llm:
output = llm.generate_greedy(["The capital of France is"], max_tokens=8)
assert output
print(f"{output[0][1]}")

View File

@@ -34,6 +34,10 @@ def test_model_experts_int8_startup(
model_info.check_transformers_version(on_fail="skip")
with vllm_runner(
model, dtype=dtype, enforce_eager=True, quantization="experts_int8"
model,
dtype=dtype,
enforce_eager=True,
quantization="experts_int8",
allow_deprecated_quantization=True,
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)

View File

@@ -30,6 +30,10 @@ def test_model_rtn_startup(
max_tokens: int,
) -> None:
with vllm_runner(
model, enforce_eager=True, dtype=dtype, quantization="rtn"
model,
enforce_eager=True,
dtype=dtype,
quantization="rtn",
allow_deprecated_quantization=True,
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)

View File

@@ -191,6 +191,8 @@ class ModelConfig:
`quantization_config` attribute in the model config file. If that is
`None`, we assume the model weights are not quantized and use `dtype` to
determine the data type of the weights."""
allow_deprecated_quantization: bool = False
"""Whether to allow deprecated quantization methods."""
enforce_eager: bool = False
"""Whether to always use eager-mode PyTorch. If True, we will disable CUDA
graph and always execute the model in eager mode. If False, we will use
@@ -940,6 +942,21 @@ class ModelConfig:
current_platform.verify_quantization(self.quantization)
# Deprecated quantization schemes remain usable only behind an explicit
# opt-in flag, giving users a migration window before removal.
if self.quantization in me_quant.DEPRECATED_QUANTIZATION_METHODS:
    if self.allow_deprecated_quantization:
        # Lazy %-style args are correct for logger calls.
        logger.warning(
            "The quantization method %s is deprecated "
            "and will be removed in future versions of vLLM.",
            self.quantization,
        )
    else:
        # BUG FIX: the original passed logger-style lazy arguments to
        # ValueError ("...%s...", self.quantization). Exceptions do not
        # interpolate % placeholders; the message would have been the
        # repr of a 2-tuple with a literal "%s" in it. Build the
        # message eagerly instead.
        raise ValueError(
            f"The quantization method {self.quantization} is deprecated "
            "and will be removed in future versions of vLLM. To bypass, "
            "set `--allow-deprecated-quantization`."
        )
def _verify_cuda_graph(self) -> None:
# CUDAGraph capture not supported for encoder-decoder models on ROCm
unsupported_rocm = self.is_encoder_decoder

View File

@@ -451,6 +451,7 @@ class EngineArgs:
hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides")
tokenizer_revision: str | None = ModelConfig.tokenizer_revision
quantization: QuantizationMethods | None = ModelConfig.quantization
allow_deprecated_quantization: bool = ModelConfig.allow_deprecated_quantization
enforce_eager: bool = ModelConfig.enforce_eager
disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
limit_mm_per_prompt: dict[str, int | dict[str, int]] = get_field(
@@ -648,6 +649,10 @@ class EngineArgs:
)
model_group.add_argument("--max-model-len", **model_kwargs["max_model_len"])
model_group.add_argument("--quantization", "-q", **model_kwargs["quantization"])
model_group.add_argument(
"--allow-deprecated-quantization",
**model_kwargs["allow_deprecated_quantization"],
)
model_group.add_argument("--enforce-eager", **model_kwargs["enforce_eager"])
model_group.add_argument("--max-logprobs", **model_kwargs["max_logprobs"])
model_group.add_argument("--logprobs-mode", **model_kwargs["logprobs_mode"])
@@ -1225,6 +1230,7 @@ class EngineArgs:
tokenizer_revision=self.tokenizer_revision,
max_model_len=self.max_model_len,
quantization=self.quantization,
allow_deprecated_quantization=self.allow_deprecated_quantization,
enforce_eager=self.enforce_eager,
max_logprobs=self.max_logprobs,
logprobs_mode=self.logprobs_mode,

View File

@@ -41,6 +41,23 @@ QuantizationMethods = Literal[
]
QUANTIZATION_METHODS: list[str] = list(get_args(QuantizationMethods))
# Quantization methods scheduled for removal from vLLM. Selecting one of
# these requires the explicit `--allow-deprecated-quantization` opt-in
# (enforced by ModelConfig; see the `allow_deprecated_quantization` field),
# which downgrades the hard error to a deprecation warning.
DEPRECATED_QUANTIZATION_METHODS = [
    "deepspeedfp",
    "tpu_int8",
    "ptpc_fp8",
    "fbgemm_fp8",
    "fp_quant",
    "bitblas",
    "gptq_marlin_24",
    "gptq_bitblas",
    "hqq",
    "experts_int8",
    "ipex",
    "auto-round",
    "rtn",
    "petit_nvfp4",
]
# The customized quantization methods which will be added to this dict.
_CUSTOMIZED_METHOD_TO_QUANT_CONFIG = {}