[Quantization] Deprecate Long Tail of Schemes (#31688)

Signed-off-by: Robert Shaw <robshaw@redhat.com>
Signed-off-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
This commit is contained in:
Robert Shaw
2026-01-08 19:07:45 -05:00
committed by GitHub
parent d62cfe546d
commit 5825bbc1f7
8 changed files with 61 additions and 5 deletions

View File

@@ -62,7 +62,10 @@ def models_list(*, all: bool = True, keywords: list[str] | None = None):
TEST_MODELS.append(
(
"alexm-nm/tinyllama-24-marlin24-4bit-g128",
{"quantization": "gptq_marlin_24"},
{
"quantization": "gptq_marlin_24",
"allow_deprecated_quantization": True,
},
)
)

View File

@@ -63,7 +63,10 @@ def test_models(
num_logprobs: int,
) -> None:
with vllm_runner(
model_pair.model_marlin, dtype=dtype, quantization="gptq_marlin_24"
model_pair.model_marlin,
dtype=dtype,
quantization="gptq_marlin_24",
allow_deprecated_quantization=True,
) as marlin_24_model:
marlin_24_outputs = marlin_24_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs

View File

@@ -26,7 +26,9 @@ MODELS = [
)
@pytest.mark.parametrize("model", MODELS)
def test_auto_round(vllm_runner, model):
with vllm_runner(model, enforce_eager=True) as llm:
with vllm_runner(
model, enforce_eager=True, allow_deprecated_quantization=True
) as llm:
output = llm.generate_greedy(["The capital of France is"], max_tokens=8)
assert output
print(f"{output[0][1]}")

View File

@@ -34,6 +34,10 @@ def test_model_experts_int8_startup(
model_info.check_transformers_version(on_fail="skip")
with vllm_runner(
model, dtype=dtype, enforce_eager=True, quantization="experts_int8"
model,
dtype=dtype,
enforce_eager=True,
quantization="experts_int8",
allow_deprecated_quantization=True,
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)

View File

@@ -30,6 +30,10 @@ def test_model_rtn_startup(
max_tokens: int,
) -> None:
with vllm_runner(
model, enforce_eager=True, dtype=dtype, quantization="rtn"
model,
enforce_eager=True,
dtype=dtype,
quantization="rtn",
allow_deprecated_quantization=True,
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)

View File

@@ -191,6 +191,8 @@ class ModelConfig:
`quantization_config` attribute in the model config file. If that is
`None`, we assume the model weights are not quantized and use `dtype` to
determine the data type of the weights."""
allow_deprecated_quantization: bool = False
"""Whether to allow deprecated quantization methods."""
enforce_eager: bool = False
"""Whether to always use eager-mode PyTorch. If True, we will disable CUDA
graph and always execute the model in eager mode. If False, we will use
@@ -940,6 +942,21 @@ class ModelConfig:
current_platform.verify_quantization(self.quantization)
# Deprecated quantization schemes remain usable only behind an explicit
# opt-in flag, giving users a migration window before removal.
if self.quantization in me_quant.DEPRECATED_QUANTIZATION_METHODS:
    if self.allow_deprecated_quantization:
        # Lazy %-style args are correct for logger calls.
        logger.warning(
            "The quantization method %s is deprecated "
            "and will be removed in future versions of vLLM.",
            self.quantization,
        )
    else:
        # BUG FIX: the original passed logger-style lazy arguments to
        # ValueError ("...%s...", self.quantization). Exceptions do not
        # interpolate % placeholders; the message would have been the
        # repr of a 2-tuple with a literal "%s" in it. Build the
        # message eagerly instead.
        raise ValueError(
            f"The quantization method {self.quantization} is deprecated "
            "and will be removed in future versions of vLLM. To bypass, "
            "set `--allow-deprecated-quantization`."
        )
def _verify_cuda_graph(self) -> None:
# CUDAGraph capture not supported for encoder-decoder models on ROCm
unsupported_rocm = self.is_encoder_decoder

View File

@@ -451,6 +451,7 @@ class EngineArgs:
hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides")
tokenizer_revision: str | None = ModelConfig.tokenizer_revision
quantization: QuantizationMethods | None = ModelConfig.quantization
allow_deprecated_quantization: bool = ModelConfig.allow_deprecated_quantization
enforce_eager: bool = ModelConfig.enforce_eager
disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
limit_mm_per_prompt: dict[str, int | dict[str, int]] = get_field(
@@ -648,6 +649,10 @@ class EngineArgs:
)
model_group.add_argument("--max-model-len", **model_kwargs["max_model_len"])
model_group.add_argument("--quantization", "-q", **model_kwargs["quantization"])
model_group.add_argument(
"--allow-deprecated-quantization",
**model_kwargs["allow_deprecated_quantization"],
)
model_group.add_argument("--enforce-eager", **model_kwargs["enforce_eager"])
model_group.add_argument("--max-logprobs", **model_kwargs["max_logprobs"])
model_group.add_argument("--logprobs-mode", **model_kwargs["logprobs_mode"])
@@ -1225,6 +1230,7 @@ class EngineArgs:
tokenizer_revision=self.tokenizer_revision,
max_model_len=self.max_model_len,
quantization=self.quantization,
allow_deprecated_quantization=self.allow_deprecated_quantization,
enforce_eager=self.enforce_eager,
max_logprobs=self.max_logprobs,
logprobs_mode=self.logprobs_mode,

View File

@@ -41,6 +41,23 @@ QuantizationMethods = Literal[
]
QUANTIZATION_METHODS: list[str] = list(get_args(QuantizationMethods))
# Quantization methods scheduled for removal from vLLM. Selecting one of
# these requires the explicit `--allow-deprecated-quantization` opt-in
# (enforced by ModelConfig; see the `allow_deprecated_quantization` field),
# which downgrades the hard error to a deprecation warning.
DEPRECATED_QUANTIZATION_METHODS = [
    "deepspeedfp",
    "tpu_int8",
    "ptpc_fp8",
    "fbgemm_fp8",
    "fp_quant",
    "bitblas",
    "gptq_marlin_24",
    "gptq_bitblas",
    "hqq",
    "experts_int8",
    "ipex",
    "auto-round",
    "rtn",
    "petit_nvfp4",
]
# The customized quantization methods which will be added to this dict.
_CUSTOMIZED_METHOD_TO_QUANT_CONFIG = {}