[Quantization] Deprecate Long Tail of Schemes (#31688)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Signed-off-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
This commit is contained in:
@@ -62,7 +62,10 @@ def models_list(*, all: bool = True, keywords: list[str] | None = None):
|
||||
TEST_MODELS.append(
|
||||
(
|
||||
"alexm-nm/tinyllama-24-marlin24-4bit-g128",
|
||||
{"quantization": "gptq_marlin_24"},
|
||||
{
|
||||
"quantization": "gptq_marlin_24",
|
||||
"allow_deprecated_quantization": True,
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
@@ -63,7 +63,10 @@ def test_models(
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
with vllm_runner(
|
||||
model_pair.model_marlin, dtype=dtype, quantization="gptq_marlin_24"
|
||||
model_pair.model_marlin,
|
||||
dtype=dtype,
|
||||
quantization="gptq_marlin_24",
|
||||
allow_deprecated_quantization=True,
|
||||
) as marlin_24_model:
|
||||
marlin_24_outputs = marlin_24_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs
|
||||
|
||||
@@ -26,7 +26,9 @@ MODELS = [
|
||||
)
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
def test_auto_round(vllm_runner, model):
|
||||
with vllm_runner(model, enforce_eager=True) as llm:
|
||||
with vllm_runner(
|
||||
model, enforce_eager=True, allow_deprecated_quantization=True
|
||||
) as llm:
|
||||
output = llm.generate_greedy(["The capital of France is"], max_tokens=8)
|
||||
assert output
|
||||
print(f"{output[0][1]}")
|
||||
|
||||
@@ -34,6 +34,10 @@ def test_model_experts_int8_startup(
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
with vllm_runner(
|
||||
model, dtype=dtype, enforce_eager=True, quantization="experts_int8"
|
||||
model,
|
||||
dtype=dtype,
|
||||
enforce_eager=True,
|
||||
quantization="experts_int8",
|
||||
allow_deprecated_quantization=True,
|
||||
) as vllm_model:
|
||||
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
@@ -30,6 +30,10 @@ def test_model_rtn_startup(
|
||||
max_tokens: int,
|
||||
) -> None:
|
||||
with vllm_runner(
|
||||
model, enforce_eager=True, dtype=dtype, quantization="rtn"
|
||||
model,
|
||||
enforce_eager=True,
|
||||
dtype=dtype,
|
||||
quantization="rtn",
|
||||
allow_deprecated_quantization=True,
|
||||
) as vllm_model:
|
||||
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
@@ -191,6 +191,8 @@ class ModelConfig:
|
||||
`quantization_config` attribute in the model config file. If that is
|
||||
`None`, we assume the model weights are not quantized and use `dtype` to
|
||||
determine the data type of the weights."""
|
||||
allow_deprecated_quantization: bool = False
|
||||
"""Whether to allow deprecated quantization methods."""
|
||||
enforce_eager: bool = False
|
||||
"""Whether to always use eager-mode PyTorch. If True, we will disable CUDA
|
||||
graph and always execute the model in eager mode. If False, we will use
|
||||
@@ -940,6 +942,21 @@ class ModelConfig:
|
||||
|
||||
current_platform.verify_quantization(self.quantization)
|
||||
|
||||
if self.quantization in me_quant.DEPRECATED_QUANTIZATION_METHODS:
|
||||
if self.allow_deprecated_quantization:
|
||||
logger.warning(
|
||||
"The quantization method %s is deprecated "
|
||||
"and will be removed in future versions of vLLM.",
|
||||
self.quantization,
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
"The quantization method %s is deprecated "
|
||||
"and will be removed in future versions of vLLM. To bypass, "
|
||||
"set `--allow-deprecated-quantization`.",
|
||||
self.quantization,
|
||||
)
|
||||
|
||||
def _verify_cuda_graph(self) -> None:
|
||||
# CUDAGraph capture not supported for encoder-decoder models on ROCm
|
||||
unsupported_rocm = self.is_encoder_decoder
|
||||
|
||||
@@ -451,6 +451,7 @@ class EngineArgs:
|
||||
hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides")
|
||||
tokenizer_revision: str | None = ModelConfig.tokenizer_revision
|
||||
quantization: QuantizationMethods | None = ModelConfig.quantization
|
||||
allow_deprecated_quantization: bool = ModelConfig.allow_deprecated_quantization
|
||||
enforce_eager: bool = ModelConfig.enforce_eager
|
||||
disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
|
||||
limit_mm_per_prompt: dict[str, int | dict[str, int]] = get_field(
|
||||
@@ -648,6 +649,10 @@ class EngineArgs:
|
||||
)
|
||||
model_group.add_argument("--max-model-len", **model_kwargs["max_model_len"])
|
||||
model_group.add_argument("--quantization", "-q", **model_kwargs["quantization"])
|
||||
model_group.add_argument(
|
||||
"--allow-deprecated-quantization",
|
||||
**model_kwargs["allow_deprecated_quantization"],
|
||||
)
|
||||
model_group.add_argument("--enforce-eager", **model_kwargs["enforce_eager"])
|
||||
model_group.add_argument("--max-logprobs", **model_kwargs["max_logprobs"])
|
||||
model_group.add_argument("--logprobs-mode", **model_kwargs["logprobs_mode"])
|
||||
@@ -1225,6 +1230,7 @@ class EngineArgs:
|
||||
tokenizer_revision=self.tokenizer_revision,
|
||||
max_model_len=self.max_model_len,
|
||||
quantization=self.quantization,
|
||||
allow_deprecated_quantization=self.allow_deprecated_quantization,
|
||||
enforce_eager=self.enforce_eager,
|
||||
max_logprobs=self.max_logprobs,
|
||||
logprobs_mode=self.logprobs_mode,
|
||||
|
||||
@@ -41,6 +41,23 @@ QuantizationMethods = Literal[
|
||||
]
|
||||
QUANTIZATION_METHODS: list[str] = list(get_args(QuantizationMethods))
|
||||
|
||||
DEPRECATED_QUANTIZATION_METHODS = [
|
||||
"deepspeedfp",
|
||||
"tpu_int8",
|
||||
"ptpc_fp8",
|
||||
"fbgemm_fp8",
|
||||
"fp_quant",
|
||||
"bitblas",
|
||||
"gptq_marlin_24",
|
||||
"gptq_bitblas",
|
||||
"hqq",
|
||||
"experts_int8",
|
||||
"ipex",
|
||||
"auto-round",
|
||||
"rtn",
|
||||
"petit_nvfp4",
|
||||
]
|
||||
|
||||
# The customized quantization methods which will be added to this dict.
|
||||
_CUSTOMIZED_METHOD_TO_QUANT_CONFIG = {}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user