[V1][Quantization] Add CUDA graph compatible v1 GGUF support (#18646)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Signed-off-by: Isotr0py <2037008807@qq.com>
2025-05-27 12:40:28 +08:00
parent 1f88dbd2bb
commit 1f1b1bc03b
5 changed files with 188 additions and 59 deletions
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1291,14 +1291,6 @@ class EngineArgs:
                               recommend_to_remove=False)
            return False

-        # Some quantization is not compatible with torch.compile.
-        V1_UNSUPPORTED_QUANT = ["gguf"]
-        if model_config.quantization in V1_UNSUPPORTED_QUANT:
-            _raise_or_fallback(
-                feature_name=f"--quantization {model_config.quantization}",
-                recommend_to_remove=False)
-            return False
-
        # No Embedding Models so far.
        if model_config.task not in ["generate"]:
            _raise_or_fallback(feature_name=f"--task {model_config.task}",