[Feature]: Improve GGUF loading from HuggingFace user experience like repo_id:quant_type (#29137)

Signed-off-by: Injae Ryou <injaeryou@gmail.com> Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-11-25 23:28:53 +09:00
parent 0231ce836a
commit 794029f012
10 changed files with 579 additions and 36 deletions
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -86,7 +86,7 @@ from vllm.transformers_utils.config import (
    is_interleaved,
    maybe_override_with_speculators,
 )
-from vllm.transformers_utils.utils import check_gguf_file, is_cloud_storage
+from vllm.transformers_utils.utils import is_cloud_storage, is_gguf
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.mem_constants import GiB_bytes
 from vllm.utils.network_utils import get_ip
@@ -1148,8 +1148,8 @@ class EngineArgs:
        return engine_args

    def create_model_config(self) -> ModelConfig:
-        # gguf file needs a specific model loader and doesn't use hf_repo
-        if check_gguf_file(self.model):
+        # gguf file needs a specific model loader
+        if is_gguf(self.model):
            self.quantization = self.load_format = "gguf"

        # NOTE(woosuk): In V1, we use separate processes for workers (unless