[Core] Support loading GGUF model (#5191)

Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-08-06 07:54:23 +08:00
parent ef527be06c
commit 360bd67cf0
29 changed files with 4970 additions and 21 deletions
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -672,6 +672,9 @@ class EngineArgs:
        return engine_args

    def create_engine_config(self, ) -> EngineConfig:
+        # gguf file needs a specific model loader and doesn't use hf_repo
+        if self.model.endswith(".gguf"):
+            self.quantization = self.load_format = "gguf"

        # bitsandbytes quantization needs a specific model loader
        # so we make sure the quant method and the load format are consistent