[Bugfix] Fix unable to load some models (#10312)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-11-15 08:55:54 +08:00
parent 11cd1ae6ad
commit 972112d82f
13 changed files with 340 additions and 59 deletions
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -9,7 +9,7 @@ from tqdm import tqdm
 from vllm import envs
 from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput,
                              BeamSearchSequence, get_beam_search_score)
-from vllm.engine.arg_utils import EngineArgs, TaskOption
+from vllm.engine.arg_utils import EngineArgs, HfOverrides, TaskOption
 from vllm.engine.llm_engine import LLMEngine
 from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
                                         apply_hf_chat_template,
@@ -101,7 +101,9 @@ class LLM:
        disable_custom_all_reduce: See :class:`~vllm.config.ParallelConfig`
        disable_async_output_proc: Disable async output processing.
            This may result in lower performance.
-        hf_overrides: Arguments to be forwarded to the HuggingFace config.
+        hf_overrides: If a dictionary, contains arguments to be forwarded to the
+            HuggingFace config. If a callable, it is called to update the
+            HuggingFace config.
        **kwargs: Arguments for :class:`~vllm.EngineArgs`. (See
            :ref:`engine_args`)

@@ -156,7 +158,7 @@ class LLM:
        max_seq_len_to_capture: int = 8192,
        disable_custom_all_reduce: bool = False,
        disable_async_output_proc: bool = False,
-        hf_overrides: Optional[dict] = None,
+        hf_overrides: Optional[HfOverrides] = None,
        mm_processor_kwargs: Optional[Dict[str, Any]] = None,
        # After positional args are removed, move this right below `model`
        task: TaskOption = "auto",