[Frontend][Core] Override HF config.json via CLI (#5836)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2024-11-09 08:19:27 -08:00
parent d88bff1b96
commit b09895a618
7 changed files with 73 additions and 53 deletions
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -98,7 +98,10 @@ class LLM:
            to eager mode. Additionally for encoder-decoder models, if the
            sequence length of the encoder input is larger than this, we fall
            back to the eager mode.
-        disable_custom_all_reduce: See ParallelConfig
+        disable_custom_all_reduce: See :class:`~vllm.config.ParallelConfig`
+        disable_async_output_proc: Disable async output processing.
+            This may result in lower performance.
+        hf_overrides: Arguments to be forwarded to the HuggingFace config.
        **kwargs: Arguments for :class:`~vllm.EngineArgs`. (See
            :ref:`engine_args`)

@@ -153,6 +156,7 @@ class LLM:
        max_seq_len_to_capture: int = 8192,
        disable_custom_all_reduce: bool = False,
        disable_async_output_proc: bool = False,
+        hf_overrides: Optional[dict] = None,
        mm_processor_kwargs: Optional[Dict[str, Any]] = None,
        # After positional args are removed, move this right below `model`
        task: TaskOption = "auto",
@@ -194,6 +198,7 @@ class LLM:
            max_seq_len_to_capture=max_seq_len_to_capture,
            disable_custom_all_reduce=disable_custom_all_reduce,
            disable_async_output_proc=disable_async_output_proc,
+            hf_overrides=hf_overrides,
            mm_processor_kwargs=mm_processor_kwargs,
            pooling_type=pooling_type,
            pooling_norm=pooling_norm,