From 64a9c2528b1487fbfefa333cb1b246a57cddd4b2 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 9 Feb 2026 06:57:33 -0800 Subject: [PATCH] [UX] Add `--language-model-only` for hybrid models (#34120) Signed-off-by: Roger Wang --- vllm/config/model.py | 3 +++ vllm/config/multimodal.py | 14 +++++++++++--- vllm/engine/arg_utils.py | 5 +++++ 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/vllm/config/model.py b/vllm/config/model.py index b76d51868..96dbf9725 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -297,6 +297,7 @@ class ModelConfig: multimodal_config: MultiModalConfig | None = None """Configuration for multimodal model. If `None`, this will be inferred from the architecture of `self.model`.""" + language_model_only: InitVar[bool] = False limit_mm_per_prompt: InitVar[dict[str, int | dict[str, int]] | None] = None enable_mm_embeds: InitVar[bool | None] = None media_io_kwargs: InitVar[dict[str, dict[str, Any]] | None] = None @@ -411,6 +412,7 @@ class ModelConfig: def __post_init__( self, # Multimodal config init vars + language_model_only: bool, limit_mm_per_prompt: dict[str, int | dict[str, int]] | None, enable_mm_embeds: bool | None, media_io_kwargs: dict[str, dict[str, Any]] | None, @@ -576,6 +578,7 @@ class ModelConfig: mm_encoder_tp_mode = "weights" mm_config_kwargs = dict( + language_model_only=language_model_only, limit_per_prompt=limit_mm_per_prompt, enable_mm_embeds=enable_mm_embeds, media_io_kwargs=media_io_kwargs, diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index 30305e4be..68244ba2f 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -54,8 +54,12 @@ DummyOptions: TypeAlias = ( class MultiModalConfig: """Controls the behavior of multimodal models.""" + language_model_only: bool = False + """If True, disables all multimodal inputs by setting all modality limits + to 0. Equivalent to setting --limit-mm-per-prompt to 0 for every + modality.""" limit_per_prompt: dict[str, DummyOptions] = Field(default_factory=dict) - """The maximum number of input items and options allowed per + """The maximum number of input items and options allowed per prompt for each modality. Defaults to 999 for each modality. @@ -63,11 +67,11 @@ class MultiModalConfig: {"image": 16, "video": 2} Configurable format (with options): - {"video": {"count": 1, "num_frames": 32, "width": 512, "height": 512}, + {"video": {"count": 1, "num_frames": 32, "width": 512, "height": 512}, "image": {"count": 5, "width": 512, "height": 512}} Mixed format (combining both): - {"image": 16, "video": {"count": 1, "num_frames": 32, "width": 512, + {"image": 16, "video": {"count": 1, "num_frames": 32, "width": 512, "height": 512}} """ enable_mm_embeds: bool = False @@ -215,6 +219,7 @@ class MultiModalConfig: the final hidden states. """ factors: list[Any] = [ + self.language_model_only, self.mm_encoder_attn_backend.name if self.mm_encoder_attn_backend is not None else None, @@ -228,6 +233,9 @@ class MultiModalConfig: Get the maximum number of input items allowed per prompt for the given modality (backward compatible). """ + if self.language_model_only: + return 0 + limit_data = self.limit_per_prompt.get(modality) if limit_data is None: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index cf05c8e87..c7c78ffd8 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -454,6 +454,7 @@ class EngineArgs: allow_deprecated_quantization: bool = ModelConfig.allow_deprecated_quantization enforce_eager: bool = ModelConfig.enforce_eager disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce + language_model_only: bool = MultiModalConfig.language_model_only limit_mm_per_prompt: dict[str, int | dict[str, int]] = get_field( MultiModalConfig, "limit_per_prompt" ) @@ -975,6 +976,9 @@ class EngineArgs: title="MultiModalConfig", description=MultiModalConfig.__doc__, ) + multimodal_group.add_argument( + "--language-model-only", **multimodal_kwargs["language_model_only"] + ) multimodal_group.add_argument( "--limit-mm-per-prompt", **multimodal_kwargs["limit_per_prompt"] ) @@ -1291,6 +1295,7 @@ class EngineArgs: skip_tokenizer_init=self.skip_tokenizer_init, enable_prompt_embeds=self.enable_prompt_embeds, served_model_name=self.served_model_name, + language_model_only=self.language_model_only, limit_mm_per_prompt=self.limit_mm_per_prompt, enable_mm_embeds=self.enable_mm_embeds, interleave_mm_strings=self.interleave_mm_strings,