[UX] Add --language-model-only for hybrid models (#34120)

Signed-off-by: Roger Wang <hey@rogerw.io>
2026-02-09 06:57:33 -08:00
parent d0d97e2974
commit 64a9c2528b
3 changed files with 19 additions and 3 deletions
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -297,6 +297,7 @@ class ModelConfig:
    multimodal_config: MultiModalConfig | None = None
    """Configuration for multimodal model. If `None`, this will be inferred
    from the architecture of `self.model`."""
+    language_model_only: InitVar[bool] = False
    limit_mm_per_prompt: InitVar[dict[str, int | dict[str, int]] | None] = None
    enable_mm_embeds: InitVar[bool | None] = None
    media_io_kwargs: InitVar[dict[str, dict[str, Any]] | None] = None
@@ -411,6 +412,7 @@ class ModelConfig:
    def __post_init__(
        self,
        # Multimodal config init vars
+        language_model_only: bool,
        limit_mm_per_prompt: dict[str, int | dict[str, int]] | None,
        enable_mm_embeds: bool | None,
        media_io_kwargs: dict[str, dict[str, Any]] | None,
@@ -576,6 +578,7 @@ class ModelConfig:
                mm_encoder_tp_mode = "weights"

            mm_config_kwargs = dict(
+                language_model_only=language_model_only,
                limit_per_prompt=limit_mm_per_prompt,
                enable_mm_embeds=enable_mm_embeds,
                media_io_kwargs=media_io_kwargs,
--- a/vllm/config/multimodal.py
+++ b/vllm/config/multimodal.py
@@ -54,8 +54,12 @@ DummyOptions: TypeAlias = (
 class MultiModalConfig:
    """Controls the behavior of multimodal models."""

+    language_model_only: bool = False
+    """If True, disables all multimodal inputs by setting all modality limits
+    to 0. Equivalent to setting --limit-mm-per-prompt to 0 for every
+    modality."""
    limit_per_prompt: dict[str, DummyOptions] = Field(default_factory=dict)
-    """The maximum number of input items and options allowed per 
+    """The maximum number of input items and options allowed per
        prompt for each modality.
    Defaults to 999 for each modality.

@@ -63,11 +67,11 @@ class MultiModalConfig:
        {"image": 16, "video": 2}

    Configurable format (with options):
-        {"video": {"count": 1, "num_frames": 32, "width": 512, "height": 512}, 
+        {"video": {"count": 1, "num_frames": 32, "width": 512, "height": 512},
        "image": {"count": 5, "width": 512, "height": 512}}

    Mixed format (combining both):
-        {"image": 16, "video": {"count": 1, "num_frames": 32, "width": 512, 
+        {"image": 16, "video": {"count": 1, "num_frames": 32, "width": 512,
        "height": 512}}
    """
    enable_mm_embeds: bool = False
@@ -215,6 +219,7 @@ class MultiModalConfig:
        the final hidden states.
        """
        factors: list[Any] = [
+            self.language_model_only,
            self.mm_encoder_attn_backend.name
            if self.mm_encoder_attn_backend is not None
            else None,
@@ -228,6 +233,9 @@ class MultiModalConfig:
        Get the maximum number of input items allowed per prompt
        for the given modality (backward compatible).
        """
+        if self.language_model_only:
+            return 0
+
        limit_data = self.limit_per_prompt.get(modality)

        if limit_data is None: