Move MultiModalConfig from config/__init__.py to config/multimodal.py (#24659)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-15 18:43:16 +01:00
parent b834b4cbf1
commit c4afdb69cc
11 changed files with 229 additions and 247 deletions
--- a/vllm/config/init.py
+++ b/vllm/config/init.py
@@ -11,13 +11,12 @@ import json
 import os
 import textwrap
 import warnings
-from collections.abc import Mapping
 from contextlib import contextmanager
-from dataclasses import MISSING, Field, field, fields, is_dataclass, replace
+from dataclasses import InitVar, field, fields, is_dataclass, replace
 from functools import cached_property, lru_cache
 from importlib.util import find_spec
-from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Literal, Optional,
-                    Protocol, TypeVar, Union, cast, get_args)
+from typing import (TYPE_CHECKING, Any, Callable, Literal, Optional, Protocol,
+                    TypeVar, Union, cast, get_args)

 import regex as re
 import torch
@@ -37,6 +36,8 @@ from vllm.config.kv_events import KVEventsConfig
 from vllm.config.kv_transfer import KVTransferConfig
 from vllm.config.load import LoadConfig
 from vllm.config.lora import LoRAConfig
+from vllm.config.multimodal import (MMCacheType, MMEncoderTPMode,
+                                    MultiModalConfig)
 from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig,
                                  ParallelConfig)
 from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy
@@ -238,31 +239,12 @@ def get_attr_docs(cls: type[Any]) -> dict[str, str]:
    return out


-def get_field(cls: ConfigType, name: str) -> Field:
-    """Get the default factory field of a dataclass by name. Used for getting
-    default factory fields in `EngineArgs`."""
-    if not is_dataclass(cls):
-        raise TypeError("The given class is not a dataclass.")
-    cls_fields = {f.name: f for f in fields(cls)}
-    if name not in cls_fields:
-        raise ValueError(f"Field '{name}' not found in {cls.__name__}.")
-    named_field: Field = cls_fields[name]
-    if (default_factory := named_field.default_factory) is not MISSING:
-        return field(default_factory=default_factory)
-    if (default := named_field.default) is not MISSING:
-        return field(default=default)
-    raise ValueError(
-        f"{cls.__name__}.{name} must have a default value or default factory.")
-
-
 def is_init_field(cls: ConfigType, name: str) -> bool:
    return next(f for f in fields(cls) if f.name == name).init


 TokenizerMode = Literal["auto", "slow", "mistral", "custom"]
 ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
-MMEncoderTPMode = Literal["weights", "data"]
-MMCacheType = Literal["shm", "lru"]


 class LogprobsMode(enum.Enum):
@@ -407,20 +389,6 @@ class ModelConfig:
    that this name(s) will also be used in `model_name` tag content of
    prometheus metrics, if multiple names provided, metrics tag will take the
    first one."""
-    limit_mm_per_prompt: dict[str, int] = field(default_factory=dict)
-    """Maximum number of data items per modality per prompt. Only applicable
-    for multimodal models."""
-    interleave_mm_strings: bool = False
-    """Enable fully interleaved support for multimodal prompts, while using
-    --chat-template-content-format=string. Defaults to False."""
-    skip_mm_profiling: bool = False
-    """When enabled, skips multimodal memory profiling and only profiles with
-    language backbone model during engine initialization.
-    """
-    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
-    """Additional args passed to process media inputs, keyed by modalities.
-    For example, to set num_frames for video, set
-    `--media-io-kwargs '{"video": {"num_frames": 40} }'` """
    use_async_output_proc: bool = True
    """Whether to use async output processor."""
    config_format: Union[str, ConfigFormat] = "auto"
@@ -436,41 +404,6 @@ class ModelConfig:
    hf_overrides: HfOverrides = field(default_factory=dict)
    """If a dictionary, contains arguments to be forwarded to the Hugging Face
    config. If a callable, it is called to update the HuggingFace config."""
-    mm_processor_kwargs: Optional[dict[str, Any]] = None
-    """Arguments to be forwarded to the model's processor for multi-modal data,
-    e.g., image processor. Overrides for the multi-modal processor obtained
-    from `AutoProcessor.from_pretrained`. The available overrides depend on the
-    model that is being run. For example, for Phi-3-Vision: `{"num_crops": 4}`.
-    """
-    mm_processor_cache_gb: float = 4
-    """The size (in GiB) of the multi-modal processor cache, which is used to
-    avoid re-processing past multi-modal inputs.
-
-    This cache is duplicated for each API process and engine core process,
-    resulting in a total memory usage of
-    `mm_processor_cache_gb * (api_server_count + data_parallel_size)`.
-
-    Set to `0` to disable this cache completely (not recommended)."""
-    mm_processor_cache_type: MMCacheType = "lru"
-    """Type of cache to use for the multi-modal preprocessor/mapper. If `shm`,
-    use shared memory FIFO cache. If `lru`, use mirrored LRU cache."""
-    mm_shm_cache_max_object_size_mb: int = 128
-    """Size limit (in MiB) for each object stored in the multi-modal processor
-    shared memory cache. Only effective when `mm_processor_cache_type` is
-    `"shm"`."""
-    mm_encoder_tp_mode: MMEncoderTPMode = "weights"
-    """Indicates how to optimize multi-modal encoder inference using
-    tensor parallelism (TP).
-
-    - `"weights"`: Within the same vLLM engine, split the weights of
-        each layer across TP ranks. (default TP behavior)
-    - `"data"`: Within the same vLLM engine, split the batched input data
-        across TP ranks to process the data in parallel, while hosting
-        the full weights on each TP rank.
-        This batch-level DP is not to be confused with API request-level
-        DP (which is controlled by `--data-parallel-size`).
-        This is only supported on a per-model basis and falls back to
-        `"weights"` if the encoder does not support DP."""
    pooler_config: Optional["PoolerConfig"] = field(init=False)
    """Pooler config which controls the behaviour of output pooling in pooling
    models."""
@@ -513,6 +446,18 @@ class ModelConfig:
    io_processor_plugin: Optional[str] = None
    """IOProcessor plugin name to load at model startup"""

+    # Multimodal config and init vars
+    multimodal_config: Optional[MultiModalConfig] = None
+    limit_mm_per_prompt: InitVar[Optional[dict[str, int]]] = None
+    media_io_kwargs: InitVar[Optional[dict[str, dict[str, Any]]]] = None
+    mm_processor_kwargs: InitVar[Optional[dict[str, Any]]] = None
+    mm_processor_cache_gb: InitVar[Optional[float]] = None
+    mm_processor_cache_type: InitVar[Optional[MMCacheType]] = None
+    mm_shm_cache_max_object_size_mb: InitVar[Optional[int]] = None
+    mm_encoder_tp_mode: InitVar[Optional[MMEncoderTPMode]] = None
+    interleave_mm_strings: InitVar[Optional[bool]] = None
+    skip_mm_profiling: InitVar[Optional[bool]] = None
+
    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,
@@ -546,7 +491,18 @@ class ModelConfig:
        assert_hashable(str_factors)
        return hashlib.sha256(str(factors).encode()).hexdigest()

-    def __post_init__(self) -> None:
+    def __post_init__(
+            self,
+            # Multimodal config init vars
+            limit_mm_per_prompt: Optional[dict[str, int]],
+            media_io_kwargs: Optional[dict[str, dict[str, Any]]],
+            mm_processor_kwargs: Optional[dict[str, Any]],
+            mm_processor_cache_gb: Optional[float],
+            mm_processor_cache_type: Optional[MMCacheType],
+            mm_shm_cache_max_object_size_mb: Optional[int],
+            mm_encoder_tp_mode: Optional[MMEncoderTPMode],
+            interleave_mm_strings: Optional[bool],
+            skip_mm_profiling: Optional[bool]) -> None:
        # Set the default seed to 0 in V1.
        # NOTE(woosuk): In V0, we set the default seed to None because the
        # driver worker shares the same process as the user process, and thus
@@ -777,7 +733,33 @@ class ModelConfig:

        self.original_max_model_len = self.max_model_len
        self.max_model_len = self.get_and_verify_max_len(self.max_model_len)
-        self.multimodal_config = self._init_multimodal_config()
+        # Init multimodal config if needed
+        if self._model_info.supports_multimodal:
+            if (mm_encoder_tp_mode == "data" and
+                    not self._model_info.supports_multimodal_encoder_tp_data):
+                logger.warning_once(
+                    "This model does not support `--mm-encoder-tp-mode data`. "
+                    "Falling back to `--mm-encoder-tp-mode weights`.")
+                mm_encoder_tp_mode = "weights"
+
+            mm_config_kwargs = dict(
+                limit_per_prompt=limit_mm_per_prompt,
+                media_io_kwargs=media_io_kwargs,
+                mm_processor_kwargs=mm_processor_kwargs,
+                mm_processor_cache_gb=mm_processor_cache_gb,
+                mm_processor_cache_type=mm_processor_cache_type,
+                mm_shm_cache_max_object_size_mb=mm_shm_cache_max_object_size_mb,
+                mm_encoder_tp_mode=mm_encoder_tp_mode,
+                interleave_mm_strings=interleave_mm_strings,
+                skip_mm_profiling=skip_mm_profiling,
+            )
+
+            mm_config_kwargs = {
+                k: v
+                for k, v in mm_config_kwargs.items() if v is not None
+            }
+
+            self.multimodal_config = MultiModalConfig(**mm_config_kwargs)

        if self.disable_sliding_window:
            # Set after get_and_verify_max_len to ensure that max_model_len
@@ -875,30 +857,6 @@ class ModelConfig:
                ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors"])
            self.tokenizer = object_storage_tokenizer.dir

-    def _init_multimodal_config(self) -> Optional["MultiModalConfig"]:
-        if self._model_info.supports_multimodal:
-            if (self.mm_encoder_tp_mode == "data" and
-                    not self._model_info.supports_multimodal_encoder_tp_data):
-                logger.warning_once(
-                    "This model does not support `--mm-encoder-tp-mode data`. "
-                    "Falling back to `--mm-encoder-tp-mode weights`.")
-                self.mm_encoder_tp_mode = "weights"
-
-            return MultiModalConfig(
-                limit_per_prompt=self.limit_mm_per_prompt,
-                media_io_kwargs=self.media_io_kwargs,
-                mm_processor_kwargs=self.mm_processor_kwargs,
-                mm_processor_cache_gb=self.mm_processor_cache_gb,
-                mm_processor_cache_type=self.mm_processor_cache_type,
-                mm_shm_cache_max_object_size_mb=self.
-                mm_shm_cache_max_object_size_mb,
-                mm_encoder_tp_mode=self.mm_encoder_tp_mode,
-                interleave_mm_strings=self.interleave_mm_strings,
-                skip_mm_profiling=self.skip_mm_profiling,
-            )
-
-        return None
-
    def _get_encoder_config(self):
        return get_sentence_transformer_tokenizer_config(
            self.model, self.revision)
@@ -2417,129 +2375,6 @@ class SpeculativeConfig:
        return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})"


-@config
-@dataclass
-class MultiModalConfig:
-    """Controls the behavior of multimodal models."""
-
-    limit_per_prompt: dict[str, int] = \
-        cast(dict[str, int], get_field(ModelConfig, "limit_mm_per_prompt"))
-    """
-    The maximum number of input items allowed per prompt for each modality.
-    Defaults to 1 (V0) or 999 (V1) for each modality.
-
-    For example, to allow up to 16 images and 2 videos per prompt:
-    `{"image": 16, "video": 2}`
-    """
-
-    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
-    """Additional args passed to process media inputs, keyed by modalities.
-    For example, to set num_frames for video, set
-    `--media-io-kwargs '{"video": {"num_frames": 40} }'` """
-
-    mm_processor_kwargs: Optional[dict[str, object]] = None
-    """
-    Overrides for the multi-modal processor obtained from
-    `transformers.AutoProcessor.from_pretrained`.
-
-    The available overrides depend on the model that is being run.
-
-    For example, for Phi-3-Vision:
-    `{"num_crops": 4}`.
-    """
-
-    mm_processor_cache_gb: float = 4
-    """
-    The size (in GiB) of the multi-modal processor cache, which is used to
-
-    This cache is duplicated for each API process and engine core process,
-    resulting in a total memory usage of
-    `mm_processor_cache_gb * (api_server_count + data_parallel_size)`.
-
-    Set to `0` to disable this cache completely (not recommended).
-    """
-
-    mm_processor_cache_type: MMCacheType = "lru"
-    """Type of cache to use for the multi-modal preprocessor/mapper. If `shm`,
-    use shared memory FIFO cache. If `lru`, use mirrored LRU cache."""
-
-    mm_shm_cache_max_object_size_mb: int = 128
-    """Size limit (in MiB) for each object stored in the multi-modal processor
-    shared memory cache. Only effective when `mm_processor_cache_type` is
-    `"shm"`."""
-
-    mm_encoder_tp_mode: MMEncoderTPMode = "weights"
-    """
-    Indicates how to optimize multi-modal encoder inference using
-    tensor parallelism (TP).
-
-    - `"weights"`: Within the same vLLM engine, split the weights of
-        each layer across TP ranks. (default TP behavior)
-    - `"data"`: Within the same vLLM engine, split the batched input data
-        across TP ranks to process the data in parallel, while hosting
-        the full weights on each TP rank.
-        This batch-level DP is not to be confused with API request-level
-        DP (which is controlled by `--data-parallel-size`).
-        This is only supported on a per-model basis and falls back to
-        `"weights"` if the encoder does not support DP.
-    """
-
-    interleave_mm_strings: bool = False
-    """
-    Enable fully interleaved support for multimodal prompts.
-    """
-
-    skip_mm_profiling: bool = False
-    """
-    When enabled, skips multimodal memory profiling and only profiles with
-    language backbone model during engine initialization.
-
-    This reduces engine startup time but shifts the responsibility to users for
-    estimating the peak memory usage of the activation of multimodal encoder and
-    embedding cache.
-    """
-
-    def compute_hash(self) -> str:
-        """
-        WARNING: Whenever a new field is added to this config,
-        ensure that it is included in the factors list if
-        it affects the computation graph.
-
-        Provide a hash that uniquely identifies all the configs
-        that affect the structure of the computation
-        graph from input ids/embeddings to the final hidden states,
-        excluding anything before input ids/embeddings and after
-        the final hidden states.
-        """
-        # no factors to consider.
-        # this config will not affect the computation graph.
-        factors: list[Any] = []
-        hash_str = hashlib.md5(str(factors).encode(),
-                               usedforsecurity=False).hexdigest()
-        return hash_str
-
-    def get_limit_per_prompt(self, modality: str) -> int:
-        """
-        Get the maximum number of input items allowed per prompt
-        for the given modality.
-        """
-        return self.limit_per_prompt.get(
-            modality,
-            999 if envs.VLLM_USE_V1 else 1,
-        )
-
-    def merge_mm_processor_kwargs(
-        self,
-        inference_kwargs: Mapping[str, object],
-    ) -> dict[str, object]:
-        """
-        Get the keyword arguments to pass to the multi-modal processor
-        according to the extra arguments passed during inference.
-        """
-        kwargs = self.mm_processor_kwargs or {}
-        return kwargs | dict(inference_kwargs)
-
-
@config
@dataclass
 class PoolerConfig: