diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index 5c2538079..6c3e2b9b8 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -3,10 +3,9 @@ import hashlib from collections.abc import Mapping -from dataclasses import field from typing import Any, Literal, TypeAlias -from pydantic import ConfigDict, Field, field_validator +from pydantic import ConfigDict, Field, field_validator, model_validator from pydantic.dataclasses import dataclass from vllm.config.utils import config @@ -55,7 +54,7 @@ DummyOptions: TypeAlias = ( class MultiModalConfig: """Controls the behavior of multimodal models.""" - limit_per_prompt: dict[str, DummyOptions] = field(default_factory=dict) + limit_per_prompt: dict[str, DummyOptions] = Field(default_factory=dict) """The maximum number of input items and options allowed per prompt for each modality. Defaults to 999 for each modality. @@ -71,7 +70,7 @@ class MultiModalConfig: {"image": 16, "video": {"count": 1, "num_frames": 32, "width": 512, "height": 512}} """ - media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) + media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict) """Additional args passed to process media inputs, keyed by modalities. For example, to set num_frames for video, set `--media-io-kwargs '{"video": {"num_frames": 40} }'`""" @@ -84,7 +83,7 @@ class MultiModalConfig: For example, for Phi-3-Vision: `{"num_crops": 4}`.""" - mm_processor_cache_gb: float = 4 + mm_processor_cache_gb: float = Field(default=4, ge=0) """The size (in GiB) of the multi-modal processor cache, which is used to avoid re-processing past multi-modal inputs. @@ -96,7 +95,7 @@ class MultiModalConfig: mm_processor_cache_type: MMCacheType = "lru" """Type of cache to use for the multi-modal preprocessor/mapper. If `shm`, use shared memory FIFO cache. If `lru`, use mirrored LRU cache.""" - mm_shm_cache_max_object_size_mb: int = 128 + mm_shm_cache_max_object_size_mb: int = Field(default=128, ge=0) """Size limit (in MiB) for each object stored in the multi-modal processor shared memory cache. Only effective when `mm_processor_cache_type` is `"shm"`.""" @@ -123,7 +122,7 @@ class MultiModalConfig: This reduces engine startup time but shifts the responsibility to users for estimating the peak memory usage of the activation of multimodal encoder and embedding cache.""" - video_pruning_rate: float | None = None + video_pruning_rate: float | None = Field(default=None, ge=0.0, lt=1.0) """Sets pruning rate for video pruning via Efficient Video Sampling. Value sits in range [0;1) and determines fraction of media tokens from each video to be pruned. @@ -149,6 +148,18 @@ class MultiModalConfig: value[k] = BaseDummyOptions(**v) return value + @model_validator(mode="after") + def _validate_multimodal_config(self): + if self.mm_processor_cache_type != "shm" and ( + self.mm_shm_cache_max_object_size_mb + != MultiModalConfig.mm_shm_cache_max_object_size_mb + ): + raise ValueError( + "'mm_shm_cache_max_object_size_mb' should only be set when " + "'mm_processor_cache_type' is 'shm'." + ) + return self + def compute_hash(self) -> str: """ WARNING: Whenever a new field is added to this config,