diff --git a/docs/contributing/model/multimodal.md b/docs/contributing/model/multimodal.md index e123e0dcd..67cde8df9 100644 --- a/docs/contributing/model/multimodal.md +++ b/docs/contributing/model/multimodal.md @@ -293,21 +293,22 @@ Assuming that the memory usage increases with the number of tokens, the dummy in self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = \ self.info.get_image_size_with_most_features() - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { - "image": - self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images, - overrides=image_overrides) + "image": self._get_dummy_images( + width=target_width, + height=target_height, + num_images=num_images, + overrides=image_overrides, + ) } ``` @@ -479,17 +480,16 @@ Assuming that the memory usage increases with the number of tokens, the dummy in self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Optional[Mapping[str, BaseDummyOptions]] = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: target_width, target_height = \ self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { - "image": - self._get_dummy_images( + "image": self._get_dummy_images( width=target_width, height=target_height, num_images=num_images, diff --git a/tests/models/multimodal/processing/test_audioflamingo3.py b/tests/models/multimodal/processing/test_audioflamingo3.py index d7c00516f..428fd9c6e 100644 --- a/tests/models/multimodal/processing/test_audioflamingo3.py +++ b/tests/models/multimodal/processing/test_audioflamingo3.py @@ -116,7 +116,7 @@ def test_dummy_data_generation(mock_ctx): builder = AudioFlamingo3DummyInputsBuilder(info) mm_counts = {"audio": 2} - dummy_data = builder.get_dummy_mm_data(100, mm_counts, None) + dummy_data = builder.get_dummy_mm_data(100, mm_counts, {}) assert "audio" in dummy_data assert len(dummy_data["audio"]) == 2 diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 7f18d5b03..0c9e73094 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -195,6 +195,7 @@ def get_text_token_prompts( inputs = dummy_inputs.get_dummy_processor_inputs( model_config.max_model_len, mm_counts, + mm_options={}, ) text_prompt = None token_prompt = ( @@ -224,6 +225,7 @@ def get_text_token_prompts( inputs = dummy_inputs.get_dummy_processor_inputs( model_config.max_model_len, mm_counts, + mm_options={}, ) assert isinstance(inputs.prompt, str) diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py index 8f7993647..5661c2ce4 100644 --- a/tests/models/multimodal/processing/test_tensor_schema.py +++ b/tests/models/multimodal/processing/test_tensor_schema.py @@ -97,6 +97,7 @@ def create_batched_mm_kwargs( processor_inputs = dummy_inputs.get_dummy_processor_inputs( seq_len=model_config.max_model_len, mm_counts=mm_counts, + mm_options={}, ) mm_items = processor_inputs.mm_items resized_mm_data = { diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index 0a867f1c8..f95a2e140 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Mapping -from typing import Any, Literal, TypeAlias +from typing import Any, Literal, TypeAlias, TypedDict, final from pydantic import ConfigDict, Field, field_validator, model_validator from pydantic.dataclasses import dataclass @@ -43,11 +43,29 @@ class AudioDummyOptions(BaseDummyOptions): length: int | None = Field(None, gt=0) +@final +class MultiModalDummyOptionsBuiltins(TypedDict, total=False): + """Type annotations for modality types predefined by vLLM.""" + + image: ImageDummyOptions + """Options for dummy images.""" + + video: VideoDummyOptions + """Options for dummy videos.""" + + audio: AudioDummyOptions + """Options for dummy audios.""" + + MMEncoderTPMode = Literal["weights", "data"] MMCacheType = Literal["shm", "lru"] -DummyOptions: TypeAlias = ( - BaseDummyOptions | VideoDummyOptions | ImageDummyOptions | AudioDummyOptions -) +MMDummyOptions: TypeAlias = dict[str, BaseDummyOptions] +""" +A dictionary containing an entry for each modality type of dummy data. + +The built-in modalities are defined by +[`MultiModalDummyOptionsBuiltins`][vllm.config.multimodal.MultiModalDummyOptionsBuiltins]. +""" @config @@ -57,7 +75,7 @@ class MultiModalConfig: language_model_only: bool = False """If True, disables all multimodal inputs by setting all modality limits to 0. Equivalent to setting `--limit-mm-per-prompt` to 0 for every modality.""" - limit_per_prompt: dict[str, DummyOptions] = Field(default_factory=dict) + limit_per_prompt: MMDummyOptions = Field(default_factory=dict) """The maximum number of input items and options allowed per prompt for each modality. @@ -158,22 +176,27 @@ class MultiModalConfig: @field_validator("limit_per_prompt", mode="before") @classmethod def _validate_limit_per_prompt( - cls, value: dict[str, int | dict[str, int]] - ) -> dict[str, DummyOptions]: + cls, + value: dict[str, int | dict[str, int]], + ) -> MMDummyOptions: + out: MMDummyOptions = {} + for k, v in value.items(): # Handle legacy format where only count is specified if isinstance(v, int): v = {"count": v} + # Convert to the appropriate DummyOptions subclass if k == "video": - value[k] = VideoDummyOptions(**v) + out[k] = VideoDummyOptions(**v) elif k == "image": - value[k] = ImageDummyOptions(**v) + out[k] = ImageDummyOptions(**v) elif k == "audio": - value[k] = AudioDummyOptions(**v) + out[k] = AudioDummyOptions(**v) else: - value[k] = BaseDummyOptions(**v) - return value + out[k] = BaseDummyOptions(**v) + + return out @field_validator("mm_encoder_attn_backend", mode="before") @classmethod @@ -240,15 +263,8 @@ class MultiModalConfig: if limit_data is None: # Unspecified modality is set to 999 by default return 999 - return limit_data.count - def get_dummy_options(self, modality: str) -> BaseDummyOptions | None: - """ - Get the configurable dummy data options for a modality. - Returns None if no options are configured for this modality. - """ - # All values are now DummyOptions after normalization - return self.limit_per_prompt.get(modality) + return limit_data.count def merge_mm_processor_kwargs( self, diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index fc1720296..908581786 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -444,15 +444,14 @@ class AriaDummyInputsBuilder(BaseDummyInputsBuilder[AriaProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: vision_config = self.info.get_vision_config() max_image_size = vision_config.image_size num_images = mm_counts.get("image", 0) - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/audioflamingo3.py b/vllm/model_executor/models/audioflamingo3.py index 111b99461..e56997fb7 100644 --- a/vllm/model_executor/models/audioflamingo3.py +++ b/vllm/model_executor/models/audioflamingo3.py @@ -252,16 +252,13 @@ class AudioFlamingo3DummyInputsBuilder( self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: - feature_extractor = self.info.get_feature_extractor( - **(mm_processor_kwargs or {}) - ) + feature_extractor = self.info.get_feature_extractor() sampling_rate = feature_extractor.sampling_rate audio_len = MAX_AUDIO_LEN * sampling_rate num_audios = mm_counts.get("audio", 0) - audio_overrides = mm_options.get("audio") if mm_options else None + audio_overrides = mm_options.get("audio") return { "audio": self._get_dummy_audios( diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index ce3b990c3..c1806beec 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -191,13 +191,12 @@ class AyaVisionDummyInputsBuilder(BaseDummyInputsBuilder[AyaVisionProcessingInfo self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) image_size = self.info.get_image_size_with_most_features() - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/bagel.py b/vllm/model_executor/models/bagel.py index 657e8cefb..425342e8b 100644 --- a/vllm/model_executor/models/bagel.py +++ b/vllm/model_executor/models/bagel.py @@ -249,8 +249,7 @@ class BagelDummyInputsBuilder(BaseDummyInputsBuilder[BagelProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) hf_config = self.info.get_hf_config() @@ -258,7 +257,7 @@ class BagelDummyInputsBuilder(BaseDummyInputsBuilder[BagelProcessingInfo]): # Use the configured image size image_size = vit_config.image_size - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/bee.py b/vllm/model_executor/models/bee.py index 5c3a1a4f1..ecb645edf 100644 --- a/vllm/model_executor/models/bee.py +++ b/vllm/model_executor/models/bee.py @@ -90,14 +90,13 @@ class BeeDummyInputsBuilder(LlavaDummyInputsBuilder[BeeProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = self.info.get_image_size_with_most_features() - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index fe9db19ea..8f79c1aae 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -445,8 +445,7 @@ class Blip2DummyInputsBuilder(BaseDummyInputsBuilder[Blip2ProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: hf_config = self.info.get_hf_config() vision_config = hf_config.vision_config @@ -454,7 +453,7 @@ class Blip2DummyInputsBuilder(BaseDummyInputsBuilder[Blip2ProcessingInfo]): max_image_size = vision_config.image_size num_images = mm_counts.get("image", 0) - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 2c21d70ed..e09a4eac7 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -116,15 +116,14 @@ class ChameleonDummyInputsBuilder(BaseDummyInputsBuilder[ChameleonProcessingInfo self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: config = self.info.get_hf_config() width = height = config.vq_config.resolution num_images = mm_counts.get("image", 0) - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 556c68fc1..63c84e890 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -174,14 +174,13 @@ class CLIPDummyInputsBuilder(BaseDummyInputsBuilder[CLIPProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = self.info.get_image_size_with_most_features() - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py index 1bcdd41b3..69b2abb5f 100644 --- a/vllm/model_executor/models/cohere2_vision.py +++ b/vllm/model_executor/models/cohere2_vision.py @@ -197,13 +197,12 @@ class Cohere2VisionDummyInputsBuilder( self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) image_size = self.info.get_image_size_with_most_features() - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/colmodernvbert.py b/vllm/model_executor/models/colmodernvbert.py index 29efb4a5f..ecb243ced 100644 --- a/vllm/model_executor/models/colmodernvbert.py +++ b/vllm/model_executor/models/colmodernvbert.py @@ -132,12 +132,12 @@ class ColModernVBertDummyInputsBuilder( self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = self.info.get_image_size_with_most_features() - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") + return { "image": self._get_dummy_images( width=target_width, diff --git a/vllm/model_executor/models/deepseek_ocr.py b/vllm/model_executor/models/deepseek_ocr.py index 8293d2ece..b0fba01a4 100644 --- a/vllm/model_executor/models/deepseek_ocr.py +++ b/vllm/model_executor/models/deepseek_ocr.py @@ -255,8 +255,7 @@ class DeepseekOCRDummyInputsBuilder(BaseDummyInputsBuilder[DeepseekOCRProcessing self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) diff --git a/vllm/model_executor/models/deepseek_ocr2.py b/vllm/model_executor/models/deepseek_ocr2.py index 6ababf9f2..b57aeeabd 100644 --- a/vllm/model_executor/models/deepseek_ocr2.py +++ b/vllm/model_executor/models/deepseek_ocr2.py @@ -137,8 +137,7 @@ class DeepseekOCR2DummyInputsBuilder( self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index e0de49fb6..79279b9d5 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -214,14 +214,13 @@ class DeepseekVL2DummyInputsBuilder(BaseDummyInputsBuilder[DeepseekVL2Processing self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) max_image_size = self.info.get_image_size_with_most_features() - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py index 4d8acb082..25b4087d3 100644 --- a/vllm/model_executor/models/dots_ocr.py +++ b/vllm/model_executor/models/dots_ocr.py @@ -106,17 +106,13 @@ class DotsOCRDummyInputsBuilder(Qwen2VLDummyInputsBuilder): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) - mm_processor_kwargs = mm_processor_kwargs or {} - target_width, target_height = self.info.get_image_size_with_most_features( # noqa: E501 - mm_processor_kwargs.get("max_pixels", None) - ) + target_width, target_height = self.info.get_image_size_with_most_features() - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py index ab1386e08..1df4adfac 100644 --- a/vllm/model_executor/models/ernie45_vl.py +++ b/vllm/model_executor/models/ernie45_vl.py @@ -1168,8 +1168,7 @@ class Ernie4_5_VLDummyInputsBuilder(BaseDummyInputsBuilder[Ernie4_5_VLProcessing self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) @@ -1179,8 +1178,8 @@ class Ernie4_5_VLDummyInputsBuilder(BaseDummyInputsBuilder[Ernie4_5_VLProcessing seq_len, mm_counts ) - image_overrides = mm_options.get("image") if mm_options else None - video_overrides = mm_options.get("video") if mm_options else None + image_overrides = mm_options.get("image") + video_overrides = mm_options.get("video") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/funasr.py b/vllm/model_executor/models/funasr.py index a1c70e10e..25ede72f1 100644 --- a/vllm/model_executor/models/funasr.py +++ b/vllm/model_executor/models/funasr.py @@ -746,23 +746,22 @@ class FunASRDummyInputsBuilder(BaseDummyInputsBuilder[FunASRProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: - feature_extractor = self.info.get_feature_extractor( - **(mm_processor_kwargs or {}) - ) + feature_extractor = self.info.get_feature_extractor() sampling_rate = feature_extractor.sampling_rate audio_len = feature_extractor.chunk_length * sampling_rate num_audios = mm_counts.get("audio", 0) - audio_overrides = mm_options.get("audio") if mm_options else None + audio_overrides = mm_options.get("audio") return { "audio": self._get_dummy_audios( - length=audio_len, num_audios=num_audios, overrides=audio_overrides - ) + length=audio_len, + num_audios=num_audios, + overrides=audio_overrides, + ), } diff --git a/vllm/model_executor/models/funaudiochat.py b/vllm/model_executor/models/funaudiochat.py index a89a5c104..5bcb49e07 100644 --- a/vllm/model_executor/models/funaudiochat.py +++ b/vllm/model_executor/models/funaudiochat.py @@ -610,12 +610,9 @@ class FunAudioChatDummyInputsBuilder( self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: - feature_extractor = self.info.get_feature_extractor( - **(mm_processor_kwargs or {}) - ) + feature_extractor = self.info.get_feature_extractor() sampling_rate = int(feature_extractor.sampling_rate) # Dummy inputs are used for profiling; construct the worst-case audio @@ -632,7 +629,7 @@ class FunAudioChatDummyInputsBuilder( ) num_audios = int(mm_counts.get("audio", 0)) - audio_overrides = mm_options.get("audio") if mm_options else None + audio_overrides = mm_options.get("audio") return { "audio": self._get_dummy_audios( length=audio_len, diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index c4f1118f7..cc15cee59 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -142,13 +142,12 @@ class FuyuDummyInputsBuilder(BaseDummyInputsBuilder[FuyuProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: target_width, target_height = self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index d0a326ccd..83a1ae52e 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -241,14 +241,13 @@ class Gemma3DummyInputsBuilder(BaseDummyInputsBuilder[Gemma3ProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = self.info.get_image_size_with_most_features() - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py index 3e4745f7c..ab5d4ae46 100644 --- a/vllm/model_executor/models/gemma3n_mm.py +++ b/vllm/model_executor/models/gemma3n_mm.py @@ -175,8 +175,7 @@ class Gemma3nDummyInputsBuilder(BaseDummyInputsBuilder[Gemma3nProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_audios = mm_counts.get("audio", 0) @@ -189,8 +188,8 @@ class Gemma3nDummyInputsBuilder(BaseDummyInputsBuilder[Gemma3nProcessingInfo]): img_width = image_processor.size.get("width", 224) img_height = image_processor.size.get("height", 224) - image_overrides = mm_options.get("image") if mm_options else None - audio_overrides = mm_options.get("audio") if mm_options else None + image_overrides = mm_options.get("image") + audio_overrides = mm_options.get("audio") return { "image": self._get_dummy_images( @@ -200,7 +199,9 @@ class Gemma3nDummyInputsBuilder(BaseDummyInputsBuilder[Gemma3nProcessingInfo]): overrides=image_overrides, ), "audio": self._get_dummy_audios( - length=audio_len, num_audios=num_audios, overrides=audio_overrides + length=audio_len, + num_audios=num_audios, + overrides=audio_overrides, ), } diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index a85d5e6f9..ff76a26bb 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -1163,8 +1163,7 @@ class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) @@ -1174,8 +1173,8 @@ class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]): seq_len, mm_counts ) - image_overrides = mm_options.get("image") if mm_options else None - video_overrides = mm_options.get("video") if mm_options else None + image_overrides = mm_options.get("image") + video_overrides = mm_options.get("video") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py index 4d86900e9..3513419cb 100644 --- a/vllm/model_executor/models/glm4v.py +++ b/vllm/model_executor/models/glm4v.py @@ -492,8 +492,7 @@ class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: hf_config = self.info.get_hf_config() vision_config = hf_config.vision_config @@ -501,7 +500,7 @@ class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]): target_width = target_height = vision_config["image_size"] num_images = mm_counts.get("image", 0) - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/glmasr.py b/vllm/model_executor/models/glmasr.py index b7d67b1e4..fd47a014a 100644 --- a/vllm/model_executor/models/glmasr.py +++ b/vllm/model_executor/models/glmasr.py @@ -726,15 +726,12 @@ class GlmAsrDummyInputsBuilder(BaseDummyInputsBuilder[GlmAsrProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: - feature_extractor = self.info.get_feature_extractor( - **(mm_processor_kwargs or {}) - ) + feature_extractor = self.info.get_feature_extractor() sampling_rate = feature_extractor.sampling_rate num_audios = mm_counts.get("audio", 0) - audio_overrides = mm_options.get("audio") if mm_options else None + audio_overrides = mm_options.get("audio") max_audio_len = getattr( self.info.get_hf_processor(), "max_audio_len", DEFAULT_MAX_AUDIO_LEN_S @@ -743,7 +740,9 @@ class GlmAsrDummyInputsBuilder(BaseDummyInputsBuilder[GlmAsrProcessingInfo]): return { "audio": self._get_dummy_audios( - length=audio_len, num_audios=num_audios, overrides=audio_overrides + length=audio_len, + num_audios=num_audios, + overrides=audio_overrides, ) } diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index 9d37a0683..393a2be34 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py @@ -216,11 +216,10 @@ class GraniteSpeechDummyInputsBuilder( self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_audios = mm_counts.get("audio", 0) - audio_overrides = mm_options.get("audio") if mm_options else None + audio_overrides = mm_options.get("audio") return { "audio": self._get_dummy_audios( diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py index 50b6bd427..3f2d0e7dd 100644 --- a/vllm/model_executor/models/hunyuan_vision.py +++ b/vllm/model_executor/models/hunyuan_vision.py @@ -713,8 +713,7 @@ class HunYuanVLDummyInputsBuilder(BaseDummyInputsBuilder[HunYuanVLProcessingInfo self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 1) diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py index ea10d764f..1fb0d5e5d 100644 --- a/vllm/model_executor/models/hyperclovax_vision.py +++ b/vllm/model_executor/models/hyperclovax_vision.py @@ -165,8 +165,7 @@ class HCXVisionDummyInputsBuilder(BaseDummyInputsBuilder[HCXVisionProcessingInfo self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) @@ -174,8 +173,8 @@ class HCXVisionDummyInputsBuilder(BaseDummyInputsBuilder[HCXVisionProcessingInfo target_width, target_height = self.info.get_image_size_with_most_features() target_num_frames = 32 - image_overrides = mm_options.get("image") if mm_options else None - video_overrides = mm_options.get("video") if mm_options else None + image_overrides = mm_options.get("image") + video_overrides = mm_options.get("video") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 434bc7318..a59c45654 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -277,15 +277,14 @@ class Idefics3DummyInputsBuilder(BaseDummyInputsBuilder[Idefics3ProcessingInfo]) self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) - hf_processor = self.info.get_hf_processor(**(mm_processor_kwargs or {})) + hf_processor = self.info.get_hf_processor() image_processor: Idefics3ImageProcessor = hf_processor.image_processor longest_edge = image_processor.max_image_size["longest_edge"] - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py index 5e973aa83..549f3ee54 100644 --- a/vllm/model_executor/models/interns1.py +++ b/vllm/model_executor/models/interns1.py @@ -297,8 +297,7 @@ class InternS1DummyInputsBuilder(BaseDummyInputsBuilder[InternS1ProcessingInfo]) self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: target_width, target_height = self.info.get_image_size_with_most_features() target_num_frames = self.info.get_num_frames_with_most_features( @@ -310,8 +309,8 @@ class InternS1DummyInputsBuilder(BaseDummyInputsBuilder[InternS1ProcessingInfo]) config = self.info.get_hf_config() image_size_h, image_size_w = config.vision_config.image_size - image_overrides = mm_options.get("image") if mm_options else None - video_overrides = mm_options.get("video") if mm_options else None + image_overrides = mm_options.get("image") + video_overrides = mm_options.get("video") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 7fbbb7237..a696d2129 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -762,13 +762,12 @@ class BaseInternVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: target_width, target_height = self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( @@ -935,12 +934,9 @@ class InternVLDummyInputsBuilder( self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: - dummy_image = super().get_dummy_mm_data( - seq_len=seq_len, mm_counts=mm_counts, mm_options=mm_options - ) + dummy_image = super().get_dummy_mm_data(seq_len, mm_counts, mm_options) if self.info.supports_video: config = self.info.get_hf_config() image_size: int = config.vision_config.image_size @@ -948,7 +944,7 @@ class InternVLDummyInputsBuilder( seq_len, mm_counts ) num_videos = mm_counts.get("video", 0) - video_overrides = mm_options.get("video") if mm_options else None + video_overrides = mm_options.get("video") dummy_video = { "video": self._get_dummy_videos( width=image_size, diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py index 8ed9ddda4..f4f7ce459 100644 --- a/vllm/model_executor/models/isaac.py +++ b/vllm/model_executor/models/isaac.py @@ -18,6 +18,7 @@ from typing_extensions import TypedDict, Unpack from vllm.config import VllmConfig from vllm.config.model import ModelConfig +from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils from vllm.model_executor.layers.attention import MMEncoderAttention @@ -849,13 +850,12 @@ class IsaacDummyInputsBuilder(BaseDummyInputsBuilder[IsaacProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = self.info.get_image_size_with_most_features() - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/kanana_v.py b/vllm/model_executor/models/kanana_v.py index b679241b5..991fa28d9 100644 --- a/vllm/model_executor/models/kanana_v.py +++ b/vllm/model_executor/models/kanana_v.py @@ -444,8 +444,7 @@ class KananaVDummyInputsBuilder(BaseDummyInputsBuilder[KananaVProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) return { diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index 2ae044c28..2cb7dc425 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -1170,8 +1170,7 @@ class KeyeBaseDummyInputsBuilder(BaseDummyInputsBuilder[_I]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) @@ -1179,8 +1178,8 @@ class KeyeBaseDummyInputsBuilder(BaseDummyInputsBuilder[_I]): target_width, target_height = self.info.get_image_size_with_most_features() target_num_frames = self.info.get_num_frames_with_most_features(seq_len) - image_overrides = mm_options.get("image") if mm_options else None - video_overrides = mm_options.get("video") if mm_options else None + image_overrides = mm_options.get("image") + video_overrides = mm_options.get("video") mm_data = { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/kimi_k25.py b/vllm/model_executor/models/kimi_k25.py index 9d287ba9b..248339337 100644 --- a/vllm/model_executor/models/kimi_k25.py +++ b/vllm/model_executor/models/kimi_k25.py @@ -240,8 +240,7 @@ class KimiK25DummyInputsBuilder(BaseDummyInputsBuilder[KimiK25ProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: # TODO: Support mm_options for vision_chunk to allow user configuration dummy_items = self.get_dummy_mm_items() diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py index e280f8245..5da8ef980 100644 --- a/vllm/model_executor/models/kimi_vl.py +++ b/vllm/model_executor/models/kimi_vl.py @@ -215,12 +215,11 @@ class KimiVLDummyInputsBuilder(BaseDummyInputsBuilder[KimiVLProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/lfm2_vl.py b/vllm/model_executor/models/lfm2_vl.py index 3355e4016..86cd5546b 100644 --- a/vllm/model_executor/models/lfm2_vl.py +++ b/vllm/model_executor/models/lfm2_vl.py @@ -343,14 +343,13 @@ class Lfm2VLDummyInputsBuilder(BaseDummyInputsBuilder[Lfm2VLProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = self.info.get_image_size_with_most_features() - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index c8ca1815d..e6eb268d6 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -232,14 +232,13 @@ class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = self.info.get_image_size_with_most_features() - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index 6696a0009..54558e123 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -165,8 +165,7 @@ class LlavaNextVideoDummyInputsBuilder( self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_videos = mm_counts.get("video", 0) @@ -175,7 +174,7 @@ class LlavaNextVideoDummyInputsBuilder( seq_len, mm_counts ) - video_overrides = mm_options.get("video") if mm_options else None + video_overrides = mm_options.get("video") return { "video": self._get_dummy_videos( diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 290ace8bf..f747df09c 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -276,8 +276,7 @@ class LlavaOnevisionDummyInputsBuilder( self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) @@ -287,8 +286,8 @@ class LlavaOnevisionDummyInputsBuilder( seq_len, mm_counts ) - image_overrides = mm_options.get("image") if mm_options else None - video_overrides = mm_options.get("video") if mm_options else None + image_overrides = mm_options.get("image") + video_overrides = mm_options.get("video") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/midashenglm.py b/vllm/model_executor/models/midashenglm.py index 4bba0ad71..08b955c81 100644 --- a/vllm/model_executor/models/midashenglm.py +++ b/vllm/model_executor/models/midashenglm.py @@ -565,12 +565,11 @@ class MiDashengLMDummyInputsBuilder(BaseDummyInputsBuilder[MiDashengLMProcessing self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_audios = mm_counts.get("audio", 0) - audio_overrides = mm_options.get("audio") if mm_options else None + audio_overrides = mm_options.get("audio") return { "audio": self._get_dummy_audios( diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py index 33df0f785..f176e50f8 100644 --- a/vllm/model_executor/models/minicpmo.py +++ b/vllm/model_executor/models/minicpmo.py @@ -301,8 +301,7 @@ class MiniCPMODummyInputsBuilder(MiniCPMVDummyInputsBuilder[MiniCPMOProcessingIn self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_audios = mm_counts.get("audio", 0) audio_len = ( @@ -310,11 +309,13 @@ class MiniCPMODummyInputsBuilder(MiniCPMVDummyInputsBuilder[MiniCPMOProcessingIn * self.info.get_default_audio_sampling_rate() ) - audio_overrides = mm_options.get("audio") if mm_options else None + audio_overrides = mm_options.get("audio") audio_mm_data = { "audio": self._get_dummy_audios( - length=audio_len, num_audios=num_audios, overrides=audio_overrides + length=audio_len, + num_audios=num_audios, + overrides=audio_overrides, ) } diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 6a1686100..784a03a60 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -707,8 +707,7 @@ class MiniCPMVDummyInputsBuilder(BaseDummyInputsBuilder[_I]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) @@ -719,8 +718,8 @@ class MiniCPMVDummyInputsBuilder(BaseDummyInputsBuilder[_I]): seq_len, mm_counts ) - image_overrides = mm_options.get("image") if mm_options else None - video_overrides = mm_options.get("video") if mm_options else None + image_overrides = mm_options.get("image") + video_overrides = mm_options.get("video") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index 33d94e9ff..787fdf900 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -236,14 +236,13 @@ class Mistral3DummyInputsBuilder(BaseDummyInputsBuilder[_I]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = self.info.get_image_size_with_most_features() - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 6b3ca695a..b08810892 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -707,14 +707,13 @@ class Mllama4DummyInputsBuilder(BaseDummyInputsBuilder[Mllama4ProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) (target_width, target_height) = self.info.get_image_size_with_most_features() - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index b3689ed19..ba6d569b7 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -1274,13 +1274,12 @@ class MolmoDummyInputsBuilder(BaseDummyInputsBuilder[MolmoProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: target_width, target_height = self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/molmo2.py b/vllm/model_executor/models/molmo2.py index d32c034b5..b2e91616a 100644 --- a/vllm/model_executor/models/molmo2.py +++ b/vllm/model_executor/models/molmo2.py @@ -2082,8 +2082,7 @@ class Molmo2DummyInputsBuilder(BaseDummyInputsBuilder[Molmo2ProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) @@ -2094,7 +2093,7 @@ class Molmo2DummyInputsBuilder(BaseDummyInputsBuilder[Molmo2ProcessingInfo]): if num_images > 0: target_width, target_height = self.info.get_image_size_with_most_features() - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") dummy_images = self._get_dummy_images( width=target_width, @@ -2110,7 +2109,7 @@ class Molmo2DummyInputsBuilder(BaseDummyInputsBuilder[Molmo2ProcessingInfo]): seq_len, mm_counts ) - video_overrides = mm_options.get("video") if mm_options else None + video_overrides = mm_options.get("video") if video_overrides: assert isinstance(video_overrides, VideoDummyOptions) diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index b4c5f6e64..46cf7fe97 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -1388,8 +1388,7 @@ class NanoNemotronVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) processor = self.info.get_hf_processor() @@ -1404,7 +1403,7 @@ class NanoNemotronVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]): max_num_tiles ) - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( @@ -1461,12 +1460,9 @@ class NanoNemotronVLDummyInputsBuilder( self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: - dummy_image = super().get_dummy_mm_data( - seq_len=seq_len, mm_counts=mm_counts, mm_options=mm_options - ) + dummy_image = super().get_dummy_mm_data(seq_len, mm_counts, mm_options) if self.info.supports_video: config = self.info.get_hf_config() image_size: int = config.force_image_size @@ -1474,7 +1470,7 @@ class NanoNemotronVLDummyInputsBuilder( seq_len, mm_counts ) num_videos = mm_counts.get("video", 0) - video_overrides = mm_options.get("video") if mm_options else None + video_overrides = mm_options.get("video") dummy_video = { "video": self._get_dummy_videos( width=image_size, diff --git a/vllm/model_executor/models/nemotron_parse.py b/vllm/model_executor/models/nemotron_parse.py index 813675a92..fc300a2f9 100644 --- a/vllm/model_executor/models/nemotron_parse.py +++ b/vllm/model_executor/models/nemotron_parse.py @@ -645,8 +645,7 @@ class NemotronParseDummyInputsBuilder( self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py index 840918953..ead24a4e9 100644 --- a/vllm/model_executor/models/nvlm_d.py +++ b/vllm/model_executor/models/nvlm_d.py @@ -92,13 +92,12 @@ class NVLMDummyInputsBuilder(BaseInternVLDummyInputsBuilder[NVLMProcessingInfo]) self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: target_width, target_height = self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py index 990197cc6..2807c634b 100644 --- a/vllm/model_executor/models/ovis.py +++ b/vllm/model_executor/models/ovis.py @@ -306,14 +306,13 @@ class OvisDummyInputsBuilder(BaseDummyInputsBuilder[OvisProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = self.info.get_image_size_with_most_features() - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") mm_data = { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/ovis2_5.py b/vllm/model_executor/models/ovis2_5.py index 9f2098a95..2d9385c57 100644 --- a/vllm/model_executor/models/ovis2_5.py +++ b/vllm/model_executor/models/ovis2_5.py @@ -287,8 +287,7 @@ class Ovis2_5DummyInputsBuilder(BaseDummyInputsBuilder[Ovis2_5ProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) @@ -298,8 +297,8 @@ class Ovis2_5DummyInputsBuilder(BaseDummyInputsBuilder[Ovis2_5ProcessingInfo]): seq_len, mm_counts ) - image_overrides = mm_options.get("image") if mm_options else None - video_overrides = mm_options.get("video") if mm_options else None + image_overrides = mm_options.get("image") + video_overrides = mm_options.get("video") mm_data = { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py index 2bbe7e850..6c9304101 100644 --- a/vllm/model_executor/models/paddleocr_vl.py +++ b/vllm/model_executor/models/paddleocr_vl.py @@ -206,13 +206,12 @@ class PaddleOCRVLDummyInputsBuilder(BaseDummyInputsBuilder[PaddleOCRVLProcessing self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) max_image_size = self.info.get_image_size_with_most_features() - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 37beaffef..458bcfa3c 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -131,8 +131,7 @@ class PaliGemmaDummyInputsBuilder(BaseDummyInputsBuilder[PaliGemmaProcessingInfo self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: hf_config = self.info.get_hf_config() vision_config = hf_config.vision_config @@ -140,7 +139,7 @@ class PaliGemmaDummyInputsBuilder(BaseDummyInputsBuilder[PaliGemmaProcessingInfo num_images = mm_counts.get("image", 0) - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index a5a346e72..1466e3861 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -376,14 +376,13 @@ class Phi3VDummyInputsBuilder(BaseDummyInputsBuilder[Phi3VProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = self.info.get_image_size_with_most_features() - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 89676a9a7..5ccac92e3 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -822,16 +822,15 @@ class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_audios = mm_counts.get("audio", 0) num_images = mm_counts.get("image", 0) target_width, target_height = self.info.get_image_size_with_most_features() - image_overrides = mm_options.get("image") if mm_options else None - audio_overrides = mm_options.get("audio") if mm_options else None + image_overrides = mm_options.get("image") + audio_overrides = mm_options.get("audio") mm_data = { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 0cfa8b6a3..ae714dea2 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -249,14 +249,13 @@ class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = self.info.get_image_size_with_most_features() - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( @@ -271,8 +270,7 @@ class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> ProcessorInputs: tokenizer = self.info.get_tokenizer() diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index 974de8068..977b522b5 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -357,15 +357,13 @@ class Qwen2_5OmniThinkerDummyInputsBuilder( self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_audios = mm_counts.get("audio", 0) num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) - mm_processor_kwargs = mm_processor_kwargs or {} - feature_extractor = self.info.get_feature_extractor(**mm_processor_kwargs) + feature_extractor = self.info.get_feature_extractor() target_audio_length = ( min( @@ -375,16 +373,14 @@ class Qwen2_5OmniThinkerDummyInputsBuilder( * feature_extractor.sampling_rate ) - target_width, target_height = self.info.get_image_size_with_most_features( - max_pixels=mm_processor_kwargs.get("max_pixels", None), - ) + target_width, target_height = self.info.get_image_size_with_most_features() target_num_frames = self.info.get_num_frames_with_most_features( seq_len, mm_counts ) - image_overrides = mm_options.get("image") if mm_options else None - video_overrides = mm_options.get("video") if mm_options else None - audio_overrides = mm_options.get("audio") if mm_options else None + image_overrides = mm_options.get("image") + video_overrides = mm_options.get("video") + audio_overrides = mm_options.get("audio") mm_data = { "audio": self._get_dummy_audios( diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 52c798e83..053e8bb85 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -195,22 +195,21 @@ class Qwen2AudioDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2AudioProcessingIn self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: - feature_extractor = self.info.get_feature_extractor( - **(mm_processor_kwargs or {}) - ) + feature_extractor = self.info.get_feature_extractor() sampling_rate = feature_extractor.sampling_rate audio_len = feature_extractor.chunk_length * sampling_rate num_audios = mm_counts.get("audio", 0) - audio_overrides = mm_options.get("audio") if mm_options else None + audio_overrides = mm_options.get("audio") return { "audio": self._get_dummy_audios( - length=audio_len, num_audios=num_audios, overrides=audio_overrides + length=audio_len, + num_audios=num_audios, + overrides=audio_overrides, ) } diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index c530493b1..eed559bcb 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -925,9 +925,14 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo): vision_config = hf_config.vision_config patch_size = vision_config.patch_size merge_size = vision_config.spatial_merge_size + if max_pixels is None: image_processor = self.get_image_processor() - max_pixels = image_processor.size["longest_edge"] + + mm_kwargs = self.ctx.get_merged_mm_kwargs({}) + size = mm_kwargs.get("size", image_processor.size) + max_pixels = size["longest_edge"] + unit = patch_size * merge_size max_seq_len = max_pixels // (unit * unit) @@ -1027,22 +1032,18 @@ class Qwen2VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2VLProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) - mm_processor_kwargs = mm_processor_kwargs or {} - target_width, target_height = self.info.get_image_size_with_most_features( - max_pixels=mm_processor_kwargs.get("max_pixels", None) - ) + target_width, target_height = self.info.get_image_size_with_most_features() target_num_frames = self.info.get_num_frames_with_most_features( seq_len, mm_counts ) - image_overrides = mm_options.get("image") if mm_options else None - video_overrides = mm_options.get("video") if mm_options else None + image_overrides = mm_options.get("image") + video_overrides = mm_options.get("video") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/qwen3_asr.py b/vllm/model_executor/models/qwen3_asr.py index 5f56088cb..443da955d 100644 --- a/vllm/model_executor/models/qwen3_asr.py +++ b/vllm/model_executor/models/qwen3_asr.py @@ -146,14 +146,11 @@ class Qwen3ASRDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3ASRProcessingInfo]) self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_audios = mm_counts.get("audio", 0) - feature_extractor = self.info.get_feature_extractor( - **(mm_processor_kwargs or {}) - ) + feature_extractor = self.info.get_feature_extractor() target_audio_length = ( min( @@ -163,7 +160,7 @@ class Qwen3ASRDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3ASRProcessingInfo]) * feature_extractor.sampling_rate ) - audio_overrides = mm_options.get("audio") if mm_options else None + audio_overrides = mm_options.get("audio") return { "audio": self._get_dummy_audios( diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index abb38a648..1a017e561 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -703,11 +703,18 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo): mm_counts: Mapping[str, int], ) -> int: video_processor = self.get_video_processor() - video_max_pixels = video_processor.size["longest_edge"] + + mm_kwargs = self.ctx.get_merged_mm_kwargs({}) + video_size = mm_kwargs.get("size", video_processor.size) + temporal_patch_size = mm_kwargs.get( + "temporal_patch_size", video_processor.temporal_patch_size + ) + # video_max_pixels contains the temporal compression factor, # so we divide by 2 to get the maximum number of image pixels. + video_max_pixels = video_size["longest_edge"] target_width, target_height = self.get_image_size_with_most_features( - max_pixels=video_max_pixels // video_processor.temporal_patch_size + max_pixels=video_max_pixels // temporal_patch_size ) num_video_soft_tokens = self.get_num_video_tokens( image_width=target_width, @@ -789,19 +796,15 @@ class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) - image_overrides = mm_options.get("image") if mm_options else None - video_overrides = mm_options.get("video") if mm_options else None + image_overrides = mm_options.get("image") + video_overrides = mm_options.get("video") - mm_processor_kwargs = mm_processor_kwargs or {} target_image_width, target_image_height = ( - self.info.get_image_size_with_most_features( - max_pixels=mm_processor_kwargs.get("max_pixels", None), - ) + self.info.get_image_size_with_most_features() ) # treat videos as special images @@ -826,13 +829,20 @@ class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]): target_num_frames = min(target_num_frames, num_frames_override) target_num_frames = max(target_num_frames, 2) - video_processor = self.info.get_video_processor(**(mm_processor_kwargs or {})) - video_max_pixels = video_processor.size["longest_edge"] + video_processor = self.info.get_video_processor() + + mm_kwargs = self.info.ctx.get_merged_mm_kwargs({}) + video_size = mm_kwargs.get("size", video_processor.size) + temporal_patch_size = mm_kwargs.get( + "temporal_patch_size", video_processor.temporal_patch_size + ) + # video_max_pixels contains the temporal compression factor, # so we divide by 2 to get the maximum number of image pixels. + video_max_pixels = video_size["longest_edge"] target_video_width, target_video_height = ( self.info.get_image_size_with_most_features( - max_pixels=video_max_pixels // video_processor.temporal_patch_size + max_pixels=video_max_pixels // temporal_patch_size ) ) target_video_size, _ = self.info._get_vision_info( diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py index 66b669a9c..8ac541f73 100644 --- a/vllm/model_executor/models/qwen_vl.py +++ b/vllm/model_executor/models/qwen_vl.py @@ -617,8 +617,7 @@ class QwenVLDummyInputsBuilder(BaseDummyInputsBuilder[QwenVLProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: hf_config = self.info.get_hf_config() vision_config = hf_config.visual @@ -626,7 +625,7 @@ class QwenVLDummyInputsBuilder(BaseDummyInputsBuilder[QwenVLProcessingInfo]): target_width = target_height = vision_config["image_size"] num_images = mm_counts.get("image", 0) - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/rvl.py b/vllm/model_executor/models/rvl.py index f6ddaa8fa..72f68659c 100644 --- a/vllm/model_executor/models/rvl.py +++ b/vllm/model_executor/models/rvl.py @@ -40,14 +40,13 @@ class RVLDummyInputsBuilder(LlavaDummyInputsBuilder[RVLProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = self.info.get_image_size_with_most_features() - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 8e07a90e8..c31515130 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -158,14 +158,13 @@ class SiglipDummyInputsBuilder(BaseDummyInputsBuilder[SiglipProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = self.info.get_image_size_with_most_features() - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py index acedb04bc..0003fbfde 100644 --- a/vllm/model_executor/models/skyworkr1v.py +++ b/vllm/model_executor/models/skyworkr1v.py @@ -529,13 +529,12 @@ class SkyworkR1VDummyInputsBuilder(BaseDummyInputsBuilder[SkyworkR1VProcessingIn self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: target_width, target_height = self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index 8050f6b85..eee1130cc 100644 --- a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -564,13 +564,12 @@ class Step3VLDummyInputsBuilder(BaseDummyInputsBuilder[Step3VLProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: target_width, target_height = self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/terratorch.py b/vllm/model_executor/models/terratorch.py index 1cf65abd6..a3a4030af 100644 --- a/vllm/model_executor/models/terratorch.py +++ b/vllm/model_executor/models/terratorch.py @@ -154,8 +154,7 @@ class TerratorchInputBuilder(BaseDummyInputsBuilder[TerratorchProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: # Dummy data is generated based on the 'input' section # defined in the HF configuration file diff --git a/vllm/model_executor/models/transformers/multimodal.py b/vllm/model_executor/models/transformers/multimodal.py index 3b1eb7db8..a645679e0 100644 --- a/vllm/model_executor/models/transformers/multimodal.py +++ b/vllm/model_executor/models/transformers/multimodal.py @@ -101,14 +101,13 @@ class MultiModalDummyInputsBuilder(BaseDummyInputsBuilder[MultiModalProcessingIn self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, "BaseDummyOptions"] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, "BaseDummyOptions"], ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) target_width, target_height = self.info.get_max_image_size() - image_overrides = mm_options.get("image") if mm_options else None + image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index cf8267d20..4ac636110 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -164,12 +164,9 @@ class UltravoxDummyInputsBuilder(BaseDummyInputsBuilder[UltravoxProcessingInfo]) self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: - feature_extractor = self.info.get_feature_extractor( - **(mm_processor_kwargs or {}) - ) + feature_extractor = self.info.get_feature_extractor() sampling_rate = feature_extractor.sampling_rate audio_len = ( @@ -177,11 +174,13 @@ class UltravoxDummyInputsBuilder(BaseDummyInputsBuilder[UltravoxProcessingInfo]) ) num_audios = mm_counts.get("audio", 0) - audio_overrides = mm_options.get("audio") if mm_options else None + audio_overrides = mm_options.get("audio") return { "audio": self._get_dummy_audios( - length=audio_len, num_audios=num_audios, overrides=audio_overrides + length=audio_len, + num_audios=num_audios, + overrides=audio_overrides, ) } diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index a4dcc1b41..8cbba09d4 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -218,18 +218,19 @@ class VoxtralDummyInputsBuilder(BaseDummyInputsBuilder[VoxtralProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: num_audios = mm_counts.get("audio", 0) target_length = self.info.get_max_audio_array_len() - audio_overrides = mm_options.get("audio") if mm_options else None + audio_overrides = mm_options.get("audio") return { "audio": self._get_dummy_audios( - length=target_length, num_audios=num_audios, overrides=audio_overrides + length=target_length, + num_audios=num_audios, + overrides=audio_overrides, ) } @@ -237,8 +238,7 @@ class VoxtralDummyInputsBuilder(BaseDummyInputsBuilder[VoxtralProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> ProcessorInputs: tokenizer = self.info.get_tokenizer() diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 96818e264..2f7c4580a 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -695,22 +695,21 @@ class WhisperDummyInputsBuilder(BaseDummyInputsBuilder[WhisperProcessingInfo]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: - feature_extractor = self.info.get_feature_extractor( - **(mm_processor_kwargs or {}) - ) + feature_extractor = self.info.get_feature_extractor() sampling_rate = feature_extractor.sampling_rate audio_len = feature_extractor.chunk_length * sampling_rate num_audios = mm_counts.get("audio", 0) - audio_overrides = mm_options.get("audio") if mm_options else None + audio_overrides = mm_options.get("audio") return { "audio": self._get_dummy_audios( - length=audio_len, num_audios=num_audios, overrides=audio_overrides + length=audio_len, + num_audios=num_audios, + overrides=audio_overrides, ) } diff --git a/vllm/multimodal/processing/context.py b/vllm/multimodal/processing/context.py index b131ee3c4..6f4ce77bc 100644 --- a/vllm/multimodal/processing/context.py +++ b/vllm/multimodal/processing/context.py @@ -266,11 +266,14 @@ class InputProcessingContext: if isinstance(tokenizer, MistralTokenizer): tokenizer = tokenizer.transformers_tokenizer + merged_kwargs = self.get_merged_mm_kwargs(kwargs) + merged_kwargs.pop("tokenizer", None) + return cached_processor_from_config( self.model_config, processor_cls=typ, tokenizer=tokenizer, - **kwargs, + **merged_kwargs, ) def init_processor( @@ -283,12 +286,7 @@ class InputProcessingContext: Initialize a HuggingFace-like processor class, merging the keyword arguments with those in the model's configuration. """ - mm_config = self.model_config.get_multimodal_config() - base_kwargs = mm_config.mm_processor_kwargs - if base_kwargs is None: - base_kwargs = {} - - merged_kwargs = {**base_kwargs, **kwargs} + merged_kwargs = self.get_merged_mm_kwargs(kwargs) return typ(**merged_kwargs) diff --git a/vllm/multimodal/processing/dummy_inputs.py b/vllm/multimodal/processing/dummy_inputs.py index 0b02861e3..914395863 100644 --- a/vllm/multimodal/processing/dummy_inputs.py +++ b/vllm/multimodal/processing/dummy_inputs.py @@ -62,8 +62,7 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: """ Build the multimodal input which, after processing, results in @@ -83,8 +82,7 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]): self, seq_len: int, mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - mm_processor_kwargs: Mapping[str, object] | None = None, + mm_options: Mapping[str, BaseDummyOptions], ) -> ProcessorInputs: """ Build the input which, after processing, results in @@ -94,16 +92,9 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]): seq_len: Sequence length mm_counts: Count of items per modality mm_options: Configurable options per modality (optional) - mm_processor_kwargs: Additional keyword arguments - for hf_processor (optional) """ dummy_text = self.get_dummy_text(mm_counts) - dummy_mm_data = self.get_dummy_mm_data( - seq_len, - mm_counts, - mm_options, - mm_processor_kwargs=mm_processor_kwargs, - ) + dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts, mm_options) dummy_mm_items = self.info.parse_mm_data(dummy_mm_data, validate=False) tokenization_kwargs = {"truncation": False} @@ -111,7 +102,6 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]): return ProcessorInputs( prompt=dummy_text, mm_items=dummy_mm_items, - hf_processor_mm_kwargs=mm_processor_kwargs or {}, tokenization_kwargs=tokenization_kwargs, ) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 340754d16..540b42f0e 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -5,7 +5,6 @@ from dataclasses import dataclass from multiprocessing.synchronize import Lock as LockType from typing import TYPE_CHECKING, Generic, Literal, Protocol, TypeVar, cast -from vllm.config.multimodal import BaseDummyOptions from vllm.config.observability import ObservabilityConfig from vllm.logger import init_logger from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config @@ -99,27 +98,6 @@ class MultiModalRegistry: A registry that dispatches data processing according to the model. """ - def _extract_mm_options( - self, - model_config: "ModelConfig", - ) -> Mapping[str, BaseDummyOptions] | None: - """ - Extract multimodal dummy options from model config. - - Returns None if no configurable options are found, otherwise returns - a mapping of modality names to their dummy options. - """ - if not model_config.multimodal_config: - return None - - mm_options = { - m: opt - for m in model_config.multimodal_config.limit_per_prompt - if (opt := model_config.multimodal_config.get_dummy_options(m)) is not None - } - - return mm_options if len(mm_options) > 0 else None - def supports_multimodal_inputs(self, model_config: "ModelConfig") -> bool: """ Checks if the model supports multimodal inputs. @@ -261,8 +239,7 @@ class MultiModalRegistry: processor_inputs = processor.dummy_inputs.get_dummy_processor_inputs( seq_len=seq_len, mm_counts=mm_counts, - mm_options=self._extract_mm_options(model_config), - mm_processor_kwargs=mm_config.mm_processor_kwargs, + mm_options=mm_config.limit_per_prompt, ) mm_inputs = processor.apply( prompt=processor_inputs.prompt,