diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index b8e742362..fc1720296 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -445,6 +445,7 @@ class AriaDummyInputsBuilder(BaseDummyInputsBuilder[AriaProcessingInfo]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         vision_config = self.info.get_vision_config()
 
diff --git a/vllm/model_executor/models/audioflamingo3.py b/vllm/model_executor/models/audioflamingo3.py
index 599f3d29f..111b99461 100644
--- a/vllm/model_executor/models/audioflamingo3.py
+++ b/vllm/model_executor/models/audioflamingo3.py
@@ -253,8 +253,11 @@ class AudioFlamingo3DummyInputsBuilder(
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
-        feature_extractor = self.info.get_feature_extractor()
+        feature_extractor = self.info.get_feature_extractor(
+            **(mm_processor_kwargs or {})
+        )
         sampling_rate = feature_extractor.sampling_rate
         audio_len = MAX_AUDIO_LEN * sampling_rate
         num_audios = mm_counts.get("audio", 0)
diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py
index af72f0bc4..ce3b990c3 100644
--- a/vllm/model_executor/models/aya_vision.py
+++ b/vllm/model_executor/models/aya_vision.py
@@ -192,6 +192,7 @@ class AyaVisionDummyInputsBuilder(BaseDummyInputsBuilder[AyaVisionProcessingInfo
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         image_size = self.info.get_image_size_with_most_features()
diff --git a/vllm/model_executor/models/bagel.py b/vllm/model_executor/models/bagel.py
index ac16538e9..657e8cefb 100644
--- a/vllm/model_executor/models/bagel.py
+++ b/vllm/model_executor/models/bagel.py
@@ -250,6 +250,7 @@ class BagelDummyInputsBuilder(BaseDummyInputsBuilder[BagelProcessingInfo]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         hf_config = self.info.get_hf_config()
diff --git a/vllm/model_executor/models/bee.py b/vllm/model_executor/models/bee.py
index 4f0342df4..5c3a1a4f1 100644
--- a/vllm/model_executor/models/bee.py
+++ b/vllm/model_executor/models/bee.py
@@ -91,6 +91,7 @@ class BeeDummyInputsBuilder(LlavaDummyInputsBuilder[BeeProcessingInfo]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index f812eb849..fe9db19ea 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -446,6 +446,7 @@ class Blip2DummyInputsBuilder(BaseDummyInputsBuilder[Blip2ProcessingInfo]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         hf_config = self.info.get_hf_config()
         vision_config = hf_config.vision_config
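# --- Illustrative sketch, not part of the patch -----------------------------
# The audio builders in this diff (audioflamingo3 above; funasr, glmasr,
# qwen2_audio, ultravox, whisper further down) all adopt the same pattern:
# build the profiling audio from the feature extractor constructed with the
# user's processor kwargs instead of the default one. A minimal standalone
# version, with `get_feature_extractor` standing in for
# `self.info.get_feature_extractor`:
from collections.abc import Mapping


def dummy_audio_data(
    get_feature_extractor,
    mm_counts: Mapping[str, int],
    mm_processor_kwargs: Mapping[str, object] | None = None,
) -> dict:
    # Forward the kwargs so e.g. an overridden sampling_rate changes the
    # worst-case audio length seen during memory profiling.
    fe = get_feature_extractor(**(mm_processor_kwargs or {}))
    audio_len = int(fe.chunk_length * fe.sampling_rate)
    num_audios = mm_counts.get("audio", 0)
    return {"audio": [[0.0] * audio_len for _ in range(num_audios)]}
# -----------------------------------------------------------------------------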
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index c4b885cc9..2c21d70ed 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -117,6 +117,7 @@ class ChameleonDummyInputsBuilder(BaseDummyInputsBuilder[ChameleonProcessingInfo
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         config = self.info.get_hf_config()
 
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index 4ffeedf46..3f189eacc 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -171,6 +171,7 @@ class CLIPDummyInputsBuilder(BaseDummyInputsBuilder[CLIPProcessingInfo]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py
index ebdb4bcb8..4aefd2ead 100644
--- a/vllm/model_executor/models/cohere2_vision.py
+++ b/vllm/model_executor/models/cohere2_vision.py
@@ -221,6 +221,7 @@ class Cohere2VisionDummyInputsBuilder(
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         image_size = self.info.get_image_size_with_most_features()
diff --git a/vllm/model_executor/models/deepseek_ocr.py b/vllm/model_executor/models/deepseek_ocr.py
index 3425b1570..146b05002 100644
--- a/vllm/model_executor/models/deepseek_ocr.py
+++ b/vllm/model_executor/models/deepseek_ocr.py
@@ -256,6 +256,7 @@ class DeepseekOCRDummyInputsBuilder(BaseDummyInputsBuilder[DeepseekOCRProcessing
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
diff --git a/vllm/model_executor/models/deepseek_ocr2.py b/vllm/model_executor/models/deepseek_ocr2.py
index cead43685..6ababf9f2 100644
--- a/vllm/model_executor/models/deepseek_ocr2.py
+++ b/vllm/model_executor/models/deepseek_ocr2.py
@@ -138,6 +138,7 @@ class DeepseekOCR2DummyInputsBuilder(
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index cb98640ce..83ab54f60 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -215,6 +215,7 @@ class DeepseekVL2DummyInputsBuilder(BaseDummyInputsBuilder[DeepseekVL2Processing
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
diff --git a/vllm/model_executor/models/dots_ocr.py b/vllm/model_executor/models/dots_ocr.py
index d2f39553d..0d2fefb73 100644
--- a/vllm/model_executor/models/dots_ocr.py
+++ b/vllm/model_executor/models/dots_ocr.py
@@ -107,10 +107,13 @@ class DotsOCRDummyInputsBuilder(Qwen2VLDummyInputsBuilder):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
+        mm_processor_kwargs = mm_processor_kwargs or {}
         target_width, target_height = self.info.get_image_size_with_most_features(  # noqa: E501
+            mm_processor_kwargs.get("max_pixels", None)
         )
 
         image_overrides = mm_options.get("image") if mm_options else None
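# --- Illustrative sketch, not part of the patch -----------------------------
# dots_ocr above (and qwen2_vl / qwen3_vl below) now thread a user-supplied
# `max_pixels` into get_image_size_with_most_features so profiling does not
# size the dummy image for a larger resolution than the processor will ever
# emit. The sizing idea, reduced to a runnable toy (the real helper also
# respects patch-size multiples; the default budget is a made-up example):
import math


def dummy_image_size(
    mm_processor_kwargs: dict | None = None,
    default_max_pixels: int = 1280 * 28 * 28,
) -> tuple[int, int]:
    kwargs = mm_processor_kwargs or {}
    max_pixels = int(kwargs.get("max_pixels") or default_max_pixels)
    # Largest square image that stays within the pixel budget, e.g.
    # {"max_pixels": 65536} -> (256, 256).
    edge = math.isqrt(max_pixels)
    return edge, edge
# -----------------------------------------------------------------------------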
mm_counts.get("image", 0) diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 18437528e..1e803f89b 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -256,6 +256,7 @@ class Gemma3DummyInputsBuilder(BaseDummyInputsBuilder[Gemma3ProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py index 8b5e7b8bb..8588e51f5 100644 --- a/vllm/model_executor/models/gemma3n_mm.py +++ b/vllm/model_executor/models/gemma3n_mm.py @@ -182,6 +182,7 @@ class Gemma3nDummyInputsBuilder(BaseDummyInputsBuilder[Gemma3nProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_audios = mm_counts.get("audio", 0) diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 5333042cb..8440c3946 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -1143,6 +1143,7 @@ class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: num_images = mm_counts.get("image", 0) num_videos = mm_counts.get("video", 0) diff --git a/vllm/model_executor/models/glm4v.py b/vllm/model_executor/models/glm4v.py index 56504029d..4d86900e9 100644 --- a/vllm/model_executor/models/glm4v.py +++ b/vllm/model_executor/models/glm4v.py @@ -493,6 +493,7 @@ class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: hf_config = self.info.get_hf_config() vision_config = hf_config.vision_config diff --git a/vllm/model_executor/models/glmasr.py b/vllm/model_executor/models/glmasr.py index b9bdb3aa2..4e223b15f 100644 --- a/vllm/model_executor/models/glmasr.py +++ b/vllm/model_executor/models/glmasr.py @@ -727,8 +727,11 @@ class GlmAsrDummyInputsBuilder(BaseDummyInputsBuilder[GlmAsrProcessingInfo]): seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, ) -> MultiModalDataDict: - feature_extractor = self.info.get_feature_extractor() + feature_extractor = self.info.get_feature_extractor( + **(mm_processor_kwargs or {}) + ) sampling_rate = feature_extractor.sampling_rate num_audios = mm_counts.get("audio", 0) audio_overrides = mm_options.get("audio") if mm_options else None diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index 6956f92ee..9d37a0683 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py @@ -217,6 +217,7 @@ class GraniteSpeechDummyInputsBuilder( seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions] | None = None, + mm_processor_kwargs: Mapping[str, object] | None = None, 
diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py
index 729b6cb6c..edd00c5cd 100644
--- a/vllm/model_executor/models/hunyuan_vision.py
+++ b/vllm/model_executor/models/hunyuan_vision.py
@@ -702,6 +702,7 @@ class HunYuanVLDummyInputsBuilder(BaseDummyInputsBuilder[HunYuanVLProcessingInfo
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 1)
 
diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py
index 6a1f58af2..ea10d764f 100644
--- a/vllm/model_executor/models/hyperclovax_vision.py
+++ b/vllm/model_executor/models/hyperclovax_vision.py
@@ -166,6 +166,7 @@ class HCXVisionDummyInputsBuilder(BaseDummyInputsBuilder[HCXVisionProcessingInfo
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py
index d51c50af0..e2cfd1d63 100644
--- a/vllm/model_executor/models/idefics3.py
+++ b/vllm/model_executor/models/idefics3.py
@@ -42,7 +42,7 @@ from vllm.multimodal.inputs import (
     MultiModalFieldConfig,
     MultiModalKwargsItems,
 )
-from vllm.multimodal.parse import ImageProcessorItems, ImageSize, MultiModalDataItems
+from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems
 from vllm.multimodal.processing import (
     BaseDummyInputsBuilder,
     BaseMultiModalProcessor,
@@ -285,15 +285,6 @@ class Idefics3ProcessingInfo(BaseProcessingInfo):
 
         return num_patches * processor.image_seq_len
 
-    def get_image_size_with_most_features(self) -> ImageSize:
-        processor = self.get_hf_processor()
-        image_processor: Idefics3ImageProcessor = processor.image_processor
-
-        return ImageSize(
-            width=image_processor.size["longest_edge"],
-            height=image_processor.size["longest_edge"],
-        )
-
 
 class Idefics3DummyInputsBuilder(BaseDummyInputsBuilder[Idefics3ProcessingInfo]):
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
@@ -309,9 +300,10 @@ class Idefics3DummyInputsBuilder(BaseDummyInputsBuilder[Idefics3ProcessingInfo])
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
-        hf_processor = self.info.get_hf_processor()
+        hf_processor = self.info.get_hf_processor(**(mm_processor_kwargs or {}))
         image_processor: Idefics3ImageProcessor = hf_processor.image_processor
         longest_edge = image_processor.max_image_size["longest_edge"]
 
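# --- Illustrative sketch, not part of the patch -----------------------------
# With the idefics3 change above, the dummy image size is read from the
# processor built with the user's kwargs, which is why the hard-coded
# get_image_size_with_most_features override can be deleted.
# `get_hf_processor` stands in for `self.info.get_hf_processor`:
def dummy_longest_edge(get_hf_processor, mm_processor_kwargs=None) -> int:
    hf_processor = get_hf_processor(**(mm_processor_kwargs or {}))
    # Idefics3ImageProcessor keeps the per-tile cap here; the builder sizes
    # its dummy images from this value.
    return hf_processor.image_processor.max_image_size["longest_edge"]
# -----------------------------------------------------------------------------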
diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py
index de306341c..dd1332dfd 100644
--- a/vllm/model_executor/models/interns1.py
+++ b/vllm/model_executor/models/interns1.py
@@ -298,6 +298,7 @@ class InternS1DummyInputsBuilder(BaseDummyInputsBuilder[InternS1ProcessingInfo])
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         target_width, target_height = self.info.get_image_size_with_most_features()
         target_num_frames = self.info.get_num_frames_with_most_features(
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index bcce1c800..334ee3cbe 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -766,6 +766,7 @@ class BaseInternVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         target_width, target_height = self.info.get_image_size_with_most_features()
         num_images = mm_counts.get("image", 0)
@@ -938,6 +939,7 @@ class InternVLDummyInputsBuilder(
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         dummy_image = super().get_dummy_mm_data(
             seq_len=seq_len, mm_counts=mm_counts, mm_options=mm_options
diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py
index ed10e8200..8ed9ddda4 100644
--- a/vllm/model_executor/models/isaac.py
+++ b/vllm/model_executor/models/isaac.py
@@ -850,6 +850,7 @@ class IsaacDummyInputsBuilder(BaseDummyInputsBuilder[IsaacProcessingInfo]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
diff --git a/vllm/model_executor/models/kanana_v.py b/vllm/model_executor/models/kanana_v.py
index 06ea26155..b679241b5 100644
--- a/vllm/model_executor/models/kanana_v.py
+++ b/vllm/model_executor/models/kanana_v.py
@@ -445,6 +445,7 @@ class KananaVDummyInputsBuilder(BaseDummyInputsBuilder[KananaVProcessingInfo]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         return {
diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py
index e57e5c6f3..960915af6 100644
--- a/vllm/model_executor/models/keye.py
+++ b/vllm/model_executor/models/keye.py
@@ -1159,6 +1159,7 @@ class KeyeBaseDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
diff --git a/vllm/model_executor/models/kimi_k25.py b/vllm/model_executor/models/kimi_k25.py
index cb07cfe98..bc6fffa3b 100644
--- a/vllm/model_executor/models/kimi_k25.py
+++ b/vllm/model_executor/models/kimi_k25.py
@@ -238,6 +238,7 @@ class KimiK25DummyInputsBuilder(BaseDummyInputsBuilder[KimiK25ProcessingInfo]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         # TODO: Support mm_options for vision_chunk to allow user configuration
         dummy_items = self.get_dummy_mm_items()
diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py
index cb7719777..e280f8245 100644
--- a/vllm/model_executor/models/kimi_vl.py
+++ b/vllm/model_executor/models/kimi_vl.py
@@ -216,6 +216,7 @@ class KimiVLDummyInputsBuilder(BaseDummyInputsBuilder[KimiVLProcessingInfo]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
diff --git a/vllm/model_executor/models/lfm2_vl.py b/vllm/model_executor/models/lfm2_vl.py
index 445ecdce7..7bded977a 100644
--- a/vllm/model_executor/models/lfm2_vl.py
+++ b/vllm/model_executor/models/lfm2_vl.py
@@ -319,6 +319,7 @@ class Lfm2VLDummyInputsBuilder(BaseDummyInputsBuilder[Lfm2VLProcessingInfo]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index c35728183..ecd2c895b 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -232,6 +232,7 @@ class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py
index 1aee7f9c5..6696a0009 100644
--- a/vllm/model_executor/models/llava_next_video.py
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -166,6 +166,7 @@ class LlavaNextVideoDummyInputsBuilder(
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_videos = mm_counts.get("video", 0)
 
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index d49c08eb3..39633eaf9 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -277,6 +277,7 @@ class LlavaOnevisionDummyInputsBuilder(
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
diff --git a/vllm/model_executor/models/midashenglm.py b/vllm/model_executor/models/midashenglm.py
index 3f75e60fd..4bba0ad71 100644
--- a/vllm/model_executor/models/midashenglm.py
+++ b/vllm/model_executor/models/midashenglm.py
@@ -566,6 +566,7 @@ class MiDashengLMDummyInputsBuilder(BaseDummyInputsBuilder[MiDashengLMProcessing
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_audios = mm_counts.get("audio", 0)
 
diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py
index 39b79e4b1..33df0f785 100644
--- a/vllm/model_executor/models/minicpmo.py
+++ b/vllm/model_executor/models/minicpmo.py
@@ -302,6 +302,7 @@ class MiniCPMODummyInputsBuilder(MiniCPMVDummyInputsBuilder[MiniCPMOProcessingIn
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_audios = mm_counts.get("audio", 0)
         audio_len = (
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index ebe2eca32..6a1686100 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -708,6 +708,7 @@ class MiniCPMVDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py
index 08f5d45e2..33d94e9ff 100644
--- a/vllm/model_executor/models/mistral3.py
+++ b/vllm/model_executor/models/mistral3.py
@@ -237,6 +237,7 @@ class Mistral3DummyInputsBuilder(BaseDummyInputsBuilder[_I]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py
index 58f63597a..3752a7704 100644
--- a/vllm/model_executor/models/mllama4.py
+++ b/vllm/model_executor/models/mllama4.py
@@ -704,6 +704,7 @@ class Mllama4DummyInputsBuilder(BaseDummyInputsBuilder[Mllama4ProcessingInfo]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 1ee177656..6edec9719 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -1278,6 +1278,7 @@ class MolmoDummyInputsBuilder(BaseDummyInputsBuilder[MolmoProcessingInfo]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         target_width, target_height = self.info.get_image_size_with_most_features()
         num_images = mm_counts.get("image", 0)
diff --git a/vllm/model_executor/models/molmo2.py b/vllm/model_executor/models/molmo2.py
index 30f639c8b..e0f74ce46 100644
--- a/vllm/model_executor/models/molmo2.py
+++ b/vllm/model_executor/models/molmo2.py
@@ -2079,6 +2079,7 @@ class Molmo2DummyInputsBuilder(BaseDummyInputsBuilder[Molmo2ProcessingInfo]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index 1c36b681f..fb683487f 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -1385,6 +1385,7 @@ class NanoNemotronVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         processor = self.info.get_hf_processor()
@@ -1457,6 +1458,7 @@ class NanoNemotronVLDummyInputsBuilder(
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         dummy_image = super().get_dummy_mm_data(
             seq_len=seq_len, mm_counts=mm_counts, mm_options=mm_options
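# --- Illustrative sketch, not part of the patch -----------------------------
# The InternVL and NanoNemotronVL overrides above accept the new argument but
# keep their existing super() calls, so the parent falls back to its default
# (None). A subclass that wants the kwargs applied would forward them:
#
#     dummy_image = super().get_dummy_mm_data(
#         seq_len=seq_len,
#         mm_counts=mm_counts,
#         mm_options=mm_options,
#         mm_processor_kwargs=mm_processor_kwargs,
#     )
# -----------------------------------------------------------------------------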
diff --git a/vllm/model_executor/models/nemotron_parse.py b/vllm/model_executor/models/nemotron_parse.py
index f9acae3e0..b94b606a1 100644
--- a/vllm/model_executor/models/nemotron_parse.py
+++ b/vllm/model_executor/models/nemotron_parse.py
@@ -642,6 +642,7 @@ class NemotronParseDummyInputsBuilder(
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py
index 73dd8dfd0..840918953 100644
--- a/vllm/model_executor/models/nvlm_d.py
+++ b/vllm/model_executor/models/nvlm_d.py
@@ -93,6 +93,7 @@ class NVLMDummyInputsBuilder(BaseInternVLDummyInputsBuilder[NVLMProcessingInfo])
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         target_width, target_height = self.info.get_image_size_with_most_features()
         num_images = mm_counts.get("image", 0)
diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py
index 3a058bb94..7e02d87ec 100644
--- a/vllm/model_executor/models/ovis.py
+++ b/vllm/model_executor/models/ovis.py
@@ -303,6 +303,7 @@ class OvisDummyInputsBuilder(BaseDummyInputsBuilder[OvisProcessingInfo]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
diff --git a/vllm/model_executor/models/ovis2_5.py b/vllm/model_executor/models/ovis2_5.py
index a787a0bf8..69c0600d8 100644
--- a/vllm/model_executor/models/ovis2_5.py
+++ b/vllm/model_executor/models/ovis2_5.py
@@ -302,6 +302,7 @@ class Ovis2_5DummyInputsBuilder(BaseDummyInputsBuilder[Ovis2_5ProcessingInfo]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py
index b3873c160..8d287e342 100644
--- a/vllm/model_executor/models/paddleocr_vl.py
+++ b/vllm/model_executor/models/paddleocr_vl.py
@@ -204,6 +204,7 @@ class PaddleOCRVLDummyInputsBuilder(BaseDummyInputsBuilder[PaddleOCRVLProcessing
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
index 4ab0067f3..e551f9fc9 100644
--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -128,6 +128,7 @@ class PaliGemmaDummyInputsBuilder(BaseDummyInputsBuilder[PaliGemmaProcessingInfo
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         hf_config = self.info.get_hf_config()
         vision_config = hf_config.vision_config
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 3dde6dfd7..8f33cc859 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -380,6 +380,7 @@ class Phi3VDummyInputsBuilder(BaseDummyInputsBuilder[Phi3VProcessingInfo]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index 97a29b353..d11483a6b 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -826,6 +826,7 @@ class Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_audios = mm_counts.get("audio", 0)
         num_images = mm_counts.get("image", 0)
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index 3a5dee3c2..7d12cffcd 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -261,6 +261,7 @@ class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
@@ -282,6 +283,7 @@ class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> ProcessorInputs:
         tokenizer = self.info.get_tokenizer()
 
diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py
index 3b50ae74d..974de8068 100644
--- a/vllm/model_executor/models/qwen2_5_omni_thinker.py
+++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py
@@ -358,12 +358,14 @@ class Qwen2_5OmniThinkerDummyInputsBuilder(
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_audios = mm_counts.get("audio", 0)
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
 
-        feature_extractor = self.info.get_feature_extractor()
+        mm_processor_kwargs = mm_processor_kwargs or {}
+        feature_extractor = self.info.get_feature_extractor(**mm_processor_kwargs)
 
         target_audio_length = (
             min(
@@ -372,7 +374,10 @@ class Qwen2_5OmniThinkerDummyInputsBuilder(
             )
             * feature_extractor.sampling_rate
         )
-        target_width, target_height = self.info.get_image_size_with_most_features()
+
+        target_width, target_height = self.info.get_image_size_with_most_features(
+            max_pixels=mm_processor_kwargs.get("max_pixels", None),
+        )
         target_num_frames = self.info.get_num_frames_with_most_features(
             seq_len, mm_counts
         )
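# --- Illustrative sketch, not part of the patch -----------------------------
# Qwen2.5-Omni above normalizes the kwargs once and reuses them for both
# modalities. The shape of that pattern, with `info` standing in for
# `self.info`:
def dummy_sizes(info, mm_processor_kwargs=None):
    kwargs = dict(mm_processor_kwargs or {})
    feature_extractor = info.get_feature_extractor(**kwargs)  # audio side
    width, height = info.get_image_size_with_most_features(   # image side
        max_pixels=kwargs.get("max_pixels"),
    )
    return feature_extractor.sampling_rate, width, height
# -----------------------------------------------------------------------------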
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index 2115d5140..51a24b0ae 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -195,8 +195,11 @@ class Qwen2AudioDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2AudioProcessingIn
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
-        feature_extractor = self.info.get_feature_extractor()
+        feature_extractor = self.info.get_feature_extractor(
+            **(mm_processor_kwargs or {})
+        )
         sampling_rate = feature_extractor.sampling_rate
         audio_len = feature_extractor.chunk_length * sampling_rate
 
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index d911fb1dd..fa9bf6cfe 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -1016,11 +1016,15 @@ class Qwen2VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2VLProcessingInfo]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
 
-        target_width, target_height = self.info.get_image_size_with_most_features()
+        mm_processor_kwargs = mm_processor_kwargs or {}
+        target_width, target_height = self.info.get_image_size_with_most_features(
+            max_pixels=mm_processor_kwargs.get("max_pixels", None)
+        )
         target_num_frames = self.info.get_num_frames_with_most_features(
             seq_len, mm_counts
         )
diff --git a/vllm/model_executor/models/qwen3_asr.py b/vllm/model_executor/models/qwen3_asr.py
index 9dac8d75b..5f56088cb 100644
--- a/vllm/model_executor/models/qwen3_asr.py
+++ b/vllm/model_executor/models/qwen3_asr.py
@@ -147,10 +147,13 @@ class Qwen3ASRDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3ASRProcessingInfo])
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_audios = mm_counts.get("audio", 0)
 
-        feature_extractor = self.info.get_feature_extractor()
+        feature_extractor = self.info.get_feature_extractor(
+            **(mm_processor_kwargs or {})
+        )
 
         target_audio_length = (
             min(
diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
index b06503031..50fbb8be1 100755
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -1169,7 +1169,7 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
             return x
 
         # NOTE: WhisperFeatureExtractor cannot handle empty list of audios
-        feature_extractor = self.info.get_feature_extractor()
+        feature_extractor = self.info.get_feature_extractor(**mm_kwargs)
         hop_length = feature_extractor.hop_length
         if audios:
             # NOTE: Qwen3-Omni processor accept "audio"
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 908f6342d..7d9785141 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -796,14 +796,18 @@ class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
         num_videos = mm_counts.get("video", 0)
 
         image_overrides = mm_options.get("image") if mm_options else None
         video_overrides = mm_options.get("video") if mm_options else None
 
+        mm_processor_kwargs = mm_processor_kwargs or {}
         target_image_width, target_image_height = (
-            self.info.get_image_size_with_most_features()
+            self.info.get_image_size_with_most_features(
+                max_pixels=mm_processor_kwargs.get("max_pixels", None),
+            )
         )
         # treat videos as special images
@@ -828,7 +832,7 @@ class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]):
             target_num_frames = min(target_num_frames, num_frames_override)
 
         target_num_frames = max(target_num_frames, 2)
-        video_processor = self.info.get_video_processor()
+        video_processor = self.info.get_video_processor(**(mm_processor_kwargs or {}))
         video_max_pixels = video_processor.size["longest_edge"]
         # video_max_pixels contains the temporal compression factor,
         # so we divide by 2 to get the maximum number of image pixels.
diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py
index ed61bb140..66b669a9c 100644
--- a/vllm/model_executor/models/qwen_vl.py
+++ b/vllm/model_executor/models/qwen_vl.py
@@ -618,6 +618,7 @@ class QwenVLDummyInputsBuilder(BaseDummyInputsBuilder[QwenVLProcessingInfo]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         hf_config = self.info.get_hf_config()
         vision_config = hf_config.visual
diff --git a/vllm/model_executor/models/rvl.py b/vllm/model_executor/models/rvl.py
index 92352febe..f6ddaa8fa 100644
--- a/vllm/model_executor/models/rvl.py
+++ b/vllm/model_executor/models/rvl.py
@@ -41,6 +41,7 @@ class RVLDummyInputsBuilder(LlavaDummyInputsBuilder[RVLProcessingInfo]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py
index 9f1bbd596..92ecc7579 100644
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -155,6 +155,7 @@ class SiglipDummyInputsBuilder(BaseDummyInputsBuilder[SiglipProcessingInfo]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py
index 29a0389b9..4fadad14d 100644
--- a/vllm/model_executor/models/skyworkr1v.py
+++ b/vllm/model_executor/models/skyworkr1v.py
@@ -533,6 +533,7 @@ class SkyworkR1VDummyInputsBuilder(BaseDummyInputsBuilder[SkyworkR1VProcessingIn
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         target_width, target_height = self.info.get_image_size_with_most_features()
         num_images = mm_counts.get("image", 0)
diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py
index 11081b040..8050f6b85 100644
--- a/vllm/model_executor/models/step3_vl.py
+++ b/vllm/model_executor/models/step3_vl.py
@@ -565,6 +565,7 @@ class Step3VLDummyInputsBuilder(BaseDummyInputsBuilder[Step3VLProcessingInfo]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         target_width, target_height = self.info.get_image_size_with_most_features()
         num_images = mm_counts.get("image", 0)
diff --git a/vllm/model_executor/models/terratorch.py b/vllm/model_executor/models/terratorch.py
index b817383ab..804eccbc4 100644
--- a/vllm/model_executor/models/terratorch.py
+++ b/vllm/model_executor/models/terratorch.py
@@ -154,6 +154,7 @@ class TerratorchInputBuilder(BaseDummyInputsBuilder[TerratorchProcessingInfo]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         # Dummy data is generated based on the 'input' section
         # defined in the HF configuration file
diff --git a/vllm/model_executor/models/transformers/multimodal.py b/vllm/model_executor/models/transformers/multimodal.py
index 890b486b8..64dc5bf8b 100644
--- a/vllm/model_executor/models/transformers/multimodal.py
+++ b/vllm/model_executor/models/transformers/multimodal.py
@@ -98,6 +98,7 @@ class MultiModalDummyInputsBuilder(BaseDummyInputsBuilder[MultiModalProcessingIn
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, "BaseDummyOptions"] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_images = mm_counts.get("image", 0)
 
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 944dc5e12..d7a9bd4fd 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -161,8 +161,11 @@ class UltravoxDummyInputsBuilder(BaseDummyInputsBuilder[UltravoxProcessingInfo])
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
-        feature_extractor = self.info.get_feature_extractor()
+        feature_extractor = self.info.get_feature_extractor(
+            **(mm_processor_kwargs or {})
+        )
 
         sampling_rate = feature_extractor.sampling_rate
         audio_len = (
diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py
index 581664aec..715d6aa25 100644
--- a/vllm/model_executor/models/voxtral.py
+++ b/vllm/model_executor/models/voxtral.py
@@ -220,6 +220,7 @@ class VoxtralDummyInputsBuilder(BaseDummyInputsBuilder[VoxtralProcessingInfo]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         num_audios = mm_counts.get("audio", 0)
 
@@ -238,6 +239,7 @@ class VoxtralDummyInputsBuilder(BaseDummyInputsBuilder[VoxtralProcessingInfo]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> ProcessorInputs:
         tokenizer = self.info.get_tokenizer()
 
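# --- Usage sketch, not part of the patch ------------------------------------
# End-to-end effect of the core plumbing at the bottom of this diff
# (dummy_inputs.py and registry.py): processor kwargs supplied at engine
# construction now also shape the dummy inputs used for memory profiling.
# Model name and kwarg value are examples only.
from vllm import LLM

llm = LLM(
    model="Qwen/Qwen2-VL-7B-Instruct",
    # Previously profiling ignored this and sized dummy images at the model
    # default; now get_dummy_mm_data receives it via mm_processor_kwargs.
    mm_processor_kwargs={"max_pixels": 1280 * 28 * 28},
)
# -----------------------------------------------------------------------------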
diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py
index 7462d9f6e..26c7b62e8 100644
--- a/vllm/model_executor/models/whisper.py
+++ b/vllm/model_executor/models/whisper.py
@@ -685,8 +685,11 @@ class WhisperDummyInputsBuilder(BaseDummyInputsBuilder[WhisperProcessingInfo]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
-        feature_extractor = self.info.get_feature_extractor()
+        feature_extractor = self.info.get_feature_extractor(
+            **(mm_processor_kwargs or {})
+        )
         sampling_rate = feature_extractor.sampling_rate
         audio_len = feature_extractor.chunk_length * sampling_rate
 
diff --git a/vllm/multimodal/processing/dummy_inputs.py b/vllm/multimodal/processing/dummy_inputs.py
index a93fd2c24..0b02861e3 100644
--- a/vllm/multimodal/processing/dummy_inputs.py
+++ b/vllm/multimodal/processing/dummy_inputs.py
@@ -63,6 +63,7 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> MultiModalDataDict:
         """
         Build the multimodal input which, after processing, results in
@@ -83,6 +84,7 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
         seq_len: int,
         mm_counts: Mapping[str, int],
         mm_options: Mapping[str, BaseDummyOptions] | None = None,
+        mm_processor_kwargs: Mapping[str, object] | None = None,
     ) -> ProcessorInputs:
         """
         Build the input which, after processing, results in
@@ -92,9 +94,16 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
             seq_len: Sequence length
             mm_counts: Count of items per modality
             mm_options: Configurable options per modality (optional)
+            mm_processor_kwargs: Additional keyword arguments
+                for hf_processor (optional)
         """
         dummy_text = self.get_dummy_text(mm_counts)
-        dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts, mm_options)
+        dummy_mm_data = self.get_dummy_mm_data(
+            seq_len,
+            mm_counts,
+            mm_options,
+            mm_processor_kwargs=mm_processor_kwargs,
+        )
         dummy_mm_items = self.info.parse_mm_data(dummy_mm_data, validate=False)
 
         tokenization_kwargs = {"truncation": False}
@@ -102,6 +111,7 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
 
         return ProcessorInputs(
             prompt=dummy_text,
             mm_items=dummy_mm_items,
+            hf_processor_mm_kwargs=mm_processor_kwargs or {},
             tokenization_kwargs=tokenization_kwargs,
         )
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 6c7e86a4f..340754d16 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -257,10 +257,12 @@ class MultiModalRegistry:
         if processor is None:
             processor = self.create_processor(model_config, cache=cache)
 
+        mm_config = model_config.get_multimodal_config()
         processor_inputs = processor.dummy_inputs.get_dummy_processor_inputs(
             seq_len=seq_len,
             mm_counts=mm_counts,
             mm_options=self._extract_mm_options(model_config),
+            mm_processor_kwargs=mm_config.mm_processor_kwargs,
         )
         mm_inputs = processor.apply(
             prompt=processor_inputs.prompt,