diff --git a/tests/model_executor/test_qwen3_omni.py b/tests/model_executor/test_qwen3_omni.py index c92c61dcd..1ceaba04a 100644 --- a/tests/model_executor/test_qwen3_omni.py +++ b/tests/model_executor/test_qwen3_omni.py @@ -143,6 +143,7 @@ def test_qwen3_omni_get_updates_use_audio_in_video( # Create processing info info = Qwen3OmniMoeThinkerProcessingInfo(mock_ctx) + info._get_expected_hidden_size = lambda: 100 info.get_hf_config = Mock(return_value=mock_qwen3_omni_config) info.get_hf_processor = Mock(return_value=mock_processor) info.get_tokenizer = Mock(return_value=mock_tokenizer) diff --git a/vllm/model_executor/models/audioflamingo3.py b/vllm/model_executor/models/audioflamingo3.py index 6c5e13f42..3f1661abe 100644 --- a/vllm/model_executor/models/audioflamingo3.py +++ b/vllm/model_executor/models/audioflamingo3.py @@ -192,6 +192,22 @@ class AudioFlamingo3MultiModalProjector(nn.Module): return hidden_states +class AudioFlamingo3MultiModalDataParser(MultiModalDataParser): + def _parse_audio_data( + self, + data: dict[str, torch.Tensor] | ModalityData[Any], + ) -> ModalityDataItems[Any, Any] | None: + if isinstance(data, dict): + return DictEmbeddingItems( + data, + modality="audio", + required_fields={"audio_embeds"}, + fields_factory=_audioflamingo3_field_config, + ) + + return super()._parse_audio_data(data) + + class AudioFlamingo3ProcessingInfo(BaseProcessingInfo): def get_hf_config(self): return self.ctx.get_hf_config(AudioFlamingo3Config) @@ -204,6 +220,14 @@ class AudioFlamingo3ProcessingInfo(BaseProcessingInfo): feature_extractor = hf_processor.feature_extractor return feature_extractor + def get_data_parser(self): + feature_extractor = self.get_feature_extractor() + + return AudioFlamingo3MultiModalDataParser( + target_sr=feature_extractor.sampling_rate, + expected_hidden_size=self._get_expected_hidden_size(), + ) + def get_supported_mm_limits(self) -> Mapping[str, int | None]: return {"audio": None} @@ -259,30 +283,9 @@ def 
_audioflamingo3_field_config(hf_inputs: Mapping[str, torch.Tensor]): ) -class AudioFlamingo3MultiModalDataParser(MultiModalDataParser): - def _parse_audio_data( - self, - data: dict[str, torch.Tensor] | ModalityData[Any], - ) -> ModalityDataItems[Any, Any] | None: - if isinstance(data, dict): - return DictEmbeddingItems( - data, - modality="audio", - required_fields={"audio_embeds"}, - fields_factory=_audioflamingo3_field_config, - ) - return super()._parse_audio_data(data) - - class AudioFlamingo3MultiModalProcessor( BaseMultiModalProcessor[AudioFlamingo3ProcessingInfo] ): - def _get_data_parser(self) -> MultiModalDataParser: - feature_extractor = self.info.get_feature_extractor() - return AudioFlamingo3MultiModalDataParser( - target_sr=feature_extractor.sampling_rate - ) - def _call_hf_processor( self, prompt: str, diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index bdf3f86c4..fa85d952b 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -227,10 +227,8 @@ class AyaVisionMultiModalProcessor(BaseMultiModalProcessor[AyaVisionProcessingIn # HF processor pops the `num_patches` kwarg, which is needed by vLLM if (images := mm_data.get("images")) is not None: - parsed_images = ( - self._get_data_parser() - .parse_mm_data({"image": images}) - .get_items("image", ImageProcessorItems) + parsed_images = self.data_parser.parse_mm_data({"image": images}).get_items( + "image", ImageProcessorItems ) image_sizes = [ parsed_images.get_image_size(i) for i in range(len(parsed_images)) diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py index 0cf2b6ba8..f281a1d4b 100644 --- a/vllm/model_executor/models/cohere2_vision.py +++ b/vllm/model_executor/models/cohere2_vision.py @@ -262,10 +262,8 @@ class Cohere2VisionMultiModalProcessor( hf_processor = self.info.get_hf_processor(**mm_kwargs) # Fallback calculation if HF processor didn't 
provide num_patches - parsed_images = ( - self._get_data_parser() - .parse_mm_data({"image": images}) - .get_items("image", ImageProcessorItems) + parsed_images = self.data_parser.parse_mm_data({"image": images}).get_items( + "image", ImageProcessorItems ) num_patches = [ diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py index db724d027..0ada8a223 100644 --- a/vllm/model_executor/models/ernie45_vl.py +++ b/vllm/model_executor/models/ernie45_vl.py @@ -793,6 +793,12 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo): def get_image_processor(self, **kwargs: object): return self.get_hf_processor(**kwargs).image_processor + def get_data_parser(self): + return MultiModalDataParser( + video_needs_metadata=True, + expected_hidden_size=self._get_expected_hidden_size(), + ) + def get_supported_mm_limits(self) -> Mapping[str, int | None]: return {"image": None, "video": None} @@ -947,11 +953,6 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo): class Ernie4_5VLMultiModalProcessor(BaseMultiModalProcessor[Ernie4_5_VLProcessingInfo]): - def _get_data_parser(self) -> MultiModalDataParser: - return MultiModalDataParser( - video_needs_metadata=True, - ) - def _pixel_values_norm( self, pixel_values: torch.Tensor, diff --git a/vllm/model_executor/models/funaudiochat.py b/vllm/model_executor/models/funaudiochat.py index 995fb6944..16afec3cf 100644 --- a/vllm/model_executor/models/funaudiochat.py +++ b/vllm/model_executor/models/funaudiochat.py @@ -552,6 +552,29 @@ class FunAudioChatDiscreteEncoder(nn.Module): class FunAudioChatProcessingInfo(BaseProcessingInfo): token_fps: int = 25 + @cached_property + def feature_extractor(self) -> WhisperFeatureExtractor: + return WhisperFeatureExtractor.from_pretrained(self.model_id) + + @cached_property + def speech_tokenizer(self) -> PreTrainedTokenizerFast: + return PreTrainedTokenizerFast.from_pretrained( + self.model_id, subfolder="speech_tokenizer" + ) + + def get_feature_extractor(self) 
-> WhisperFeatureExtractor: + return self.feature_extractor + + def get_speech_tokenizer(self) -> PreTrainedTokenizerFast: + return self.speech_tokenizer + + def get_data_parser(self): + return MultiModalDataParser( + target_sr=int(self.feature_extractor.sampling_rate), + target_channels=self.get_target_channels(), + expected_hidden_size=self._get_expected_hidden_size(), + ) + def get_supported_mm_limits(self) -> Mapping[str, int | None]: return {"audio": None} @@ -570,22 +593,6 @@ class FunAudioChatProcessingInfo(BaseProcessingInfo): max_audio_tokens = int(getattr(audio_cfg, "max_source_positions", 1500)) return {"audio": max_audio_tokens} - @cached_property - def feature_extractor(self) -> WhisperFeatureExtractor: - return WhisperFeatureExtractor.from_pretrained(self.model_id) - - @cached_property - def speech_tokenizer(self) -> PreTrainedTokenizerFast: - return PreTrainedTokenizerFast.from_pretrained( - self.model_id, subfolder="speech_tokenizer" - ) - - def get_feature_extractor(self) -> WhisperFeatureExtractor: - return self.feature_extractor - - def get_speech_tokenizer(self) -> PreTrainedTokenizerFast: - return self.speech_tokenizer - def get_audio_group_size(self) -> int: cfg = self.get_hf_config() audio_cfg = getattr(cfg, "audio_config", None) @@ -635,13 +642,6 @@ class FunAudioChatDummyInputsBuilder( class FunAudioChatMultiModalProcessor( BaseMultiModalProcessor[FunAudioChatProcessingInfo] ): - def _get_data_parser(self) -> MultiModalDataParser: - feature_extractor = self.info.get_feature_extractor() - return MultiModalDataParser( - target_sr=int(feature_extractor.sampling_rate), - target_channels=self.info.get_target_channels(), - ) - def _call_hf_processor( self, prompt: str, diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index aa44582ed..e436d2981 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -290,10 +290,8 @@ class 
Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]): # HF processor pops the `num_crops` kwarg, which is needed by vLLM if (images := mm_data.get("images")) is not None: - parsed_images = ( - self._get_data_parser() - .parse_mm_data({"image": images}) - .get_items("image", ImageProcessorItems) + parsed_images = self.data_parser.parse_mm_data({"image": images}).get_items( + "image", ImageProcessorItems ) image_sizes = [ parsed_images.get_image_size(i) for i in range(len(parsed_images)) diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py index 2e5287220..4b39877bb 100644 --- a/vllm/model_executor/models/gemma3n_mm.py +++ b/vllm/model_executor/models/gemma3n_mm.py @@ -107,6 +107,17 @@ class Gemma3nProcessingInfo(BaseProcessingInfo): def get_hf_processor(self, **kwargs: object): return self.ctx.get_hf_processor(Gemma3nProcessor, **kwargs) + def get_feature_extractor(self, **kwargs: object) -> Gemma3nAudioFeatureExtractor: + return self.get_hf_processor(**kwargs).feature_extractor + + def get_data_parser(self): + feature_extractor = self.get_feature_extractor() + + return MultiModalDataParser( + target_sr=feature_extractor.sampling_rate, + expected_hidden_size=self._get_expected_hidden_size(), + ) + def get_supported_mm_limits(self) -> Mapping[str, int | None]: return {"image": None, "audio": None} @@ -200,10 +211,6 @@ class Gemma3nDummyInputsBuilder(BaseDummyInputsBuilder[Gemma3nProcessingInfo]): class Gemma3nMultiModalProcessor(BaseMultiModalProcessor[Gemma3nProcessingInfo]): - def _get_data_parser(self) -> MultiModalDataParser: - feature_extractor = self.info.get_hf_processor().feature_extractor - return MultiModalDataParser(target_sr=feature_extractor.sampling_rate) - def _call_hf_processor( self, prompt: str, diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index a081641be..b6f5aae3a 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ 
b/vllm/model_executor/models/glm4_1v.py @@ -822,6 +822,12 @@ class Glm4vProcessingInfo(BaseProcessingInfo): def get_video_processor(self, **kwargs: object) -> Glm4vVideoProcessor: return self.get_hf_processor(**kwargs).video_processor + def get_data_parser(self): + return MultiModalDataParser( + video_needs_metadata=True, + expected_hidden_size=self._get_expected_hidden_size(), + ) + def _get_vision_info( self, *, @@ -1222,9 +1228,6 @@ class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]): class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): - def _get_data_parser(self) -> MultiModalDataParser: - return MultiModalDataParser(video_needs_metadata=True) - def _call_hf_processor( self, prompt: str, diff --git a/vllm/model_executor/models/glmasr.py b/vllm/model_executor/models/glmasr.py index bc70e0ad2..2651540d2 100644 --- a/vllm/model_executor/models/glmasr.py +++ b/vllm/model_executor/models/glmasr.py @@ -620,64 +620,6 @@ class GlmAsrMultiModalProjector(nn.Module): return hidden_states -class GlmAsrProcessingInfo(BaseProcessingInfo): - """ - Processing information provider for GLM-ASR model. - - Provides access to model configuration, processor, and feature extractor - needed for audio preprocessing and multimodal integration. - """ - - def get_hf_config(self) -> GlmAsrConfig: - return self.ctx.get_hf_config(GlmAsrConfig) - - def get_hf_processor(self, **kwargs: object) -> GlmAsrProcessor: - return self.ctx.get_hf_processor(GlmAsrProcessor, **kwargs) - - def get_feature_extractor(self, **kwargs: object) -> WhisperFeatureExtractor: - return self.get_hf_processor(**kwargs).feature_extractor - - def get_supported_mm_limits(self) -> Mapping[str, int | None]: - return {"audio": None} - - -class GlmAsrDummyInputsBuilder(BaseDummyInputsBuilder[GlmAsrProcessingInfo]): - """ - Builder for dummy inputs used in profiling and testing. 
- - Generates dummy text prompts and audio data that match the expected - format for GLM-ASR model inputs. Used for memory profiling and - performance benchmarking. - """ - - def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: - num_audios = mm_counts.get("audio", 0) - hf_processor = self.info.get_hf_processor() - return hf_processor.audio_token * num_audios - - def get_dummy_mm_data( - self, - seq_len: int, - mm_counts: Mapping[str, int], - mm_options: Mapping[str, BaseDummyOptions] | None = None, - ) -> MultiModalDataDict: - feature_extractor = self.info.get_feature_extractor() - sampling_rate = feature_extractor.sampling_rate - num_audios = mm_counts.get("audio", 0) - audio_overrides = mm_options.get("audio") if mm_options else None - - max_audio_len = getattr( - self.info.get_hf_processor(), "max_audio_len", DEFAULT_MAX_AUDIO_LEN_S - ) - audio_len = int(max_audio_len * sampling_rate) - - return { - "audio": self._get_dummy_audios( - length=audio_len, num_audios=num_audios, overrides=audio_overrides - ) - } - - def _glmasr_field_config( hf_inputs: Mapping[str, torch.Tensor], ) -> dict[str, MultiModalFieldConfig]: @@ -737,16 +679,78 @@ class GlmAsrMultiModalDataParser(MultiModalDataParser): return super()._parse_audio_data(data) +class GlmAsrProcessingInfo(BaseProcessingInfo): + """ + Processing information provider for GLM-ASR model. + + Provides access to model configuration, processor, and feature extractor + needed for audio preprocessing and multimodal integration. 
+ """ + + def get_hf_config(self) -> GlmAsrConfig: + return self.ctx.get_hf_config(GlmAsrConfig) + + def get_hf_processor(self, **kwargs: object) -> GlmAsrProcessor: + return self.ctx.get_hf_processor(GlmAsrProcessor, **kwargs) + + def get_feature_extractor(self, **kwargs: object) -> WhisperFeatureExtractor: + return self.get_hf_processor(**kwargs).feature_extractor + + def get_data_parser(self): + feature_extractor = self.get_feature_extractor() + + return GlmAsrMultiModalDataParser( + target_sr=feature_extractor.sampling_rate, + expected_hidden_size=self._get_expected_hidden_size(), + ) + + def get_supported_mm_limits(self) -> Mapping[str, int | None]: + return {"audio": None} + + +class GlmAsrDummyInputsBuilder(BaseDummyInputsBuilder[GlmAsrProcessingInfo]): + """ + Builder for dummy inputs used in profiling and testing. + + Generates dummy text prompts and audio data that match the expected + format for GLM-ASR model inputs. Used for memory profiling and + performance benchmarking. + """ + + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_audios = mm_counts.get("audio", 0) + hf_processor = self.info.get_hf_processor() + return hf_processor.audio_token * num_audios + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + mm_options: Mapping[str, BaseDummyOptions] | None = None, + ) -> MultiModalDataDict: + feature_extractor = self.info.get_feature_extractor() + sampling_rate = feature_extractor.sampling_rate + num_audios = mm_counts.get("audio", 0) + audio_overrides = mm_options.get("audio") if mm_options else None + + max_audio_len = getattr( + self.info.get_hf_processor(), "max_audio_len", DEFAULT_MAX_AUDIO_LEN_S + ) + audio_len = int(max_audio_len * sampling_rate) + + return { + "audio": self._get_dummy_audios( + length=audio_len, num_audios=num_audios, overrides=audio_overrides + ) + } + + class GlmAsrMultiModalProcessor(BaseMultiModalProcessor["GlmAsrProcessingInfo"]): """ GLM-ASR processor that inherits 
directly from BaseMultiModalProcessor for better performance and cleaner implementation. """ - def _get_data_parser(self) -> MultiModalDataParser: - feature_extractor = self.info.get_feature_extractor() - return GlmAsrMultiModalDataParser(target_sr=feature_extractor.sampling_rate) - def _calculate_chunk_counts( self, audio_list: list[Any], diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index 22296bf59..1f9b9d2c8 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py @@ -109,6 +109,14 @@ class GraniteSpeechAudioInputs(TensorSchema): class GraniteSpeechMultiModalProcessingInfo(BaseProcessingInfo): + def get_data_parser(self): + feature_extractor = self.get_hf_processor().audio_processor + + return MultiModalDataParser( + target_sr=feature_extractor.melspec_kwargs["sample_rate"], + expected_hidden_size=self._get_expected_hidden_size(), + ) + def get_supported_mm_limits(self) -> Mapping[str, int | None]: return {"audio": 1} @@ -127,11 +135,6 @@ class GraniteSpeechMultiModalProcessingInfo(BaseProcessingInfo): class GraniteSpeechMultiModalProcessor( BaseMultiModalProcessor[GraniteSpeechMultiModalProcessingInfo] ): - def _get_data_parser(self) -> MultiModalDataParser: - feature_extractor = self.info.get_hf_processor().audio_processor - sampling_rate = feature_extractor.melspec_kwargs["sample_rate"] - return MultiModalDataParser(target_sr=sampling_rate) - def _get_mm_fields_config( self, hf_inputs: BatchFeature, diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py index c389a0660..5c64d2822 100644 --- a/vllm/model_executor/models/hunyuan_vision.py +++ b/vllm/model_executor/models/hunyuan_vision.py @@ -599,6 +599,11 @@ class HunYuanVLProcessingInfo(BaseProcessingInfo): ) -> HunYuanVLProcessor: return self.get_hf_processor(**kwargs).image_processor + def get_data_parser(self): + return 
HunYuanVLMultiModalDataParser( + expected_hidden_size=self._get_expected_hidden_size(), + ) + def get_supported_mm_limits(self) -> Mapping[str, int | None]: return {"image": None} @@ -710,9 +715,6 @@ class HunYuanVLDummyInputsBuilder(BaseDummyInputsBuilder[HunYuanVLProcessingInfo class HunYuanVLMultiModalProcessor(BaseMultiModalProcessor[HunYuanVLProcessingInfo]): - def _get_data_parser(self) -> MultiModalDataParser: - return HunYuanVLMultiModalDataParser() - def _call_hf_processor( self, prompt: str, diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index dfd5869b4..7ea67d6b9 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -349,10 +349,8 @@ class Idefics3MultiModalProcessor(BaseMultiModalProcessor[Idefics3ProcessingInfo tok_kwargs, ) - parsed_images = ( - self._get_data_parser() - .parse_mm_data({"image": images}) - .get_items("image", ImageProcessorItems) + parsed_images = self.data_parser.parse_mm_data({"image": images}).get_items( + "image", ImageProcessorItems ) image_sizes = [ parsed_images.get_image_size(i) for i in range(len(parsed_images)) diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index d37b43102..e57e5c6f3 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -984,6 +984,11 @@ class KeyeProcessingInfo(BaseProcessingInfo): def get_image_processor(self, **kwargs: object): return self.get_hf_processor(**kwargs).image_processor + def get_data_parser(self): + return KeyeMultiModalDataParser( + expected_hidden_size=self._get_expected_hidden_size(), + ) + def get_supported_mm_limits( self, ) -> Mapping[str, int | None]: @@ -1183,13 +1188,11 @@ class KeyeBaseDummyInputsBuilder(BaseDummyInputsBuilder[_I]): return mm_data -class KeyeDummyInputsBuilder(KeyeBaseDummyInputsBuilder[KeyeProcessingInfo]): ... 
+class KeyeDummyInputsBuilder(KeyeBaseDummyInputsBuilder[KeyeProcessingInfo]): + pass class KeyeMultiModalProcessor(BaseMultiModalProcessor[KeyeProcessingInfo]): - def _get_data_parser(self) -> MultiModalDataParser: - return KeyeMultiModalDataParser() - def _get_prompt_updates( self, mm_items: MultiModalDataItems, diff --git a/vllm/model_executor/models/keye_vl1_5.py b/vllm/model_executor/models/keye_vl1_5.py index 2b04e3bd4..d304b245e 100644 --- a/vllm/model_executor/models/keye_vl1_5.py +++ b/vllm/model_executor/models/keye_vl1_5.py @@ -274,16 +274,6 @@ class KeyeVL1_5Projector(nn.Module): return hidden_states.view(*dims, -1) -class KeyeVL1_5ProcessingInfo(KeyeProcessingInfo): - def get_max_frame_per_video(self) -> int: - return 2048 - - def get_supported_mm_limits( - self, - ) -> Mapping[str, int | None]: - return {"image": None, "video": 1} - - def _keye_field_config( hf_inputs: Mapping[str, torch.Tensor], ): @@ -365,10 +355,22 @@ class KeyeVL1_5MultiModalDataParser(MultiModalDataParser): return super()._parse_video_data(data) -class KeyeVL1_5MultiModalProcessor(BaseMultiModalProcessor[KeyeVL1_5ProcessingInfo]): - def _get_data_parser(self) -> MultiModalDataParser: - return KeyeVL1_5MultiModalDataParser() +class KeyeVL1_5ProcessingInfo(KeyeProcessingInfo): + def get_data_parser(self): + return KeyeVL1_5MultiModalDataParser( + expected_hidden_size=self._get_expected_hidden_size(), + ) + def get_max_frame_per_video(self) -> int: + return 2048 + + def get_supported_mm_limits( + self, + ) -> Mapping[str, int | None]: + return {"image": None, "video": 1} + + +class KeyeVL1_5MultiModalProcessor(BaseMultiModalProcessor[KeyeVL1_5ProcessingInfo]): def _get_prompt_updates( self, mm_items: MultiModalDataItems, diff --git a/vllm/model_executor/models/lfm2_vl.py b/vllm/model_executor/models/lfm2_vl.py index c0fa8404f..db9b302f7 100644 --- a/vllm/model_executor/models/lfm2_vl.py +++ b/vllm/model_executor/models/lfm2_vl.py @@ -354,10 +354,8 @@ class 
Lfm2VLMultiModalProcessor(BaseMultiModalProcessor[Lfm2VLProcessingInfo]): tok_kwargs, ) - parsed_images = ( - self._get_data_parser() - .parse_mm_data({"image": images}) - .get_items("image", ImageProcessorItems) + parsed_images = self.data_parser.parse_mm_data({"image": images}).get_items( + "image", ImageProcessorItems ) image_sizes = [ parsed_images.get_image_size(i) for i in range(len(parsed_images)) diff --git a/vllm/model_executor/models/midashenglm.py b/vllm/model_executor/models/midashenglm.py index b2c23c29d..3f75e60fd 100644 --- a/vllm/model_executor/models/midashenglm.py +++ b/vllm/model_executor/models/midashenglm.py @@ -531,6 +531,14 @@ class MiDashengLMProcessingInfo(BaseProcessingInfo): feature_extractor = hf_processor.feature_extractor return feature_extractor + def get_data_parser(self): + feature_extractor = self.get_feature_extractor() + + return MultiModalDataParser( + target_sr=feature_extractor.sampling_rate, + expected_hidden_size=self._get_expected_hidden_size(), + ) + def get_supported_mm_limits(self) -> Mapping[str, int | None]: return {"audio": None} @@ -575,10 +583,6 @@ class MiDashengLMDummyInputsBuilder(BaseDummyInputsBuilder[MiDashengLMProcessing class MiDashengLMMultiModalProcessor( BaseMultiModalProcessor[MiDashengLMProcessingInfo] ): - def _get_data_parser(self) -> MultiModalDataParser: - feature_extractor = self.info.get_feature_extractor() - return MultiModalDataParser(target_sr=feature_extractor.sampling_rate) - def _call_hf_processor( self, prompt: str, diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py index c77ccca0a..4bf004106 100644 --- a/vllm/model_executor/models/minicpmo.py +++ b/vllm/model_executor/models/minicpmo.py @@ -53,7 +53,6 @@ from vllm.multimodal.parse import ( ModalityData, ModalityDataItems, MultiModalDataItems, - MultiModalDataParser, ) from vllm.multimodal.processing import ( PromptReplacement, @@ -174,6 +173,12 @@ class 
MiniCPMOMultiModalDataParser(MiniCPMVMultiModalDataParser): class MiniCPMOProcessingInfo(MiniCPMVProcessingInfo): audio_pattern = "()" + def get_data_parser(self): + return MiniCPMOMultiModalDataParser( + target_sr=self.get_default_audio_sampling_rate(), + expected_hidden_size=self._get_expected_hidden_size(), + ) + def get_supported_mm_limits(self) -> Mapping[str, int | None]: return {**super().get_supported_mm_limits(), "audio": None} @@ -274,11 +279,6 @@ class MiniCPMODummyInputsBuilder(MiniCPMVDummyInputsBuilder[MiniCPMOProcessingIn class MiniCPMOMultiModalProcessor(MiniCPMVMultiModalProcessor[MiniCPMOProcessingInfo]): - def _get_data_parser(self) -> MultiModalDataParser: - return MiniCPMOMultiModalDataParser( - target_sr=self.info.get_default_audio_sampling_rate() - ) - def get_audio_prompt_texts( self, audio_lens: int, @@ -300,10 +300,8 @@ class MiniCPMOMultiModalProcessor(MiniCPMVMultiModalProcessor[MiniCPMOProcessing if (audios := mm_data.get("audios")) is None: return {} - parsed_audios = ( - self._get_data_parser() - .parse_mm_data({"audio": audios}) - .get_items("audio", (MiniCPMOAudioEmbeddingItems, AudioProcessorItems)) + parsed_audios = self.data_parser.parse_mm_data({"audio": audios}).get_items( + "audio", (MiniCPMOAudioEmbeddingItems, AudioProcessorItems) ) if isinstance(parsed_audios, MiniCPMOAudioEmbeddingItems): diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 50a420759..bfeaf571f 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -557,6 +557,11 @@ class MiniCPMVProcessingInfo(BaseProcessingInfo): def get_image_processor(self, **kwargs: object): return self.get_hf_processor(**kwargs).image_processor + def get_data_parser(self): + return MiniCPMVMultiModalDataParser( + expected_hidden_size=self._get_expected_hidden_size(), + ) + def get_model_version(self): return get_version_by_config(self.get_hf_config()) @@ -736,9 +741,6 @@ class 
MiniCPMVDummyInputsBuilder(BaseDummyInputsBuilder[_I]): class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]): - def _get_data_parser(self) -> MultiModalDataParser: - return MiniCPMVMultiModalDataParser() - def get_image_prompt_texts(self, image_size: ImageSize, image_idx: int = 0) -> str: return self.info.get_slice_image_placeholder( image_size, @@ -765,10 +767,8 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]): if (images := mm_data.get("images")) is None: return {} - parsed_images = ( - self._get_data_parser() - .parse_mm_data({"image": images}) - .get_items("image", (MiniCPMVImageEmbeddingItems, ImageProcessorItems)) + parsed_images = self.data_parser.parse_mm_data({"image": images}).get_items( + "image", (MiniCPMVImageEmbeddingItems, ImageProcessorItems) ) if isinstance(parsed_images, MiniCPMVImageEmbeddingItems): @@ -793,10 +793,8 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]): if (videos := mm_data.get("videos")) is None: return {} - parsed_videos = ( - self._get_data_parser() - .parse_mm_data({"video": videos}) - .get_items("video", (MiniCPMVVideoEmbeddingItems, VideoProcessorItems)) + parsed_videos = self.data_parser.parse_mm_data({"video": videos}).get_items( + "video", (MiniCPMVVideoEmbeddingItems, VideoProcessorItems) ) if isinstance(parsed_videos, MiniCPMVVideoEmbeddingItems): diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 9de43f1e1..f9dffbfc1 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -620,10 +620,8 @@ class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo]) ) images = mm_data["images"] - parsed_images = ( - self._get_data_parser() - .parse_mm_data({"image": images}) - .get_items("image", ImageProcessorItems) + parsed_images = self.data_parser.parse_mm_data({"image": images}).get_items( + "image", ImageProcessorItems ) tile_size = vision_config.image_size diff --git 
a/vllm/model_executor/models/molmo2.py b/vllm/model_executor/models/molmo2.py index 6ded8e08c..cc718d6d5 100644 --- a/vllm/model_executor/models/molmo2.py +++ b/vllm/model_executor/models/molmo2.py @@ -1860,6 +1860,12 @@ def get_frame_times_and_chosen_fps( class Molmo2ProcessingInfo(BaseProcessingInfo): + def get_data_parser(self): + return MultiModalDataParser( + video_needs_metadata=True, + expected_hidden_size=self._get_expected_hidden_size(), + ) + def get_hf_processor(self, **kwargs: object) -> Molmo2ProcessorWrapper: processor = self.ctx.get_hf_processor(**kwargs) hf_config = self.ctx.get_hf_config() @@ -2183,9 +2189,6 @@ class Molmo2MultiModalProcessor(BaseMultiModalProcessor[Molmo2ProcessingInfo]): return prompt_tokens - def _get_data_parser(self) -> MultiModalDataParser: - return MultiModalDataParser(video_needs_metadata=True) - def _call_hf_processor( self, prompt: str, diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 21e2715fe..1c36b681f 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -1143,6 +1143,12 @@ class NanoNemotronVLProcessingInfo(BaseNanoNemotronVLProcessingInfo): def supports_video(self): return self.get_hf_processor().supports_video + def get_data_parser(self): + return MultiModalDataParser( + video_needs_metadata=True, + expected_hidden_size=self._get_expected_hidden_size(), + ) + def get_supported_mm_limits(self): video_limit = {"video": None} if self.supports_video else {} return {**super().get_supported_mm_limits(), **video_limit} @@ -1274,9 +1280,6 @@ class NanoNemotronVLMultiModalProcessor( ): """MultiModalProcessor extended for video support""" - def _get_data_parser(self) -> MultiModalDataParser: - return MultiModalDataParser(video_needs_metadata=True) - def _get_mm_fields_config( self, hf_inputs: BatchFeature, diff --git a/vllm/model_executor/models/opencua.py b/vllm/model_executor/models/opencua.py 
index cc860c939..abc196e22 100644 --- a/vllm/model_executor/models/opencua.py +++ b/vllm/model_executor/models/opencua.py @@ -25,7 +25,7 @@ from vllm.multimodal.inputs import ( MultiModalFieldConfig, MultiModalKwargsItems, ) -from vllm.multimodal.parse import MultiModalDataItems, MultiModalDataParser +from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import ( BaseMultiModalProcessor, PromptReplacement, @@ -53,6 +53,12 @@ from .utils import ( class OpenCUAProcessingInfo(Qwen2VLProcessingInfo): + def get_data_parser(self): + return Qwen2VLMultiModalDataParser( + self.get_hf_config().vision_config.spatial_merge_size, + expected_hidden_size=self._get_expected_hidden_size(), + ) + def get_hf_config(self): return self.ctx.get_hf_config() @@ -125,11 +131,6 @@ class OpenCUAProcessor(Qwen2VLProcessor): class OpenCUAMultiModalProcessor(BaseMultiModalProcessor[OpenCUAProcessingInfo]): - def _get_data_parser(self) -> MultiModalDataParser: - return Qwen2VLMultiModalDataParser( - self.info.get_hf_config().vision_config.spatial_merge_size - ) - def _get_mm_fields_config( self, hf_inputs: BatchFeature, diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 1d66fe2f1..97a29b353 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -568,6 +568,15 @@ class Phi4MMProcessingInfo(BaseProcessingInfo): def get_feature_extractor(self, **kwargs: object) -> SequenceFeatureExtractor: return self.get_hf_processor(**kwargs).audio_processor + def get_data_parser(self): + feature_extractor = self.get_feature_extractor() + + return MultiModalDataParser( + target_sr=feature_extractor.sampling_rate, + audio_resample_method="scipy", + expected_hidden_size=self._get_expected_hidden_size(), + ) + def get_supported_mm_limits(self) -> Mapping[str, int | None]: return {"audio": None, "image": None} @@ -844,12 +853,6 @@ class 
Phi4MMDummyInputsBuilder(BaseDummyInputsBuilder[Phi4MMProcessingInfo]): class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]): - def _get_data_parser(self) -> MultiModalDataParser: - feature_extractor = self.info.get_feature_extractor() - return MultiModalDataParser( - target_sr=feature_extractor.sampling_rate, audio_resample_method="scipy" - ) - def _call_hf_processor( self, prompt: str, diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index e4a9c0c10..5152a73de 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -77,7 +77,6 @@ from vllm.multimodal.parse import ( DictEmbeddingItems, ModalityDataItems, MultiModalDataItems, - MultiModalDataParser, ) from vllm.multimodal.processing import BaseDummyInputsBuilder from vllm.multimodal.processing.processor import ( @@ -227,6 +226,16 @@ class Qwen2_5OmniThinkerProcessingInfo( assert isinstance(feature_extractor, WhisperFeatureExtractor) return feature_extractor + def get_data_parser(self): + feature_extractor = self.get_feature_extractor() + + return Qwen2_5OmniThinkerMultiModalDataParser( + spatial_merge_size=self.get_hf_config().vision_config.spatial_merge_size, + target_sr=feature_extractor.sampling_rate, + target_channels=self.get_target_channels(), + expected_hidden_size=self._get_expected_hidden_size(), + ) + def get_target_channels(self) -> int: """Return target audio channels for Qwen2.5 Omni models (mono).""" return 1 @@ -310,14 +319,6 @@ class Qwen2_5OmniThinkerDummyInputsBuilder( class Qwen2_5OmniThinkerMultiModalProcessor( BaseMultiModalProcessor[Qwen2_5OmniThinkerProcessingInfo] ): - def _get_data_parser(self) -> MultiModalDataParser: - feature_extractor = self.info.get_feature_extractor() - return Qwen2_5OmniThinkerMultiModalDataParser( - spatial_merge_size=self.info.get_hf_config().vision_config.spatial_merge_size, - 
target_sr=feature_extractor.sampling_rate, - target_channels=self.info.get_target_channels(), - ) - def _call_hf_processor( self, prompt: str, diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index ab72a4482..2115d5140 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -127,6 +127,30 @@ def _get_feat_extract_output_lengths(input_lengths: torch.Tensor): return feat_lengths, output_lengths +def _qwen2audio_field_config(hf_inputs: Mapping[str, torch.Tensor]): + return dict( + audio_embeds=MultiModalFieldConfig.batched("audio"), + input_features=MultiModalFieldConfig.batched("audio"), + feature_attention_mask=MultiModalFieldConfig.batched("audio"), + ) + + +class Qwen2AudioMultiModalDataParser(MultiModalDataParser): + def _parse_audio_data( + self, + data: dict[str, torch.Tensor] | ModalityData[AudioItem], + ) -> ModalityDataItems[Any, Any] | None: + if isinstance(data, dict): + return DictEmbeddingItems( + data, + modality="audio", + required_fields={"audio_embeds"}, + fields_factory=_qwen2audio_field_config, + ) + + return super()._parse_audio_data(data) + + class Qwen2AudioProcessingInfo(BaseProcessingInfo): def get_hf_config(self): return self.ctx.get_hf_config(Qwen2AudioConfig) @@ -140,6 +164,15 @@ class Qwen2AudioProcessingInfo(BaseProcessingInfo): assert isinstance(feature_extractor, WhisperFeatureExtractor) return feature_extractor + def get_data_parser(self): + feature_extractor = self.get_feature_extractor() + + return Qwen2AudioMultiModalDataParser( + target_sr=feature_extractor.sampling_rate, + target_channels=self.get_target_channels(), + expected_hidden_size=self._get_expected_hidden_size(), + ) + def get_target_channels(self) -> int: """Return target audio channels for Qwen2 Audio models (mono).""" return 1 @@ -178,38 +211,7 @@ class Qwen2AudioDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2AudioProcessingIn } -def _qwen2audio_field_config(hf_inputs: 
Mapping[str, torch.Tensor]): - return dict( - audio_embeds=MultiModalFieldConfig.batched("audio"), - input_features=MultiModalFieldConfig.batched("audio"), - feature_attention_mask=MultiModalFieldConfig.batched("audio"), - ) - - -class Qwen2AudioMultiModalDataParser(MultiModalDataParser): - def _parse_audio_data( - self, - data: dict[str, torch.Tensor] | ModalityData[AudioItem], - ) -> ModalityDataItems[Any, Any] | None: - if isinstance(data, dict): - return DictEmbeddingItems( - data, - modality="audio", - required_fields={"audio_embeds"}, - fields_factory=_qwen2audio_field_config, - ) - - return super()._parse_audio_data(data) - - class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor[Qwen2AudioProcessingInfo]): - def _get_data_parser(self) -> MultiModalDataParser: - feature_extractor = self.info.get_feature_extractor() - return Qwen2AudioMultiModalDataParser( - target_sr=feature_extractor.sampling_rate, - target_channels=self.info.get_target_channels(), - ) - def _call_hf_processor( self, prompt: str, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 3cd69630d..c7c26c206 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -806,6 +806,12 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo): def get_image_processor(self, **kwargs: object) -> Qwen2VLImageProcessor: return self.get_hf_processor(**kwargs).image_processor + def get_data_parser(self): + return Qwen2VLMultiModalDataParser( + self.get_hf_config().vision_config.spatial_merge_size, + expected_hidden_size=self._get_expected_hidden_size(), + ) + def get_supported_mm_limits(self) -> Mapping[str, int | None]: return {"image": None, "video": None} @@ -1039,11 +1045,6 @@ class Qwen2VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen2VLProcessingInfo]): class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]): - def _get_data_parser(self) -> MultiModalDataParser: - return 
Qwen2VLMultiModalDataParser( - self.info.get_hf_config().vision_config.spatial_merge_size - ) - def _get_prompt_updates( self, mm_items: MultiModalDataItems, diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index bbab47044..977548339 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -81,7 +81,7 @@ from vllm.multimodal.inputs import ( PlaceholderRange, VideoItem, ) -from vllm.multimodal.parse import ImageSize, MultiModalDataItems, MultiModalDataParser +from vllm.multimodal.parse import ImageSize, MultiModalDataItems from vllm.multimodal.processing import ( BaseDummyInputsBuilder, BaseMultiModalProcessor, @@ -624,6 +624,13 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo): def get_video_processor(self, **kwargs: object) -> Qwen3VLVideoProcessor: return self.get_hf_processor(**kwargs).video_processor + def get_data_parser(self): + return Qwen2VLMultiModalDataParser( + self.get_hf_config().vision_config.spatial_merge_size, + video_needs_metadata=True, + expected_hidden_size=self._get_expected_hidden_size(), + ) + def _get_vision_info( self, *, @@ -901,12 +908,6 @@ class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]): class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo]): - def _get_data_parser(self) -> MultiModalDataParser: - return Qwen2VLMultiModalDataParser( - self.info.get_hf_config().vision_config.spatial_merge_size, - video_needs_metadata=True, - ) - def _call_hf_processor( self, prompt: str, diff --git a/vllm/model_executor/models/terratorch.py b/vllm/model_executor/models/terratorch.py index 42910a4f5..3565af74e 100644 --- a/vllm/model_executor/models/terratorch.py +++ b/vllm/model_executor/models/terratorch.py @@ -19,6 +19,7 @@ from collections import OrderedDict from collections.abc import Iterable, Mapping, Sequence +from functools import cached_property from typing import Any import torch @@ -38,7 +39,6 @@ 
from vllm.model_executor.layers.pooler import IdentityPooler from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.utils import AutoWeightsLoader from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.cache import MultiModalProcessorOnlyCache from vllm.multimodal.inputs import ( ImageItem, ModalityData, @@ -89,7 +89,45 @@ def _terratorch_field_factory(input_definition: InputDefinition): return _terratorch_field_config +class TerratorchMultiModalDataParser(MultiModalDataParser): + def __init__(self, input_definition: InputDefinition, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.input_definition = input_definition + + def _parse_image_data( + self, + data: dict[str, torch.Tensor] | ModalityData[ImageItem], + ) -> ModalityDataItems[Any, Any] | None: + if isinstance(data, dict): + return DictEmbeddingItems( + data, + modality="image", + required_fields=_terratorch_field_names(self.input_definition), + fields_factory=_terratorch_field_factory(self.input_definition), + ) + + return super()._parse_image_data(data) + + def parse_mm_data(self, mm_data: MultiModalDataDict) -> MultiModalDataItems: + if "image" not in mm_data: + mm_data = {"image": mm_data} + + return super().parse_mm_data(mm_data) + + class TerratorchProcessingInfo(BaseProcessingInfo): + @cached_property + def input_definition(self) -> InputDefinition: + pretrained_cfg = self.get_hf_config().to_dict()["pretrained_cfg"] + return InputDefinition(**pretrained_cfg["input"]) + + def get_data_parser(self): + return TerratorchMultiModalDataParser( + self.input_definition, + expected_hidden_size=self._get_expected_hidden_size(), + ) + def get_supported_mm_limits(self) -> Mapping[str, int | None]: return {"image": None} @@ -123,55 +161,13 @@ class TerratorchInputBuilder(BaseDummyInputsBuilder[TerratorchProcessingInfo]): return self.dummy_data_generator.get_dummy_mm_data() -class 
TerratorchMultiModalDataParser(MultiModalDataParser): - def __init__(self, input_definition: InputDefinition, *args, **kwargs): - super().__init__(*args, **kwargs) - - self.input_definition = input_definition - - def _parse_image_data( - self, - data: dict[str, torch.Tensor] | ModalityData[ImageItem], - ) -> ModalityDataItems[Any, Any] | None: - if isinstance(data, dict): - return DictEmbeddingItems( - data, - modality="image", - required_fields=_terratorch_field_names(self.input_definition), - fields_factory=_terratorch_field_factory(self.input_definition), - ) - - return super()._parse_image_data(data) - - def parse_mm_data(self, mm_data: MultiModalDataDict) -> MultiModalDataItems: - if "image" not in mm_data: - mm_data = {"image": mm_data} - - return super().parse_mm_data(mm_data) - - -class TerratorchMultiModalProcessor(BaseMultiModalProcessor): - def __init__( - self, - info: TerratorchProcessingInfo, - dummy_inputs: "BaseDummyInputsBuilder[TerratorchProcessingInfo]", - *, - cache: MultiModalProcessorOnlyCache | None = None, - ) -> None: - pretrained_cfg = info.get_hf_config().to_dict()["pretrained_cfg"] - self._input_definition = InputDefinition(**pretrained_cfg["input"]) - - super().__init__(info=info, dummy_inputs=dummy_inputs, cache=cache) - - def _get_data_parser(self) -> MultiModalDataParser: - return TerratorchMultiModalDataParser(self._input_definition) - +class TerratorchMultiModalProcessor(BaseMultiModalProcessor[TerratorchProcessingInfo]): def _get_mm_fields_config( self, hf_inputs: BatchFeature, hf_processor_mm_kwargs: Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: - return _terratorch_field_factory(self._input_definition)(hf_inputs) + return _terratorch_field_factory(self.info.input_definition)(hf_inputs) def _get_prompt_updates( self, diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index e8962c8bb..124d65761 100644 --- a/vllm/model_executor/models/ultravox.py +++ 
b/vllm/model_executor/models/ultravox.py @@ -133,6 +133,15 @@ class UltravoxProcessingInfo(BaseProcessingInfo): assert isinstance(feature_extractor, WhisperFeatureExtractor) return feature_extractor + def get_data_parser(self): + feature_extractor = self.get_feature_extractor() + + return MultiModalDataParser( + target_sr=feature_extractor.sampling_rate, + target_channels=self.get_target_channels(), + expected_hidden_size=self._get_expected_hidden_size(), + ) + def get_target_channels(self) -> int: """Return target audio channels for Ultravox models (mono).""" return 1 @@ -171,13 +180,6 @@ class UltravoxDummyInputsBuilder(BaseDummyInputsBuilder[UltravoxProcessingInfo]) class UltravoxMultiModalProcessor(BaseMultiModalProcessor[UltravoxProcessingInfo]): - def _get_data_parser(self) -> MultiModalDataParser: - feature_extractor = self.info.get_feature_extractor() - return MultiModalDataParser( - target_sr=feature_extractor.sampling_rate, - target_channels=self.info.get_target_channels(), - ) - def _call_hf_processor( self, prompt: str, diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index aa0e616a9..c828aa7e5 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -203,6 +203,12 @@ class VoxtralProcessingInfo(BaseProcessingInfo): def get_hf_processor(self) -> VoxtralProcessorAdapter: return VoxtralProcessorAdapter(self.get_tokenizer()) + def get_data_parser(self): + return MultiModalDataParser( + target_sr=self.get_hf_processor().sampling_rate, + expected_hidden_size=self._get_expected_hidden_size(), + ) + def get_supported_mm_limits(self) -> Mapping[str, int | None]: return {"audio": 5} # Performance tends to degrade after 5 @@ -335,10 +341,6 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo]) # NOTE: The tokens are already inserted by the chat template return prompt_ids, mm_info, True - def _get_data_parser(self) -> MultiModalDataParser: - 
sampling_rate = self.info.get_hf_processor().sampling_rate - return MultiModalDataParser(target_sr=sampling_rate) - @MULTIMODAL_REGISTRY.register_processor( VoxtralMultiModalProcessor, diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index b254a5308..d9952ce43 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -644,6 +644,15 @@ class WhisperProcessingInfo(BaseProcessingInfo): def get_hf_config(self) -> WhisperConfig: return self.ctx.get_hf_config(WhisperConfig) + def get_data_parser(self): + feature_extractor = self.get_feature_extractor() + + return MultiModalDataParser( + target_sr=feature_extractor.sampling_rate, + target_channels=self.get_target_channels(), + expected_hidden_size=self._get_expected_hidden_size(), + ) + @property def skip_prompt_length_check(self) -> bool: return True # Because the encoder prompt is padded @@ -693,13 +702,6 @@ class WhisperDummyInputsBuilder(BaseDummyInputsBuilder[WhisperProcessingInfo]): class WhisperMultiModalProcessor(EncDecMultiModalProcessor[WhisperProcessingInfo]): - def _get_data_parser(self) -> MultiModalDataParser: - feature_extractor = self.info.get_feature_extractor() - return MultiModalDataParser( - target_sr=feature_extractor.sampling_rate, - target_channels=self.info.get_target_channels(), - ) - def create_encoder_prompt( self, prompt: str | list[int], diff --git a/vllm/multimodal/processing/context.py b/vllm/multimodal/processing/context.py index d4894a984..62ad1dc3e 100644 --- a/vllm/multimodal/processing/context.py +++ b/vllm/multimodal/processing/context.py @@ -17,6 +17,7 @@ import torch from typing_extensions import TypeVar from vllm.logger import init_logger +from vllm.multimodal.parse import MultiModalDataParser from vllm.tokenizers import TokenizerLike from vllm.transformers_utils.processor import cached_processor_from_config from vllm.utils.func_utils import get_allowed_kwarg_only_overrides @@ -569,6 +570,35 @@ class 
BaseProcessingInfo: """ return self.ctx.get_hf_processor(**kwargs) + def _get_expected_hidden_size(self) -> int | None: + """ + Get expected hidden size for embedding validation if `mm_embeds` are enabled. + + This validates hidden dimensions to prevent a vulnerability where embeddings + with correct `ndim` but wrong `shape` could cause crashes at inference time. + """ + model_config = self.ctx.model_config + mm_config = model_config.get_multimodal_config() + + if mm_config.enable_mm_embeds: + return model_config.get_inputs_embeds_size() + + return None + + def get_data_parser(self) -> MultiModalDataParser: + """ + Constructs a parser to preprocess multi-modal data items + before passing them to + [`_get_hf_mm_data`][vllm.multimodal.processing.BaseMultiModalProcessor._get_hf_mm_data]. + + You can support additional modalities by creating a subclass + of [`MultiModalDataParser`][vllm.multimodal.parse.MultiModalDataParser] + that has additional subparsers. + """ + return MultiModalDataParser( + expected_hidden_size=self._get_expected_hidden_size(), + ) + @property def skip_prompt_length_check(self) -> bool: return False diff --git a/vllm/multimodal/processing/processor.py b/vllm/multimodal/processing/processor.py index 1b8039f76..c2776f7f8 100644 --- a/vllm/multimodal/processing/processor.py +++ b/vllm/multimodal/processing/processor.py @@ -40,7 +40,6 @@ from ..parse import ( DictEmbeddingItems, EmbeddingItems, MultiModalDataItems, - MultiModalDataParser, ) from .context import ( BaseProcessingInfo, @@ -990,7 +989,16 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): self.dummy_inputs = dummy_inputs self.cache = cache - self.data_parser = self._get_data_parser() + if hasattr(self, "_get_data_parser"): + logger.warning_once( + "BaseMultiModalProcessor._get_data_parser is deprecated " + "and will be removed in v0.16. " + "You should override `info.get_data_parser` instead." 
+ ) + + self.data_parser = self._get_data_parser() # type: ignore + else: + self.data_parser = self.info.get_data_parser() # Avoid unnecessary recomputation self._supported_mm_limits = self.info.get_supported_mm_limits() @@ -1014,26 +1022,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): ) -> MultiModalInputs: return self.apply(prompt, mm_data, hf_processor_mm_kwargs, mm_uuids=mm_uuids) - def _get_data_parser(self) -> MultiModalDataParser: - """ - Construct a parser to preprocess multi-modal data items - before passing them to - [`_get_hf_mm_data`][vllm.multimodal.processing.BaseMultiModalProcessor._get_hf_mm_data]. - - You can support additional modalities by creating a subclass - of [`MultiModalDataParser`][vllm.multimodal.parse.MultiModalDataParser] - that has additional subparsers. - """ - # Get expected hidden size for embedding validation if mm_embeds enabled - # This validates hidden dimensions to prevent vulnerabilities: embeddings - # with correct ndim but wrong shape could cause crashes at inference time - mm_config = self.info.ctx.model_config.get_multimodal_config() - expected_hidden_size = None - if mm_config.enable_mm_embeds: - expected_hidden_size = self.info.ctx.model_config.get_inputs_embeds_size() - - return MultiModalDataParser(expected_hidden_size=expected_hidden_size) - def validate_num_items( self, modality: str,