diff --git a/setup.py b/setup.py
index 556a511a3..f31b4cf24 100644
--- a/setup.py
+++ b/setup.py
@@ -1056,6 +1056,7 @@ setup(
             "scipy",
             "soundfile",
             "mistral_common[audio]",
+            "av",
         ],  # Required for audio processing
         "video": [],  # Kept for backwards compatibility
         "flashinfer": [],  # Kept for backwards compatibility
diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py
index ef241d545..ec03d283f 100644
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -622,6 +622,15 @@ class NemotronHForCausalLMConfig(VerifyAndUpdateConfig):
         cache_config.mamba_ssm_cache_dtype = mamba_ssm_cache_dtype
 
 
+class NemotronHNanoVLV2Config(VerifyAndUpdateConfig):
+    @staticmethod
+    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
+        mm_config = model_config.multimodal_config
+        if mm_config is not None:
+            video_kwargs = mm_config.media_io_kwargs.setdefault("video", {})
+            video_kwargs.setdefault("video_backend", "nemotron_vl")
+
+
 class Qwen3_5ForConditionalGenerationConfig(VerifyAndUpdateConfig):
     @staticmethod
     def verify_and_update_config(vllm_config: "VllmConfig") -> None:
@@ -661,6 +670,7 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
     "GteNewModel": GteNewModelConfig,
     "GteNewForSequenceClassification": GteNewModelConfig,
     "Gemma3TextModel": Gemma3TextModelConfig,
+    "NemotronH_Nano_VL_V2": NemotronHNanoVLV2Config,
     "LlamaBidirectionalForSequenceClassification": LlamaBidirectionalConfig,
     "LlamaBidirectionalModel": LlamaBidirectionalConfig,
     "LlamaNemotronVLModel": LlamaNemotronVLConfig,
diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index 82422e89f..9b9beadc0 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -59,9 +59,11 @@ from vllm.multimodal.inputs import (
     AudioItem,
     MultiModalDataDict,
     MultiModalFieldConfig,
+    MultiModalInputs,
     MultiModalKwargsItems,
     VideoItem,
 )
+from vllm.multimodal.media.audio import extract_audio_from_video_bytes
 from vllm.multimodal.parse import (
     AudioProcessorItems,
     ImageEmbeddingItems,
@@ -69,8 +71,13 @@ from vllm.multimodal.parse import (
     ImageSize,
     MultiModalDataItems,
     MultiModalDataParser,
+    VideoProcessorItems,
+)
+from vllm.multimodal.processing import (
+    BaseDummyInputsBuilder,
+    ProcessorInputs,
+    TimingContext,
 )
-from vllm.multimodal.processing import BaseDummyInputsBuilder
 from vllm.multimodal.processing.processor import (
     BaseMultiModalProcessor,
     BaseProcessingInfo,
@@ -1381,6 +1388,127 @@ class NanoNemotronVLMultiModalProcessor(
 ):
     """MultiModalProcessor extended for video support"""
 
+    def _extract_audio_from_videos(
+        self,
+        mm_items: MultiModalDataItems,
+    ) -> tuple[MultiModalDataItems, list[AudioItem]]:
+        """Extract audio tracks from video bytes in *mm_items*.
+
+        Returns:
+            The augmented *mm_items* (with audio added) and the list of
+            extracted audio items.
+        """
+        videos = mm_items.get_items("video", VideoProcessorItems)
+        assert isinstance(videos.metadata, list)
+        metadata_list = videos.metadata
+
+        audio_items: list[AudioItem] = []
+        for metadata in metadata_list:
+            video_bytes = metadata.get("original_video_bytes")
+            if video_bytes is None or len(video_bytes) == 0:
+                raise ValueError(
+                    "Cannot extract audio from video: original_video_bytes is "
+                    "missing or empty. When using use_audio_in_video=True, "
+                    "video must be loaded with keep_video_bytes=True (e.g. via "
+                    "the chat API with a model that sets use_audio_in_video)."
+                )
+            audio_items.append(extract_audio_from_video_bytes(video_bytes))
+
+        # Create a new VideoProcessorItems with metadata that does not contain
+        # the large video bytes, to avoid modifying the input `mm_items`.
+        new_metadata_list = [
+            {k: v for k, v in meta.items() if k != "original_video_bytes"}
+            for meta in metadata_list
+        ]
+        new_videos = VideoProcessorItems(data=videos.data, metadata=new_metadata_list)
+
+        audio_parsed = self.data_parser.parse_mm_data({"audio": audio_items})
+
+        # Create a new MultiModalDataItems with the new video and audio items.
+        new_mm_items_dict = {**mm_items, **audio_parsed, "video": new_videos}
+        mm_items = MultiModalDataItems(new_mm_items_dict)
+
+        return mm_items, audio_items
+
+    def apply(
+        self,
+        processor_inputs: ProcessorInputs,
+        timing_ctx: TimingContext | None = None,
+    ) -> MultiModalInputs:
+        if (hf_processor_mm_kwargs := processor_inputs.hf_processor_mm_kwargs) is None:
+            hf_processor_mm_kwargs = {}
+
+        use_audio_in_video = bool(
+            hf_processor_mm_kwargs.get("use_audio_in_video", False)
+        )
+
+        hf_processor_mm_kwargs = {
+            k: v
+            for k, v in hf_processor_mm_kwargs.items()
+            if k != "use_audio_in_video"
+        }
+
+        processor_inputs.hf_processor_mm_kwargs = hf_processor_mm_kwargs
+
+        if not (
+            use_audio_in_video
+            and "video" in processor_inputs.mm_data_items
+            and "audio" not in processor_inputs.mm_data_items
+        ):
+            return super().apply(
+                processor_inputs,
+                timing_ctx,
+            )
+
+        mm_items, audio_items = self._extract_audio_from_videos(
+            processor_inputs.mm_data_items
+        )
+        processor_inputs.mm_data_items = mm_items
+
+        prompt = processor_inputs.prompt
+        tokenizer = self.info.get_tokenizer()
+        if not isinstance(prompt, str):
+            prompt = tokenizer.decode(prompt, skip_special_tokens=False)
+
+        for _ in audio_items:
+            prompt = prompt.replace("
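
Reviewer note, not part of the diff: below is a minimal offline-inference sketch of how the new `use_audio_in_video` path could be exercised. Only the `use_audio_in_video` kwarg (consumed and stripped by the `apply()` override above) and the `nemotron_vl` video backend default come from the diff; the model ID and the choice to pass `mm_processor_kwargs` at engine construction are assumptions for illustration.

```python
# Hypothetical usage sketch; the model ID and kwarg plumbing are assumptions.
from vllm import LLM

llm = LLM(
    model="nvidia/NemotronH-Nano-VL-V2",  # hypothetical HF repo ID
    # Forwarded to the multimodal processor; the apply() override above reads
    # use_audio_in_video and strips it before the HF processor is invoked.
    mm_processor_kwargs={"use_audio_in_video": True},
)

outputs = llm.chat(
    [
        {
            "role": "user",
            "content": [
                # The video must be loaded with its raw bytes kept so that
                # _extract_audio_from_videos() can demux the audio track.
                {"type": "video_url", "video_url": {"url": "file:///tmp/clip.mp4"}},
                {"type": "text", "text": "Describe this video, including its audio."},
            ],
        }
    ]
)
print(outputs[0].outputs[0].text)
```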