diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 9983015b0..a0ef99133 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -597,19 +597,26 @@ class NanoNemotronVLMultiModalProcessor( def _extract_audio_from_videos( self, mm_items: MultiModalDataItems, - ) -> tuple[MultiModalDataItems, list[AudioItem]]: + ) -> tuple[MultiModalDataItems, list[AudioItem], list[bool]]: """Extract audio tracks from video bytes in *mm_items*. + Videos whose bytes are missing or that contain no audio stream are + silently skipped. The returned *has_audio* mask is aligned with + the video list so callers know which ``