From df2503e125f3c869b0f274e64530d09bf01ea30d Mon Sep 17 00:00:00 2001 From: Andrii Skliar Date: Thu, 9 Apr 2026 13:44:39 +0200 Subject: [PATCH] nemotron-nano-vl: Allow `use_audio_in_video` to be passed at `vllm serve` time (#38538) Signed-off-by: Andrii Skliar Co-authored-by: Andrii Skliar --- .../model_executor/models/nano_nemotron_vl.py | 90 +++++++++++++++---- .../processors/nano_nemotron_vl.py | 2 + 2 files changed, 74 insertions(+), 18 deletions(-) diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 9983015b0..a0ef99133 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -597,19 +597,26 @@ class NanoNemotronVLMultiModalProcessor( def _extract_audio_from_videos( self, mm_items: MultiModalDataItems, - ) -> tuple[MultiModalDataItems, list[AudioItem]]: + ) -> tuple[MultiModalDataItems, list[AudioItem], list[bool]]: """Extract audio tracks from video bytes in *mm_items*. + Videos whose bytes are missing or that contain no audio stream are + silently skipped. The returned *has_audio* mask is aligned with + the video list so callers know which ``