nemotron-nano-vl: Allow use_audio_in_video to be passed at vllm serve time (#38538)
Signed-off-by: Andrii Skliar <askliar@nvidia.com> Co-authored-by: Andrii Skliar <askliar@nvidia.com>
This commit is contained in:
@@ -597,19 +597,26 @@ class NanoNemotronVLMultiModalProcessor(
|
|||||||
def _extract_audio_from_videos(
|
def _extract_audio_from_videos(
|
||||||
self,
|
self,
|
||||||
mm_items: MultiModalDataItems,
|
mm_items: MultiModalDataItems,
|
||||||
) -> tuple[MultiModalDataItems, list[AudioItem]]:
|
) -> tuple[MultiModalDataItems, list[AudioItem], list[bool]]:
|
||||||
"""Extract audio tracks from video bytes in *mm_items*.
|
"""Extract audio tracks from video bytes in *mm_items*.
|
||||||
|
|
||||||
|
Videos whose bytes are missing raise ``ValueError``. Videos whose
|
||||||
|
audio cannot be loaded (e.g. no audio stream) are skipped. The
|
||||||
|
returned *has_audio* mask is aligned with the video list so callers
|
||||||
|
know which ``<video>`` tokens need an accompanying audio context.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
The augmented *mm_items* (with audio added) and the list of
|
A 3-tuple of (augmented mm_items, extracted audio items,
|
||||||
extracted audio items.
|
per-video boolean mask indicating which videos have audio).
|
||||||
"""
|
"""
|
||||||
videos = mm_items.get_items("video", VideoProcessorItems)
|
videos = mm_items.get_items("video", VideoProcessorItems)
|
||||||
assert isinstance(videos.metadata, list)
|
assert isinstance(videos.metadata, list)
|
||||||
|
|
||||||
metadata_list = videos.metadata
|
metadata_list = videos.metadata
|
||||||
|
|
||||||
audio_items: list[AudioItem] = []
|
audio_items: list[AudioItem] = []
|
||||||
for metadata in metadata_list:
|
has_audio: list[bool] = []
|
||||||
|
for idx, metadata in enumerate(metadata_list):
|
||||||
video_bytes = metadata.get("original_video_bytes")
|
video_bytes = metadata.get("original_video_bytes")
|
||||||
if video_bytes is None or len(video_bytes) == 0:
|
if video_bytes is None or len(video_bytes) == 0:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
@@ -618,7 +625,16 @@ class NanoNemotronVLMultiModalProcessor(
|
|||||||
"video must be loaded with keep_video_bytes=True (e.g. via "
|
"video must be loaded with keep_video_bytes=True (e.g. via "
|
||||||
"the chat API with a model that sets use_audio_in_video)."
|
"the chat API with a model that sets use_audio_in_video)."
|
||||||
)
|
)
|
||||||
audio_items.append(load_audio_pyav(BytesIO(video_bytes)))
|
try:
|
||||||
|
audio_items.append(load_audio_pyav(BytesIO(video_bytes)))
|
||||||
|
has_audio.append(True)
|
||||||
|
except Exception:
|
||||||
|
logger.debug(
|
||||||
|
"Video %d: no audio stream found, skipping audio extraction.",
|
||||||
|
idx,
|
||||||
|
exc_info=True,
|
||||||
|
)
|
||||||
|
has_audio.append(False)
|
||||||
|
|
||||||
# Create a new VideoProcessorItems with metadata that does not contain
|
# Create a new VideoProcessorItems with metadata that does not contain
|
||||||
# the large video bytes, to avoid modifying the input `mm_items`.
|
# the large video bytes, to avoid modifying the input `mm_items`.
|
||||||
@@ -628,45 +644,83 @@ class NanoNemotronVLMultiModalProcessor(
|
|||||||
]
|
]
|
||||||
new_videos = VideoProcessorItems(data=videos.data, metadata=new_metadata_list)
|
new_videos = VideoProcessorItems(data=videos.data, metadata=new_metadata_list)
|
||||||
|
|
||||||
audio_parsed = self.data_parser.parse_mm_data({"audio": audio_items})
|
audio_parsed = {}
|
||||||
|
if audio_items:
|
||||||
|
audio_parsed = self.data_parser.parse_mm_data({"audio": audio_items})
|
||||||
|
|
||||||
# Create a new MultiModalDataItems with the new video and audio items.
|
# Create a new MultiModalDataItems with the new video and audio items.
|
||||||
new_mm_items_dict = {**mm_items, **audio_parsed, "video": new_videos}
|
new_mm_items_dict = {**mm_items, **audio_parsed, "video": new_videos}
|
||||||
mm_items = MultiModalDataItems(new_mm_items_dict)
|
mm_items = MultiModalDataItems(new_mm_items_dict)
|
||||||
|
|
||||||
return mm_items, audio_items
|
return mm_items, audio_items, has_audio
|
||||||
|
|
||||||
def apply(
|
def apply(
|
||||||
self,
|
self,
|
||||||
inputs: ProcessorInputs,
|
inputs: ProcessorInputs,
|
||||||
timing_ctx: TimingContext,
|
timing_ctx: TimingContext,
|
||||||
) -> MultiModalInput:
|
) -> MultiModalInput:
|
||||||
use_audio_in_video = bool(
|
mm_config = self.info.ctx.model_config.get_multimodal_config()
|
||||||
inputs.hf_processor_mm_kwargs.get("use_audio_in_video", False)
|
merged_kwargs = mm_config.merge_mm_processor_kwargs(
|
||||||
|
inputs.hf_processor_mm_kwargs
|
||||||
)
|
)
|
||||||
|
use_audio_in_video = bool(merged_kwargs.get("use_audio_in_video", False))
|
||||||
|
|
||||||
inputs.hf_processor_mm_kwargs = {
|
inputs.hf_processor_mm_kwargs = {
|
||||||
k: v
|
k: v
|
||||||
for k, v in inputs.hf_processor_mm_kwargs.items()
|
for k, v in inputs.hf_processor_mm_kwargs.items()
|
||||||
if k != "use_audio_in_video"
|
if k != "use_audio_in_video"
|
||||||
}
|
}
|
||||||
|
|
||||||
if not (
|
if not (use_audio_in_video and "video" in inputs.mm_data_items):
|
||||||
use_audio_in_video
|
|
||||||
and "video" in inputs.mm_data_items
|
|
||||||
and "audio" not in inputs.mm_data_items
|
|
||||||
):
|
|
||||||
return super().apply(inputs, timing_ctx)
|
return super().apply(inputs, timing_ctx)
|
||||||
|
|
||||||
mm_items, audio_items = self._extract_audio_from_videos(inputs.mm_data_items)
|
mm_items = inputs.mm_data_items
|
||||||
inputs.mm_data_items = mm_items
|
if "audio" in mm_items:
|
||||||
|
# Audio was pre-populated by upstream (e.g., OpenAI chat endpoint).
|
||||||
|
# Reuse existing audio items; validate 1:1 correspondence.
|
||||||
|
videos = mm_items.get_items("video", VideoProcessorItems)
|
||||||
|
audios = mm_items.get_items("audio", AudioProcessorItems)
|
||||||
|
if len(audios) != len(videos):
|
||||||
|
raise ValueError(
|
||||||
|
"use_audio_in_video requires equal number of audio and "
|
||||||
|
f"video items, got num_audios={len(audios)}, "
|
||||||
|
f"num_videos={len(videos)}"
|
||||||
|
)
|
||||||
|
audio_items = audios.get_all()
|
||||||
|
has_audio = [True] * len(videos)
|
||||||
|
logger.info(
|
||||||
|
"Using %d pre-populated audio item(s) from upstream.",
|
||||||
|
len(audio_items),
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Extract audio from video bytes (library usage path).
|
||||||
|
mm_items, audio_items, has_audio = self._extract_audio_from_videos(mm_items)
|
||||||
|
inputs.mm_data_items = mm_items
|
||||||
|
logger.info(
|
||||||
|
"Extracted audio from video bytes: %d audio(s), has_audio=%s.",
|
||||||
|
len(audio_items),
|
||||||
|
has_audio,
|
||||||
|
)
|
||||||
|
|
||||||
|
if not audio_items:
|
||||||
|
return super().apply(inputs, timing_ctx)
|
||||||
|
|
||||||
prompt = inputs.prompt
|
prompt = inputs.prompt
|
||||||
tokenizer = self.info.get_tokenizer()
|
tokenizer = self.info.get_tokenizer()
|
||||||
if not isinstance(prompt, str):
|
if not isinstance(prompt, str):
|
||||||
prompt = tokenizer.decode(prompt, skip_special_tokens=False)
|
prompt = tokenizer.decode(prompt, skip_special_tokens=False)
|
||||||
|
|
||||||
for _ in audio_items:
|
# Inject AUDIO_CONTEXT only after <video> tokens whose video
|
||||||
prompt = prompt.replace("<video>", "<video>" + AUDIO_CONTEXT, 1)
|
# actually contained an audio stream (preserving video-audio pairing).
|
||||||
|
tag = "<video>"
|
||||||
|
head, *rest = prompt.split(tag)
|
||||||
|
rebuilt = [head]
|
||||||
|
for append_audio, part in zip(has_audio, rest, strict=True):
|
||||||
|
rebuilt.append(tag)
|
||||||
|
if append_audio:
|
||||||
|
rebuilt.append(AUDIO_CONTEXT)
|
||||||
|
rebuilt.append(part)
|
||||||
|
prompt = "".join(rebuilt)
|
||||||
|
|
||||||
inputs.prompt = tokenizer.encode(prompt, add_special_tokens=False)
|
inputs.prompt = tokenizer.encode(prompt, add_special_tokens=False)
|
||||||
|
|
||||||
|
|||||||
@@ -771,6 +771,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
|
|||||||
max_num_tiles: int | None = None,
|
max_num_tiles: int | None = None,
|
||||||
video_token: str | None = None,
|
video_token: str | None = None,
|
||||||
video_pruning_rate: float | None = None,
|
video_pruning_rate: float | None = None,
|
||||||
|
use_audio_in_video: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
super().__init__(
|
super().__init__(
|
||||||
config=config,
|
config=config,
|
||||||
@@ -781,6 +782,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
|
|||||||
# add extra video token for video processing
|
# add extra video token for video processing
|
||||||
self.video_token = video_token
|
self.video_token = video_token
|
||||||
self.video_pruning_rate = video_pruning_rate
|
self.video_pruning_rate = video_pruning_rate
|
||||||
|
self.use_audio_in_video = use_audio_in_video
|
||||||
|
|
||||||
# Video params live exclusively in vision_config
|
# Video params live exclusively in vision_config
|
||||||
vision_config = getattr(config, "vision_config", config)
|
vision_config = getattr(config, "vision_config", config)
|
||||||
|
|||||||
Reference in New Issue
Block a user