[Multimodal][Qwen3 Omni] Make Qwen3 Omni work with audio-in-video inputs in V1 engine. (#27721)
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Roger Wang <hey@rogerw.io>
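For context, a minimal offline-inference sketch of the feature this commit enables. The checkpoint name, prompt template, and dummy media below are illustrative assumptions, not part of the diff; the relevant part is passing the clip's audio track alongside its frames and enabling use_audio_in_video via mm_processor_kwargs:

import numpy as np

from vllm import LLM, SamplingParams

# Dummy stand-ins for a real clip: 16 RGB frames plus the clip's own audio
# track as a (waveform, sample_rate) tuple. Real inputs would both be
# decoded from the same video file.
video = np.zeros((16, 224, 224, 3), dtype=np.uint8)
audio = (np.zeros(16000, dtype=np.float32), 16000)

llm = LLM(
    model="Qwen/Qwen3-Omni-30B-A3B-Instruct",  # assumed checkpoint name
    limit_mm_per_prompt={"audio": 1, "video": 1},
)

outputs = llm.generate(
    {
        # Assumed Qwen-Omni-style placeholder template.
        "prompt": "<|im_start|>user\n<|vision_bos|><|VIDEO|><|vision_eos|>"
        "Describe the clip.<|im_end|>\n<|im_start|>assistant\n",
        "multi_modal_data": {"video": video, "audio": audio},
        "mm_processor_kwargs": {"use_audio_in_video": True},
    },
    SamplingParams(max_tokens=128),
)
print(outputs[0].outputs[0].text)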
--- a/vllm/model_executor/models/qwen2_5_omni_thinker.py
+++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py
@@ -23,7 +23,6 @@
 """Inference-only Qwen2.5-Omni model (thinker part)."""
 
 from collections.abc import Callable, Iterable, Mapping, Sequence
-from copy import copy
 from functools import partial
 from typing import Annotated, Any, Literal
 
@@ -387,15 +386,6 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
         self._validate_mm_kwargs(mm_kwargs, mm_item_counts)
         self._validate_mm_updates(mm_prompt_updates, mm_item_counts)
 
-        use_audio_in_video = False
-        if "video" in mm_kwargs:
-            video_items = [item for item in mm_kwargs["video"] if item is not None]
-            # only check video items (if there are any)
-            if video_items:
-                use_audio_in_video = all(
-                    item["use_audio_in_video"].data for item in video_items
-                )
-
         if is_update_applied:
             mm_placeholders = self._find_mm_placeholders(
                 prompt_ids,
@@ -404,7 +394,6 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
             self._validate_mm_placeholders(
                 mm_placeholders,
                 mm_item_counts,
-                use_audio_in_video=use_audio_in_video,
             )
         else:
             prompt_ids, mm_placeholders = self._apply_prompt_updates(
@@ -414,7 +403,6 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
             self._validate_mm_placeholders(
                 mm_placeholders,
                 mm_item_counts,
-                use_audio_in_video=use_audio_in_video,
             )
 
         return prompt_ids, mm_placeholders
@@ -640,19 +628,6 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
         return mm_processed_data
 
-    def _validate_mm_placeholders(
-        self,
-        mm_placeholders: Mapping[str, list[PlaceholderFeaturesInfo]],
-        mm_item_counts: Mapping[str, int],
-        use_audio_in_video: bool = False,
-    ) -> None:
-        if use_audio_in_video:
-            mm_item_counts = copy(mm_item_counts)
-            if "video" in mm_item_counts:
-                assert "audio" in mm_item_counts
-                mm_item_counts["audio"] -= mm_item_counts["video"]
-        super()._validate_mm_placeholders(mm_placeholders, mm_item_counts)
-
 
 class Qwen2_5OmniConditionalGenerationMixin:
     def _parse_and_validate_audio_input(
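A small sketch (with made-up counts) of the accounting performed by the override removed above: when the audio comes from the videos themselves, each video consumes one expected audio item, so the standalone audio count shrinks by one per video:

# Made-up counts: 2 videos carrying their own audio plus 1 standalone audio.
mm_item_counts = {"audio": 3, "video": 2}
use_audio_in_video = True

if use_audio_in_video and "video" in mm_item_counts:
    assert "audio" in mm_item_counts
    mm_item_counts = dict(mm_item_counts)  # avoid mutating the caller's mapping
    mm_item_counts["audio"] -= mm_item_counts["video"]

print(mm_item_counts)  # {'audio': 1, 'video': 2}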
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -68,11 +68,11 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalKwargsItems
 from vllm.multimodal.parse import AudioProcessorItems, MultiModalDataItems
 from vllm.multimodal.processing import (
     BaseMultiModalProcessor,
+    MultiModalPromptUpdates,
     PlaceholderFeaturesInfo,
     PromptReplacement,
     PromptUpdate,
     PromptUpdateDetails,
 )
 from vllm.sequence import IntermediateTensors
@@ -87,7 +87,6 @@ from .qwen2_5_omni_thinker import (
     Qwen2_5OmniConditionalGenerationMixin,
     Qwen2_5OmniThinkerDummyInputsBuilder,
     Qwen2_5OmniThinkerMultiModalProcessor,
     Qwen2_5OmniThinkerProcessingInfo,
 )
 from .qwen2_5_vl import (
     Qwen2_5_VisionAttention,
@@ -807,24 +806,8 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
         else:
             use_audio_in_video = False
 
-        if use_audio_in_video and "video" in mm_item_counts:
-            assert "audio" in mm_item_counts
-            mm_item_counts["audio"] -= mm_item_counts["video"]
-
-        # Special case with `use_audio_in_video=True`
-        if use_audio_in_video:
-            if is_update_applied:
-                prompt_ids = self._get_raw_input_ids(prompt_ids, use_audio_in_video)
-            (
-                prompt_ids,
-                mm_placeholders,
-            ) = self._apply_prompt_updates(
-                prompt_ids,
-                mm_prompt_updates,
-            )
-            self._validate_mm_placeholders(mm_placeholders, mm_item_counts)
-        # normal case with `use_audio_in_video=False`
-        elif is_update_applied:
+        if is_update_applied:
             mm_placeholders = self._find_mm_placeholders(
                 prompt_ids,
                 mm_prompt_updates,
@@ -834,10 +817,24 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
                 mm_item_counts,
             )
         else:
-            prompt_ids, mm_placeholders = self._apply_prompt_updates(
-                prompt_ids,
-                mm_prompt_updates,
-            )
+            if use_audio_in_video and "audio" in mm_prompt_updates:
+                filtered_updates = {
+                    k: v for k, v in mm_prompt_updates.items() if k != "audio"
+                }
+                prompt_ids, mm_placeholders = self._apply_prompt_updates(
+                    prompt_ids,
+                    filtered_updates,
+                )
+                # Derive audio placeholders from video placeholders
+                mm_placeholders = self._derive_audio_from_video_placeholders(
+                    mm_placeholders, mm_prompt_updates
+                )
+            else:
+                prompt_ids, mm_placeholders = self._apply_prompt_updates(
+                    prompt_ids,
+                    mm_prompt_updates,
+                )
 
         self._validate_mm_placeholders(
             mm_placeholders,
             mm_item_counts,
@@ -962,7 +959,9 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
         def get_replacement_qwen2_use_audio_in_video(item_idx: int):
             nonlocal audio_in_video_item_idx
-            audio_num_features = audio_output_lengths[audio_item_idx + item_idx]
+            audio_num_features = audio_output_lengths[
+                audio_in_video_item_idx + item_idx
+            ]
             video_grid_thw = out_mm_data["video_grid_thw"][item_idx]
 
             audio_in_video_item_idx += 1
@@ -971,14 +970,17 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
             if second_per_grid_ts:
                 video_second_per_grid_t = second_per_grid_ts[item_idx]
             else:
-                video_second_per_grid_t = 1.0
+                video_second_per_grid_t = 2.0
 
-            return self.get_updates_use_audio_in_video(
+            placeholder = self.get_updates_use_audio_in_video(
                 thinker_config=thinker_config,
                 audio_len=audio_num_features,
                 video_grid_thw=video_grid_thw,
                 video_second_per_grid_t=video_second_per_grid_t,
             )
+            return PromptUpdateDetails.select_token_id(
+                placeholder, embed_token_id=video_token_id
+            )
 
         video_replacement_fn = (
             get_replacement_qwen2_use_audio_in_video
@@ -1004,14 +1006,50 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
             ),
         ]
 
-    def _validate_mm_placeholders(
+    def _derive_audio_from_video_placeholders(
         self,
-        mm_placeholders: Mapping[str, list[PlaceholderFeaturesInfo]],
-        mm_item_counts: Mapping[str, int],
-    ) -> None:
-        BaseMultiModalProcessor[
-            Qwen2_5OmniThinkerProcessingInfo
-        ]._validate_mm_placeholders(self, mm_placeholders, mm_item_counts)
+        placeholders: Mapping[str, list[PlaceholderFeaturesInfo]],
+        mm_prompt_updates: MultiModalPromptUpdates,
+    ) -> Mapping[str, list[PlaceholderFeaturesInfo]]:
+        """
+        Helper to derive audio placeholders from video placeholders when
+        use_audio_in_video=True.
+        """
+        if "video" not in placeholders:
+            return placeholders
+
+        # Validate audio and video counts match
+        num_videos = len(placeholders["video"])
+        num_audios = len(mm_prompt_updates.get("audio", []))
+        if num_audios != num_videos:
+            raise ValueError(
+                f"use_audio_in_video requires equal number of audio and video items, "
+                f"got {num_audios=}, {num_videos=}"
+            )
+
+        tokenizer = self.info.get_tokenizer()
+        processor = self.info.get_hf_processor()
+        audio_token_id = tokenizer.get_vocab()[processor.audio_token]
+
+        result_placeholders = dict(placeholders)
+        audio_placeholders = []
+
+        # Each video is paired with one audio
+        for video_idx, video_placeholder in enumerate(placeholders["video"]):
+            # Create is_embed mask selecting only audio tokens
+            audio_is_embed = torch.tensor(video_placeholder.tokens) == audio_token_id
+
+            audio_placeholder = PlaceholderFeaturesInfo(
+                modality="audio",
+                item_idx=video_idx,
+                start_idx=video_placeholder.start_idx,
+                tokens=video_placeholder.tokens,
+                is_embed=audio_is_embed,
+            )
+            audio_placeholders.append(audio_placeholder)
+
+        result_placeholders["audio"] = audio_placeholders
+        return result_placeholders
 
     def _get_raw_input_ids(
         self,
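A standalone sketch (with made-up token IDs) of the mask trick used by _derive_audio_from_video_placeholders above: the derived audio placeholder reuses the video placeholder's token span, but its is_embed mask selects only the interleaved audio positions:

import torch

AUDIO_TOKEN_ID = 151646  # made-up IDs, for illustration only
VIDEO_TOKEN_ID = 151656

# Interleaved tokens of one video placeholder when use_audio_in_video=True.
tokens = [VIDEO_TOKEN_ID, AUDIO_TOKEN_ID, VIDEO_TOKEN_ID, AUDIO_TOKEN_ID]

audio_is_embed = torch.tensor(tokens) == AUDIO_TOKEN_ID
print(audio_is_embed)  # tensor([False,  True, False,  True])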
@@ -1454,7 +1492,11 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
         )
 
         if not len(second_per_grid_ts) and len(video_grid_thw):
-            second_per_grids = torch.ones(len(video_grid_thw), dtype=torch.float32)
+            second_per_grid_ts = 2.0
+            second_per_grids = (
+                torch.ones(len(video_grid_thw), dtype=torch.float32)
+                * second_per_grid_ts
+            )
         else:
             second_per_grids = torch.tensor(second_per_grid_ts, dtype=torch.float32)
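The hunk above changes the fallback used when the processor reports no second_per_grid_ts: each temporal grid step is now assumed to span 2.0 seconds instead of 1.0. A runnable sketch with made-up grid shapes:

import torch

video_grid_thw = [(4, 36, 36), (8, 36, 36)]  # made-up (t, h, w) grids
second_per_grid_ts: list[float] = []  # processor supplied no timing info

if not len(second_per_grid_ts) and len(video_grid_thw):
    seconds_per_step = 2.0  # the new default
    second_per_grids = (
        torch.ones(len(video_grid_thw), dtype=torch.float32) * seconds_per_step
    )
else:
    second_per_grids = torch.tensor(second_per_grid_ts, dtype=torch.float32)

print(second_per_grids)  # tensor([2., 2.])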