[Multimodal][Qwen3 Omni] Make Qwen3 Omni work with audio-in-video inputs in V1 engine. (#27721)
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
Signed-off-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Roger Wang <hey@rogerw.io>
This commit is contained in:
@@ -23,7 +23,6 @@
|
||||
"""Inference-only Qwen2.5-Omni model (thinker part)."""
|
||||
|
||||
from collections.abc import Callable, Iterable, Mapping, Sequence
|
||||
from copy import copy
|
||||
from functools import partial
|
||||
from typing import Annotated, Any, Literal
|
||||
|
||||
@@ -387,15 +386,6 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
|
||||
self._validate_mm_kwargs(mm_kwargs, mm_item_counts)
|
||||
self._validate_mm_updates(mm_prompt_updates, mm_item_counts)
|
||||
|
||||
use_audio_in_video = False
|
||||
if "video" in mm_kwargs:
|
||||
video_items = [item for item in mm_kwargs["video"] if item is not None]
|
||||
# only check video items (if there are any)
|
||||
if video_items:
|
||||
use_audio_in_video = all(
|
||||
item["use_audio_in_video"].data for item in video_items
|
||||
)
|
||||
|
||||
if is_update_applied:
|
||||
mm_placeholders = self._find_mm_placeholders(
|
||||
prompt_ids,
|
||||
@@ -404,7 +394,6 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
|
||||
self._validate_mm_placeholders(
|
||||
mm_placeholders,
|
||||
mm_item_counts,
|
||||
use_audio_in_video=use_audio_in_video,
|
||||
)
|
||||
else:
|
||||
prompt_ids, mm_placeholders = self._apply_prompt_updates(
|
||||
@@ -414,7 +403,6 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
|
||||
self._validate_mm_placeholders(
|
||||
mm_placeholders,
|
||||
mm_item_counts,
|
||||
use_audio_in_video=use_audio_in_video,
|
||||
)
|
||||
|
||||
return prompt_ids, mm_placeholders
|
||||
@@ -640,19 +628,6 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
|
||||
|
||||
return mm_processed_data
|
||||
|
||||
def _validate_mm_placeholders(
    self,
    mm_placeholders: Mapping[str, list[PlaceholderFeaturesInfo]],
    mm_item_counts: Mapping[str, int],
    use_audio_in_video: bool = False,
) -> None:
    """Validate placeholders, adjusting the expected audio count first.

    When ``use_audio_in_video`` is true and ``mm_item_counts`` contains a
    ``"video"`` entry, the ``"audio"`` count is reduced by the ``"video"``
    count before delegating to the parent implementation.
    NOTE(review): presumably this is because audio embedded in a video
    does not get a placeholder of its own -- confirm against the
    prompt-update logic.

    Args:
        mm_placeholders: Placeholders found per modality; forwarded
            unchanged to the parent implementation.
        mm_item_counts: Number of items per modality. Never mutated; an
            adjusted copy is passed on when needed.
        use_audio_in_video: Whether to apply the audio-count adjustment.

    Raises:
        ValueError: If the adjustment applies (``"video"`` is present) but
            ``mm_item_counts`` has no ``"audio"`` entry.
    """
    if use_audio_in_video and "video" in mm_item_counts:
        if "audio" not in mm_item_counts:
            # Explicit raise instead of `assert`, which is stripped
            # under `python -O`.
            raise ValueError(
                "use_audio_in_video requires an 'audio' entry in "
                "mm_item_counts when 'video' items are present"
            )
        # The parameter is annotated as a read-only Mapping, so build a
        # fresh dict rather than mutating a shallow copy of whatever
        # mapping type the caller supplied.
        adjusted_counts = dict(mm_item_counts)
        adjusted_counts["audio"] -= adjusted_counts["video"]
        mm_item_counts = adjusted_counts

    super()._validate_mm_placeholders(mm_placeholders, mm_item_counts)
|
||||
|
||||
|
||||
class Qwen2_5OmniConditionalGenerationMixin:
|
||||
def _parse_and_validate_audio_input(
|
||||
|
||||
Reference in New Issue
Block a user