[Doc] Show that use_audio_in_video is supported in docs (#30837)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2026-01-06 15:27:19 +08:00
committed by GitHub
parent 1fb0209bbc
commit da71d44410
4 changed files with 0 additions and 8 deletions

View File

@@ -767,9 +767,6 @@ Some models are supported only via the [Transformers modeling backend](#transfor
The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now.
For more details, please see: <https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630>
!!! note
For Qwen2.5-Omni and Qwen3-Omni, reading audio from video pre-processing (`--mm-processor-kwargs '{"use_audio_in_video": true}'`) is currently work in progress and not yet supported.
#### Transcription
Speech2Text models trained specifically for Automatic Speech Recognition.

View File

@@ -10,7 +10,6 @@ python examples/offline_inference/qwen2_5_omni/only_thinker.py \
-q mixed_modalities
# Read vision and audio inputs from a single video file
# NOTE: V1 engine does not support interleaved modalities yet.
python examples/offline_inference/qwen2_5_omni/only_thinker.py \
-q use_audio_in_video

View File

@@ -1128,8 +1128,6 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
multimodal_embeddings += tuple(audio_embeddings)
return multimodal_embeddings
# TODO (ywang96): support overlapping modality embeddings so that
# `use_audio_in_video` will work on V1.
def embed_input_ids(
self,
input_ids: torch.Tensor,

View File

@@ -1371,8 +1371,6 @@ class Qwen3OmniMoeThinkerForConditionalGeneration(
return inputs_embeds
deepstack_input_embeds = None
# TODO (ywang96): support overlapping modality embeddings so that
# `use_audio_in_video` will work on V1.
# split the feat dim to obtain multi-scale visual feature
has_vision_embeddings = [
embeddings.shape[-1] != self.config.text_config.hidden_size