[Bugfix] Fix Qwen Omni audio inference (#27920)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-11-02 13:06:05 +08:00
parent 758ea2e980
commit 853a8eb53b
2 changed files with 2 additions and 10 deletions
--- a/vllm/model_executor/models/qwen2_5_omni_thinker.py
+++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py
@@ -130,6 +130,8 @@ class Qwen2_5OmniAudioFeatureInputs(TensorSchema):
        TensorShape("nmb", "tsl", dynamic_dims={"tsl"}),
    ]

+    audio_feature_lengths: Annotated[torch.Tensor, TensorShape("na")]
+
    feature_attention_mask: Annotated[
        torch.Tensor | list[torch.Tensor],
        TensorShape("na", "msl", dynamic_dims={"msl"}),
@@ -732,13 +734,6 @@ class Qwen2_5OmniConditionalGenerationMixin:
        input_features = audio_input["input_features"]
        audio_feature_lengths = audio_input["audio_feature_lengths"]

-        if audio_feature_lengths.shape[0] == 1:
-            audio_feature_lengths = audio_feature_lengths.squeeze(0)
-        elif audio_feature_lengths.shape[1] == 1:
-            audio_feature_lengths = audio_feature_lengths.squeeze(1)
-        else:
-            raise AssertionError(audio_feature_lengths.shape)
-
        audio_feat_lengths, audio_output_lengths = (
            self.audio_tower._get_feat_extract_output_lengths(audio_feature_lengths)
        )