[Misc] Add online audio_in_video test (#36775)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
Isotr0py
2026-03-15 15:14:11 +08:00
committed by GitHub
parent 6590a3ecda
commit 143e4dccdf
5 changed files with 100 additions and 1 deletion

View File

@@ -10,6 +10,7 @@ pytest-cov
# testing utils
albumentations # required for Nemotron Parse in test_common.py
av # required for audio_in_video tests
backoff # required for phi4mm test
blobfile # required for kimi-vl test
einops # required for MPT, qwen-vl

View File

@@ -62,6 +62,8 @@ attrs==24.2.0
# referencing
audioread==3.0.1
# via librosa
av==16.1.0
# via -r requirements/test.in
backoff==2.2.1
# via
# -r requirements/test.in

View File

@@ -0,0 +1,80 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import json
import openai
import pytest
import pytest_asyncio
from ...conftest import VideoTestAssets
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen2.5-Omni-3B"
@pytest.fixture
def server():
    """Launch a RemoteOpenAIServer for the Omni model used by these tests.

    Eager mode keeps startup cheap, and the per-prompt multimodal limit
    allows exactly one audio and one video item per request.
    """
    server_args = [
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--limit-mm-per-prompt",
        json.dumps({"audio": 1, "video": 1}),
    ]
    with RemoteOpenAIServer(MODEL_NAME, server_args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Yield an async OpenAI-compatible client bound to the running server."""
    async with server.get_async_client() as api_client:
        yield api_client
@pytest.mark.core_model
@pytest.mark.asyncio
async def test_online_audio_in_video(
    client: openai.AsyncOpenAI, video_assets: VideoTestAssets
):
    """Test video input with `audio_in_video=True`"""
    # The shared video_urls fixtures lack an audio stream, so read the raw
    # asset file from disk and inline it as a base64 data URL instead.
    video_path = video_assets[0].video_path
    with open(video_path, "rb") as video_file:
        video_base64 = base64.b64encode(video_file.read()).decode("utf-8")

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What's in this video?"},
                {
                    "type": "video_url",
                    "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
                },
            ],
        }
    ]

    # Send the identical request twice: the second round exercises the
    # multimodal processor cache on the server side.
    for _ in range(2):
        chat_completion = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            max_tokens=16,
            extra_body={
                "mm_processor_kwargs": {
                    "use_audio_in_video": True,
                }
            },
        )
        assert len(chat_completion.choices) == 1
        # max_tokens=16 should always be hit before a natural stop.
        assert chat_completion.choices[0].finish_reason == "length"

View File

@@ -4,6 +4,7 @@ import base64
from pathlib import Path
from unittest.mock import patch
import librosa
import numpy as np
import pytest
@@ -71,3 +72,13 @@ def test_audio_media_io_encode_base64(dummy_audio):
decoded = base64.b64decode(out)
assert decoded == b"dummy_wav_data"
mock_write.assert_called_once()
def test_audio_media_io_from_video(video_assets):
    """AudioMediaIO.load_bytes should extract the audio track from a video.

    The decoded waveform and sample rate are compared against librosa's
    reference decode of the same file.
    """
    media_io = AudioMediaIO()
    video_path = video_assets[0].video_path

    with open(video_path, "rb") as video_file:
        audio, sr = media_io.load_bytes(video_file.read())

    audio_ref, sr_ref = librosa.load(video_path, sr=None)
    assert sr == sr_ref
    # Small tolerance: decoders may differ by sub-audible rounding error.
    np.testing.assert_allclose(audio_ref, audio, atol=1e-4)

View File

@@ -506,6 +506,7 @@ class OpenAIServingRender:
(ResponsesRequest not supported here); TODO comment dropped accordingly.
"""
renderer = self.renderer
mm_config = self.model_config.multimodal_config
default_template_kwargs = merge_kwargs(
default_template_kwargs,
@@ -518,7 +519,11 @@ class OpenAIServingRender:
tok_params = request.build_tok_params(self.model_config)
chat_params = request.build_chat_params(
default_template, default_template_content_format
).with_defaults(default_template_kwargs)
).with_defaults(
default_template_kwargs,
default_media_io_kwargs=(mm_config.media_io_kwargs if mm_config else None),
default_mm_processor_kwargs=getattr(request, "mm_processor_kwargs", None),
)
(conversation,), (engine_prompt,) = await renderer.render_chat_async(
[messages],