[Misc] Add online audio_in_video test (#36775)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
Isotr0py
2026-03-15 15:14:11 +08:00
committed by GitHub
parent 6590a3ecda
commit 143e4dccdf
5 changed files with 100 additions and 1 deletion

View File

@@ -10,6 +10,7 @@ pytest-cov
# testing utils
albumentations # required for Nemotron Parse in test_common.py
av # required for audio_in_video tests
backoff # required for phi4mm test
blobfile # required for kimi-vl test
einops # required for MPT, qwen-vl

View File

@@ -62,6 +62,8 @@ attrs==24.2.0
# referencing
audioread==3.0.1
# via librosa
av==16.1.0
# via -r requirements/test.in
backoff==2.2.1
# via
# -r requirements/test.in

View File

@@ -0,0 +1,80 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import json
import openai
import pytest
import pytest_asyncio
from ...conftest import VideoTestAssets
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen2.5-Omni-3B"
@pytest.fixture
def server():
    """Launch a RemoteOpenAIServer for the Omni model used by these tests.

    Eager mode keeps startup cheap, and the per-prompt multimodal limit
    allows exactly one audio and one video item per request.
    """
    server_args = [
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--limit-mm-per-prompt",
        json.dumps({"audio": 1, "video": 1}),
    ]
    with RemoteOpenAIServer(MODEL_NAME, server_args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Yield an async OpenAI-compatible client bound to the running server."""
    async with server.get_async_client() as api_client:
        yield api_client
@pytest.mark.core_model
@pytest.mark.asyncio
async def test_online_audio_in_video(
    client: openai.AsyncOpenAI, video_assets: VideoTestAssets
):
    """Test video input with `audio_in_video=True`"""
    # The shared video_urls fixtures lack an audio stream, so read the raw
    # asset file from disk and inline it as a base64 data URL instead.
    video_path = video_assets[0].video_path
    with open(video_path, "rb") as video_file:
        video_base64 = base64.b64encode(video_file.read()).decode("utf-8")

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What's in this video?"},
                {
                    "type": "video_url",
                    "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
                },
            ],
        }
    ]

    # Send the identical request twice: the second round exercises the
    # multimodal processor cache on the server side.
    for _ in range(2):
        chat_completion = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            max_tokens=16,
            extra_body={
                "mm_processor_kwargs": {
                    "use_audio_in_video": True,
                }
            },
        )
        assert len(chat_completion.choices) == 1
        # max_tokens=16 should always be hit before a natural stop.
        assert chat_completion.choices[0].finish_reason == "length"

View File

@@ -4,6 +4,7 @@ import base64
from pathlib import Path
from unittest.mock import patch
import librosa
import numpy as np
import pytest
@@ -71,3 +72,13 @@ def test_audio_media_io_encode_base64(dummy_audio):
decoded = base64.b64decode(out)
assert decoded == b"dummy_wav_data"
mock_write.assert_called_once()
def test_audio_media_io_from_video(video_assets):
    """AudioMediaIO.load_bytes should extract the audio track from a video.

    The decoded waveform and sample rate are compared against librosa's
    reference decode of the same file.
    """
    media_io = AudioMediaIO()
    video_path = video_assets[0].video_path

    with open(video_path, "rb") as video_file:
        audio, sr = media_io.load_bytes(video_file.read())

    audio_ref, sr_ref = librosa.load(video_path, sr=None)
    assert sr == sr_ref
    # Small tolerance: decoders may differ by sub-audible rounding error.
    np.testing.assert_allclose(audio_ref, audio, atol=1e-4)

View File

@@ -506,6 +506,7 @@ class OpenAIServingRender:
(ResponsesRequest not supported here); TODO comment dropped accordingly.
"""
renderer = self.renderer
mm_config = self.model_config.multimodal_config
default_template_kwargs = merge_kwargs(
default_template_kwargs,
@@ -518,7 +519,11 @@ class OpenAIServingRender:
tok_params = request.build_tok_params(self.model_config)
chat_params = request.build_chat_params(
default_template, default_template_content_format
).with_defaults(default_template_kwargs)
).with_defaults(
default_template_kwargs,
default_media_io_kwargs=(mm_config.media_io_kwargs if mm_config else None),
default_mm_processor_kwargs=getattr(request, "mm_processor_kwargs", None),
)
(conversation,), (engine_prompt,) = await renderer.render_chat_async(
[messages],