[Misc] Add online audio_in_video test (#36775)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
@@ -10,6 +10,7 @@ pytest-cov
|
||||
|
||||
# testing utils
|
||||
albumentations # required for Nemotron Parse in test_common.py
|
||||
av # required for audio_in_video tests
|
||||
backoff # required for phi4mm test
|
||||
blobfile # required for kimi-vl test
|
||||
einops # required for MPT, qwen-vl
|
||||
|
||||
@@ -62,6 +62,8 @@ attrs==24.2.0
|
||||
# referencing
|
||||
audioread==3.0.1
|
||||
# via librosa
|
||||
av==16.1.0
|
||||
# via -r requirements/test.in
|
||||
backoff==2.2.1
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
|
||||
80
tests/entrypoints/openai/test_audio_in_video.py
Normal file
80
tests/entrypoints/openai/test_audio_in_video.py
Normal file
@@ -0,0 +1,80 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import base64
|
||||
import json
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from ...conftest import VideoTestAssets
|
||||
from ...utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "Qwen/Qwen2.5-Omni-3B"
|
||||
|
||||
|
||||
@pytest.fixture
def server():
    """Start a remote OpenAI-compatible vLLM server serving the Omni model.

    Limits the prompt to one audio and one video item so the
    audio-in-video path is exercised with minimal resources.
    """
    mm_limits = json.dumps({"audio": 1, "video": 1})
    server_args = [
        "--max-model-len",
        "8192",
        "--enforce-eager",
        "--limit-mm-per-prompt",
        mm_limits,
    ]

    with RemoteOpenAIServer(MODEL_NAME, server_args) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server):
    """Yield an async OpenAI client connected to the running test server."""
    async with server.get_async_client() as api_client:
        yield api_client
|
||||
|
||||
|
||||
@pytest.mark.core_model
@pytest.mark.asyncio
async def test_online_audio_in_video(
    client: openai.AsyncOpenAI, video_assets: VideoTestAssets
):
    """Test video input with `audio_in_video=True`"""

    # The shared video_urls fixtures lack an audio stream, so inline the raw
    # asset file as a base64 data URL instead.
    video_path = video_assets[0].video_path
    with open(video_path, "rb") as video_file:
        encoded_video = base64.b64encode(video_file.read()).decode("utf-8")

    text_part = {"type": "text", "text": "What's in this video?"}
    video_part = {
        "type": "video_url",
        "video_url": {"url": f"data:video/mp4;base64,{encoded_video}"},
    }
    messages = [{"role": "user", "content": [text_part, video_part]}]

    # Issue the identical request twice so the second pass also exercises
    # the multimodal processor cache.
    for _ in range(2):
        response = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            max_tokens=16,
            extra_body={
                "mm_processor_kwargs": {
                    "use_audio_in_video": True,
                }
            },
        )

        assert len(response.choices) == 1
        first_choice = response.choices[0]
        # max_tokens=16 should always be hit before a natural stop.
        assert first_choice.finish_reason == "length"
|
||||
@@ -4,6 +4,7 @@ import base64
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
import librosa
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
@@ -71,3 +72,13 @@ def test_audio_media_io_encode_base64(dummy_audio):
|
||||
decoded = base64.b64decode(out)
|
||||
assert decoded == b"dummy_wav_data"
|
||||
mock_write.assert_called_once()
|
||||
|
||||
|
||||
def test_audio_media_io_from_video(video_assets):
    """AudioMediaIO.load_bytes on raw video bytes should decode the audio
    track identically to librosa loading the same file directly."""
    media_io = AudioMediaIO()
    video_path = video_assets[0].video_path
    with open(video_path, "rb") as video_file:
        waveform, sample_rate = media_io.load_bytes(video_file.read())

    expected_waveform, expected_sr = librosa.load(video_path, sr=None)
    assert sample_rate == expected_sr
    np.testing.assert_allclose(expected_waveform, waveform, atol=1e-4)
|
||||
|
||||
@@ -506,6 +506,7 @@ class OpenAIServingRender:
|
||||
(ResponsesRequest not supported here); TODO comment dropped accordingly.
|
||||
"""
|
||||
renderer = self.renderer
|
||||
mm_config = self.model_config.multimodal_config
|
||||
|
||||
default_template_kwargs = merge_kwargs(
|
||||
default_template_kwargs,
|
||||
@@ -518,7 +519,11 @@ class OpenAIServingRender:
|
||||
tok_params = request.build_tok_params(self.model_config)
|
||||
chat_params = request.build_chat_params(
|
||||
default_template, default_template_content_format
|
||||
).with_defaults(default_template_kwargs)
|
||||
).with_defaults(
|
||||
default_template_kwargs,
|
||||
default_media_io_kwargs=(mm_config.media_io_kwargs if mm_config else None),
|
||||
default_mm_processor_kwargs=getattr(request, "mm_processor_kwargs", None),
|
||||
)
|
||||
|
||||
(conversation,), (engine_prompt,) = await renderer.render_chat_async(
|
||||
[messages],
|
||||
|
||||
Reference in New Issue
Block a user