vllm/tests/entrypoints/openai/chat_completion/test_audio_in_video.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import json

import openai
import pybase64 as base64
import pytest
import pytest_asyncio

from tests.conftest import VideoTestAssets
from tests.utils import ROCM_EXTRA_ARGS, RemoteOpenAIServer

MODEL_NAME = "Qwen/Qwen2.5-Omni-3B"


@pytest.fixture
def server():
    args = [
        "--max-model-len",
        "16384",
        "--enforce-eager",
        "--limit-mm-per-prompt",
        json.dumps({"audio": 3, "video": 3}),
        *ROCM_EXTRA_ARGS,
    ]

    with RemoteOpenAIServer(
        MODEL_NAME,
        args,
    ) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(server):
    async with server.get_async_client() as async_client:
        yield async_client


@pytest.mark.core_model
@pytest.mark.asyncio
async def test_online_audio_in_video(
    client: openai.AsyncOpenAI, video_assets: VideoTestAssets
):
    """Test video input with `audio_in_video=True`"""

    # we don't use video_urls above because they missed audio stream.
    video_path = video_assets[0].video_path
    with open(video_path, "rb") as f:
        video_base64 = base64.b64encode(f.read()).decode("utf-8")

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What's in this video?"},
                {
                    "type": "video_url",
                    "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
                },
            ],
        }
    ]

    # multi-turn to test mm processor cache as well
    for _ in range(2):
        chat_completion = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            max_tokens=16,
            extra_body={
                "mm_processor_kwargs": {
                    "use_audio_in_video": True,
                }
            },
        )

        assert len(chat_completion.choices) == 1
        choice = chat_completion.choices[0]
        assert choice.finish_reason == "length"


@pytest.mark.core_model
@pytest.mark.asyncio
async def test_online_audio_in_video_multi_videos(
    client: openai.AsyncOpenAI, video_assets: VideoTestAssets
):
    """Test multi-video input with `audio_in_video=True`"""

    # we don't use video_urls above because they missed audio stream.
    video_path = video_assets[0].video_path
    with open(video_path, "rb") as f:
        video_base64 = base64.b64encode(f.read()).decode("utf-8")

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What's in these two videos?"},
                {
                    "type": "video_url",
                    "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
                },
                {
                    "type": "video_url",
                    "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
                },
            ],
        }
    ]

    # multi-turn to test mm processor cache as well
    for _ in range(2):
        chat_completion = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            max_tokens=16,
            extra_body={
                "mm_processor_kwargs": {
                    "use_audio_in_video": True,
                }
            },
        )

        assert len(chat_completion.choices) == 1
        choice = chat_completion.choices[0]
        assert choice.finish_reason == "length"


@pytest.mark.core_model
@pytest.mark.asyncio
async def test_online_audio_in_video_interleaved(
    client: openai.AsyncOpenAI, video_assets: VideoTestAssets
):
    """Test interleaved video/audio input with `audio_in_video=True`"""

    # we don't use video_urls above because they missed audio stream.
    video_path = video_assets[0].video_path
    with open(video_path, "rb") as f:
        video_base64 = base64.b64encode(f.read()).decode("utf-8")

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What's in these two videos?"},
                {
                    "type": "video_url",
                    "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
                },
                {
                    "type": "audio_url",
                    "audio_url": {"url": f"data:audio/mp4;base64,{video_base64}"},
                },
                {
                    "type": "video_url",
                    "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
                },
            ],
        }
    ]
    with pytest.raises(
        openai.BadRequestError,
        match="use_audio_in_video requires equal number of audio and video items",
    ):
        await client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            max_tokens=16,
            extra_body={
                "mm_processor_kwargs": {
                    "use_audio_in_video": True,
                }
            },
        )