diff --git a/tests/entrypoints/openai/test_audio_in_video.py b/tests/entrypoints/openai/test_audio_in_video.py
index cf715b83a..334d9a71e 100644
--- a/tests/entrypoints/openai/test_audio_in_video.py
+++ b/tests/entrypoints/openai/test_audio_in_video.py
@@ -18,10 +18,10 @@ MODEL_NAME = "Qwen/Qwen2.5-Omni-3B"
 def server():
     args = [
         "--max-model-len",
-        "8192",
+        "16384",
         "--enforce-eager",
         "--limit-mm-per-prompt",
-        json.dumps({"audio": 1, "video": 1}),
+        json.dumps({"audio": 3, "video": 3}),
     ]
 
     with RemoteOpenAIServer(
@@ -78,3 +78,98 @@ async def test_online_audio_in_video(
         assert len(chat_completion.choices) == 1
         choice = chat_completion.choices[0]
         assert choice.finish_reason == "length"
+
+
+@pytest.mark.core_model
+@pytest.mark.asyncio
+async def test_online_audio_in_video_multi_videos(
+    client: openai.AsyncOpenAI, video_assets: VideoTestAssets
+):
+    """Test multi-video input with `audio_in_video=True`"""
+
+    # we don't use video_urls above because they missed audio stream.
+    video_path = video_assets[0].video_path
+    with open(video_path, "rb") as f:
+        video_base64 = base64.b64encode(f.read()).decode("utf-8")
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "What's in these two videos?"},
+                {
+                    "type": "video_url",
+                    "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
+                },
+                {
+                    "type": "video_url",
+                    "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
+                },
+            ],
+        }
+    ]
+
+    # multi-turn to test mm processor cache as well
+    for _ in range(2):
+        chat_completion = await client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=messages,
+            max_tokens=16,
+            extra_body={
+                "mm_processor_kwargs": {
+                    "use_audio_in_video": True,
+                }
+            },
+        )
+
+        assert len(chat_completion.choices) == 1
+        choice = chat_completion.choices[0]
+        assert choice.finish_reason == "length"
+
+
+@pytest.mark.core_model
+@pytest.mark.asyncio
+async def test_online_audio_in_video_interleaved(
+    client: openai.AsyncOpenAI, video_assets: VideoTestAssets
+):
+    """Test interleaved video/audio input with `audio_in_video=True`"""
+
+    # we don't use video_urls above because they missed audio stream.
+    video_path = video_assets[0].video_path
+    with open(video_path, "rb") as f:
+        video_base64 = base64.b64encode(f.read()).decode("utf-8")
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "What's in these two videos?"},
+                {
+                    "type": "video_url",
+                    "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
+                },
+                {
+                    "type": "audio_url",
+                    "audio_url": {"url": f"data:audio/mp4;base64,{video_base64}"},
+                },
+                {
+                    "type": "video_url",
+                    "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
+                },
+            ],
+        }
+    ]
+    with pytest.raises(
+        openai.BadRequestError,
+        match="use_audio_in_video requires equal number of audio and video items",
+    ):
+        await client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=messages,
+            max_tokens=16,
+            extra_body={
+                "mm_processor_kwargs": {
+                    "use_audio_in_video": True,
+                }
+            },
+        )
diff --git a/tests/models/multimodal/processing/test_audio_in_video.py b/tests/models/multimodal/processing/test_audio_in_video.py
index e248e4e3a..894b097ab 100644
--- a/tests/models/multimodal/processing/test_audio_in_video.py
+++ b/tests/models/multimodal/processing/test_audio_in_video.py
@@ -34,8 +34,22 @@ MODELS = [
 ]
 
 
+def create_mm_data(num_videos: int) -> dict[str, list]:
+    # Small video (8 frames, 64×64) and ~0.5 s of audio at 16 kHz so the test
+    # stays fast even without a GPU.
+    mm_data = dict[str, list](video=[], audio=[])
+    for i in range(num_videos):
+        rng = np.random.RandomState(i)
+        video = random_video(rng, min_frames=8, max_frames=9, min_wh=64, max_wh=65)
+        audio, sr = random_audio(rng, min_len=8000, max_len=8001, sr=16000)
+        mm_data["video"].append(video)
+        mm_data["audio"].append((audio, sr))
+    return mm_data
+
+
 @pytest.mark.parametrize("model_id", MODELS)
-def test_audio_in_video_cache_correctness(model_id: str) -> None:
+@pytest.mark.parametrize("num_videos", [1, 2])
+def test_audio_in_video_cache_correctness(model_id: str, num_videos: int) -> None:
     """
     Regression test for https://github.com/vllm-project/vllm/pull/36800
 
@@ -47,7 +61,7 @@ def test_audio_in_video_cache_correctness(model_id: str) -> None:
     """
     ctx = build_model_context(
         model_id,
-        limit_mm_per_prompt={"audio": 1, "image": 0, "video": 1},
+        limit_mm_per_prompt={"audio": num_videos, "image": 0, "video": num_videos},
         mm_processor_cache_gb=1,
     )
 
@@ -65,17 +79,12 @@ def test_audio_in_video_cache_correctness(model_id: str) -> None:
 
     video_token_id = baseline_processor.info.get_hf_config().video_token_id
 
-    rng = np.random.RandomState(0)
-    # Small video (8 frames, 64×64) and ~0.5 s of audio at 16 kHz so the test
-    # stays fast even without a GPU.
-    video = random_video(rng, min_frames=8, max_frames=9, min_wh=64, max_wh=65)
-    audio, sr = random_audio(rng, min_len=8000, max_len=8001, sr=16000)
-    mm_data = {"video": [video], "audio": [(audio, sr)]}
+    mm_data = create_mm_data(num_videos)
     hf_processor_mm_kwargs = {"use_audio_in_video": True}
 
     def run(processor):
         return processor(
-            [video_token_id],
+            [video_token_id] * num_videos,
             mm_items=baseline_processor.info.parse_mm_data(mm_data),
             hf_processor_mm_kwargs=hf_processor_mm_kwargs,
         )["prompt_token_ids"]
diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py
index 42829cf36..ff7dbb703 100644
--- a/vllm/model_executor/models/qwen2_5_omni_thinker.py
+++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py
@@ -774,9 +774,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
         def get_replacement_qwen2_use_audio_in_video(item_idx: int):
             nonlocal audio_in_video_item_idx
 
-            audio_num_features = audio_output_lengths[
-                audio_in_video_item_idx + item_idx
-            ]
+            audio_num_features = audio_output_lengths[audio_in_video_item_idx]
             video_grid_thw = out_mm_data["video_grid_thw"][item_idx]
 
             audio_in_video_item_idx += 1
diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
index 085243588..fc097ffdd 100755
--- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py
+++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py
@@ -1489,9 +1489,7 @@ class Qwen3OmniMoeThinkerMultiModalProcessor(
         def get_replacement_qwen2_use_audio_in_video(item_idx: int):
             nonlocal audio_in_video_item_idx
 
-            audio_num_features = audio_output_lengths[
-                audio_in_video_item_idx + item_idx
-            ]
+            audio_num_features = audio_output_lengths[audio_in_video_item_idx]
             video_grid_thw = out_mm_data["video_grid_thw"][item_idx]
 
             audio_in_video_item_idx += 1