[Misc] Consolidate Audio tests into multimodal common generation tests (#18214)

Signed-off-by: Isotr0py <2037008807@qq.com>
Isotr0py authored 2025-05-16 17:18:08 +08:00, committed by GitHub
parent 541817670c, commit 390ec88905
9 changed files with 282 additions and 215 deletions


@@ -8,14 +8,14 @@ from collections import defaultdict
 from pathlib import PosixPath
 
 import pytest
-from transformers import (AutoModelForImageTextToText,
+from transformers import (AutoModel, AutoModelForImageTextToText,
                           AutoModelForTextToWaveform, AutoModelForVision2Seq)
 
 from vllm.platforms import current_platform
 from vllm.utils import identity
 
-from ....conftest import (IMAGE_ASSETS, HfRunner, ImageTestAssets,
-                          VideoTestAssets, VllmRunner)
+from ....conftest import (IMAGE_ASSETS, AudioTestAssets, HfRunner,
+                          ImageTestAssets, VideoTestAssets, VllmRunner)
 from ....utils import (create_new_process_for_each_test, large_gpu_mark,
                        multi_gpu_marks)
 from ...utils import check_outputs_equal
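
Note on the import change above: ultravox has no task-specific AutoModelFor* class in transformers, so the harness loads it through the generic AutoModel entry point (hence auto_cls=AutoModel in the new test entry below). A minimal sketch of the assumed loading path, using the model ID from this diff; the trust_remote_code flag is an assumption about the custom Hub architecture, not something this commit shows:

# Sketch only: load ultravox via the generic AutoModel entry point.
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "fixie-ai/ultravox-v0_5-llama-3_2-1b",
    trust_remote_code=True,  # assumed: custom Ultravox code lives on the Hub
)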
@@ -158,6 +158,17 @@ VLM_TEST_SETTINGS = {
         image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
         marks=[pytest.mark.core_model, pytest.mark.cpu_model],
     ),
+    "ultravox": VLMTestInfo(
+        models=["fixie-ai/ultravox-v0_5-llama-3_2-1b"],
+        test_type=VLMTestType.AUDIO,
+        prompt_formatter=lambda audio_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{audio_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
+        audio_idx_to_prompt=lambda idx: "<|audio|>",
+        max_model_len=4096,
+        max_num_seqs=2,
+        auto_cls=AutoModel,
+        hf_output_post_proc=model_utils.ultravox_trunc_hf_output,
+        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+    ),
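
How the two callables in the ultravox entry fit together is easiest to see by composing them by hand. A minimal sketch, assuming the harness prepends one placeholder per audio asset and then applies the chat template, mirroring the existing image/video builders (the helper that actually does this is not shown in the diff):

# The two callables, copied from the ultravox entry above.
audio_idx_to_prompt = lambda idx: "<|audio|>"
prompt_formatter = lambda audio_prompt: (
    "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
    f"{audio_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n")

# Assumed composition order: placeholder first, then the question, then the
# Llama 3.2 chat template wrapped around the whole thing.
question = "What is happening in this recording?"  # hypothetical question
prompt = prompt_formatter(audio_idx_to_prompt(0) + question)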
     #### Extended model tests
     "aria": VLMTestInfo(
         models=["rhymes-ai/Aria"],
@@ -393,7 +404,6 @@ VLM_TEST_SETTINGS = {
                 formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
             ),
             limit_mm_per_prompt={"video": 4},
-            runner_mm_key="videos",
         )],
     ),
     "llava_next_video": VLMTestInfo(
@@ -706,6 +716,7 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
 # - multi-image
 # - image embeddings
 # - video
+# - audio
 # - custom inputs
 @pytest.mark.parametrize(
     "model_type,test_case",
@@ -803,6 +814,28 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
     )
 
 
+@pytest.mark.parametrize(
+    "model_type,test_case",
+    get_parametrized_options(
+        VLM_TEST_SETTINGS,
+        test_type=VLMTestType.AUDIO,
+        create_new_process_for_each_test=False,
+    ))
+def test_audio_models(model_type: str, test_case: ExpandableVLMTestArgs,
+                      hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
+                      audio_assets: AudioTestAssets, monkeypatch):
+    if model_type in REQUIRES_V0_MODELS:
+        monkeypatch.setenv("VLLM_USE_V1", "0")
+
+    model_test_info = VLM_TEST_SETTINGS[model_type]
+    runners.run_audio_test(
+        model_test_info=model_test_info,
+        test_case=test_case,
+        hf_runner=hf_runner,
+        vllm_runner=vllm_runner,
+        audio_assets=audio_assets,
+    )
+
+
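
runners.run_audio_test itself is not part of this diff. A rough sketch of the comparison it presumably performs, modeled on the existing image/video runners; the generate_greedy signatures and the audios= keyword below are assumptions based on the surrounding test suite, not code this commit shows:

# Sketch: decode the same audio prompts with both backends, then compare.
with vllm_runner(model, max_model_len=4096, max_num_seqs=2) as vllm_model:
    vllm_outputs = vllm_model.generate_greedy(prompts, max_tokens,
                                              audios=audios)

with hf_runner(model, auto_cls=AutoModel) as hf_model:
    hf_outputs = hf_model.generate_greedy(prompts, max_tokens, audios=audios)

# hf_output_post_proc (ultravox_trunc_hf_output above) would normalize the
# HF side before this check.
check_outputs_equal(outputs_0_lst=hf_outputs, outputs_1_lst=vllm_outputs,
                    name_0="hf", name_1="vllm")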
 @pytest.mark.parametrize(
     "model_type,test_case",
     get_parametrized_options(
@@ -930,6 +963,29 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
     )
 
 
+@pytest.mark.parametrize(
+    "model_type,test_case",
+    get_parametrized_options(
+        VLM_TEST_SETTINGS,
+        test_type=VLMTestType.AUDIO,
+        create_new_process_for_each_test=True,
+    ))
+def test_audio_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
+                            hf_runner: type[HfRunner],
+                            vllm_runner: type[VllmRunner],
+                            audio_assets: AudioTestAssets, monkeypatch):
+    if model_type in REQUIRES_V0_MODELS:
+        monkeypatch.setenv("VLLM_USE_V1", "0")
+
+    model_test_info = VLM_TEST_SETTINGS[model_type]
+    runners.run_audio_test(
+        model_test_info=model_test_info,
+        test_case=test_case,
+        hf_runner=hf_runner,
+        vllm_runner=vllm_runner,
+        audio_assets=audio_assets,
+    )
+
+
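
The heavy variant is identical to test_audio_models except that create_new_process_for_each_test=True, so each parametrization runs in a fresh process and GPU memory is fully reclaimed between large models. A self-contained sketch of that isolation idea (not the actual tests/utils implementation):

import multiprocessing

def run_in_fresh_process(test_fn, *args):
    # "spawn" avoids inheriting initialized CUDA state from the parent.
    ctx = multiprocessing.get_context("spawn")
    proc = ctx.Process(target=test_fn, args=args)
    proc.start()
    proc.join()
    assert proc.exitcode == 0, "test case failed in its subprocess"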
 @pytest.mark.parametrize(
     "model_type,test_case",
     get_parametrized_options(