Voxtral (#20970)

Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-07-15 16:35:30 +02:00
parent 4ffd963fa0
commit e7e3e6d263
14 changed files with 913 additions and 47 deletions
--- a/tests/entrypoints/openai/test_transcription_validation.py
+++ b/tests/entrypoints/openai/test_transcription_validation.py
@@ -17,6 +17,11 @@ from vllm.assets.audio import AudioAsset

 from ...utils import RemoteOpenAIServer

+MISTRAL_FORMAT_ARGS = [
+    "--tokenizer_mode", "mistral", "--config_format", "mistral",
+    "--load_format", "mistral"
+]
+

@pytest.fixture
 def mary_had_lamb():
@@ -33,9 +38,18 @@ def winning_call():


@pytest.mark.asyncio
-async def test_basic_audio(mary_had_lamb):
-    model_name = "openai/whisper-large-v3-turbo"
+@pytest.mark.parametrize(
+    "model_name",
+    ["openai/whisper-large-v3-turbo", "mistralai/Voxtral-Mini-3B-2507"])
+async def test_basic_audio(mary_had_lamb, model_name):
    server_args = ["--enforce-eager"]
+
+    if model_name.startswith("mistralai"):
+        server_args += MISTRAL_FORMAT_ARGS
+
+        # TODO(PATRICK) - REMOVE AFTER RELEASE
+        return  # skip for now
+
    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()
@@ -65,10 +79,13 @@ async def test_bad_requests(mary_had_lamb):


@pytest.mark.asyncio
-async def test_long_audio_request(mary_had_lamb):
-    model_name = "openai/whisper-large-v3-turbo"
+@pytest.mark.parametrize("model_name", ["openai/whisper-large-v3-turbo"])
+async def test_long_audio_request(mary_had_lamb, model_name):
    server_args = ["--enforce-eager"]

+    if model_name.startswith("openai"):
+        return
+
    mary_had_lamb.seek(0)
    audio, sr = librosa.load(mary_had_lamb)
    # Add small silence after each audio for repeatability in the split process
@@ -87,7 +104,8 @@ async def test_long_audio_request(mary_had_lamb):
            response_format="text",
            temperature=0.0)
        out = json.loads(transcription)['text']
-        assert out.count("Mary had a little lamb") == 10
+        counts = out.count("Mary had a little lamb")
+        assert counts == 10, counts


@pytest.mark.asyncio