vllm/tests/entrypoints/openai/test_transcription_validation.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# imports for transcription API tests
import json

import pytest

from ...utils import RemoteOpenAIServer
from .conftest import add_attention_backend
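
# Voxtral checkpoints ship a Mistral-native tokenizer, config, and weight
# layout, so the server has to be told to load all three in "mistral" mode.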
MISTRAL_FORMAT_ARGS = [
    "--tokenizer_mode",
    "mistral",
    "--config_format",
    "mistral",
    "--load_format",
    "mistral",
]


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name", ["mistralai/Voxtral-Mini-3B-2507", "Qwen/Qwen3-ASR-0.6B"]
)
async def test_basic_audio(mary_had_lamb, model_name, rocm_aiter_fa_attention):
    server_args = ["--enforce-eager"]
    if model_name.startswith("mistralai"):
        server_args += MISTRAL_FORMAT_ARGS
    add_attention_backend(server_args, rocm_aiter_fa_attention)
    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()
        transcription = await client.audio.transcriptions.create(
            model=model_name,
            file=mary_had_lamb,
            language="en",
            response_format="text",
            temperature=0.0,
        )
        out = json.loads(transcription)
        out_text = out["text"]
        out_usage = out["usage"]
        assert "Mary had a little lamb" in out_text
        assert out_usage["seconds"] == 16, out_usage["seconds"]
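

# --lora-modules name=path serves a LoRA adapter under a model alias: the
# granite-speech checkpoint bundles its own speech adapter, so the repo itself
# is registered as the "speech" module and the request selects it via model=.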
@pytest.mark.asyncio
async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
    """Ensure STT (transcribe) requests can pass LoRA through to generate."""
    # ROCm-SPECIFIC CONFIGURATION:
    # To ensure the test passes on ROCm, we cap the max model length at 512.
    # We DO NOT apply this to other platforms to maintain strict upstream parity.
    from vllm.platforms import current_platform

    model_name = "ibm-granite/granite-speech-3.3-2b"
    lora_model_name = "speech"
    server_args = [
        "--enforce-eager",
        "--enable-lora",
        "--max-lora-rank",
        "64",
        "--lora-modules",
        f"{lora_model_name}={model_name}",
        "--max-model-len",
        "512" if current_platform.is_rocm() else "2048",
        "--max-num-seqs",
        "1",
    ]
    add_attention_backend(server_args, rocm_aiter_fa_attention)
    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        client = remote_server.get_async_client()
        transcription = await client.audio.transcriptions.create(
            model=lora_model_name,
            file=mary_had_lamb,
            language="en",
            response_format="text",
            temperature=0.0,
        )
        out = json.loads(transcription)
        out_text = out["text"]
        out_usage = out["usage"]
        assert "mary had a little lamb" in out_text
        assert out_usage["seconds"] == 16, out_usage["seconds"]
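

# RemoteOpenAIServer polls the spawned server until it is ready, for at most
# max_wait_seconds; the larger 480s budget below leaves headroom for slow
# model startup.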
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name", ["google/gemma-3n-E2B-it", "Qwen/Qwen3-ASR-0.6B"]
)
async def test_basic_audio_foscolo(foscolo, rocm_aiter_fa_attention, model_name):
    # Gemma accuracy on some of the audio samples we use is particularly bad,
    # hence we use a different one here. WER is evaluated separately.
    server_args = ["--enforce-eager"]
    add_attention_backend(server_args, rocm_aiter_fa_attention)
    with RemoteOpenAIServer(
        model_name, server_args, max_wait_seconds=480
    ) as remote_server:
        client = remote_server.get_async_client()
        transcription = await client.audio.transcriptions.create(
            model=model_name,
            file=foscolo,
            language="it",
            response_format="text",
            temperature=0.0,
        )
        out = json.loads(transcription)["text"]
        assert "ove il mio corpo fanciulletto giacque" in out