[Refactor] Relocate endpoint tests to mirror serving code directory structure (#37504)

Signed-off-by: sfeng33 <4florafeng@gmail.com>
This commit is contained in:
Flora Feng
2026-03-19 03:19:36 -04:00
committed by GitHub
parent e3126cd107
commit b21d384304
17 changed files with 14 additions and 17 deletions

View File

@@ -0,0 +1,156 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import pytest
from tests.entrypoints.openai.conftest import add_attention_backend
from tests.utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer
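# CLI flags for loading Mistral-format checkpoints (native tokenizer, config, and weight format).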
MISTRAL_FORMAT_ARGS = [
"--tokenizer_mode",
"mistral",
"--config_format",
"mistral",
"--load_format",
"mistral",
]
async def transcribe_and_check(
client,
model_name: str,
file,
*,
language: str,
expected_text: str,
expected_seconds: int | None = None,
case_sensitive: bool = False,
):
"""Run a transcription request and assert the output contains
*expected_text* and optionally that usage reports *expected_seconds*.
Provides detailed failure messages with the actual transcription output.
"""
transcription = await client.audio.transcriptions.create(
model=model_name,
file=file,
language=language,
response_format="text",
temperature=0.0,
)
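    # Even with response_format="text", the endpoint returns a JSON body carrying "text" and "usage".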
out = json.loads(transcription)
out_text = out["text"]
out_usage = out["usage"]
if case_sensitive:
assert expected_text in out_text, (
f"Expected {expected_text!r} in transcription output, got: {out_text!r}"
)
else:
assert expected_text.lower() in out_text.lower(), (
f"Expected {expected_text!r} (case-insensitive) in transcription "
f"output, got: {out_text!r}"
)
if expected_seconds is not None:
assert out_usage["seconds"] == expected_seconds, (
f"Expected {expected_seconds}s of audio, "
f"got {out_usage['seconds']}s. Full usage: {out_usage!r}"
)
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name", ["mistralai/Voxtral-Mini-3B-2507", "Qwen/Qwen3-ASR-0.6B"]
)
async def test_basic_audio(mary_had_lamb, model_name, rocm_aiter_fa_attention):
server_args = ["--enforce-eager", *ROCM_EXTRA_ARGS]
if model_name.startswith("mistralai"):
server_args += MISTRAL_FORMAT_ARGS
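    # Pin the attention backend (e.g. ROCm AITER FlashAttention) when the fixture provides one.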
add_attention_backend(server_args, rocm_aiter_fa_attention)
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
with RemoteOpenAIServer(
model_name, server_args, env_dict=ROCM_ENV_OVERRIDES
) as remote_server:
client = remote_server.get_async_client()
await transcribe_and_check(
client,
model_name,
mary_had_lamb,
language="en",
expected_text="Mary had a little lamb",
expected_seconds=16,
)
@pytest.mark.asyncio
async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
"""Ensure STT (transcribe) requests can pass LoRA through to generate."""
# ROCm SPECIFIC CONFIGURATION:
# To ensure the test passes on ROCm, we modify the max model length to 512.
# We DO NOT apply this to other platforms to maintain strict upstream parity.
from vllm.platforms import current_platform
model_name = "ibm-granite/granite-speech-3.3-2b"
lora_model_name = "speech"
server_args = [
"--enforce-eager",
"--enable-lora",
"--max-lora-rank",
"64",
"--lora-modules",
f"{lora_model_name}={model_name}",
"--max-model-len",
"512" if current_platform.is_rocm() else "2048",
"--max-num-seqs",
"1",
]
add_attention_backend(server_args, rocm_aiter_fa_attention)
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
with RemoteOpenAIServer(
model_name, server_args, env_dict=ROCM_ENV_OVERRIDES
) as remote_server:
client = remote_server.get_async_client()
await transcribe_and_check(
client,
lora_model_name,
mary_had_lamb,
language="en",
expected_text="mary had a little lamb",
expected_seconds=16,
)
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name", ["google/gemma-3n-E2B-it", "Qwen/Qwen3-ASR-0.6B"]
)
async def test_basic_audio_foscolo(foscolo, rocm_aiter_fa_attention, model_name):
# Gemma accuracy on some of the audio samples we use is particularly bad,
# hence we use a different one here. WER is evaluated separately.
server_args = ["--enforce-eager", *ROCM_EXTRA_ARGS]
add_attention_backend(server_args, rocm_aiter_fa_attention)
with RemoteOpenAIServer(
model_name,
server_args,
max_wait_seconds=480,
env_dict=ROCM_ENV_OVERRIDES,
) as remote_server:
client = remote_server.get_async_client()
await transcribe_and_check(
client,
model_name,
foscolo,
language="it",
expected_text="ove il mio corpo fanciulletto giacque",
)

View File

@@ -0,0 +1,388 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import io
import json
import librosa
import numpy as np
import openai
import pytest
import pytest_asyncio
import soundfile as sf
from tests.utils import RemoteOpenAIServer
MODEL_NAME = "openai/whisper-large-v3-turbo"
@pytest.fixture(scope="module")
def server():
with RemoteOpenAIServer(MODEL_NAME, []) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def whisper_client(server):
async with server.get_async_client() as async_client:
yield async_client
@pytest.mark.asyncio
async def test_basic_audio(whisper_client, mary_had_lamb):
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
transcription = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
response_format="text",
temperature=0.0,
)
out = json.loads(transcription)
out_text = out["text"]
out_usage = out["usage"]
assert "Mary had a little lamb," in out_text
assert out_usage["seconds"] == 16, out_usage["seconds"]
@pytest.mark.asyncio
async def test_basic_audio_batched(mary_had_lamb, winning_call, whisper_client):
transcription = whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
response_format="text",
temperature=0.0,
)
transcription2 = whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=winning_call,
language="en",
response_format="text",
temperature=0.0,
)
# Await both transcriptions by scheduling coroutines together
transcription, transcription2 = await asyncio.gather(transcription, transcription2)
out = json.loads(transcription)
out_text = out["text"]
assert "Mary had a little lamb," in out_text
out2 = json.loads(transcription2)
out_text2 = out2["text"]
assert "Edgar Martinez" in out_text2
@pytest.mark.asyncio
async def test_bad_requests(mary_had_lamb, whisper_client):
# invalid language
with pytest.raises(openai.BadRequestError):
await whisper_client.audio.transcriptions.create(
model=MODEL_NAME, file=mary_had_lamb, language="hh", temperature=0.0
)
@pytest.mark.asyncio
async def test_long_audio_request(mary_had_lamb, whisper_client):
mary_had_lamb.seek(0)
audio, sr = librosa.load(mary_had_lamb)
    # Append a short silence after the audio so the long-audio split is reproducible
audio = np.pad(audio, (0, 1600))
repeated_audio = np.tile(audio, 10)
# Repeated audio to buffer
buffer = io.BytesIO()
sf.write(buffer, repeated_audio, sr, format="WAV")
buffer.seek(0)
transcription = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=buffer,
language="en",
response_format="text",
temperature=0.0,
)
out = json.loads(transcription)
out_text = out["text"]
out_usage = out["usage"]
counts = out_text.count("Mary had a little lamb")
assert counts == 10, counts
assert out_usage["seconds"] == 161, out_usage["seconds"]
@pytest.mark.asyncio
async def test_invalid_audio_file(whisper_client):
"""Corrupted audio should surface as HTTP 400."""
invalid_audio = io.BytesIO(b"not a valid audio file")
invalid_audio.name = "invalid.wav"
with pytest.raises(openai.BadRequestError) as exc_info:
await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=invalid_audio,
language="en",
)
assert exc_info.value.status_code == 400
assert "Invalid or unsupported audio file" in exc_info.value.message
@pytest.mark.asyncio
async def test_completion_endpoints(whisper_client):
# text to text model
with pytest.raises(openai.NotFoundError):
await whisper_client.chat.completions.create(
model=MODEL_NAME,
messages=[{"role": "system", "content": "You are a helpful assistant."}],
)
with pytest.raises(openai.NotFoundError):
await whisper_client.completions.create(model=MODEL_NAME, prompt="Hello")
@pytest.mark.asyncio
async def test_streaming_response(winning_call, whisper_client):
transcription = ""
res_no_stream = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=winning_call,
response_format="json",
language="en",
temperature=0.0,
)
res = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=winning_call,
language="en",
temperature=0.0,
stream=True,
timeout=30,
)
# Reconstruct from chunks and validate
async for chunk in res:
text = chunk.choices[0]["delta"]["content"]
transcription += text
assert transcription == res_no_stream.text
@pytest.mark.asyncio
async def test_stream_options(winning_call, whisper_client):
res = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=winning_call,
language="en",
temperature=0.0,
stream=True,
extra_body=dict(stream_include_usage=True, stream_continuous_usage_stats=True),
timeout=30,
)
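    # `final` flips when the terminal usage-only chunk (empty choices) arrives;
    # `continuous` stays True only if every content chunk also reports usage.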
final = False
continuous = True
async for chunk in res:
if not len(chunk.choices):
# final usage sent
final = True
else:
continuous = continuous and hasattr(chunk, "usage")
assert final and continuous
@pytest.mark.asyncio
async def test_sampling_params(mary_had_lamb, whisper_client):
"""
Compare sampling with params and greedy sampling to assert results
are different when extreme sampling parameters values are picked.
"""
transcription = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
temperature=0.8,
extra_body=dict(
seed=42,
repetition_penalty=1.9,
top_k=12,
top_p=0.4,
min_p=0.5,
frequency_penalty=1.8,
presence_penalty=2.0,
),
)
greedy_transcription = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
temperature=0.0,
extra_body=dict(seed=42),
)
assert greedy_transcription.text != transcription.text
@pytest.mark.asyncio
async def test_audio_prompt(mary_had_lamb, whisper_client):
prompt = "This is a speech, recorded in a phonograph."
    # The prompt should not cause the model to drop content that is present in the audio.
prefix = "The first words I spoke in the original phonograph"
transcription = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
response_format="text",
temperature=0.0,
)
out = json.loads(transcription)["text"]
assert prefix in out
transcription_wprompt = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
response_format="text",
prompt=prompt,
temperature=0.0,
)
out_prompt = json.loads(transcription_wprompt)["text"]
assert prefix in out_prompt
@pytest.mark.asyncio
async def test_audio_with_timestamp(mary_had_lamb, whisper_client):
transcription = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
response_format="verbose_json",
temperature=0.0,
)
assert transcription.segments is not None
assert len(transcription.segments) > 0
assert transcription.segments[0].avg_logprob is not None
assert transcription.segments[0].compression_ratio is not None
@pytest.mark.asyncio
async def test_audio_with_max_tokens(whisper_client, mary_had_lamb):
transcription = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
response_format="text",
temperature=0.0,
extra_body={"max_completion_tokens": 1},
)
out = json.loads(transcription)
out_text = out["text"]
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained(MODEL_NAME)
out_tokens = tok(out_text, add_special_tokens=False)["input_ids"]
assert len(out_tokens) == 1
# max_completion_tokens > max_model_len
transcription = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
response_format="text",
temperature=0.0,
extra_body={"max_completion_tokens": int(1e6)},
)
out = json.loads(transcription)
out_text = out["text"]
out_tokens = tok(out_text, add_special_tokens=False)["input_ids"]
assert len(out_tokens) < 450 # ~Whisper max output len
@pytest.mark.asyncio
@pytest.mark.parametrize(
("fixture_name", "expected_lang", "expected_text"),
[
("mary_had_lamb", "en", ["Mary had a little lamb"]),
("foscolo", "it", ["zacinto", "sacre"]),
],
ids=["english", "italian"],
)
async def test_language_auto_detect(
whisper_client, fixture_name, expected_lang, expected_text, request
):
"""Auto-detect language when no language param is provided."""
audio_file = request.getfixturevalue(fixture_name)
transcription = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=audio_file,
response_format="verbose_json",
temperature=0.0,
)
assert transcription.language == expected_lang
text_lower = transcription.text.lower()
assert any(word.lower() in text_lower for word in expected_text), (
f"Expected {expected_lang} text but got: {transcription.text}"
)
@pytest.mark.asyncio
async def test_whisper_beam_search_single_beam(mary_had_lamb, whisper_client):
"""Test beam search with encoder-decoder model (Whisper) on transcriptions with
one beam aligns with greedy decoding.
"""
beam_transcription = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
response_format="text",
temperature=0.0,
extra_body=dict(
use_beam_search=True,
n=1,
),
)
    greedy_transcription = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=mary_had_lamb,
        language="en",
        response_format="text",
        temperature=0.0,
    )
greedy_res = json.loads(greedy_transcription)["text"]
beam_res = json.loads(beam_transcription)["text"]
assert greedy_res == beam_res
@pytest.mark.asyncio
async def test_whisper_beam_search_multibeam(mary_had_lamb, whisper_client):
"""Test n>1 for beam search returns one transcription (best beam)."""
transcription = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
response_format="text",
temperature=0.0,
extra_body=dict(
use_beam_search=True,
n=2,
),
)
result = json.loads(transcription)
text = result["text"]
assert text is not None
assert len(text) > 0
assert "mary had a little lamb" in text.lower()
@pytest.mark.asyncio
async def test_stream_with_beams_raises(winning_call, whisper_client):
"""Test that stream=True + beam search raises bad request for now."""
with pytest.raises(openai.BadRequestError):
await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=winning_call,
language="en",
stream=True,
extra_body=dict(
use_beam_search=True,
n=2,
),
)

View File

@@ -0,0 +1,285 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import io
import json
import httpx
import librosa
import numpy as np
import openai
import pytest
import pytest_asyncio
import soundfile as sf
from tests.entrypoints.openai.conftest import add_attention_backend
from tests.utils import RemoteOpenAIServer
SERVER_ARGS = ["--enforce-eager"]
def _get_server_args(attention_config):
"""Get server args with attention backend if specified."""
args = SERVER_ARGS.copy()
add_attention_backend(args, attention_config)
return args
@pytest.fixture(
scope="module", params=["openai/whisper-small", "google/gemma-3n-E2B-it"]
)
def server(request, rocm_aiter_fa_attention):
# Parametrize over model name
with RemoteOpenAIServer(
request.param, _get_server_args(rocm_aiter_fa_attention)
) as remote_server:
yield remote_server, request.param
@pytest_asyncio.fixture
async def client_and_model(server):
server, model_name = server
async with server.get_async_client() as async_client:
yield async_client, model_name
@pytest.mark.asyncio
async def test_non_asr_model(foscolo, rocm_aiter_fa_attention):
# text to text model
model_name = "JackFram/llama-68m"
with RemoteOpenAIServer(
model_name, _get_server_args(rocm_aiter_fa_attention)
) as remote_server:
client = remote_server.get_async_client()
with pytest.raises(openai.NotFoundError):
await client.audio.translations.create(
model=model_name, file=foscolo, temperature=0.0
)
@pytest.mark.asyncio
async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
"""Ensure STT (translate) requests can pass LoRA through to generate."""
# ROCm SPECIFIC CONFIGURATION:
# To ensure the test passes on ROCm, we modify the max model length to 512.
# We DO NOT apply this to other platforms to maintain strict upstream parity.
from vllm.platforms import current_platform
    # NOTE - be careful to run this test before the module-scoped server
    # fixture, otherwise it will OOM-kill the CI
model_name = "ibm-granite/granite-speech-3.3-2b"
lora_model_name = "speech"
server_args = [
"--enforce-eager",
"--enable-lora",
"--max-lora-rank",
"64",
"--lora-modules",
f"{lora_model_name}={model_name}",
"--max-model-len",
"512" if current_platform.is_rocm() else "2048",
"--max-num-seqs",
"1",
]
add_attention_backend(server_args, rocm_aiter_fa_attention)
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client()
translation = await client.audio.translations.create(
model=lora_model_name,
file=mary_had_lamb,
extra_body=dict(language="en", to_language="es"),
response_format="text",
temperature=0.0,
)
out = json.loads(translation)["text"].strip().lower()
assert "pequeño" in out.split(" ")
# NOTE: (NickLucche) the large-v3-turbo model was not trained on translation!
@pytest.mark.asyncio
async def test_basic_audio(foscolo, client_and_model):
client, model_name = client_and_model
translation = await client.audio.translations.create(
model=model_name,
file=foscolo,
response_format="text",
# TODO remove `language="it"` once language detection is implemented
extra_body=dict(language="it", to_language="en"),
temperature=0.0,
)
out = json.loads(translation)["text"].strip().lower()
assert "greek sea" in out
@pytest.mark.asyncio
async def test_audio_prompt(foscolo, client_and_model):
client, model_name = client_and_model
# Condition whisper on starting text
prompt = "Nor have I ever"
transcription = await client.audio.translations.create(
model=model_name,
file=foscolo,
prompt=prompt,
extra_body=dict(language="it", to_language="en"),
response_format="text",
temperature=0.0,
)
out = json.loads(transcription)["text"]
assert "Nor will I ever touch the sacred" not in out
assert prompt not in out
@pytest.mark.asyncio
async def test_streaming_response(foscolo, client_and_model, server):
client, model_name = client_and_model
translation = ""
res_no_stream = await client.audio.translations.create(
model=model_name,
file=foscolo,
response_format="json",
extra_body=dict(language="it", to_language="en", seed=42),
temperature=0.0,
)
# Stream via HTTPX since OpenAI translation client doesn't expose streaming
server, model_name = server
url = server.url_for("v1/audio/translations")
headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
data = {
"model": model_name,
"language": "it",
"to_language": "en",
"stream": True,
"temperature": 0.0,
"seed": 42,
}
foscolo.seek(0)
async with httpx.AsyncClient() as http_client:
files = {"file": foscolo}
async with http_client.stream(
"POST", url, headers=headers, data=data, files=files
) as response:
async for line in response.aiter_lines():
if not line:
continue
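                # OpenAI-style SSE framing: payload lines carry a "data: " prefix
                # and a literal "[DONE]" sentinel terminates the stream.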
if line.startswith("data: "):
line = line[len("data: ") :]
if line.strip() == "[DONE]":
break
chunk = json.loads(line)
text = chunk["choices"][0].get("delta", {}).get("content")
translation += text or ""
res_stream = translation.split()
# NOTE There's a small non-deterministic issue here, likely in the attn
# computation, which will cause a few tokens to be different, while still
# being very close semantically.
assert (
sum([x == y for x, y in zip(res_stream, res_no_stream.text.split())])
>= len(res_stream) * 0.9
)
@pytest.mark.asyncio
async def test_stream_options(foscolo, server):
server, model_name = server
url = server.url_for("v1/audio/translations")
headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
data = {
"model": model_name,
"language": "it",
"to_language": "en",
"stream": True,
"stream_include_usage": True,
"stream_continuous_usage_stats": True,
"temperature": 0.0,
}
foscolo.seek(0)
final = False
continuous = True
async with httpx.AsyncClient() as http_client:
files = {"file": foscolo}
async with http_client.stream(
"POST", url, headers=headers, data=data, files=files
) as response:
async for line in response.aiter_lines():
if not line:
continue
if line.startswith("data: "):
line = line[len("data: ") :]
if line.strip() == "[DONE]":
break
chunk = json.loads(line)
choices = chunk.get("choices", [])
if not choices:
# final usage sent
final = True
else:
continuous = continuous and ("usage" in chunk)
assert final and continuous
@pytest.mark.asyncio
async def test_long_audio_request(foscolo, client_and_model):
client, model_name = client_and_model
if model_name == "google/gemma-3n-E2B-it":
pytest.skip("Gemma3n does not support long audio requests")
foscolo.seek(0)
audio, sr = librosa.load(foscolo)
repeated_audio = np.tile(audio, 2)
# Repeated audio to buffer
buffer = io.BytesIO()
sf.write(buffer, repeated_audio, sr, format="WAV")
buffer.seek(0)
translation = await client.audio.translations.create(
model=model_name,
file=buffer,
extra_body=dict(language="it", to_language="en"),
response_format="text",
temperature=0.0,
)
out = json.loads(translation)["text"].strip().lower()
assert out.count("greek sea") == 2
@pytest.mark.asyncio
async def test_audio_with_max_tokens(mary_had_lamb, client_and_model):
client, model_name = client_and_model
transcription = await client.audio.translations.create(
model=model_name,
file=mary_had_lamb,
response_format="text",
temperature=0.0,
extra_body={"max_completion_tokens": 1},
)
out = json.loads(transcription)
out_text = out["text"]
print(out_text)
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained(model_name)
out_tokens = tok(out_text, add_special_tokens=False)["input_ids"]
assert len(out_tokens) == 1
# max_completion_tokens > max_model_len
# max_model_len=32768 for Gemma-3n-E2B-it
    transcription = await client.audio.translations.create(
model=model_name,
file=mary_had_lamb,
response_format="text",
temperature=0.0,
extra_body={
"max_completion_tokens": int(1e6),
"repetition_penalty": 1.3,
},
)
out = json.loads(transcription)
out_text = out["text"]
print(out_text)
out_tokens = tok(out_text, add_special_tokens=False)["input_ids"]
assert len(out_tokens) < 450 # ~Whisper max output len