[Refactor] Relocate endpoint tests to mirror serving code directory structure (#37504)

Signed-off-by: sfeng33 <4florafeng@gmail.com>
This commit is contained in:
Flora Feng
2026-03-19 03:19:36 -04:00
committed by GitHub
parent e3126cd107
commit b21d384304
17 changed files with 14 additions and 17 deletions

View File

@@ -0,0 +1,156 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
import pytest
from tests.entrypoints.openai.conftest import add_attention_backend
from tests.utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer
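# CLI flags for loading Mistral-format checkpoints (native tokenizer, config, and weight format).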
MISTRAL_FORMAT_ARGS = [
"--tokenizer_mode",
"mistral",
"--config_format",
"mistral",
"--load_format",
"mistral",
]
async def transcribe_and_check(
client,
model_name: str,
file,
*,
language: str,
expected_text: str,
expected_seconds: int | None = None,
case_sensitive: bool = False,
):
"""Run a transcription request and assert the output contains
*expected_text* and optionally that usage reports *expected_seconds*.
Provides detailed failure messages with the actual transcription output.
"""
transcription = await client.audio.transcriptions.create(
model=model_name,
file=file,
language=language,
response_format="text",
temperature=0.0,
)
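    # Even with response_format="text", the endpoint returns a JSON body carrying "text" and "usage".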
out = json.loads(transcription)
out_text = out["text"]
out_usage = out["usage"]
if case_sensitive:
assert expected_text in out_text, (
f"Expected {expected_text!r} in transcription output, got: {out_text!r}"
)
else:
assert expected_text.lower() in out_text.lower(), (
f"Expected {expected_text!r} (case-insensitive) in transcription "
f"output, got: {out_text!r}"
)
if expected_seconds is not None:
assert out_usage["seconds"] == expected_seconds, (
f"Expected {expected_seconds}s of audio, "
f"got {out_usage['seconds']}s. Full usage: {out_usage!r}"
)
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name", ["mistralai/Voxtral-Mini-3B-2507", "Qwen/Qwen3-ASR-0.6B"]
)
async def test_basic_audio(mary_had_lamb, model_name, rocm_aiter_fa_attention):
server_args = ["--enforce-eager", *ROCM_EXTRA_ARGS]
if model_name.startswith("mistralai"):
server_args += MISTRAL_FORMAT_ARGS
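    # Pin the attention backend (e.g. ROCm AITER FlashAttention) when the fixture provides one.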
add_attention_backend(server_args, rocm_aiter_fa_attention)
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
with RemoteOpenAIServer(
model_name, server_args, env_dict=ROCM_ENV_OVERRIDES
) as remote_server:
client = remote_server.get_async_client()
await transcribe_and_check(
client,
model_name,
mary_had_lamb,
language="en",
expected_text="Mary had a little lamb",
expected_seconds=16,
)
@pytest.mark.asyncio
async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
"""Ensure STT (transcribe) requests can pass LoRA through to generate."""
# ROCm SPECIFIC CONFIGURATION:
# To ensure the test passes on ROCm, we modify the max model length to 512.
# We DO NOT apply this to other platforms to maintain strict upstream parity.
from vllm.platforms import current_platform
model_name = "ibm-granite/granite-speech-3.3-2b"
lora_model_name = "speech"
server_args = [
"--enforce-eager",
"--enable-lora",
"--max-lora-rank",
"64",
"--lora-modules",
f"{lora_model_name}={model_name}",
"--max-model-len",
"512" if current_platform.is_rocm() else "2048",
"--max-num-seqs",
"1",
]
add_attention_backend(server_args, rocm_aiter_fa_attention)
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
with RemoteOpenAIServer(
model_name, server_args, env_dict=ROCM_ENV_OVERRIDES
) as remote_server:
client = remote_server.get_async_client()
await transcribe_and_check(
client,
lora_model_name,
mary_had_lamb,
language="en",
expected_text="mary had a little lamb",
expected_seconds=16,
)
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name", ["google/gemma-3n-E2B-it", "Qwen/Qwen3-ASR-0.6B"]
)
async def test_basic_audio_foscolo(foscolo, rocm_aiter_fa_attention, model_name):
# Gemma accuracy on some of the audio samples we use is particularly bad,
# hence we use a different one here. WER is evaluated separately.
server_args = ["--enforce-eager", *ROCM_EXTRA_ARGS]
add_attention_backend(server_args, rocm_aiter_fa_attention)
with RemoteOpenAIServer(
model_name,
server_args,
max_wait_seconds=480,
env_dict=ROCM_ENV_OVERRIDES,
) as remote_server:
client = remote_server.get_async_client()
await transcribe_and_check(
client,
model_name,
foscolo,
language="it",
expected_text="ove il mio corpo fanciulletto giacque",
)

View File

@@ -0,0 +1,388 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import io
import json
import librosa
import numpy as np
import openai
import pytest
import pytest_asyncio
import soundfile as sf
from tests.utils import RemoteOpenAIServer
MODEL_NAME = "openai/whisper-large-v3-turbo"
@pytest.fixture(scope="module")
def server():
with RemoteOpenAIServer(MODEL_NAME, []) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def whisper_client(server):
async with server.get_async_client() as async_client:
yield async_client
@pytest.mark.asyncio
async def test_basic_audio(whisper_client, mary_had_lamb):
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
transcription = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
response_format="text",
temperature=0.0,
)
out = json.loads(transcription)
out_text = out["text"]
out_usage = out["usage"]
assert "Mary had a little lamb," in out_text
assert out_usage["seconds"] == 16, out_usage["seconds"]
@pytest.mark.asyncio
async def test_basic_audio_batched(mary_had_lamb, winning_call, whisper_client):
transcription = whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
response_format="text",
temperature=0.0,
)
transcription2 = whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=winning_call,
language="en",
response_format="text",
temperature=0.0,
)
# Await both transcriptions by scheduling coroutines together
transcription, transcription2 = await asyncio.gather(transcription, transcription2)
out = json.loads(transcription)
out_text = out["text"]
assert "Mary had a little lamb," in out_text
out2 = json.loads(transcription2)
out_text2 = out2["text"]
assert "Edgar Martinez" in out_text2
@pytest.mark.asyncio
async def test_bad_requests(mary_had_lamb, whisper_client):
# invalid language
with pytest.raises(openai.BadRequestError):
await whisper_client.audio.transcriptions.create(
model=MODEL_NAME, file=mary_had_lamb, language="hh", temperature=0.0
)
@pytest.mark.asyncio
async def test_long_audio_request(mary_had_lamb, whisper_client):
mary_had_lamb.seek(0)
audio, sr = librosa.load(mary_had_lamb)
    # Append a short silence after the audio so the long-audio split is reproducible
audio = np.pad(audio, (0, 1600))
repeated_audio = np.tile(audio, 10)
# Repeated audio to buffer
buffer = io.BytesIO()
sf.write(buffer, repeated_audio, sr, format="WAV")
buffer.seek(0)
transcription = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=buffer,
language="en",
response_format="text",
temperature=0.0,
)
out = json.loads(transcription)
out_text = out["text"]
out_usage = out["usage"]
counts = out_text.count("Mary had a little lamb")
assert counts == 10, counts
assert out_usage["seconds"] == 161, out_usage["seconds"]
@pytest.mark.asyncio
async def test_invalid_audio_file(whisper_client):
"""Corrupted audio should surface as HTTP 400."""
invalid_audio = io.BytesIO(b"not a valid audio file")
invalid_audio.name = "invalid.wav"
with pytest.raises(openai.BadRequestError) as exc_info:
await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=invalid_audio,
language="en",
)
assert exc_info.value.status_code == 400
assert "Invalid or unsupported audio file" in exc_info.value.message
@pytest.mark.asyncio
async def test_completion_endpoints(whisper_client):
# text to text model
with pytest.raises(openai.NotFoundError):
await whisper_client.chat.completions.create(
model=MODEL_NAME,
messages=[{"role": "system", "content": "You are a helpful assistant."}],
)
with pytest.raises(openai.NotFoundError):
await whisper_client.completions.create(model=MODEL_NAME, prompt="Hello")
@pytest.mark.asyncio
async def test_streaming_response(winning_call, whisper_client):
transcription = ""
res_no_stream = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=winning_call,
response_format="json",
language="en",
temperature=0.0,
)
res = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=winning_call,
language="en",
temperature=0.0,
stream=True,
timeout=30,
)
# Reconstruct from chunks and validate
async for chunk in res:
text = chunk.choices[0]["delta"]["content"]
transcription += text
assert transcription == res_no_stream.text
@pytest.mark.asyncio
async def test_stream_options(winning_call, whisper_client):
res = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=winning_call,
language="en",
temperature=0.0,
stream=True,
extra_body=dict(stream_include_usage=True, stream_continuous_usage_stats=True),
timeout=30,
)
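    # `final` flips when the terminal usage-only chunk (empty choices) arrives;
    # `continuous` stays True only if every content chunk also reports usage.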
final = False
continuous = True
async for chunk in res:
if not len(chunk.choices):
# final usage sent
final = True
else:
continuous = continuous and hasattr(chunk, "usage")
assert final and continuous
@pytest.mark.asyncio
async def test_sampling_params(mary_had_lamb, whisper_client):
"""
Compare sampling with params and greedy sampling to assert results
are different when extreme sampling parameters values are picked.
"""
transcription = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
temperature=0.8,
extra_body=dict(
seed=42,
repetition_penalty=1.9,
top_k=12,
top_p=0.4,
min_p=0.5,
frequency_penalty=1.8,
presence_penalty=2.0,
),
)
greedy_transcription = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
temperature=0.0,
extra_body=dict(seed=42),
)
assert greedy_transcription.text != transcription.text
@pytest.mark.asyncio
async def test_audio_prompt(mary_had_lamb, whisper_client):
prompt = "This is a speech, recorded in a phonograph."
    # The prompt should not cause the model to drop content that is present in the audio.
prefix = "The first words I spoke in the original phonograph"
transcription = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
response_format="text",
temperature=0.0,
)
out = json.loads(transcription)["text"]
assert prefix in out
transcription_wprompt = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
response_format="text",
prompt=prompt,
temperature=0.0,
)
out_prompt = json.loads(transcription_wprompt)["text"]
assert prefix in out_prompt
@pytest.mark.asyncio
async def test_audio_with_timestamp(mary_had_lamb, whisper_client):
transcription = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
response_format="verbose_json",
temperature=0.0,
)
assert transcription.segments is not None
assert len(transcription.segments) > 0
assert transcription.segments[0].avg_logprob is not None
assert transcription.segments[0].compression_ratio is not None
@pytest.mark.asyncio
async def test_audio_with_max_tokens(whisper_client, mary_had_lamb):
transcription = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
response_format="text",
temperature=0.0,
extra_body={"max_completion_tokens": 1},
)
out = json.loads(transcription)
out_text = out["text"]
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained(MODEL_NAME)
out_tokens = tok(out_text, add_special_tokens=False)["input_ids"]
assert len(out_tokens) == 1
# max_completion_tokens > max_model_len
transcription = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
response_format="text",
temperature=0.0,
extra_body={"max_completion_tokens": int(1e6)},
)
out = json.loads(transcription)
out_text = out["text"]
out_tokens = tok(out_text, add_special_tokens=False)["input_ids"]
assert len(out_tokens) < 450 # ~Whisper max output len
@pytest.mark.asyncio
@pytest.mark.parametrize(
("fixture_name", "expected_lang", "expected_text"),
[
("mary_had_lamb", "en", ["Mary had a little lamb"]),
("foscolo", "it", ["zacinto", "sacre"]),
],
ids=["english", "italian"],
)
async def test_language_auto_detect(
whisper_client, fixture_name, expected_lang, expected_text, request
):
"""Auto-detect language when no language param is provided."""
audio_file = request.getfixturevalue(fixture_name)
transcription = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=audio_file,
response_format="verbose_json",
temperature=0.0,
)
assert transcription.language == expected_lang
text_lower = transcription.text.lower()
assert any(word.lower() in text_lower for word in expected_text), (
f"Expected {expected_lang} text but got: {transcription.text}"
)
@pytest.mark.asyncio
async def test_whisper_beam_search_single_beam(mary_had_lamb, whisper_client):
"""Test beam search with encoder-decoder model (Whisper) on transcriptions with
one beam aligns with greedy decoding.
"""
beam_transcription = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
response_format="text",
temperature=0.0,
extra_body=dict(
use_beam_search=True,
n=1,
),
)
    greedy_transcription = await whisper_client.audio.transcriptions.create(
        model=MODEL_NAME,
        file=mary_had_lamb,
        language="en",
        response_format="text",
        temperature=0.0,
    )
greedy_res = json.loads(greedy_transcription)["text"]
beam_res = json.loads(beam_transcription)["text"]
assert greedy_res == beam_res
@pytest.mark.asyncio
async def test_whisper_beam_search_multibeam(mary_had_lamb, whisper_client):
"""Test n>1 for beam search returns one transcription (best beam)."""
transcription = await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=mary_had_lamb,
language="en",
response_format="text",
temperature=0.0,
extra_body=dict(
use_beam_search=True,
n=2,
),
)
result = json.loads(transcription)
text = result["text"]
assert text is not None
assert len(text) > 0
assert "mary had a little lamb" in text.lower()
@pytest.mark.asyncio
async def test_stream_with_beams_raises(winning_call, whisper_client):
"""Test that stream=True + beam search raises bad request for now."""
with pytest.raises(openai.BadRequestError):
await whisper_client.audio.transcriptions.create(
model=MODEL_NAME,
file=winning_call,
language="en",
stream=True,
extra_body=dict(
use_beam_search=True,
n=2,
),
)

View File

@@ -0,0 +1,285 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import io
import json
import httpx
import librosa
import numpy as np
import openai
import pytest
import pytest_asyncio
import soundfile as sf
from tests.entrypoints.openai.conftest import add_attention_backend
from tests.utils import RemoteOpenAIServer
SERVER_ARGS = ["--enforce-eager"]
def _get_server_args(attention_config):
"""Get server args with attention backend if specified."""
args = SERVER_ARGS.copy()
add_attention_backend(args, attention_config)
return args
@pytest.fixture(
scope="module", params=["openai/whisper-small", "google/gemma-3n-E2B-it"]
)
def server(request, rocm_aiter_fa_attention):
# Parametrize over model name
with RemoteOpenAIServer(
request.param, _get_server_args(rocm_aiter_fa_attention)
) as remote_server:
yield remote_server, request.param
@pytest_asyncio.fixture
async def client_and_model(server):
server, model_name = server
async with server.get_async_client() as async_client:
yield async_client, model_name
@pytest.mark.asyncio
async def test_non_asr_model(foscolo, rocm_aiter_fa_attention):
# text to text model
model_name = "JackFram/llama-68m"
with RemoteOpenAIServer(
model_name, _get_server_args(rocm_aiter_fa_attention)
) as remote_server:
client = remote_server.get_async_client()
with pytest.raises(openai.NotFoundError):
await client.audio.translations.create(
model=model_name, file=foscolo, temperature=0.0
)
@pytest.mark.asyncio
async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
"""Ensure STT (translate) requests can pass LoRA through to generate."""
# ROCm SPECIFIC CONFIGURATION:
# To ensure the test passes on ROCm, we modify the max model length to 512.
# We DO NOT apply this to other platforms to maintain strict upstream parity.
from vllm.platforms import current_platform
    # NOTE - be careful to run this test before the module-scoped server
    # fixture, otherwise it will OOM-kill the CI
model_name = "ibm-granite/granite-speech-3.3-2b"
lora_model_name = "speech"
server_args = [
"--enforce-eager",
"--enable-lora",
"--max-lora-rank",
"64",
"--lora-modules",
f"{lora_model_name}={model_name}",
"--max-model-len",
"512" if current_platform.is_rocm() else "2048",
"--max-num-seqs",
"1",
]
add_attention_backend(server_args, rocm_aiter_fa_attention)
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client()
translation = await client.audio.translations.create(
model=lora_model_name,
file=mary_had_lamb,
extra_body=dict(language="en", to_language="es"),
response_format="text",
temperature=0.0,
)
out = json.loads(translation)["text"].strip().lower()
assert "pequeño" in out.split(" ")
# NOTE: (NickLucche) the large-v3-turbo model was not trained on translation!
@pytest.mark.asyncio
async def test_basic_audio(foscolo, client_and_model):
client, model_name = client_and_model
translation = await client.audio.translations.create(
model=model_name,
file=foscolo,
response_format="text",
# TODO remove `language="it"` once language detection is implemented
extra_body=dict(language="it", to_language="en"),
temperature=0.0,
)
out = json.loads(translation)["text"].strip().lower()
assert "greek sea" in out
@pytest.mark.asyncio
async def test_audio_prompt(foscolo, client_and_model):
client, model_name = client_and_model
# Condition whisper on starting text
prompt = "Nor have I ever"
transcription = await client.audio.translations.create(
model=model_name,
file=foscolo,
prompt=prompt,
extra_body=dict(language="it", to_language="en"),
response_format="text",
temperature=0.0,
)
out = json.loads(transcription)["text"]
assert "Nor will I ever touch the sacred" not in out
assert prompt not in out
@pytest.mark.asyncio
async def test_streaming_response(foscolo, client_and_model, server):
client, model_name = client_and_model
translation = ""
res_no_stream = await client.audio.translations.create(
model=model_name,
file=foscolo,
response_format="json",
extra_body=dict(language="it", to_language="en", seed=42),
temperature=0.0,
)
# Stream via HTTPX since OpenAI translation client doesn't expose streaming
server, model_name = server
url = server.url_for("v1/audio/translations")
headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
data = {
"model": model_name,
"language": "it",
"to_language": "en",
"stream": True,
"temperature": 0.0,
"seed": 42,
}
foscolo.seek(0)
async with httpx.AsyncClient() as http_client:
files = {"file": foscolo}
async with http_client.stream(
"POST", url, headers=headers, data=data, files=files
) as response:
async for line in response.aiter_lines():
if not line:
continue
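                # OpenAI-style SSE framing: payload lines carry a "data: " prefix
                # and a literal "[DONE]" sentinel terminates the stream.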
if line.startswith("data: "):
line = line[len("data: ") :]
if line.strip() == "[DONE]":
break
chunk = json.loads(line)
text = chunk["choices"][0].get("delta", {}).get("content")
translation += text or ""
res_stream = translation.split()
# NOTE There's a small non-deterministic issue here, likely in the attn
# computation, which will cause a few tokens to be different, while still
# being very close semantically.
assert (
sum([x == y for x, y in zip(res_stream, res_no_stream.text.split())])
>= len(res_stream) * 0.9
)
@pytest.mark.asyncio
async def test_stream_options(foscolo, server):
server, model_name = server
url = server.url_for("v1/audio/translations")
headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
data = {
"model": model_name,
"language": "it",
"to_language": "en",
"stream": True,
"stream_include_usage": True,
"stream_continuous_usage_stats": True,
"temperature": 0.0,
}
foscolo.seek(0)
final = False
continuous = True
async with httpx.AsyncClient() as http_client:
files = {"file": foscolo}
async with http_client.stream(
"POST", url, headers=headers, data=data, files=files
) as response:
async for line in response.aiter_lines():
if not line:
continue
if line.startswith("data: "):
line = line[len("data: ") :]
if line.strip() == "[DONE]":
break
chunk = json.loads(line)
choices = chunk.get("choices", [])
if not choices:
# final usage sent
final = True
else:
continuous = continuous and ("usage" in chunk)
assert final and continuous
@pytest.mark.asyncio
async def test_long_audio_request(foscolo, client_and_model):
client, model_name = client_and_model
if model_name == "google/gemma-3n-E2B-it":
pytest.skip("Gemma3n does not support long audio requests")
foscolo.seek(0)
audio, sr = librosa.load(foscolo)
repeated_audio = np.tile(audio, 2)
# Repeated audio to buffer
buffer = io.BytesIO()
sf.write(buffer, repeated_audio, sr, format="WAV")
buffer.seek(0)
translation = await client.audio.translations.create(
model=model_name,
file=buffer,
extra_body=dict(language="it", to_language="en"),
response_format="text",
temperature=0.0,
)
out = json.loads(translation)["text"].strip().lower()
assert out.count("greek sea") == 2
@pytest.mark.asyncio
async def test_audio_with_max_tokens(mary_had_lamb, client_and_model):
client, model_name = client_and_model
transcription = await client.audio.translations.create(
model=model_name,
file=mary_had_lamb,
response_format="text",
temperature=0.0,
extra_body={"max_completion_tokens": 1},
)
out = json.loads(transcription)
out_text = out["text"]
print(out_text)
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained(model_name)
out_tokens = tok(out_text, add_special_tokens=False)["input_ids"]
assert len(out_tokens) == 1
# max_completion_tokens > max_model_len
# max_model_len=32768 for Gemma-3n-E2B-it
    transcription = await client.audio.translations.create(
model=model_name,
file=mary_had_lamb,
response_format="text",
temperature=0.0,
extra_body={
"max_completion_tokens": int(1e6),
"repetition_penalty": 1.3,
},
)
out = json.loads(transcription)
out_text = out["text"]
print(out_text)
out_tokens = tok(out_text, add_special_tokens=False)["input_ids"]
assert len(out_tokens) < 450 # ~Whisper max output len