diff --git a/tests/entrypoints/openai/test_transcription_validation.py b/tests/entrypoints/openai/test_transcription_validation.py
index cbab74145..58742f186 100644
--- a/tests/entrypoints/openai/test_transcription_validation.py
+++ b/tests/entrypoints/openai/test_transcription_validation.py
@@ -6,7 +6,7 @@ import json
 
 import pytest
 
-from ...utils import RemoteOpenAIServer
+from ...utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer
 from .conftest import add_attention_backend
 
 MISTRAL_FORMAT_ARGS = [
@@ -19,12 +19,55 @@ MISTRAL_FORMAT_ARGS = [
 ]
 
 
+async def transcribe_and_check(
+    client,
+    model_name: str,
+    file,
+    *,
+    language: str,
+    expected_text: str,
+    expected_seconds: int | None = None,
+    case_sensitive: bool = False,
+):
+    """Run a transcription request and assert the output contains
+    *expected_text* and optionally that usage reports *expected_seconds*.
+
+    Provides detailed failure messages with the actual transcription output.
+    """
+    transcription = await client.audio.transcriptions.create(
+        model=model_name,
+        file=file,
+        language=language,
+        response_format="text",
+        temperature=0.0,
+    )
+    out = json.loads(transcription)
+    out_text = out["text"]
+    out_usage = out["usage"]
+
+    if case_sensitive:
+        assert expected_text in out_text, (
+            f"Expected {expected_text!r} in transcription output, got: {out_text!r}"
+        )
+    else:
+        assert expected_text.lower() in out_text.lower(), (
+            f"Expected {expected_text!r} (case-insensitive) in transcription "
+            f"output, got: {out_text!r}"
+        )
+
+    if expected_seconds is not None:
+        assert out_usage["seconds"] == expected_seconds, (
+            f"Expected {expected_seconds}s of audio, "
+            f"got {out_usage['seconds']}s. Full usage: {out_usage!r}"
+        )
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     "model_name", ["mistralai/Voxtral-Mini-3B-2507", "Qwen/Qwen3-ASR-0.6B"]
 )
 async def test_basic_audio(mary_had_lamb, model_name, rocm_aiter_fa_attention):
-    server_args = ["--enforce-eager"]
+    server_args = ["--enforce-eager", *ROCM_EXTRA_ARGS]
 
     if model_name.startswith("mistralai"):
         server_args += MISTRAL_FORMAT_ARGS
@@ -32,20 +75,18 @@ async def test_basic_audio(mary_had_lamb, model_name, rocm_aiter_fa_attention):
     add_attention_backend(server_args, rocm_aiter_fa_attention)
 
     # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
-    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+    with RemoteOpenAIServer(
+        model_name, server_args, env_dict=ROCM_ENV_OVERRIDES
+    ) as remote_server:
         client = remote_server.get_async_client()
-        transcription = await client.audio.transcriptions.create(
-            model=model_name,
-            file=mary_had_lamb,
+        await transcribe_and_check(
+            client,
+            model_name,
+            mary_had_lamb,
             language="en",
-            response_format="text",
-            temperature=0.0,
+            expected_text="Mary had a little lamb",
+            expected_seconds=16,
         )
-        out = json.loads(transcription)
-        out_text = out["text"]
-        out_usage = out["usage"]
-        assert "Mary had a little lamb" in out_text
-        assert out_usage["seconds"] == 16, out_usage["seconds"]
 
 
 @pytest.mark.asyncio
@@ -74,20 +115,18 @@ async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
     add_attention_backend(server_args, rocm_aiter_fa_attention)
 
     # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
-    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+    with RemoteOpenAIServer(
+        model_name, server_args, env_dict=ROCM_ENV_OVERRIDES
+    ) as remote_server:
         client = remote_server.get_async_client()
-        transcription = await client.audio.transcriptions.create(
-            model=lora_model_name,
-            file=mary_had_lamb,
+        await transcribe_and_check(
+            client,
+            lora_model_name,
+            mary_had_lamb,
             language="en",
-            response_format="text",
-            temperature=0.0,
+            expected_text="mary had a little lamb",
+            expected_seconds=16,
         )
-        out = json.loads(transcription)
-        out_text = out["text"]
-        out_usage = out["usage"]
-        assert "mary had a little lamb" in out_text
-        assert out_usage["seconds"] == 16, out_usage["seconds"]
 
 
 @pytest.mark.asyncio
@@ -97,20 +136,21 @@ async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
 async def test_basic_audio_foscolo(foscolo, rocm_aiter_fa_attention, model_name):
     # Gemma accuracy on some of the audio samples we use is particularly bad,
     # hence we use a different one here. WER is evaluated separately.
-    server_args = ["--enforce-eager"]
+    server_args = ["--enforce-eager", *ROCM_EXTRA_ARGS]
     add_attention_backend(server_args, rocm_aiter_fa_attention)
 
     with RemoteOpenAIServer(
-        model_name, server_args, max_wait_seconds=480
+        model_name,
+        server_args,
+        max_wait_seconds=480,
+        env_dict=ROCM_ENV_OVERRIDES,
     ) as remote_server:
         client = remote_server.get_async_client()
-        transcription = await client.audio.transcriptions.create(
-            model=model_name,
-            file=foscolo,
+        await transcribe_and_check(
+            client,
+            model_name,
+            foscolo,
             language="it",
-            response_format="text",
-            temperature=0.0,
+            expected_text="ove il mio corpo fanciulletto giacque",
         )
-        out = json.loads(transcription)["text"]
-        assert "ove il mio corpo fanciulletto giacque" in out
 
diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py
index 6c5a08ae2..c0d8b0532 100644
--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -12,7 +12,7 @@ from vllm.multimodal.media import MediaWithBytes
 from vllm.multimodal.utils import encode_image_url, fetch_image
 from vllm.platforms import current_platform
 
-from ...utils import RemoteOpenAIServer
+from ...utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer
 
 MODEL_NAME = "microsoft/Phi-3.5-vision-instruct"
 MAXIMUM_IMAGES = 2
@@ -48,10 +48,37 @@ def check_output_matches_terms(content: str, term_groups: list[list[str]]) -> bo
     All term groups must be satisfied.
     """
     content_lower = content.lower()
-    for group in term_groups:
-        if not any(term.lower() in content_lower for term in group):
-            return False
-    return True
+    return all(
+        any(term.lower() in content_lower for term in group) for group in term_groups
+    )
+
+
+def assert_non_empty_content(chat_completion, *, context: str = "") -> str:
+    """Assert the first choice has non-empty string content; return it.
+
+    Provides a detailed failure message including the full ChatCompletion
+    response so flaky / model-quality issues are easy to diagnose.
+    """
+    prefix = f"[{context}] " if context else ""
+    choice = chat_completion.choices[0]
+    content = choice.message.content
+
+    assert content is not None, (
+        f"{prefix}Expected non-None content but got None. "
+        f"finish_reason={choice.finish_reason!r}, "
+        f"full message={choice.message!r}, "
+        f"usage={chat_completion.usage!r}"
+    )
+    assert isinstance(content, str), (
+        f"{prefix}Expected str content, got {type(content).__name__}: {content!r}"
+    )
+    assert len(content) > 0, (
+        f"{prefix}Expected non-empty content but got empty string. "
+        f"finish_reason={choice.finish_reason!r}, "
+        f"full message={choice.message!r}, "
+        f"usage={chat_completion.usage!r}"
+    )
+    return content
 
 
 @pytest.fixture(scope="module")
@@ -67,16 +94,22 @@ def server():
         "--trust-remote-code",
         "--limit-mm-per-prompt",
         json.dumps({"image": MAXIMUM_IMAGES}),
+        *ROCM_EXTRA_ARGS,
     ]
 
     # ROCm: Increase timeouts to handle potential network delays and slower
     # video processing when downloading multiple videos from external sources
-    env_overrides = {}
-    if current_platform.is_rocm():
-        env_overrides = {
-            "VLLM_VIDEO_FETCH_TIMEOUT": "120",
-            "VLLM_ENGINE_ITERATION_TIMEOUT_S": "300",
-        }
+    env_overrides = {
+        **ROCM_ENV_OVERRIDES,
+        **(
+            {
+                "VLLM_VIDEO_FETCH_TIMEOUT": "120",
+                "VLLM_ENGINE_ITERATION_TIMEOUT_S": "300",
+            }
+            if current_platform.is_rocm()
+            else {}
+        ),
+    }
 
     with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_overrides) as remote_server:
         yield remote_server
@@ -117,6 +150,51 @@ def dummy_messages_from_image_url(
     ]
 
+
+def describe_image_messages(
+    image_url: str, *, extra_image_fields: dict | None = None
+) -> list[dict]:
+    """Build the system + user messages used by the completions-with-image
+    family of tests. *extra_image_fields* is merged into the top-level
+    image content block (for uuid / bad-key tests)."""
+    image_block: dict = {
+        "type": "image_url",
+        "image_url": {"url": image_url},
+    }
+    if extra_image_fields:
+        image_block.update(extra_image_fields)
+
+    return [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Describe this image."},
+                image_block,
+            ],
+        },
+    ]
+
+
+async def complete_and_check(
+    client: openai.AsyncOpenAI,
+    model_name: str,
+    messages: list[dict],
+    *,
+    context: str,
+    max_completion_tokens: int = 50,
+    temperature: float = 0.0,
+) -> str:
+    """Run a chat completion and assert the output is non-empty.
+    Returns the content string."""
+    chat_completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_completion_tokens=max_completion_tokens,
+        temperature=temperature,
+    )
+    return assert_non_empty_content(chat_completion, context=context)
+
+
 def get_hf_prompt_tokens(model_name, content, image_url):
     processor = AutoProcessor.from_pretrained(
         model_name, trust_remote_code=True, num_crops=4
     )
@@ -153,7 +231,6 @@ async def test_single_chat_session_image(
     messages = dummy_messages_from_image_url(image_url, content_text)
     max_completion_tokens = 10
 
-    # test single completion
     chat_completion = await client.chat.completions.create(
         model=model_name,
         messages=messages,
@@ -162,32 +239,46 @@ async def test_single_chat_session_image(
         temperature=0.0,
         top_logprobs=5,
     )
-    assert len(chat_completion.choices) == 1
+    assert len(chat_completion.choices) == 1, (
+        f"Expected 1 choice, got {len(chat_completion.choices)}"
+    )
 
     choice = chat_completion.choices[0]
-    assert choice.finish_reason == "length"
+    assert choice.finish_reason == "length", (
+        f"Expected finish_reason='length' (capped at {max_completion_tokens} "
+        f"tokens), got {choice.finish_reason!r}. "
+        f"content={choice.message.content!r}"
+    )
+
     hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, image_url)
-    assert chat_completion.usage == openai.types.CompletionUsage(
+    expected_usage = openai.types.CompletionUsage(
         completion_tokens=max_completion_tokens,
         prompt_tokens=hf_prompt_tokens,
         total_tokens=hf_prompt_tokens + max_completion_tokens,
     )
+    assert chat_completion.usage == expected_usage, (
+        f"Usage mismatch: got {chat_completion.usage!r}, expected {expected_usage!r}"
+    )
 
     message = choice.message
-    message = chat_completion.choices[0].message
-    assert message.content is not None and len(message.content) >= 10
-    assert message.role == "assistant"
+    assert message.content is not None and len(message.content) >= 10, (
+        f"Expected content with >=10 chars, got {message.content!r}"
+    )
+    assert message.role == "assistant", (
+        f"Expected role='assistant', got {message.role!r}"
+    )
+
     messages.append({"role": "assistant", "content": message.content})
 
     # test multi-turn dialogue
     messages.append({"role": "user", "content": "express your result in json"})
-    chat_completion = await client.chat.completions.create(
-        model=model_name,
-        messages=messages,
+    await complete_and_check(
+        client,
+        model_name,
+        messages,
+        context=f"multi-turn follow-up for {image_url}",
        max_completion_tokens=10,
     )
-    message = chat_completion.choices[0].message
-    assert message.content is not None and len(message.content) >= 0
 
 
 @pytest.mark.asyncio
@@ -209,7 +300,7 @@ async def test_error_on_invalid_image_url_type(
 
     # image_url should be a dict {"url": "some url"}, not directly a string
     with pytest.raises(openai.BadRequestError):
-        _ = await client.chat.completions.create(
+        await client.chat.completions.create(
             model=model_name,
             messages=messages,
             max_completion_tokens=10,
@@ -235,10 +326,15 @@ async def test_single_chat_session_image_beamsearch(
         top_logprobs=5,
         extra_body=dict(use_beam_search=True),
     )
-    assert len(chat_completion.choices) == 2
-    assert (
-        chat_completion.choices[0].message.content
-        != chat_completion.choices[1].message.content
+    assert len(chat_completion.choices) == 2, (
+        f"Expected 2 beam search choices, got {len(chat_completion.choices)}"
+    )
+
+    content_0 = chat_completion.choices[0].message.content
+    content_1 = chat_completion.choices[1].message.content
+    assert content_0 != content_1, (
+        f"Beam search should produce different outputs for {image_url}, "
+        f"but both returned: {content_0!r}"
     )
 
 
@@ -269,33 +365,46 @@ async def test_single_chat_session_image_base64encoded(
         temperature=0.0,
         top_logprobs=5,
     )
-    assert len(chat_completion.choices) == 1
+    assert len(chat_completion.choices) == 1, (
+        f"Expected 1 choice, got {len(chat_completion.choices)}"
+    )
 
     choice = chat_completion.choices[0]
-    assert choice.finish_reason == "length"
+    assert choice.finish_reason == "length", (
+        f"Expected finish_reason='length', got {choice.finish_reason!r}. "
+        f"content={choice.message.content!r}"
+    )
+
     hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, image_url)
-    assert chat_completion.usage == openai.types.CompletionUsage(
+    expected_usage = openai.types.CompletionUsage(
         completion_tokens=max_completion_tokens,
         prompt_tokens=hf_prompt_tokens,
         total_tokens=hf_prompt_tokens + max_completion_tokens,
     )
+    assert chat_completion.usage == expected_usage, (
+        f"Usage mismatch: got {chat_completion.usage!r}, expected {expected_usage!r}"
+    )
 
     message = choice.message
-    message = chat_completion.choices[0].message
-    assert message.content is not None and len(message.content) >= 10
-    assert message.role == "assistant"
+    assert message.content is not None and len(message.content) >= 10, (
+        f"Expected content with >=10 chars, got {message.content!r}"
+    )
+    assert message.role == "assistant", (
+        f"Expected role='assistant', got {message.role!r}"
+    )
+
     messages.append({"role": "assistant", "content": message.content})
 
     # test multi-turn dialogue
     messages.append({"role": "user", "content": "express your result in json"})
-    chat_completion = await client.chat.completions.create(
-        model=model_name,
-        messages=messages,
+    await complete_and_check(
+        client,
+        model_name,
+        messages,
+        context=f"multi-turn base64 follow-up for {raw_image_url}",
         max_completion_tokens=10,
         temperature=0.0,
     )
-    message = chat_completion.choices[0].message
-    assert message.content is not None and len(message.content) >= 0
 
 
 @pytest.mark.asyncio
@@ -321,7 +430,10 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
         temperature=0.0,
         extra_body=dict(use_beam_search=True),
     )
-    assert len(chat_completion.choices) == 2
+    assert len(chat_completion.choices) == 2, (
+        f"Expected 2 beam search choices for image {image_idx} "
+        f"({raw_image_url}), got {len(chat_completion.choices)}"
+    )
 
     # Verify beam search produces two different non-empty outputs
     content_0 = chat_completion.choices[0].message.content
@@ -333,18 +445,28 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
         f"Output 0: {content_0!r}, Output 1: {content_1!r}"
     )
 
-    assert content_0, "First beam search output should not be empty"
-    assert content_1, "Second beam search output should not be empty"
-    assert content_0 != content_1, "Beam search should produce different outputs"
+    assert content_0, (
+        f"First beam output is empty for image {image_idx} ({raw_image_url}). "
+        f"finish_reason={chat_completion.choices[0].finish_reason!r}"
+    )
+    assert content_1, (
+        f"Second beam output is empty for image {image_idx} "
+        f"({raw_image_url}). "
+        f"finish_reason={chat_completion.choices[1].finish_reason!r}"
+    )
+    assert content_0 != content_1, (
+        f"Beam search produced identical outputs for image {image_idx} "
+        f"({raw_image_url}): {content_0!r}"
+    )
 
     # Verify each output contains the required terms for this image
     for i, content in enumerate([content_0, content_1]):
-        if not check_output_matches_terms(content, required_terms):
-            pytest.fail(
-                f"Output {i} '{content}' doesn't contain required terms. "
-                f"Expected all of these term groups (at least one from each): "
-                f"{required_terms}"
-            )
+        assert check_output_matches_terms(content, required_terms), (
+            f"Beam output {i} for image {image_idx} ({raw_image_url}) "
+            f"doesn't match required terms.\n"
+            f" content: {content!r}\n"
+            f" required (all groups, >=1 per group): {required_terms}"
+        )
 
 
 @pytest.mark.asyncio
@@ -378,16 +500,29 @@ async def test_chat_streaming_image(
     async for chunk in stream:
         delta = chunk.choices[0].delta
         if delta.role:
-            assert delta.role == "assistant"
+            assert delta.role == "assistant", (
+                f"Expected role='assistant' in stream delta, got {delta.role!r}"
+            )
         if delta.content:
             chunks.append(delta.content)
         if chunk.choices[0].finish_reason is not None:
             finish_reason_count += 1
     # finish reason should only return in last block
-    assert finish_reason_count == 1
-    assert chunk.choices[0].finish_reason == stop_reason
-    assert delta.content
-    assert "".join(chunks) == output
+    assert finish_reason_count == 1, (
+        f"Expected exactly 1 finish_reason across stream chunks, "
+        f"got {finish_reason_count}"
+    )
+    assert chunk.choices[0].finish_reason == stop_reason, (
+        f"Stream finish_reason={chunk.choices[0].finish_reason!r} "
+        f"doesn't match non-stream finish_reason={stop_reason!r}"
+    )
+
+    streamed_text = "".join(chunks)
+    assert streamed_text == output, (
+        f"Streamed output doesn't match non-streamed for {image_url}.\n"
+        f" streamed: {streamed_text!r}\n"
+        f" non-streamed: {output!r}"
+    )
 
 
 @pytest.mark.asyncio
@@ -418,17 +553,19 @@ async def test_multi_image_input(
             max_tokens=5,
             temperature=0.0,
         )
-        completion = completion.choices[0].text
-        assert completion is not None and len(completion) >= 0
+        assert completion.choices[0].text is not None, (
+            "Server failed to produce output after rejecting over-limit "
+            "multi-image request"
+        )
     else:
-        chat_completion = await client.chat.completions.create(
-            model=model_name,
-            messages=messages,
+        await complete_and_check(
+            client,
+            model_name,
+            messages,
+            context=f"multi-image input ({len(image_urls)} images)",
             max_completion_tokens=10,
             temperature=0.0,
         )
-        message = chat_completion.choices[0].message
-        assert message.content is not None and len(message.content) >= 0
 
 
 @pytest.mark.asyncio
@@ -444,30 +581,13 @@ async def test_completions_with_image(
     image_urls: list[str],
 ):
     for image_url in image_urls:
-        chat_completion = await client.chat.completions.create(
-            messages=[
-                {"role": "system", "content": "You are a helpful assistant."},
-                {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "text",
-                            "text": "Describe this image.",
-                        },
-                        {
-                            "type": "image_url",
-                            "image_url": {
-                                "url": image_url,
-                            },
-                        },
-                    ],
-                },
-            ],
-            model=model_name,
+        messages = describe_image_messages(image_url)
+        await complete_and_check(
+            client,
+            model_name,
+            messages,
+            context=f"completions_with_image url={image_url}",
         )
-        assert chat_completion.choices[0].message.content is not None
-        assert isinstance(chat_completion.choices[0].message.content, str)
-        assert len(chat_completion.choices[0].message.content) > 0
 
 
 @pytest.mark.asyncio
@@ -483,54 +603,33 @@ async def test_completions_with_image_with_uuid(
     image_urls: list[str],
 ):
     for image_url in image_urls:
-        chat_completion = await client.chat.completions.create(
-            messages=[
-                {"role": "system", "content": "You are a helpful assistant."},
-                {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "text",
-                            "text": "Describe this image.",
-                        },
-                        {
-                            "type": "image_url",
-                            "image_url": {
-                                "url": image_url,
-                            },
-                            "uuid": image_url,
-                        },
-                    ],
-                },
-            ],
-            model=model_name,
+        messages = describe_image_messages(
+            image_url,
+            extra_image_fields={"uuid": image_url},
+        )
+        await complete_and_check(
+            client,
+            model_name,
+            messages,
+            context=f"uuid first request url={image_url}",
         )
-        assert chat_completion.choices[0].message.content is not None
-        assert isinstance(chat_completion.choices[0].message.content, str)
-        assert len(chat_completion.choices[0].message.content) > 0
 
-        # Second request, with empty image but the same uuid.
-        chat_completion_with_empty_image = await client.chat.completions.create(
-            messages=[
-                {"role": "system", "content": "You are a helpful assistant."},
-                {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "text",
-                            "text": "Describe this image.",
-                        },
-                        {"type": "image_url", "image_url": {}, "uuid": image_url},
-                    ],
-                },
-            ],
-            model=model_name,
+        cached_messages: list[dict] = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "Describe this image."},
+                    {"type": "image_url", "image_url": {}, "uuid": image_url},
+                ],
+            },
+        ]
+        await complete_and_check(
+            client,
+            model_name,
+            cached_messages,
+            context=f"uuid cached (empty image) uuid={image_url}",
        )
-        assert chat_completion_with_empty_image.choices[0].message.content is not None
-        assert isinstance(
-            chat_completion_with_empty_image.choices[0].message.content, str
-        )
-        assert len(chat_completion_with_empty_image.choices[0].message.content) > 0
 
 
 @pytest.mark.asyncio
@@ -540,16 +639,13 @@ async def test_completions_with_empty_image_with_uuid_without_cache_hit(
     model_name: str,
 ):
     with pytest.raises(openai.BadRequestError):
-        _ = await client.chat.completions.create(
+        await client.chat.completions.create(
             messages=[
                 {"role": "system", "content": "You are a helpful assistant."},
                 {
                     "role": "user",
                     "content": [
-                        {
-                            "type": "text",
-                            "text": "Describe this image.",
-                        },
+                        {"type": "text", "text": "Describe this image."},
                         {
                             "type": "image_url",
                             "image_url": {},
@@ -575,29 +671,18 @@ async def test_completions_with_image_with_incorrect_uuid_format(
     image_urls: list[str],
 ):
     for image_url in image_urls:
-        chat_completion = await client.chat.completions.create(
-            messages=[
-                {"role": "system", "content": "You are a helpful assistant."},
-                {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "text",
-                            "text": "Describe this image.",
-                        },
-                        {
-                            "type": "image_url",
-                            "image_url": {
-                                "url": image_url,
-                                "incorrect_uuid_key": image_url,
-                            },
-                            "also_incorrect_uuid_key": image_url,
-                        },
-                    ],
-                },
-            ],
-            model=model_name,
+        messages = describe_image_messages(
+            image_url,
+            extra_image_fields={
+                "also_incorrect_uuid_key": image_url,
+            },
+        )
+        # Inject the bad key inside image_url dict too
+        messages[1]["content"][1]["image_url"]["incorrect_uuid_key"] = image_url
+
+        await complete_and_check(
+            client,
+            model_name,
+            messages,
+            context=f"incorrect uuid format url={image_url}",
        )
-        assert chat_completion.choices[0].message.content is not None
-        assert isinstance(chat_completion.choices[0].message.content, str)
-        assert len(chat_completion.choices[0].message.content) > 0
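
Reviewer note, not part of the patch above: a minimal sketch of how the new shared helpers are intended to be used by future vision tests. Only describe_image_messages, complete_and_check and check_output_matches_terms come from this diff; the test name, the image URL and the term list are placeholders, and the client / model_name fixtures are assumed to be the ones the existing tests already use.

    # Hypothetical example only -- not added by this patch.
    @pytest.mark.asyncio
    @pytest.mark.parametrize("model_name", [MODEL_NAME])
    async def test_describe_image_sketch(client, model_name: str):
        image_url = "https://example.com/sample.jpg"  # placeholder URL
        # Build the standard system + user "Describe this image." messages.
        messages = describe_image_messages(image_url)
        # complete_and_check() runs the request and fails with the full
        # response attached if the model returns empty content.
        content = await complete_and_check(
            client,
            model_name,
            messages,
            context=f"sketch url={image_url}",
        )
        # Placeholder term groups; at least one term per group must appear.
        assert check_output_matches_terms(content, [["image", "photo", "picture"]])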