# SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json import openai import pytest import pytest_asyncio from transformers import AutoProcessor from tests.utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer from vllm.multimodal.media import MediaWithBytes from vllm.multimodal.utils import encode_image_url, fetch_image from vllm.platforms import current_platform MODEL_NAME = "microsoft/Phi-3.5-vision-instruct" MAXIMUM_IMAGES = 2 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) TEST_IMAGE_ASSETS = [ "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" "Grayscale_8bits_palette_sample_image.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/Grayscale_8bits_palette_sample_image.png", "1280px-Venn_diagram_rgb.svg.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/1280px-Venn_diagram_rgb.svg.png", "RGBA_comp.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/RGBA_comp.png", ] # Required terms for beam search validation # Each entry is a list of term groups - ALL groups must match # Each group is a list of alternatives - at least ONE term in the group must appear # This provides semantic validation while allowing wording variation REQUIRED_BEAM_SEARCH_TERMS = [ # Boardwalk image: must have "boardwalk" AND ("wooden" or "wood") [["boardwalk"], ["wooden", "wood"]], # Parrots image: must have ("parrot" or "bird") AND "two" [["parrot", "bird"], ["two"]], # Venn diagram: must have "venn" AND "diagram" [["venn"], ["diagram"]], # Gradient image: must have "gradient" AND ("color" or "spectrum") [["gradient"], ["color", "spectrum"]], ] def check_output_matches_terms(content: str, term_groups: list[list[str]]) -> bool: """ Check if content matches all required term groups. Each term group requires at least one of its terms to be present. All term groups must be satisfied. """ content_lower = content.lower() return all( any(term.lower() in content_lower for term in group) for group in term_groups ) def assert_non_empty_content(chat_completion, *, context: str = "") -> str: """Assert the first choice has non-empty string content; return it. Provides a detailed failure message including the full ChatCompletion response so flaky / model-quality issues are easy to diagnose. """ prefix = f"[{context}] " if context else "" choice = chat_completion.choices[0] content = choice.message.content assert content is not None, ( f"{prefix}Expected non-None content but got None. " f"finish_reason={choice.finish_reason!r}, " f"full message={choice.message!r}, " f"usage={chat_completion.usage!r}" ) assert isinstance(content, str), ( f"{prefix}Expected str content, got {type(content).__name__}: {content!r}" ) assert len(content) > 0, ( f"{prefix}Expected non-empty content but got empty string. " f"finish_reason={choice.finish_reason!r}, " f"full message={choice.message!r}, " f"usage={chat_completion.usage!r}" ) return content @pytest.fixture(scope="module") def server(): args = [ "--runner", "generate", "--max-model-len", "2048", "--max-num-seqs", "5", "--enforce-eager", "--trust-remote-code", "--limit-mm-per-prompt", json.dumps({"image": MAXIMUM_IMAGES}), *ROCM_EXTRA_ARGS, ] # ROCm: Increase timeouts to handle potential network delays and slower # video processing when downloading multiple videos from external sources env_overrides = { **ROCM_ENV_OVERRIDES, **( { "VLLM_VIDEO_FETCH_TIMEOUT": "120", "VLLM_ENGINE_ITERATION_TIMEOUT_S": "300", } if current_platform.is_rocm() else {} ), } with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_overrides) as remote_server: yield remote_server @pytest_asyncio.fixture async def client(server): async with server.get_async_client() as async_client: yield async_client @pytest.fixture(scope="session") def url_encoded_image(local_asset_server) -> dict[str, str]: return { image_asset: encode_image_url(local_asset_server.get_image_asset(image_asset)) for image_asset in TEST_IMAGE_ASSETS } def dummy_messages_from_image_url( image_urls: str | list[str], content_text: str = "What's in this image?", ): if isinstance(image_urls, str): image_urls = [image_urls] return [ { "role": "user", "content": [ *( {"type": "image_url", "image_url": {"url": image_url}} for image_url in image_urls ), {"type": "text", "text": content_text}, ], } ] def describe_image_messages( image_url: str, *, extra_image_fields: dict | None = None ) -> list[dict]: """Build the system + user messages used by the completions-with-image family of tests. *extra_image_fields* is merged into the top-level image content block (for uuid / bad-key tests).""" image_block: dict = { "type": "image_url", "image_url": {"url": image_url}, } if extra_image_fields: image_block.update(extra_image_fields) return [ {"role": "system", "content": "You are a helpful assistant."}, { "role": "user", "content": [ {"type": "text", "text": "Describe this image."}, image_block, ], }, ] async def complete_and_check( client: openai.AsyncOpenAI, model_name: str, messages: list[dict], *, context: str, max_completion_tokens: int = 50, temperature: float = 0.0, ) -> str: """Run a chat completion and assert the output is non-empty. Returns the content string.""" chat_completion = await client.chat.completions.create( model=model_name, messages=messages, max_completion_tokens=max_completion_tokens, temperature=temperature, ) return assert_non_empty_content(chat_completion, context=context) def get_hf_prompt_tokens(model_name, content, image_url): processor = AutoProcessor.from_pretrained( model_name, trust_remote_code=True, num_crops=4 ) placeholder = "<|image_1|>\n" messages = [ { "role": "user", "content": f"{placeholder}{content}", } ] image = fetch_image(image_url) # Unwrap MediaWithBytes if present if isinstance(image, MediaWithBytes): image = image.media images = [image] prompt = processor.tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) inputs = processor(prompt, images, return_tensors="pt") return inputs.input_ids.shape[1] @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) async def test_single_chat_session_image( client: openai.AsyncOpenAI, model_name: str, image_url: str ): content_text = "What's in this image?" messages = dummy_messages_from_image_url(image_url, content_text) max_completion_tokens = 10 chat_completion = await client.chat.completions.create( model=model_name, messages=messages, max_completion_tokens=max_completion_tokens, logprobs=True, temperature=0.0, top_logprobs=5, ) assert len(chat_completion.choices) == 1, ( f"Expected 1 choice, got {len(chat_completion.choices)}" ) choice = chat_completion.choices[0] assert choice.finish_reason == "length", ( f"Expected finish_reason='length' (capped at {max_completion_tokens} " f"tokens), got {choice.finish_reason!r}. " f"content={choice.message.content!r}" ) hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, image_url) expected_usage = openai.types.CompletionUsage( completion_tokens=max_completion_tokens, prompt_tokens=hf_prompt_tokens, total_tokens=hf_prompt_tokens + max_completion_tokens, ) assert chat_completion.usage == expected_usage, ( f"Usage mismatch: got {chat_completion.usage!r}, expected {expected_usage!r}" ) message = choice.message assert message.content is not None and len(message.content) >= 10, ( f"Expected content with >=10 chars, got {message.content!r}" ) assert message.role == "assistant", ( f"Expected role='assistant', got {message.role!r}" ) messages.append({"role": "assistant", "content": message.content}) # test multi-turn dialogue messages.append({"role": "user", "content": "express your result in json"}) await complete_and_check( client, model_name, messages, context=f"multi-turn follow-up for {image_url}", max_completion_tokens=10, ) @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) async def test_error_on_invalid_image_url_type( client: openai.AsyncOpenAI, model_name: str, image_url: str ): content_text = "What's in this image?" messages = [ { "role": "user", "content": [ {"type": "image_url", "image_url": image_url}, {"type": "text", "text": content_text}, ], } ] # image_url should be a dict {"url": "some url"}, not directly a string with pytest.raises(openai.BadRequestError): await client.chat.completions.create( model=model_name, messages=messages, max_completion_tokens=10, temperature=0.0, ) @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) async def test_single_chat_session_image_beamsearch( client: openai.AsyncOpenAI, model_name: str, image_url: str ): content_text = "What's in this image?" messages = dummy_messages_from_image_url(image_url, content_text) chat_completion = await client.chat.completions.create( model=model_name, messages=messages, n=2, max_completion_tokens=10, logprobs=True, top_logprobs=5, extra_body=dict(use_beam_search=True), ) assert len(chat_completion.choices) == 2, ( f"Expected 2 beam search choices, got {len(chat_completion.choices)}" ) content_0 = chat_completion.choices[0].message.content content_1 = chat_completion.choices[1].message.content assert content_0 != content_1, ( f"Beam search should produce different outputs for {image_url}, " f"but both returned: {content_0!r}" ) @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS) @pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) async def test_single_chat_session_image_base64encoded( client: openai.AsyncOpenAI, model_name: str, raw_image_url: str, image_url: str, url_encoded_image: dict[str, str], ): content_text = "What's in this image?" messages = dummy_messages_from_image_url( url_encoded_image[raw_image_url], content_text, ) max_completion_tokens = 10 # test single completion chat_completion = await client.chat.completions.create( model=model_name, messages=messages, max_completion_tokens=max_completion_tokens, logprobs=True, temperature=0.0, top_logprobs=5, ) assert len(chat_completion.choices) == 1, ( f"Expected 1 choice, got {len(chat_completion.choices)}" ) choice = chat_completion.choices[0] assert choice.finish_reason == "length", ( f"Expected finish_reason='length', got {choice.finish_reason!r}. " f"content={choice.message.content!r}" ) hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, image_url) expected_usage = openai.types.CompletionUsage( completion_tokens=max_completion_tokens, prompt_tokens=hf_prompt_tokens, total_tokens=hf_prompt_tokens + max_completion_tokens, ) assert chat_completion.usage == expected_usage, ( f"Usage mismatch: got {chat_completion.usage!r}, expected {expected_usage!r}" ) message = choice.message assert message.content is not None and len(message.content) >= 10, ( f"Expected content with >=10 chars, got {message.content!r}" ) assert message.role == "assistant", ( f"Expected role='assistant', got {message.role!r}" ) messages.append({"role": "assistant", "content": message.content}) # test multi-turn dialogue messages.append({"role": "user", "content": "express your result in json"}) await complete_and_check( client, model_name, messages, context=f"multi-turn base64 follow-up for {raw_image_url}", max_completion_tokens=10, temperature=0.0, ) @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_idx", list(range(len(TEST_IMAGE_ASSETS)))) async def test_single_chat_session_image_base64encoded_beamsearch( client: openai.AsyncOpenAI, model_name: str, image_idx: int, url_encoded_image: dict[str, str], ): # NOTE: This test validates that we pass MM data through beam search raw_image_url = TEST_IMAGE_ASSETS[image_idx] required_terms = REQUIRED_BEAM_SEARCH_TERMS[image_idx] messages = dummy_messages_from_image_url(url_encoded_image[raw_image_url]) chat_completion = await client.chat.completions.create( model=model_name, messages=messages, n=2, max_completion_tokens=10, temperature=0.0, extra_body=dict(use_beam_search=True), ) assert len(chat_completion.choices) == 2, ( f"Expected 2 beam search choices for image {image_idx} " f"({raw_image_url}), got {len(chat_completion.choices)}" ) # Verify beam search produces two different non-empty outputs content_0 = chat_completion.choices[0].message.content content_1 = chat_completion.choices[1].message.content # Emit beam search outputs for debugging print( f"Beam search outputs for image {image_idx} ({raw_image_url}): " f"Output 0: {content_0!r}, Output 1: {content_1!r}" ) assert content_0, ( f"First beam output is empty for image {image_idx} ({raw_image_url}). " f"finish_reason={chat_completion.choices[0].finish_reason!r}" ) assert content_1, ( f"Second beam output is empty for image {image_idx} " f"({raw_image_url}). " f"finish_reason={chat_completion.choices[1].finish_reason!r}" ) assert content_0 != content_1, ( f"Beam search produced identical outputs for image {image_idx} " f"({raw_image_url}): {content_0!r}" ) # Verify each output contains the required terms for this image for i, content in enumerate([content_0, content_1]): assert check_output_matches_terms(content, required_terms), ( f"Beam output {i} for image {image_idx} ({raw_image_url}) " f"doesn't match required terms.\n" f" content: {content!r}\n" f" required (all groups, >=1 per group): {required_terms}" ) @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) async def test_chat_streaming_image( client: openai.AsyncOpenAI, model_name: str, image_url: str ): messages = dummy_messages_from_image_url(image_url) # test single completion chat_completion = await client.chat.completions.create( model=model_name, messages=messages, max_completion_tokens=10, temperature=0.0, ) output = chat_completion.choices[0].message.content stop_reason = chat_completion.choices[0].finish_reason # test streaming stream = await client.chat.completions.create( model=model_name, messages=messages, max_completion_tokens=10, temperature=0.0, stream=True, ) chunks: list[str] = [] finish_reason_count = 0 async for chunk in stream: delta = chunk.choices[0].delta if delta.role: assert delta.role == "assistant", ( f"Expected role='assistant' in stream delta, got {delta.role!r}" ) if delta.content: chunks.append(delta.content) if chunk.choices[0].finish_reason is not None: finish_reason_count += 1 # finish reason should only return in last block assert finish_reason_count == 1, ( f"Expected exactly 1 finish_reason across stream chunks, " f"got {finish_reason_count}" ) assert chunk.choices[0].finish_reason == stop_reason, ( f"Stream finish_reason={chunk.choices[0].finish_reason!r} " f"doesn't match non-stream finish_reason={stop_reason!r}" ) streamed_text = "".join(chunks) assert streamed_text == output, ( f"Streamed output doesn't match non-streamed for {image_url}.\n" f" streamed: {streamed_text!r}\n" f" non-streamed: {output!r}" ) @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize( "image_urls", [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))], indirect=True, ) async def test_multi_image_input( client: openai.AsyncOpenAI, model_name: str, image_urls: list[str] ): messages = dummy_messages_from_image_url(image_urls) if len(image_urls) > MAXIMUM_IMAGES: with pytest.raises(openai.BadRequestError): # test multi-image input await client.chat.completions.create( model=model_name, messages=messages, max_completion_tokens=10, temperature=0.0, ) # the server should still work afterwards completion = await client.completions.create( model=model_name, prompt=[0, 0, 0, 0, 0], max_tokens=5, temperature=0.0, ) assert completion.choices[0].text is not None, ( "Server failed to produce output after rejecting over-limit " "multi-image request" ) else: await complete_and_check( client, model_name, messages, context=f"multi-image input ({len(image_urls)} images)", max_completion_tokens=10, temperature=0.0, ) @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize( "image_urls", [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))], indirect=True, ) async def test_completions_with_image( client: openai.AsyncOpenAI, model_name: str, image_urls: list[str], ): for image_url in image_urls: messages = describe_image_messages(image_url) await complete_and_check( client, model_name, messages, context=f"completions_with_image url={image_url}", ) @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize( "image_urls", [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))], indirect=True, ) async def test_completions_with_image_with_uuid( client: openai.AsyncOpenAI, model_name: str, image_urls: list[str], ): for image_url in image_urls: messages = describe_image_messages( image_url, extra_image_fields={"uuid": image_url}, ) await complete_and_check( client, model_name, messages, context=f"uuid first request url={image_url}", ) cached_messages: list[dict] = [ {"role": "system", "content": "You are a helpful assistant."}, { "role": "user", "content": [ {"type": "text", "text": "Describe this image."}, {"type": "image_url", "image_url": {}, "uuid": image_url}, ], }, ] await complete_and_check( client, model_name, cached_messages, context=f"uuid cached (empty image) uuid={image_url}", ) @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) async def test_completions_with_empty_image_with_uuid_without_cache_hit( client: openai.AsyncOpenAI, model_name: str, ): with pytest.raises(openai.BadRequestError): await client.chat.completions.create( messages=[ {"role": "system", "content": "You are a helpful assistant."}, { "role": "user", "content": [ {"type": "text", "text": "Describe this image."}, { "type": "image_url", "image_url": {}, "uuid": "uuid_not_previously_seen", }, ], }, ], model=model_name, ) @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize( "image_urls", [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))], indirect=True, ) async def test_completions_with_image_with_incorrect_uuid_format( client: openai.AsyncOpenAI, model_name: str, image_urls: list[str], ): for image_url in image_urls: messages = describe_image_messages( image_url, extra_image_fields={ "also_incorrect_uuid_key": image_url, }, ) # Inject the bad key inside image_url dict too messages[1]["content"][1]["image_url"]["incorrect_uuid_key"] = image_url await complete_and_check( client, model_name, messages, context=f"incorrect uuid format url={image_url}", )