[CI] fix flaky empty responses and add diagnostic assertions in vision chat tests (#36341)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
Andreas Karatzas
2026-03-08 00:42:24 -06:00
committed by GitHub
parent 5d6aae4577
commit 40077ea3de
2 changed files with 317 additions and 192 deletions

View File

@@ -6,7 +6,7 @@ import json
import pytest
from ...utils import RemoteOpenAIServer
from ...utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer
from .conftest import add_attention_backend
MISTRAL_FORMAT_ARGS = [
@@ -19,12 +19,55 @@ MISTRAL_FORMAT_ARGS = [
]
async def transcribe_and_check(
client,
model_name: str,
file,
*,
language: str,
expected_text: str,
expected_seconds: int | None = None,
case_sensitive: bool = False,
):
"""Run a transcription request and assert the output contains
*expected_text* and optionally that usage reports *expected_seconds*.
Provides detailed failure messages with the actual transcription output.
"""
transcription = await client.audio.transcriptions.create(
model=model_name,
file=file,
language=language,
response_format="text",
temperature=0.0,
)
out = json.loads(transcription)
out_text = out["text"]
out_usage = out["usage"]
if case_sensitive:
assert expected_text in out_text, (
f"Expected {expected_text!r} in transcription output, got: {out_text!r}"
)
else:
assert expected_text.lower() in out_text.lower(), (
f"Expected {expected_text!r} (case-insensitive) in transcription "
f"output, got: {out_text!r}"
)
if expected_seconds is not None:
assert out_usage["seconds"] == expected_seconds, (
f"Expected {expected_seconds}s of audio, "
f"got {out_usage['seconds']}s. Full usage: {out_usage!r}"
)
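# A minimal usage sketch of the helper above (the model name here is an
# illustrative placeholder, not taken from this change):
#
#     client = remote_server.get_async_client()
#     await transcribe_and_check(
#         client,
#         "openai/whisper-large-v3",  # hypothetical model, for illustration
#         mary_had_lamb,
#         language="en",
#         expected_text="Mary had a little lamb",
#         expected_seconds=16,
#     )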
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name", ["mistralai/Voxtral-Mini-3B-2507", "Qwen/Qwen3-ASR-0.6B"]
)
async def test_basic_audio(mary_had_lamb, model_name, rocm_aiter_fa_attention):
server_args = ["--enforce-eager"]
server_args = ["--enforce-eager", *ROCM_EXTRA_ARGS]
if model_name.startswith("mistralai"):
server_args += MISTRAL_FORMAT_ARGS
@@ -32,20 +75,18 @@ async def test_basic_audio(mary_had_lamb, model_name, rocm_aiter_fa_attention):
add_attention_backend(server_args, rocm_aiter_fa_attention)
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
with RemoteOpenAIServer(model_name, server_args) as remote_server:
with RemoteOpenAIServer(
model_name, server_args, env_dict=ROCM_ENV_OVERRIDES
) as remote_server:
client = remote_server.get_async_client()
transcription = await client.audio.transcriptions.create(
model=model_name,
file=mary_had_lamb,
await transcribe_and_check(
client,
model_name,
mary_had_lamb,
language="en",
response_format="text",
temperature=0.0,
expected_text="Mary had a little lamb",
expected_seconds=16,
)
out = json.loads(transcription)
out_text = out["text"]
out_usage = out["usage"]
assert "Mary had a little lamb" in out_text
assert out_usage["seconds"] == 16, out_usage["seconds"]
@pytest.mark.asyncio
@@ -74,20 +115,18 @@ async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
add_attention_backend(server_args, rocm_aiter_fa_attention)
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
with RemoteOpenAIServer(model_name, server_args) as remote_server:
with RemoteOpenAIServer(
model_name, server_args, env_dict=ROCM_ENV_OVERRIDES
) as remote_server:
client = remote_server.get_async_client()
transcription = await client.audio.transcriptions.create(
model=lora_model_name,
file=mary_had_lamb,
await transcribe_and_check(
client,
lora_model_name,
mary_had_lamb,
language="en",
response_format="text",
temperature=0.0,
expected_text="mary had a little lamb",
expected_seconds=16,
)
out = json.loads(transcription)
out_text = out["text"]
out_usage = out["usage"]
assert "mary had a little lamb" in out_text
assert out_usage["seconds"] == 16, out_usage["seconds"]
@pytest.mark.asyncio
@@ -97,20 +136,21 @@ async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
async def test_basic_audio_foscolo(foscolo, rocm_aiter_fa_attention, model_name):
# Gemma accuracy on some of the audio samples we use is particularly bad,
# hence we use a different one here. WER is evaluated separately.
server_args = ["--enforce-eager"]
server_args = ["--enforce-eager", *ROCM_EXTRA_ARGS]
add_attention_backend(server_args, rocm_aiter_fa_attention)
with RemoteOpenAIServer(
model_name, server_args, max_wait_seconds=480
model_name,
server_args,
max_wait_seconds=480,
env_dict=ROCM_ENV_OVERRIDES,
) as remote_server:
client = remote_server.get_async_client()
transcription = await client.audio.transcriptions.create(
model=model_name,
file=foscolo,
await transcribe_and_check(
client,
model_name,
foscolo,
language="it",
response_format="text",
temperature=0.0,
expected_text="ove il mio corpo fanciulletto giacque",
)
out = json.loads(transcription)["text"]
assert "ove il mio corpo fanciulletto giacque" in out

View File

@@ -12,7 +12,7 @@ from vllm.multimodal.media import MediaWithBytes
from vllm.multimodal.utils import encode_image_url, fetch_image
from vllm.platforms import current_platform
from ...utils import RemoteOpenAIServer
from ...utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer
MODEL_NAME = "microsoft/Phi-3.5-vision-instruct"
MAXIMUM_IMAGES = 2
@@ -48,10 +48,37 @@ def check_output_matches_terms(content: str, term_groups: list[list[str]]) -> bo
All term groups must be satisfied.
"""
content_lower = content.lower()
for group in term_groups:
if not any(term.lower() in content_lower for term in group):
return False
return True
return all(
any(term.lower() in content_lower for term in group) for group in term_groups
)
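# Illustrative semantics (example strings are hypothetical): every group must
# be matched by at least one of its terms, case-insensitively.
#
#     assert check_output_matches_terms(
#         "A duck standing on grass.",
#         [["duck", "bird"], ["grass", "lawn"]],
#     )  # passes: at least one hit per group
#     assert not check_output_matches_terms(
#         "A duck on water.",
#         [["duck", "bird"], ["grass", "lawn"]],
#     )  # fails: no hit in the second group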
def assert_non_empty_content(chat_completion, *, context: str = "") -> str:
"""Assert the first choice has non-empty string content; return it.
Provides a detailed failure message including the full ChatCompletion
response so flaky / model-quality issues are easy to diagnose.
"""
prefix = f"[{context}] " if context else ""
choice = chat_completion.choices[0]
content = choice.message.content
assert content is not None, (
f"{prefix}Expected non-None content but got None. "
f"finish_reason={choice.finish_reason!r}, "
f"full message={choice.message!r}, "
f"usage={chat_completion.usage!r}"
)
assert isinstance(content, str), (
f"{prefix}Expected str content, got {type(content).__name__}: {content!r}"
)
assert len(content) > 0, (
f"{prefix}Expected non-empty content but got empty string. "
f"finish_reason={choice.finish_reason!r}, "
f"full message={choice.message!r}, "
f"usage={chat_completion.usage!r}"
)
return content
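# Sketch of the intended call pattern (request parameters are illustrative):
#
#     chat_completion = await client.chat.completions.create(
#         model=model_name, messages=messages, max_completion_tokens=10
#     )
#     content = assert_non_empty_content(
#         chat_completion, context="single-image smoke test"
#     )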
@pytest.fixture(scope="module")
@@ -67,16 +94,22 @@ def server():
"--trust-remote-code",
"--limit-mm-per-prompt",
json.dumps({"image": MAXIMUM_IMAGES}),
*ROCM_EXTRA_ARGS,
]
# ROCm: Increase timeouts to handle potential network delays and slower
# video processing when downloading multiple videos from external sources
env_overrides = {}
if current_platform.is_rocm():
env_overrides = {
"VLLM_VIDEO_FETCH_TIMEOUT": "120",
"VLLM_ENGINE_ITERATION_TIMEOUT_S": "300",
}
env_overrides = {
**ROCM_ENV_OVERRIDES,
**(
{
"VLLM_VIDEO_FETCH_TIMEOUT": "120",
"VLLM_ENGINE_ITERATION_TIMEOUT_S": "300",
}
if current_platform.is_rocm()
else {}
),
}
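# Net effect of the merge above: on ROCm the server env is ROCM_ENV_OVERRIDES
# plus the two timeout knobs; on other platforms it is ROCM_ENV_OVERRIDES alone.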
with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_overrides) as remote_server:
yield remote_server
@@ -117,6 +150,51 @@ def dummy_messages_from_image_url(
]
def describe_image_messages(
image_url: str, *, extra_image_fields: dict | None = None
) -> list[dict]:
"""Build the system + user messages used by the completions-with-image
family of tests. *extra_image_fields* is merged into the top-level
image content block (for uuid / bad-key tests)."""
image_block: dict = {
"type": "image_url",
"image_url": {"url": image_url},
}
if extra_image_fields:
image_block.update(extra_image_fields)
return [
{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": [
{"type": "text", "text": "Describe this image."},
image_block,
],
},
]
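# For example (values hypothetical), the call
#
#     describe_image_messages(
#         "https://example.com/duck.jpg",
#         extra_image_fields={"uuid": "img-1"},
#     )
#
# produces an image block of
#     {"type": "image_url", "image_url": {"url": "..."}, "uuid": "img-1"},
# which is how the uuid / bad-key tests below inject their extra keys.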
async def complete_and_check(
client: openai.AsyncOpenAI,
model_name: str,
messages: list[dict],
*,
context: str,
max_completion_tokens: int = 50,
temperature: float = 0.0,
) -> str:
"""Run a chat completion and assert the output is non-empty.
Returns the content string."""
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=max_completion_tokens,
temperature=temperature,
)
return assert_non_empty_content(chat_completion, context=context)
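# Representative usage, as exercised by the tests below (the context string is
# illustrative):
#
#     content = await complete_and_check(
#         client,
#         model_name,
#         describe_image_messages(image_url),
#         context=f"smoke test url={image_url}",
#     )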
def get_hf_prompt_tokens(model_name, content, image_url):
processor = AutoProcessor.from_pretrained(
model_name, trust_remote_code=True, num_crops=4
@@ -153,7 +231,6 @@ async def test_single_chat_session_image(
messages = dummy_messages_from_image_url(image_url, content_text)
max_completion_tokens = 10
# test single completion
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
@@ -162,32 +239,46 @@ async def test_single_chat_session_image(
temperature=0.0,
top_logprobs=5,
)
assert len(chat_completion.choices) == 1
assert len(chat_completion.choices) == 1, (
f"Expected 1 choice, got {len(chat_completion.choices)}"
)
choice = chat_completion.choices[0]
assert choice.finish_reason == "length"
assert choice.finish_reason == "length", (
f"Expected finish_reason='length' (capped at {max_completion_tokens} "
f"tokens), got {choice.finish_reason!r}. "
f"content={choice.message.content!r}"
)
hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, image_url)
assert chat_completion.usage == openai.types.CompletionUsage(
expected_usage = openai.types.CompletionUsage(
completion_tokens=max_completion_tokens,
prompt_tokens=hf_prompt_tokens,
total_tokens=hf_prompt_tokens + max_completion_tokens,
)
assert chat_completion.usage == expected_usage, (
f"Usage mismatch: got {chat_completion.usage!r}, expected {expected_usage!r}"
)
message = choice.message
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 10
assert message.role == "assistant"
assert message.content is not None and len(message.content) >= 10, (
f"Expected content with >=10 chars, got {message.content!r}"
)
assert message.role == "assistant", (
f"Expected role='assistant', got {message.role!r}"
)
messages.append({"role": "assistant", "content": message.content})
# test multi-turn dialogue
messages.append({"role": "user", "content": "express your result in json"})
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
await complete_and_check(
client,
model_name,
messages,
context=f"multi-turn follow-up for {image_url}",
max_completion_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@@ -209,7 +300,7 @@ async def test_error_on_invalid_image_url_type(
# image_url should be a dict {"url": "some url"}, not directly a string
with pytest.raises(openai.BadRequestError):
_ = await client.chat.completions.create(
await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
@@ -235,10 +326,15 @@ async def test_single_chat_session_image_beamsearch(
top_logprobs=5,
extra_body=dict(use_beam_search=True),
)
assert len(chat_completion.choices) == 2
assert (
chat_completion.choices[0].message.content
!= chat_completion.choices[1].message.content
assert len(chat_completion.choices) == 2, (
f"Expected 2 beam search choices, got {len(chat_completion.choices)}"
)
content_0 = chat_completion.choices[0].message.content
content_1 = chat_completion.choices[1].message.content
assert content_0 != content_1, (
f"Beam search should produce different outputs for {image_url}, "
f"but both returned: {content_0!r}"
)
@@ -269,33 +365,46 @@ async def test_single_chat_session_image_base64encoded(
temperature=0.0,
top_logprobs=5,
)
assert len(chat_completion.choices) == 1
assert len(chat_completion.choices) == 1, (
f"Expected 1 choice, got {len(chat_completion.choices)}"
)
choice = chat_completion.choices[0]
assert choice.finish_reason == "length"
assert choice.finish_reason == "length", (
f"Expected finish_reason='length', got {choice.finish_reason!r}. "
f"content={choice.message.content!r}"
)
hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, image_url)
assert chat_completion.usage == openai.types.CompletionUsage(
expected_usage = openai.types.CompletionUsage(
completion_tokens=max_completion_tokens,
prompt_tokens=hf_prompt_tokens,
total_tokens=hf_prompt_tokens + max_completion_tokens,
)
assert chat_completion.usage == expected_usage, (
f"Usage mismatch: got {chat_completion.usage!r}, expected {expected_usage!r}"
)
message = choice.message
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 10
assert message.role == "assistant"
assert message.content is not None and len(message.content) >= 10, (
f"Expected content with >=10 chars, got {message.content!r}"
)
assert message.role == "assistant", (
f"Expected role='assistant', got {message.role!r}"
)
messages.append({"role": "assistant", "content": message.content})
# test multi-turn dialogue
messages.append({"role": "user", "content": "express your result in json"})
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
await complete_and_check(
client,
model_name,
messages,
context=f"multi-turn base64 follow-up for {raw_image_url}",
max_completion_tokens=10,
temperature=0.0,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@@ -321,7 +430,10 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
temperature=0.0,
extra_body=dict(use_beam_search=True),
)
assert len(chat_completion.choices) == 2
assert len(chat_completion.choices) == 2, (
f"Expected 2 beam search choices for image {image_idx} "
f"({raw_image_url}), got {len(chat_completion.choices)}"
)
# Verify beam search produces two different non-empty outputs
content_0 = chat_completion.choices[0].message.content
@@ -333,18 +445,28 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
f"Output 0: {content_0!r}, Output 1: {content_1!r}"
)
assert content_0, "First beam search output should not be empty"
assert content_1, "Second beam search output should not be empty"
assert content_0 != content_1, "Beam search should produce different outputs"
assert content_0, (
f"First beam output is empty for image {image_idx} ({raw_image_url}). "
f"finish_reason={chat_completion.choices[0].finish_reason!r}"
)
assert content_1, (
f"Second beam output is empty for image {image_idx} "
f"({raw_image_url}). "
f"finish_reason={chat_completion.choices[1].finish_reason!r}"
)
assert content_0 != content_1, (
f"Beam search produced identical outputs for image {image_idx} "
f"({raw_image_url}): {content_0!r}"
)
# Verify each output contains the required terms for this image
for i, content in enumerate([content_0, content_1]):
if not check_output_matches_terms(content, required_terms):
pytest.fail(
f"Output {i} '{content}' doesn't contain required terms. "
f"Expected all of these term groups (at least one from each): "
f"{required_terms}"
)
assert check_output_matches_terms(content, required_terms), (
f"Beam output {i} for image {image_idx} ({raw_image_url}) "
f"doesn't match required terms.\n"
f" content: {content!r}\n"
f" required (all groups, >=1 per group): {required_terms}"
)
@pytest.mark.asyncio
@@ -378,16 +500,29 @@ async def test_chat_streaming_image(
async for chunk in stream:
delta = chunk.choices[0].delta
if delta.role:
assert delta.role == "assistant"
assert delta.role == "assistant", (
f"Expected role='assistant' in stream delta, got {delta.role!r}"
)
if delta.content:
chunks.append(delta.content)
if chunk.choices[0].finish_reason is not None:
finish_reason_count += 1
# finish_reason should only be set on the final stream chunk
assert finish_reason_count == 1
assert chunk.choices[0].finish_reason == stop_reason
assert delta.content
assert "".join(chunks) == output
assert finish_reason_count == 1, (
f"Expected exactly 1 finish_reason across stream chunks, "
f"got {finish_reason_count}"
)
assert chunk.choices[0].finish_reason == stop_reason, (
f"Stream finish_reason={chunk.choices[0].finish_reason!r} "
f"doesn't match non-stream finish_reason={stop_reason!r}"
)
streamed_text = "".join(chunks)
assert streamed_text == output, (
f"Streamed output doesn't match non-streamed for {image_url}.\n"
f" streamed: {streamed_text!r}\n"
f" non-streamed: {output!r}"
)
@pytest.mark.asyncio
@@ -418,17 +553,19 @@ async def test_multi_image_input(
max_tokens=5,
temperature=0.0,
)
completion = completion.choices[0].text
assert completion is not None and len(completion) >= 0
assert completion.choices[0].text is not None, (
"Server failed to produce output after rejecting over-limit "
"multi-image request"
)
else:
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
await complete_and_check(
client,
model_name,
messages,
context=f"multi-image input ({len(image_urls)} images)",
max_completion_tokens=10,
temperature=0.0,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@@ -444,30 +581,13 @@ async def test_completions_with_image(
image_urls: list[str],
):
for image_url in image_urls:
chat_completion = await client.chat.completions.create(
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": [
{
"type": "text",
"text": "Describe this image.",
},
{
"type": "image_url",
"image_url": {
"url": image_url,
},
},
],
},
],
model=model_name,
messages = describe_image_messages(image_url)
await complete_and_check(
client,
model_name,
messages,
context=f"completions_with_image url={image_url}",
)
assert chat_completion.choices[0].message.content is not None
assert isinstance(chat_completion.choices[0].message.content, str)
assert len(chat_completion.choices[0].message.content) > 0
@pytest.mark.asyncio
@@ -483,54 +603,33 @@ async def test_completions_with_image_with_uuid(
image_urls: list[str],
):
for image_url in image_urls:
chat_completion = await client.chat.completions.create(
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": [
{
"type": "text",
"text": "Describe this image.",
},
{
"type": "image_url",
"image_url": {
"url": image_url,
},
"uuid": image_url,
},
],
},
],
model=model_name,
messages = describe_image_messages(
image_url,
extra_image_fields={"uuid": image_url},
)
await complete_and_check(
client,
model_name,
messages,
context=f"uuid first request url={image_url}",
)
assert chat_completion.choices[0].message.content is not None
assert isinstance(chat_completion.choices[0].message.content, str)
assert len(chat_completion.choices[0].message.content) > 0
# Second request, with empty image but the same uuid.
chat_completion_with_empty_image = await client.chat.completions.create(
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": [
{
"type": "text",
"text": "Describe this image.",
},
{"type": "image_url", "image_url": {}, "uuid": image_url},
],
},
],
model=model_name,
cached_messages: list[dict] = [
{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": [
{"type": "text", "text": "Describe this image."},
{"type": "image_url", "image_url": {}, "uuid": image_url},
],
},
]
await complete_and_check(
client,
model_name,
cached_messages,
context=f"uuid cached (empty image) uuid={image_url}",
)
assert chat_completion_with_empty_image.choices[0].message.content is not None
assert isinstance(
chat_completion_with_empty_image.choices[0].message.content, str
)
assert len(chat_completion_with_empty_image.choices[0].message.content) > 0
@pytest.mark.asyncio
@@ -540,16 +639,13 @@ async def test_completions_with_empty_image_with_uuid_without_cache_hit(
model_name: str,
):
with pytest.raises(openai.BadRequestError):
_ = await client.chat.completions.create(
await client.chat.completions.create(
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": [
{
"type": "text",
"text": "Describe this image.",
},
{"type": "text", "text": "Describe this image."},
{
"type": "image_url",
"image_url": {},
@@ -575,29 +671,18 @@ async def test_completions_with_image_with_incorrect_uuid_format(
image_urls: list[str],
):
for image_url in image_urls:
chat_completion = await client.chat.completions.create(
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": [
{
"type": "text",
"text": "Describe this image.",
},
{
"type": "image_url",
"image_url": {
"url": image_url,
"incorrect_uuid_key": image_url,
},
"also_incorrect_uuid_key": image_url,
},
],
},
],
model=model_name,
messages = describe_image_messages(
image_url,
extra_image_fields={
"also_incorrect_uuid_key": image_url,
},
)
# Inject the bad key inside image_url dict too
messages[1]["content"][1]["image_url"]["incorrect_uuid_key"] = image_url
await complete_and_check(
client,
model_name,
messages,
context=f"incorrect uuid format url={image_url}",
)
assert chat_completion.choices[0].message.content is not None
assert isinstance(chat_completion.choices[0].message.content, str)
assert len(chat_completion.choices[0].message.content) > 0