[CI] fix flaky empty responses and add diagnostic assertions in vision chat tests (#36341)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
This commit is contained in:
@@ -6,7 +6,7 @@ import json
|
||||
|
||||
import pytest
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
from ...utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer
|
||||
from .conftest import add_attention_backend
|
||||
|
||||
MISTRAL_FORMAT_ARGS = [
|
||||
@@ -19,12 +19,55 @@ MISTRAL_FORMAT_ARGS = [
|
||||
]
|
||||
|
||||
|
||||
async def transcribe_and_check(
|
||||
client,
|
||||
model_name: str,
|
||||
file,
|
||||
*,
|
||||
language: str,
|
||||
expected_text: str,
|
||||
expected_seconds: int | None = None,
|
||||
case_sensitive: bool = False,
|
||||
):
|
||||
"""Run a transcription request and assert the output contains
|
||||
*expected_text* and optionally that usage reports *expected_seconds*.
|
||||
|
||||
Provides detailed failure messages with the actual transcription output.
|
||||
"""
|
||||
transcription = await client.audio.transcriptions.create(
|
||||
model=model_name,
|
||||
file=file,
|
||||
language=language,
|
||||
response_format="text",
|
||||
temperature=0.0,
|
||||
)
|
||||
out = json.loads(transcription)
|
||||
out_text = out["text"]
|
||||
out_usage = out["usage"]
|
||||
|
||||
if case_sensitive:
|
||||
assert expected_text in out_text, (
|
||||
f"Expected {expected_text!r} in transcription output, got: {out_text!r}"
|
||||
)
|
||||
else:
|
||||
assert expected_text.lower() in out_text.lower(), (
|
||||
f"Expected {expected_text!r} (case-insensitive) in transcription "
|
||||
f"output, got: {out_text!r}"
|
||||
)
|
||||
|
||||
if expected_seconds is not None:
|
||||
assert out_usage["seconds"] == expected_seconds, (
|
||||
f"Expected {expected_seconds}s of audio, "
|
||||
f"got {out_usage['seconds']}s. Full usage: {out_usage!r}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize(
|
||||
"model_name", ["mistralai/Voxtral-Mini-3B-2507", "Qwen/Qwen3-ASR-0.6B"]
|
||||
)
|
||||
async def test_basic_audio(mary_had_lamb, model_name, rocm_aiter_fa_attention):
|
||||
server_args = ["--enforce-eager"]
|
||||
server_args = ["--enforce-eager", *ROCM_EXTRA_ARGS]
|
||||
|
||||
if model_name.startswith("mistralai"):
|
||||
server_args += MISTRAL_FORMAT_ARGS
|
||||
@@ -32,20 +75,18 @@ async def test_basic_audio(mary_had_lamb, model_name, rocm_aiter_fa_attention):
|
||||
add_attention_backend(server_args, rocm_aiter_fa_attention)
|
||||
|
||||
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
|
||||
with RemoteOpenAIServer(model_name, server_args) as remote_server:
|
||||
with RemoteOpenAIServer(
|
||||
model_name, server_args, env_dict=ROCM_ENV_OVERRIDES
|
||||
) as remote_server:
|
||||
client = remote_server.get_async_client()
|
||||
transcription = await client.audio.transcriptions.create(
|
||||
model=model_name,
|
||||
file=mary_had_lamb,
|
||||
await transcribe_and_check(
|
||||
client,
|
||||
model_name,
|
||||
mary_had_lamb,
|
||||
language="en",
|
||||
response_format="text",
|
||||
temperature=0.0,
|
||||
expected_text="Mary had a little lamb",
|
||||
expected_seconds=16,
|
||||
)
|
||||
out = json.loads(transcription)
|
||||
out_text = out["text"]
|
||||
out_usage = out["usage"]
|
||||
assert "Mary had a little lamb" in out_text
|
||||
assert out_usage["seconds"] == 16, out_usage["seconds"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -74,20 +115,18 @@ async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
|
||||
add_attention_backend(server_args, rocm_aiter_fa_attention)
|
||||
|
||||
# Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
|
||||
with RemoteOpenAIServer(model_name, server_args) as remote_server:
|
||||
with RemoteOpenAIServer(
|
||||
model_name, server_args, env_dict=ROCM_ENV_OVERRIDES
|
||||
) as remote_server:
|
||||
client = remote_server.get_async_client()
|
||||
transcription = await client.audio.transcriptions.create(
|
||||
model=lora_model_name,
|
||||
file=mary_had_lamb,
|
||||
await transcribe_and_check(
|
||||
client,
|
||||
lora_model_name,
|
||||
mary_had_lamb,
|
||||
language="en",
|
||||
response_format="text",
|
||||
temperature=0.0,
|
||||
expected_text="mary had a little lamb",
|
||||
expected_seconds=16,
|
||||
)
|
||||
out = json.loads(transcription)
|
||||
out_text = out["text"]
|
||||
out_usage = out["usage"]
|
||||
assert "mary had a little lamb" in out_text
|
||||
assert out_usage["seconds"] == 16, out_usage["seconds"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -97,20 +136,21 @@ async def test_basic_audio_with_lora(mary_had_lamb, rocm_aiter_fa_attention):
|
||||
async def test_basic_audio_foscolo(foscolo, rocm_aiter_fa_attention, model_name):
|
||||
# Gemma accuracy on some of the audio samples we use is particularly bad,
|
||||
# hence we use a different one here. WER is evaluated separately.
|
||||
server_args = ["--enforce-eager"]
|
||||
server_args = ["--enforce-eager", *ROCM_EXTRA_ARGS]
|
||||
|
||||
add_attention_backend(server_args, rocm_aiter_fa_attention)
|
||||
|
||||
with RemoteOpenAIServer(
|
||||
model_name, server_args, max_wait_seconds=480
|
||||
model_name,
|
||||
server_args,
|
||||
max_wait_seconds=480,
|
||||
env_dict=ROCM_ENV_OVERRIDES,
|
||||
) as remote_server:
|
||||
client = remote_server.get_async_client()
|
||||
transcription = await client.audio.transcriptions.create(
|
||||
model=model_name,
|
||||
file=foscolo,
|
||||
await transcribe_and_check(
|
||||
client,
|
||||
model_name,
|
||||
foscolo,
|
||||
language="it",
|
||||
response_format="text",
|
||||
temperature=0.0,
|
||||
expected_text="ove il mio corpo fanciulletto giacque",
|
||||
)
|
||||
out = json.loads(transcription)["text"]
|
||||
assert "ove il mio corpo fanciulletto giacque" in out
|
||||
|
||||
@@ -12,7 +12,7 @@ from vllm.multimodal.media import MediaWithBytes
|
||||
from vllm.multimodal.utils import encode_image_url, fetch_image
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ...utils import RemoteOpenAIServer
|
||||
from ...utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "microsoft/Phi-3.5-vision-instruct"
|
||||
MAXIMUM_IMAGES = 2
|
||||
@@ -48,10 +48,37 @@ def check_output_matches_terms(content: str, term_groups: list[list[str]]) -> bo
|
||||
All term groups must be satisfied.
|
||||
"""
|
||||
content_lower = content.lower()
|
||||
for group in term_groups:
|
||||
if not any(term.lower() in content_lower for term in group):
|
||||
return False
|
||||
return True
|
||||
return all(
|
||||
any(term.lower() in content_lower for term in group) for group in term_groups
|
||||
)
|
||||
|
||||
|
||||
def assert_non_empty_content(chat_completion, *, context: str = "") -> str:
    """Assert the first choice has non-empty string content; return it.

    Every failure message carries the optional *context* prefix plus the
    finish reason, the full message and the usage of the response, so flaky
    or model-quality failures can be diagnosed from the test log alone.

    Args:
        chat_completion: Response object exposing ``choices`` (each with
            ``message.content`` and ``finish_reason``) and ``usage``.
        context: Optional label; when non-empty it is rendered as
            ``"[context] "`` at the start of every failure message.

    Returns:
        The content string of the first choice.

    Raises:
        AssertionError: If there are no choices, or the first choice's
            content is ``None``, not a ``str``, or an empty string.
    """
    prefix = f"[{context}] " if context else ""

    # Guard the index below: an empty choices list would otherwise surface
    # as a bare IndexError without any of the diagnostic context.
    choices = chat_completion.choices
    assert choices, (
        f"{prefix}Expected at least one choice but got {choices!r}. "
        f"usage={chat_completion.usage!r}"
    )

    choice = choices[0]
    content = choice.message.content

    assert content is not None, (
        f"{prefix}Expected non-None content but got None. "
        f"finish_reason={choice.finish_reason!r}, "
        f"full message={choice.message!r}, "
        f"usage={chat_completion.usage!r}"
    )
    assert isinstance(content, str), (
        f"{prefix}Expected str content, got {type(content).__name__}: {content!r}"
    )
    assert len(content) > 0, (
        f"{prefix}Expected non-empty content but got empty string. "
        f"finish_reason={choice.finish_reason!r}, "
        f"full message={choice.message!r}, "
        f"usage={chat_completion.usage!r}"
    )
    return content
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
@@ -67,16 +94,22 @@ def server():
|
||||
"--trust-remote-code",
|
||||
"--limit-mm-per-prompt",
|
||||
json.dumps({"image": MAXIMUM_IMAGES}),
|
||||
*ROCM_EXTRA_ARGS,
|
||||
]
|
||||
|
||||
# ROCm: Increase timeouts to handle potential network delays and slower
|
||||
# video processing when downloading multiple videos from external sources
|
||||
env_overrides = {}
|
||||
if current_platform.is_rocm():
|
||||
env_overrides = {
|
||||
"VLLM_VIDEO_FETCH_TIMEOUT": "120",
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": "300",
|
||||
}
|
||||
env_overrides = {
|
||||
**ROCM_ENV_OVERRIDES,
|
||||
**(
|
||||
{
|
||||
"VLLM_VIDEO_FETCH_TIMEOUT": "120",
|
||||
"VLLM_ENGINE_ITERATION_TIMEOUT_S": "300",
|
||||
}
|
||||
if current_platform.is_rocm()
|
||||
else {}
|
||||
),
|
||||
}
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_overrides) as remote_server:
|
||||
yield remote_server
|
||||
@@ -117,6 +150,51 @@ def dummy_messages_from_image_url(
|
||||
]
|
||||
|
||||
|
||||
def describe_image_messages(
|
||||
image_url: str, *, extra_image_fields: dict | None = None
|
||||
) -> list[dict]:
|
||||
"""Build the system + user messages used by the completions-with-image
|
||||
family of tests. *extra_image_fields* is merged into the top-level
|
||||
image content block (for uuid / bad-key tests)."""
|
||||
image_block: dict = {
|
||||
"type": "image_url",
|
||||
"image_url": {"url": image_url},
|
||||
}
|
||||
if extra_image_fields:
|
||||
image_block.update(extra_image_fields)
|
||||
|
||||
return [
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "Describe this image."},
|
||||
image_block,
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
async def complete_and_check(
    client: openai.AsyncOpenAI,
    model_name: str,
    messages: list[dict],
    *,
    context: str,
    max_completion_tokens: int = 50,
    temperature: float = 0.0,
) -> str:
    """Issue one chat completion and return its validated content.

    The request is built from the given model, messages, token cap and
    temperature; the response is then passed to
    ``assert_non_empty_content`` together with *context*, and that
    function's return value is returned unchanged.
    """
    request_kwargs = {
        "model": model_name,
        "messages": messages,
        "max_completion_tokens": max_completion_tokens,
        "temperature": temperature,
    }
    response = await client.chat.completions.create(**request_kwargs)
    content = assert_non_empty_content(response, context=context)
    return content
|
||||
|
||||
|
||||
def get_hf_prompt_tokens(model_name, content, image_url):
|
||||
processor = AutoProcessor.from_pretrained(
|
||||
model_name, trust_remote_code=True, num_crops=4
|
||||
@@ -153,7 +231,6 @@ async def test_single_chat_session_image(
|
||||
messages = dummy_messages_from_image_url(image_url, content_text)
|
||||
|
||||
max_completion_tokens = 10
|
||||
# test single completion
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
@@ -162,32 +239,46 @@ async def test_single_chat_session_image(
|
||||
temperature=0.0,
|
||||
top_logprobs=5,
|
||||
)
|
||||
assert len(chat_completion.choices) == 1
|
||||
assert len(chat_completion.choices) == 1, (
|
||||
f"Expected 1 choice, got {len(chat_completion.choices)}"
|
||||
)
|
||||
|
||||
choice = chat_completion.choices[0]
|
||||
assert choice.finish_reason == "length"
|
||||
assert choice.finish_reason == "length", (
|
||||
f"Expected finish_reason='length' (capped at {max_completion_tokens} "
|
||||
f"tokens), got {choice.finish_reason!r}. "
|
||||
f"content={choice.message.content!r}"
|
||||
)
|
||||
|
||||
hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, image_url)
|
||||
assert chat_completion.usage == openai.types.CompletionUsage(
|
||||
expected_usage = openai.types.CompletionUsage(
|
||||
completion_tokens=max_completion_tokens,
|
||||
prompt_tokens=hf_prompt_tokens,
|
||||
total_tokens=hf_prompt_tokens + max_completion_tokens,
|
||||
)
|
||||
assert chat_completion.usage == expected_usage, (
|
||||
f"Usage mismatch: got {chat_completion.usage!r}, expected {expected_usage!r}"
|
||||
)
|
||||
|
||||
message = choice.message
|
||||
message = chat_completion.choices[0].message
|
||||
assert message.content is not None and len(message.content) >= 10
|
||||
assert message.role == "assistant"
|
||||
assert message.content is not None and len(message.content) >= 10, (
|
||||
f"Expected content with >=10 chars, got {message.content!r}"
|
||||
)
|
||||
assert message.role == "assistant", (
|
||||
f"Expected role='assistant', got {message.role!r}"
|
||||
)
|
||||
|
||||
messages.append({"role": "assistant", "content": message.content})
|
||||
|
||||
# test multi-turn dialogue
|
||||
messages.append({"role": "user", "content": "express your result in json"})
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
await complete_and_check(
|
||||
client,
|
||||
model_name,
|
||||
messages,
|
||||
context=f"multi-turn follow-up for {image_url}",
|
||||
max_completion_tokens=10,
|
||||
)
|
||||
message = chat_completion.choices[0].message
|
||||
assert message.content is not None and len(message.content) >= 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -209,7 +300,7 @@ async def test_error_on_invalid_image_url_type(
|
||||
|
||||
# image_url should be a dict {"url": "some url"}, not directly a string
|
||||
with pytest.raises(openai.BadRequestError):
|
||||
_ = await client.chat.completions.create(
|
||||
await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
max_completion_tokens=10,
|
||||
@@ -235,10 +326,15 @@ async def test_single_chat_session_image_beamsearch(
|
||||
top_logprobs=5,
|
||||
extra_body=dict(use_beam_search=True),
|
||||
)
|
||||
assert len(chat_completion.choices) == 2
|
||||
assert (
|
||||
chat_completion.choices[0].message.content
|
||||
!= chat_completion.choices[1].message.content
|
||||
assert len(chat_completion.choices) == 2, (
|
||||
f"Expected 2 beam search choices, got {len(chat_completion.choices)}"
|
||||
)
|
||||
|
||||
content_0 = chat_completion.choices[0].message.content
|
||||
content_1 = chat_completion.choices[1].message.content
|
||||
assert content_0 != content_1, (
|
||||
f"Beam search should produce different outputs for {image_url}, "
|
||||
f"but both returned: {content_0!r}"
|
||||
)
|
||||
|
||||
|
||||
@@ -269,33 +365,46 @@ async def test_single_chat_session_image_base64encoded(
|
||||
temperature=0.0,
|
||||
top_logprobs=5,
|
||||
)
|
||||
assert len(chat_completion.choices) == 1
|
||||
assert len(chat_completion.choices) == 1, (
|
||||
f"Expected 1 choice, got {len(chat_completion.choices)}"
|
||||
)
|
||||
|
||||
choice = chat_completion.choices[0]
|
||||
assert choice.finish_reason == "length"
|
||||
assert choice.finish_reason == "length", (
|
||||
f"Expected finish_reason='length', got {choice.finish_reason!r}. "
|
||||
f"content={choice.message.content!r}"
|
||||
)
|
||||
|
||||
hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, image_url)
|
||||
assert chat_completion.usage == openai.types.CompletionUsage(
|
||||
expected_usage = openai.types.CompletionUsage(
|
||||
completion_tokens=max_completion_tokens,
|
||||
prompt_tokens=hf_prompt_tokens,
|
||||
total_tokens=hf_prompt_tokens + max_completion_tokens,
|
||||
)
|
||||
assert chat_completion.usage == expected_usage, (
|
||||
f"Usage mismatch: got {chat_completion.usage!r}, expected {expected_usage!r}"
|
||||
)
|
||||
|
||||
message = choice.message
|
||||
message = chat_completion.choices[0].message
|
||||
assert message.content is not None and len(message.content) >= 10
|
||||
assert message.role == "assistant"
|
||||
assert message.content is not None and len(message.content) >= 10, (
|
||||
f"Expected content with >=10 chars, got {message.content!r}"
|
||||
)
|
||||
assert message.role == "assistant", (
|
||||
f"Expected role='assistant', got {message.role!r}"
|
||||
)
|
||||
|
||||
messages.append({"role": "assistant", "content": message.content})
|
||||
|
||||
# test multi-turn dialogue
|
||||
messages.append({"role": "user", "content": "express your result in json"})
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
await complete_and_check(
|
||||
client,
|
||||
model_name,
|
||||
messages,
|
||||
context=f"multi-turn base64 follow-up for {raw_image_url}",
|
||||
max_completion_tokens=10,
|
||||
temperature=0.0,
|
||||
)
|
||||
message = chat_completion.choices[0].message
|
||||
assert message.content is not None and len(message.content) >= 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -321,7 +430,10 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
|
||||
temperature=0.0,
|
||||
extra_body=dict(use_beam_search=True),
|
||||
)
|
||||
assert len(chat_completion.choices) == 2
|
||||
assert len(chat_completion.choices) == 2, (
|
||||
f"Expected 2 beam search choices for image {image_idx} "
|
||||
f"({raw_image_url}), got {len(chat_completion.choices)}"
|
||||
)
|
||||
|
||||
# Verify beam search produces two different non-empty outputs
|
||||
content_0 = chat_completion.choices[0].message.content
|
||||
@@ -333,18 +445,28 @@ async def test_single_chat_session_image_base64encoded_beamsearch(
|
||||
f"Output 0: {content_0!r}, Output 1: {content_1!r}"
|
||||
)
|
||||
|
||||
assert content_0, "First beam search output should not be empty"
|
||||
assert content_1, "Second beam search output should not be empty"
|
||||
assert content_0 != content_1, "Beam search should produce different outputs"
|
||||
assert content_0, (
|
||||
f"First beam output is empty for image {image_idx} ({raw_image_url}). "
|
||||
f"finish_reason={chat_completion.choices[0].finish_reason!r}"
|
||||
)
|
||||
assert content_1, (
|
||||
f"Second beam output is empty for image {image_idx} "
|
||||
f"({raw_image_url}). "
|
||||
f"finish_reason={chat_completion.choices[1].finish_reason!r}"
|
||||
)
|
||||
assert content_0 != content_1, (
|
||||
f"Beam search produced identical outputs for image {image_idx} "
|
||||
f"({raw_image_url}): {content_0!r}"
|
||||
)
|
||||
|
||||
# Verify each output contains the required terms for this image
|
||||
for i, content in enumerate([content_0, content_1]):
|
||||
if not check_output_matches_terms(content, required_terms):
|
||||
pytest.fail(
|
||||
f"Output {i} '{content}' doesn't contain required terms. "
|
||||
f"Expected all of these term groups (at least one from each): "
|
||||
f"{required_terms}"
|
||||
)
|
||||
assert check_output_matches_terms(content, required_terms), (
|
||||
f"Beam output {i} for image {image_idx} ({raw_image_url}) "
|
||||
f"doesn't match required terms.\n"
|
||||
f" content: {content!r}\n"
|
||||
f" required (all groups, >=1 per group): {required_terms}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -378,16 +500,29 @@ async def test_chat_streaming_image(
|
||||
async for chunk in stream:
|
||||
delta = chunk.choices[0].delta
|
||||
if delta.role:
|
||||
assert delta.role == "assistant"
|
||||
assert delta.role == "assistant", (
|
||||
f"Expected role='assistant' in stream delta, got {delta.role!r}"
|
||||
)
|
||||
if delta.content:
|
||||
chunks.append(delta.content)
|
||||
if chunk.choices[0].finish_reason is not None:
|
||||
finish_reason_count += 1
|
||||
# finish reason should only return in last block
|
||||
assert finish_reason_count == 1
|
||||
assert chunk.choices[0].finish_reason == stop_reason
|
||||
assert delta.content
|
||||
assert "".join(chunks) == output
|
||||
assert finish_reason_count == 1, (
|
||||
f"Expected exactly 1 finish_reason across stream chunks, "
|
||||
f"got {finish_reason_count}"
|
||||
)
|
||||
assert chunk.choices[0].finish_reason == stop_reason, (
|
||||
f"Stream finish_reason={chunk.choices[0].finish_reason!r} "
|
||||
f"doesn't match non-stream finish_reason={stop_reason!r}"
|
||||
)
|
||||
|
||||
streamed_text = "".join(chunks)
|
||||
assert streamed_text == output, (
|
||||
f"Streamed output doesn't match non-streamed for {image_url}.\n"
|
||||
f" streamed: {streamed_text!r}\n"
|
||||
f" non-streamed: {output!r}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -418,17 +553,19 @@ async def test_multi_image_input(
|
||||
max_tokens=5,
|
||||
temperature=0.0,
|
||||
)
|
||||
completion = completion.choices[0].text
|
||||
assert completion is not None and len(completion) >= 0
|
||||
assert completion.choices[0].text is not None, (
|
||||
"Server failed to produce output after rejecting over-limit "
|
||||
"multi-image request"
|
||||
)
|
||||
else:
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=model_name,
|
||||
messages=messages,
|
||||
await complete_and_check(
|
||||
client,
|
||||
model_name,
|
||||
messages,
|
||||
context=f"multi-image input ({len(image_urls)} images)",
|
||||
max_completion_tokens=10,
|
||||
temperature=0.0,
|
||||
)
|
||||
message = chat_completion.choices[0].message
|
||||
assert message.content is not None and len(message.content) >= 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -444,30 +581,13 @@ async def test_completions_with_image(
|
||||
image_urls: list[str],
|
||||
):
|
||||
for image_url in image_urls:
|
||||
chat_completion = await client.chat.completions.create(
|
||||
messages=[
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Describe this image.",
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url,
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
model=model_name,
|
||||
messages = describe_image_messages(image_url)
|
||||
await complete_and_check(
|
||||
client,
|
||||
model_name,
|
||||
messages,
|
||||
context=f"completions_with_image url={image_url}",
|
||||
)
|
||||
assert chat_completion.choices[0].message.content is not None
|
||||
assert isinstance(chat_completion.choices[0].message.content, str)
|
||||
assert len(chat_completion.choices[0].message.content) > 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -483,54 +603,33 @@ async def test_completions_with_image_with_uuid(
|
||||
image_urls: list[str],
|
||||
):
|
||||
for image_url in image_urls:
|
||||
chat_completion = await client.chat.completions.create(
|
||||
messages=[
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Describe this image.",
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url,
|
||||
},
|
||||
"uuid": image_url,
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
model=model_name,
|
||||
messages = describe_image_messages(
|
||||
image_url,
|
||||
extra_image_fields={"uuid": image_url},
|
||||
)
|
||||
await complete_and_check(
|
||||
client,
|
||||
model_name,
|
||||
messages,
|
||||
context=f"uuid first request url={image_url}",
|
||||
)
|
||||
assert chat_completion.choices[0].message.content is not None
|
||||
assert isinstance(chat_completion.choices[0].message.content, str)
|
||||
assert len(chat_completion.choices[0].message.content) > 0
|
||||
|
||||
# Second request, with empty image but the same uuid.
|
||||
chat_completion_with_empty_image = await client.chat.completions.create(
|
||||
messages=[
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Describe this image.",
|
||||
},
|
||||
{"type": "image_url", "image_url": {}, "uuid": image_url},
|
||||
],
|
||||
},
|
||||
],
|
||||
model=model_name,
|
||||
cached_messages: list[dict] = [
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "Describe this image."},
|
||||
{"type": "image_url", "image_url": {}, "uuid": image_url},
|
||||
],
|
||||
},
|
||||
]
|
||||
await complete_and_check(
|
||||
client,
|
||||
model_name,
|
||||
cached_messages,
|
||||
context=f"uuid cached (empty image) uuid={image_url}",
|
||||
)
|
||||
assert chat_completion_with_empty_image.choices[0].message.content is not None
|
||||
assert isinstance(
|
||||
chat_completion_with_empty_image.choices[0].message.content, str
|
||||
)
|
||||
assert len(chat_completion_with_empty_image.choices[0].message.content) > 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -540,16 +639,13 @@ async def test_completions_with_empty_image_with_uuid_without_cache_hit(
|
||||
model_name: str,
|
||||
):
|
||||
with pytest.raises(openai.BadRequestError):
|
||||
_ = await client.chat.completions.create(
|
||||
await client.chat.completions.create(
|
||||
messages=[
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Describe this image.",
|
||||
},
|
||||
{"type": "text", "text": "Describe this image."},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {},
|
||||
@@ -575,29 +671,18 @@ async def test_completions_with_image_with_incorrect_uuid_format(
|
||||
image_urls: list[str],
|
||||
):
|
||||
for image_url in image_urls:
|
||||
chat_completion = await client.chat.completions.create(
|
||||
messages=[
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Describe this image.",
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": image_url,
|
||||
"incorrect_uuid_key": image_url,
|
||||
},
|
||||
"also_incorrect_uuid_key": image_url,
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
model=model_name,
|
||||
messages = describe_image_messages(
|
||||
image_url,
|
||||
extra_image_fields={
|
||||
"also_incorrect_uuid_key": image_url,
|
||||
},
|
||||
)
|
||||
# Inject the bad key inside image_url dict too
|
||||
messages[1]["content"][1]["image_url"]["incorrect_uuid_key"] = image_url
|
||||
|
||||
await complete_and_check(
|
||||
client,
|
||||
model_name,
|
||||
messages,
|
||||
context=f"incorrect uuid format url={image_url}",
|
||||
)
|
||||
assert chat_completion.choices[0].message.content is not None
|
||||
assert isinstance(chat_completion.choices[0].message.content, str)
|
||||
assert len(chat_completion.choices[0].message.content) > 0
|
||||
|
||||
Reference in New Issue
Block a user