[Refactor] Relocate completion and chat completion tests (#37125)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
This commit is contained in:
397
tests/entrypoints/openai/chat_completion/test_audio.py
Normal file
397
tests/entrypoints/openai/chat_completion/test_audio.py
Normal file
@@ -0,0 +1,397 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.assets.audio import AudioAsset
|
||||
from vllm.multimodal.utils import encode_audio_base64, encode_audio_url, fetch_audio
|
||||
|
||||
MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
|
||||
TEST_AUDIO_URLS = [
|
||||
AudioAsset("winning_call").url,
|
||||
AudioAsset("mary_had_lamb").url,
|
||||
]
|
||||
MAXIMUM_AUDIOS = 2
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server():
    """Start a RemoteOpenAIServer running the Ultravox audio model.

    Module-scoped so all tests in this file share one server process.
    The per-prompt audio count is capped at MAXIMUM_AUDIOS via
    --limit-mm-per-prompt so the over-limit test can trigger a 400.
    """
    args = [
        "--dtype",
        "float32",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "5",
        "--enforce-eager",
        "--trust-remote-code",
        "--limit-mm-per-prompt",
        json.dumps({"audio": MAXIMUM_AUDIOS}),
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server):
    """Yield an async OpenAI client bound to the shared test server."""
    async with server.get_async_client() as async_client:
        yield async_client
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def base64_encoded_audio() -> dict[str, str]:
    """Map each test audio URL to its base64-encoded audio payload.

    Session-scoped: the audio files are fetched over the network once
    and reused across the whole test session.
    """
    return {
        audio_url: encode_audio_base64(*fetch_audio(audio_url))
        for audio_url in TEST_AUDIO_URLS
    }
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def url_encoded_audio() -> dict[str, str]:
    """Map each test audio URL to a `data:` URL form of the same audio.

    Session-scoped to avoid re-downloading the assets per test.
    """
    return {
        audio_url: encode_audio_url(*fetch_audio(audio_url))
        for audio_url in TEST_AUDIO_URLS
    }
|
||||
|
||||
|
||||
def dummy_messages_from_audio_url(
|
||||
audio_urls: str | list[str],
|
||||
content_text: str = "What's happening in this audio?",
|
||||
):
|
||||
if isinstance(audio_urls, str):
|
||||
audio_urls = [audio_urls]
|
||||
|
||||
return [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*(
|
||||
{"type": "audio_url", "audio_url": {"url": audio_url}}
|
||||
for audio_url in audio_urls
|
||||
),
|
||||
{"type": "text", "text": content_text},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
async def test_single_chat_session_audio(
    client: openai.AsyncOpenAI, model_name: str, audio_url: str
):
    """Single- and multi-turn chat completion with one audio URL input.

    Checks choice count, finish reason, exact token usage, and that the
    server still answers a follow-up turn.
    """
    messages = dummy_messages_from_audio_url(audio_url)

    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        temperature=0.0,
        top_logprobs=5,
    )
    assert len(chat_completion.choices) == 1

    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    # Exact usage pins the audio tokenization; 202 prompt tokens is
    # specific to this model/asset combination.
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=202, total_tokens=212
    )

    # Fix: the original assigned `message` twice in a row from the same
    # choice object; a single assignment is sufficient.
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})

    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
async def test_error_on_invalid_audio_url_type(
    client: openai.AsyncOpenAI, model_name: str, audio_url: str
):
    """A bare-string audio_url (instead of {"url": ...}) must yield a 400."""
    messages = [
        {
            "role": "user",
            "content": [
                # Deliberately malformed: the string should be wrapped
                # in a dict under the "url" key.
                {"type": "audio_url", "audio_url": audio_url},
                {"type": "text", "text": "What's happening in this audio?"},
            ],
        }
    ]

    # audio_url should be a dict {"url": "some url"}, not directly a string
    with pytest.raises(openai.BadRequestError):
        _ = await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
        )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
async def test_single_chat_session_audio_base64encoded(
    client: openai.AsyncOpenAI,
    model_name: str,
    audio_url: str,
    url_encoded_audio: dict[str, str],
):
    """Same flow as the URL test, but the audio is sent as a data: URL.

    Token usage must match the plain-URL case, proving both input paths
    decode to the same audio.
    """
    messages = dummy_messages_from_audio_url(url_encoded_audio[audio_url])

    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        temperature=0.0,
        top_logprobs=5,
    )
    assert len(chat_completion.choices) == 1

    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=202, total_tokens=212
    )

    # Fix: dropped the redundant second assignment of `message` from
    # chat_completion.choices[0] — it is the same object as `choice`.
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})

    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", [TEST_AUDIO_URLS[0]])
async def test_single_chat_session_input_audio(
    client: openai.AsyncOpenAI,
    model_name: str,
    audio_url: str,
    base64_encoded_audio: dict[str, str],
):
    """Chat completion using the OpenAI `input_audio` content part.

    Exercises the base64 "data"/"format" payload shape rather than an
    audio_url part; usage must match the other input paths.
    """
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": base64_encoded_audio[audio_url],
                        "format": "wav",
                    },
                },
                {"type": "text", "text": "What's happening in this audio?"},
            ],
        }
    ]

    # test single completion
    # NOTE(review): unlike the sibling tests, this request does not set
    # temperature=0.0 — confirm whether that is intentional.
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5,
    )
    assert len(chat_completion.choices) == 1

    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=202, total_tokens=212
    )

    # Fix: removed the duplicate back-to-back assignment of `message`;
    # `choice` already is chat_completion.choices[0].
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})

    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_chat_streaming_audio(
    client: openai.AsyncOpenAI, model_name: str, audio_url: str
):
    """Streaming output must reassemble to the non-streaming answer.

    Runs the same audio prompt once without and once with stream=True
    (both at temperature 0) and compares content and finish reason.
    """
    messages = dummy_messages_from_audio_url(
        audio_url, "What's a short title for this audio?"
    )

    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=8,
        temperature=0.0,
    )
    output = chat_completion.choices[0].message.content
    stop_reason = chat_completion.choices[0].finish_reason

    # test streaming
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=8,
        temperature=0.0,
        stream=True,
    )
    chunks: list[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant"
        if delta.content:
            chunks.append(delta.content)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    # finish reason should only return in last block
    assert finish_reason_count == 1
    assert chunk.choices[0].finish_reason == stop_reason
    # `delta` still refers to the final chunk after the loop.
    assert delta.content
    assert "".join(chunks) == output
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_chat_streaming_input_audio(
    client: openai.AsyncOpenAI,
    model_name: str,
    audio_url: str,
    base64_encoded_audio: dict[str, str],
):
    """Streaming vs non-streaming parity for the `input_audio` part type.

    Same structure as test_chat_streaming_audio but sends the audio as a
    base64 "data"/"format" payload.
    """
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "input_audio",
                    "input_audio": {
                        "data": base64_encoded_audio[audio_url],
                        "format": "wav",
                    },
                },
                {"type": "text", "text": "What's a short title for this audio?"},
            ],
        }
    ]

    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=8,
        temperature=0.0,
    )
    output = chat_completion.choices[0].message.content
    stop_reason = chat_completion.choices[0].finish_reason

    # test streaming
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=8,
        temperature=0.0,
        stream=True,
    )
    chunks: list[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant"
        if delta.content:
            chunks.append(delta.content)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    # finish reason should only return in last block
    assert finish_reason_count == 1
    assert chunk.choices[0].finish_reason == stop_reason
    # `delta` still refers to the final chunk after the loop.
    assert delta.content
    assert "".join(chunks) == output
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "audio_urls", [TEST_AUDIO_URLS, TEST_AUDIO_URLS + [TEST_AUDIO_URLS[0]]]
)
async def test_multi_audio_input(
    client: openai.AsyncOpenAI, model_name: str, audio_urls: list[str]
):
    """Multi-audio prompts: within the limit they succeed; over the
    --limit-mm-per-prompt cap they must 400 without wedging the server.
    """
    messages = dummy_messages_from_audio_url(audio_urls)

    if len(audio_urls) > MAXIMUM_AUDIOS:
        with pytest.raises(openai.BadRequestError):  # test multi-audio input
            await client.chat.completions.create(
                model=model_name,
                messages=messages,
                max_completion_tokens=10,
                temperature=0.0,
            )

        # the server should still work afterwards
        completion = await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
        )
        completion = completion.choices[0].text
        assert completion is not None and len(completion) >= 0
    else:
        chat_completion = await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
        )
        message = chat_completion.choices[0].message
        assert message.content is not None and len(message.content) >= 0
|
||||
175
tests/entrypoints/openai/chat_completion/test_audio_in_video.py
Normal file
175
tests/entrypoints/openai/chat_completion/test_audio_in_video.py
Normal file
@@ -0,0 +1,175 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import base64
|
||||
import json
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from tests.conftest import VideoTestAssets
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
|
||||
MODEL_NAME = "Qwen/Qwen2.5-Omni-3B"
|
||||
|
||||
|
||||
@pytest.fixture
def server():
    """Start a RemoteOpenAIServer with Qwen2.5-Omni for audio-in-video tests.

    Function-scoped (a fresh server per test). Allows up to 3 audio and
    3 video items per prompt.
    """
    args = [
        "--max-model-len",
        "16384",
        "--enforce-eager",
        "--limit-mm-per-prompt",
        json.dumps({"audio": 3, "video": 3}),
    ]

    with RemoteOpenAIServer(
        MODEL_NAME,
        args,
    ) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server):
    """Yield an async OpenAI client bound to the per-test server."""
    async with server.get_async_client() as async_client:
        yield async_client
|
||||
|
||||
|
||||
@pytest.mark.core_model
@pytest.mark.asyncio
async def test_online_audio_in_video(
    client: openai.AsyncOpenAI, video_assets: VideoTestAssets
):
    """Test video input with `audio_in_video=True`"""

    # we don't use video_urls above because they missed audio stream.
    video_path = video_assets[0].video_path
    with open(video_path, "rb") as f:
        video_base64 = base64.b64encode(f.read()).decode("utf-8")

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What's in this video?"},
                {
                    "type": "video_url",
                    "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
                },
            ],
        }
    ]

    # multi-turn to test mm processor cache as well
    for _ in range(2):
        chat_completion = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            max_tokens=16,
            # Ask the processor to extract the audio track from the video.
            extra_body={
                "mm_processor_kwargs": {
                    "use_audio_in_video": True,
                }
            },
        )

        assert len(chat_completion.choices) == 1
        choice = chat_completion.choices[0]
        assert choice.finish_reason == "length"
|
||||
|
||||
|
||||
@pytest.mark.core_model
@pytest.mark.asyncio
async def test_online_audio_in_video_multi_videos(
    client: openai.AsyncOpenAI, video_assets: VideoTestAssets
):
    """Test multi-video input with `audio_in_video=True`"""

    # we don't use video_urls above because they missed audio stream.
    video_path = video_assets[0].video_path
    with open(video_path, "rb") as f:
        video_base64 = base64.b64encode(f.read()).decode("utf-8")

    # The same encoded video is sent twice in one user message.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What's in these two videos?"},
                {
                    "type": "video_url",
                    "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
                },
                {
                    "type": "video_url",
                    "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
                },
            ],
        }
    ]

    # multi-turn to test mm processor cache as well
    for _ in range(2):
        chat_completion = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            max_tokens=16,
            extra_body={
                "mm_processor_kwargs": {
                    "use_audio_in_video": True,
                }
            },
        )

        assert len(chat_completion.choices) == 1
        choice = chat_completion.choices[0]
        assert choice.finish_reason == "length"
|
||||
|
||||
|
||||
@pytest.mark.core_model
@pytest.mark.asyncio
async def test_online_audio_in_video_interleaved(
    client: openai.AsyncOpenAI, video_assets: VideoTestAssets
):
    """Test interleaved video/audio input with `audio_in_video=True`"""

    # we don't use video_urls above because they missed audio stream.
    video_path = video_assets[0].video_path
    with open(video_path, "rb") as f:
        video_base64 = base64.b64encode(f.read()).decode("utf-8")

    # Two videos + one standalone audio item: with use_audio_in_video the
    # audio/video counts no longer match, so the server must reject it.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What's in these two videos?"},
                {
                    "type": "video_url",
                    "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
                },
                {
                    "type": "audio_url",
                    "audio_url": {"url": f"data:audio/mp4;base64,{video_base64}"},
                },
                {
                    "type": "video_url",
                    "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
                },
            ],
        }
    ]
    with pytest.raises(
        openai.BadRequestError,
        match="use_audio_in_video requires equal number of audio and video items",
    ):
        await client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            max_tokens=16,
            extra_body={
                "mm_processor_kwargs": {
                    "use_audio_in_video": True,
                }
            },
        )
|
||||
@@ -0,0 +1,96 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
from tests.conftest import AudioTestAssets
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
|
||||
# NOTE - the tests in this module are currently analogous to test_chat, but are
|
||||
# separated to avoid OOM killing due to module-scoped servers, since we
|
||||
# need a multimodal model for these tests.
|
||||
|
||||
# Contains a modality specific lora alongside the base model
|
||||
MULTIMODAL_MODEL_NAME = snapshot_download("microsoft/Phi-4-multimodal-instruct")
|
||||
AUDIO_LORA_PATH = os.path.join(MULTIMODAL_MODEL_NAME, "speech-lora")
|
||||
|
||||
ACTIVE_MM_LORA_RESPONSE = "Spoken text: The first words I spoke in the original chronograph, a little piece of practical poetry. Mary had a little lamb, it slept with quite a snow, and everywhere that Mary went, the lamb was sure to go." # noqa: E501
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def multimodal_server():
    """Start Phi-4-multimodal with a speech LoRA registered two ways:

    as an explicit named module ("speech") and as the default LoRA for
    the audio modality (--default-mm-loras), so both routing paths can
    be compared in the tests.
    """
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "half",
        "--max-model-len",
        "4096",
        "--enforce-eager",
        # lora config below
        "--enable-lora",
        "--lora-modules",
        f"speech={AUDIO_LORA_PATH}",
        "--max-lora-rank",
        "320",
        "--max-num-seqs",
        "2",
        "--trust-remote-code",
        "--gpu-memory-utilization",
        "0.8",
        "--default-mm-loras",
        f'{{"audio": "{AUDIO_LORA_PATH}"}}',
    ]

    # Longer wait: this model + LoRA setup takes a while to come up.
    with RemoteOpenAIServer(
        MULTIMODAL_MODEL_NAME, args, max_wait_seconds=480
    ) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def multi_modal_client(multimodal_server):
    """Yield an async OpenAI client bound to the multimodal LoRA server."""
    async with multimodal_server.get_async_client() as async_client:
        yield async_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize(
    # base model with default lora should give the same response as lora model
    "model_name",
    [MULTIMODAL_MODEL_NAME, "speech"],
)
async def test_default_mm_lora_chat_completions(
    model_name: str,
    multi_modal_client: openai.AsyncOpenAI,
    audio_assets: AudioTestAssets,
):
    """Transcription via the default audio LoRA matches a golden response.

    Runs once against the base model name (default mm-LoRA applied
    implicitly) and once against the explicit "speech" LoRA module; both
    must produce ACTIVE_MM_LORA_RESPONSE exactly (temperature 0).
    """
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Can you transcribe this audio?",
                },
                {
                    "type": "audio_url",
                    "audio_url": {"url": audio_assets[0].url},
                },
            ],
        }
    ]

    chat_completion = await multi_modal_client.chat.completions.create(
        model=model_name, messages=messages, max_completion_tokens=128, temperature=0.0
    )

    assert len(chat_completion.choices) > 0

    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
    # Exact-match against the recorded transcription for this asset.
    assert message.content == ACTIVE_MM_LORA_RESPONSE
|
||||
@@ -0,0 +1,42 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from tests.utils import VLLM_PATH, RemoteOpenAIServer
|
||||
|
||||
chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
|
||||
assert chatml_jinja_path.exists()
|
||||
|
||||
|
||||
def run_and_test_dummy_opt_api_server(model, tp=1):
    """Boot an API server for a plugin-registered dummy model and chat once.

    The dummy model is expected to emit only a single token, so after
    stripping the "<s>" BOS marker the generated text must be empty.

    Args:
        model: Path/name of the plugin-registered model.
        tp: Tensor-parallel degree passed via ``-tp``.
    """
    # the model is registered through the plugin
    server_args = [
        "--gpu-memory-utilization",
        "0.10",
        "--dtype",
        "float32",
        "--chat-template",
        str(chatml_jinja_path),
        # Dummy weights: no real checkpoint download needed.
        "--load-format",
        "dummy",
        "-tp",
        f"{tp}",
    ]
    with RemoteOpenAIServer(model, server_args) as server:
        client = server.get_client()
        completion = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "Hello!"},
            ],
            temperature=0,
        )
        generated_text = completion.choices[0].message.content
        assert generated_text is not None
        # make sure only the first token is generated
        rest = generated_text.replace("<s>", "")
        assert rest == ""
|
||||
|
||||
|
||||
def test_oot_registration_for_api_server(dummy_opt_path: str):
    """Out-of-tree registered model must be servable through the API server."""
    run_and_test_dummy_opt_api_server(dummy_opt_path)
|
||||
104
tests/entrypoints/openai/chat_completion/test_root_path.py
Normal file
104
tests/entrypoints/openai/chat_completion/test_root_path.py
Normal file
@@ -0,0 +1,104 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import contextlib
|
||||
import os
|
||||
from typing import Any, NamedTuple
|
||||
|
||||
import openai # use the official client for correctness check
|
||||
import pytest
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
|
||||
# # any model with a chat template should work here
|
||||
MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
|
||||
API_KEY = "abc-123"
|
||||
ERROR_API_KEY = "abc"
|
||||
ROOT_PATH = "llm"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server():
    """Start a server mounted under --root-path=/llm with API-key auth.

    VLLM_API_KEY is injected via the environment so requests must carry
    the matching bearer token.
    """
    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--enforce-eager",
        "--max-model-len",
        "4080",
        "--root-path",  # use --root-path=/llm for testing
        "/" + ROOT_PATH,
    ]
    envs = os.environ.copy()

    envs["VLLM_API_KEY"] = API_KEY
    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
class TestCase(NamedTuple):
    """One root-path/auth scenario for the parametrized test below."""

    # Model to request.
    model_name: str
    # URL path segments joined onto the server base (e.g. ["llm", "v1"]).
    base_url: list[str]
    # Bearer token the client sends.
    api_key: str
    # Expected exception class, or None when the request should succeed.
    expected_error: Any
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "test_case",
    [
        TestCase(
            model_name=MODEL_NAME,
            base_url=["v1"],  # http://localhost:8000/v1
            api_key=ERROR_API_KEY,
            expected_error=openai.AuthenticationError,
        ),
        TestCase(
            model_name=MODEL_NAME,
            base_url=[ROOT_PATH, "v1"],  # http://localhost:8000/llm/v1
            api_key=ERROR_API_KEY,
            expected_error=openai.AuthenticationError,
        ),
        TestCase(
            model_name=MODEL_NAME,
            base_url=["v1"],  # http://localhost:8000/v1
            api_key=API_KEY,
            expected_error=None,
        ),
        TestCase(
            model_name=MODEL_NAME,
            base_url=[ROOT_PATH, "v1"],  # http://localhost:8000/llm/v1
            api_key=API_KEY,
            expected_error=None,
        ),
    ],
)
async def test_chat_session_root_path_with_api_key(
    server: RemoteOpenAIServer, test_case: TestCase
):
    """Auth behavior with and without the root-path prefix.

    A wrong key must raise AuthenticationError on both the bare /v1 and
    the /llm/v1 base URL; with the right key both succeed. The prompt
    uses continue_final_message so the model completes the given saying.
    """
    saying: str = "Here is a common saying about apple. An apple a day, keeps"
    # nullcontext when success is expected; pytest.raises otherwise.
    ctx = contextlib.nullcontext()
    if test_case.expected_error is not None:
        ctx = pytest.raises(test_case.expected_error)
    with ctx:
        client = openai.AsyncOpenAI(
            api_key=test_case.api_key,
            base_url=server.url_for(*test_case.base_url),
            max_retries=0,
        )
        chat_completion = await client.chat.completions.create(
            model=test_case.model_name,
            messages=[
                {"role": "user", "content": "tell me a common saying"},
                {"role": "assistant", "content": saying},
            ],
            extra_body={"continue_final_message": True, "add_generation_prompt": False},
        )

        assert chat_completion.id is not None
        assert len(chat_completion.choices) == 1
        choice = chat_completion.choices[0]
        assert choice.finish_reason == "stop"
        message = choice.message
        assert len(message.content) > 0
        assert message.role == "assistant"
|
||||
403
tests/entrypoints/openai/chat_completion/test_video.py
Normal file
403
tests/entrypoints/openai/chat_completion/test_video.py
Normal file
@@ -0,0 +1,403 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.multimodal.utils import encode_video_url, fetch_video
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
MODEL_NAME = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
|
||||
MAXIMUM_VIDEOS = 3
|
||||
|
||||
TEST_VIDEO_URLS = [
|
||||
"https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4",
|
||||
"https://github.com/opencv/opencv/raw/refs/tags/4.12.0/samples/data/vtest.avi",
|
||||
"https://github.com/opencv/opencv/raw/refs/tags/4.12.0/samples/data/Megamind.avi",
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server():
    """Start a RemoteOpenAIServer running the LLaVA-OneVision video model.

    Caps videos per prompt at MAXIMUM_VIDEOS and fixes frame sampling to
    32 frames via --media-io-kwargs so token counts are reproducible.
    """
    args = [
        "--runner",
        "generate",
        "--max-model-len",
        "32768",
        "--max-num-seqs",
        "2",
        "--enforce-eager",
        "--trust-remote-code",
        "--limit-mm-per-prompt",
        json.dumps({"video": MAXIMUM_VIDEOS}),
        "--media-io-kwargs",
        json.dumps({"video": {"num_frames": 32}}),
    ]

    # ROCm: Increase timeouts to handle potential network delays and slower
    # video processing when downloading multiple videos from external sources
    env_overrides = {}
    if current_platform.is_rocm():
        env_overrides = {
            "VLLM_VIDEO_FETCH_TIMEOUT": "120",
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": "300",
        }

    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_overrides) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server):
    """Yield an async OpenAI client bound to the shared video test server."""
    async with server.get_async_client() as async_client:
        yield async_client
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def url_encoded_video() -> dict[str, str]:
    """Map each test video URL to a `data:` URL form of the same video.

    Session-scoped so each video is downloaded only once per session.
    """
    return {
        video_url: encode_video_url(fetch_video(video_url)[0])
        for video_url in TEST_VIDEO_URLS
    }
|
||||
|
||||
|
||||
def dummy_messages_from_video_url(
|
||||
video_urls: str | list[str],
|
||||
content_text: str = "What's in this video?",
|
||||
):
|
||||
if isinstance(video_urls, str):
|
||||
video_urls = [video_urls]
|
||||
|
||||
return [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*(
|
||||
{"type": "video_url", "video_url": {"url": video_url}}
|
||||
for video_url in video_urls
|
||||
),
|
||||
{"type": "text", "text": content_text},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video(
    client: openai.AsyncOpenAI, model_name: str, video_url: str
):
    """Single- and multi-turn chat completion with one video URL input.

    Checks choice count, finish reason, exact token usage (fixed 32-frame
    sampling), and that the server answers a follow-up turn.
    """
    messages = dummy_messages_from_video_url(video_url)

    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        temperature=0.0,
        top_logprobs=5,
    )
    assert len(chat_completion.choices) == 1

    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    # 6287 prompt tokens reflects the 32-frame sampling configured on the
    # server fixture.
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=6287, total_tokens=6297
    )

    # Fix: removed the redundant second assignment of `message` from
    # chat_completion.choices[0] — it is the same object as `choice`.
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})

    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", [TEST_VIDEO_URLS[0]])
async def test_request_media_io_kwargs_override_uses_fewer_video_frames(
    client: openai.AsyncOpenAI, model_name: str, video_url: str
):
    """Per-request media_io_kwargs must override the server-level default.

    The server samples 32 frames by default; a request overriding
    num_frames=4 must consume strictly fewer prompt tokens.
    """
    messages = dummy_messages_from_video_url(video_url)

    # Baseline: server-default frame sampling.
    default_resp = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=1,
        temperature=0.0,
    )
    # Same prompt with a per-request 4-frame override.
    override_resp = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=1,
        temperature=0.0,
        extra_body={
            "media_io_kwargs": {
                "video": {
                    "num_frames": 4,
                }
            }
        },
    )

    assert default_resp.usage is not None
    assert override_resp.usage is not None
    assert override_resp.usage.prompt_tokens < default_resp.usage.prompt_tokens
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", [TEST_VIDEO_URLS[0]])
async def test_invalid_num_frames_request_recoverable(
    client: openai.AsyncOpenAI, model_name: str, video_url: str
):
    """A bad media_io_kwargs value errors out but must not wedge the server."""
    messages = dummy_messages_from_video_url(video_url)

    # "invalid" is not an int, so the request must be rejected.
    with pytest.raises((openai.BadRequestError, openai.APIStatusError)):
        await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=1,
            temperature=0.0,
            extra_body={
                "media_io_kwargs": {
                    "video": {
                        "num_frames": "invalid",
                    }
                }
            },
        )

    # Server should still handle subsequent requests after the failed one.
    recovery_resp = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=1,
        temperature=0.0,
    )
    recovery_msg = recovery_resp.choices[0].message
    assert recovery_msg.content is not None and len(recovery_msg.content) >= 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_error_on_invalid_video_url_type(
    client: openai.AsyncOpenAI, model_name: str, video_url: str
):
    """Sending video_url as a bare string instead of a dict must be rejected."""
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "video_url", "video_url": video_url},
                {"type": "text", "text": "What's in this video?"},
            ],
        }
    ]

    # video_url should be a dict {"url": "some url"}, not directly a string
    with pytest.raises(openai.BadRequestError):
        _ = await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
        )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video_beamsearch(
    client: openai.AsyncOpenAI, model_name: str, video_url: str
):
    """Beam search over a video prompt should yield two distinct candidates."""
    messages = dummy_messages_from_video_url(video_url)

    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        n=2,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5,
        extra_body=dict(use_beam_search=True),
    )
    assert len(chat_completion.choices) == 2
    first, second = chat_completion.choices
    # Beam search must not collapse both candidates to the same text.
    assert first.message.content != second.message.content
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video_base64encoded(
    client: openai.AsyncOpenAI,
    model_name: str,
    video_url: str,
    url_encoded_video: dict[str, str],
):
    """Single-turn and multi-turn chat with a base64 data-URL encoded video."""
    messages = dummy_messages_from_video_url(url_encoded_video[video_url])

    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        temperature=0.0,
        top_logprobs=5,
    )
    assert len(chat_completion.choices) == 1

    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=6287, total_tokens=6297
    )

    # Fix: the original assigned `message` twice in a row from the same
    # choice (`choice.message` then `chat_completion.choices[0].message`);
    # a single assignment is sufficient.
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})

    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video_base64encoded_beamsearch(
    client: openai.AsyncOpenAI,
    model_name: str,
    video_url: str,
    url_encoded_video: dict[str, str],
):
    """Beam search over a base64-encoded video should yield two distinct outputs."""
    messages = dummy_messages_from_video_url(url_encoded_video[video_url])

    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        n=2,
        max_completion_tokens=10,
        extra_body=dict(use_beam_search=True),
    )
    assert len(chat_completion.choices) == 2
    first, second = chat_completion.choices
    assert first.message.content != second.message.content
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_chat_streaming_video(
    client: openai.AsyncOpenAI, model_name: str, video_url: str
):
    """Streamed output must match the equivalent non-streaming request."""
    messages = dummy_messages_from_video_url(video_url)

    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
    )
    output = chat_completion.choices[0].message.content
    stop_reason = chat_completion.choices[0].finish_reason

    # test streaming
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
        stream=True,
    )
    text_chunks: list[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant"
        if delta.content:
            text_chunks.append(delta.content)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    # finish reason should only return in last block
    assert finish_reason_count == 1
    assert chunk.choices[0].finish_reason == stop_reason
    assert delta.content
    assert "".join(text_chunks) == output
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "video_urls", [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))]
)
@pytest.mark.flaky(
    reruns=2,
    reruns_delay=5,
    condition=current_platform.is_rocm(),
)
async def test_multi_video_input(
    client: openai.AsyncOpenAI, model_name: str, video_urls: list[str]
):
    """Within-limit multi-video requests succeed; over-limit ones fail cleanly."""
    messages = dummy_messages_from_video_url(video_urls)

    if len(video_urls) > MAXIMUM_VIDEOS:
        with pytest.raises(openai.BadRequestError):  # test multi-video input
            await client.chat.completions.create(
                model=model_name,
                messages=messages,
                max_completion_tokens=10,
                temperature=0.0,
            )

        # the server should still work afterwards
        completion = await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
        )
        completion = completion.choices[0].text
        assert completion is not None and len(completion) >= 0
    else:
        chat_completion = await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
        )
        message = chat_completion.choices[0].message
        assert message.content is not None and len(message.content) >= 0
|
||||
687
tests/entrypoints/openai/chat_completion/test_vision.py
Normal file
687
tests/entrypoints/openai/chat_completion/test_vision.py
Normal file
@@ -0,0 +1,687 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
|
||||
import openai
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from transformers import AutoProcessor
|
||||
|
||||
from tests.utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer
|
||||
from vllm.multimodal.media import MediaWithBytes
|
||||
from vllm.multimodal.utils import encode_image_url, fetch_image
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
MODEL_NAME = "microsoft/Phi-3.5-vision-instruct"
# Per-prompt image cap enforced by the server via --limit-mm-per-prompt.
MAXIMUM_IMAGES = 2

# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA).
# Each asset name resolves to a file under
# https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/
TEST_IMAGE_ASSETS = [
    "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
    "Grayscale_8bits_palette_sample_image.png",
    "1280px-Venn_diagram_rgb.svg.png",
    "RGBA_comp.png",
]

# Required terms for beam search validation.
# Each entry (parallel to TEST_IMAGE_ASSETS) is a list of term groups -
# ALL groups must match, and each group is satisfied by ANY one of its
# alternatives. This gives semantic validation with wording flexibility.
REQUIRED_BEAM_SEARCH_TERMS = [
    # Boardwalk image: must have "boardwalk" AND ("wooden" or "wood")
    [["boardwalk"], ["wooden", "wood"]],
    # Parrots image: must have ("parrot" or "bird") AND "two"
    [["parrot", "bird"], ["two"]],
    # Venn diagram: must have "venn" AND "diagram"
    [["venn"], ["diagram"]],
    # Gradient image: must have "gradient" AND ("color" or "spectrum")
    [["gradient"], ["color", "spectrum"]],
]
|
||||
|
||||
|
||||
def check_output_matches_terms(content: str, term_groups: list[list[str]]) -> bool:
    """Return True iff *content* satisfies every term group.

    A group is satisfied when at least one of its terms occurs in the
    content (case-insensitive); all groups must be satisfied.
    """
    haystack = content.lower()
    for group in term_groups:
        if not any(term.lower() in haystack for term in group):
            return False
    return True
|
||||
|
||||
|
||||
def assert_non_empty_content(chat_completion, *, context: str = "") -> str:
    """Assert the first choice carries non-empty string content; return it.

    Failure messages embed the full response details so flaky or
    model-quality problems are easy to diagnose from CI logs.
    """
    prefix = f"[{context}] " if context else ""
    choice = chat_completion.choices[0]
    content = choice.message.content

    # Shared diagnostic tail used by both emptiness assertions.
    detail = (
        f"finish_reason={choice.finish_reason!r}, "
        f"full message={choice.message!r}, "
        f"usage={chat_completion.usage!r}"
    )
    assert content is not None, (
        f"{prefix}Expected non-None content but got None. " + detail
    )
    assert isinstance(content, str), (
        f"{prefix}Expected str content, got {type(content).__name__}: {content!r}"
    )
    assert len(content) > 0, (
        f"{prefix}Expected non-empty content but got empty string. " + detail
    )
    return content
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def server():
    """Module-scoped vLLM OpenAI-compatible server for the vision tests."""
    args = [
        "--runner",
        "generate",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "5",
        "--enforce-eager",
        "--trust-remote-code",
        "--limit-mm-per-prompt",
        json.dumps({"image": MAXIMUM_IMAGES}),
        *ROCM_EXTRA_ARGS,
    ]

    # ROCm: Increase timeouts to handle potential network delays and slower
    # video processing when downloading multiple videos from external sources
    rocm_only = (
        {
            "VLLM_VIDEO_FETCH_TIMEOUT": "120",
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": "300",
        }
        if current_platform.is_rocm()
        else {}
    )
    env_overrides = {**ROCM_ENV_OVERRIDES, **rocm_only}

    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_overrides) as remote_server:
        yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
async def client(server):
    """Async OpenAI client bound to the module-scoped test server."""
    async with server.get_async_client() as async_client:
        yield async_client
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def url_encoded_image(local_asset_server) -> dict[str, str]:
    """Map each test image asset name to a base64 data URL."""
    encoded: dict[str, str] = {}
    for image_asset in TEST_IMAGE_ASSETS:
        encoded[image_asset] = encode_image_url(
            local_asset_server.get_image_asset(image_asset)
        )
    return encoded
|
||||
|
||||
|
||||
def dummy_messages_from_image_url(
|
||||
image_urls: str | list[str],
|
||||
content_text: str = "What's in this image?",
|
||||
):
|
||||
if isinstance(image_urls, str):
|
||||
image_urls = [image_urls]
|
||||
|
||||
return [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*(
|
||||
{"type": "image_url", "image_url": {"url": image_url}}
|
||||
for image_url in image_urls
|
||||
),
|
||||
{"type": "text", "text": content_text},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
def describe_image_messages(
|
||||
image_url: str, *, extra_image_fields: dict | None = None
|
||||
) -> list[dict]:
|
||||
"""Build the system + user messages used by the completions-with-image
|
||||
family of tests. *extra_image_fields* is merged into the top-level
|
||||
image content block (for uuid / bad-key tests)."""
|
||||
image_block: dict = {
|
||||
"type": "image_url",
|
||||
"image_url": {"url": image_url},
|
||||
}
|
||||
if extra_image_fields:
|
||||
image_block.update(extra_image_fields)
|
||||
|
||||
return [
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "Describe this image."},
|
||||
image_block,
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
async def complete_and_check(
    client: openai.AsyncOpenAI,
    model_name: str,
    messages: list[dict],
    *,
    context: str,
    max_completion_tokens: int = 50,
    temperature: float = 0.0,
) -> str:
    """Run a chat completion, assert non-empty output, and return the content."""
    response = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=max_completion_tokens,
        temperature=temperature,
    )
    # Delegate the detailed non-empty check (with diagnostics) to the helper.
    return assert_non_empty_content(response, context=context)
|
||||
|
||||
|
||||
def get_hf_prompt_tokens(model_name, content, image_url):
    """Count prompt tokens the HF processor produces for one image + text turn.

    Serves as the ground truth against the server-reported prompt_tokens.
    """
    processor = AutoProcessor.from_pretrained(
        model_name, trust_remote_code=True, num_crops=4
    )

    placeholder = "<|image_1|>\n"
    hf_messages = [
        {
            "role": "user",
            "content": f"{placeholder}{content}",
        }
    ]

    image = fetch_image(image_url)
    # Unwrap MediaWithBytes if present
    if isinstance(image, MediaWithBytes):
        image = image.media

    prompt = processor.tokenizer.apply_chat_template(
        hf_messages, tokenize=False, add_generation_prompt=True
    )
    inputs = processor(prompt, [image], return_tensors="pt")

    # input_ids is (batch=1, seq_len); seq_len is the prompt token count.
    return inputs.input_ids.shape[1]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_single_chat_session_image(
    client: openai.AsyncOpenAI, model_name: str, image_url: str
):
    """Single-turn chat over one image, with usage validation and a follow-up turn."""
    content_text = "What's in this image?"
    messages = dummy_messages_from_image_url(image_url, content_text)

    max_completion_tokens = 10
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=max_completion_tokens,
        logprobs=True,
        temperature=0.0,
        top_logprobs=5,
    )
    assert len(chat_completion.choices) == 1, (
        f"Expected 1 choice, got {len(chat_completion.choices)}"
    )

    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length", (
        f"Expected finish_reason='length' (capped at {max_completion_tokens} "
        f"tokens), got {choice.finish_reason!r}. "
        f"content={choice.message.content!r}"
    )

    # Usage must agree with the HF processor's own token accounting.
    hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, image_url)
    expected_usage = openai.types.CompletionUsage(
        completion_tokens=max_completion_tokens,
        prompt_tokens=hf_prompt_tokens,
        total_tokens=hf_prompt_tokens + max_completion_tokens,
    )
    assert chat_completion.usage == expected_usage, (
        f"Usage mismatch: got {chat_completion.usage!r}, expected {expected_usage!r}"
    )

    message = choice.message
    assert message.content is not None and len(message.content) >= 10, (
        f"Expected content with >=10 chars, got {message.content!r}"
    )
    assert message.role == "assistant", (
        f"Expected role='assistant', got {message.role!r}"
    )

    messages.append({"role": "assistant", "content": message.content})

    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    await complete_and_check(
        client,
        model_name,
        messages,
        context=f"multi-turn follow-up for {image_url}",
        max_completion_tokens=10,
    )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_error_on_invalid_image_url_type(
    client: openai.AsyncOpenAI, model_name: str, image_url: str
):
    """Sending image_url as a bare string instead of a dict must be rejected."""
    content_text = "What's in this image?"
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": image_url},
                {"type": "text", "text": content_text},
            ],
        }
    ]

    # image_url should be a dict {"url": "some url"}, not directly a string
    with pytest.raises(openai.BadRequestError):
        await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
        )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_single_chat_session_image_beamsearch(
    client: openai.AsyncOpenAI, model_name: str, image_url: str
):
    """Beam search over an image prompt should yield two distinct candidates."""
    content_text = "What's in this image?"
    messages = dummy_messages_from_image_url(image_url, content_text)

    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        n=2,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5,
        extra_body=dict(use_beam_search=True),
    )
    assert len(chat_completion.choices) == 2, (
        f"Expected 2 beam search choices, got {len(chat_completion.choices)}"
    )

    content_0 = chat_completion.choices[0].message.content
    content_1 = chat_completion.choices[1].message.content
    assert content_0 != content_1, (
        f"Beam search should produce different outputs for {image_url}, "
        f"but both returned: {content_0!r}"
    )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS)
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_single_chat_session_image_base64encoded(
    client: openai.AsyncOpenAI,
    model_name: str,
    raw_image_url: str,
    image_url: str,
    url_encoded_image: dict[str, str],
):
    """Single-turn + follow-up chat using a base64 data-URL encoded image."""
    content_text = "What's in this image?"
    messages = dummy_messages_from_image_url(
        url_encoded_image[raw_image_url],
        content_text,
    )

    max_completion_tokens = 10
    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=max_completion_tokens,
        logprobs=True,
        temperature=0.0,
        top_logprobs=5,
    )
    assert len(chat_completion.choices) == 1, (
        f"Expected 1 choice, got {len(chat_completion.choices)}"
    )

    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length", (
        f"Expected finish_reason='length', got {choice.finish_reason!r}. "
        f"content={choice.message.content!r}"
    )

    # Usage must agree with the HF processor's own token accounting.
    hf_prompt_tokens = get_hf_prompt_tokens(model_name, content_text, image_url)
    expected_usage = openai.types.CompletionUsage(
        completion_tokens=max_completion_tokens,
        prompt_tokens=hf_prompt_tokens,
        total_tokens=hf_prompt_tokens + max_completion_tokens,
    )
    assert chat_completion.usage == expected_usage, (
        f"Usage mismatch: got {chat_completion.usage!r}, expected {expected_usage!r}"
    )

    message = choice.message
    assert message.content is not None and len(message.content) >= 10, (
        f"Expected content with >=10 chars, got {message.content!r}"
    )
    assert message.role == "assistant", (
        f"Expected role='assistant', got {message.role!r}"
    )

    messages.append({"role": "assistant", "content": message.content})

    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    await complete_and_check(
        client,
        model_name,
        messages,
        context=f"multi-turn base64 follow-up for {raw_image_url}",
        max_completion_tokens=10,
        temperature=0.0,
    )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_idx", list(range(len(TEST_IMAGE_ASSETS))))
async def test_single_chat_session_image_base64encoded_beamsearch(
    client: openai.AsyncOpenAI,
    model_name: str,
    image_idx: int,
    url_encoded_image: dict[str, str],
):
    """Beam search over a base64 image: two distinct outputs matching the
    image's required semantic terms."""
    # NOTE: This test validates that we pass MM data through beam search
    raw_image_url = TEST_IMAGE_ASSETS[image_idx]
    required_terms = REQUIRED_BEAM_SEARCH_TERMS[image_idx]

    messages = dummy_messages_from_image_url(url_encoded_image[raw_image_url])

    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        n=2,
        max_completion_tokens=10,
        temperature=0.0,
        extra_body=dict(use_beam_search=True),
    )
    assert len(chat_completion.choices) == 2, (
        f"Expected 2 beam search choices for image {image_idx} "
        f"({raw_image_url}), got {len(chat_completion.choices)}"
    )

    # Verify beam search produces two different non-empty outputs
    content_0 = chat_completion.choices[0].message.content
    content_1 = chat_completion.choices[1].message.content

    # Emit beam search outputs for debugging
    print(
        f"Beam search outputs for image {image_idx} ({raw_image_url}): "
        f"Output 0: {content_0!r}, Output 1: {content_1!r}"
    )

    assert content_0, (
        f"First beam output is empty for image {image_idx} ({raw_image_url}). "
        f"finish_reason={chat_completion.choices[0].finish_reason!r}"
    )
    assert content_1, (
        f"Second beam output is empty for image {image_idx} "
        f"({raw_image_url}). "
        f"finish_reason={chat_completion.choices[1].finish_reason!r}"
    )
    assert content_0 != content_1, (
        f"Beam search produced identical outputs for image {image_idx} "
        f"({raw_image_url}): {content_0!r}"
    )

    # Verify each output contains the required terms for this image
    for i, content in enumerate([content_0, content_1]):
        assert check_output_matches_terms(content, required_terms), (
            f"Beam output {i} for image {image_idx} ({raw_image_url}) "
            f"doesn't match required terms.\n"
            f"  content: {content!r}\n"
            f"  required (all groups, >=1 per group): {required_terms}"
        )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
async def test_chat_streaming_image(
    client: openai.AsyncOpenAI, model_name: str, image_url: str
):
    """Streamed output must match the equivalent non-streaming request."""
    messages = dummy_messages_from_image_url(image_url)

    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
    )
    output = chat_completion.choices[0].message.content
    stop_reason = chat_completion.choices[0].finish_reason

    # test streaming
    stream = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
        stream=True,
    )
    chunks: list[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if delta.role:
            assert delta.role == "assistant", (
                f"Expected role='assistant' in stream delta, got {delta.role!r}"
            )
        if delta.content:
            chunks.append(delta.content)
        if chunk.choices[0].finish_reason is not None:
            finish_reason_count += 1
    # finish reason should only return in last block
    assert finish_reason_count == 1, (
        f"Expected exactly 1 finish_reason across stream chunks, "
        f"got {finish_reason_count}"
    )
    assert chunk.choices[0].finish_reason == stop_reason, (
        f"Stream finish_reason={chunk.choices[0].finish_reason!r} "
        f"doesn't match non-stream finish_reason={stop_reason!r}"
    )

    streamed_text = "".join(chunks)
    assert streamed_text == output, (
        f"Streamed output doesn't match non-streamed for {image_url}.\n"
        f"  streamed: {streamed_text!r}\n"
        f"  non-streamed: {output!r}"
    )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "image_urls",
    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
    indirect=True,
)
async def test_multi_image_input(
    client: openai.AsyncOpenAI, model_name: str, image_urls: list[str]
):
    """Within-limit multi-image requests succeed; over-limit ones fail cleanly."""
    messages = dummy_messages_from_image_url(image_urls)

    if len(image_urls) > MAXIMUM_IMAGES:
        with pytest.raises(openai.BadRequestError):  # test multi-image input
            await client.chat.completions.create(
                model=model_name,
                messages=messages,
                max_completion_tokens=10,
                temperature=0.0,
            )

        # the server should still work afterwards
        completion = await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
        )
        assert completion.choices[0].text is not None, (
            "Server failed to produce output after rejecting over-limit "
            "multi-image request"
        )
    else:
        await complete_and_check(
            client,
            model_name,
            messages,
            context=f"multi-image input ({len(image_urls)} images)",
            max_completion_tokens=10,
            temperature=0.0,
        )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "image_urls",
    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
    indirect=True,
)
async def test_completions_with_image(
    client: openai.AsyncOpenAI,
    model_name: str,
    image_urls: list[str],
):
    """Every image in the set should produce a non-empty description."""
    for image_url in image_urls:
        await complete_and_check(
            client,
            model_name,
            describe_image_messages(image_url),
            context=f"completions_with_image url={image_url}",
        )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "image_urls",
    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
    indirect=True,
)
async def test_completions_with_image_with_uuid(
    client: openai.AsyncOpenAI,
    model_name: str,
    image_urls: list[str],
):
    """Tag each image with a uuid, then resolve it again via uuid-only content."""
    for image_url in image_urls:
        # First request: full image payload tagged with a uuid.
        messages = describe_image_messages(
            image_url,
            extra_image_fields={"uuid": image_url},
        )
        await complete_and_check(
            client,
            model_name,
            messages,
            context=f"uuid first request url={image_url}",
        )

        # Second request: empty image_url dict; the uuid alone should be
        # resolvable from the multimodal cache populated above.
        cached_messages: list[dict] = [
            {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe this image."},
                    {"type": "image_url", "image_url": {}, "uuid": image_url},
                ],
            },
        ]
        await complete_and_check(
            client,
            model_name,
            cached_messages,
            context=f"uuid cached (empty image) uuid={image_url}",
        )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_completions_with_empty_image_with_uuid_without_cache_hit(
    client: openai.AsyncOpenAI,
    model_name: str,
):
    """An empty image payload with an unseen uuid cannot be resolved: must 400."""
    with pytest.raises(openai.BadRequestError):
        await client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Describe this image."},
                        {
                            "type": "image_url",
                            "image_url": {},
                            "uuid": "uuid_not_previously_seen",
                        },
                    ],
                },
            ],
            model=model_name,
        )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "image_urls",
    [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
    indirect=True,
)
async def test_completions_with_image_with_incorrect_uuid_format(
    client: openai.AsyncOpenAI,
    model_name: str,
    image_urls: list[str],
):
    """Unknown uuid-like keys should be ignored; requests still succeed."""
    for image_url in image_urls:
        messages = describe_image_messages(
            image_url,
            extra_image_fields={
                "also_incorrect_uuid_key": image_url,
            },
        )
        # Inject the bad key inside image_url dict too
        messages[1]["content"][1]["image_url"]["incorrect_uuid_key"] = image_url

        await complete_and_check(
            client,
            model_name,
            messages,
            context=f"incorrect uuid format url={image_url}",
        )
|
||||
150
tests/entrypoints/openai/chat_completion/test_vision_embeds.py
Normal file
150
tests/entrypoints/openai/chat_completion/test_vision_embeds.py
Normal file
@@ -0,0 +1,150 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import base64
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
import requests
|
||||
import torch
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.utils.serial_utils import tensor2base64
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model_name", ["ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]
)
def test_single_content(model_name: str):
    """Pooling request with a single image_embeds content part.

    Boots a pooling server for the Prithvi geospatial model, posts one
    message whose content is base64-encoded tensors (pixel values and
    location coordinates), and checks the size of the returned
    base64-encoded float32 embedding.
    """
    server_args = [
        "--runner",
        "pooling",
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--enforce-eager",
        "--trust-remote-code",
        "--max-num-seqs",
        "32",
        "--model-impl",
        "terratorch",
        "--skip-tokenizer-init",
        "--enable-mm-embeds",
    ]

    # Tensors are shipped base64-encoded; shapes match what this model's
    # processor expects for a single sample.
    embeds = {
        "pixel_values": tensor2base64(
            torch.ones((6, 512, 512), dtype=torch.float16)
        ),
        "location_coords": tensor2base64(
            torch.ones((1, 2), dtype=torch.float16)
        ),
    }
    request_body = {
        "model": model_name,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "image_embeds", "image_embeds": embeds},
                ],
            }
        ],
        "encoding_format": "base64",
    }

    with RemoteOpenAIServer(model_name, server_args) as server:
        response = requests.post(server.url_for("pooling"), json=request_body)
        response.raise_for_status()

        encoded = response.json()["data"][0]["data"]

        # The pooling output is a base64 blob of raw float32 values.
        embedding = np.frombuffer(base64.b64decode(encoded), dtype=np.float32)
        assert len(embedding) == 524288
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_name", ["Qwen/Qwen3-VL-2B-Instruct"])
def test_multi_content(model_name: str):
    """Chat completions with multiple image_embeds parts in one message.

    Runs two requests against a live server: (1) two image-embeds parts
    back to back, and (2) image-embeds parts interleaved with text. Both
    must produce a valid completion with a single choice.
    """

    def _image_embeds_part() -> dict:
        # One dummy image: 220 zero vectors of width 8192 plus a
        # (t, h, w) patch grid of [1, 22, 40]. The 220-row count
        # presumably corresponds to this grid after the model's patch
        # merge — TODO confirm against the Qwen3-VL processor.
        return {
            "type": "image_embeds",
            "image_embeds": {
                "image_embeds": tensor2base64(torch.zeros(220, 8192)),
                "image_grid_thw": tensor2base64(torch.tensor([1, 22, 40])),
            },
        }

    args = [
        "--enforce-eager",
        "--max-num-seqs",
        "32",
        "--max-model-len",
        "8192",
        "--enable-mm-embeds",
    ]

    with RemoteOpenAIServer(model_name, args) as server:
        client = server.get_client()

        # Image only
        chat_completion = client.chat.completions.create(
            model=model_name,
            messages=[
                {
                    "role": "user",
                    "content": [
                        _image_embeds_part(),
                        _image_embeds_part(),
                    ],
                }
            ],
            max_tokens=5,
        )

        assert chat_completion.id is not None
        assert len(chat_completion.choices) == 1

        # Interleaved text and image
        chat_completion = client.chat.completions.create(
            model=model_name,
            messages=[
                {
                    "role": "user",
                    "content": [
                        _image_embeds_part(),
                        {"type": "text", "text": "OCR:"},
                        _image_embeds_part(),
                    ],
                }
            ],
            max_tokens=5,
        )

        assert chat_completion.id is not None
        assert len(chat_completion.choices) == 1
|
||||
Reference in New Issue
Block a user