[Bug] Fix Failure in /v1/chat/completions/render for Multimodal Requests (https://github.com/vllm-project/vllm/issues/35665) (#35684)
This commit is contained in:
155
tests/entrypoints/openai/cpu/test_render_multimodal.py
Normal file
155
tests/entrypoints/openai/cpu/test_render_multimodal.py
Normal file
@@ -0,0 +1,155 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
"""Multimodal tests for the /render endpoints that expose prompt preprocessing."""
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
|
||||
from tests.utils import RemoteOpenAIServer
|
||||
from vllm.multimodal.utils import encode_image_url
|
||||
|
||||
VISION_MODEL_NAME = "Qwen/Qwen3-VL-2B-Instruct"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def vision_server():
|
||||
"""Vision-capable server used for multimodal /render tests."""
|
||||
|
||||
args = [
|
||||
"--enforce-eager",
|
||||
"--max-model-len",
|
||||
"100",
|
||||
"--max-num-seqs",
|
||||
"1",
|
||||
"--limit-mm-per-prompt.image",
|
||||
"1",
|
||||
"--limit-mm-per-prompt.video",
|
||||
"0",
|
||||
]
|
||||
|
||||
env_overrides: dict[str, str] = {}
|
||||
|
||||
with RemoteOpenAIServer(
|
||||
VISION_MODEL_NAME,
|
||||
args,
|
||||
env_dict=env_overrides,
|
||||
) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest_asyncio.fixture
|
||||
async def vision_client(vision_server):
|
||||
async with httpx.AsyncClient(
|
||||
base_url=vision_server.url_for(""), timeout=60.0
|
||||
) as http_client:
|
||||
yield http_client
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_chat_completion_render_with_base64_image_url(
|
||||
vision_client,
|
||||
local_asset_server,
|
||||
):
|
||||
"""Render a multimodal chat request and verify tokens are returned."""
|
||||
|
||||
image = local_asset_server.get_image_asset("RGBA_comp.png")
|
||||
data_url = encode_image_url(image, format="PNG")
|
||||
|
||||
assert data_url.startswith("data:image/")
|
||||
assert ";base64," in data_url
|
||||
|
||||
response = await vision_client.post(
|
||||
"/v1/chat/completions/render",
|
||||
json={
|
||||
"model": VISION_MODEL_NAME,
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": data_url}},
|
||||
{"type": "text", "text": "What's in this image?"},
|
||||
],
|
||||
}
|
||||
],
|
||||
},
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
|
||||
data = response.json()
|
||||
assert isinstance(data, dict)
|
||||
assert "token_ids" in data
|
||||
assert isinstance(data["token_ids"], list)
|
||||
assert len(data["token_ids"]) > 0
|
||||
|
||||
# Verify multimodal features are populated
|
||||
assert "features" in data
|
||||
features = data["features"]
|
||||
assert features is not None
|
||||
|
||||
# mm_hashes: should have an "image" key with a list of hash strings
|
||||
assert "mm_hashes" in features
|
||||
assert "image" in features["mm_hashes"]
|
||||
image_hashes = features["mm_hashes"]["image"]
|
||||
assert isinstance(image_hashes, list)
|
||||
assert len(image_hashes) > 0
|
||||
assert all(isinstance(h, str) for h in image_hashes)
|
||||
|
||||
# mm_placeholders: should have an "image" key with offset/length dicts
|
||||
assert "mm_placeholders" in features
|
||||
assert "image" in features["mm_placeholders"]
|
||||
image_placeholders = features["mm_placeholders"]["image"]
|
||||
assert isinstance(image_placeholders, list)
|
||||
assert len(image_placeholders) > 0
|
||||
for p in image_placeholders:
|
||||
assert "offset" in p
|
||||
assert "length" in p
|
||||
assert isinstance(p["offset"], int)
|
||||
assert isinstance(p["length"], int)
|
||||
assert p["length"] > 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_tokenize_matches_render_for_multimodal_input(
|
||||
vision_client,
|
||||
local_asset_server,
|
||||
):
|
||||
"""`/tokenize` should match `/v1/chat/completions/render` token output."""
|
||||
|
||||
image = local_asset_server.get_image_asset("RGBA_comp.png")
|
||||
data_url = encode_image_url(image, format="PNG")
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image_url", "image_url": {"url": data_url}},
|
||||
{"type": "text", "text": "What's in this image?"},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
render_response = await vision_client.post(
|
||||
"/v1/chat/completions/render",
|
||||
json={
|
||||
"model": VISION_MODEL_NAME,
|
||||
"messages": messages,
|
||||
},
|
||||
)
|
||||
assert render_response.status_code == 200
|
||||
render_data = render_response.json()
|
||||
|
||||
tokenize_response = await vision_client.post(
|
||||
"/tokenize",
|
||||
json={
|
||||
"model": VISION_MODEL_NAME,
|
||||
"messages": messages,
|
||||
},
|
||||
)
|
||||
assert tokenize_response.status_code == 200
|
||||
tokenize_data = tokenize_response.json()
|
||||
|
||||
assert tokenize_data["tokens"] == render_data["token_ids"]
|
||||
assert tokenize_data["count"] == len(render_data["token_ids"])
|
||||
Reference in New Issue
Block a user