[Frontend] Multimodal support in offline chat (#8098)

This commit is contained in:
Cyrus Leung
2024-09-04 13:22:17 +08:00
committed by GitHub
parent 2be8ec6e71
commit 855c262a6b
8 changed files with 356 additions and 112 deletions

View File

@@ -1,11 +1,14 @@
import warnings
from typing import Optional
import pytest
from PIL import Image
from vllm.assets.image import ImageAsset
from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import parse_chat_messages
from vllm.entrypoints.chat_utils import (parse_chat_messages,
parse_chat_messages_futures)
from vllm.multimodal import MultiModalDataDict
from vllm.multimodal.utils import encode_image_base64
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
@@ -42,10 +45,28 @@ def image_url():
return f"data:image/jpeg;base64,{base64}"
@pytest.mark.asyncio
async def test_parse_chat_messages_with_image_url(phi3v_model_config,
phi3v_tokenizer, image_url):
conversation, mm_future = parse_chat_messages([{
def _assert_mm_data_is_image_input(
mm_data: Optional[MultiModalDataDict],
image_count: int,
) -> None:
assert mm_data is not None
assert set(mm_data.keys()) == {"image"}
image_data = mm_data.get("image")
assert image_data is not None
if image_count == 1:
assert isinstance(image_data, Image.Image)
else:
assert isinstance(image_data, list) and len(image_data) == image_count
def test_parse_chat_messages_single_image(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
conversation, mm_data = parse_chat_messages([{
"role":
"user",
"content": [{
@@ -63,15 +84,42 @@ async def test_parse_chat_messages_with_image_url(phi3v_model_config,
"role": "user",
"content": "<|image_1|>\nWhat's in the image?"
}]
mm_data = await mm_future
assert set(mm_data.keys()) == {"image"}
assert isinstance(mm_data["image"], Image.Image)
_assert_mm_data_is_image_input(mm_data, 1)
@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images(phi3v_model_config,
phi3v_tokenizer, image_url):
conversation, mm_future = parse_chat_messages([{
async def test_parse_chat_messages_single_image_async(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
conversation, mm_future = parse_chat_messages_futures([{
"role":
"user",
"content": [{
"type": "image_url",
"image_url": {
"url": image_url
}
}, {
"type": "text",
"text": "What's in the image?"
}]
}], phi3v_model_config, phi3v_tokenizer)
assert conversation == [{
"role": "user",
"content": "<|image_1|>\nWhat's in the image?"
}]
_assert_mm_data_is_image_input(await mm_future, 1)
def test_parse_chat_messages_multiple_images(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
conversation, mm_data = parse_chat_messages([{
"role":
"user",
"content": [{
@@ -96,15 +144,49 @@ async def test_parse_chat_messages_multiple_images(phi3v_model_config,
"content":
"<|image_1|>\n<|image_2|>\nWhat's in these images?"
}]
mm_data = await mm_future
assert set(mm_data.keys()) == {"image"}
assert len(mm_data["image"]) == 2
_assert_mm_data_is_image_input(mm_data, 2)
@pytest.mark.asyncio
async def test_parse_chat_messages_placeholder_already_in_prompt(
phi3v_model_config, phi3v_tokenizer, image_url):
conversation, mm_future = parse_chat_messages([{
async def test_parse_chat_messages_multiple_images_async(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
conversation, mm_future = parse_chat_messages_futures([{
"role":
"user",
"content": [{
"type": "image_url",
"image_url": {
"url": image_url
}
}, {
"type": "image_url",
"image_url": {
"url": image_url
}
}, {
"type": "text",
"text": "What's in these images?"
}]
}], phi3v_model_config, phi3v_tokenizer)
assert conversation == [{
"role":
"user",
"content":
"<|image_1|>\n<|image_2|>\nWhat's in these images?"
}]
_assert_mm_data_is_image_input(await mm_future, 2)
def test_parse_chat_messages_placeholder_already_in_prompt(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
conversation, mm_data = parse_chat_messages([{
"role":
"user",
"content": [{
@@ -131,15 +213,15 @@ async def test_parse_chat_messages_placeholder_already_in_prompt(
"content":
"What's in <|image_1|> and how does it compare to <|image_2|>?"
}]
mm_data = await mm_future
assert set(mm_data.keys()) == {"image"}
assert len(mm_data["image"]) == 2
_assert_mm_data_is_image_input(mm_data, 2)
@pytest.mark.asyncio
async def test_parse_chat_messages_placeholder_one_already_in_prompt(
phi3v_model_config, phi3v_tokenizer, image_url):
conversation, mm_future = parse_chat_messages([{
def test_parse_chat_messages_placeholder_one_already_in_prompt(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
conversation, mm_data = parse_chat_messages([{
"role":
"user",
"content": [{
@@ -167,15 +249,15 @@ async def test_parse_chat_messages_placeholder_one_already_in_prompt(
"<|image_2|>\nWhat's in <|image_1|> and how does it compare to the "
"other one?"
}]
mm_data = await mm_future
assert set(mm_data.keys()) == {"image"}
assert len(mm_data["image"]) == 2
_assert_mm_data_is_image_input(mm_data, 2)
@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_across_messages(
phi3v_model_config, phi3v_tokenizer, image_url):
conversation, mm_future = parse_chat_messages([{
def test_parse_chat_messages_multiple_images_across_messages(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
conversation, mm_data = parse_chat_messages([{
"role":
"user",
"content": [{
@@ -218,14 +300,14 @@ async def test_parse_chat_messages_multiple_images_across_messages(
"content": "<|image_2|>\nWhat about this one?"
},
]
mm_data = await mm_future
assert set(mm_data.keys()) == {"image"}
assert len(mm_data["image"]) == 2
_assert_mm_data_is_image_input(mm_data, 2)
@pytest.mark.asyncio
async def test_parse_chat_messages_rejects_too_many_images_in_one_message(
phi3v_model_config, phi3v_tokenizer, image_url):
def test_parse_chat_messages_rejects_too_many_images_in_one_message(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
@@ -259,9 +341,11 @@ async def test_parse_chat_messages_rejects_too_many_images_in_one_message(
}], phi3v_model_config, phi3v_tokenizer)
@pytest.mark.asyncio
async def test_parse_chat_messages_rejects_too_many_images_across_messages(
phi3v_model_config, phi3v_tokenizer, image_url):
def test_parse_chat_messages_rejects_too_many_images_across_messages(
phi3v_model_config,
phi3v_tokenizer,
image_url,
):
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",