[Frontend] Multimodal support in offline chat (#8098)

2024-09-04 13:22:17 +08:00
parent 2be8ec6e71
commit 855c262a6b
8 changed files with 356 additions and 112 deletions
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -1,11 +1,14 @@
 import warnings
+from typing import Optional

 import pytest
 from PIL import Image

 from vllm.assets.image import ImageAsset
 from vllm.config import ModelConfig
-from vllm.entrypoints.chat_utils import parse_chat_messages
+from vllm.entrypoints.chat_utils import (parse_chat_messages,
+                                         parse_chat_messages_futures)
+from vllm.multimodal import MultiModalDataDict
 from vllm.multimodal.utils import encode_image_base64
 from vllm.transformers_utils.tokenizer_group import TokenizerGroup

@@ -42,10 +45,28 @@ def image_url():
    return f"data:image/jpeg;base64,{base64}"


-@pytest.mark.asyncio
-async def test_parse_chat_messages_with_image_url(phi3v_model_config,
-                                                  phi3v_tokenizer, image_url):
-    conversation, mm_future = parse_chat_messages([{
+def _assert_mm_data_is_image_input(  # test helper: mm_data must hold image input only, matching image_count
+    mm_data: Optional[MultiModalDataDict],
+    image_count: int,
+) -> None:
+    assert mm_data is not None
+    assert set(mm_data.keys()) == {"image"}  # "image" must be the only key present
+
+    image_data = mm_data.get("image")
+    assert image_data is not None
+
+    if image_count == 1:
+        assert isinstance(image_data, Image.Image)  # exactly one image: a bare PIL image, not a list
+    else:  # any other count: a list whose length equals image_count
+        assert isinstance(image_data, list) and len(image_data) == image_count
+
+
+def test_parse_chat_messages_single_image(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_data = parse_chat_messages([{
        "role":
        "user",
        "content": [{
@@ -63,15 +84,42 @@ async def test_parse_chat_messages_with_image_url(phi3v_model_config,
        "role": "user",
        "content": "<|image_1|>\nWhat's in the image?"
    }]
-    mm_data = await mm_future
-    assert set(mm_data.keys()) == {"image"}
-    assert isinstance(mm_data["image"], Image.Image)
+    _assert_mm_data_is_image_input(mm_data, 1)


@pytest.mark.asyncio
-async def test_parse_chat_messages_multiple_images(phi3v_model_config,
-                                                   phi3v_tokenizer, image_url):
-    conversation, mm_future = parse_chat_messages([{
+async def test_parse_chat_messages_single_image_async(  # one image_url part + text, via parse_chat_messages_futures
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_future = parse_chat_messages_futures([{
+        "role":
+        "user",
+        "content": [{
+            "type": "image_url",
+            "image_url": {
+                "url": image_url
+            }
+        }, {
+            "type": "text",
+            "text": "What's in the image?"
+        }]
+    }], phi3v_model_config, phi3v_tokenizer)
+
+    assert conversation == [{  # content parts collapse to one string: placeholder line, then the text
+        "role": "user",
+        "content": "<|image_1|>\nWhat's in the image?"
+    }]
+    _assert_mm_data_is_image_input(await mm_future, 1)  # second return value is awaited to obtain the mm data
+
+
+def test_parse_chat_messages_multiple_images(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_data = parse_chat_messages([{
        "role":
        "user",
        "content": [{
@@ -96,15 +144,49 @@ async def test_parse_chat_messages_multiple_images(phi3v_model_config,
        "content":
        "<|image_1|>\n<|image_2|>\nWhat's in these images?"
    }]
-    mm_data = await mm_future
-    assert set(mm_data.keys()) == {"image"}
-    assert len(mm_data["image"]) == 2
+    _assert_mm_data_is_image_input(mm_data, 2)


@pytest.mark.asyncio
-async def test_parse_chat_messages_placeholder_already_in_prompt(
-        phi3v_model_config, phi3v_tokenizer, image_url):
-    conversation, mm_future = parse_chat_messages([{
+async def test_parse_chat_messages_multiple_images_async(  # two image_url parts + text, via parse_chat_messages_futures
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_future = parse_chat_messages_futures([{
+        "role":
+        "user",
+        "content": [{
+            "type": "image_url",
+            "image_url": {
+                "url": image_url
+            }
+        }, {
+            "type": "image_url",
+            "image_url": {
+                "url": image_url  # same fixture URL reused for the second image
+            }
+        }, {
+            "type": "text",
+            "text": "What's in these images?"
+        }]
+    }], phi3v_model_config, phi3v_tokenizer)
+
+    assert conversation == [{  # expects numbered placeholders, one per line, ahead of the text
+        "role":
+        "user",
+        "content":
+        "<|image_1|>\n<|image_2|>\nWhat's in these images?"
+    }]
+    _assert_mm_data_is_image_input(await mm_future, 2)  # second return value is awaited to obtain the mm data
+
+
+def test_parse_chat_messages_placeholder_already_in_prompt(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_data = parse_chat_messages([{
        "role":
        "user",
        "content": [{
@@ -131,15 +213,15 @@ async def test_parse_chat_messages_placeholder_already_in_prompt(
        "content":
        "What's in <|image_1|> and how does it compare to <|image_2|>?"
    }]
-    mm_data = await mm_future
-    assert set(mm_data.keys()) == {"image"}
-    assert len(mm_data["image"]) == 2
+    _assert_mm_data_is_image_input(mm_data, 2)


-@pytest.mark.asyncio
-async def test_parse_chat_messages_placeholder_one_already_in_prompt(
-        phi3v_model_config, phi3v_tokenizer, image_url):
-    conversation, mm_future = parse_chat_messages([{
+def test_parse_chat_messages_placeholder_one_already_in_prompt(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_data = parse_chat_messages([{
        "role":
        "user",
        "content": [{
@@ -167,15 +249,15 @@ async def test_parse_chat_messages_placeholder_one_already_in_prompt(
        "<|image_2|>\nWhat's in <|image_1|> and how does it compare to the "
        "other one?"
    }]
-    mm_data = await mm_future
-    assert set(mm_data.keys()) == {"image"}
-    assert len(mm_data["image"]) == 2
+    _assert_mm_data_is_image_input(mm_data, 2)


-@pytest.mark.asyncio
-async def test_parse_chat_messages_multiple_images_across_messages(
-        phi3v_model_config, phi3v_tokenizer, image_url):
-    conversation, mm_future = parse_chat_messages([{
+def test_parse_chat_messages_multiple_images_across_messages(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    conversation, mm_data = parse_chat_messages([{
        "role":
        "user",
        "content": [{
@@ -218,14 +300,14 @@ async def test_parse_chat_messages_multiple_images_across_messages(
            "content": "<|image_2|>\nWhat about this one?"
        },
    ]
-    mm_data = await mm_future
-    assert set(mm_data.keys()) == {"image"}
-    assert len(mm_data["image"]) == 2
+    _assert_mm_data_is_image_input(mm_data, 2)


-@pytest.mark.asyncio
-async def test_parse_chat_messages_rejects_too_many_images_in_one_message(
-        phi3v_model_config, phi3v_tokenizer, image_url):
+def test_parse_chat_messages_rejects_too_many_images_in_one_message(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
@@ -259,9 +341,11 @@ async def test_parse_chat_messages_rejects_too_many_images_in_one_message(
            }], phi3v_model_config, phi3v_tokenizer)


-@pytest.mark.asyncio
-async def test_parse_chat_messages_rejects_too_many_images_across_messages(
-        phi3v_model_config, phi3v_tokenizer, image_url):
+def test_parse_chat_messages_rejects_too_many_images_across_messages(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",