[Frontend] Multimodal support in offline chat (#8098)

2024-09-04 13:22:17 +08:00
parent 2be8ec6e71
commit 855c262a6b
8 changed files with 356 additions and 112 deletions
--- a/tests/entrypoints/llm/test_generate.py
+++ b/tests/entrypoints/llm/test_generate.py
@@ -6,6 +6,7 @@ import pytest
 from vllm import LLM, RequestOutput, SamplingParams

 from ...conftest import cleanup
+from ..openai.test_vision import TEST_IMAGE_URLS

 MODEL_NAME = "facebook/opt-125m"

@@ -159,3 +160,36 @@ def test_chat():
    ]
    outputs = llm.chat(messages)
    assert len(outputs) == 1
+
+
+@pytest.mark.parametrize("image_urls",
+                         [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
+def test_chat_multi_image(image_urls: List[str]):
+    llm = LLM(
+        model="microsoft/Phi-3.5-vision-instruct",
+        dtype="bfloat16",
+        max_model_len=4096,
+        max_num_seqs=5,
+        enforce_eager=True,
+        trust_remote_code=True,
+        limit_mm_per_prompt={"image": 2},
+    )
+
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            *({
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url
+                }
+            } for image_url in image_urls),
+            {
+                "type": "text",
+                "text": "What's in this image?"
+            },
+        ],
+    }]
+    outputs = llm.chat(messages)
+    assert len(outputs) >= 0