Signed-off-by: Roger Wang <hey@rogerw.io> Signed-off-by: Chenheli Hua <huachenheli@outlook.com> Signed-off-by: Roger Wang <hey@rogerw.me> Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: Roger Wang <hey@rogerw.io> Co-authored-by: Roger Wang <hey@rogerw.me> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
This commit is contained in:
@@ -21,7 +21,7 @@ from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template,
|
||||
resolve_chat_template_content_format,
|
||||
resolve_hf_chat_template)
|
||||
from vllm.entrypoints.llm import apply_hf_chat_template
|
||||
from vllm.multimodal import MultiModalDataDict
|
||||
from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict
|
||||
from vllm.multimodal.utils import (encode_audio_base64, encode_image_base64,
|
||||
encode_video_base64)
|
||||
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
|
||||
@@ -179,6 +179,27 @@ def _assert_mm_data_is_image_input(
|
||||
assert isinstance(image_data, list) and len(image_data) == image_count
|
||||
|
||||
|
||||
def _assert_mm_uuids(
|
||||
mm_uuids: Optional[MultiModalUUIDDict],
|
||||
media_count: int,
|
||||
expected_uuids: list[Optional[str]],
|
||||
modality: str = "image",
|
||||
) -> None:
|
||||
if len(expected_uuids) > 0:
|
||||
assert mm_uuids is not None
|
||||
assert modality in mm_uuids
|
||||
|
||||
image_uuids = mm_uuids.get(modality)
|
||||
assert image_uuids is not None
|
||||
|
||||
assert isinstance(image_uuids,
|
||||
list) and len(image_uuids) == media_count
|
||||
|
||||
assert image_uuids == expected_uuids
|
||||
else:
|
||||
assert mm_uuids is None
|
||||
|
||||
|
||||
# Modality names that may be used as keys of ``MultiModalDataCounts``.
ModalityType = Literal["image", "video", "audio"]
# Read-only mapping from a modality name to an integer count.
MultiModalDataCounts = Mapping[ModalityType, int]
|
||||
|
||||
@@ -201,7 +222,7 @@ def test_parse_chat_messages_single_image(
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_data = parse_chat_messages(
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
@@ -228,6 +249,260 @@ def test_parse_chat_messages_single_image(
|
||||
"content": "<|image_1|>\nWhat's in the image?"
|
||||
}]
|
||||
_assert_mm_data_is_image_input(mm_data, 1)
|
||||
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])
|
||||
|
||||
|
||||
def test_parse_chat_messages_single_image_with_uuid(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """A ``uuid`` set on an image part is returned in ``mm_uuids``."""
    image_uuid = str(hash(image_url))

    image_part = {
        "type": "image_url",
        "image_url": {
            "url": image_url,
        },
        "uuid": image_uuid,
    }
    text_part = {"type": "text", "text": "What's in the image?"}
    messages = [{"role": "user", "content": [image_part, text_part]}]

    conversation, mm_data, mm_uuids = parse_chat_messages(
        messages,
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    expected_conversation = [{
        "role": "user",
        "content": "<|image_1|>\nWhat's in the image?"
    }]
    assert conversation == expected_conversation
    _assert_mm_data_is_image_input(mm_data, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])
|
||||
|
||||
|
||||
def test_parse_chat_messages_single_image_with_bad_uuid_format(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """A misplaced uuid is expected to yield ``[None]`` in ``mm_uuids``.

    The uuid is supplied both inside the ``image_url`` payload and under
    ``bad_uuid_key``, never under the part-level ``uuid`` key.
    """
    image_uuid = str(hash(image_url))

    image_part = {
        "type": "image_url",
        "image_url": {
            "url": image_url,
            "uuid": image_uuid,
        },
        "bad_uuid_key": image_uuid,
    }
    text_part = {"type": "text", "text": "What's in the image?"}
    messages = [{"role": "user", "content": [image_part, text_part]}]

    conversation, mm_data, mm_uuids = parse_chat_messages(
        messages,
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    expected_conversation = [{
        "role": "user",
        "content": "<|image_1|>\nWhat's in the image?"
    }]
    assert conversation == expected_conversation
    _assert_mm_data_is_image_input(mm_data, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])
|
||||
|
||||
|
||||
def test_parse_chat_messages_multiple_images_with_uuids(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Two image parts with distinct uuids keep their uuids in order."""
    image_uuid1 = "my_uuid_1"
    image_uuid2 = "my_uuid_2"

    first_image = {
        "type": "image_url",
        "image_url": {
            "url": image_url,
        },
        "uuid": image_uuid1,
    }
    second_image = {
        "type": "image_url",
        "image_url": {
            "url": image_url,
        },
        "uuid": image_uuid2,
    }
    question = {"type": "text", "text": "What's in the image?"}
    messages = [{
        "role": "user",
        "content": [first_image, second_image, question],
    }]

    conversation, mm_data, mm_uuids = parse_chat_messages(
        messages,
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    expected_conversation = [{
        "role": "user",
        "content": "<|image_1|>\n<|image_2|>\nWhat's in the image?",
    }]
    assert conversation == expected_conversation
    _assert_mm_data_is_image_input(mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_parse_chat_messages_single_image_with_uuid_async(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Futures variant: a ``uuid`` on an image part lands in ``mm_uuids``."""
    image_uuid = str(hash(image_url))

    image_part = {
        "type": "image_url",
        "image_url": {
            "url": image_url
        },
        "uuid": image_uuid,
    }
    text_part = {"type": "text", "text": "What's in the image?"}
    messages = [{"role": "user", "content": [image_part, text_part]}]

    conversation, pending_mm_data, mm_uuids = parse_chat_messages_futures(
        messages,
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    expected_conversation = [{
        "role": "user",
        "content": "<|image_1|>\nWhat's in the image?"
    }]
    assert conversation == expected_conversation
    # The multimodal data is only awaited after the conversation check.
    _assert_mm_data_is_image_input(await pending_mm_data, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_with_uuids_async(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Futures variant: a URL image and a PIL image both carry uuids."""
    image_uuid1 = "my_uuid_1"
    image_uuid2 = "my_uuid_2"

    url_image = {
        "type": "image_url",
        "image_url": {
            "url": image_url
        },
        "uuid": image_uuid1,
    }
    pil_image = {
        "type": "image_pil",
        "image_pil": ImageAsset("cherry_blossom").pil_image,
        "uuid": image_uuid2,
    }
    question = {"type": "text", "text": "What's in these images?"}
    messages = [{
        "role": "user",
        "content": [url_image, pil_image, question],
    }]

    conversation, pending_mm_data, mm_uuids = parse_chat_messages_futures(
        messages,
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    expected_conversation = [{
        "role": "user",
        "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?",
    }]
    assert conversation == expected_conversation
    _assert_mm_data_is_image_input(await pending_mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Futures variant: only the second image has a uuid.

    The expected uuid list is ``[None, image_uuid2]``.
    """
    image_uuid2 = "my_uuid_2"

    # Deliberately has no "uuid" key.
    url_image = {
        "type": "image_url",
        "image_url": {
            "url": image_url
        },
    }
    pil_image = {
        "type": "image_pil",
        "image_pil": ImageAsset("cherry_blossom").pil_image,
        "uuid": image_uuid2,
    }
    question = {"type": "text", "text": "What's in these images?"}
    messages = [{
        "role": "user",
        "content": [url_image, pil_image, question],
    }]

    conversation, pending_mm_data, mm_uuids = parse_chat_messages_futures(
        messages,
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    expected_conversation = [{
        "role": "user",
        "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?",
    }]
    assert conversation == expected_conversation
    _assert_mm_data_is_image_input(await pending_mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, image_uuid2])
|
||||
|
||||
|
||||
def test_parse_chat_messages_empty_system(
|
||||
@@ -235,7 +510,7 @@ def test_parse_chat_messages_empty_system(
|
||||
mistral_tokenizer,
|
||||
):
|
||||
# Test string format
|
||||
conversation, _ = parse_chat_messages(
|
||||
conversation, _, _ = parse_chat_messages(
|
||||
[
|
||||
{
|
||||
"role": "system",
|
||||
@@ -265,7 +540,7 @@ def test_parse_chat_messages_empty_system(
|
||||
]
|
||||
|
||||
# Test openai format
|
||||
conversation, _ = parse_chat_messages(
|
||||
conversation, _, _ = parse_chat_messages(
|
||||
[
|
||||
{
|
||||
"role": "system",
|
||||
@@ -307,7 +582,7 @@ async def test_parse_chat_messages_single_image_async(
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_future = parse_chat_messages_futures(
|
||||
conversation, mm_future, mm_uuids = parse_chat_messages_futures(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
@@ -334,6 +609,7 @@ async def test_parse_chat_messages_single_image_async(
|
||||
"content": "<|image_1|>\nWhat's in the image?"
|
||||
}]
|
||||
_assert_mm_data_is_image_input(await mm_future, 1)
|
||||
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])
|
||||
|
||||
|
||||
def test_parse_chat_messages_multiple_images(
|
||||
@@ -341,7 +617,7 @@ def test_parse_chat_messages_multiple_images(
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_data = parse_chat_messages(
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
@@ -374,6 +650,7 @@ def test_parse_chat_messages_multiple_images(
|
||||
"<|image_1|>\n<|image_2|>\nWhat's in these images?",
|
||||
}]
|
||||
_assert_mm_data_is_image_input(mm_data, 2)
|
||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -382,7 +659,7 @@ async def test_parse_chat_messages_multiple_images_async(
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_future = parse_chat_messages_futures(
|
||||
conversation, mm_future, mm_uuids = parse_chat_messages_futures(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
@@ -415,6 +692,7 @@ async def test_parse_chat_messages_multiple_images_async(
|
||||
"<|image_1|>\n<|image_2|>\nWhat's in these images?",
|
||||
}]
|
||||
_assert_mm_data_is_image_input(await mm_future, 2)
|
||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
|
||||
|
||||
|
||||
def test_parse_chat_messages_placeholder_already_in_prompt(
|
||||
@@ -422,7 +700,7 @@ def test_parse_chat_messages_placeholder_already_in_prompt(
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_data = parse_chat_messages(
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
@@ -458,6 +736,7 @@ def test_parse_chat_messages_placeholder_already_in_prompt(
|
||||
"What's in <|image_1|> and how does it compare to <|image_2|>?",
|
||||
}]
|
||||
_assert_mm_data_is_image_input(mm_data, 2)
|
||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
|
||||
|
||||
|
||||
def test_parse_chat_messages_placeholder_one_already_in_prompt(
|
||||
@@ -465,7 +744,7 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_data = parse_chat_messages(
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
@@ -503,6 +782,7 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
|
||||
"other one?",
|
||||
}]
|
||||
_assert_mm_data_is_image_input(mm_data, 2)
|
||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
|
||||
|
||||
|
||||
def test_parse_chat_messages_multiple_images_across_messages(
|
||||
@@ -510,7 +790,7 @@ def test_parse_chat_messages_multiple_images_across_messages(
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_data = parse_chat_messages(
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[
|
||||
{
|
||||
"role":
|
||||
@@ -569,13 +849,84 @@ def test_parse_chat_messages_multiple_images_across_messages(
|
||||
},
|
||||
]
|
||||
_assert_mm_data_is_image_input(mm_data, 2)
|
||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
|
||||
|
||||
|
||||
def test_parse_chat_messages_multiple_images_with_uuids_across_messages(
    phi3v_model_config,
    phi3v_tokenizer,
    image_url,
):
    """Images in two user turns share a uuid; both entries are reported."""
    image_uuid = str(hash(image_url))

    def user_turn(question):
        # A fresh dict per turn, so the two turns never share objects.
        return {
            "role":
            "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url
                    },
                    "uuid": image_uuid,
                },
                {
                    "type": "text",
                    "text": question
                },
            ],
        }

    messages = [
        user_turn("What's in this image?"),
        {
            "role": "assistant",
            "content": "Some stuff."
        },
        user_turn("What about this one?"),
    ]

    conversation, mm_data, mm_uuids = parse_chat_messages(
        messages,
        phi3v_model_config,
        phi3v_tokenizer,
        content_format="string",
    )

    expected_conversation = [
        {
            "role": "user",
            "content": "<|image_1|>\nWhat's in this image?"
        },
        {
            "role": "assistant",
            "content": "Some stuff."
        },
        {
            "role": "user",
            "content": "<|image_2|>\nWhat about this one?"
        },
    ]
    assert conversation == expected_conversation
    _assert_mm_data_is_image_input(mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid])
|
||||
|
||||
|
||||
def test_parse_chat_messages_context_text_format(
|
||||
phi3v_model_config,
|
||||
phi3v_tokenizer,
|
||||
):
|
||||
conversation, mm_data = parse_chat_messages(
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
@@ -621,6 +972,8 @@ def test_parse_chat_messages_context_text_format(
|
||||
}],
|
||||
},
|
||||
]
|
||||
assert mm_data is None
|
||||
assert mm_uuids is None
|
||||
|
||||
|
||||
def test_parse_chat_messages_rejects_too_many_images_in_one_message(
|
||||
@@ -736,7 +1089,7 @@ def test_parse_chat_messages_multiple_images_uncommon_input(
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_data = parse_chat_messages(
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
@@ -762,6 +1115,7 @@ def test_parse_chat_messages_multiple_images_uncommon_input(
|
||||
"<|image_1|>\n<|image_2|>\nWhat's in these images?",
|
||||
}]
|
||||
_assert_mm_data_is_image_input(mm_data, 2)
|
||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
|
||||
|
||||
|
||||
def test_parse_chat_messages_multiple_images_interleave(
|
||||
@@ -769,7 +1123,7 @@ def test_parse_chat_messages_multiple_images_interleave(
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_data = parse_chat_messages(
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
@@ -813,6 +1167,7 @@ def test_parse_chat_messages_multiple_images_interleave(
|
||||
"Do they have differences?",
|
||||
}]
|
||||
_assert_mm_data_is_image_input(mm_data, 2)
|
||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -821,7 +1176,7 @@ async def test_parse_chat_messages_multiple_images_interleave_async(
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_data = parse_chat_messages_futures(
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages_futures(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
@@ -865,6 +1220,63 @@ async def test_parse_chat_messages_multiple_images_interleave_async(
|
||||
"Do they have differences?",
|
||||
}]
|
||||
_assert_mm_data_is_image_input(await mm_data, 2)
|
||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async(
    phi3v_model_config_mm_interleaved,
    phi3v_tokenizer,
    image_url,
):
    """Futures variant: text and uuid-tagged images interleaved in one turn."""
    image_uuid = str(hash(image_url))

    def text_part(text):
        return {"type": "text", "text": text}

    def image_part():
        # Built anew on each call so the two image parts stay distinct.
        return {
            "type": "image_url",
            "image_url": {
                "url": image_url
            },
            "uuid": image_uuid,
        }

    messages = [{
        "role":
        "user",
        "content": [
            text_part("I need you to compare this image"),
            image_part(),
            text_part("and this one"),
            image_part(),
            text_part("Do they have differences?"),
        ],
    }]

    conversation, pending_mm_data, mm_uuids = parse_chat_messages_futures(
        messages,
        phi3v_model_config_mm_interleaved,
        phi3v_tokenizer,
        content_format="string",
    )

    expected_conversation = [{
        "role":
        "user",
        "content":
        "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n"  # noqa: E501
        "Do they have differences?",
    }]
    assert conversation == expected_conversation
    _assert_mm_data_is_image_input(await pending_mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid])
|
||||
|
||||
|
||||
def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
|
||||
@@ -872,7 +1284,7 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
|
||||
phi3v_tokenizer,
|
||||
image_url,
|
||||
):
|
||||
conversation, mm_data = parse_chat_messages(
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[
|
||||
{
|
||||
"role":
|
||||
@@ -935,6 +1347,81 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
|
||||
},
|
||||
]
|
||||
_assert_mm_data_is_image_input(mm_data, 2)
|
||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
|
||||
|
||||
|
||||
def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interleave(  # noqa: E501
    phi3v_model_config_mm_interleaved,
    phi3v_tokenizer,
    image_url,
):
    """Interleaved text and uuid-tagged images spread over two user turns."""
    image_uuid = str(hash(image_url))

    def image_part():
        # Built anew on each call so the two turns never share a dict.
        return {
            "type": "image_url",
            "image_url": {
                "url": image_url
            },
            "uuid": image_uuid,
        }

    first_turn = {
        "role":
        "user",
        "content": [
            {
                "type": "text",
                "text": "What's on this image?"
            },
            image_part(),
            {
                "type": "text",
                "text": "Be accurate."
            },
        ],
    }
    reply = {"role": "assistant", "content": "Some stuff."}
    second_turn = {
        "role":
        "user",
        "content": [
            {
                "type": "text",
                "text": "What's on this image?"
            },
            image_part(),
        ],
    }

    conversation, mm_data, mm_uuids = parse_chat_messages(
        [first_turn, reply, second_turn],
        phi3v_model_config_mm_interleaved,
        phi3v_tokenizer,
        content_format="string",
    )

    expected_conversation = [
        {
            "role": "user",
            "content": "What's on this image?\n<|image_1|>\nBe accurate.",
        },
        {
            "role": "assistant",
            "content": "Some stuff."
        },
        {
            "role": "user",
            "content": "What's on this image?\n<|image_2|>"
        },
    ]
    assert conversation == expected_conversation
    _assert_mm_data_is_image_input(mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid])
|
||||
|
||||
|
||||
def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
|
||||
@@ -944,7 +1431,7 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
|
||||
video_url,
|
||||
audio_url,
|
||||
):
|
||||
conversation, mm_data = parse_chat_messages(
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[
|
||||
{
|
||||
"role":
|
||||
@@ -1030,6 +1517,229 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
|
||||
]
|
||||
|
||||
_assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1})
|
||||
_assert_mm_uuids(mm_uuids,
|
||||
2,
|
||||
modality="image",
|
||||
expected_uuids=[None, None])
|
||||
_assert_mm_uuids(mm_uuids, 1, modality="video", expected_uuids=[None])
|
||||
_assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None])
|
||||
|
||||
|
||||
def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interleave(  # noqa: E501
    qwen25omni_model_config_mm_interleaved,
    qwen25omni_tokenizer,
    image_url,
    video_url,
    audio_url,
):
    """Image, audio and video parts all carry uuids across two user turns."""
    first_turn = {
        "role":
        "user",
        "content": [
            {
                "type": "text",
                "text": "What's on this image?"
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": image_url
                },
                "uuid": "image_123",
            },
            {
                "type": "text",
                "text": "Now listen to this audio"
            },
            {
                "type": "audio_url",
                "audio_url": {
                    "url": audio_url
                },
                "uuid": "audio_123",
            },
        ],
    }
    reply = {"role": "assistant", "content": "Some stuff."}
    second_turn = {
        "role":
        "user",
        "content": [
            {
                "type": "text",
                "text": "What's on this image?"
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": image_url
                },
                "uuid": "image_123",
            },
            {
                "type": "text",
                "text": "And what's in the video?"
            },
            {
                "type": "video_url",
                "video_url": {
                    "url": video_url
                },
                "uuid": "video_123",
            },
        ],
    }

    conversation, mm_data, mm_uuids = parse_chat_messages(
        [first_turn, reply, second_turn],
        qwen25omni_model_config_mm_interleaved,
        qwen25omni_tokenizer,
        content_format="string",
    )

    expected_conversation = [
        {
            "role":
            "user",
            "content":
            "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
            "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",  # noqa: E501
        },
        {
            "role": "assistant",
            "content": "Some stuff."
        },
        {
            "role":
            "user",
            "content":
            "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
            "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
        },
    ]
    assert conversation == expected_conversation

    _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1})
    _assert_mm_uuids(mm_uuids,
                     2,
                     modality="image",
                     expected_uuids=["image_123", "image_123"])
    _assert_mm_uuids(mm_uuids,
                     1,
                     modality="video",
                     expected_uuids=["video_123"])
    _assert_mm_uuids(mm_uuids,
                     1,
                     modality="audio",
                     expected_uuids=["audio_123"])
|
||||
|
||||
|
||||
def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_messages_interleave(  # noqa: E501
    qwen25omni_model_config_mm_interleaved,
    qwen25omni_tokenizer,
    image_url,
    video_url,
    audio_url,
):
    """Only the first image and the video carry uuids.

    The audio part and the second image have no ``uuid`` key, and their
    expected uuid entries are ``None``.
    """
    first_turn = {
        "role":
        "user",
        "content": [
            {
                "type": "text",
                "text": "What's on this image?"
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": image_url
                },
                "uuid": "image_123",
            },
            {
                "type": "text",
                "text": "Now listen to this audio"
            },
            {
                "type": "audio_url",
                "audio_url": {
                    "url": audio_url
                }
            },
        ],
    }
    reply = {"role": "assistant", "content": "Some stuff."}
    second_turn = {
        "role":
        "user",
        "content": [
            {
                "type": "text",
                "text": "What's on this image?"
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": image_url
                }
            },
            {
                "type": "text",
                "text": "And what's in the video?"
            },
            {
                "type": "video_url",
                "video_url": {
                    "url": video_url
                },
                "uuid": "video_123",
            },
        ],
    }

    conversation, mm_data, mm_uuids = parse_chat_messages(
        [first_turn, reply, second_turn],
        qwen25omni_model_config_mm_interleaved,
        qwen25omni_tokenizer,
        content_format="string",
    )

    expected_conversation = [
        {
            "role":
            "user",
            "content":
            "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
            "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",  # noqa: E501
        },
        {
            "role": "assistant",
            "content": "Some stuff."
        },
        {
            "role":
            "user",
            "content":
            "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
            "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
        },
    ]
    assert conversation == expected_conversation

    _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1})
    _assert_mm_uuids(mm_uuids,
                     2,
                     modality="image",
                     expected_uuids=["image_123", None])
    _assert_mm_uuids(mm_uuids,
                     1,
                     modality="video",
                     expected_uuids=["video_123"])
    _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None])
|
||||
|
||||
|
||||
def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
|
||||
@@ -1081,7 +1791,7 @@ def test_mllama_single_image(
|
||||
image_url,
|
||||
):
|
||||
"""Ensures that a single image is parsed correctly mllama."""
|
||||
conversation, mm_data = parse_chat_messages(
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
@@ -1100,6 +1810,7 @@ def test_mllama_single_image(
|
||||
content_format="openai",
|
||||
)
|
||||
_assert_mm_data_is_image_input(mm_data, 1)
|
||||
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])
|
||||
assert conversation == [{
|
||||
"role":
|
||||
"user",
|
||||
@@ -1121,7 +1832,7 @@ def test_mllama_interleaved_images(
|
||||
image_url,
|
||||
):
|
||||
"""Ensures that multiple image are parsed as interleaved dicts."""
|
||||
conversation, mm_data = parse_chat_messages(
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
[{
|
||||
"role":
|
||||
"user",
|
||||
@@ -1147,6 +1858,7 @@ def test_mllama_interleaved_images(
|
||||
content_format="openai",
|
||||
)
|
||||
_assert_mm_data_is_image_input(mm_data, 2)
|
||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
|
||||
assert conversation == [{
|
||||
"role":
|
||||
"user",
|
||||
@@ -1227,7 +1939,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
|
||||
|
||||
# Now parse with vLLMs chat utils & apply the template
|
||||
vllm_conversation = get_conversation(is_hf=False)
|
||||
conversation, _ = parse_chat_messages(
|
||||
conversation, _, _ = parse_chat_messages(
|
||||
vllm_conversation,
|
||||
model_config,
|
||||
tokenizer_group,
|
||||
@@ -1518,7 +2230,7 @@ def test_parse_chat_messages_include_thinking_chunk(mistral_model_config,
|
||||
}],
|
||||
}]
|
||||
|
||||
conversation_with_thinking, _ = parse_chat_messages(
|
||||
conversation_with_thinking, _, _ = parse_chat_messages(
|
||||
messages,
|
||||
mistral_model_config,
|
||||
mistral_tokenizer,
|
||||
|
||||
Reference in New Issue
Block a user