[Misc] Cleanup Kimi-K2.5's vision chunk modality entrypoints (#33157)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
@@ -24,12 +24,25 @@ from vllm.multimodal.utils import (
|
||||
)
|
||||
from vllm.utils.serial_utils import tensor2base64
|
||||
|
||||
KIMI_K2_5_MODEL_ID = "moonshotai/Kimi-K2.5"
|
||||
PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
|
||||
QWEN2AUDIO_MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"
|
||||
QWEN25OMNI_MODEL_ID = "Qwen/Qwen2.5-Omni-7B"
|
||||
MISTRAL_MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def kimi_k2_5_model_config():
|
||||
return ModelConfig(
|
||||
KIMI_K2_5_MODEL_ID,
|
||||
runner="generate",
|
||||
trust_remote_code=True,
|
||||
limit_mm_per_prompt={
|
||||
"image": 2,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def phi3v_model_config():
|
||||
return ModelConfig(
|
||||
@@ -163,6 +176,22 @@ def _assert_mm_data_is_image_input(
|
||||
assert image_data[i] is None
|
||||
|
||||
|
||||
def _assert_mm_data_is_vision_chunk_input(
|
||||
mm_data: MultiModalDataDict | None,
|
||||
vision_chunk_count: int,
|
||||
) -> None:
|
||||
assert mm_data is not None
|
||||
assert set(mm_data.keys()) == {"vision_chunk"}
|
||||
|
||||
vision_chunk_data = mm_data.get("vision_chunk")
|
||||
assert vision_chunk_data is not None
|
||||
|
||||
assert (
|
||||
isinstance(vision_chunk_data, list)
|
||||
and len(vision_chunk_data) == vision_chunk_count
|
||||
)
|
||||
|
||||
|
||||
def _assert_mm_uuids(
|
||||
mm_uuids: MultiModalUUIDDict | None,
|
||||
media_count: int,
|
||||
@@ -2151,3 +2180,505 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
|
||||
]
|
||||
_assert_mm_data_inputs(mm_data, {"audio": 1})
|
||||
_assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[audio_uuid])
|
||||
|
||||
|
||||
def test_parse_chat_messages_image_vision_chunk(
|
||||
kimi_k2_5_model_config,
|
||||
image_url,
|
||||
):
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "Analyze this image."},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": image_url},
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
messages,
|
||||
kimi_k2_5_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
placeholder = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
|
||||
expected_conversation = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": f"{placeholder}\nAnalyze this image.",
|
||||
}
|
||||
]
|
||||
|
||||
assert conversation == expected_conversation
|
||||
_assert_mm_data_is_vision_chunk_input(mm_data, 1)
|
||||
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[None], modality="vision_chunk")
|
||||
|
||||
|
||||
def test_parse_chat_messages_video_vision_chunk(
|
||||
kimi_k2_5_model_config,
|
||||
video_url,
|
||||
):
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "Analyze this video."},
|
||||
{
|
||||
"type": "video_url",
|
||||
"video_url": {"url": video_url},
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
messages,
|
||||
kimi_k2_5_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
placeholder = "<|kimi_k25_video_placeholder|>"
|
||||
expected_conversation = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": f"{placeholder}\nAnalyze this video.",
|
||||
}
|
||||
]
|
||||
|
||||
assert conversation == expected_conversation
|
||||
_assert_mm_data_is_vision_chunk_input(mm_data, 1)
|
||||
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[None], modality="vision_chunk")
|
||||
|
||||
|
||||
def test_parse_chat_messages_image_vision_chunk_with_uuid(
|
||||
kimi_k2_5_model_config,
|
||||
image_url,
|
||||
):
|
||||
image_uuid = "image_123"
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "Analyze this image."},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": image_url},
|
||||
"uuid": image_uuid,
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
messages,
|
||||
kimi_k2_5_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
placeholder = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
|
||||
expected_conversation = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": f"{placeholder}\nAnalyze this image.",
|
||||
}
|
||||
]
|
||||
|
||||
assert conversation == expected_conversation
|
||||
_assert_mm_data_is_vision_chunk_input(mm_data, 1)
|
||||
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid], modality="vision_chunk")
|
||||
|
||||
|
||||
def test_parse_chat_messages_video_vision_chunk_with_uuid(
|
||||
kimi_k2_5_model_config,
|
||||
video_url,
|
||||
):
|
||||
video_uuid = "video_456"
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "Analyze this video."},
|
||||
{
|
||||
"type": "video_url",
|
||||
"video_url": {"url": video_url},
|
||||
"uuid": video_uuid,
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
messages,
|
||||
kimi_k2_5_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
placeholder = "<|kimi_k25_video_placeholder|>"
|
||||
expected_conversation = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": f"{placeholder}\nAnalyze this video.",
|
||||
}
|
||||
]
|
||||
|
||||
assert conversation == expected_conversation
|
||||
_assert_mm_data_is_vision_chunk_input(mm_data, 1)
|
||||
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[video_uuid], modality="vision_chunk")
|
||||
|
||||
|
||||
def test_parse_chat_messages_mixed_vision_chunk(
|
||||
kimi_k2_5_model_config,
|
||||
image_url,
|
||||
video_url,
|
||||
):
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "Analyze this image and video."},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": image_url},
|
||||
},
|
||||
{
|
||||
"type": "video_url",
|
||||
"video_url": {"url": video_url},
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
messages,
|
||||
kimi_k2_5_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
image_placeholder = (
|
||||
"<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
|
||||
)
|
||||
video_placeholder = "<|kimi_k25_video_placeholder|>"
|
||||
expected_conversation = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": (
|
||||
f"{image_placeholder}\n{video_placeholder}\n"
|
||||
"Analyze this image and video."
|
||||
),
|
||||
}
|
||||
]
|
||||
|
||||
assert conversation == expected_conversation
|
||||
_assert_mm_data_is_vision_chunk_input(mm_data, 2)
|
||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None], modality="vision_chunk")
|
||||
|
||||
|
||||
def test_parse_chat_messages_mixed_vision_chunk_with_uuid(
|
||||
kimi_k2_5_model_config,
|
||||
image_url,
|
||||
video_url,
|
||||
):
|
||||
image_uuid = "image_123"
|
||||
video_uuid = "video_456"
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "Analyze this image and video."},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": image_url},
|
||||
"uuid": image_uuid,
|
||||
},
|
||||
{
|
||||
"type": "video_url",
|
||||
"video_url": {"url": video_url},
|
||||
"uuid": video_uuid,
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
messages,
|
||||
kimi_k2_5_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
image_placeholder = (
|
||||
"<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
|
||||
)
|
||||
video_placeholder = "<|kimi_k25_video_placeholder|>"
|
||||
expected_conversation = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": (
|
||||
f"{image_placeholder}\n{video_placeholder}\n"
|
||||
"Analyze this image and video."
|
||||
),
|
||||
}
|
||||
]
|
||||
|
||||
assert conversation == expected_conversation
|
||||
_assert_mm_data_is_vision_chunk_input(mm_data, 2)
|
||||
_assert_mm_uuids(
|
||||
mm_uuids, 2, expected_uuids=[image_uuid, video_uuid], modality="vision_chunk"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_parse_chat_messages_mixed_vision_chunk_async(
|
||||
kimi_k2_5_model_config,
|
||||
image_url,
|
||||
video_url,
|
||||
):
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "Analyze this image and video."},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": image_url},
|
||||
},
|
||||
{
|
||||
"type": "video_url",
|
||||
"video_url": {"url": video_url},
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
conversation, mm_data, mm_uuids = await parse_chat_messages_async(
|
||||
messages,
|
||||
kimi_k2_5_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
image_placeholder = (
|
||||
"<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
|
||||
)
|
||||
video_placeholder = "<|kimi_k25_video_placeholder|>"
|
||||
expected_conversation = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": (
|
||||
f"{image_placeholder}\n{video_placeholder}\n"
|
||||
"Analyze this image and video."
|
||||
),
|
||||
}
|
||||
]
|
||||
|
||||
assert conversation == expected_conversation
|
||||
_assert_mm_data_is_vision_chunk_input(mm_data, 2)
|
||||
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None], modality="vision_chunk")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_parse_chat_messages_mixed_vision_chunk_with_uuid_async(
|
||||
kimi_k2_5_model_config,
|
||||
image_url,
|
||||
video_url,
|
||||
):
|
||||
image_uuid = "image_123"
|
||||
video_uuid = "video_456"
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "Analyze this image and video."},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": image_url},
|
||||
"uuid": image_uuid,
|
||||
},
|
||||
{
|
||||
"type": "video_url",
|
||||
"video_url": {"url": video_url},
|
||||
"uuid": video_uuid,
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
conversation, mm_data, mm_uuids = await parse_chat_messages_async(
|
||||
messages,
|
||||
kimi_k2_5_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
image_placeholder = (
|
||||
"<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
|
||||
)
|
||||
video_placeholder = "<|kimi_k25_video_placeholder|>"
|
||||
expected_conversation = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": (
|
||||
f"{image_placeholder}\n{video_placeholder}\n"
|
||||
"Analyze this image and video."
|
||||
),
|
||||
}
|
||||
]
|
||||
|
||||
assert conversation == expected_conversation
|
||||
_assert_mm_data_is_vision_chunk_input(mm_data, 2)
|
||||
_assert_mm_uuids(
|
||||
mm_uuids, 2, expected_uuids=[image_uuid, video_uuid], modality="vision_chunk"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_parse_chat_messages_image_vision_chunk_async(
|
||||
kimi_k2_5_model_config,
|
||||
image_url,
|
||||
):
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "Analyze this image."},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": image_url},
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
conversation, mm_data, mm_uuids = await parse_chat_messages_async(
|
||||
messages,
|
||||
kimi_k2_5_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
placeholder = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
|
||||
expected_conversation = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": f"{placeholder}\nAnalyze this image.",
|
||||
}
|
||||
]
|
||||
|
||||
assert conversation == expected_conversation
|
||||
_assert_mm_data_is_vision_chunk_input(mm_data, 1)
|
||||
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[None], modality="vision_chunk")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_parse_chat_messages_video_vision_chunk_async(
|
||||
kimi_k2_5_model_config,
|
||||
video_url,
|
||||
):
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "Analyze this video."},
|
||||
{
|
||||
"type": "video_url",
|
||||
"video_url": {"url": video_url},
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
conversation, mm_data, mm_uuids = await parse_chat_messages_async(
|
||||
messages,
|
||||
kimi_k2_5_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
placeholder = "<|kimi_k25_video_placeholder|>"
|
||||
expected_conversation = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": f"{placeholder}\nAnalyze this video.",
|
||||
}
|
||||
]
|
||||
|
||||
assert conversation == expected_conversation
|
||||
_assert_mm_data_is_vision_chunk_input(mm_data, 1)
|
||||
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[None], modality="vision_chunk")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_parse_chat_messages_image_vision_chunk_with_uuid_async(
|
||||
kimi_k2_5_model_config,
|
||||
image_url,
|
||||
):
|
||||
image_uuid = "image_123"
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "Analyze this image."},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": image_url},
|
||||
"uuid": image_uuid,
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
conversation, mm_data, mm_uuids = await parse_chat_messages_async(
|
||||
messages,
|
||||
kimi_k2_5_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
placeholder = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
|
||||
expected_conversation = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": f"{placeholder}\nAnalyze this image.",
|
||||
}
|
||||
]
|
||||
|
||||
assert conversation == expected_conversation
|
||||
_assert_mm_data_is_vision_chunk_input(mm_data, 1)
|
||||
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid], modality="vision_chunk")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_parse_chat_messages_video_vision_chunk_with_uuid_async(
|
||||
kimi_k2_5_model_config,
|
||||
video_url,
|
||||
):
|
||||
video_uuid = "video_456"
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "Analyze this video."},
|
||||
{
|
||||
"type": "video_url",
|
||||
"video_url": {"url": video_url},
|
||||
"uuid": video_uuid,
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
conversation, mm_data, mm_uuids = await parse_chat_messages_async(
|
||||
messages,
|
||||
kimi_k2_5_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
placeholder = "<|kimi_k25_video_placeholder|>"
|
||||
expected_conversation = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": f"{placeholder}\nAnalyze this video.",
|
||||
}
|
||||
]
|
||||
|
||||
assert conversation == expected_conversation
|
||||
_assert_mm_data_is_vision_chunk_input(mm_data, 1)
|
||||
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[video_uuid], modality="vision_chunk")
|
||||
|
||||
Reference in New Issue
Block a user