[Misc] Cleanup Kimi-K2.5's vision chunk modality entrypoints (#33157)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
Isotr0py
2026-01-29 17:46:02 +08:00
committed by GitHub
parent e01ff5c070
commit 3a92c6f3b5
7 changed files with 733 additions and 204 deletions

View File

@@ -24,12 +24,25 @@ from vllm.multimodal.utils import (
)
from vllm.utils.serial_utils import tensor2base64
KIMI_K2_5_MODEL_ID = "moonshotai/Kimi-K2.5"
PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
QWEN2AUDIO_MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"
QWEN25OMNI_MODEL_ID = "Qwen/Qwen2.5-Omni-7B"
MISTRAL_MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
@pytest.fixture(scope="function")
def kimi_k2_5_model_config():
return ModelConfig(
KIMI_K2_5_MODEL_ID,
runner="generate",
trust_remote_code=True,
limit_mm_per_prompt={
"image": 2,
},
)
@pytest.fixture(scope="function")
def phi3v_model_config():
return ModelConfig(
@@ -163,6 +176,22 @@ def _assert_mm_data_is_image_input(
assert image_data[i] is None
def _assert_mm_data_is_vision_chunk_input(
mm_data: MultiModalDataDict | None,
vision_chunk_count: int,
) -> None:
assert mm_data is not None
assert set(mm_data.keys()) == {"vision_chunk"}
vision_chunk_data = mm_data.get("vision_chunk")
assert vision_chunk_data is not None
assert (
isinstance(vision_chunk_data, list)
and len(vision_chunk_data) == vision_chunk_count
)
def _assert_mm_uuids(
mm_uuids: MultiModalUUIDDict | None,
media_count: int,
@@ -2151,3 +2180,505 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
]
_assert_mm_data_inputs(mm_data, {"audio": 1})
_assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[audio_uuid])
def test_parse_chat_messages_image_vision_chunk(
kimi_k2_5_model_config,
image_url,
):
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Analyze this image."},
{
"type": "image_url",
"image_url": {"url": image_url},
},
],
}
]
conversation, mm_data, mm_uuids = parse_chat_messages(
messages,
kimi_k2_5_model_config,
content_format="string",
)
placeholder = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
expected_conversation = [
{
"role": "user",
"content": f"{placeholder}\nAnalyze this image.",
}
]
assert conversation == expected_conversation
_assert_mm_data_is_vision_chunk_input(mm_data, 1)
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[None], modality="vision_chunk")
def test_parse_chat_messages_video_vision_chunk(
kimi_k2_5_model_config,
video_url,
):
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Analyze this video."},
{
"type": "video_url",
"video_url": {"url": video_url},
},
],
}
]
conversation, mm_data, mm_uuids = parse_chat_messages(
messages,
kimi_k2_5_model_config,
content_format="string",
)
placeholder = "<|kimi_k25_video_placeholder|>"
expected_conversation = [
{
"role": "user",
"content": f"{placeholder}\nAnalyze this video.",
}
]
assert conversation == expected_conversation
_assert_mm_data_is_vision_chunk_input(mm_data, 1)
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[None], modality="vision_chunk")
def test_parse_chat_messages_image_vision_chunk_with_uuid(
kimi_k2_5_model_config,
image_url,
):
image_uuid = "image_123"
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Analyze this image."},
{
"type": "image_url",
"image_url": {"url": image_url},
"uuid": image_uuid,
},
],
}
]
conversation, mm_data, mm_uuids = parse_chat_messages(
messages,
kimi_k2_5_model_config,
content_format="string",
)
placeholder = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
expected_conversation = [
{
"role": "user",
"content": f"{placeholder}\nAnalyze this image.",
}
]
assert conversation == expected_conversation
_assert_mm_data_is_vision_chunk_input(mm_data, 1)
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid], modality="vision_chunk")
def test_parse_chat_messages_video_vision_chunk_with_uuid(
kimi_k2_5_model_config,
video_url,
):
video_uuid = "video_456"
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Analyze this video."},
{
"type": "video_url",
"video_url": {"url": video_url},
"uuid": video_uuid,
},
],
}
]
conversation, mm_data, mm_uuids = parse_chat_messages(
messages,
kimi_k2_5_model_config,
content_format="string",
)
placeholder = "<|kimi_k25_video_placeholder|>"
expected_conversation = [
{
"role": "user",
"content": f"{placeholder}\nAnalyze this video.",
}
]
assert conversation == expected_conversation
_assert_mm_data_is_vision_chunk_input(mm_data, 1)
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[video_uuid], modality="vision_chunk")
def test_parse_chat_messages_mixed_vision_chunk(
kimi_k2_5_model_config,
image_url,
video_url,
):
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Analyze this image and video."},
{
"type": "image_url",
"image_url": {"url": image_url},
},
{
"type": "video_url",
"video_url": {"url": video_url},
},
],
}
]
conversation, mm_data, mm_uuids = parse_chat_messages(
messages,
kimi_k2_5_model_config,
content_format="string",
)
image_placeholder = (
"<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
)
video_placeholder = "<|kimi_k25_video_placeholder|>"
expected_conversation = [
{
"role": "user",
"content": (
f"{image_placeholder}\n{video_placeholder}\n"
"Analyze this image and video."
),
}
]
assert conversation == expected_conversation
_assert_mm_data_is_vision_chunk_input(mm_data, 2)
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None], modality="vision_chunk")
def test_parse_chat_messages_mixed_vision_chunk_with_uuid(
kimi_k2_5_model_config,
image_url,
video_url,
):
image_uuid = "image_123"
video_uuid = "video_456"
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Analyze this image and video."},
{
"type": "image_url",
"image_url": {"url": image_url},
"uuid": image_uuid,
},
{
"type": "video_url",
"video_url": {"url": video_url},
"uuid": video_uuid,
},
],
}
]
conversation, mm_data, mm_uuids = parse_chat_messages(
messages,
kimi_k2_5_model_config,
content_format="string",
)
image_placeholder = (
"<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
)
video_placeholder = "<|kimi_k25_video_placeholder|>"
expected_conversation = [
{
"role": "user",
"content": (
f"{image_placeholder}\n{video_placeholder}\n"
"Analyze this image and video."
),
}
]
assert conversation == expected_conversation
_assert_mm_data_is_vision_chunk_input(mm_data, 2)
_assert_mm_uuids(
mm_uuids, 2, expected_uuids=[image_uuid, video_uuid], modality="vision_chunk"
)
@pytest.mark.asyncio
async def test_parse_chat_messages_mixed_vision_chunk_async(
kimi_k2_5_model_config,
image_url,
video_url,
):
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Analyze this image and video."},
{
"type": "image_url",
"image_url": {"url": image_url},
},
{
"type": "video_url",
"video_url": {"url": video_url},
},
],
}
]
conversation, mm_data, mm_uuids = await parse_chat_messages_async(
messages,
kimi_k2_5_model_config,
content_format="string",
)
image_placeholder = (
"<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
)
video_placeholder = "<|kimi_k25_video_placeholder|>"
expected_conversation = [
{
"role": "user",
"content": (
f"{image_placeholder}\n{video_placeholder}\n"
"Analyze this image and video."
),
}
]
assert conversation == expected_conversation
_assert_mm_data_is_vision_chunk_input(mm_data, 2)
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None], modality="vision_chunk")
@pytest.mark.asyncio
async def test_parse_chat_messages_mixed_vision_chunk_with_uuid_async(
kimi_k2_5_model_config,
image_url,
video_url,
):
image_uuid = "image_123"
video_uuid = "video_456"
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Analyze this image and video."},
{
"type": "image_url",
"image_url": {"url": image_url},
"uuid": image_uuid,
},
{
"type": "video_url",
"video_url": {"url": video_url},
"uuid": video_uuid,
},
],
}
]
conversation, mm_data, mm_uuids = await parse_chat_messages_async(
messages,
kimi_k2_5_model_config,
content_format="string",
)
image_placeholder = (
"<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
)
video_placeholder = "<|kimi_k25_video_placeholder|>"
expected_conversation = [
{
"role": "user",
"content": (
f"{image_placeholder}\n{video_placeholder}\n"
"Analyze this image and video."
),
}
]
assert conversation == expected_conversation
_assert_mm_data_is_vision_chunk_input(mm_data, 2)
_assert_mm_uuids(
mm_uuids, 2, expected_uuids=[image_uuid, video_uuid], modality="vision_chunk"
)
@pytest.mark.asyncio
async def test_parse_chat_messages_image_vision_chunk_async(
kimi_k2_5_model_config,
image_url,
):
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Analyze this image."},
{
"type": "image_url",
"image_url": {"url": image_url},
},
],
}
]
conversation, mm_data, mm_uuids = await parse_chat_messages_async(
messages,
kimi_k2_5_model_config,
content_format="string",
)
placeholder = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
expected_conversation = [
{
"role": "user",
"content": f"{placeholder}\nAnalyze this image.",
}
]
assert conversation == expected_conversation
_assert_mm_data_is_vision_chunk_input(mm_data, 1)
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[None], modality="vision_chunk")
@pytest.mark.asyncio
async def test_parse_chat_messages_video_vision_chunk_async(
kimi_k2_5_model_config,
video_url,
):
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Analyze this video."},
{
"type": "video_url",
"video_url": {"url": video_url},
},
],
}
]
conversation, mm_data, mm_uuids = await parse_chat_messages_async(
messages,
kimi_k2_5_model_config,
content_format="string",
)
placeholder = "<|kimi_k25_video_placeholder|>"
expected_conversation = [
{
"role": "user",
"content": f"{placeholder}\nAnalyze this video.",
}
]
assert conversation == expected_conversation
_assert_mm_data_is_vision_chunk_input(mm_data, 1)
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[None], modality="vision_chunk")
@pytest.mark.asyncio
async def test_parse_chat_messages_image_vision_chunk_with_uuid_async(
kimi_k2_5_model_config,
image_url,
):
image_uuid = "image_123"
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Analyze this image."},
{
"type": "image_url",
"image_url": {"url": image_url},
"uuid": image_uuid,
},
],
}
]
conversation, mm_data, mm_uuids = await parse_chat_messages_async(
messages,
kimi_k2_5_model_config,
content_format="string",
)
placeholder = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
expected_conversation = [
{
"role": "user",
"content": f"{placeholder}\nAnalyze this image.",
}
]
assert conversation == expected_conversation
_assert_mm_data_is_vision_chunk_input(mm_data, 1)
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid], modality="vision_chunk")
@pytest.mark.asyncio
async def test_parse_chat_messages_video_vision_chunk_with_uuid_async(
kimi_k2_5_model_config,
video_url,
):
video_uuid = "video_456"
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Analyze this video."},
{
"type": "video_url",
"video_url": {"url": video_url},
"uuid": video_uuid,
},
],
}
]
conversation, mm_data, mm_uuids = await parse_chat_messages_async(
messages,
kimi_k2_5_model_config,
content_format="string",
)
placeholder = "<|kimi_k25_video_placeholder|>"
expected_conversation = [
{
"role": "user",
"content": f"{placeholder}\nAnalyze this video.",
}
]
assert conversation == expected_conversation
_assert_mm_data_is_vision_chunk_input(mm_data, 1)
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[video_uuid], modality="vision_chunk")