[Misc] Cleanup Kimi-K2.5's vision chunk modality entrypoints (#33157)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-01-29 17:46:02 +08:00
parent e01ff5c070
commit 3a92c6f3b5
7 changed files with 733 additions and 204 deletions
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -24,12 +24,25 @@ from vllm.multimodal.utils import (
 )
 from vllm.utils.serial_utils import tensor2base64

+KIMI_K2_5_MODEL_ID = "moonshotai/Kimi-K2.5"  # used by kimi_k2_5_model_config
 PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
 QWEN2AUDIO_MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"
 QWEN25OMNI_MODEL_ID = "Qwen/Qwen2.5-Omni-7B"
 MISTRAL_MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"


+@pytest.fixture(scope="function")
+def kimi_k2_5_model_config():
+    """ModelConfig for Kimi-K2.5 with limit_mm_per_prompt of 2 for "image"."""
+    mm_limits = {"image": 2}
+    return ModelConfig(
+        KIMI_K2_5_MODEL_ID,
+        runner="generate",
+        trust_remote_code=True,
+        limit_mm_per_prompt=mm_limits,
+    )
+
+
@pytest.fixture(scope="function")
 def phi3v_model_config():
    return ModelConfig(
@@ -163,6 +176,22 @@ def _assert_mm_data_is_image_input(
            assert image_data[i] is None


+def _assert_mm_data_is_vision_chunk_input(
+    mm_data: MultiModalDataDict | None,
+    vision_chunk_count: int,
+) -> None:
+    """Assert `mm_data` holds only a "vision_chunk" list of the given length."""
+    assert mm_data is not None
+    # "vision_chunk" must be the one and only modality key present.
+    assert set(mm_data.keys()) == {"vision_chunk"}
+
+    chunks = mm_data.get("vision_chunk")
+    assert chunks is not None
+    # Type and length are checked separately so a failure pinpoints which.
+    assert isinstance(chunks, list)
+    assert len(chunks) == vision_chunk_count
+
+
 def _assert_mm_uuids(
    mm_uuids: MultiModalUUIDDict | None,
    media_count: int,
@@ -2151,3 +2180,505 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
    ]
    _assert_mm_data_inputs(mm_data, {"audio": 1})
    _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[audio_uuid])
+
+
+def test_parse_chat_messages_image_vision_chunk(
+    kimi_k2_5_model_config,
+    image_url,
+):
+    """A text + image_url message yields one "vision_chunk" item (no uuid)."""
+    question = "Analyze this image."
+    text_part = {"type": "text", "text": question}
+    image_part = {"type": "image_url", "image_url": {"url": image_url}}
+    # The text part comes first here, yet the placeholder is expected before it.
+    messages = [{"role": "user", "content": [text_part, image_part]}]
+
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        messages,
+        kimi_k2_5_model_config,
+        content_format="string",
+    )
+
+    placeholder = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
+    expected_conversation = [
+        {
+            "role": "user",
+            "content": f"{placeholder}\n{question}",
+        }
+    ]
+    assert conversation == expected_conversation
+
+    _assert_mm_data_is_vision_chunk_input(mm_data, 1)
+    # No uuid was attached to the image part, so its slot is None.
+    _assert_mm_uuids(
+        mm_uuids,
+        1,
+        expected_uuids=[None],
+        modality="vision_chunk",
+    )
+
+
+def test_parse_chat_messages_video_vision_chunk(
+    kimi_k2_5_model_config,
+    video_url,
+):
+    """A text + video_url message yields one "vision_chunk" item (no uuid)."""
+    question = "Analyze this video."
+    text_part = {"type": "text", "text": question}
+    video_part = {"type": "video_url", "video_url": {"url": video_url}}
+    # The text part comes first here, yet the placeholder is expected before it.
+    messages = [{"role": "user", "content": [text_part, video_part]}]
+
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        messages,
+        kimi_k2_5_model_config,
+        content_format="string",
+    )
+
+    placeholder = "<|kimi_k25_video_placeholder|>"
+    expected_conversation = [
+        {
+            "role": "user",
+            "content": f"{placeholder}\n{question}",
+        }
+    ]
+    assert conversation == expected_conversation
+
+    _assert_mm_data_is_vision_chunk_input(mm_data, 1)
+    # No uuid was attached to the video part, so its slot is None.
+    _assert_mm_uuids(
+        mm_uuids,
+        1,
+        expected_uuids=[None],
+        modality="vision_chunk",
+    )
+
+
+def test_parse_chat_messages_image_vision_chunk_with_uuid(
+    kimi_k2_5_model_config,
+    image_url,
+):
+    """An image_url part's uuid is reported under the "vision_chunk" modality."""
+    image_uuid = "image_123"
+    question = "Analyze this image."
+    text_part = {"type": "text", "text": question}
+    image_part = {
+        "type": "image_url",
+        "image_url": {"url": image_url},
+        "uuid": image_uuid,
+    }
+    # The text part comes first here, yet the placeholder is expected before it.
+    messages = [{"role": "user", "content": [text_part, image_part]}]
+
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        messages,
+        kimi_k2_5_model_config,
+        content_format="string",
+    )
+
+    placeholder = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
+    expected_conversation = [
+        {"role": "user", "content": f"{placeholder}\n{question}"},
+    ]
+    assert conversation == expected_conversation
+
+    _assert_mm_data_is_vision_chunk_input(mm_data, 1)
+    # The uuid attached to the image part is expected back unchanged.
+    _assert_mm_uuids(
+        mm_uuids,
+        1,
+        expected_uuids=[image_uuid],
+        modality="vision_chunk",
+    )
+
+
+def test_parse_chat_messages_video_vision_chunk_with_uuid(
+    kimi_k2_5_model_config,
+    video_url,
+):
+    """A video_url part's uuid is reported under the "vision_chunk" modality."""
+    video_uuid = "video_456"
+    question = "Analyze this video."
+    text_part = {"type": "text", "text": question}
+    video_part = {
+        "type": "video_url",
+        "video_url": {"url": video_url},
+        "uuid": video_uuid,
+    }
+    # The text part comes first here, yet the placeholder is expected before it.
+    messages = [{"role": "user", "content": [text_part, video_part]}]
+
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        messages,
+        kimi_k2_5_model_config,
+        content_format="string",
+    )
+
+    placeholder = "<|kimi_k25_video_placeholder|>"
+    expected_conversation = [
+        {"role": "user", "content": f"{placeholder}\n{question}"},
+    ]
+    assert conversation == expected_conversation
+
+    _assert_mm_data_is_vision_chunk_input(mm_data, 1)
+    # The uuid attached to the video part is expected back unchanged.
+    _assert_mm_uuids(
+        mm_uuids,
+        1,
+        expected_uuids=[video_uuid],
+        modality="vision_chunk",
+    )
+
+
+def test_parse_chat_messages_mixed_vision_chunk(
+    kimi_k2_5_model_config,
+    image_url,
+    video_url,
+):
+    """Image and video parts in one message both become "vision_chunk" items."""
+    question = "Analyze this image and video."
+    text_part = {"type": "text", "text": question}
+    image_part = {"type": "image_url", "image_url": {"url": image_url}}
+    video_part = {"type": "video_url", "video_url": {"url": video_url}}
+    # Text is listed first here, yet both placeholders are expected before it.
+    messages = [
+        {
+            "role": "user",
+            "content": [text_part, image_part, video_part],
+        }
+    ]
+
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        messages,
+        kimi_k2_5_model_config,
+        content_format="string",
+    )
+
+    image_placeholder = (
+        "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
+    )
+    video_placeholder = "<|kimi_k25_video_placeholder|>"
+    expected_lines = [image_placeholder, video_placeholder, question]
+    expected_conversation = [
+        {
+            "role": "user",
+            "content": "\n".join(expected_lines),
+        }
+    ]
+    assert conversation == expected_conversation
+
+    _assert_mm_data_is_vision_chunk_input(mm_data, 2)
+    # Neither media part carries a uuid, so both slots are None.
+    _assert_mm_uuids(
+        mm_uuids,
+        2,
+        expected_uuids=[None, None],
+        modality="vision_chunk",
+    )
+
+
+def test_parse_chat_messages_mixed_vision_chunk_with_uuid(
+    kimi_k2_5_model_config,
+    image_url,
+    video_url,
+):
+    """Image and video uuids are reported in order under "vision_chunk"."""
+    image_uuid = "image_123"
+    video_uuid = "video_456"
+    question = "Analyze this image and video."
+    text_part = {"type": "text", "text": question}
+    image_part = {
+        "type": "image_url",
+        "image_url": {"url": image_url},
+        "uuid": image_uuid,
+    }
+    video_part = {
+        "type": "video_url",
+        "video_url": {"url": video_url},
+        "uuid": video_uuid,
+    }
+    messages = [
+        {"role": "user", "content": [text_part, image_part, video_part]},
+    ]
+
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        messages,
+        kimi_k2_5_model_config,
+        content_format="string",
+    )
+
+    image_placeholder = (
+        "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
+    )
+    video_placeholder = "<|kimi_k25_video_placeholder|>"
+    expected_lines = [image_placeholder, video_placeholder, question]
+    expected_conversation = [
+        {
+            "role": "user",
+            "content": "\n".join(expected_lines),
+        }
+    ]
+    assert conversation == expected_conversation
+
+    _assert_mm_data_is_vision_chunk_input(mm_data, 2)
+    # Uuids are expected back in request order: image first, then video.
+    _assert_mm_uuids(
+        mm_uuids,
+        2,
+        expected_uuids=[image_uuid, video_uuid],
+        modality="vision_chunk",
+    )
+
+
+@pytest.mark.asyncio
+async def test_parse_chat_messages_mixed_vision_chunk_async(
+    kimi_k2_5_model_config,
+    image_url,
+    video_url,
+):
+    """Async: image and video parts in one message become "vision_chunk" items."""
+    question = "Analyze this image and video."
+    text_part = {"type": "text", "text": question}
+    image_part = {"type": "image_url", "image_url": {"url": image_url}}
+    video_part = {"type": "video_url", "video_url": {"url": video_url}}
+    # Text is listed first here, yet both placeholders are expected before it.
+    messages = [
+        {
+            "role": "user",
+            "content": [text_part, image_part, video_part],
+        }
+    ]
+
+    conversation, mm_data, mm_uuids = await parse_chat_messages_async(
+        messages,
+        kimi_k2_5_model_config,
+        content_format="string",
+    )
+
+    image_placeholder = (
+        "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
+    )
+    video_placeholder = "<|kimi_k25_video_placeholder|>"
+    expected_lines = [image_placeholder, video_placeholder, question]
+    expected_conversation = [
+        {
+            "role": "user",
+            "content": "\n".join(expected_lines),
+        }
+    ]
+    assert conversation == expected_conversation
+
+    _assert_mm_data_is_vision_chunk_input(mm_data, 2)
+    # Neither media part carries a uuid, so both slots are None.
+    _assert_mm_uuids(
+        mm_uuids,
+        2,
+        expected_uuids=[None, None],
+        modality="vision_chunk",
+    )
+
+
+@pytest.mark.asyncio
+async def test_parse_chat_messages_mixed_vision_chunk_with_uuid_async(
+    kimi_k2_5_model_config,
+    image_url,
+    video_url,
+):
+    """Async: image and video uuids are reported in order under "vision_chunk"."""
+    image_uuid = "image_123"
+    video_uuid = "video_456"
+    question = "Analyze this image and video."
+    text_part = {"type": "text", "text": question}
+    image_part = {
+        "type": "image_url",
+        "image_url": {"url": image_url},
+        "uuid": image_uuid,
+    }
+    video_part = {
+        "type": "video_url",
+        "video_url": {"url": video_url},
+        "uuid": video_uuid,
+    }
+    messages = [
+        {"role": "user", "content": [text_part, image_part, video_part]},
+    ]
+
+    conversation, mm_data, mm_uuids = await parse_chat_messages_async(
+        messages,
+        kimi_k2_5_model_config,
+        content_format="string",
+    )
+
+    image_placeholder = (
+        "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
+    )
+    video_placeholder = "<|kimi_k25_video_placeholder|>"
+    expected_lines = [image_placeholder, video_placeholder, question]
+    expected_conversation = [
+        {
+            "role": "user",
+            "content": "\n".join(expected_lines),
+        }
+    ]
+    assert conversation == expected_conversation
+
+    _assert_mm_data_is_vision_chunk_input(mm_data, 2)
+    # Uuids are expected back in request order: image first, then video.
+    _assert_mm_uuids(
+        mm_uuids,
+        2,
+        expected_uuids=[image_uuid, video_uuid],
+        modality="vision_chunk",
+    )
+
+
+@pytest.mark.asyncio
+async def test_parse_chat_messages_image_vision_chunk_async(
+    kimi_k2_5_model_config,
+    image_url,
+):
+    """Async: a text + image_url message yields one "vision_chunk" item."""
+    question = "Analyze this image."
+    text_part = {"type": "text", "text": question}
+    image_part = {"type": "image_url", "image_url": {"url": image_url}}
+    # The text part comes first here, yet the placeholder is expected before it.
+    messages = [{"role": "user", "content": [text_part, image_part]}]
+
+    conversation, mm_data, mm_uuids = await parse_chat_messages_async(
+        messages,
+        kimi_k2_5_model_config,
+        content_format="string",
+    )
+
+    placeholder = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
+    expected_conversation = [
+        {
+            "role": "user",
+            "content": f"{placeholder}\n{question}",
+        }
+    ]
+    assert conversation == expected_conversation
+
+    _assert_mm_data_is_vision_chunk_input(mm_data, 1)
+    # No uuid was attached to the image part, so its slot is None.
+    _assert_mm_uuids(
+        mm_uuids,
+        1,
+        expected_uuids=[None],
+        modality="vision_chunk",
+    )
+
+
+@pytest.mark.asyncio
+async def test_parse_chat_messages_video_vision_chunk_async(
+    kimi_k2_5_model_config,
+    video_url,
+):
+    """Async: a text + video_url message yields one "vision_chunk" item."""
+    question = "Analyze this video."
+    text_part = {"type": "text", "text": question}
+    video_part = {"type": "video_url", "video_url": {"url": video_url}}
+    # The text part comes first here, yet the placeholder is expected before it.
+    messages = [{"role": "user", "content": [text_part, video_part]}]
+
+    conversation, mm_data, mm_uuids = await parse_chat_messages_async(
+        messages,
+        kimi_k2_5_model_config,
+        content_format="string",
+    )
+
+    placeholder = "<|kimi_k25_video_placeholder|>"
+    expected_conversation = [
+        {
+            "role": "user",
+            "content": f"{placeholder}\n{question}",
+        }
+    ]
+    assert conversation == expected_conversation
+
+    _assert_mm_data_is_vision_chunk_input(mm_data, 1)
+    # No uuid was attached to the video part, so its slot is None.
+    _assert_mm_uuids(
+        mm_uuids,
+        1,
+        expected_uuids=[None],
+        modality="vision_chunk",
+    )
+
+
+@pytest.mark.asyncio
+async def test_parse_chat_messages_image_vision_chunk_with_uuid_async(
+    kimi_k2_5_model_config,
+    image_url,
+):
+    """Async: an image_url uuid is reported under the "vision_chunk" modality."""
+    image_uuid = "image_123"
+    question = "Analyze this image."
+    text_part = {"type": "text", "text": question}
+    image_part = {
+        "type": "image_url",
+        "image_url": {"url": image_url},
+        "uuid": image_uuid,
+    }
+    # The text part comes first here, yet the placeholder is expected before it.
+    messages = [{"role": "user", "content": [text_part, image_part]}]
+
+    conversation, mm_data, mm_uuids = await parse_chat_messages_async(
+        messages,
+        kimi_k2_5_model_config,
+        content_format="string",
+    )
+
+    placeholder = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
+    expected_conversation = [
+        {"role": "user", "content": f"{placeholder}\n{question}"},
+    ]
+    assert conversation == expected_conversation
+
+    _assert_mm_data_is_vision_chunk_input(mm_data, 1)
+    # The uuid attached to the image part is expected back unchanged.
+    _assert_mm_uuids(
+        mm_uuids,
+        1,
+        expected_uuids=[image_uuid],
+        modality="vision_chunk",
+    )
+
+
+@pytest.mark.asyncio
+async def test_parse_chat_messages_video_vision_chunk_with_uuid_async(
+    kimi_k2_5_model_config,
+    video_url,
+):
+    """Async: a video_url uuid is reported under the "vision_chunk" modality."""
+    video_uuid = "video_456"
+    question = "Analyze this video."
+    text_part = {"type": "text", "text": question}
+    video_part = {
+        "type": "video_url",
+        "video_url": {"url": video_url},
+        "uuid": video_uuid,
+    }
+    # The text part comes first here, yet the placeholder is expected before it.
+    messages = [{"role": "user", "content": [text_part, video_part]}]
+
+    conversation, mm_data, mm_uuids = await parse_chat_messages_async(
+        messages,
+        kimi_k2_5_model_config,
+        content_format="string",
+    )
+
+    placeholder = "<|kimi_k25_video_placeholder|>"
+    expected_conversation = [
+        {"role": "user", "content": f"{placeholder}\n{question}"},
+    ]
+    assert conversation == expected_conversation
+
+    _assert_mm_data_is_vision_chunk_input(mm_data, 1)
+    # The uuid attached to the video part is expected back unchanged.
+    _assert_mm_uuids(
+        mm_uuids,
+        1,
+        expected_uuids=[video_uuid],
+        modality="vision_chunk",
+    )