[Misc] Cleanup Kimi-K2.5's vision chunk modality entrypoints (#33157)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
@@ -24,12 +24,25 @@ from vllm.multimodal.utils import (
|
||||
)
|
||||
from vllm.utils.serial_utils import tensor2base64
|
||||
|
||||
KIMI_K2_5_MODEL_ID = "moonshotai/Kimi-K2.5"
|
||||
PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
|
||||
QWEN2AUDIO_MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"
|
||||
QWEN25OMNI_MODEL_ID = "Qwen/Qwen2.5-Omni-7B"
|
||||
MISTRAL_MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
def kimi_k2_5_model_config():
    """Build a fresh Kimi-K2.5 ``ModelConfig`` for each test function.

    The config selects the "generate" runner, trusts remote code, and caps
    every prompt at two "image" items.
    """
    mm_limits = {"image": 2}
    return ModelConfig(
        KIMI_K2_5_MODEL_ID,
        runner="generate",
        trust_remote_code=True,
        limit_mm_per_prompt=mm_limits,
    )
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def phi3v_model_config():
|
||||
return ModelConfig(
|
||||
@@ -163,6 +176,22 @@ def _assert_mm_data_is_image_input(
|
||||
assert image_data[i] is None
|
||||
|
||||
|
||||
def _assert_mm_data_is_vision_chunk_input(
|
||||
mm_data: MultiModalDataDict | None,
|
||||
vision_chunk_count: int,
|
||||
) -> None:
|
||||
assert mm_data is not None
|
||||
assert set(mm_data.keys()) == {"vision_chunk"}
|
||||
|
||||
vision_chunk_data = mm_data.get("vision_chunk")
|
||||
assert vision_chunk_data is not None
|
||||
|
||||
assert (
|
||||
isinstance(vision_chunk_data, list)
|
||||
and len(vision_chunk_data) == vision_chunk_count
|
||||
)
|
||||
|
||||
|
||||
def _assert_mm_uuids(
|
||||
mm_uuids: MultiModalUUIDDict | None,
|
||||
media_count: int,
|
||||
@@ -2151,3 +2180,505 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
|
||||
]
|
||||
_assert_mm_data_inputs(mm_data, {"audio": 1})
|
||||
_assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[audio_uuid])
|
||||
|
||||
|
||||
def test_parse_chat_messages_image_vision_chunk(
    kimi_k2_5_model_config,
    image_url,
):
    """One image_url part should yield one "vision_chunk" item, UUID list [None]."""
    user_parts = [
        {"type": "text", "text": "Analyze this image."},
        {"type": "image_url", "image_url": {"url": image_url}},
    ]

    conversation, mm_data, mm_uuids = parse_chat_messages(
        [{"role": "user", "content": user_parts}],
        kimi_k2_5_model_config,
        content_format="string",
    )

    # In the expected string content the placeholder precedes the text.
    image_tag = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
    expected_content = "\n".join([image_tag, "Analyze this image."])

    assert conversation == [{"role": "user", "content": expected_content}]
    _assert_mm_data_is_vision_chunk_input(mm_data, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None], modality="vision_chunk")
|
||||
|
||||
|
||||
def test_parse_chat_messages_video_vision_chunk(
    kimi_k2_5_model_config,
    video_url,
):
    """One video_url part should yield one "vision_chunk" item, UUID list [None]."""
    user_parts = [
        {"type": "text", "text": "Analyze this video."},
        {"type": "video_url", "video_url": {"url": video_url}},
    ]

    conversation, mm_data, mm_uuids = parse_chat_messages(
        [{"role": "user", "content": user_parts}],
        kimi_k2_5_model_config,
        content_format="string",
    )

    # In the expected string content the placeholder precedes the text.
    video_tag = "<|kimi_k25_video_placeholder|>"
    expected_content = "\n".join([video_tag, "Analyze this video."])

    assert conversation == [{"role": "user", "content": expected_content}]
    _assert_mm_data_is_vision_chunk_input(mm_data, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None], modality="vision_chunk")
|
||||
|
||||
|
||||
def test_parse_chat_messages_image_vision_chunk_with_uuid(
    kimi_k2_5_model_config,
    image_url,
):
    """A user-supplied image UUID should be reported under "vision_chunk"."""
    image_uuid = "image_123"
    user_parts = [
        {"type": "text", "text": "Analyze this image."},
        {
            "type": "image_url",
            "image_url": {"url": image_url},
            "uuid": image_uuid,
        },
    ]

    conversation, mm_data, mm_uuids = parse_chat_messages(
        [{"role": "user", "content": user_parts}],
        kimi_k2_5_model_config,
        content_format="string",
    )

    image_tag = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
    expected_content = "\n".join([image_tag, "Analyze this image."])

    assert conversation == [{"role": "user", "content": expected_content}]
    _assert_mm_data_is_vision_chunk_input(mm_data, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid], modality="vision_chunk")
|
||||
|
||||
|
||||
def test_parse_chat_messages_video_vision_chunk_with_uuid(
    kimi_k2_5_model_config,
    video_url,
):
    """A user-supplied video UUID should be reported under "vision_chunk"."""
    video_uuid = "video_456"
    user_parts = [
        {"type": "text", "text": "Analyze this video."},
        {
            "type": "video_url",
            "video_url": {"url": video_url},
            "uuid": video_uuid,
        },
    ]

    conversation, mm_data, mm_uuids = parse_chat_messages(
        [{"role": "user", "content": user_parts}],
        kimi_k2_5_model_config,
        content_format="string",
    )

    video_tag = "<|kimi_k25_video_placeholder|>"
    expected_content = "\n".join([video_tag, "Analyze this video."])

    assert conversation == [{"role": "user", "content": expected_content}]
    _assert_mm_data_is_vision_chunk_input(mm_data, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[video_uuid], modality="vision_chunk")
|
||||
|
||||
|
||||
def test_parse_chat_messages_mixed_vision_chunk(
    kimi_k2_5_model_config,
    image_url,
    video_url,
):
    """An image plus a video should yield two "vision_chunk" items."""
    user_parts = [
        {"type": "text", "text": "Analyze this image and video."},
        {"type": "image_url", "image_url": {"url": image_url}},
        {"type": "video_url", "video_url": {"url": video_url}},
    ]

    conversation, mm_data, mm_uuids = parse_chat_messages(
        [{"role": "user", "content": user_parts}],
        kimi_k2_5_model_config,
        content_format="string",
    )

    # Expected order: image placeholder, video placeholder, then the text.
    image_tag = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
    video_tag = "<|kimi_k25_video_placeholder|>"
    expected_content = "\n".join(
        [image_tag, video_tag, "Analyze this image and video."]
    )

    assert conversation == [{"role": "user", "content": expected_content}]
    _assert_mm_data_is_vision_chunk_input(mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None], modality="vision_chunk")
|
||||
|
||||
|
||||
def test_parse_chat_messages_mixed_vision_chunk_with_uuid(
    kimi_k2_5_model_config,
    image_url,
    video_url,
):
    """Image and video UUIDs should both be reported, in request order."""
    image_uuid = "image_123"
    video_uuid = "video_456"
    user_parts = [
        {"type": "text", "text": "Analyze this image and video."},
        {
            "type": "image_url",
            "image_url": {"url": image_url},
            "uuid": image_uuid,
        },
        {
            "type": "video_url",
            "video_url": {"url": video_url},
            "uuid": video_uuid,
        },
    ]

    conversation, mm_data, mm_uuids = parse_chat_messages(
        [{"role": "user", "content": user_parts}],
        kimi_k2_5_model_config,
        content_format="string",
    )

    # Expected order: image placeholder, video placeholder, then the text.
    image_tag = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
    video_tag = "<|kimi_k25_video_placeholder|>"
    expected_content = "\n".join(
        [image_tag, video_tag, "Analyze this image and video."]
    )

    assert conversation == [{"role": "user", "content": expected_content}]
    _assert_mm_data_is_vision_chunk_input(mm_data, 2)
    _assert_mm_uuids(
        mm_uuids, 2, expected_uuids=[image_uuid, video_uuid], modality="vision_chunk"
    )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_parse_chat_messages_mixed_vision_chunk_async(
    kimi_k2_5_model_config,
    image_url,
    video_url,
):
    """Async parse: an image plus a video should yield two "vision_chunk" items."""
    user_parts = [
        {"type": "text", "text": "Analyze this image and video."},
        {"type": "image_url", "image_url": {"url": image_url}},
        {"type": "video_url", "video_url": {"url": video_url}},
    ]

    conversation, mm_data, mm_uuids = await parse_chat_messages_async(
        [{"role": "user", "content": user_parts}],
        kimi_k2_5_model_config,
        content_format="string",
    )

    # Expected order: image placeholder, video placeholder, then the text.
    image_tag = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
    video_tag = "<|kimi_k25_video_placeholder|>"
    expected_content = "\n".join(
        [image_tag, video_tag, "Analyze this image and video."]
    )

    assert conversation == [{"role": "user", "content": expected_content}]
    _assert_mm_data_is_vision_chunk_input(mm_data, 2)
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None], modality="vision_chunk")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_parse_chat_messages_mixed_vision_chunk_with_uuid_async(
    kimi_k2_5_model_config,
    image_url,
    video_url,
):
    """Async parse: image and video UUIDs should be reported in request order."""
    image_uuid = "image_123"
    video_uuid = "video_456"
    user_parts = [
        {"type": "text", "text": "Analyze this image and video."},
        {
            "type": "image_url",
            "image_url": {"url": image_url},
            "uuid": image_uuid,
        },
        {
            "type": "video_url",
            "video_url": {"url": video_url},
            "uuid": video_uuid,
        },
    ]

    conversation, mm_data, mm_uuids = await parse_chat_messages_async(
        [{"role": "user", "content": user_parts}],
        kimi_k2_5_model_config,
        content_format="string",
    )

    # Expected order: image placeholder, video placeholder, then the text.
    image_tag = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
    video_tag = "<|kimi_k25_video_placeholder|>"
    expected_content = "\n".join(
        [image_tag, video_tag, "Analyze this image and video."]
    )

    assert conversation == [{"role": "user", "content": expected_content}]
    _assert_mm_data_is_vision_chunk_input(mm_data, 2)
    _assert_mm_uuids(
        mm_uuids, 2, expected_uuids=[image_uuid, video_uuid], modality="vision_chunk"
    )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_parse_chat_messages_image_vision_chunk_async(
    kimi_k2_5_model_config,
    image_url,
):
    """Async parse: one image_url part should yield one "vision_chunk" item."""
    user_parts = [
        {"type": "text", "text": "Analyze this image."},
        {"type": "image_url", "image_url": {"url": image_url}},
    ]

    conversation, mm_data, mm_uuids = await parse_chat_messages_async(
        [{"role": "user", "content": user_parts}],
        kimi_k2_5_model_config,
        content_format="string",
    )

    image_tag = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
    expected_content = "\n".join([image_tag, "Analyze this image."])

    assert conversation == [{"role": "user", "content": expected_content}]
    _assert_mm_data_is_vision_chunk_input(mm_data, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None], modality="vision_chunk")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_parse_chat_messages_video_vision_chunk_async(
    kimi_k2_5_model_config,
    video_url,
):
    """Async parse: one video_url part should yield one "vision_chunk" item."""
    user_parts = [
        {"type": "text", "text": "Analyze this video."},
        {"type": "video_url", "video_url": {"url": video_url}},
    ]

    conversation, mm_data, mm_uuids = await parse_chat_messages_async(
        [{"role": "user", "content": user_parts}],
        kimi_k2_5_model_config,
        content_format="string",
    )

    video_tag = "<|kimi_k25_video_placeholder|>"
    expected_content = "\n".join([video_tag, "Analyze this video."])

    assert conversation == [{"role": "user", "content": expected_content}]
    _assert_mm_data_is_vision_chunk_input(mm_data, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None], modality="vision_chunk")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_parse_chat_messages_image_vision_chunk_with_uuid_async(
    kimi_k2_5_model_config,
    image_url,
):
    """Async parse: a user-supplied image UUID is reported under "vision_chunk"."""
    image_uuid = "image_123"
    user_parts = [
        {"type": "text", "text": "Analyze this image."},
        {
            "type": "image_url",
            "image_url": {"url": image_url},
            "uuid": image_uuid,
        },
    ]

    conversation, mm_data, mm_uuids = await parse_chat_messages_async(
        [{"role": "user", "content": user_parts}],
        kimi_k2_5_model_config,
        content_format="string",
    )

    image_tag = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
    expected_content = "\n".join([image_tag, "Analyze this image."])

    assert conversation == [{"role": "user", "content": expected_content}]
    _assert_mm_data_is_vision_chunk_input(mm_data, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid], modality="vision_chunk")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_parse_chat_messages_video_vision_chunk_with_uuid_async(
    kimi_k2_5_model_config,
    video_url,
):
    """Async parse: a user-supplied video UUID is reported under "vision_chunk"."""
    video_uuid = "video_456"
    user_parts = [
        {"type": "text", "text": "Analyze this video."},
        {
            "type": "video_url",
            "video_url": {"url": video_url},
            "uuid": video_uuid,
        },
    ]

    conversation, mm_data, mm_uuids = await parse_chat_messages_async(
        [{"role": "user", "content": user_parts}],
        kimi_k2_5_model_config,
        content_format="string",
    )

    video_tag = "<|kimi_k25_video_placeholder|>"
    expected_content = "\n".join([video_tag, "Analyze this video."])

    assert conversation == [{"role": "user", "content": expected_content}]
    _assert_mm_data_is_vision_chunk_input(mm_data, 1)
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[video_uuid], modality="vision_chunk")
|
||||
|
||||
@@ -411,6 +411,11 @@ def test_processing_correctness(
|
||||
"Qwen-VL tokenizer requires downloading a font file from "
|
||||
"servers that often refuse connections in CI"
|
||||
)
|
||||
if model_id == "moonshotai/Kimi-K2.5":
|
||||
# FIXME(Isaac): Fix Kimi-K2.5's offline inference about vision chunks.
|
||||
pytest.skip(
|
||||
"Kimi-K2.5's offline inference has issues about vision chunks. Fix later."
|
||||
)
|
||||
|
||||
_test_processing_correctness(
|
||||
model_id,
|
||||
|
||||
@@ -155,6 +155,12 @@ def initialize_dummy_model(
|
||||
@create_new_process_for_each_test()
|
||||
@pytest.mark.parametrize("model_id", get_model_ids_to_test())
|
||||
def test_model_tensor_schema(model_id: str):
|
||||
if model_id == "moonshotai/Kimi-K2.5":
|
||||
# FIXME(Isotr0py): Fix Kimi-K2.5's offline inference about vision chunks.
|
||||
pytest.skip(
|
||||
"Kimi-K2.5's offline inference has issues about vision chunks. Fix later."
|
||||
)
|
||||
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(
|
||||
|
||||
@@ -786,7 +786,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
"KimiK25ForConditionalGeneration": _HfExamplesInfo(
|
||||
"moonshotai/Kimi-K2.5",
|
||||
trust_remote_code=True,
|
||||
is_available_online=False,
|
||||
),
|
||||
"LightOnOCRForConditionalGeneration": _HfExamplesInfo(
|
||||
"lightonai/LightOnOCR-1B-1025"
|
||||
|
||||
@@ -454,78 +454,6 @@ def _get_embeds_data(
|
||||
raise NotImplementedError(type(data_items))
|
||||
|
||||
|
||||
def rebuild_mm_uuids_from_mm_data(
    mm_uuids: MultiModalUUIDDict,
    mm_data: MultiModalDataDict,
) -> MultiModalUUIDDict:
    """Refresh the "vision_chunk" UUID list from the processed chunk items.

    Each chunk dict may carry its own "uuid"; the non-``None`` values are
    gathered in order and stored under "vision_chunk".

    Args:
        mm_uuids: UUID mapping produced before chunk processing.
        mm_data: Processed multimodal data, possibly with "vision_chunk" items.

    Returns:
        ``mm_uuids`` itself when ``mm_data`` has no "vision_chunk" entry.
        Otherwise a shallow copy, with "vision_chunk" overwritten only if at
        least one chunk carried a UUID.
    """
    chunks = mm_data.get("vision_chunk")
    if chunks is None:
        return mm_uuids

    # Copy first so the caller's mapping is never mutated.
    rebuilt = dict(mm_uuids)

    collected = []
    for chunk in chunks:
        # vision_chunk entries are expected to be dict-shaped chunk records.
        assert isinstance(chunk, dict)
        chunk_uuid = chunk.get("uuid")
        if chunk_uuid is None:
            continue
        collected.append(chunk_uuid)

    if collected:
        rebuilt["vision_chunk"] = collected
    return rebuilt
|
||||
|
||||
|
||||
def build_video_prompts_from_mm_data(
    mm_data: MultiModalDataDict,
) -> list[str]:
    """Assemble one prompt string per video from "vision_chunk" items.

    Only items whose "type" is "video_chunk" contribute. Their "prompt"
    strings (default "") are concatenated per "video_idx" (default 0) in the
    order the chunks appear.

    Args:
        mm_data: Processed multimodal data, possibly with "vision_chunk" items.

    Returns:
        The per-video prompts ordered by ascending ``video_idx``; an empty
        list when there is no "vision_chunk" entry.
    """
    chunks = mm_data.get("vision_chunk")
    if chunks is None:
        return []

    prompts_by_video: dict[int, list[str]] = {}
    for chunk in chunks:
        # vision_chunk entries are expected to be dict-shaped chunk records.
        assert isinstance(chunk, dict)
        if chunk.get("type") != "video_chunk":
            continue
        bucket = prompts_by_video.setdefault(chunk.get("video_idx", 0), [])
        bucket.append(chunk.get("prompt", ""))

    return ["".join(prompts_by_video[idx]) for idx in sorted(prompts_by_video)]
|
||||
|
||||
|
||||
class BaseMultiModalItemTracker(ABC, Generic[_T]):
|
||||
"""
|
||||
Tracks multi-modal items in a given request and ensures that the number
|
||||
@@ -616,10 +544,72 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
def _resolve_vision_chunk_items(
|
||||
vision_chunk_items: list[tuple[object, str | None]],
|
||||
mm_processor: BaseMultiModalProcessor,
|
||||
vision_chunks_modality_order: list[str],
|
||||
):
|
||||
# Process vision_chunk items - extract from (data, modality) tuples
|
||||
# and convert to VisionChunk types with proper UUID handling
|
||||
vision_chunks_uuids = [uuid for data, uuid in vision_chunk_items]
|
||||
|
||||
assert len(vision_chunk_items) == len(vision_chunks_modality_order), (
|
||||
f"vision_chunk items ({len(vision_chunk_items)}) and "
|
||||
f"modality_order ({len(vision_chunks_modality_order)}) must have same length"
|
||||
)
|
||||
|
||||
processed_chunks: list[VisionChunk] = []
|
||||
video_idx = 0
|
||||
for inner_modality, (data, uuid) in zip(
|
||||
vision_chunks_modality_order, vision_chunk_items
|
||||
):
|
||||
if inner_modality == "image":
|
||||
# Cast data to proper type for image
|
||||
# Use .media (PIL.Image) directly to avoid redundant
|
||||
# bytes→PIL conversion in media_processor
|
||||
if hasattr(data, "media"):
|
||||
image_data = data.media # type: ignore[union-attr]
|
||||
processed_chunks.append(
|
||||
VisionChunkImage(type="image", image=image_data, uuid=uuid)
|
||||
)
|
||||
else:
|
||||
processed_chunks.append(data) # type: ignore[arg-type]
|
||||
elif inner_modality == "video":
|
||||
# For video, we may need to split into chunks
|
||||
# if processor supports it
|
||||
# For now, just wrap as a video chunk placeholder
|
||||
if hasattr(mm_processor, "split_video_chunks") and data is not None:
|
||||
try:
|
||||
video_uuid = uuid or random_uuid()
|
||||
# video await result is (video_data, video_meta) tuple
|
||||
if isinstance(data, tuple) and len(data) >= 1:
|
||||
video_data = data[0]
|
||||
else:
|
||||
video_data = data
|
||||
video_chunks = mm_processor.split_video_chunks(video_data)
|
||||
for i, vc in enumerate(video_chunks):
|
||||
processed_chunks.append(
|
||||
VisionChunkVideo(
|
||||
type="video_chunk",
|
||||
video_chunk=vc["video_chunk"],
|
||||
uuid=f"{video_uuid}-{i}",
|
||||
video_idx=video_idx,
|
||||
prompt=vc["prompt"],
|
||||
)
|
||||
)
|
||||
video_idx += 1
|
||||
except Exception as e:
|
||||
logger.warning("Failed to split video chunks: %s", e)
|
||||
processed_chunks.append(data) # type: ignore[arg-type]
|
||||
else:
|
||||
processed_chunks.append(data) # type: ignore[arg-type]
|
||||
return processed_chunks, vision_chunks_uuids
|
||||
|
||||
|
||||
def _resolve_items(
|
||||
items_by_modality: dict[str, list[tuple[object, str | None]]],
|
||||
mm_processor: BaseMultiModalProcessor,
|
||||
vision_chunk_modality_order: dict[str, list[str]],
|
||||
modality_order: dict[str, list[str]],
|
||||
) -> tuple[MultiModalDataDict, MultiModalUUIDDict]:
|
||||
if "image" in items_by_modality and "image_embeds" in items_by_modality:
|
||||
raise ValueError("Mixing raw image and embedding inputs is not allowed")
|
||||
@@ -654,71 +644,13 @@ def _resolve_items(
|
||||
if "vision_chunk" in items_by_modality:
|
||||
# Process vision_chunk items - extract from (data, modality) tuples
|
||||
# and convert to VisionChunk types with proper UUID handling
|
||||
vision_chunk_items = items_by_modality["vision_chunk"]
|
||||
modality_order = vision_chunk_modality_order.get("vision_chunk", [])
|
||||
mm_uuids["vision_chunk"] = [
|
||||
uuid for data, uuid in items_by_modality["vision_chunk"]
|
||||
]
|
||||
|
||||
# Filter out None items (from asyncio.sleep(0) placeholders)
|
||||
filtered_items = [
|
||||
(idx, item)
|
||||
for idx, item in enumerate(vision_chunk_items)
|
||||
if item is not None
|
||||
]
|
||||
|
||||
assert len(filtered_items) == len(modality_order), (
|
||||
f"vision_chunk items ({len(filtered_items)}) and "
|
||||
f"modality_order ({len(modality_order)}) must have same length"
|
||||
processed_chunks, vision_chunk_uuids = _resolve_vision_chunk_items(
|
||||
items_by_modality["vision_chunk"],
|
||||
mm_processor,
|
||||
modality_order.get("vision_chunk", []),
|
||||
)
|
||||
|
||||
processed_chunks: list[VisionChunk] = []
|
||||
video_idx = 0
|
||||
for i, (idx, item) in enumerate(filtered_items):
|
||||
inner_modality = modality_order[i]
|
||||
data, uuid = item
|
||||
uuid_val = uuid if idx < len(mm_uuids["vision_chunk"]) else None
|
||||
if inner_modality == "image":
|
||||
# Cast data to proper type for image
|
||||
# Use .media (PIL.Image) directly to avoid redundant
|
||||
# bytes→PIL conversion in media_processor
|
||||
if hasattr(data, "media"):
|
||||
image_data = data.media # type: ignore[union-attr]
|
||||
processed_chunks.append(
|
||||
VisionChunkImage(type="image", image=image_data, uuid=uuid_val)
|
||||
)
|
||||
else:
|
||||
processed_chunks.append(data) # type: ignore[arg-type]
|
||||
elif inner_modality == "video":
|
||||
# For video, we may need to split into chunks
|
||||
# if processor supports it
|
||||
# For now, just wrap as a video chunk placeholder
|
||||
if hasattr(mm_processor, "split_video_chunks") and data is not None:
|
||||
try:
|
||||
video_uuid = uuid_val or random_uuid()
|
||||
# video await result is (video_data, video_meta) tuple
|
||||
if isinstance(data, tuple) and len(data) >= 1:
|
||||
video_data = data[0]
|
||||
else:
|
||||
video_data = data
|
||||
video_chunks = mm_processor.split_video_chunks(video_data)
|
||||
for i, vc in enumerate(video_chunks):
|
||||
processed_chunks.append(
|
||||
VisionChunkVideo(
|
||||
type="video_chunk",
|
||||
video_chunk=vc["video_chunk"],
|
||||
uuid=f"{video_uuid}-{i}",
|
||||
video_idx=video_idx,
|
||||
prompt=vc["prompt"],
|
||||
)
|
||||
)
|
||||
video_idx += 1
|
||||
except Exception as e:
|
||||
logger.warning("Failed to split video chunks: %s", e)
|
||||
processed_chunks.append(data) # type: ignore[arg-type]
|
||||
else:
|
||||
processed_chunks.append(data) # type: ignore[arg-type]
|
||||
mm_data["vision_chunk"] = processed_chunks
|
||||
mm_uuids["vision_chunk"] = vision_chunk_uuids
|
||||
|
||||
return mm_data, mm_uuids
|
||||
|
||||
|
||||
@@ -235,27 +235,6 @@ class VideoLoader:
|
||||
VIDEO_LOADER_REGISTRY = ExtensionManager()
|
||||
|
||||
|
||||
@VIDEO_LOADER_REGISTRY.register("identity")
class IdentityVideoLoader(VideoLoader):
    """Pass-through video loader that hands back the raw bytes undecoded.

    Decoding is left to the model processor, which models such as Kimi-K2.5
    need for their custom video chunk splitting.

    NOTE: This is temporary for Kimi-K2.5 testing. Remember to change back
    to opencv before release if needed.
    """

    @classmethod
    def load_bytes(
        cls,
        data: bytes,
        num_frames: int = -1,
        **kwargs: Any,
    ) -> tuple[Any, Any]:
        """Return ``(data, None)``; ``num_frames`` and ``kwargs`` are unused."""
        raw_video, metadata = data, None
        return raw_video, metadata
|
||||
|
||||
|
||||
@VIDEO_LOADER_REGISTRY.register("opencv")
|
||||
class OpenCVVideoBackend(VideoLoader):
|
||||
def get_cv2_video_api(self):
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import inspect
|
||||
from collections import deque
|
||||
import itertools
|
||||
from collections import defaultdict, deque
|
||||
from collections.abc import Set
|
||||
from functools import lru_cache
|
||||
from typing import Any, cast
|
||||
from typing import TYPE_CHECKING, Any, cast
|
||||
|
||||
import jinja2
|
||||
import jinja2.ext
|
||||
@@ -20,11 +21,9 @@ from vllm.entrypoints.chat_utils import (
|
||||
ChatTemplateContentFormatOption,
|
||||
ChatTemplateResolutionError,
|
||||
ConversationMessage,
|
||||
build_video_prompts_from_mm_data,
|
||||
load_chat_template,
|
||||
parse_chat_messages,
|
||||
parse_chat_messages_async,
|
||||
rebuild_mm_uuids_from_mm_data,
|
||||
)
|
||||
from vllm.inputs import TextPrompt, TokensPrompt
|
||||
from vllm.logger import init_logger
|
||||
@@ -36,6 +35,13 @@ from vllm.utils.func_utils import supports_kw
|
||||
|
||||
from .protocol import RendererLike
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.multimodal.inputs import MultiModalDataDict, MultiModalUUIDDict
|
||||
else:
|
||||
MultiModalDataDict = dict[str, Any]
|
||||
MultiModalUUIDDict = dict[str, Any]
|
||||
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
@@ -479,6 +485,104 @@ def safe_apply_chat_template(
|
||||
raise ValueError(str(e)) from e
|
||||
|
||||
|
||||
def rebuild_mm_uuids_from_mm_data(
    mm_uuids: "MultiModalUUIDDict",
    mm_data: "MultiModalDataDict",
) -> "MultiModalUUIDDict":
    """Refresh the "vision_chunk" UUID list from the processed chunk items.

    Each chunk dict may carry its own "uuid"; the non-``None`` values are
    gathered in order and stored under "vision_chunk".

    Args:
        mm_uuids: UUID mapping produced before chunk processing.
        mm_data: Processed multimodal data, possibly with "vision_chunk" items.

    Returns:
        ``mm_uuids`` itself when there is no "vision_chunk" entry or no chunk
        carries a UUID; otherwise a shallow copy with "vision_chunk" replaced.
    """
    chunks = mm_data.get("vision_chunk")
    if chunks is None:
        return mm_uuids

    assert all(isinstance(chunk, dict) for chunk in chunks), (
        "Expected all vision_chunk items to be dicts"
    )
    chunk_dicts = cast(list[dict[str, Any]], chunks)

    chunk_uuids = []
    for chunk in chunk_dicts:
        chunk_uuid = chunk.get("uuid")
        if chunk_uuid is not None:
            chunk_uuids.append(chunk_uuid)

    if not chunk_uuids:
        return mm_uuids

    # Copy before writing so the caller's mapping is never mutated.
    updated = dict(mm_uuids)
    updated["vision_chunk"] = chunk_uuids
    return updated
|
||||
|
||||
|
||||
def build_video_prompts_from_mm_data(
    mm_data: "MultiModalDataDict",
) -> list[str]:
    """Assemble one prompt string per video from "vision_chunk" items.

    Only items whose "type" is "video_chunk" contribute. Their "prompt"
    strings (default "") are concatenated per "video_idx" (default 0) in the
    order the chunks appear.

    Args:
        mm_data: Processed multimodal data, possibly with "vision_chunk" items.

    Returns:
        The per-video prompts ordered by ascending ``video_idx``; an empty
        list when there is no "vision_chunk" entry.
    """
    chunks = mm_data.get("vision_chunk")
    if chunks is None:
        return []

    grouped: dict[int, list[str]] = defaultdict(list)
    for chunk in chunks:
        # vision_chunk entries are expected to be dict-shaped chunk records.
        assert isinstance(chunk, dict)
        if chunk.get("type") != "video_chunk":
            continue
        idx = chunk.get("video_idx", 0)
        text = chunk.get("prompt", "")
        grouped[idx].append(text)

    # Emit the joined prompts in ascending video index order.
    ordered: list[str] = []
    for idx in sorted(grouped):
        ordered.append("".join(grouped[idx]))
    return ordered
|
||||
|
||||
|
||||
def replace_vision_chunk_video_placeholder(
|
||||
prompt_raw: str | list[int],
|
||||
mm_data: "MultiModalDataDict",
|
||||
video_placeholder: str | None,
|
||||
) -> str | list[int]:
|
||||
# get video placehoder, replace it with runtime video-chunk prompts
|
||||
if video_placeholder and isinstance(prompt_raw, str):
|
||||
video_prompts = build_video_prompts_from_mm_data(mm_data)
|
||||
|
||||
# replace in order
|
||||
prompt_raw_parts = prompt_raw.split(video_placeholder)
|
||||
if len(prompt_raw_parts) == len(video_prompts) + 1:
|
||||
prompt_raw = "".join(
|
||||
itertools.chain.from_iterable(zip(prompt_raw_parts, video_prompts))
|
||||
)
|
||||
prompt_raw += prompt_raw_parts[-1]
|
||||
else:
|
||||
logger.warning(
|
||||
"Number of video placeholders (%d) does not match "
|
||||
"number of videos (%d) in the request.",
|
||||
len(prompt_raw_parts) - 1,
|
||||
len(video_prompts),
|
||||
)
|
||||
return prompt_raw
|
||||
|
||||
|
||||
class HfRenderer(RendererLike):
|
||||
@classmethod
|
||||
def from_config(
|
||||
@@ -496,6 +600,9 @@ class HfRenderer(RendererLike):
|
||||
super().__init__()
|
||||
|
||||
self.config = config
|
||||
self.use_unified_vision_chunk = getattr(
|
||||
config.hf_config, "use_unified_vision_chunk", False
|
||||
)
|
||||
|
||||
if config.skip_tokenizer_init:
|
||||
tokenizer = None
|
||||
@@ -552,7 +659,7 @@ class HfRenderer(RendererLike):
|
||||
# NOTE: use_unified_vision_chunk is currently specific to Kimi-K2.5
|
||||
# model which uses unified vision chunks for both images and videos.
|
||||
if (
|
||||
getattr(model_config.hf_config, "use_unified_vision_chunk", False)
|
||||
self.use_unified_vision_chunk
|
||||
and mm_uuids is not None
|
||||
and mm_data is not None
|
||||
):
|
||||
@@ -562,25 +669,10 @@ class HfRenderer(RendererLike):
|
||||
video_placeholder = getattr(
|
||||
model_config.hf_config, "video_placeholder", None
|
||||
)
|
||||
if video_placeholder and isinstance(prompt_raw, str):
|
||||
video_prompts = build_video_prompts_from_mm_data(mm_data)
|
||||
|
||||
# replace in order
|
||||
prompt_raw_parts = prompt_raw.split(video_placeholder)
|
||||
if len(prompt_raw_parts) == len(video_prompts) + 1:
|
||||
prompt_raw = "".join(
|
||||
[
|
||||
prompt_raw_parts[i] + video_prompts[i]
|
||||
for i in range(len(video_prompts))
|
||||
]
|
||||
)
|
||||
prompt_raw += prompt_raw_parts[-1]
|
||||
else:
|
||||
logger.warning(
|
||||
"Number of video placeholders (%d) does not match "
|
||||
"number of videos (%d) in the request.",
|
||||
len(prompt_raw_parts) - 1,
|
||||
len(video_prompts),
|
||||
prompt_raw = replace_vision_chunk_video_placeholder(
|
||||
prompt_raw,
|
||||
mm_data,
|
||||
video_placeholder,
|
||||
)
|
||||
|
||||
prompt = (
|
||||
@@ -626,7 +718,7 @@ class HfRenderer(RendererLike):
|
||||
# NOTE: use_unified_vision_chunk is currently specific to Kimi-K2.5
|
||||
# model which uses unified vision chunks for both images and videos.
|
||||
if (
|
||||
getattr(model_config.hf_config, "use_unified_vision_chunk", False)
|
||||
self.use_unified_vision_chunk
|
||||
and mm_uuids is not None
|
||||
and mm_data is not None
|
||||
):
|
||||
@@ -636,25 +728,10 @@ class HfRenderer(RendererLike):
|
||||
video_placeholder = getattr(
|
||||
model_config.hf_config, "video_placeholder", None
|
||||
)
|
||||
if video_placeholder and isinstance(prompt_raw, str):
|
||||
video_prompts = build_video_prompts_from_mm_data(mm_data)
|
||||
|
||||
# replace in order
|
||||
prompt_raw_parts = prompt_raw.split(video_placeholder)
|
||||
if len(prompt_raw_parts) == len(video_prompts) + 1:
|
||||
prompt_raw = "".join(
|
||||
[
|
||||
prompt_raw_parts[i] + video_prompts[i]
|
||||
for i in range(len(video_prompts))
|
||||
]
|
||||
)
|
||||
prompt_raw += prompt_raw_parts[-1]
|
||||
else:
|
||||
logger.warning(
|
||||
"Number of video placeholders (%d) does not match "
|
||||
"number of videos (%d) in the request.",
|
||||
len(prompt_raw_parts) - 1,
|
||||
len(video_prompts),
|
||||
prompt_raw = replace_vision_chunk_video_placeholder(
|
||||
prompt_raw,
|
||||
mm_data,
|
||||
video_placeholder,
|
||||
)
|
||||
|
||||
prompt = (
|
||||
|
||||
Reference in New Issue
Block a user