[Misc] Cleanup Kimi-K2.5's vision chunk modality entrypoints (#33157)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
Isotr0py
2026-01-29 17:46:02 +08:00
committed by GitHub
parent e01ff5c070
commit 3a92c6f3b5
7 changed files with 733 additions and 204 deletions

View File

@@ -24,12 +24,25 @@ from vllm.multimodal.utils import (
)
from vllm.utils.serial_utils import tensor2base64
KIMI_K2_5_MODEL_ID = "moonshotai/Kimi-K2.5"
PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
QWEN2AUDIO_MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"
QWEN25OMNI_MODEL_ID = "Qwen/Qwen2.5-Omni-7B"
MISTRAL_MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
@pytest.fixture(scope="function")
def kimi_k2_5_model_config():
return ModelConfig(
KIMI_K2_5_MODEL_ID,
runner="generate",
trust_remote_code=True,
limit_mm_per_prompt={
"image": 2,
},
)
@pytest.fixture(scope="function")
def phi3v_model_config():
return ModelConfig(
@@ -163,6 +176,22 @@ def _assert_mm_data_is_image_input(
assert image_data[i] is None
def _assert_mm_data_is_vision_chunk_input(
mm_data: MultiModalDataDict | None,
vision_chunk_count: int,
) -> None:
assert mm_data is not None
assert set(mm_data.keys()) == {"vision_chunk"}
vision_chunk_data = mm_data.get("vision_chunk")
assert vision_chunk_data is not None
assert (
isinstance(vision_chunk_data, list)
and len(vision_chunk_data) == vision_chunk_count
)
def _assert_mm_uuids(
mm_uuids: MultiModalUUIDDict | None,
media_count: int,
@@ -2151,3 +2180,505 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
]
_assert_mm_data_inputs(mm_data, {"audio": 1})
_assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[audio_uuid])
def test_parse_chat_messages_image_vision_chunk(
kimi_k2_5_model_config,
image_url,
):
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Analyze this image."},
{
"type": "image_url",
"image_url": {"url": image_url},
},
],
}
]
conversation, mm_data, mm_uuids = parse_chat_messages(
messages,
kimi_k2_5_model_config,
content_format="string",
)
placeholder = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
expected_conversation = [
{
"role": "user",
"content": f"{placeholder}\nAnalyze this image.",
}
]
assert conversation == expected_conversation
_assert_mm_data_is_vision_chunk_input(mm_data, 1)
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[None], modality="vision_chunk")
def test_parse_chat_messages_video_vision_chunk(
kimi_k2_5_model_config,
video_url,
):
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Analyze this video."},
{
"type": "video_url",
"video_url": {"url": video_url},
},
],
}
]
conversation, mm_data, mm_uuids = parse_chat_messages(
messages,
kimi_k2_5_model_config,
content_format="string",
)
placeholder = "<|kimi_k25_video_placeholder|>"
expected_conversation = [
{
"role": "user",
"content": f"{placeholder}\nAnalyze this video.",
}
]
assert conversation == expected_conversation
_assert_mm_data_is_vision_chunk_input(mm_data, 1)
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[None], modality="vision_chunk")
def test_parse_chat_messages_image_vision_chunk_with_uuid(
kimi_k2_5_model_config,
image_url,
):
image_uuid = "image_123"
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Analyze this image."},
{
"type": "image_url",
"image_url": {"url": image_url},
"uuid": image_uuid,
},
],
}
]
conversation, mm_data, mm_uuids = parse_chat_messages(
messages,
kimi_k2_5_model_config,
content_format="string",
)
placeholder = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
expected_conversation = [
{
"role": "user",
"content": f"{placeholder}\nAnalyze this image.",
}
]
assert conversation == expected_conversation
_assert_mm_data_is_vision_chunk_input(mm_data, 1)
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid], modality="vision_chunk")
def test_parse_chat_messages_video_vision_chunk_with_uuid(
kimi_k2_5_model_config,
video_url,
):
video_uuid = "video_456"
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Analyze this video."},
{
"type": "video_url",
"video_url": {"url": video_url},
"uuid": video_uuid,
},
],
}
]
conversation, mm_data, mm_uuids = parse_chat_messages(
messages,
kimi_k2_5_model_config,
content_format="string",
)
placeholder = "<|kimi_k25_video_placeholder|>"
expected_conversation = [
{
"role": "user",
"content": f"{placeholder}\nAnalyze this video.",
}
]
assert conversation == expected_conversation
_assert_mm_data_is_vision_chunk_input(mm_data, 1)
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[video_uuid], modality="vision_chunk")
def test_parse_chat_messages_mixed_vision_chunk(
kimi_k2_5_model_config,
image_url,
video_url,
):
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Analyze this image and video."},
{
"type": "image_url",
"image_url": {"url": image_url},
},
{
"type": "video_url",
"video_url": {"url": video_url},
},
],
}
]
conversation, mm_data, mm_uuids = parse_chat_messages(
messages,
kimi_k2_5_model_config,
content_format="string",
)
image_placeholder = (
"<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
)
video_placeholder = "<|kimi_k25_video_placeholder|>"
expected_conversation = [
{
"role": "user",
"content": (
f"{image_placeholder}\n{video_placeholder}\n"
"Analyze this image and video."
),
}
]
assert conversation == expected_conversation
_assert_mm_data_is_vision_chunk_input(mm_data, 2)
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None], modality="vision_chunk")
def test_parse_chat_messages_mixed_vision_chunk_with_uuid(
kimi_k2_5_model_config,
image_url,
video_url,
):
image_uuid = "image_123"
video_uuid = "video_456"
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Analyze this image and video."},
{
"type": "image_url",
"image_url": {"url": image_url},
"uuid": image_uuid,
},
{
"type": "video_url",
"video_url": {"url": video_url},
"uuid": video_uuid,
},
],
}
]
conversation, mm_data, mm_uuids = parse_chat_messages(
messages,
kimi_k2_5_model_config,
content_format="string",
)
image_placeholder = (
"<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
)
video_placeholder = "<|kimi_k25_video_placeholder|>"
expected_conversation = [
{
"role": "user",
"content": (
f"{image_placeholder}\n{video_placeholder}\n"
"Analyze this image and video."
),
}
]
assert conversation == expected_conversation
_assert_mm_data_is_vision_chunk_input(mm_data, 2)
_assert_mm_uuids(
mm_uuids, 2, expected_uuids=[image_uuid, video_uuid], modality="vision_chunk"
)
@pytest.mark.asyncio
async def test_parse_chat_messages_mixed_vision_chunk_async(
kimi_k2_5_model_config,
image_url,
video_url,
):
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Analyze this image and video."},
{
"type": "image_url",
"image_url": {"url": image_url},
},
{
"type": "video_url",
"video_url": {"url": video_url},
},
],
}
]
conversation, mm_data, mm_uuids = await parse_chat_messages_async(
messages,
kimi_k2_5_model_config,
content_format="string",
)
image_placeholder = (
"<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
)
video_placeholder = "<|kimi_k25_video_placeholder|>"
expected_conversation = [
{
"role": "user",
"content": (
f"{image_placeholder}\n{video_placeholder}\n"
"Analyze this image and video."
),
}
]
assert conversation == expected_conversation
_assert_mm_data_is_vision_chunk_input(mm_data, 2)
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None], modality="vision_chunk")
@pytest.mark.asyncio
async def test_parse_chat_messages_mixed_vision_chunk_with_uuid_async(
kimi_k2_5_model_config,
image_url,
video_url,
):
image_uuid = "image_123"
video_uuid = "video_456"
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Analyze this image and video."},
{
"type": "image_url",
"image_url": {"url": image_url},
"uuid": image_uuid,
},
{
"type": "video_url",
"video_url": {"url": video_url},
"uuid": video_uuid,
},
],
}
]
conversation, mm_data, mm_uuids = await parse_chat_messages_async(
messages,
kimi_k2_5_model_config,
content_format="string",
)
image_placeholder = (
"<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
)
video_placeholder = "<|kimi_k25_video_placeholder|>"
expected_conversation = [
{
"role": "user",
"content": (
f"{image_placeholder}\n{video_placeholder}\n"
"Analyze this image and video."
),
}
]
assert conversation == expected_conversation
_assert_mm_data_is_vision_chunk_input(mm_data, 2)
_assert_mm_uuids(
mm_uuids, 2, expected_uuids=[image_uuid, video_uuid], modality="vision_chunk"
)
@pytest.mark.asyncio
async def test_parse_chat_messages_image_vision_chunk_async(
kimi_k2_5_model_config,
image_url,
):
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Analyze this image."},
{
"type": "image_url",
"image_url": {"url": image_url},
},
],
}
]
conversation, mm_data, mm_uuids = await parse_chat_messages_async(
messages,
kimi_k2_5_model_config,
content_format="string",
)
placeholder = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
expected_conversation = [
{
"role": "user",
"content": f"{placeholder}\nAnalyze this image.",
}
]
assert conversation == expected_conversation
_assert_mm_data_is_vision_chunk_input(mm_data, 1)
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[None], modality="vision_chunk")
@pytest.mark.asyncio
async def test_parse_chat_messages_video_vision_chunk_async(
kimi_k2_5_model_config,
video_url,
):
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Analyze this video."},
{
"type": "video_url",
"video_url": {"url": video_url},
},
],
}
]
conversation, mm_data, mm_uuids = await parse_chat_messages_async(
messages,
kimi_k2_5_model_config,
content_format="string",
)
placeholder = "<|kimi_k25_video_placeholder|>"
expected_conversation = [
{
"role": "user",
"content": f"{placeholder}\nAnalyze this video.",
}
]
assert conversation == expected_conversation
_assert_mm_data_is_vision_chunk_input(mm_data, 1)
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[None], modality="vision_chunk")
@pytest.mark.asyncio
async def test_parse_chat_messages_image_vision_chunk_with_uuid_async(
kimi_k2_5_model_config,
image_url,
):
image_uuid = "image_123"
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Analyze this image."},
{
"type": "image_url",
"image_url": {"url": image_url},
"uuid": image_uuid,
},
],
}
]
conversation, mm_data, mm_uuids = await parse_chat_messages_async(
messages,
kimi_k2_5_model_config,
content_format="string",
)
placeholder = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>"
expected_conversation = [
{
"role": "user",
"content": f"{placeholder}\nAnalyze this image.",
}
]
assert conversation == expected_conversation
_assert_mm_data_is_vision_chunk_input(mm_data, 1)
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid], modality="vision_chunk")
@pytest.mark.asyncio
async def test_parse_chat_messages_video_vision_chunk_with_uuid_async(
kimi_k2_5_model_config,
video_url,
):
video_uuid = "video_456"
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Analyze this video."},
{
"type": "video_url",
"video_url": {"url": video_url},
"uuid": video_uuid,
},
],
}
]
conversation, mm_data, mm_uuids = await parse_chat_messages_async(
messages,
kimi_k2_5_model_config,
content_format="string",
)
placeholder = "<|kimi_k25_video_placeholder|>"
expected_conversation = [
{
"role": "user",
"content": f"{placeholder}\nAnalyze this video.",
}
]
assert conversation == expected_conversation
_assert_mm_data_is_vision_chunk_input(mm_data, 1)
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[video_uuid], modality="vision_chunk")

View File

@@ -411,6 +411,11 @@ def test_processing_correctness(
"Qwen-VL tokenizer requires downloading a font file from "
"servers that often refuse connections in CI"
)
if model_id == "moonshotai/Kimi-K2.5":
# FIXME(Isaac): Fix Kimi-K2.5's offline inference about vision chunks.
pytest.skip(
"Kimi-K2.5's offline inference has issues about vision chunks. Fix later."
)
_test_processing_correctness(
model_id,

View File

@@ -155,6 +155,12 @@ def initialize_dummy_model(
@create_new_process_for_each_test()
@pytest.mark.parametrize("model_id", get_model_ids_to_test())
def test_model_tensor_schema(model_id: str):
if model_id == "moonshotai/Kimi-K2.5":
# FIXME(Isotr0py): Fix Kimi-K2.5's offline inference about vision chunks.
pytest.skip(
"Kimi-K2.5's offline inference has issues about vision chunks. Fix later."
)
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(

View File

@@ -786,7 +786,6 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"KimiK25ForConditionalGeneration": _HfExamplesInfo(
"moonshotai/Kimi-K2.5",
trust_remote_code=True,
is_available_online=False,
),
"LightOnOCRForConditionalGeneration": _HfExamplesInfo(
"lightonai/LightOnOCR-1B-1025"

View File

@@ -454,78 +454,6 @@ def _get_embeds_data(
raise NotImplementedError(type(data_items))
def rebuild_mm_uuids_from_mm_data(
mm_uuids: MultiModalUUIDDict,
mm_data: MultiModalDataDict,
) -> MultiModalUUIDDict:
"""Rebuild mm_uuids after vision_chunk processing.
When videos are split into chunks, the original UUIDs need to be updated
to reflect the new UUIDs generated for each chunk.
Args:
mm_uuids: Original UUIDs dictionary
mm_data: Processed multimodal data with vision_chunk items
Returns:
Updated UUIDs dictionary with chunk UUIDs
"""
vision_chunks = mm_data.get("vision_chunk")
if vision_chunks is None:
return mm_uuids
new_uuids = dict(mm_uuids)
vision_chunk_uuids = []
for item in vision_chunks:
# vision_chunk items are always dicts (VisionChunkImage/VisionChunkVideo)
assert isinstance(item, dict)
uuid_val = item.get("uuid")
if uuid_val is not None:
vision_chunk_uuids.append(uuid_val)
if vision_chunk_uuids:
new_uuids["vision_chunk"] = vision_chunk_uuids
return new_uuids
def build_video_prompts_from_mm_data(
mm_data: MultiModalDataDict,
) -> list[str]:
"""Build video prompts from vision_chunk data.
Collects prompts from video chunks and groups them by video_idx.
Args:
mm_data: Processed multimodal data with vision_chunk items
Returns:
List of video prompts, one per video.
"""
vision_chunks = mm_data.get("vision_chunk")
if vision_chunks is None:
return []
# Group chunks by video_idx
video_prompts_dict: dict[int, list[str]] = defaultdict(list)
for item in vision_chunks:
# vision_chunk items are always dicts (VisionChunkImage/VisionChunkVideo)
assert isinstance(item, dict)
if item.get("type") == "video_chunk":
video_idx = item.get("video_idx", 0)
prompt = item.get("prompt", "")
video_prompts_dict[video_idx].append(prompt)
# Build prompts in video order
video_prompts = []
for video_idx in sorted(video_prompts_dict.keys()):
video_prompts.append("".join(video_prompts_dict[video_idx]))
return video_prompts
class BaseMultiModalItemTracker(ABC, Generic[_T]):
"""
Tracks multi-modal items in a given request and ensures that the number
@@ -616,10 +544,72 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
raise NotImplementedError
def _resolve_vision_chunk_items(
vision_chunk_items: list[tuple[object, str | None]],
mm_processor: BaseMultiModalProcessor,
vision_chunks_modality_order: list[str],
):
# Process vision_chunk items - extract from (data, modality) tuples
# and convert to VisionChunk types with proper UUID handling
vision_chunks_uuids = [uuid for data, uuid in vision_chunk_items]
assert len(vision_chunk_items) == len(vision_chunks_modality_order), (
f"vision_chunk items ({len(vision_chunk_items)}) and "
f"modality_order ({len(vision_chunks_modality_order)}) must have same length"
)
processed_chunks: list[VisionChunk] = []
video_idx = 0
for inner_modality, (data, uuid) in zip(
vision_chunks_modality_order, vision_chunk_items
):
if inner_modality == "image":
# Cast data to proper type for image
# Use .media (PIL.Image) directly to avoid redundant
# bytes→PIL conversion in media_processor
if hasattr(data, "media"):
image_data = data.media # type: ignore[union-attr]
processed_chunks.append(
VisionChunkImage(type="image", image=image_data, uuid=uuid)
)
else:
processed_chunks.append(data) # type: ignore[arg-type]
elif inner_modality == "video":
# For video, we may need to split into chunks
# if processor supports it
# For now, just wrap as a video chunk placeholder
if hasattr(mm_processor, "split_video_chunks") and data is not None:
try:
video_uuid = uuid or random_uuid()
# video await result is (video_data, video_meta) tuple
if isinstance(data, tuple) and len(data) >= 1:
video_data = data[0]
else:
video_data = data
video_chunks = mm_processor.split_video_chunks(video_data)
for i, vc in enumerate(video_chunks):
processed_chunks.append(
VisionChunkVideo(
type="video_chunk",
video_chunk=vc["video_chunk"],
uuid=f"{video_uuid}-{i}",
video_idx=video_idx,
prompt=vc["prompt"],
)
)
video_idx += 1
except Exception as e:
logger.warning("Failed to split video chunks: %s", e)
processed_chunks.append(data) # type: ignore[arg-type]
else:
processed_chunks.append(data) # type: ignore[arg-type]
return processed_chunks, vision_chunks_uuids
def _resolve_items(
items_by_modality: dict[str, list[tuple[object, str | None]]],
mm_processor: BaseMultiModalProcessor,
vision_chunk_modality_order: dict[str, list[str]],
modality_order: dict[str, list[str]],
) -> tuple[MultiModalDataDict, MultiModalUUIDDict]:
if "image" in items_by_modality and "image_embeds" in items_by_modality:
raise ValueError("Mixing raw image and embedding inputs is not allowed")
@@ -654,71 +644,13 @@ def _resolve_items(
if "vision_chunk" in items_by_modality:
# Process vision_chunk items - extract from (data, modality) tuples
# and convert to VisionChunk types with proper UUID handling
vision_chunk_items = items_by_modality["vision_chunk"]
modality_order = vision_chunk_modality_order.get("vision_chunk", [])
mm_uuids["vision_chunk"] = [
uuid for data, uuid in items_by_modality["vision_chunk"]
]
# Filter out None items (from asyncio.sleep(0) placeholders)
filtered_items = [
(idx, item)
for idx, item in enumerate(vision_chunk_items)
if item is not None
]
assert len(filtered_items) == len(modality_order), (
f"vision_chunk items ({len(filtered_items)}) and "
f"modality_order ({len(modality_order)}) must have same length"
processed_chunks, vision_chunk_uuids = _resolve_vision_chunk_items(
items_by_modality["vision_chunk"],
mm_processor,
modality_order.get("vision_chunk", []),
)
processed_chunks: list[VisionChunk] = []
video_idx = 0
for i, (idx, item) in enumerate(filtered_items):
inner_modality = modality_order[i]
data, uuid = item
uuid_val = uuid if idx < len(mm_uuids["vision_chunk"]) else None
if inner_modality == "image":
# Cast data to proper type for image
# Use .media (PIL.Image) directly to avoid redundant
# bytes→PIL conversion in media_processor
if hasattr(data, "media"):
image_data = data.media # type: ignore[union-attr]
processed_chunks.append(
VisionChunkImage(type="image", image=image_data, uuid=uuid_val)
)
else:
processed_chunks.append(data) # type: ignore[arg-type]
elif inner_modality == "video":
# For video, we may need to split into chunks
# if processor supports it
# For now, just wrap as a video chunk placeholder
if hasattr(mm_processor, "split_video_chunks") and data is not None:
try:
video_uuid = uuid_val or random_uuid()
# video await result is (video_data, video_meta) tuple
if isinstance(data, tuple) and len(data) >= 1:
video_data = data[0]
else:
video_data = data
video_chunks = mm_processor.split_video_chunks(video_data)
for i, vc in enumerate(video_chunks):
processed_chunks.append(
VisionChunkVideo(
type="video_chunk",
video_chunk=vc["video_chunk"],
uuid=f"{video_uuid}-{i}",
video_idx=video_idx,
prompt=vc["prompt"],
)
)
video_idx += 1
except Exception as e:
logger.warning("Failed to split video chunks: %s", e)
processed_chunks.append(data) # type: ignore[arg-type]
else:
processed_chunks.append(data) # type: ignore[arg-type]
mm_data["vision_chunk"] = processed_chunks
mm_uuids["vision_chunk"] = vision_chunk_uuids
return mm_data, mm_uuids

View File

@@ -235,27 +235,6 @@ class VideoLoader:
VIDEO_LOADER_REGISTRY = ExtensionManager()
@VIDEO_LOADER_REGISTRY.register("identity")
class IdentityVideoLoader(VideoLoader):
"""IdentityVideoLoader returns raw video bytes without decoding.
This allows the model processor to handle video decoding and
is required for models like Kimi-K2.5 that need custom video chunk splitting.
NOTE: This is temporary for Kimi-K2.5 testing. Remember to change back
to opencv before release if needed.
"""
@classmethod
def load_bytes(
cls,
data: bytes,
num_frames: int = -1,
**kwargs: Any,
) -> tuple[Any, Any]:
return data, None
@VIDEO_LOADER_REGISTRY.register("opencv")
class OpenCVVideoBackend(VideoLoader):
def get_cv2_video_api(self):

View File

@@ -1,10 +1,11 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import inspect
from collections import deque
import itertools
from collections import defaultdict, deque
from collections.abc import Set
from functools import lru_cache
from typing import Any, cast
from typing import TYPE_CHECKING, Any, cast
import jinja2
import jinja2.ext
@@ -20,11 +21,9 @@ from vllm.entrypoints.chat_utils import (
ChatTemplateContentFormatOption,
ChatTemplateResolutionError,
ConversationMessage,
build_video_prompts_from_mm_data,
load_chat_template,
parse_chat_messages,
parse_chat_messages_async,
rebuild_mm_uuids_from_mm_data,
)
from vllm.inputs import TextPrompt, TokensPrompt
from vllm.logger import init_logger
@@ -36,6 +35,13 @@ from vllm.utils.func_utils import supports_kw
from .protocol import RendererLike
if TYPE_CHECKING:
from vllm.multimodal.inputs import MultiModalDataDict, MultiModalUUIDDict
else:
MultiModalDataDict = dict[str, Any]
MultiModalUUIDDict = dict[str, Any]
logger = init_logger(__name__)
@@ -479,6 +485,104 @@ def safe_apply_chat_template(
raise ValueError(str(e)) from e
def rebuild_mm_uuids_from_mm_data(
mm_uuids: "MultiModalUUIDDict",
mm_data: "MultiModalDataDict",
) -> "MultiModalUUIDDict":
"""Rebuild mm_uuids after vision_chunk processing.
When videos are split into chunks, the original UUIDs need to be updated
to reflect the new UUIDs generated for each chunk.
Args:
mm_uuids: Original UUIDs dictionary
mm_data: Processed multimodal data with vision_chunk items
Returns:
Updated UUIDs dictionary with chunk UUIDs
"""
vision_chunks = mm_data.get("vision_chunk")
if vision_chunks is None:
return mm_uuids
assert all(isinstance(item, dict) for item in vision_chunks), (
"Expected all vision_chunk items to be dicts"
)
vision_chunks = cast(list[dict[str, Any]], vision_chunks)
vision_chunk_uuids = [
uuid_val for item in vision_chunks if (uuid_val := item.get("uuid")) is not None
]
if vision_chunk_uuids:
mm_uuids = dict(mm_uuids)
mm_uuids["vision_chunk"] = vision_chunk_uuids
return mm_uuids
def build_video_prompts_from_mm_data(
mm_data: "MultiModalDataDict",
) -> list[str]:
"""Build video prompts from vision_chunk data.
Collects prompts from video chunks and groups them by video_idx.
Args:
mm_data: Processed multimodal data with vision_chunk items
Returns:
List of video prompts, one per video.
"""
vision_chunks = mm_data.get("vision_chunk")
if vision_chunks is None:
return []
# Group chunks by video_idx
video_prompts_dict: dict[int, list[str]] = defaultdict(list)
for item in vision_chunks:
# vision_chunk items are always dicts (VisionChunkImage/VisionChunkVideo)
assert isinstance(item, dict)
if item.get("type") == "video_chunk":
video_idx = item.get("video_idx", 0)
prompt = item.get("prompt", "")
video_prompts_dict[video_idx].append(prompt)
# Build prompts in video order
video_prompts = [
"".join(video_prompts_dict[video_idx])
for video_idx in sorted(video_prompts_dict.keys())
]
return video_prompts
def replace_vision_chunk_video_placeholder(
prompt_raw: str | list[int],
mm_data: "MultiModalDataDict",
video_placeholder: str | None,
) -> str | list[int]:
# get video placehoder, replace it with runtime video-chunk prompts
if video_placeholder and isinstance(prompt_raw, str):
video_prompts = build_video_prompts_from_mm_data(mm_data)
# replace in order
prompt_raw_parts = prompt_raw.split(video_placeholder)
if len(prompt_raw_parts) == len(video_prompts) + 1:
prompt_raw = "".join(
itertools.chain.from_iterable(zip(prompt_raw_parts, video_prompts))
)
prompt_raw += prompt_raw_parts[-1]
else:
logger.warning(
"Number of video placeholders (%d) does not match "
"number of videos (%d) in the request.",
len(prompt_raw_parts) - 1,
len(video_prompts),
)
return prompt_raw
class HfRenderer(RendererLike):
@classmethod
def from_config(
@@ -496,6 +600,9 @@ class HfRenderer(RendererLike):
super().__init__()
self.config = config
self.use_unified_vision_chunk = getattr(
config.hf_config, "use_unified_vision_chunk", False
)
if config.skip_tokenizer_init:
tokenizer = None
@@ -552,7 +659,7 @@ class HfRenderer(RendererLike):
# NOTE: use_unified_vision_chunk is currently specific to Kimi-K2.5
# model which uses unified vision chunks for both images and videos.
if (
getattr(model_config.hf_config, "use_unified_vision_chunk", False)
self.use_unified_vision_chunk
and mm_uuids is not None
and mm_data is not None
):
@@ -562,26 +669,11 @@ class HfRenderer(RendererLike):
video_placeholder = getattr(
model_config.hf_config, "video_placeholder", None
)
if video_placeholder and isinstance(prompt_raw, str):
video_prompts = build_video_prompts_from_mm_data(mm_data)
# replace in order
prompt_raw_parts = prompt_raw.split(video_placeholder)
if len(prompt_raw_parts) == len(video_prompts) + 1:
prompt_raw = "".join(
[
prompt_raw_parts[i] + video_prompts[i]
for i in range(len(video_prompts))
]
)
prompt_raw += prompt_raw_parts[-1]
else:
logger.warning(
"Number of video placeholders (%d) does not match "
"number of videos (%d) in the request.",
len(prompt_raw_parts) - 1,
len(video_prompts),
)
prompt_raw = replace_vision_chunk_video_placeholder(
prompt_raw,
mm_data,
video_placeholder,
)
prompt = (
TextPrompt(prompt=prompt_raw)
@@ -626,7 +718,7 @@ class HfRenderer(RendererLike):
# NOTE: use_unified_vision_chunk is currently specific to Kimi-K2.5
# model which uses unified vision chunks for both images and videos.
if (
getattr(model_config.hf_config, "use_unified_vision_chunk", False)
self.use_unified_vision_chunk
and mm_uuids is not None
and mm_data is not None
):
@@ -636,26 +728,11 @@ class HfRenderer(RendererLike):
video_placeholder = getattr(
model_config.hf_config, "video_placeholder", None
)
if video_placeholder and isinstance(prompt_raw, str):
video_prompts = build_video_prompts_from_mm_data(mm_data)
# replace in order
prompt_raw_parts = prompt_raw.split(video_placeholder)
if len(prompt_raw_parts) == len(video_prompts) + 1:
prompt_raw = "".join(
[
prompt_raw_parts[i] + video_prompts[i]
for i in range(len(video_prompts))
]
)
prompt_raw += prompt_raw_parts[-1]
else:
logger.warning(
"Number of video placeholders (%d) does not match "
"number of videos (%d) in the request.",
len(prompt_raw_parts) - 1,
len(video_prompts),
)
prompt_raw = replace_vision_chunk_video_placeholder(
prompt_raw,
mm_data,
video_placeholder,
)
prompt = (
TextPrompt(prompt=prompt_raw)