[Models] Kimi-K2.5 (#33131)
Signed-off-by: wanglinian <wanglinian@stu.pku.edu.cn> Signed-off-by: wangln19 <96399074+wangln19@users.noreply.github.com> Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Signed-off-by: youkaichao <youkaichao@gmail.com> Signed-off-by: Roger Wang <hey@rogerw.io> Co-authored-by: wanglinian <wanglinian@stu.pku.edu.cn> Co-authored-by: wangln19 <96399074+wangln19@users.noreply.github.com> Co-authored-by: Zaida Zhou <58739961+zhouzaida@users.noreply.github.com> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn> Co-authored-by: Nick Hill <nickhill123@gmail.com> Co-authored-by: youkaichao <youkaichao@gmail.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
@@ -46,6 +46,9 @@ from vllm.multimodal.inputs import (
|
||||
MultiModalBatchedField,
|
||||
MultiModalFlatField,
|
||||
MultiModalSharedField,
|
||||
VisionChunk,
|
||||
VisionChunkImage,
|
||||
VisionChunkVideo,
|
||||
)
|
||||
from vllm.multimodal.processing import BaseMultiModalProcessor
|
||||
from vllm.multimodal.utils import MEDIA_CONNECTOR_REGISTRY, MediaConnector
|
||||
@@ -336,7 +339,9 @@ ChatTemplateContentFormatOption = Literal["auto", "string", "openai"]
|
||||
ChatTemplateContentFormat = Literal["string", "openai"]
|
||||
|
||||
|
||||
ModalityStr = Literal["image", "audio", "video", "image_embeds", "audio_embeds"]
|
||||
ModalityStr = Literal[
|
||||
"image", "audio", "video", "image_embeds", "audio_embeds", "vision_chunk"
|
||||
]
|
||||
_T = TypeVar("_T")
|
||||
|
||||
|
||||
@@ -449,6 +454,78 @@ def _get_embeds_data(
|
||||
raise NotImplementedError(type(data_items))
|
||||
|
||||
|
||||
def rebuild_mm_uuids_from_mm_data(
    mm_uuids: "MultiModalUUIDDict",
    mm_data: "MultiModalDataDict",
) -> "MultiModalUUIDDict":
    """Rebuild ``mm_uuids`` after vision_chunk processing.

    When videos are split into chunks, the original UUIDs need to be
    updated to reflect the new UUIDs generated for each chunk.

    Args:
        mm_uuids: Original UUIDs dictionary.
        mm_data: Processed multimodal data with ``vision_chunk`` items.

    Returns:
        Updated UUIDs dictionary with chunk UUIDs. The input ``mm_uuids``
        is returned unchanged when there is no ``vision_chunk`` data.
    """
    chunks = mm_data.get("vision_chunk")
    if chunks is None:
        return mm_uuids

    # Work on a shallow copy so the caller's dict is never mutated.
    rebuilt = dict(mm_uuids)

    collected = []
    for chunk in chunks:
        # vision_chunk items are always dicts
        # (VisionChunkImage / VisionChunkVideo).
        assert isinstance(chunk, dict)
        chunk_uuid = chunk.get("uuid")
        if chunk_uuid is None:
            continue
        collected.append(chunk_uuid)

    # Only override the entry when at least one chunk carries a UUID.
    if collected:
        rebuilt["vision_chunk"] = collected

    return rebuilt
|
||||
|
||||
|
||||
def build_video_prompts_from_mm_data(
    mm_data: "MultiModalDataDict",
) -> list[str]:
    """Build video prompts from ``vision_chunk`` data.

    Collects prompts from video chunks and groups them by ``video_idx``.

    Args:
        mm_data: Processed multimodal data with ``vision_chunk`` items.

    Returns:
        List of video prompts, one per video, ordered by ``video_idx``.
    """
    chunks = mm_data.get("vision_chunk")
    if chunks is None:
        return []

    # Gather each video's chunk prompts, keyed by the video it came from.
    grouped: dict[int, list[str]] = defaultdict(list)
    for chunk in chunks:
        # vision_chunk items are always dicts
        # (VisionChunkImage / VisionChunkVideo).
        assert isinstance(chunk, dict)
        if chunk.get("type") != "video_chunk":
            continue
        grouped[chunk.get("video_idx", 0)].append(chunk.get("prompt", ""))

    # Concatenate each video's chunk prompts in video order.
    return ["".join(grouped[idx]) for idx in sorted(grouped)]
|
||||
|
||||
|
||||
class BaseMultiModalItemTracker(ABC, Generic[_T]):
|
||||
"""
|
||||
Tracks multi-modal items in a given request and ensures that the number
|
||||
@@ -462,6 +539,13 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
|
||||
self._model_config = model_config
|
||||
|
||||
self._items_by_modality = defaultdict[str, list[_T]](list)
|
||||
# Track original modality for each vision_chunk item (image or video)
|
||||
self._modality_order = defaultdict[str, list[str]](list)
|
||||
|
||||
@cached_property
def use_unified_vision_chunk_modality(self) -> bool:
    """Check if model uses unified vision_chunk modality for images/videos."""
    # Models opting in expose ``use_unified_vision_chunk`` on their HF
    # config; a missing attribute means the standard per-modality path.
    hf_config = self._model_config.hf_config
    return getattr(hf_config, "use_unified_vision_chunk", False)
|
||||
|
||||
@property
|
||||
def model_config(self) -> ModelConfig:
|
||||
@@ -499,11 +583,31 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
|
||||
media.
|
||||
"""
|
||||
input_modality = modality.replace("_embeds", "")
|
||||
num_items = len(self._items_by_modality[modality]) + 1
|
||||
original_modality = modality
|
||||
use_vision_chunk = (
|
||||
self.use_unified_vision_chunk_modality
|
||||
and original_modality in ["video", "image"]
|
||||
)
|
||||
|
||||
# If use_unified_vision_chunk_modality is enabled,
|
||||
# map image/video to vision_chunk
|
||||
if use_vision_chunk:
|
||||
# To avoid validation fail
|
||||
# because models with use_unified_vision_chunk_modality=True
|
||||
# will only accept vision_chunk modality.
|
||||
input_modality = "vision_chunk"
|
||||
num_items = len(self._items_by_modality[input_modality]) + 1
|
||||
else:
|
||||
num_items = len(self._items_by_modality[original_modality]) + 1
|
||||
|
||||
self.mm_processor.validate_num_items(input_modality, num_items)
|
||||
|
||||
self._items_by_modality[modality].append(item)
|
||||
# Track original modality for vision_chunk items
|
||||
if use_vision_chunk:
|
||||
self._items_by_modality[input_modality].append(item) # type: ignore
|
||||
self._modality_order["vision_chunk"].append(original_modality)
|
||||
else:
|
||||
self._items_by_modality[original_modality].append(item)
|
||||
|
||||
return self.model_cls.get_placeholder_str(modality, num_items)
|
||||
|
||||
@@ -515,6 +619,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
|
||||
def _resolve_items(
|
||||
items_by_modality: dict[str, list[tuple[object, str | None]]],
|
||||
mm_processor: BaseMultiModalProcessor,
|
||||
vision_chunk_modality_order: dict[str, list[str]],
|
||||
) -> tuple[MultiModalDataDict, MultiModalUUIDDict]:
|
||||
if "image" in items_by_modality and "image_embeds" in items_by_modality:
|
||||
raise ValueError("Mixing raw image and embedding inputs is not allowed")
|
||||
@@ -546,6 +651,74 @@ def _resolve_items(
|
||||
if "video" in items_by_modality:
|
||||
mm_data["video"] = [data for data, uuid in items_by_modality["video"]]
|
||||
mm_uuids["video"] = [uuid for data, uuid in items_by_modality["video"]]
|
||||
if "vision_chunk" in items_by_modality:
|
||||
# Process vision_chunk items - extract from (data, modality) tuples
|
||||
# and convert to VisionChunk types with proper UUID handling
|
||||
vision_chunk_items = items_by_modality["vision_chunk"]
|
||||
modality_order = vision_chunk_modality_order.get("vision_chunk", [])
|
||||
mm_uuids["vision_chunk"] = [
|
||||
uuid for data, uuid in items_by_modality["vision_chunk"]
|
||||
]
|
||||
|
||||
# Filter out None items (from asyncio.sleep(0) placeholders)
|
||||
filtered_items = [
|
||||
(idx, item)
|
||||
for idx, item in enumerate(vision_chunk_items)
|
||||
if item is not None
|
||||
]
|
||||
|
||||
assert len(filtered_items) == len(modality_order), (
|
||||
f"vision_chunk items ({len(filtered_items)}) and "
|
||||
f"modality_order ({len(modality_order)}) must have same length"
|
||||
)
|
||||
|
||||
processed_chunks: list[VisionChunk] = []
|
||||
video_idx = 0
|
||||
for i, (idx, item) in enumerate(filtered_items):
|
||||
inner_modality = modality_order[i]
|
||||
data, uuid = item
|
||||
uuid_val = uuid if idx < len(mm_uuids["vision_chunk"]) else None
|
||||
if inner_modality == "image":
|
||||
# Cast data to proper type for image
|
||||
# Use .media (PIL.Image) directly to avoid redundant
|
||||
# bytes→PIL conversion in media_processor
|
||||
if hasattr(data, "media"):
|
||||
image_data = data.media # type: ignore[union-attr]
|
||||
processed_chunks.append(
|
||||
VisionChunkImage(type="image", image=image_data, uuid=uuid_val)
|
||||
)
|
||||
else:
|
||||
processed_chunks.append(data) # type: ignore[arg-type]
|
||||
elif inner_modality == "video":
|
||||
# For video, we may need to split into chunks
|
||||
# if processor supports it
|
||||
# For now, just wrap as a video chunk placeholder
|
||||
if hasattr(mm_processor, "split_video_chunks") and data is not None:
|
||||
try:
|
||||
video_uuid = uuid_val or random_uuid()
|
||||
# video await result is (video_data, video_meta) tuple
|
||||
if isinstance(data, tuple) and len(data) >= 1:
|
||||
video_data = data[0]
|
||||
else:
|
||||
video_data = data
|
||||
video_chunks = mm_processor.split_video_chunks(video_data)
|
||||
for i, vc in enumerate(video_chunks):
|
||||
processed_chunks.append(
|
||||
VisionChunkVideo(
|
||||
type="video_chunk",
|
||||
video_chunk=vc["video_chunk"],
|
||||
uuid=f"{video_uuid}-{i}",
|
||||
video_idx=video_idx,
|
||||
prompt=vc["prompt"],
|
||||
)
|
||||
)
|
||||
video_idx += 1
|
||||
except Exception as e:
|
||||
logger.warning("Failed to split video chunks: %s", e)
|
||||
processed_chunks.append(data) # type: ignore[arg-type]
|
||||
else:
|
||||
processed_chunks.append(data) # type: ignore[arg-type]
|
||||
mm_data["vision_chunk"] = processed_chunks
|
||||
|
||||
return mm_data, mm_uuids
|
||||
|
||||
@@ -557,7 +730,9 @@ class MultiModalItemTracker(BaseMultiModalItemTracker[tuple[object, str | None]]
|
||||
if not self._items_by_modality:
|
||||
return None, None
|
||||
|
||||
return _resolve_items(dict(self._items_by_modality), self.mm_processor)
|
||||
return _resolve_items(
|
||||
dict(self._items_by_modality), self.mm_processor, self._modality_order
|
||||
)
|
||||
|
||||
def create_parser(self) -> "BaseMultiModalContentParser":
    """Create a synchronous content parser bound to this tracker."""
    parser = MultiModalContentParser(self)
    return parser
|
||||
@@ -577,7 +752,9 @@ class AsyncMultiModalItemTracker(
|
||||
for modality, coros in self._items_by_modality.items()
|
||||
}
|
||||
|
||||
return _resolve_items(resolved_items_by_modality, self.mm_processor)
|
||||
return _resolve_items(
|
||||
resolved_items_by_modality, self.mm_processor, self._modality_order
|
||||
)
|
||||
|
||||
def create_parser(self) -> "BaseMultiModalContentParser":
    """Create an asynchronous content parser bound to this tracker."""
    parser = AsyncMultiModalContentParser(self)
    return parser
|
||||
|
||||
Reference in New Issue
Block a user