diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 1f55b13ee..36e8b0c0b 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -24,12 +24,25 @@ from vllm.multimodal.utils import ( ) from vllm.utils.serial_utils import tensor2base64 +KIMI_K2_5_MODEL_ID = "moonshotai/Kimi-K2.5" PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct" QWEN2AUDIO_MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct" QWEN25OMNI_MODEL_ID = "Qwen/Qwen2.5-Omni-7B" MISTRAL_MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" +@pytest.fixture(scope="function") +def kimi_k2_5_model_config(): + return ModelConfig( + KIMI_K2_5_MODEL_ID, + runner="generate", + trust_remote_code=True, + limit_mm_per_prompt={ + "image": 2, + }, + ) + + @pytest.fixture(scope="function") def phi3v_model_config(): return ModelConfig( @@ -163,6 +176,22 @@ def _assert_mm_data_is_image_input( assert image_data[i] is None +def _assert_mm_data_is_vision_chunk_input( + mm_data: MultiModalDataDict | None, + vision_chunk_count: int, +) -> None: + assert mm_data is not None + assert set(mm_data.keys()) == {"vision_chunk"} + + vision_chunk_data = mm_data.get("vision_chunk") + assert vision_chunk_data is not None + + assert ( + isinstance(vision_chunk_data, list) + and len(vision_chunk_data) == vision_chunk_count + ) + + def _assert_mm_uuids( mm_uuids: MultiModalUUIDDict | None, media_count: int, @@ -2151,3 +2180,505 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async( ] _assert_mm_data_inputs(mm_data, {"audio": 1}) _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[audio_uuid]) + + +def test_parse_chat_messages_image_vision_chunk( + kimi_k2_5_model_config, + image_url, +): + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Analyze this image."}, + { + "type": "image_url", + "image_url": {"url": image_url}, + }, + ], + } + ] + + conversation, mm_data, mm_uuids = parse_chat_messages( + messages, + kimi_k2_5_model_config, + content_format="string", + ) + + placeholder = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>" + expected_conversation = [ + { + "role": "user", + "content": f"{placeholder}\nAnalyze this image.", + } + ] + + assert conversation == expected_conversation + _assert_mm_data_is_vision_chunk_input(mm_data, 1) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None], modality="vision_chunk") + + +def test_parse_chat_messages_video_vision_chunk( + kimi_k2_5_model_config, + video_url, +): + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Analyze this video."}, + { + "type": "video_url", + "video_url": {"url": video_url}, + }, + ], + } + ] + + conversation, mm_data, mm_uuids = parse_chat_messages( + messages, + kimi_k2_5_model_config, + content_format="string", + ) + + placeholder = "<|kimi_k25_video_placeholder|>" + expected_conversation = [ + { + "role": "user", + "content": f"{placeholder}\nAnalyze this video.", + } + ] + + assert conversation == expected_conversation + _assert_mm_data_is_vision_chunk_input(mm_data, 1) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None], modality="vision_chunk") + + +def test_parse_chat_messages_image_vision_chunk_with_uuid( + kimi_k2_5_model_config, + image_url, +): + image_uuid = "image_123" + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Analyze this image."}, + { + "type": "image_url", + "image_url": {"url": image_url}, + "uuid": image_uuid, + }, + ], + } + ] + + conversation, mm_data, mm_uuids = parse_chat_messages( + messages, + kimi_k2_5_model_config, + content_format="string", + ) + + placeholder = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>" + expected_conversation = [ + { + "role": "user", + "content": f"{placeholder}\nAnalyze this image.", + } + ] + + assert conversation == expected_conversation + _assert_mm_data_is_vision_chunk_input(mm_data, 1) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid], modality="vision_chunk") + + +def test_parse_chat_messages_video_vision_chunk_with_uuid( + kimi_k2_5_model_config, + video_url, +): + video_uuid = "video_456" + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Analyze this video."}, + { + "type": "video_url", + "video_url": {"url": video_url}, + "uuid": video_uuid, + }, + ], + } + ] + + conversation, mm_data, mm_uuids = parse_chat_messages( + messages, + kimi_k2_5_model_config, + content_format="string", + ) + + placeholder = "<|kimi_k25_video_placeholder|>" + expected_conversation = [ + { + "role": "user", + "content": f"{placeholder}\nAnalyze this video.", + } + ] + + assert conversation == expected_conversation + _assert_mm_data_is_vision_chunk_input(mm_data, 1) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[video_uuid], modality="vision_chunk") + + +def test_parse_chat_messages_mixed_vision_chunk( + kimi_k2_5_model_config, + image_url, + video_url, +): + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Analyze this image and video."}, + { + "type": "image_url", + "image_url": {"url": image_url}, + }, + { + "type": "video_url", + "video_url": {"url": video_url}, + }, + ], + } + ] + + conversation, mm_data, mm_uuids = parse_chat_messages( + messages, + kimi_k2_5_model_config, + content_format="string", + ) + + image_placeholder = ( + "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>" + ) + video_placeholder = "<|kimi_k25_video_placeholder|>" + expected_conversation = [ + { + "role": "user", + "content": ( + f"{image_placeholder}\n{video_placeholder}\n" + "Analyze this image and video." + ), + } + ] + + assert conversation == expected_conversation + _assert_mm_data_is_vision_chunk_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None], modality="vision_chunk") + + +def test_parse_chat_messages_mixed_vision_chunk_with_uuid( + kimi_k2_5_model_config, + image_url, + video_url, +): + image_uuid = "image_123" + video_uuid = "video_456" + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Analyze this image and video."}, + { + "type": "image_url", + "image_url": {"url": image_url}, + "uuid": image_uuid, + }, + { + "type": "video_url", + "video_url": {"url": video_url}, + "uuid": video_uuid, + }, + ], + } + ] + + conversation, mm_data, mm_uuids = parse_chat_messages( + messages, + kimi_k2_5_model_config, + content_format="string", + ) + + image_placeholder = ( + "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>" + ) + video_placeholder = "<|kimi_k25_video_placeholder|>" + expected_conversation = [ + { + "role": "user", + "content": ( + f"{image_placeholder}\n{video_placeholder}\n" + "Analyze this image and video." + ), + } + ] + + assert conversation == expected_conversation + _assert_mm_data_is_vision_chunk_input(mm_data, 2) + _assert_mm_uuids( + mm_uuids, 2, expected_uuids=[image_uuid, video_uuid], modality="vision_chunk" + ) + + +@pytest.mark.asyncio +async def test_parse_chat_messages_mixed_vision_chunk_async( + kimi_k2_5_model_config, + image_url, + video_url, +): + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Analyze this image and video."}, + { + "type": "image_url", + "image_url": {"url": image_url}, + }, + { + "type": "video_url", + "video_url": {"url": video_url}, + }, + ], + } + ] + + conversation, mm_data, mm_uuids = await parse_chat_messages_async( + messages, + kimi_k2_5_model_config, + content_format="string", + ) + + image_placeholder = ( + "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>" + ) + video_placeholder = "<|kimi_k25_video_placeholder|>" + expected_conversation = [ + { + "role": "user", + "content": ( + f"{image_placeholder}\n{video_placeholder}\n" + "Analyze this image and video." + ), + } + ] + + assert conversation == expected_conversation + _assert_mm_data_is_vision_chunk_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None], modality="vision_chunk") + + +@pytest.mark.asyncio +async def test_parse_chat_messages_mixed_vision_chunk_with_uuid_async( + kimi_k2_5_model_config, + image_url, + video_url, +): + image_uuid = "image_123" + video_uuid = "video_456" + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Analyze this image and video."}, + { + "type": "image_url", + "image_url": {"url": image_url}, + "uuid": image_uuid, + }, + { + "type": "video_url", + "video_url": {"url": video_url}, + "uuid": video_uuid, + }, + ], + } + ] + + conversation, mm_data, mm_uuids = await parse_chat_messages_async( + messages, + kimi_k2_5_model_config, + content_format="string", + ) + + image_placeholder = ( + "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>" + ) + video_placeholder = "<|kimi_k25_video_placeholder|>" + expected_conversation = [ + { + "role": "user", + "content": ( + f"{image_placeholder}\n{video_placeholder}\n" + "Analyze this image and video." + ), + } + ] + + assert conversation == expected_conversation + _assert_mm_data_is_vision_chunk_input(mm_data, 2) + _assert_mm_uuids( + mm_uuids, 2, expected_uuids=[image_uuid, video_uuid], modality="vision_chunk" + ) + + +@pytest.mark.asyncio +async def test_parse_chat_messages_image_vision_chunk_async( + kimi_k2_5_model_config, + image_url, +): + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Analyze this image."}, + { + "type": "image_url", + "image_url": {"url": image_url}, + }, + ], + } + ] + + conversation, mm_data, mm_uuids = await parse_chat_messages_async( + messages, + kimi_k2_5_model_config, + content_format="string", + ) + + placeholder = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>" + expected_conversation = [ + { + "role": "user", + "content": f"{placeholder}\nAnalyze this image.", + } + ] + + assert conversation == expected_conversation + _assert_mm_data_is_vision_chunk_input(mm_data, 1) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None], modality="vision_chunk") + + +@pytest.mark.asyncio +async def test_parse_chat_messages_video_vision_chunk_async( + kimi_k2_5_model_config, + video_url, +): + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Analyze this video."}, + { + "type": "video_url", + "video_url": {"url": video_url}, + }, + ], + } + ] + + conversation, mm_data, mm_uuids = await parse_chat_messages_async( + messages, + kimi_k2_5_model_config, + content_format="string", + ) + + placeholder = "<|kimi_k25_video_placeholder|>" + expected_conversation = [ + { + "role": "user", + "content": f"{placeholder}\nAnalyze this video.", + } + ] + + assert conversation == expected_conversation + _assert_mm_data_is_vision_chunk_input(mm_data, 1) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None], modality="vision_chunk") + + +@pytest.mark.asyncio +async def test_parse_chat_messages_image_vision_chunk_with_uuid_async( + kimi_k2_5_model_config, + image_url, +): + image_uuid = "image_123" + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Analyze this image."}, + { + "type": "image_url", + "image_url": {"url": image_url}, + "uuid": image_uuid, + }, + ], + } + ] + + conversation, mm_data, mm_uuids = await parse_chat_messages_async( + messages, + kimi_k2_5_model_config, + content_format="string", + ) + + placeholder = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>" + expected_conversation = [ + { + "role": "user", + "content": f"{placeholder}\nAnalyze this image.", + } + ] + + assert conversation == expected_conversation + _assert_mm_data_is_vision_chunk_input(mm_data, 1) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid], modality="vision_chunk") + + +@pytest.mark.asyncio +async def test_parse_chat_messages_video_vision_chunk_with_uuid_async( + kimi_k2_5_model_config, + video_url, +): + video_uuid = "video_456" + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Analyze this video."}, + { + "type": "video_url", + "video_url": {"url": video_url}, + "uuid": video_uuid, + }, + ], + } + ] + + conversation, mm_data, mm_uuids = await parse_chat_messages_async( + messages, + kimi_k2_5_model_config, + content_format="string", + ) + + placeholder = "<|kimi_k25_video_placeholder|>" + expected_conversation = [ + { + "role": "user", + "content": f"{placeholder}\nAnalyze this video.", + } + ] + + assert conversation == expected_conversation + _assert_mm_data_is_vision_chunk_input(mm_data, 1) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[video_uuid], modality="vision_chunk") diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index a6e580cb8..57b777192 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -411,6 +411,11 @@ def test_processing_correctness( "Qwen-VL tokenizer requires downloading a font file from " "servers that often refuse connections in CI" ) + if model_id == "moonshotai/Kimi-K2.5": + # FIXME(Isaac): Fix Kimi-K2.5's offline inference about vision chunks. + pytest.skip( + "Kimi-K2.5's offline inference has issues about vision chunks. Fix later." + ) _test_processing_correctness( model_id, diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py index b55dad266..ba66c858d 100644 --- a/tests/models/multimodal/processing/test_tensor_schema.py +++ b/tests/models/multimodal/processing/test_tensor_schema.py @@ -155,6 +155,12 @@ def initialize_dummy_model( @create_new_process_for_each_test() @pytest.mark.parametrize("model_id", get_model_ids_to_test()) def test_model_tensor_schema(model_id: str): + if model_id == "moonshotai/Kimi-K2.5": + # FIXME(Isotr0py): Fix Kimi-K2.5's offline inference about vision chunks. + pytest.skip( + "Kimi-K2.5's offline inference has issues about vision chunks. Fix later." + ) + model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id) model_info.check_available_online(on_fail="skip") model_info.check_transformers_version( diff --git a/tests/models/registry.py b/tests/models/registry.py index 8fd801dc6..99a64d4e1 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -786,7 +786,6 @@ _MULTIMODAL_EXAMPLE_MODELS = { "KimiK25ForConditionalGeneration": _HfExamplesInfo( "moonshotai/Kimi-K2.5", trust_remote_code=True, - is_available_online=False, ), "LightOnOCRForConditionalGeneration": _HfExamplesInfo( "lightonai/LightOnOCR-1B-1025" diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index bcc8d3c65..82fdb281e 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -454,78 +454,6 @@ def _get_embeds_data( raise NotImplementedError(type(data_items)) -def rebuild_mm_uuids_from_mm_data( - mm_uuids: MultiModalUUIDDict, - mm_data: MultiModalDataDict, -) -> MultiModalUUIDDict: - """Rebuild mm_uuids after vision_chunk processing. - - When videos are split into chunks, the original UUIDs need to be updated - to reflect the new UUIDs generated for each chunk. - - Args: - mm_uuids: Original UUIDs dictionary - mm_data: Processed multimodal data with vision_chunk items - - Returns: - Updated UUIDs dictionary with chunk UUIDs - """ - vision_chunks = mm_data.get("vision_chunk") - if vision_chunks is None: - return mm_uuids - - new_uuids = dict(mm_uuids) - vision_chunk_uuids = [] - - for item in vision_chunks: - # vision_chunk items are always dicts (VisionChunkImage/VisionChunkVideo) - assert isinstance(item, dict) - uuid_val = item.get("uuid") - if uuid_val is not None: - vision_chunk_uuids.append(uuid_val) - - if vision_chunk_uuids: - new_uuids["vision_chunk"] = vision_chunk_uuids - - return new_uuids - - -def build_video_prompts_from_mm_data( - mm_data: MultiModalDataDict, -) -> list[str]: - """Build video prompts from vision_chunk data. - - Collects prompts from video chunks and groups them by video_idx. - - Args: - mm_data: Processed multimodal data with vision_chunk items - - Returns: - List of video prompts, one per video. - """ - vision_chunks = mm_data.get("vision_chunk") - if vision_chunks is None: - return [] - - # Group chunks by video_idx - video_prompts_dict: dict[int, list[str]] = defaultdict(list) - - for item in vision_chunks: - # vision_chunk items are always dicts (VisionChunkImage/VisionChunkVideo) - assert isinstance(item, dict) - if item.get("type") == "video_chunk": - video_idx = item.get("video_idx", 0) - prompt = item.get("prompt", "") - video_prompts_dict[video_idx].append(prompt) - - # Build prompts in video order - video_prompts = [] - for video_idx in sorted(video_prompts_dict.keys()): - video_prompts.append("".join(video_prompts_dict[video_idx])) - - return video_prompts - - class BaseMultiModalItemTracker(ABC, Generic[_T]): """ Tracks multi-modal items in a given request and ensures that the number @@ -616,10 +544,72 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): raise NotImplementedError +def _resolve_vision_chunk_items( + vision_chunk_items: list[tuple[object, str | None]], + mm_processor: BaseMultiModalProcessor, + vision_chunks_modality_order: list[str], +): + # Process vision_chunk items - extract from (data, modality) tuples + # and convert to VisionChunk types with proper UUID handling + vision_chunks_uuids = [uuid for data, uuid in vision_chunk_items] + + assert len(vision_chunk_items) == len(vision_chunks_modality_order), ( + f"vision_chunk items ({len(vision_chunk_items)}) and " + f"modality_order ({len(vision_chunks_modality_order)}) must have same length" + ) + + processed_chunks: list[VisionChunk] = [] + video_idx = 0 + for inner_modality, (data, uuid) in zip( + vision_chunks_modality_order, vision_chunk_items + ): + if inner_modality == "image": + # Cast data to proper type for image + # Use .media (PIL.Image) directly to avoid redundant + # bytes→PIL conversion in media_processor + if hasattr(data, "media"): + image_data = data.media # type: ignore[union-attr] + processed_chunks.append( + VisionChunkImage(type="image", image=image_data, uuid=uuid) + ) + else: + processed_chunks.append(data) # type: ignore[arg-type] + elif inner_modality == "video": + # For video, we may need to split into chunks + # if processor supports it + # For now, just wrap as a video chunk placeholder + if hasattr(mm_processor, "split_video_chunks") and data is not None: + try: + video_uuid = uuid or random_uuid() + # video await result is (video_data, video_meta) tuple + if isinstance(data, tuple) and len(data) >= 1: + video_data = data[0] + else: + video_data = data + video_chunks = mm_processor.split_video_chunks(video_data) + for i, vc in enumerate(video_chunks): + processed_chunks.append( + VisionChunkVideo( + type="video_chunk", + video_chunk=vc["video_chunk"], + uuid=f"{video_uuid}-{i}", + video_idx=video_idx, + prompt=vc["prompt"], + ) + ) + video_idx += 1 + except Exception as e: + logger.warning("Failed to split video chunks: %s", e) + processed_chunks.append(data) # type: ignore[arg-type] + else: + processed_chunks.append(data) # type: ignore[arg-type] + return processed_chunks, vision_chunks_uuids + + def _resolve_items( items_by_modality: dict[str, list[tuple[object, str | None]]], mm_processor: BaseMultiModalProcessor, - vision_chunk_modality_order: dict[str, list[str]], + modality_order: dict[str, list[str]], ) -> tuple[MultiModalDataDict, MultiModalUUIDDict]: if "image" in items_by_modality and "image_embeds" in items_by_modality: raise ValueError("Mixing raw image and embedding inputs is not allowed") @@ -654,71 +644,13 @@ def _resolve_items( if "vision_chunk" in items_by_modality: # Process vision_chunk items - extract from (data, modality) tuples # and convert to VisionChunk types with proper UUID handling - vision_chunk_items = items_by_modality["vision_chunk"] - modality_order = vision_chunk_modality_order.get("vision_chunk", []) - mm_uuids["vision_chunk"] = [ - uuid for data, uuid in items_by_modality["vision_chunk"] - ] - - # Filter out None items (from asyncio.sleep(0) placeholders) - filtered_items = [ - (idx, item) - for idx, item in enumerate(vision_chunk_items) - if item is not None - ] - - assert len(filtered_items) == len(modality_order), ( - f"vision_chunk items ({len(filtered_items)}) and " - f"modality_order ({len(modality_order)}) must have same length" + processed_chunks, vision_chunk_uuids = _resolve_vision_chunk_items( + items_by_modality["vision_chunk"], + mm_processor, + modality_order.get("vision_chunk", []), ) - - processed_chunks: list[VisionChunk] = [] - video_idx = 0 - for i, (idx, item) in enumerate(filtered_items): - inner_modality = modality_order[i] - data, uuid = item - uuid_val = uuid if idx < len(mm_uuids["vision_chunk"]) else None - if inner_modality == "image": - # Cast data to proper type for image - # Use .media (PIL.Image) directly to avoid redundant - # bytes→PIL conversion in media_processor - if hasattr(data, "media"): - image_data = data.media # type: ignore[union-attr] - processed_chunks.append( - VisionChunkImage(type="image", image=image_data, uuid=uuid_val) - ) - else: - processed_chunks.append(data) # type: ignore[arg-type] - elif inner_modality == "video": - # For video, we may need to split into chunks - # if processor supports it - # For now, just wrap as a video chunk placeholder - if hasattr(mm_processor, "split_video_chunks") and data is not None: - try: - video_uuid = uuid_val or random_uuid() - # video await result is (video_data, video_meta) tuple - if isinstance(data, tuple) and len(data) >= 1: - video_data = data[0] - else: - video_data = data - video_chunks = mm_processor.split_video_chunks(video_data) - for i, vc in enumerate(video_chunks): - processed_chunks.append( - VisionChunkVideo( - type="video_chunk", - video_chunk=vc["video_chunk"], - uuid=f"{video_uuid}-{i}", - video_idx=video_idx, - prompt=vc["prompt"], - ) - ) - video_idx += 1 - except Exception as e: - logger.warning("Failed to split video chunks: %s", e) - processed_chunks.append(data) # type: ignore[arg-type] - else: - processed_chunks.append(data) # type: ignore[arg-type] mm_data["vision_chunk"] = processed_chunks + mm_uuids["vision_chunk"] = vision_chunk_uuids return mm_data, mm_uuids diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 9c7b9463b..f123799ca 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -235,27 +235,6 @@ class VideoLoader: VIDEO_LOADER_REGISTRY = ExtensionManager() -@VIDEO_LOADER_REGISTRY.register("identity") -class IdentityVideoLoader(VideoLoader): - """IdentityVideoLoader returns raw video bytes without decoding. - - This allows the model processor to handle video decoding and - is required for models like Kimi-K2.5 that need custom video chunk splitting. - - NOTE: This is temporary for Kimi-K2.5 testing. Remember to change back - to opencv before release if needed. - """ - - @classmethod - def load_bytes( - cls, - data: bytes, - num_frames: int = -1, - **kwargs: Any, - ) -> tuple[Any, Any]: - return data, None - - @VIDEO_LOADER_REGISTRY.register("opencv") class OpenCVVideoBackend(VideoLoader): def get_cv2_video_api(self): diff --git a/vllm/renderers/hf.py b/vllm/renderers/hf.py index e159a04b9..252e6e753 100644 --- a/vllm/renderers/hf.py +++ b/vllm/renderers/hf.py @@ -1,10 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import inspect -from collections import deque +import itertools +from collections import defaultdict, deque from collections.abc import Set from functools import lru_cache -from typing import Any, cast +from typing import TYPE_CHECKING, Any, cast import jinja2 import jinja2.ext @@ -20,11 +21,9 @@ from vllm.entrypoints.chat_utils import ( ChatTemplateContentFormatOption, ChatTemplateResolutionError, ConversationMessage, - build_video_prompts_from_mm_data, load_chat_template, parse_chat_messages, parse_chat_messages_async, - rebuild_mm_uuids_from_mm_data, ) from vllm.inputs import TextPrompt, TokensPrompt from vllm.logger import init_logger @@ -36,6 +35,13 @@ from vllm.utils.func_utils import supports_kw from .protocol import RendererLike +if TYPE_CHECKING: + from vllm.multimodal.inputs import MultiModalDataDict, MultiModalUUIDDict +else: + MultiModalDataDict = dict[str, Any] + MultiModalUUIDDict = dict[str, Any] + + logger = init_logger(__name__) @@ -479,6 +485,104 @@ def safe_apply_chat_template( raise ValueError(str(e)) from e +def rebuild_mm_uuids_from_mm_data( + mm_uuids: "MultiModalUUIDDict", + mm_data: "MultiModalDataDict", +) -> "MultiModalUUIDDict": + """Rebuild mm_uuids after vision_chunk processing. + + When videos are split into chunks, the original UUIDs need to be updated + to reflect the new UUIDs generated for each chunk. + + Args: + mm_uuids: Original UUIDs dictionary + mm_data: Processed multimodal data with vision_chunk items + + Returns: + Updated UUIDs dictionary with chunk UUIDs + """ + vision_chunks = mm_data.get("vision_chunk") + if vision_chunks is None: + return mm_uuids + + assert all(isinstance(item, dict) for item in vision_chunks), ( + "Expected all vision_chunk items to be dicts" + ) + vision_chunks = cast(list[dict[str, Any]], vision_chunks) + vision_chunk_uuids = [ + uuid_val for item in vision_chunks if (uuid_val := item.get("uuid")) is not None + ] + + if vision_chunk_uuids: + mm_uuids = dict(mm_uuids) + mm_uuids["vision_chunk"] = vision_chunk_uuids + + return mm_uuids + + +def build_video_prompts_from_mm_data( + mm_data: "MultiModalDataDict", +) -> list[str]: + """Build video prompts from vision_chunk data. + + Collects prompts from video chunks and groups them by video_idx. + + Args: + mm_data: Processed multimodal data with vision_chunk items + + Returns: + List of video prompts, one per video. + """ + vision_chunks = mm_data.get("vision_chunk") + if vision_chunks is None: + return [] + + # Group chunks by video_idx + video_prompts_dict: dict[int, list[str]] = defaultdict(list) + + for item in vision_chunks: + # vision_chunk items are always dicts (VisionChunkImage/VisionChunkVideo) + assert isinstance(item, dict) + if item.get("type") == "video_chunk": + video_idx = item.get("video_idx", 0) + prompt = item.get("prompt", "") + video_prompts_dict[video_idx].append(prompt) + + # Build prompts in video order + video_prompts = [ + "".join(video_prompts_dict[video_idx]) + for video_idx in sorted(video_prompts_dict.keys()) + ] + + return video_prompts + + +def replace_vision_chunk_video_placeholder( + prompt_raw: str | list[int], + mm_data: "MultiModalDataDict", + video_placeholder: str | None, +) -> str | list[int]: + # get video placehoder, replace it with runtime video-chunk prompts + if video_placeholder and isinstance(prompt_raw, str): + video_prompts = build_video_prompts_from_mm_data(mm_data) + + # replace in order + prompt_raw_parts = prompt_raw.split(video_placeholder) + if len(prompt_raw_parts) == len(video_prompts) + 1: + prompt_raw = "".join( + itertools.chain.from_iterable(zip(prompt_raw_parts, video_prompts)) + ) + prompt_raw += prompt_raw_parts[-1] + else: + logger.warning( + "Number of video placeholders (%d) does not match " + "number of videos (%d) in the request.", + len(prompt_raw_parts) - 1, + len(video_prompts), + ) + return prompt_raw + + class HfRenderer(RendererLike): @classmethod def from_config( @@ -496,6 +600,9 @@ class HfRenderer(RendererLike): super().__init__() self.config = config + self.use_unified_vision_chunk = getattr( + config.hf_config, "use_unified_vision_chunk", False + ) if config.skip_tokenizer_init: tokenizer = None @@ -552,7 +659,7 @@ class HfRenderer(RendererLike): # NOTE: use_unified_vision_chunk is currently specific to Kimi-K2.5 # model which uses unified vision chunks for both images and videos. if ( - getattr(model_config.hf_config, "use_unified_vision_chunk", False) + self.use_unified_vision_chunk and mm_uuids is not None and mm_data is not None ): @@ -562,26 +669,11 @@ class HfRenderer(RendererLike): video_placeholder = getattr( model_config.hf_config, "video_placeholder", None ) - if video_placeholder and isinstance(prompt_raw, str): - video_prompts = build_video_prompts_from_mm_data(mm_data) - - # replace in order - prompt_raw_parts = prompt_raw.split(video_placeholder) - if len(prompt_raw_parts) == len(video_prompts) + 1: - prompt_raw = "".join( - [ - prompt_raw_parts[i] + video_prompts[i] - for i in range(len(video_prompts)) - ] - ) - prompt_raw += prompt_raw_parts[-1] - else: - logger.warning( - "Number of video placeholders (%d) does not match " - "number of videos (%d) in the request.", - len(prompt_raw_parts) - 1, - len(video_prompts), - ) + prompt_raw = replace_vision_chunk_video_placeholder( + prompt_raw, + mm_data, + video_placeholder, + ) prompt = ( TextPrompt(prompt=prompt_raw) @@ -626,7 +718,7 @@ class HfRenderer(RendererLike): # NOTE: use_unified_vision_chunk is currently specific to Kimi-K2.5 # model which uses unified vision chunks for both images and videos. if ( - getattr(model_config.hf_config, "use_unified_vision_chunk", False) + self.use_unified_vision_chunk and mm_uuids is not None and mm_data is not None ): @@ -636,26 +728,11 @@ class HfRenderer(RendererLike): video_placeholder = getattr( model_config.hf_config, "video_placeholder", None ) - if video_placeholder and isinstance(prompt_raw, str): - video_prompts = build_video_prompts_from_mm_data(mm_data) - - # replace in order - prompt_raw_parts = prompt_raw.split(video_placeholder) - if len(prompt_raw_parts) == len(video_prompts) + 1: - prompt_raw = "".join( - [ - prompt_raw_parts[i] + video_prompts[i] - for i in range(len(video_prompts)) - ] - ) - prompt_raw += prompt_raw_parts[-1] - else: - logger.warning( - "Number of video placeholders (%d) does not match " - "number of videos (%d) in the request.", - len(prompt_raw_parts) - 1, - len(video_prompts), - ) + prompt_raw = replace_vision_chunk_video_placeholder( + prompt_raw, + mm_data, + video_placeholder, + ) prompt = ( TextPrompt(prompt=prompt_raw)