# SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import warnings from collections.abc import Mapping from typing import Literal import pytest import torch from vllm.assets.audio import AudioAsset from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset from vllm.config import ModelConfig from vllm.entrypoints.chat_utils import ( parse_chat_messages, parse_chat_messages_async, ) from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict from vllm.multimodal.utils import ( encode_audio_url, encode_image_url, encode_video_url, ) from vllm.utils.serial_utils import tensor2base64 KIMI_K2_5_MODEL_ID = "moonshotai/Kimi-K2.5" PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct" QWEN2AUDIO_MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct" QWEN25OMNI_MODEL_ID = "Qwen/Qwen2.5-Omni-7B" MISTRAL_MODEL_ID = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" @pytest.fixture(scope="function") def kimi_k2_5_model_config(): return ModelConfig( KIMI_K2_5_MODEL_ID, runner="generate", trust_remote_code=True, limit_mm_per_prompt={ "image": 2, }, ) @pytest.fixture(scope="function") def phi3v_model_config(): return ModelConfig( PHI3V_MODEL_ID, runner="generate", trust_remote_code=True, limit_mm_per_prompt={ "image": 2, }, ) @pytest.fixture(scope="function") def phi3v_model_config_mm_interleaved(): return ModelConfig( PHI3V_MODEL_ID, runner="generate", trust_remote_code=True, interleave_mm_strings=True, limit_mm_per_prompt={ "image": 2, }, ) @pytest.fixture(scope="function") def phi3v_model_config_image_embeds(): return ModelConfig( PHI3V_MODEL_ID, runner="generate", trust_remote_code=True, limit_mm_per_prompt={ "image": 2, }, enable_mm_embeds=True, ) @pytest.fixture(scope="function") def qwen25omni_model_config_image_embeds(): return ModelConfig( QWEN25OMNI_MODEL_ID, runner="generate", limit_mm_per_prompt={"image": 2}, enable_mm_embeds=True, ) @pytest.fixture(scope="function") def qwen2_audio_model_config(): return ModelConfig( QWEN2AUDIO_MODEL_ID, runner="generate", trust_remote_code=True, limit_mm_per_prompt={ "audio": 1, }, ) @pytest.fixture(scope="function") def audio_embeds_model_config(): return ModelConfig( QWEN2AUDIO_MODEL_ID, runner="generate", trust_remote_code=True, limit_mm_per_prompt={ "audio": 2, }, enable_mm_embeds=True, ) @pytest.fixture(scope="function") def qwen25omni_model_config_mm_interleaved(): return ModelConfig( QWEN25OMNI_MODEL_ID, runner="generate", interleave_mm_strings=True, limit_mm_per_prompt={ "image": 2, "audio": 1, "video": 1, }, ) @pytest.fixture(scope="function") def mistral_model_config(): return ModelConfig( MISTRAL_MODEL_ID, runner="generate", limit_mm_per_prompt={ "image": 2, }, ) @pytest.fixture(scope="module") def image_url(): image = ImageAsset("cherry_blossom") return encode_image_url(image.pil_image) @pytest.fixture(scope="module") def video_url(): video = VideoAsset("baby_reading", 1) return encode_video_url(video.np_ndarrays) @pytest.fixture(scope="module") def audio_url(): audio = AudioAsset("mary_had_lamb") return encode_audio_url(*audio.audio_and_sample_rate) def _assert_mm_data_is_image_input( mm_data: MultiModalDataDict | None, image_count: int, skipped_image_indices: list | None = None, ) -> None: assert mm_data is not None assert set(mm_data.keys()) == {"image"} image_data = mm_data.get("image") assert image_data is not None assert isinstance(image_data, list) and len(image_data) == image_count if skipped_image_indices is not None: for i in skipped_image_indices: assert image_data[i] is None def _assert_mm_data_is_vision_chunk_input( mm_data: MultiModalDataDict | None, vision_chunk_count: int, ) -> None: assert mm_data is not None assert set(mm_data.keys()) == {"vision_chunk"} vision_chunk_data = mm_data.get("vision_chunk") assert vision_chunk_data is not None assert ( isinstance(vision_chunk_data, list) and len(vision_chunk_data) == vision_chunk_count ) def _assert_mm_uuids( mm_uuids: MultiModalUUIDDict | None, media_count: int, expected_uuids: list[str | None], modality: str = "image", ) -> None: if len(expected_uuids) > 0: assert mm_uuids is not None assert modality in mm_uuids image_uuids = mm_uuids.get(modality) assert image_uuids is not None assert isinstance(image_uuids, list) and len(image_uuids) == media_count assert image_uuids == expected_uuids else: assert mm_uuids is None ModalityType = Literal["image", "video", "audio"] MultiModalDataCounts = Mapping[ModalityType, int] def _assert_mm_data_inputs( mm_data: MultiModalDataDict | None, data_count: MultiModalDataCounts, skipped_media_indices: dict[str, list] | None = None, # modality -> list[int] ) -> None: assert mm_data is not None assert set(data_count.keys()) == (set(mm_data.keys())) for modality, n in data_count.items(): modality_data = mm_data.get(modality) assert modality_data is not None assert isinstance(modality_data, list) and len(modality_data) == n if skipped_media_indices is not None: skipped_media_indices_for_modality = skipped_media_indices.get(modality) assert skipped_media_indices_for_modality is not None for i in skipped_media_indices_for_modality: assert modality_data[i] is None def test_parse_chat_messages_single_image( phi3v_model_config, image_url, ): conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", "content": [ {"type": "image_url", "image_url": {"url": image_url}}, {"type": "text", "text": "What's in the image?"}, ], } ], phi3v_model_config, content_format="string", ) assert conversation == [ {"role": "user", "content": "<|image_1|>\nWhat's in the image?"} ] _assert_mm_data_is_image_input(mm_data, 1) _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None]) def test_parse_chat_messages_single_image_with_uuid( phi3v_model_config, image_url, ): image_uuid = str(hash(image_url)) conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", "content": [ { "type": "image_url", "image_url": { "url": image_url, }, "uuid": image_uuid, }, {"type": "text", "text": "What's in the image?"}, ], } ], phi3v_model_config, content_format="string", ) assert conversation == [ {"role": "user", "content": "<|image_1|>\nWhat's in the image?"} ] _assert_mm_data_is_image_input(mm_data, 1) _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid]) def test_parse_chat_messages_single_empty_image_with_uuid( phi3v_model_config, image_url, ): image_uuid = str(hash(image_url)) conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", "content": [ { "type": "image_url", "image_url": None, "uuid": image_uuid, }, {"type": "text", "text": "What's in the image?"}, ], } ], phi3v_model_config, content_format="string", ) assert conversation == [ {"role": "user", "content": "<|image_1|>\nWhat's in the image?"} ] _assert_mm_data_is_image_input(mm_data, 1, skipped_image_indices=[0]) _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid]) def test_parse_chat_messages_single_image_with_bad_uuid_format( phi3v_model_config, image_url, ): image_uuid = str(hash(image_url)) conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", "content": [ { "type": "image_url", "image_url": { "url": image_url, "uuid": image_uuid, }, "bad_uuid_key": image_uuid, }, {"type": "text", "text": "What's in the image?"}, ], } ], phi3v_model_config, content_format="string", ) assert conversation == [ {"role": "user", "content": "<|image_1|>\nWhat's in the image?"} ] _assert_mm_data_is_image_input(mm_data, 1) _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None]) def test_parse_chat_messages_multiple_images_with_uuids( phi3v_model_config, image_url, ): image_uuid1 = "my_uuid_1" image_uuid2 = "my_uuid_2" conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", "content": [ { "type": "image_url", "image_url": { "url": image_url, }, "uuid": image_uuid1, }, { "type": "image_url", "image_url": { "url": image_url, }, "uuid": image_uuid2, }, {"type": "text", "text": "What's in the image?"}, ], } ], phi3v_model_config, content_format="string", ) assert conversation == [ { "role": "user", "content": "<|image_1|>\n<|image_2|>\nWhat's in the image?", } ] _assert_mm_data_is_image_input(mm_data, 2) _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) def test_parse_chat_messages_multiple_empty_images_with_uuids( phi3v_model_config, image_url, ): image_uuid1 = "my_uuid_1" image_uuid2 = "my_uuid_2" conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", "content": [ { "type": "image_url", "image_url": None, "uuid": image_uuid1, }, { "type": "image_url", "image_url": None, "uuid": image_uuid2, }, {"type": "text", "text": "What's in the image?"}, ], } ], phi3v_model_config, content_format="string", ) assert conversation == [ { "role": "user", "content": "<|image_1|>\n<|image_2|>\nWhat's in the image?", } ] _assert_mm_data_is_image_input(mm_data, 2, skipped_image_indices=[0, 1]) _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) def test_parse_chat_messages_mixed_empty_images_with_uuids( phi3v_model_config, image_url, ): image_uuid1 = "my_uuid_1" image_uuid2 = "my_uuid_2" conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", "content": [ { "type": "image_url", "image_url": { "url": image_url, }, "uuid": image_uuid1, }, { "type": "image_url", "image_url": None, "uuid": image_uuid2, }, {"type": "text", "text": "What's in the image?"}, ], } ], phi3v_model_config, content_format="string", ) assert conversation == [ { "role": "user", "content": "<|image_1|>\n<|image_2|>\nWhat's in the image?", } ] _assert_mm_data_is_image_input(mm_data, 2, skipped_image_indices=[1]) _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) @pytest.mark.asyncio async def test_parse_chat_messages_single_image_with_uuid_async( phi3v_model_config, image_url, ): image_uuid = str(hash(image_url)) conversation, mm_data, mm_uuids = await parse_chat_messages_async( [ { "role": "user", "content": [ { "type": "image_url", "image_url": {"url": image_url}, "uuid": image_uuid, }, {"type": "text", "text": "What's in the image?"}, ], } ], phi3v_model_config, content_format="string", ) assert conversation == [ {"role": "user", "content": "<|image_1|>\nWhat's in the image?"} ] _assert_mm_data_is_image_input(mm_data, 1) _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid]) @pytest.mark.asyncio async def test_parse_chat_messages_empty_image_with_uuid_async( phi3v_model_config, image_url, ): image_uuid = str(hash(image_url)) conversation, mm_data, mm_uuids = await parse_chat_messages_async( [ { "role": "user", "content": [ { "type": "image_url", "image_url": None, "uuid": image_uuid, }, {"type": "text", "text": "What's in the image?"}, ], } ], phi3v_model_config, content_format="string", ) assert conversation == [ {"role": "user", "content": "<|image_1|>\nWhat's in the image?"} ] _assert_mm_data_is_image_input(mm_data, 1, skipped_image_indices=[0]) _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid]) @pytest.mark.asyncio async def test_parse_chat_messages_multiple_images_with_uuids_async( phi3v_model_config, image_url, ): image_uuid1 = "my_uuid_1" image_uuid2 = "my_uuid_2" conversation, mm_data, mm_uuids = await parse_chat_messages_async( [ { "role": "user", "content": [ { "type": "image_url", "image_url": {"url": image_url}, "uuid": image_uuid1, }, { "type": "image_pil", "image_pil": ImageAsset("cherry_blossom").pil_image, "uuid": image_uuid2, }, {"type": "text", "text": "What's in these images?"}, ], } ], phi3v_model_config, content_format="string", ) assert conversation == [ { "role": "user", "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?", } ] _assert_mm_data_is_image_input(mm_data, 2) _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) @pytest.mark.asyncio async def test_parse_chat_messages_multiple_empty_images_with_uuids_async( phi3v_model_config, image_url, ): image_uuid1 = "my_uuid_1" image_uuid2 = "my_uuid_2" conversation, mm_data, mm_uuids = await parse_chat_messages_async( [ { "role": "user", "content": [ { "type": "image_url", "image_url": None, "uuid": image_uuid1, }, { "type": "image_pil", "image_pil": None, "uuid": image_uuid2, }, {"type": "text", "text": "What's in these images?"}, ], } ], phi3v_model_config, content_format="string", ) assert conversation == [ { "role": "user", "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?", } ] _assert_mm_data_is_image_input(mm_data, 2, skipped_image_indices=[0, 1]) _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) @pytest.mark.asyncio async def test_parse_chat_messages_multiple_images_with_partial_uuids_async( phi3v_model_config, image_url, ): image_uuid2 = "my_uuid_2" conversation, mm_data, mm_uuids = await parse_chat_messages_async( [ { "role": "user", "content": [ { "type": "image_url", "image_url": {"url": image_url}, }, { "type": "image_pil", "image_pil": ImageAsset("cherry_blossom").pil_image, "uuid": image_uuid2, }, {"type": "text", "text": "What's in these images?"}, ], } ], phi3v_model_config, content_format="string", ) assert conversation == [ { "role": "user", "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?", } ] _assert_mm_data_is_image_input(mm_data, 2) _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, image_uuid2]) def test_parse_chat_messages_empty_system( mistral_model_config, ): # Test string format conversation, _, _ = parse_chat_messages( [ {"role": "system", "content": ""}, { "role": "user", "content": [{"type": "text", "text": "Who are you?"}], }, ], mistral_model_config, content_format="string", ) assert conversation == [ {"role": "system", "content": ""}, {"role": "user", "content": "Who are you?"}, ] # Test openai format conversation, _, _ = parse_chat_messages( [ {"role": "system", "content": ""}, { "role": "user", "content": [{"type": "text", "text": "Who are you?"}], }, ], mistral_model_config, content_format="openai", ) assert conversation == [ {"role": "system", "content": [{"type": "text", "text": ""}]}, {"role": "user", "content": [{"type": "text", "text": "Who are you?"}]}, ] @pytest.mark.asyncio async def test_parse_chat_messages_single_image_async( phi3v_model_config, image_url, ): conversation, mm_data, mm_uuids = await parse_chat_messages_async( [ { "role": "user", "content": [ {"type": "image_url", "image_url": {"url": image_url}}, {"type": "text", "text": "What's in the image?"}, ], } ], phi3v_model_config, content_format="string", ) assert conversation == [ {"role": "user", "content": "<|image_1|>\nWhat's in the image?"} ] _assert_mm_data_is_image_input(mm_data, 1) _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None]) def test_parse_chat_messages_multiple_images( phi3v_model_config, image_url, ): conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", "content": [ {"type": "image_url", "image_url": {"url": image_url}}, { "type": "image_pil", "image_pil": ImageAsset("cherry_blossom").pil_image, }, {"type": "text", "text": "What's in these images?"}, ], } ], phi3v_model_config, content_format="string", ) assert conversation == [ { "role": "user", "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?", } ] _assert_mm_data_is_image_input(mm_data, 2) _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) def test_parse_chat_messages_empty_pil_image_with_uuid( phi3v_model_config, ): uuid = "abcd" conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", "content": [ {"type": "image_pil", "image_pil": None, "uuid": uuid}, {"type": "text", "text": "What's in this image?"}, ], } ], phi3v_model_config, content_format="string", ) assert conversation == [ { "role": "user", "content": "<|image_1|>\nWhat's in this image?", } ] _assert_mm_data_is_image_input(mm_data, 1, skipped_image_indices=[0]) _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid]) def test_parse_chat_messages_empty_image_embeds_with_uuid( phi3v_model_config_image_embeds, ): uuid = "abcd" conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", "content": [ {"type": "image_embeds", "image_embeds": None, "uuid": uuid}, {"type": "text", "text": "What's in this image?"}, ], } ], phi3v_model_config_image_embeds, content_format="string", ) assert conversation == [ { "role": "user", "content": "<|image_1|>\nWhat's in this image?", } ] assert mm_data is not None assert "image" in mm_data assert isinstance(mm_data["image"], list) assert len(mm_data["image"]) == 1 assert mm_data["image"][0] is None _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid]) def test_parse_chat_messages_empty_audio_embeds_with_uuid( audio_embeds_model_config, ): """Test audio_embeds with UUID (no actual embeds data).""" uuid = "test-audio-uuid-123" conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", "content": [ {"type": "text", "text": "Describe this audio"}, {"type": "audio_embeds", "audio_embeds": None, "uuid": uuid}, ], } ], audio_embeds_model_config, content_format="string", ) # Should have audio in mm_data as None (UUID provided) assert mm_data is not None assert "audio" in mm_data assert isinstance(mm_data["audio"], list) assert len(mm_data["audio"]) == 1 assert mm_data["audio"][0] is None # UUID should be recorded _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[uuid]) def test_parse_chat_messages_audio_embeds_with_string( audio_embeds_model_config, ): """Test audio_embeds with base64 string embedding data.""" import torch # Create a sample audio embedding tensor hidden_size = audio_embeds_model_config.get_inputs_embeds_size() audio_embedding = torch.randn(1, 128, hidden_size) # Encode it as base64 base64_audio_embedding = tensor2base64(audio_embedding) conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", "content": [ {"type": "text", "text": "Describe this audio"}, { "type": "audio_embeds", "audio_embeds": base64_audio_embedding, }, ], } ], audio_embeds_model_config, content_format="string", ) # Should have audio embedding in mm_data (single tensor, not a list) assert mm_data is not None assert "audio" in mm_data assert isinstance(mm_data["audio"], torch.Tensor) assert mm_data["audio"].shape == audio_embedding.shape # No UUID provided assert mm_uuids is not None assert "audio" in mm_uuids _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None]) @pytest.mark.asyncio async def test_parse_chat_messages_audio_embeds_async( audio_embeds_model_config, ): """Test audio_embeds with async futures.""" import torch # Create a sample audio embedding tensor hidden_size = audio_embeds_model_config.get_inputs_embeds_size() audio_embedding = torch.randn(1, 128, hidden_size) # Encode it as base64 base64_audio_embedding = tensor2base64(audio_embedding) conversation, mm_data, mm_uuids = await parse_chat_messages_async( [ { "role": "user", "content": [ {"type": "text", "text": "Describe this audio"}, { "type": "audio_embeds", "audio_embeds": base64_audio_embedding, }, ], } ], audio_embeds_model_config, content_format="string", ) # Should have audio embedding in mm_data (single tensor, not a list) assert mm_data is not None assert "audio" in mm_data assert isinstance(mm_data["audio"], torch.Tensor) assert mm_data["audio"].shape == audio_embedding.shape # No UUID provided assert mm_uuids is not None assert "audio" in mm_uuids _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None]) def test_parse_chat_messages_multiple_image_embeds( phi3v_model_config_image_embeds, ): """Test that multiple image_embeds in a single message are now supported. This test validates the fix for the limitation that previously only allowed one message with {'type': 'image_embeds'}. Now multiple image embeddings can be provided in a single request, similar to regular images. """ # Create two sample image embedding tensors hidden_size = phi3v_model_config_image_embeds.get_inputs_embeds_size() image_embedding_1 = torch.randn(256, hidden_size) image_embedding_2 = torch.randn(128, hidden_size) # Encode them as base64 using the convenience function base64_image_embedding_1 = tensor2base64(image_embedding_1) base64_image_embedding_2 = tensor2base64(image_embedding_2) conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", "content": [ { "type": "image_embeds", "image_embeds": base64_image_embedding_1, }, { "type": "image_embeds", "image_embeds": base64_image_embedding_2, }, {"type": "text", "text": "Describe these two images."}, ], } ], phi3v_model_config_image_embeds, content_format="string", ) # Verify conversation structure assert conversation == [ { "role": "user", "content": "<|image_1|>\n<|image_2|>\nDescribe these two images.", } ] # Verify mm_data contains a list of embeddings (not a single embedding) assert mm_data is not None assert "image" in mm_data assert isinstance(mm_data["image"], list) assert len(mm_data["image"]) == 2 # Verify each embedding has the correct shape assert isinstance(mm_data["image"][0], torch.Tensor) assert mm_data["image"][0].shape == image_embedding_1.shape assert isinstance(mm_data["image"][1], torch.Tensor) assert mm_data["image"][1].shape == image_embedding_2.shape # Verify UUIDs (None since we didn't provide any) _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) def test_parse_chat_messages_multiple_image_embeds_with_uuids( phi3v_model_config_image_embeds, ): """Test multiple image_embeds with UUIDs. This validates that UUIDs are properly tracked for multiple embeddings. """ uuid1 = "image-uuid-1" uuid2 = "image-uuid-2" conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", "content": [ { "type": "image_embeds", "image_embeds": None, "uuid": uuid1, }, { "type": "image_embeds", "image_embeds": None, "uuid": uuid2, }, {"type": "text", "text": "Compare these images."}, ], } ], phi3v_model_config_image_embeds, content_format="string", ) # Verify conversation structure assert conversation == [ { "role": "user", "content": "<|image_1|>\n<|image_2|>\nCompare these images.", } ] # Verify mm_data contains a list with None values (UUID references) assert mm_data is not None assert "image" in mm_data assert isinstance(mm_data["image"], list) assert len(mm_data["image"]) == 2 assert mm_data["image"][0] is None assert mm_data["image"][1] is None # Verify UUIDs are correctly tracked _assert_mm_uuids(mm_uuids, 2, expected_uuids=[uuid1, uuid2]) @pytest.mark.asyncio async def test_parse_chat_messages_multiple_image_embeds_async( phi3v_model_config_image_embeds, ): """Test multiple image_embeds with async parsing. This validates the AsyncMultiModalItemTracker also supports multiple embeddings. """ # Create two sample image embedding tensors hidden_size = phi3v_model_config_image_embeds.get_inputs_embeds_size() image_embedding_1 = torch.randn(200, hidden_size) image_embedding_2 = torch.randn(150, hidden_size) # Encode them as base64 using the convenience function base64_image_embedding_1 = tensor2base64(image_embedding_1) base64_image_embedding_2 = tensor2base64(image_embedding_2) conversation, mm_data, mm_uuids = await parse_chat_messages_async( [ { "role": "user", "content": [ { "type": "image_embeds", "image_embeds": base64_image_embedding_1, }, { "type": "image_embeds", "image_embeds": base64_image_embedding_2, }, {"type": "text", "text": "What do these images show?"}, ], } ], phi3v_model_config_image_embeds, content_format="string", ) # Verify conversation structure assert conversation == [ { "role": "user", "content": "<|image_1|>\n<|image_2|>\nWhat do these images show?", } ] # Await the future and verify mm_data assert mm_data is not None assert "image" in mm_data assert isinstance(mm_data["image"], list) assert len(mm_data["image"]) == 2 # Verify each embedding has the correct shape assert isinstance(mm_data["image"][0], torch.Tensor) assert mm_data["image"][0].shape == image_embedding_1.shape assert isinstance(mm_data["image"][1], torch.Tensor) assert mm_data["image"][1].shape == image_embedding_2.shape # Verify UUIDs _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) @pytest.mark.asyncio async def test_parse_chat_messages_empty_image_embeds_with_uuid_async( phi3v_model_config_image_embeds, ): uuid = "abcd" conversation, mm_data, mm_uuids = await parse_chat_messages_async( [ { "role": "user", "content": [ {"type": "image_embeds", "image_embeds": None, "uuid": uuid}, {"type": "text", "text": "What's in this image?"}, ], } ], phi3v_model_config_image_embeds, content_format="string", ) assert conversation == [ { "role": "user", "content": "<|image_1|>\nWhat's in this image?", } ] assert mm_data is not None assert "image" in mm_data assert isinstance(mm_data["image"], list) assert len(mm_data["image"]) == 1 assert mm_data["image"][0] is None _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid]) def test_parse_chat_messages_empty_dict_image_embeds( phi3v_model_config_image_embeds, ): """Test that empty dictionary for image_embeds is handled without errors.""" conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", "content": [ {"type": "image_embeds", "image_embeds": {}}, {"type": "text", "text": "What's in this image?"}, ], } ], phi3v_model_config_image_embeds, content_format="string", ) # Verify conversation structure assert conversation == [ { "role": "user", "content": "<|image_1|>\nWhat's in this image?", } ] # Verify mm_data contains an empty dictionary of embeddings assert mm_data is not None assert "image" in mm_data assert isinstance(mm_data["image"], dict) assert len(mm_data["image"]) == 0 # Verify UUIDs (None since we didn't provide any) _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None]) def test_parse_chat_messages_multiple_dict_image_embeds( qwen25omni_model_config_image_embeds, ): """Test that multiple dictionaries for image_embeds is handled without errors.""" # Create two sample image embedding tensors batch_size = 2 hidden_size = qwen25omni_model_config_image_embeds.get_inputs_embeds_size() image_embeds = torch.randn(batch_size * 220, hidden_size) image_grid_thw = torch.tensor([[1, 22, 40] for _ in range(batch_size)]) conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", "content": [ { "type": "image_embeds", "image_embeds": { "image_embeds": tensor2base64(embeds), "image_grid_thw": tensor2base64(grid_thw), }, } for embeds, grid_thw in zip( image_embeds.chunk(batch_size), image_grid_thw ) ] + [ {"type": "text", "text": "Describe these two images."}, ], } ], qwen25omni_model_config_image_embeds, content_format="string", ) # Verify conversation structure assert conversation == [ { "role": "user", "content": "<|vision_start|><|IMAGE|><|vision_end|>\n" "<|vision_start|><|IMAGE|><|vision_end|>\nDescribe these two images.", } ] # Verify mm_data contains a dictionary of multi-embeddings assert mm_data is not None assert "image" in mm_data assert isinstance(mm_data["image"], dict) assert len(mm_data["image"]) == batch_size # Verify each embedding has the correct shape assert isinstance(mm_data["image"]["image_embeds"], torch.Tensor) assert mm_data["image"]["image_embeds"].shape == image_embeds.shape assert isinstance(mm_data["image"]["image_grid_thw"], torch.Tensor) assert mm_data["image"]["image_grid_thw"].shape == image_grid_thw.shape # Verify UUIDs (None since we didn't provide any) _assert_mm_uuids(mm_uuids, batch_size, expected_uuids=[None, None]) @pytest.mark.asyncio async def test_parse_chat_messages_multiple_images_async( phi3v_model_config, image_url, ): conversation, mm_data, mm_uuids = await parse_chat_messages_async( [ { "role": "user", "content": [ {"type": "image_url", "image_url": {"url": image_url}}, { "type": "image_pil", "image_pil": ImageAsset("cherry_blossom").pil_image, }, {"type": "text", "text": "What's in these images?"}, ], } ], phi3v_model_config, content_format="string", ) assert conversation == [ { "role": "user", "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?", } ] _assert_mm_data_is_image_input(mm_data, 2) _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) def test_parse_chat_messages_placeholder_already_in_prompt( phi3v_model_config, image_url, ): conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", "content": [ {"type": "image_url", "image_url": {"url": image_url}}, {"type": "image_url", "image_url": {"url": image_url}}, { "type": "text", "text": "What's in <|image_1|> and how does it compare to <|image_2|>?", # noqa: E501 }, ], } ], phi3v_model_config, content_format="string", ) assert conversation == [ { "role": "user", "content": "What's in <|image_1|> and how does it compare to <|image_2|>?", } ] _assert_mm_data_is_image_input(mm_data, 2) _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) def test_parse_chat_messages_placeholder_one_already_in_prompt( phi3v_model_config, image_url, ): conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", "content": [ {"type": "image_url", "image_url": {"url": image_url}}, {"type": "image_url", "image_url": {"url": image_url}}, { "type": "text", "text": "What's in <|image_1|> and how does it compare to " "the other one?", }, ], } ], phi3v_model_config, content_format="string", ) assert conversation == [ { "role": "user", "content": "<|image_2|>\nWhat's in <|image_1|> and how does it compare to " "the other one?", } ] _assert_mm_data_is_image_input(mm_data, 2) _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) def test_parse_chat_messages_multiple_images_across_messages( phi3v_model_config, image_url, ): conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", "content": [ {"type": "image_url", "image_url": {"url": image_url}}, {"type": "text", "text": "What's in this image?"}, ], }, {"role": "assistant", "content": "Some stuff."}, { "role": "user", "content": [ {"type": "image_url", "image_url": {"url": image_url}}, {"type": "text", "text": "What about this one?"}, ], }, ], phi3v_model_config, content_format="string", ) assert conversation == [ {"role": "user", "content": "<|image_1|>\nWhat's in this image?"}, {"role": "assistant", "content": "Some stuff."}, {"role": "user", "content": "<|image_2|>\nWhat about this one?"}, ] _assert_mm_data_is_image_input(mm_data, 2) _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) def test_parse_chat_messages_multiple_images_with_uuids_across_messages( phi3v_model_config, image_url, ): image_uuid = str(hash(image_url)) conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", "content": [ { "type": "image_url", "image_url": {"url": image_url}, "uuid": image_uuid, }, {"type": "text", "text": "What's in this image?"}, ], }, {"role": "assistant", "content": "Some stuff."}, { "role": "user", "content": [ { "type": "image_url", "image_url": {"url": image_url}, "uuid": image_uuid, }, {"type": "text", "text": "What about this one?"}, ], }, ], phi3v_model_config, content_format="string", ) assert conversation == [ {"role": "user", "content": "<|image_1|>\nWhat's in this image?"}, {"role": "assistant", "content": "Some stuff."}, {"role": "user", "content": "<|image_2|>\nWhat about this one?"}, ] _assert_mm_data_is_image_input(mm_data, 2) _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid]) def test_parse_chat_messages_context_text_format( phi3v_model_config, ): conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", "content": [{"type": "text", "text": "What's in this text?"}], }, {"role": "assistant", "content": "Some stuff."}, {"role": "user", "content": "What about this one?"}, ], phi3v_model_config, content_format="openai", ) assert conversation == [ { "role": "user", "content": [{"type": "text", "text": "What's in this text?"}], }, { "role": "assistant", "content": [{"type": "text", "text": "Some stuff."}], }, { "role": "user", "content": [{"type": "text", "text": "What about this one?"}], }, ] assert mm_data is None assert mm_uuids is None def test_parse_chat_messages_openai_format_image_url( phi3v_model_config, image_url, ): content = [ {"type": "image_url", "image_url": {"url": image_url}}, {"type": "text", "text": "What's in the image?"}, ] conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", "content": content, } ], phi3v_model_config, content_format="openai", ) assert conversation == [ { "role": "user", "content": [ {"type": "image"}, {"type": "text", "text": "What's in the image?"}, ], } ] _assert_mm_data_is_image_input(mm_data, 1) _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None]) def test_parse_chat_messages_rejects_too_many_images_in_one_message( phi3v_model_config, image_url, ): with warnings.catch_warnings(): warnings.filterwarnings( "ignore", message="coroutine 'async_get_and_parse_image' was never awaited", ) with pytest.raises(ValueError, match="At most"): parse_chat_messages( [ { "role": "user", "content": [ { "type": "image_url", "image_url": {"url": image_url}, }, { "type": "image_url", "image_url": {"url": image_url}, }, { "type": "image_url", "image_url": {"url": image_url}, }, {"type": "text", "text": "What's in these images?"}, ], } ], phi3v_model_config, content_format="string", ) def test_parse_chat_messages_rejects_too_many_images_across_messages( phi3v_model_config, image_url, ): with warnings.catch_warnings(): warnings.filterwarnings( "ignore", message="coroutine 'async_get_and_parse_image' was never awaited", ) with pytest.raises(ValueError, match="At most"): parse_chat_messages( [ { "role": "user", "content": [ { "type": "image_url", "image_url": {"url": image_url}, }, {"type": "text", "text": "What's in this image?"}, ], }, {"role": "assistant", "content": "Some stuff."}, { "role": "user", "content": [ { "type": "image_url", "image_url": {"url": image_url}, }, { "type": "image_url", "image_url": {"url": image_url}, }, {"type": "text", "text": "What about these two?"}, ], }, ], phi3v_model_config, content_format="string", ) def test_parse_chat_messages_multiple_images_uncommon_input( phi3v_model_config, image_url, ): conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", "content": [ "What's in these images?", {"image_url": image_url}, {"image_url": image_url}, ], } ], phi3v_model_config, content_format="string", ) assert conversation == [ { "role": "user", "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?", } ] _assert_mm_data_is_image_input(mm_data, 2) _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) def test_parse_chat_messages_multiple_images_interleave( phi3v_model_config_mm_interleaved, image_url, ): conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", "content": [ { "type": "text", "text": "I need you to compare this image", }, {"type": "image_url", "image_url": {"url": image_url}}, {"type": "text", "text": "and this one"}, {"type": "image_url", "image_url": {"url": image_url}}, {"type": "text", "text": "Do they have differences?"}, ], } ], phi3v_model_config_mm_interleaved, content_format="string", ) assert conversation == [ { "role": "user", "content": "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n" # noqa: E501 "Do they have differences?", } ] _assert_mm_data_is_image_input(mm_data, 2) _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) @pytest.mark.asyncio async def test_parse_chat_messages_multiple_images_interleave_async( phi3v_model_config_mm_interleaved, image_url, ): conversation, mm_data, mm_uuids = await parse_chat_messages_async( [ { "role": "user", "content": [ { "type": "text", "text": "I need you to compare this image", }, {"type": "image_url", "image_url": {"url": image_url}}, {"type": "text", "text": "and this one"}, {"type": "image_url", "image_url": {"url": image_url}}, {"type": "text", "text": "Do they have differences?"}, ], } ], phi3v_model_config_mm_interleaved, content_format="string", ) assert conversation == [ { "role": "user", "content": "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n" # noqa: E501 "Do they have differences?", } ] _assert_mm_data_is_image_input(mm_data, 2) _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) @pytest.mark.asyncio async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async( phi3v_model_config_mm_interleaved, image_url, ): image_uuid = str(hash(image_url)) conversation, mm_data, mm_uuids = await parse_chat_messages_async( [ { "role": "user", "content": [ { "type": "text", "text": "I need you to compare this image", }, { "type": "image_url", "image_url": {"url": image_url}, "uuid": image_uuid, }, {"type": "text", "text": "and this one"}, { "type": "image_url", "image_url": {"url": image_url}, "uuid": image_uuid, }, {"type": "text", "text": "Do they have differences?"}, ], } ], phi3v_model_config_mm_interleaved, content_format="string", ) assert conversation == [ { "role": "user", "content": "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n" # noqa: E501 "Do they have differences?", } ] _assert_mm_data_is_image_input(mm_data, 2) _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid]) def test_parse_chat_messages_multiple_images_multiple_messages_interleave( phi3v_model_config_mm_interleaved, image_url, ): conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", "content": [ {"type": "text", "text": "What's on this image?"}, {"type": "image_url", "image_url": {"url": image_url}}, {"type": "text", "text": "Be accurate."}, ], }, {"role": "assistant", "content": "Some stuff."}, { "role": "user", "content": [ {"type": "text", "text": "What's on this image?"}, {"type": "image_url", "image_url": {"url": image_url}}, ], }, ], phi3v_model_config_mm_interleaved, content_format="string", ) assert conversation == [ { "role": "user", "content": "What's on this image?\n<|image_1|>\nBe accurate.", }, {"role": "assistant", "content": "Some stuff."}, {"role": "user", "content": "What's on this image?\n<|image_2|>"}, ] _assert_mm_data_is_image_input(mm_data, 2) _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interleave( phi3v_model_config_mm_interleaved, image_url, ): image_uuid = str(hash(image_url)) conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", "content": [ {"type": "text", "text": "What's on this image?"}, { "type": "image_url", "image_url": {"url": image_url}, "uuid": image_uuid, }, {"type": "text", "text": "Be accurate."}, ], }, {"role": "assistant", "content": "Some stuff."}, { "role": "user", "content": [ {"type": "text", "text": "What's on this image?"}, { "type": "image_url", "image_url": {"url": image_url}, "uuid": image_uuid, }, ], }, ], phi3v_model_config_mm_interleaved, content_format="string", ) assert conversation == [ { "role": "user", "content": "What's on this image?\n<|image_1|>\nBe accurate.", }, {"role": "assistant", "content": "Some stuff."}, {"role": "user", "content": "What's on this image?\n<|image_2|>"}, ] _assert_mm_data_is_image_input(mm_data, 2) _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid]) def test_parse_chat_messages_multiple_modals_multiple_messages_interleave( qwen25omni_model_config_mm_interleaved, image_url, video_url, audio_url, ): conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", "content": [ {"type": "text", "text": "What's on this image?"}, {"type": "image_url", "image_url": {"url": image_url}}, {"type": "text", "text": "Now listen to this audio"}, {"type": "audio_url", "audio_url": {"url": audio_url}}, ], }, {"role": "assistant", "content": "Some stuff."}, { "role": "user", "content": [ {"type": "text", "text": "What's on this image?"}, {"type": "image_url", "image_url": {"url": image_url}}, {"type": "text", "text": "And what's in the video?"}, {"type": "video_url", "video_url": {"url": video_url}}, ], }, ], qwen25omni_model_config_mm_interleaved, content_format="string", ) assert conversation == [ { "role": "user", "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>" "\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", }, {"role": "assistant", "content": "Some stuff."}, { "role": "user", "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>" "\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>", }, ] _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1}) _assert_mm_uuids(mm_uuids, 2, modality="image", expected_uuids=[None, None]) _assert_mm_uuids(mm_uuids, 1, modality="video", expected_uuids=[None]) _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None]) def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interleave( qwen25omni_model_config_mm_interleaved, image_url, video_url, audio_url, ): conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", "content": [ {"type": "text", "text": "What's on this image?"}, { "type": "image_url", "image_url": {"url": image_url}, "uuid": "image_123", }, {"type": "text", "text": "Now listen to this audio"}, { "type": "audio_url", "audio_url": {"url": audio_url}, "uuid": "audio_123", }, ], }, {"role": "assistant", "content": "Some stuff."}, { "role": "user", "content": [ {"type": "text", "text": "What's on this image?"}, { "type": "image_url", "image_url": {"url": image_url}, "uuid": "image_123", }, {"type": "text", "text": "And what's in the video?"}, { "type": "video_url", "video_url": {"url": video_url}, "uuid": "video_123", }, ], }, ], qwen25omni_model_config_mm_interleaved, content_format="string", ) assert conversation == [ { "role": "user", "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>" "\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", }, {"role": "assistant", "content": "Some stuff."}, { "role": "user", "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>" "\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>", }, ] _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1}) _assert_mm_uuids( mm_uuids, 2, modality="image", expected_uuids=["image_123", "image_123"] ) _assert_mm_uuids(mm_uuids, 1, modality="video", expected_uuids=["video_123"]) _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=["audio_123"]) def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_messages_interleave( # noqa: E501 qwen25omni_model_config_mm_interleaved, image_url, video_url, audio_url, ): conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", "content": [ {"type": "text", "text": "What's on this image?"}, { "type": "image_url", "image_url": None, "uuid": "image_123", }, {"type": "text", "text": "Now listen to this audio"}, { "type": "audio_url", "audio_url": None, "uuid": "audio_123", }, ], }, {"role": "assistant", "content": "Some stuff."}, { "role": "user", "content": [ {"type": "text", "text": "What's on this image?"}, { "type": "image_url", "image_url": None, "uuid": "image_123", }, {"type": "text", "text": "And what's in the video?"}, { "type": "video_url", "video_url": None, "uuid": "video_123", }, ], }, ], qwen25omni_model_config_mm_interleaved, content_format="string", ) assert conversation == [ { "role": "user", "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>" "\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", }, {"role": "assistant", "content": "Some stuff."}, { "role": "user", "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>" "\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>", }, ] _assert_mm_data_inputs( mm_data, {"image": 2, "video": 1, "audio": 1}, skipped_media_indices={"image": [0, 1], "video": [0], "audio": [0]}, ) _assert_mm_uuids( mm_uuids, 2, modality="image", expected_uuids=["image_123", "image_123"] ) _assert_mm_uuids(mm_uuids, 1, modality="video", expected_uuids=["video_123"]) _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=["audio_123"]) def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_messages_interleave( # noqa: E501 qwen25omni_model_config_mm_interleaved, image_url, video_url, audio_url, ): conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", "content": [ {"type": "text", "text": "What's on this image?"}, { "type": "image_url", "image_url": {"url": image_url}, "uuid": "image_123", }, {"type": "text", "text": "Now listen to this audio"}, {"type": "audio_url", "audio_url": {"url": audio_url}}, ], }, {"role": "assistant", "content": "Some stuff."}, { "role": "user", "content": [ {"type": "text", "text": "What's on this image?"}, {"type": "image_url", "image_url": {"url": image_url}}, {"type": "text", "text": "And what's in the video?"}, { "type": "video_url", "video_url": {"url": video_url}, "uuid": "video_123", }, ], }, ], qwen25omni_model_config_mm_interleaved, content_format="string", ) assert conversation == [ { "role": "user", "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>" "\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", }, {"role": "assistant", "content": "Some stuff."}, { "role": "user", "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>" "\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>", }, ] _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1}) _assert_mm_uuids(mm_uuids, 2, modality="image", expected_uuids=["image_123", None]) _assert_mm_uuids(mm_uuids, 1, modality="video", expected_uuids=["video_123"]) _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None]) def test_parse_chat_messages_multiple_images_interleave_with_placeholders( phi3v_model_config_mm_interleaved, image_url, ): with pytest.raises( ValueError, match=r"Found more '<|image_1|>' placeholders in input prompt " "than actual multimodal data items.", ): parse_chat_messages( [ { "role": "user", "content": [ {"type": "image_url", "image_url": {"url": image_url}}, {"type": "image_url", "image_url": {"url": image_url}}, { "type": "text", "text": "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n" # noqa: E501 "Do they have differences?", }, ], } ], phi3v_model_config_mm_interleaved, content_format="string", ) def test_parse_chat_messages_include_thinking_chunk(mistral_model_config): messages = [ { "role": "system", "content": [ {"type": "text", "text": "You are a helpful assistant."}, { "type": "thinking", "closed": True, "thinking": "Only return the answer when you are confident.", }, ], }, {"role": "user", "content": "What is 2+2?"}, { "role": "assistant", "content": [ {"type": "text", "text": "Let me think about it."}, {"type": "thinking", "closed": True, "thinking": "2+2 = 4"}, { "type": "text", "text": "The answer is 4.", }, ], }, ] conversation_with_thinking, _, _ = parse_chat_messages( messages, mistral_model_config, content_format="openai", ) expected_conversation = [ { "role": "system", "content": [ {"type": "text", "text": "You are a helpful assistant."}, { "type": "text", "text": "Only return the answer when you are confident.", }, ], }, { "role": "user", "content": [{"type": "text", "text": "What is 2+2?"}], }, { "role": "assistant", "content": [ {"type": "text", "text": "Let me think about it."}, {"type": "text", "text": "2+2 = 4"}, {"type": "text", "text": "The answer is 4."}, ], }, ] assert conversation_with_thinking == expected_conversation def test_parse_chat_messages_single_empty_audio_with_uuid( qwen2_audio_model_config, ): audio_uuid = "abcd" conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": "user", "content": [ { "type": "input_audio", "input_audio": {}, "uuid": audio_uuid, }, {"type": "text", "text": "What does the audio say?"}, ], } ], qwen2_audio_model_config, content_format="string", ) assert conversation == [ { "role": "user", "content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the " "audio say?", } ] _assert_mm_data_inputs(mm_data, {"audio": 1}) _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[audio_uuid]) @pytest.mark.asyncio async def test_parse_chat_messages_single_empty_audio_with_uuid_async( qwen2_audio_model_config, ): audio_uuid = "abcd" conversation, mm_data, mm_uuids = await parse_chat_messages_async( [ { "role": "user", "content": [ { "type": "input_audio", "input_audio": {}, "uuid": audio_uuid, }, {"type": "text", "text": "What does the audio say?"}, ], } ], qwen2_audio_model_config, content_format="string", ) assert conversation == [ { "role": "user", "content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the " "audio say?", } ] _assert_mm_data_inputs(mm_data, {"audio": 1}) _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[audio_uuid]) def test_parse_chat_messages_image_vision_chunk( kimi_k2_5_model_config, image_url, ): messages = [ { "role": "user", "content": [ {"type": "text", "text": "Analyze this image."}, { "type": "image_url", "image_url": {"url": image_url}, }, ], } ] conversation, mm_data, mm_uuids = parse_chat_messages( messages, kimi_k2_5_model_config, content_format="string", ) placeholder = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>" expected_conversation = [ { "role": "user", "content": f"{placeholder}\nAnalyze this image.", } ] assert conversation == expected_conversation _assert_mm_data_is_vision_chunk_input(mm_data, 1) _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None], modality="vision_chunk") def test_parse_chat_messages_video_vision_chunk( kimi_k2_5_model_config, video_url, ): messages = [ { "role": "user", "content": [ {"type": "text", "text": "Analyze this video."}, { "type": "video_url", "video_url": {"url": video_url}, }, ], } ] conversation, mm_data, mm_uuids = parse_chat_messages( messages, kimi_k2_5_model_config, content_format="string", ) placeholder = "<|kimi_k25_video_placeholder|>" expected_conversation = [ { "role": "user", "content": f"{placeholder}\nAnalyze this video.", } ] assert conversation == expected_conversation _assert_mm_data_is_vision_chunk_input(mm_data, 1) _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None], modality="vision_chunk") def test_parse_chat_messages_image_vision_chunk_with_uuid( kimi_k2_5_model_config, image_url, ): image_uuid = "image_123" messages = [ { "role": "user", "content": [ {"type": "text", "text": "Analyze this image."}, { "type": "image_url", "image_url": {"url": image_url}, "uuid": image_uuid, }, ], } ] conversation, mm_data, mm_uuids = parse_chat_messages( messages, kimi_k2_5_model_config, content_format="string", ) placeholder = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>" expected_conversation = [ { "role": "user", "content": f"{placeholder}\nAnalyze this image.", } ] assert conversation == expected_conversation _assert_mm_data_is_vision_chunk_input(mm_data, 1) _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid], modality="vision_chunk") def test_parse_chat_messages_video_vision_chunk_with_uuid( kimi_k2_5_model_config, video_url, ): video_uuid = "video_456" messages = [ { "role": "user", "content": [ {"type": "text", "text": "Analyze this video."}, { "type": "video_url", "video_url": {"url": video_url}, "uuid": video_uuid, }, ], } ] conversation, mm_data, mm_uuids = parse_chat_messages( messages, kimi_k2_5_model_config, content_format="string", ) placeholder = "<|kimi_k25_video_placeholder|>" expected_conversation = [ { "role": "user", "content": f"{placeholder}\nAnalyze this video.", } ] assert conversation == expected_conversation _assert_mm_data_is_vision_chunk_input(mm_data, 1) _assert_mm_uuids(mm_uuids, 1, expected_uuids=[video_uuid], modality="vision_chunk") def test_parse_chat_messages_mixed_vision_chunk( kimi_k2_5_model_config, image_url, video_url, ): messages = [ { "role": "user", "content": [ {"type": "text", "text": "Analyze this image and video."}, { "type": "image_url", "image_url": {"url": image_url}, }, { "type": "video_url", "video_url": {"url": video_url}, }, ], } ] conversation, mm_data, mm_uuids = parse_chat_messages( messages, kimi_k2_5_model_config, content_format="string", ) image_placeholder = ( "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>" ) video_placeholder = "<|kimi_k25_video_placeholder|>" expected_conversation = [ { "role": "user", "content": ( f"{image_placeholder}\n{video_placeholder}\n" "Analyze this image and video." ), } ] assert conversation == expected_conversation _assert_mm_data_is_vision_chunk_input(mm_data, 2) _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None], modality="vision_chunk") def test_parse_chat_messages_mixed_vision_chunk_with_uuid( kimi_k2_5_model_config, image_url, video_url, ): image_uuid = "image_123" video_uuid = "video_456" messages = [ { "role": "user", "content": [ {"type": "text", "text": "Analyze this image and video."}, { "type": "image_url", "image_url": {"url": image_url}, "uuid": image_uuid, }, { "type": "video_url", "video_url": {"url": video_url}, "uuid": video_uuid, }, ], } ] conversation, mm_data, mm_uuids = parse_chat_messages( messages, kimi_k2_5_model_config, content_format="string", ) image_placeholder = ( "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>" ) video_placeholder = "<|kimi_k25_video_placeholder|>" expected_conversation = [ { "role": "user", "content": ( f"{image_placeholder}\n{video_placeholder}\n" "Analyze this image and video." ), } ] assert conversation == expected_conversation _assert_mm_data_is_vision_chunk_input(mm_data, 2) _assert_mm_uuids( mm_uuids, 2, expected_uuids=[image_uuid, video_uuid], modality="vision_chunk" ) @pytest.mark.asyncio async def test_parse_chat_messages_mixed_vision_chunk_async( kimi_k2_5_model_config, image_url, video_url, ): messages = [ { "role": "user", "content": [ {"type": "text", "text": "Analyze this image and video."}, { "type": "image_url", "image_url": {"url": image_url}, }, { "type": "video_url", "video_url": {"url": video_url}, }, ], } ] conversation, mm_data, mm_uuids = await parse_chat_messages_async( messages, kimi_k2_5_model_config, content_format="string", ) image_placeholder = ( "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>" ) video_placeholder = "<|kimi_k25_video_placeholder|>" expected_conversation = [ { "role": "user", "content": ( f"{image_placeholder}\n{video_placeholder}\n" "Analyze this image and video." ), } ] assert conversation == expected_conversation _assert_mm_data_is_vision_chunk_input(mm_data, 2) _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None], modality="vision_chunk") @pytest.mark.asyncio async def test_parse_chat_messages_mixed_vision_chunk_with_uuid_async( kimi_k2_5_model_config, image_url, video_url, ): image_uuid = "image_123" video_uuid = "video_456" messages = [ { "role": "user", "content": [ {"type": "text", "text": "Analyze this image and video."}, { "type": "image_url", "image_url": {"url": image_url}, "uuid": image_uuid, }, { "type": "video_url", "video_url": {"url": video_url}, "uuid": video_uuid, }, ], } ] conversation, mm_data, mm_uuids = await parse_chat_messages_async( messages, kimi_k2_5_model_config, content_format="string", ) image_placeholder = ( "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>" ) video_placeholder = "<|kimi_k25_video_placeholder|>" expected_conversation = [ { "role": "user", "content": ( f"{image_placeholder}\n{video_placeholder}\n" "Analyze this image and video." ), } ] assert conversation == expected_conversation _assert_mm_data_is_vision_chunk_input(mm_data, 2) _assert_mm_uuids( mm_uuids, 2, expected_uuids=[image_uuid, video_uuid], modality="vision_chunk" ) @pytest.mark.asyncio async def test_parse_chat_messages_image_vision_chunk_async( kimi_k2_5_model_config, image_url, ): messages = [ { "role": "user", "content": [ {"type": "text", "text": "Analyze this image."}, { "type": "image_url", "image_url": {"url": image_url}, }, ], } ] conversation, mm_data, mm_uuids = await parse_chat_messages_async( messages, kimi_k2_5_model_config, content_format="string", ) placeholder = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>" expected_conversation = [ { "role": "user", "content": f"{placeholder}\nAnalyze this image.", } ] assert conversation == expected_conversation _assert_mm_data_is_vision_chunk_input(mm_data, 1) _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None], modality="vision_chunk") @pytest.mark.asyncio async def test_parse_chat_messages_video_vision_chunk_async( kimi_k2_5_model_config, video_url, ): messages = [ { "role": "user", "content": [ {"type": "text", "text": "Analyze this video."}, { "type": "video_url", "video_url": {"url": video_url}, }, ], } ] conversation, mm_data, mm_uuids = await parse_chat_messages_async( messages, kimi_k2_5_model_config, content_format="string", ) placeholder = "<|kimi_k25_video_placeholder|>" expected_conversation = [ { "role": "user", "content": f"{placeholder}\nAnalyze this video.", } ] assert conversation == expected_conversation _assert_mm_data_is_vision_chunk_input(mm_data, 1) _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None], modality="vision_chunk") @pytest.mark.asyncio async def test_parse_chat_messages_image_vision_chunk_with_uuid_async( kimi_k2_5_model_config, image_url, ): image_uuid = "image_123" messages = [ { "role": "user", "content": [ {"type": "text", "text": "Analyze this image."}, { "type": "image_url", "image_url": {"url": image_url}, "uuid": image_uuid, }, ], } ] conversation, mm_data, mm_uuids = await parse_chat_messages_async( messages, kimi_k2_5_model_config, content_format="string", ) placeholder = "<|media_begin|>image<|media_content|><|media_pad|><|media_end|>" expected_conversation = [ { "role": "user", "content": f"{placeholder}\nAnalyze this image.", } ] assert conversation == expected_conversation _assert_mm_data_is_vision_chunk_input(mm_data, 1) _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid], modality="vision_chunk") @pytest.mark.asyncio async def test_parse_chat_messages_video_vision_chunk_with_uuid_async( kimi_k2_5_model_config, video_url, ): video_uuid = "video_456" messages = [ { "role": "user", "content": [ {"type": "text", "text": "Analyze this video."}, { "type": "video_url", "video_url": {"url": video_url}, "uuid": video_uuid, }, ], } ] conversation, mm_data, mm_uuids = await parse_chat_messages_async( messages, kimi_k2_5_model_config, content_format="string", ) placeholder = "<|kimi_k25_video_placeholder|>" expected_conversation = [ { "role": "user", "content": f"{placeholder}\nAnalyze this video.", } ] assert conversation == expected_conversation _assert_mm_data_is_vision_chunk_input(mm_data, 1) _assert_mm_uuids(mm_uuids, 1, expected_uuids=[video_uuid], modality="vision_chunk")