[Frontend][Multimodal] Allow skipping media data when UUIDs are provided. (#23950)

Signed-off-by: Roger Wang <hey@rogerw.io>
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
Signed-off-by: Roger Wang <hey@rogerw.me>
Co-authored-by: Roger Wang <hey@rogerw.io>
Co-authored-by: Roger Wang <hey@rogerw.me>
2025-09-12 19:16:06 -07:00
parent 4fdd6f5cbf
commit 7f2ea7074e
9 changed files with 970 additions and 96 deletions
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -79,6 +79,28 @@ def phi3v_tokenizer():
    )


+@pytest.fixture(scope="function")
+def qwen2_audio_model_config():
+    return ModelConfig(
+        QWEN2AUDIO_MODEL_ID,
+        runner="generate",
+        trust_remote_code=True,
+        limit_mm_per_prompt={
+            "audio": 1,
+        },
+    )
+
+
+@pytest.fixture(scope="module")
+def qwen2_audio_tokenizer():
+    return TokenizerGroup(
+        tokenizer_id=QWEN2AUDIO_MODEL_ID,
+        enable_lora=False,
+        max_num_seqs=5,
+        max_input_length=None,
+    )
+
+
@pytest.fixture(scope="function")
 def qwen25omni_model_config_mm_interleaved():
    return ModelConfig(
@@ -169,6 +191,7 @@ def audio_url():
 def _assert_mm_data_is_image_input(
    mm_data: Optional[MultiModalDataDict],
    image_count: int,
+    skipped_image_indices: Optional[list] = None,
 ) -> None:
    assert mm_data is not None
    assert set(mm_data.keys()) == {"image"}
@@ -177,6 +200,9 @@ def _assert_mm_data_is_image_input(
    assert image_data is not None

    assert isinstance(image_data, list) and len(image_data) == image_count
+    if skipped_image_indices is not None:
+        for i in skipped_image_indices:
+            assert image_data[i] is None


 def _assert_mm_uuids(
@@ -205,8 +231,10 @@ MultiModalDataCounts = Mapping[ModalityType, int]


 def _assert_mm_data_inputs(
-    mm_data: Optional[MultiModalDataDict],
-    data_count: MultiModalDataCounts,
+        mm_data: Optional[MultiModalDataDict],
+        data_count: MultiModalDataCounts,
+        skipped_media_indices: Optional[dict[
+            str, list]] = None,  # modality -> list[int]
 ) -> None:
    assert mm_data is not None
    assert set(data_count.keys()) == (set(mm_data.keys()))
@@ -216,6 +244,13 @@ def _assert_mm_data_inputs(
        assert modality_data is not None
        assert isinstance(modality_data, list) and len(modality_data) == n

+        if skipped_media_indices is not None:
+            skipped_media_indices_for_modality = skipped_media_indices.get(
+                modality)
+            assert skipped_media_indices_for_modality is not None
+            for i in skipped_media_indices_for_modality:
+                assert modality_data[i] is None
+

 def test_parse_chat_messages_single_image(
    phi3v_model_config,
@@ -289,6 +324,41 @@ def test_parse_chat_messages_single_image_with_uuid(
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])


+def test_parse_chat_messages_single_empty_image_with_uuid(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    image_uuid = str(hash(image_url))
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": None,
+                    "uuid": image_uuid,
+                },
+                {
+                    "type": "text",
+                    "text": "What's in the image?"
+                },
+            ],
+        }],
+        phi3v_model_config,
+        phi3v_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [{
+        "role": "user",
+        "content": "<|image_1|>\nWhat's in the image?"
+    }]
+    _assert_mm_data_is_image_input(mm_data, 1, skipped_image_indices=[0])
+    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])
+
+
 def test_parse_chat_messages_single_image_with_bad_uuid_format(
    phi3v_model_config,
    phi3v_tokenizer,
@@ -375,6 +445,96 @@ def test_parse_chat_messages_multiple_images_with_uuids(
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])


+def test_parse_chat_messages_multiple_empty_images_with_uuids(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    image_uuid1 = "my_uuid_1"
+    image_uuid2 = "my_uuid_2"
+
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": None,
+                    "uuid": image_uuid1,
+                },
+                {
+                    "type": "image_url",
+                    "image_url": None,
+                    "uuid": image_uuid2,
+                },
+                {
+                    "type": "text",
+                    "text": "What's in the image?"
+                },
+            ],
+        }],
+        phi3v_model_config,
+        phi3v_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [{
+        "role":
+        "user",
+        "content":
+        "<|image_1|>\n<|image_2|>\nWhat's in the image?",
+    }]
+    _assert_mm_data_is_image_input(mm_data, 2, skipped_image_indices=[0, 1])
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])
+
+
+def test_parse_chat_messages_mixed_empty_images_with_uuids(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    image_uuid1 = "my_uuid_1"
+    image_uuid2 = "my_uuid_2"
+
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url,
+                    },
+                    "uuid": image_uuid1,
+                },
+                {
+                    "type": "image_url",
+                    "image_url": None,
+                    "uuid": image_uuid2,
+                },
+                {
+                    "type": "text",
+                    "text": "What's in the image?"
+                },
+            ],
+        }],
+        phi3v_model_config,
+        phi3v_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [{
+        "role":
+        "user",
+        "content":
+        "<|image_1|>\n<|image_2|>\nWhat's in the image?",
+    }]
+    _assert_mm_data_is_image_input(mm_data, 2, skipped_image_indices=[1])
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])
+
+
@pytest.mark.asyncio
 async def test_parse_chat_messages_single_image_with_uuid_async(
    phi3v_model_config,
@@ -413,6 +573,44 @@ async def test_parse_chat_messages_single_image_with_uuid_async(
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])


+@pytest.mark.asyncio
+async def test_parse_chat_messages_empty_image_with_uuid_async(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    image_uuid = str(hash(image_url))
+    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
+        [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": None,
+                    "uuid": image_uuid,
+                },
+                {
+                    "type": "text",
+                    "text": "What's in the image?"
+                },
+            ],
+        }],
+        phi3v_model_config,
+        phi3v_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [{
+        "role": "user",
+        "content": "<|image_1|>\nWhat's in the image?"
+    }]
+    _assert_mm_data_is_image_input(await mm_future,
+                                   1,
+                                   skipped_image_indices=[0])
+    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid])
+
+
@pytest.mark.asyncio
 async def test_parse_chat_messages_multiple_images_with_uuids_async(
    phi3v_model_config,
@@ -460,6 +658,53 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async(
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])


+@pytest.mark.asyncio
+async def test_parse_chat_messages_multiple_empty_images_with_uuids_async(
+    phi3v_model_config,
+    phi3v_tokenizer,
+    image_url,
+):
+    image_uuid1 = "my_uuid_1"
+    image_uuid2 = "my_uuid_2"
+
+    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
+        [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": None,
+                    "uuid": image_uuid1,
+                },
+                {
+                    "type": "image_pil",
+                    "image_pil": None,
+                    "uuid": image_uuid2,
+                },
+                {
+                    "type": "text",
+                    "text": "What's in these images?"
+                },
+            ],
+        }],
+        phi3v_model_config,
+        phi3v_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [{
+        "role":
+        "user",
+        "content":
+        "<|image_1|>\n<|image_2|>\nWhat's in these images?",
+    }]
+    _assert_mm_data_is_image_input(await mm_future,
+                                   2,
+                                   skipped_image_indices=[0, 1])
+    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2])
+
+
@pytest.mark.asyncio
 async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
    phi3v_model_config,
@@ -653,6 +898,114 @@ def test_parse_chat_messages_multiple_images(
    _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])


+def test_parse_chat_messages_empty_pil_image_with_uuid(
+    phi3v_model_config,
+    phi3v_tokenizer,
+):
+    uuid = "abcd"
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "image_pil",
+                    "image_pil": None,
+                    "uuid": uuid
+                },
+                {
+                    "type": "text",
+                    "text": "What's in this image?"
+                },
+            ],
+        }],
+        phi3v_model_config,
+        phi3v_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [{
+        "role": "user",
+        "content": "<|image_1|>\nWhat's in this image?",
+    }]
+    _assert_mm_data_is_image_input(mm_data, 1, skipped_image_indices=[0])
+    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid])
+
+
+def test_parse_chat_messages_empty_image_embeds_with_uuid(
+    phi3v_model_config,
+    phi3v_tokenizer,
+):
+    uuid = "abcd"
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "image_embeds",
+                    "image_embeds": None,
+                    "uuid": uuid
+                },
+                {
+                    "type": "text",
+                    "text": "What's in this image?"
+                },
+            ],
+        }],
+        phi3v_model_config,
+        phi3v_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [{
+        "role": "user",
+        "content": "<|image_1|>\nWhat's in this image?",
+    }]
+    assert mm_data is not None
+    assert "image" in mm_data
+    assert mm_data["image"] is None
+    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid])
+
+
+@pytest.mark.asyncio
+async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
+    phi3v_model_config,
+    phi3v_tokenizer,
+):
+    uuid = "abcd"
+    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
+        [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "image_embeds",
+                    "image_embeds": None,
+                    "uuid": uuid
+                },
+                {
+                    "type": "text",
+                    "text": "What's in this image?"
+                },
+            ],
+        }],
+        phi3v_model_config,
+        phi3v_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [{
+        "role": "user",
+        "content": "<|image_1|>\nWhat's in this image?",
+    }]
+    mm_data = await mm_future
+    assert mm_data is not None
+    assert "image" in mm_data
+    assert mm_data["image"] is None
+    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid])
+
+
@pytest.mark.asyncio
 async def test_parse_chat_messages_multiple_images_async(
    phi3v_model_config,
@@ -1636,6 +1989,118 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl
                     expected_uuids=["audio_123"])


+def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_messages_interleave(  # noqa: E501
+    qwen25omni_model_config_mm_interleaved,
+    qwen25omni_tokenizer,
+    image_url,
+    video_url,
+    audio_url,
+):
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [
+            {
+                "role":
+                "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What's on this image?"
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": None,
+                        "uuid": "image_123",
+                    },
+                    {
+                        "type": "text",
+                        "text": "Now listen to this audio"
+                    },
+                    {
+                        "type": "audio_url",
+                        "audio_url": None,
+                        "uuid": "audio_123",
+                    },
+                ],
+            },
+            {
+                "role": "assistant",
+                "content": "Some stuff."
+            },
+            {
+                "role":
+                "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What's on this image?"
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": None,
+                        "uuid": "image_123",
+                    },
+                    {
+                        "type": "text",
+                        "text": "And what's in the video?"
+                    },
+                    {
+                        "type": "video_url",
+                        "video_url": None,
+                        "uuid": "video_123",
+                    },
+                ],
+            },
+        ],
+        qwen25omni_model_config_mm_interleaved,
+        qwen25omni_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [
+        {
+            "role":
+            "user",
+            "content":
+            "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
+            "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",  # noqa: E501
+        },
+        {
+            "role": "assistant",
+            "content": "Some stuff."
+        },
+        {
+            "role":
+            "user",
+            "content":
+            "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n"
+            "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
+        },
+    ]
+
+    _assert_mm_data_inputs(mm_data, {
+        "image": 2,
+        "video": 1,
+        "audio": 1
+    },
+                           skipped_media_indices={
+                               "image": [0, 1],
+                               "video": [0],
+                               "audio": [0]
+                           })
+    _assert_mm_uuids(mm_uuids,
+                     2,
+                     modality="image",
+                     expected_uuids=["image_123", "image_123"])
+    _assert_mm_uuids(mm_uuids,
+                     1,
+                     modality="video",
+                     expected_uuids=["video_123"])
+    _assert_mm_uuids(mm_uuids,
+                     1,
+                     modality="audio",
+                     expected_uuids=["audio_123"])
+
+
 def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_messages_interleave(  # noqa: E501
    qwen25omni_model_config_mm_interleaved,
    qwen25omni_tokenizer,
@@ -2355,3 +2820,82 @@ def test_apply_mistral_chat_template_thinking_chunk():
        r"[INST]Thanks, what is 3+3?[/INST]")

    assert string_tokens == expected_tokens
+
+
+def test_parse_chat_messages_single_empty_audio_with_uuid(
+    qwen2_audio_model_config,
+    qwen2_audio_tokenizer,
+):
+    audio_uuid = "abcd"
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "input_audio",
+                    "input_audio": {},
+                    "uuid": audio_uuid,
+                },
+                {
+                    "type": "text",
+                    "text": "What does the audio say?"
+                },
+            ],
+        }],
+        qwen2_audio_model_config,
+        qwen2_audio_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [{
+        "role":
+        "user",
+        "content":
+        "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?"
+    }]
+    _assert_mm_data_inputs(mm_data, {"audio": 1})
+    _assert_mm_uuids(mm_uuids,
+                     1,
+                     modality="audio",
+                     expected_uuids=[audio_uuid])
+
+
+@pytest.mark.asyncio
+async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
+    qwen2_audio_model_config,
+    qwen2_audio_tokenizer,
+):
+    audio_uuid = "abcd"
+    conversation, mm_future, mm_uuids = parse_chat_messages_futures(
+        [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "input_audio",
+                    "input_audio": {},
+                    "uuid": audio_uuid,
+                },
+                {
+                    "type": "text",
+                    "text": "What does the audio say?"
+                },
+            ],
+        }],
+        qwen2_audio_model_config,
+        qwen2_audio_tokenizer,
+        content_format="string",
+    )
+
+    assert conversation == [{
+        "role":
+        "user",
+        "content":
+        "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?"
+    }]
+    _assert_mm_data_inputs(await mm_future, {"audio": 1})
+    _assert_mm_uuids(mm_uuids,
+                     1,
+                     modality="audio",
+                     expected_uuids=[audio_uuid])