[Core] Use individual MM items in P0/P1 cache and model runner (#22570)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-13 22:18:07 +08:00
parent 20d65aa755
commit 19b927e52d
24 changed files with 549 additions and 486 deletions
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -5,7 +5,7 @@ import base64
 import mimetypes
 import os
 from tempfile import NamedTemporaryFile, TemporaryDirectory
-from typing import TYPE_CHECKING, NamedTuple, Optional
+from typing import TYPE_CHECKING, NamedTuple

 import numpy as np
 import pytest
@@ -19,14 +19,12 @@ from vllm.distributed.parallel_state import (init_distributed_environment,
                                             initialize_model_parallel)
 from vllm.multimodal.image import convert_image_mode
 from vllm.multimodal.inputs import PlaceholderRange
-from vllm.multimodal.utils import (MediaConnector,
-                                   merge_and_sort_multimodal_metadata,
+from vllm.multimodal.utils import (MediaConnector, argsort_mm_positions,
                                   run_dp_sharded_vision_model)
 from vllm.platforms import current_platform
 from vllm.utils import get_open_port, update_environment_variables

 if TYPE_CHECKING:
-    from vllm.multimodal.hasher import MultiModalHashDict
    from vllm.multimodal.inputs import MultiModalPlaceholderDict

 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
@@ -178,19 +176,17 @@ async def test_fetch_video_http(video_url: str, num_frames: int):
    assert metadata_sync == metadata_async


-# Used for the next two tests related to `merge_and_sort_multimodal_metadata`.
+# Used for `test_argsort_mm_positions`.
 class TestCase(NamedTuple):
    mm_positions: "MultiModalPlaceholderDict"
-    mm_hashes: Optional["MultiModalHashDict"]
-    expected_modalities: list[str]
-    expected_ranges: list[PlaceholderRange]
-    expected_hashes: Optional[list[str]]
+    expected_modality_idxs: list[tuple[str, int]]


-def test_merge_and_sort_multimodal_metadata():
+def test_argsort_mm_positions():

    test_cases = [
-        # Single modality should return result as is but flattened
+        # Single modality
+        ## Internally sorted
        TestCase(
            mm_positions={
                "image": [
@@ -198,34 +194,27 @@ def test_merge_and_sort_multimodal_metadata():
                    PlaceholderRange(offset=3, length=2),
                ]
            },
-            mm_hashes={"image": ["hash1", "hash2"]},
-            expected_modalities=["image", "image"],
-            expected_ranges=[
-                PlaceholderRange(offset=0, length=2),
-                PlaceholderRange(offset=3, length=2),
+            expected_modality_idxs=[
+                ("image", 0),
+                ("image", 1),
            ],
-            expected_hashes=["hash1", "hash2"],
        ),
-
-        # Single modality without hashes return None for mm hash.
+        ## Internally unsorted
        TestCase(
            mm_positions={
                "image": [
+                    PlaceholderRange(offset=3, length=2),
                    PlaceholderRange(offset=0, length=2),
-                    PlaceholderRange(offset=2, length=2),
                ]
            },
-            mm_hashes=None,
-            expected_modalities=["image", "image"],
-            expected_ranges=[
-                PlaceholderRange(offset=0, length=2),
-                PlaceholderRange(offset=2, length=2),
+            expected_modality_idxs=[
+                ("image", 1),
+                ("image", 0),
            ],
-            expected_hashes=None,
        ),

-        # Multiple modalities with hashes should return sorted modalities
-        # and flattened ranges and hashes.
+        # Two modalities
+        ## Internally sorted
        TestCase(
            mm_positions={
                "image": [
@@ -237,47 +226,54 @@ def test_merge_and_sort_multimodal_metadata():
                    PlaceholderRange(offset=2, length=3),
                ]
            },
-            mm_hashes={
-                "image": ["image_hash1", "image_hash2"],
-                "audio": ["audio_hash1", "audio_hash2"],
-            },
-            expected_modalities=["audio", "audio", "image", "image"],
-            expected_ranges=[
-                PlaceholderRange(offset=0, length=2),
-                PlaceholderRange(offset=2, length=3),
-                PlaceholderRange(offset=7, length=4),
-                PlaceholderRange(offset=11, length=5),
-            ],
-            expected_hashes=[
-                "audio_hash1", "audio_hash2", "image_hash1", "image_hash2"
+            expected_modality_idxs=[
+                ("audio", 0),
+                ("audio", 1),
+                ("image", 0),
+                ("image", 1),
            ],
        ),
-
-        # Multiple modalities without hashes should return sorted modalities
-        # and flattened ranges and None.
+        ## Interleaved, internally sorted
        TestCase(
            mm_positions={
                "image": [
-                    PlaceholderRange(offset=7, length=4),
-                    PlaceholderRange(offset=11, length=5),
+                    PlaceholderRange(offset=0, length=4),
+                    PlaceholderRange(offset=8, length=2),
                ],
                "audio": [
-                    PlaceholderRange(offset=0, length=2),
-                    PlaceholderRange(offset=2, length=3),
+                    PlaceholderRange(offset=5, length=2),
+                    PlaceholderRange(offset=11, length=4),
                ]
            },
-            mm_hashes=None,
-            expected_modalities=["audio", "audio", "image", "image"],
-            expected_ranges=[
-                PlaceholderRange(offset=0, length=2),
-                PlaceholderRange(offset=2, length=3),
-                PlaceholderRange(offset=7, length=4),
-                PlaceholderRange(offset=11, length=5),
+            expected_modality_idxs=[
+                ("image", 0),
+                ("audio", 0),
+                ("image", 1),
+                ("audio", 1),
+            ],
+        ),
+        ## Interleaved, internally unsorted
+        TestCase(
+            mm_positions={
+                "image": [
+                    PlaceholderRange(offset=8, length=2),
+                    PlaceholderRange(offset=0, length=4),
+                ],
+                "audio": [
+                    PlaceholderRange(offset=11, length=4),
+                    PlaceholderRange(offset=5, length=2),
+                ]
+            },
+            expected_modality_idxs=[
+                ("image", 1),
+                ("audio", 1),
+                ("image", 0),
+                ("audio", 0),
            ],
-            expected_hashes=None,
        ),

        # Three modalities
+        ## Internally sorted
        TestCase(
            mm_positions={
                "image": [
@@ -293,72 +289,16 @@ def test_merge_and_sort_multimodal_metadata():
                    PlaceholderRange(offset=12, length=6),
                ]
            },
-            mm_hashes={
-                "image": ["image_hash1", "image_hash2"],
-                "audio": ["audio_hash1"],
-                "video": ["video_hash1", "video_hash2", "video_hash3"]
-            },
-            expected_modalities=[
-                "audio", "video", "video", "video", "image", "image"
-            ],
-            expected_ranges=[
-                PlaceholderRange(offset=0, length=2),
-                PlaceholderRange(offset=3, length=4),
-                PlaceholderRange(offset=7, length=5),
-                PlaceholderRange(offset=12, length=6),
-                PlaceholderRange(offset=15, length=7),
-                PlaceholderRange(offset=22, length=8),
-            ],
-            expected_hashes=[
-                "audio_hash1", "video_hash1", "video_hash2", "video_hash3",
-                "image_hash1", "image_hash2"
+            expected_modality_idxs=[
+                ("audio", 0),
+                ("video", 0),
+                ("video", 1),
+                ("video", 2),
+                ("image", 0),
+                ("image", 1),
            ],
        ),
-    ]
-
-    for (mm_positions, mm_hashes, expected_modalities, expected_ranges,
-         expected_hashes) in test_cases:
-        modalities, ranges, hashes = merge_and_sort_multimodal_metadata(
-            mm_positions, mm_hashes)
-
-        assert modalities == expected_modalities
-        assert ranges == expected_ranges
-        assert hashes == expected_hashes
-
-
-def test_merge_and_sort_multimodal_metadata_with_interleaving():
-
-    test_cases = [
-
-        # <image> <audio> <image> <audio>
-        TestCase(
-            mm_positions={
-                "image": [
-                    PlaceholderRange(offset=0, length=4),
-                    PlaceholderRange(offset=8, length=2),
-                ],
-                "audio": [
-                    PlaceholderRange(offset=5, length=2),
-                    PlaceholderRange(offset=11, length=4),
-                ]
-            },
-            mm_hashes={
-                "image": ["image_hash1", "image_hash2"],
-                "audio": ["audio_hash1", "audio_hash2"],
-            },
-            expected_modalities=["image", "audio", "image", "audio"],
-            expected_ranges=[
-                PlaceholderRange(offset=0, length=4),
-                PlaceholderRange(offset=5, length=2),
-                PlaceholderRange(offset=8, length=2),
-                PlaceholderRange(offset=11, length=4),
-            ],
-            expected_hashes=[
-                "image_hash1", "audio_hash1", "image_hash2", "audio_hash2"
-            ],
-        ),
-
-        # <image> <image> <audio> <video> <image>
+        ## Interleaved, internally sorted
        TestCase(
            mm_positions={
                "image": [
@@ -373,58 +313,43 @@ def test_merge_and_sort_multimodal_metadata_with_interleaving():
                    PlaceholderRange(offset=8, length=5),
                ]
            },
-            mm_hashes=None,
-            expected_modalities=["image", "image", "audio", "video", "image"],
-            expected_ranges=[
-                PlaceholderRange(offset=0, length=2),
-                PlaceholderRange(offset=2, length=3),
-                PlaceholderRange(offset=5, length=2),
-                PlaceholderRange(offset=8, length=5),
-                PlaceholderRange(offset=20, length=4),
+            expected_modality_idxs=[
+                ("image", 0),
+                ("image", 1),
+                ("audio", 0),
+                ("video", 0),
+                ("image", 2),
            ],
-            expected_hashes=None,
        ),
-
-        # <image> <audio> <video> <image> with hashes
+        ## Interleaved, internally sunorted
        TestCase(
            mm_positions={
                "image": [
                    PlaceholderRange(offset=0, length=2),
-                    PlaceholderRange(offset=18, length=4),
+                    PlaceholderRange(offset=20, length=4),
+                    PlaceholderRange(offset=2, length=3),
                ],
                "audio": [
-                    PlaceholderRange(offset=6, length=2),
+                    PlaceholderRange(offset=5, length=2),
                ],
                "video": [
-                    PlaceholderRange(offset=10, length=5),
+                    PlaceholderRange(offset=8, length=5),
                ]
            },
-            mm_hashes={
-                "image": ["image_hash1", "image_hash2"],
-                "audio": ["audio_hash1"],
-                "video": ["video_hash1"],
-            },
-            expected_modalities=["image", "audio", "video", "image"],
-            expected_ranges=[
-                PlaceholderRange(offset=0, length=2),
-                PlaceholderRange(offset=6, length=2),
-                PlaceholderRange(offset=10, length=5),
-                PlaceholderRange(offset=18, length=4),
-            ],
-            expected_hashes=[
-                "image_hash1", "audio_hash1", "video_hash1", "image_hash2"
+            expected_modality_idxs=[
+                ("image", 0),
+                ("image", 2),
+                ("audio", 0),
+                ("video", 0),
+                ("image", 1),
            ],
        ),
    ]

-    for (mm_positions, mm_hashes, expected_modalities, expected_ranges,
-         expected_hashes) in test_cases:
-        modalities, ranges, hashes = merge_and_sort_multimodal_metadata(
-            mm_positions, mm_hashes)
+    for mm_positions, expected_modality_idxs in test_cases:
+        modality_idxs = argsort_mm_positions(mm_positions)

-        assert modalities == expected_modalities
-        assert ranges == expected_ranges
-        assert hashes == expected_hashes
+        assert modality_idxs == expected_modality_idxs


 class SimpleLinearModel(torch.nn.Module):