[Core] Use individual MM items in P0/P1 cache and model runner (#22570)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-08-13 22:18:07 +08:00
committed by GitHub
parent 20d65aa755
commit 19b927e52d
24 changed files with 549 additions and 486 deletions

View File

@@ -5,7 +5,7 @@ import base64
import mimetypes
import os
from tempfile import NamedTemporaryFile, TemporaryDirectory
from typing import TYPE_CHECKING, NamedTuple, Optional
from typing import TYPE_CHECKING, NamedTuple
import numpy as np
import pytest
@@ -19,14 +19,12 @@ from vllm.distributed.parallel_state import (init_distributed_environment,
initialize_model_parallel)
from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.inputs import PlaceholderRange
from vllm.multimodal.utils import (MediaConnector,
merge_and_sort_multimodal_metadata,
from vllm.multimodal.utils import (MediaConnector, argsort_mm_positions,
run_dp_sharded_vision_model)
from vllm.platforms import current_platform
from vllm.utils import get_open_port, update_environment_variables
if TYPE_CHECKING:
from vllm.multimodal.hasher import MultiModalHashDict
from vllm.multimodal.inputs import MultiModalPlaceholderDict
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
@@ -178,19 +176,17 @@ async def test_fetch_video_http(video_url: str, num_frames: int):
assert metadata_sync == metadata_async
# Used for the next two tests related to `merge_and_sort_multimodal_metadata`.
# Used for `test_argsort_mm_positions`.
class TestCase(NamedTuple):
mm_positions: "MultiModalPlaceholderDict"
mm_hashes: Optional["MultiModalHashDict"]
expected_modalities: list[str]
expected_ranges: list[PlaceholderRange]
expected_hashes: Optional[list[str]]
expected_modality_idxs: list[tuple[str, int]]
def test_merge_and_sort_multimodal_metadata():
def test_argsort_mm_positions():
test_cases = [
# Single modality should return result as is but flattened
# Single modality
## Internally sorted
TestCase(
mm_positions={
"image": [
@@ -198,34 +194,27 @@ def test_merge_and_sort_multimodal_metadata():
PlaceholderRange(offset=3, length=2),
]
},
mm_hashes={"image": ["hash1", "hash2"]},
expected_modalities=["image", "image"],
expected_ranges=[
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=3, length=2),
expected_modality_idxs=[
("image", 0),
("image", 1),
],
expected_hashes=["hash1", "hash2"],
),
# Single modality without hashes return None for mm hash.
## Internally unsorted
TestCase(
mm_positions={
"image": [
PlaceholderRange(offset=3, length=2),
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=2, length=2),
]
},
mm_hashes=None,
expected_modalities=["image", "image"],
expected_ranges=[
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=2, length=2),
expected_modality_idxs=[
("image", 1),
("image", 0),
],
expected_hashes=None,
),
# Multiple modalities with hashes should return sorted modalities
# and flattened ranges and hashes.
# Two modalities
## Internally sorted
TestCase(
mm_positions={
"image": [
@@ -237,47 +226,54 @@ def test_merge_and_sort_multimodal_metadata():
PlaceholderRange(offset=2, length=3),
]
},
mm_hashes={
"image": ["image_hash1", "image_hash2"],
"audio": ["audio_hash1", "audio_hash2"],
},
expected_modalities=["audio", "audio", "image", "image"],
expected_ranges=[
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=2, length=3),
PlaceholderRange(offset=7, length=4),
PlaceholderRange(offset=11, length=5),
],
expected_hashes=[
"audio_hash1", "audio_hash2", "image_hash1", "image_hash2"
expected_modality_idxs=[
("audio", 0),
("audio", 1),
("image", 0),
("image", 1),
],
),
# Multiple modalities without hashes should return sorted modalities
# and flattened ranges and None.
## Interleaved, internally sorted
TestCase(
mm_positions={
"image": [
PlaceholderRange(offset=7, length=4),
PlaceholderRange(offset=11, length=5),
PlaceholderRange(offset=0, length=4),
PlaceholderRange(offset=8, length=2),
],
"audio": [
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=2, length=3),
PlaceholderRange(offset=5, length=2),
PlaceholderRange(offset=11, length=4),
]
},
mm_hashes=None,
expected_modalities=["audio", "audio", "image", "image"],
expected_ranges=[
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=2, length=3),
PlaceholderRange(offset=7, length=4),
PlaceholderRange(offset=11, length=5),
expected_modality_idxs=[
("image", 0),
("audio", 0),
("image", 1),
("audio", 1),
],
),
## Interleaved, internally unsorted
TestCase(
mm_positions={
"image": [
PlaceholderRange(offset=8, length=2),
PlaceholderRange(offset=0, length=4),
],
"audio": [
PlaceholderRange(offset=11, length=4),
PlaceholderRange(offset=5, length=2),
]
},
expected_modality_idxs=[
("image", 1),
("audio", 1),
("image", 0),
("audio", 0),
],
expected_hashes=None,
),
# Three modalities
## Internally sorted
TestCase(
mm_positions={
"image": [
@@ -293,72 +289,16 @@ def test_merge_and_sort_multimodal_metadata():
PlaceholderRange(offset=12, length=6),
]
},
mm_hashes={
"image": ["image_hash1", "image_hash2"],
"audio": ["audio_hash1"],
"video": ["video_hash1", "video_hash2", "video_hash3"]
},
expected_modalities=[
"audio", "video", "video", "video", "image", "image"
],
expected_ranges=[
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=3, length=4),
PlaceholderRange(offset=7, length=5),
PlaceholderRange(offset=12, length=6),
PlaceholderRange(offset=15, length=7),
PlaceholderRange(offset=22, length=8),
],
expected_hashes=[
"audio_hash1", "video_hash1", "video_hash2", "video_hash3",
"image_hash1", "image_hash2"
expected_modality_idxs=[
("audio", 0),
("video", 0),
("video", 1),
("video", 2),
("image", 0),
("image", 1),
],
),
]
for (mm_positions, mm_hashes, expected_modalities, expected_ranges,
expected_hashes) in test_cases:
modalities, ranges, hashes = merge_and_sort_multimodal_metadata(
mm_positions, mm_hashes)
assert modalities == expected_modalities
assert ranges == expected_ranges
assert hashes == expected_hashes
def test_merge_and_sort_multimodal_metadata_with_interleaving():
test_cases = [
# <image> <audio> <image> <audio>
TestCase(
mm_positions={
"image": [
PlaceholderRange(offset=0, length=4),
PlaceholderRange(offset=8, length=2),
],
"audio": [
PlaceholderRange(offset=5, length=2),
PlaceholderRange(offset=11, length=4),
]
},
mm_hashes={
"image": ["image_hash1", "image_hash2"],
"audio": ["audio_hash1", "audio_hash2"],
},
expected_modalities=["image", "audio", "image", "audio"],
expected_ranges=[
PlaceholderRange(offset=0, length=4),
PlaceholderRange(offset=5, length=2),
PlaceholderRange(offset=8, length=2),
PlaceholderRange(offset=11, length=4),
],
expected_hashes=[
"image_hash1", "audio_hash1", "image_hash2", "audio_hash2"
],
),
# <image> <image> <audio> <video> <image>
## Interleaved, internally sorted
TestCase(
mm_positions={
"image": [
@@ -373,58 +313,43 @@ def test_merge_and_sort_multimodal_metadata_with_interleaving():
PlaceholderRange(offset=8, length=5),
]
},
mm_hashes=None,
expected_modalities=["image", "image", "audio", "video", "image"],
expected_ranges=[
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=2, length=3),
PlaceholderRange(offset=5, length=2),
PlaceholderRange(offset=8, length=5),
PlaceholderRange(offset=20, length=4),
expected_modality_idxs=[
("image", 0),
("image", 1),
("audio", 0),
("video", 0),
("image", 2),
],
expected_hashes=None,
),
# <image> <audio> <video> <image> with hashes
## Interleaved, internally sunorted
TestCase(
mm_positions={
"image": [
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=18, length=4),
PlaceholderRange(offset=20, length=4),
PlaceholderRange(offset=2, length=3),
],
"audio": [
PlaceholderRange(offset=6, length=2),
PlaceholderRange(offset=5, length=2),
],
"video": [
PlaceholderRange(offset=10, length=5),
PlaceholderRange(offset=8, length=5),
]
},
mm_hashes={
"image": ["image_hash1", "image_hash2"],
"audio": ["audio_hash1"],
"video": ["video_hash1"],
},
expected_modalities=["image", "audio", "video", "image"],
expected_ranges=[
PlaceholderRange(offset=0, length=2),
PlaceholderRange(offset=6, length=2),
PlaceholderRange(offset=10, length=5),
PlaceholderRange(offset=18, length=4),
],
expected_hashes=[
"image_hash1", "audio_hash1", "video_hash1", "image_hash2"
expected_modality_idxs=[
("image", 0),
("image", 2),
("audio", 0),
("video", 0),
("image", 1),
],
),
]
for (mm_positions, mm_hashes, expected_modalities, expected_ranges,
expected_hashes) in test_cases:
modalities, ranges, hashes = merge_and_sort_multimodal_metadata(
mm_positions, mm_hashes)
for mm_positions, expected_modality_idxs in test_cases:
modality_idxs = argsort_mm_positions(mm_positions)
assert modalities == expected_modalities
assert ranges == expected_ranges
assert hashes == expected_hashes
assert modality_idxs == expected_modality_idxs
class SimpleLinearModel(torch.nn.Module):