[Refactor] Use data parser for matching data items to multi-modal UUIDs (#32955)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2026-01-26 15:00:28 +08:00
committed by GitHub
parent ee484b3f4b
commit 11b556878b
14 changed files with 701 additions and 604 deletions

View File

@@ -5,14 +5,8 @@ import pytest
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.config import (
CacheConfig,
DeviceConfig,
ModelConfig,
MultiModalConfig,
VllmConfig,
)
from vllm.multimodal import MultiModalRegistry, MultiModalUUIDDict
from vllm.config import CacheConfig, ModelConfig, VllmConfig
from vllm.multimodal import MultiModalUUIDDict
from vllm.sampling_params import SamplingParams
from vllm.v1.engine.input_processor import InputProcessor
@@ -21,55 +15,26 @@ stop_pil_image = ImageAsset("stop_sign").pil_image
baby_reading_np_ndarrays = VideoAsset("baby_reading").np_ndarrays
def _mock_input_processor(
monkeypatch, *, mm_cache_gb: float = 4.0, enable_prefix_caching: bool = True
def _build_input_processor(
*, mm_cache_gb: float = 4.0, enable_prefix_caching: bool = True
) -> InputProcessor:
"""
Create an InputProcessor instance with minimal configuration suitable for
unit tests without accessing external resources.
"""
monkeypatch.setattr(
ModelConfig, "try_get_generation_config", lambda self: {}, raising=True
)
monkeypatch.setattr(
ModelConfig, "__post_init__", lambda self, *args: None, raising=True
)
monkeypatch.setattr(
ModelConfig,
"verify_with_parallel_config",
lambda self, parallel_config: None,
raising=True,
)
monkeypatch.setattr(
MultiModalRegistry,
"processor_cache_from_config",
lambda self, vllm_config: None,
raising=True,
)
monkeypatch.setattr(VllmConfig, "__post_init__", lambda self: None, raising=True)
model_config = ModelConfig(
tokenizer="dummy",
model="Qwen/Qwen2.5-VL-3B-Instruct",
skip_tokenizer_init=True,
max_model_len=128,
mm_processor_cache_gb=mm_cache_gb,
generation_config="vllm",
)
model_config.runner_type = "generate"
model_config.multimodal_config = MultiModalConfig(mm_processor_cache_gb=mm_cache_gb)
vllm_config = VllmConfig(
model_config=model_config,
cache_config=CacheConfig(enable_prefix_caching=enable_prefix_caching),
device_config=DeviceConfig(device="cpu"),
)
return InputProcessor(vllm_config)
def test_multi_modal_uuids_length_mismatch_raises(monkeypatch):
input_processor = _mock_input_processor(monkeypatch)
def test_multi_modal_uuids_length_mismatch_raises():
input_processor = _build_input_processor()
prompt = {
"prompt": "USER: <image>\nDescribe\nASSISTANT:",
@@ -78,7 +43,7 @@ def test_multi_modal_uuids_length_mismatch_raises(monkeypatch):
"multi_modal_uuids": {"image": ["hash_cherry"]},
}
with pytest.raises(ValueError, match="must have same length as data"):
with pytest.raises(ValueError, match="must have same length as"):
input_processor.process_inputs(
request_id="req-1",
prompt=prompt, # type: ignore[arg-type]
@@ -86,21 +51,21 @@ def test_multi_modal_uuids_length_mismatch_raises(monkeypatch):
)
def test_multi_modal_uuids_missing_modality_raises(monkeypatch):
input_processor = _mock_input_processor(monkeypatch)
def test_multi_modal_uuids_missing_modality_raises():
input_processor = _build_input_processor()
prompt = {
"prompt": "USER: <image><video>\nDescribe\nASSISTANT:",
# Two modalities provided in data
"multi_modal_data": {
"image": [cherry_pil_image],
"video": [baby_reading_np_ndarrays],
"video": None,
},
# Only image uuids provided; video missing should raise
"multi_modal_uuids": {"image": ["hash_cherry"]},
}
with pytest.raises(ValueError, match="must be provided if multi_modal_data"):
with pytest.raises(ValueError, match="is empty but .* is missing"):
input_processor.process_inputs(
request_id="req-2",
prompt=prompt, # type: ignore[arg-type]
@@ -119,8 +84,7 @@ def test_multi_modal_uuids_missing_modality_raises(monkeypatch):
def test_multi_modal_uuids_accepts_none_and_passes_through(
monkeypatch, mm_cache_gb: float, enable_prefix_caching: bool
):
input_processor = _mock_input_processor(
monkeypatch,
input_processor = _build_input_processor(
mm_cache_gb=mm_cache_gb,
enable_prefix_caching=enable_prefix_caching,
)
@@ -163,8 +127,8 @@ def test_multi_modal_uuids_accepts_none_and_passes_through(
def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
# When the processor cache is 0 and prefix caching is disabled, the
# processor builds overrides from the request id instead of using user UUIDs.
input_processor = _mock_input_processor(
monkeypatch, mm_cache_gb=0.0, enable_prefix_caching=False
input_processor = _build_input_processor(
mm_cache_gb=0.0, enable_prefix_caching=False
)
captured: dict[str, MultiModalUUIDDict] = {}
@@ -180,12 +144,12 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
)
request_id = "req-42"
mm_uuids = {"image": ["hash_cherry", "hash_stop"], "video": "hash_video"}
mm_uuids = {"image": ["hash_cherry", "hash_stop"], "video": ["hash_video"]}
prompt = {
"prompt": "USER: <image><image><video>\nDescribe\nASSISTANT:",
"multi_modal_data": {
"image": [cherry_pil_image, stop_pil_image],
"video": baby_reading_np_ndarrays,
"video": [baby_reading_np_ndarrays],
},
"multi_modal_uuids": mm_uuids,
}
@@ -197,16 +161,15 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
)
# Expect that request-id-based overrides are passed through
mm_uuids = captured["mm_uuids"]
assert set(mm_uuids.keys()) == {"image", "video"}
assert len(mm_uuids["image"]) == 2
assert len(mm_uuids["video"]) == 1
assert mm_uuids["image"][0].startswith(f"{request_id}-image-") and mm_uuids[
"image"
][0].endswith("-0")
assert mm_uuids["image"][1].startswith(f"{request_id}-image-") and mm_uuids[
"image"
][1].endswith("-1")
assert mm_uuids["video"][0].startswith(f"{request_id}-video-") and mm_uuids[
"video"
][0].endswith("-0")
assert captured["mm_uuids"]["image"][0].startswith(
f"{request_id}-image-"
) and captured["mm_uuids"]["image"][0].endswith("-0")
assert captured["mm_uuids"]["image"][1].startswith(
f"{request_id}-image-"
) and captured["mm_uuids"]["image"][1].endswith("-1")
assert captured["mm_uuids"]["video"][0].startswith(
f"{request_id}-video-"
) and captured["mm_uuids"]["video"][0].endswith("-0")