[Refactor] Use data parser for matching data items to multi-modal UUIDs (#32955)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-01-26 15:00:28 +08:00
parent ee484b3f4b
commit 11b556878b
14 changed files with 701 additions and 604 deletions
--- a/tests/v1/engine/test_process_multi_modal_uuids.py
+++ b/tests/v1/engine/test_process_multi_modal_uuids.py
@@ -5,14 +5,8 @@ import pytest

 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
-from vllm.config import (
-    CacheConfig,
-    DeviceConfig,
-    ModelConfig,
-    MultiModalConfig,
-    VllmConfig,
-)
-from vllm.multimodal import MultiModalRegistry, MultiModalUUIDDict
+from vllm.config import CacheConfig, ModelConfig, VllmConfig
+from vllm.multimodal import MultiModalUUIDDict
 from vllm.sampling_params import SamplingParams
 from vllm.v1.engine.input_processor import InputProcessor

@@ -21,55 +15,26 @@ stop_pil_image = ImageAsset("stop_sign").pil_image
 baby_reading_np_ndarrays = VideoAsset("baby_reading").np_ndarrays


-def _mock_input_processor(
-    monkeypatch, *, mm_cache_gb: float = 4.0, enable_prefix_caching: bool = True
+def _build_input_processor(
+    *, mm_cache_gb: float = 4.0, enable_prefix_caching: bool = True
 ) -> InputProcessor:
-    """
-    Create a Processor instance with minimal configuration suitable for unit
-    tests without accessing external resources.
-    """
-    monkeypatch.setattr(
-        ModelConfig, "try_get_generation_config", lambda self: {}, raising=True
-    )
-    monkeypatch.setattr(
-        ModelConfig, "__post_init__", lambda self, *args: None, raising=True
-    )
-    monkeypatch.setattr(
-        ModelConfig,
-        "verify_with_parallel_config",
-        lambda self, parallel_config: None,
-        raising=True,
-    )
-    monkeypatch.setattr(
-        MultiModalRegistry,
-        "processor_cache_from_config",
-        lambda self, vllm_config: None,
-        raising=True,
-    )
-
-    monkeypatch.setattr(VllmConfig, "__post_init__", lambda self: None, raising=True)
-
    model_config = ModelConfig(
-        tokenizer="dummy",
+        model="Qwen/Qwen2.5-VL-3B-Instruct",
        skip_tokenizer_init=True,
        max_model_len=128,
        mm_processor_cache_gb=mm_cache_gb,
-        generation_config="vllm",
    )
-    model_config.runner_type = "generate"
-    model_config.multimodal_config = MultiModalConfig(mm_processor_cache_gb=mm_cache_gb)

    vllm_config = VllmConfig(
        model_config=model_config,
        cache_config=CacheConfig(enable_prefix_caching=enable_prefix_caching),
-        device_config=DeviceConfig(device="cpu"),
    )

    return InputProcessor(vllm_config)


-def test_multi_modal_uuids_length_mismatch_raises(monkeypatch):
-    input_processor = _mock_input_processor(monkeypatch)
+def test_multi_modal_uuids_length_mismatch_raises():
+    input_processor = _build_input_processor()

    prompt = {
        "prompt": "USER: <image>\nDescribe\nASSISTANT:",
@@ -78,7 +43,7 @@ def test_multi_modal_uuids_length_mismatch_raises(monkeypatch):
        "multi_modal_uuids": {"image": ["hash_cherry"]},
    }

-    with pytest.raises(ValueError, match="must have same length as data"):
+    with pytest.raises(ValueError, match="must have same length as"):
        input_processor.process_inputs(
            request_id="req-1",
            prompt=prompt,  # type: ignore[arg-type]
@@ -86,21 +51,21 @@ def test_multi_modal_uuids_length_mismatch_raises(monkeypatch):
        )


-def test_multi_modal_uuids_missing_modality_raises(monkeypatch):
-    input_processor = _mock_input_processor(monkeypatch)
+def test_multi_modal_uuids_missing_modality_raises():
+    input_processor = _build_input_processor()

    prompt = {
        "prompt": "USER: <image><video>\nDescribe\nASSISTANT:",
        # Two modalities provided in data
        "multi_modal_data": {
            "image": [cherry_pil_image],
-            "video": [baby_reading_np_ndarrays],
+            "video": None,
        },
        # Only image uuids provided; video missing should raise
        "multi_modal_uuids": {"image": ["hash_cherry"]},
    }

-    with pytest.raises(ValueError, match="must be provided if multi_modal_data"):
+    with pytest.raises(ValueError, match="is empty but .* is missing"):
        input_processor.process_inputs(
            request_id="req-2",
            prompt=prompt,  # type: ignore[arg-type]
@@ -119,8 +84,7 @@ def test_multi_modal_uuids_missing_modality_raises(monkeypatch):
 def test_multi_modal_uuids_accepts_none_and_passes_through(
    monkeypatch, mm_cache_gb: float, enable_prefix_caching: bool
 ):
-    input_processor = _mock_input_processor(
-        monkeypatch,
+    input_processor = _build_input_processor(
        mm_cache_gb=mm_cache_gb,
        enable_prefix_caching=enable_prefix_caching,
    )
@@ -163,8 +127,8 @@ def test_multi_modal_uuids_accepts_none_and_passes_through(
 def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
    # When both processor cache is 0 and prefix caching disabled, the
    # processor builds overrides from request id instead of using user UUIDs.
-    input_processor = _mock_input_processor(
-        monkeypatch, mm_cache_gb=0.0, enable_prefix_caching=False
+    input_processor = _build_input_processor(
+        mm_cache_gb=0.0, enable_prefix_caching=False
    )

    captured: dict[str, MultiModalUUIDDict] = {}
@@ -180,12 +144,12 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
    )

    request_id = "req-42"
-    mm_uuids = {"image": ["hash_cherry", "hash_stop"], "video": "hash_video"}
+    mm_uuids = {"image": ["hash_cherry", "hash_stop"], "video": ["hash_video"]}
    prompt = {
        "prompt": "USER: <image><image><video>\nDescribe\nASSISTANT:",
        "multi_modal_data": {
            "image": [cherry_pil_image, stop_pil_image],
-            "video": baby_reading_np_ndarrays,
+            "video": [baby_reading_np_ndarrays],
        },
        "multi_modal_uuids": mm_uuids,
    }
@@ -197,16 +161,15 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
    )

    # Expect request-id-based overrides are passed through
-    mm_uuids = captured["mm_uuids"]
    assert set(mm_uuids.keys()) == {"image", "video"}
    assert len(mm_uuids["image"]) == 2
    assert len(mm_uuids["video"]) == 1
-    assert mm_uuids["image"][0].startswith(f"{request_id}-image-") and mm_uuids[
-        "image"
-    ][0].endswith("-0")
-    assert mm_uuids["image"][1].startswith(f"{request_id}-image-") and mm_uuids[
-        "image"
-    ][1].endswith("-1")
-    assert mm_uuids["video"][0].startswith(f"{request_id}-video-") and mm_uuids[
-        "video"
-    ][0].endswith("-0")
+    assert captured["mm_uuids"]["image"][0].startswith(
+        f"{request_id}-image-"
+    ) and captured["mm_uuids"]["image"][0].endswith("-0")
+    assert captured["mm_uuids"]["image"][1].startswith(
+        f"{request_id}-image-"
+    ) and captured["mm_uuids"]["image"][1].endswith("-1")
+    assert captured["mm_uuids"]["video"][0].startswith(
+        f"{request_id}-video-"
+    ) and captured["mm_uuids"]["video"][0].endswith("-0")