[Bugfix] Check dimensions of multimodal embeddings in V1 (#15816)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-04-01 00:01:35 +08:00
committed by GitHub
parent e5ef4fa99a
commit 09e974d483
14 changed files with 98 additions and 37 deletions

View File

@@ -41,6 +41,8 @@ from vllm.v1.utils import bind_kv_cache
from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
from .utils import sanity_check_mm_encoder_outputs
if TYPE_CHECKING:
import xgrammar as xgr
@@ -867,6 +869,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
curr_group_outputs = self.model.get_multimodal_embeddings(
**batched_mm_inputs)
sanity_check_mm_encoder_outputs(
curr_group_outputs,
expected_num_items=len(grouped_mm_inputs),
)
for output in curr_group_outputs:
encoder_outputs.append(output)
@@ -1490,12 +1497,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# Run multimodal encoder.
dummy_encoder_outputs = self.model.get_multimodal_embeddings(
**batched_dummy_mm_inputs)
assert len(dummy_encoder_outputs) == max_num_mm_items, (
"Expected dimension 0 of encoder outputs to match the number "
f"of multimodal data items: {max_num_mm_items}, got "
f"{len(dummy_encoder_outputs)=} instead. This is most likely "
"due to the 'get_multimodal_embeddings' method of the model "
"not implemented correctly.")
sanity_check_mm_encoder_outputs(
dummy_encoder_outputs,
expected_num_items=max_num_mm_items,
)
# Cache the dummy encoder outputs.
self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs))

View File

@@ -37,6 +37,8 @@ from vllm.v1.sample.tpu.sampler import Sampler as TPUSampler
from vllm.v1.utils import bind_kv_cache
from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
from .utils import sanity_check_mm_encoder_outputs
if TYPE_CHECKING:
from vllm.v1.core.sched.output import SchedulerOutput
@@ -512,6 +514,11 @@ class TPUModelRunner:
curr_group_outputs = self.model.get_multimodal_embeddings(
**batched_mm_inputs)
sanity_check_mm_encoder_outputs(
curr_group_outputs,
expected_num_items=len(grouped_mm_inputs),
)
for output in curr_group_outputs:
encoder_outputs.append(output)

29
vllm/v1/worker/utils.py Normal file
View File

@@ -0,0 +1,29 @@
# SPDX-License-Identifier: Apache-2.0
import torch
def sanity_check_mm_encoder_outputs(
    mm_embeddings: object,
    expected_num_items: int,
) -> None:
    """
    Perform sanity checks for the result of
    :meth:`vllm.model_executor.models.SupportsMultiModal.get_multimodal_embeddings`.

    Args:
        mm_embeddings: The encoder output to validate. Must be a list/tuple
            of 2D tensors (one per multimodal item), or a single 3D tensor
            batching the items along dimension 0.
        expected_num_items: The number of multimodal data items passed to
            the encoder; the embeddings must contain exactly one entry per
            item.

    Raises:
        AssertionError: If the embeddings have the wrong container type,
            the wrong number of items, or non-2D per-item tensors.
    """
    # NOTE: These checks guard against incorrectly implemented models
    # (external input to this module), so they are written as explicit
    # `raise AssertionError` rather than `assert` statements — `assert`
    # is stripped when Python runs with optimizations enabled (`-O`),
    # which would silently disable the sanity check.
    if not isinstance(mm_embeddings, (list, tuple, torch.Tensor)):
        raise AssertionError(
            "Expected multimodal embeddings to be a list/tuple of 2D tensors, "
            f"or a single 3D tensor, but got {type(mm_embeddings)} "
            "instead. This is most likely due to incorrect implementation "
            "of the model's `get_multimodal_embeddings` method.")

    if len(mm_embeddings) != expected_num_items:
        raise AssertionError(
            "Expected number of multimodal embeddings to match number of "
            f"input items: {expected_num_items}, but got {len(mm_embeddings)=} "
            "instead. This is most likely due to incorrect implementation "
            "of the model's `get_multimodal_embeddings` method.")

    # Iterating a 3D tensor yields its 2D slices, so this check covers
    # both the sequence-of-tensors and the single-batched-tensor case.
    if not all(e.ndim == 2 for e in mm_embeddings):
        raise AssertionError(
            "Expected multimodal embeddings to be a sequence of 2D tensors, "
            f"but got tensors with shapes {[e.shape for e in mm_embeddings]} "
            "instead. This is most likely due to incorrect implementation "
            "of the model's `get_multimodal_embeddings` method.")