[Bugfix] Merge MM embeddings by index instead of token IDs (#16229)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Signed-off-by: NickLucche <nlucches@redhat.com> Signed-off-by: Roger Wang <hey@rogerw.io> Co-authored-by: NickLucche <nlucches@redhat.com> Co-authored-by: Roger Wang <hey@rogerw.io>
2025-09-27 16:15:12 +08:00
parent 176173989a
commit 27d7638b94
80 changed files with 966 additions and 1139 deletions
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -1151,7 +1151,6 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
        modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
        if not modalities:
            return []
-            return None

        # The result multimodal_embeddings is tuple of tensors, with each
        # tensor corresponding to a multimodal data item (image or video).
@@ -1175,19 +1174,6 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):

        return multimodal_embeddings

-    def get_input_embeddings(
-        self,
-        input_ids: torch.Tensor,
-        multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-    ) -> torch.Tensor:
-        inputs_embeds = self.model.embed_tokens(input_ids)
-        if multimodal_embeddings is not None and len(
-                multimodal_embeddings) != 0:
-            inputs_embeds = merge_multimodal_embeddings(
-                input_ids, inputs_embeds, multimodal_embeddings,
-                [_IMAGE_PLACEHOLDER_TOKEN_ID, _AUDIO_PLACEHOLDER_TOKEN_ID])
-        return inputs_embeds
-
    def get_input_embeddings_v0(
        self,
        input_ids: torch.Tensor,