[Bugfix] Fix embedding assignment for InternVL-based models (#15086)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-20 11:40:13 +08:00
parent 70e500cad9
commit ffa443afed
7 changed files with 123 additions and 106 deletions
--- a/vllm/model_executor/models/nvlm_d.py
+++ b/vllm/model_executor/models/nvlm_d.py
@@ -36,11 +36,11 @@ class NVLMProcessor(BaseInternVLProcessor):
    def image_token_id(self) -> int:
        return self.tokenizer.get_vocab()[IMG_PAD]

-    def get_image_repl_features(
+    def get_image_repl(
        self,
        feature_size: int,
        num_patches: Optional[int],
-    ) -> str:
+    ) -> PromptUpdateDetails[str]:
        if num_patches is None:
            raise NotImplementedError("Embedding inputs are not supported")

@@ -55,14 +55,9 @@ class NVLMProcessor(BaseInternVLProcessor):
        # We include the start and end as well because "<Image><tile" is
        # tokenized as ["<Image", "><", "tile"], resulting in assertion error
        # when trying to find "<tile" as a subsequence of "<Image><tile"
-        return "<Image>" + features + "</Image>"
+        repl = "<Image>" + features + "</Image>"

-    def get_image_repl_full(
-        self,
-        feature_size: int,
-        num_patches: Optional[int],
-    ) -> str:
-        return self.get_image_repl_features(feature_size, num_patches)
+        return PromptUpdateDetails(full=repl, features=repl)


 class NVLMProcessingInfo(BaseInternVLProcessingInfo):
@@ -180,11 +175,11 @@ class NVLMMultiModalProcessor(InternVLMultiModalProcessor[NVLMProcessingInfo]):
            if num_patches is not None:
                assert isinstance(num_patches, int)

+            repl = hf_processor.get_image_repl(feature_size, num_patches)
+
            return PromptUpdateDetails(
-                full=hf_processor.get_image_repl_full(feature_size,
-                                                      num_patches) + "\n",
-                features=hf_processor.get_image_repl_features(
-                    feature_size, num_patches) + "\n",
+                full=repl.full + "\n",
+                features=repl.features + "\n",
            )

        # See note in dummy data regarding why we have the extra newline