[V1] Scatter and gather placeholders in the model runner (#15712)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Signed-off-by: mgoin <mgoin64@gmail.com> Signed-off-by: Roger Wang <ywang@roblox.com> Co-authored-by: mgoin <mgoin64@gmail.com> Co-authored-by: Roger Wang <ywang@roblox.com>
2025-04-05 05:26:44 +08:00
parent 651cf0fec1
commit f5722a5052
42 changed files with 496 additions and 942 deletions
--- a/vllm/model_executor/models/nvlm_d.py
+++ b/vllm/model_executor/models/nvlm_d.py
@@ -57,7 +57,7 @@ class NVLMProcessor(BaseInternVLProcessor):
        # when trying to find "<tile" as a subsequence of "<Image><tile"
        repl = "<Image>" + features + "</Image>"

-        return PromptUpdateDetails(full=repl, features=repl)
+        return PromptUpdateDetails.select_text(repl, IMG_PAD)


 class NVLMProcessingInfo(BaseInternVLProcessingInfo):
@@ -84,31 +84,6 @@ class NVLMProcessingInfo(BaseInternVLProcessingInfo):
            **kwargs,
        )

-    def get_max_image_tokens(self) -> int:
-        hf_processor = self.get_hf_processor()
-        tokenizer = hf_processor.tokenizer
-
-        max_num_patches = hf_processor.max_dynamic_patch
-        # we need +1 here because max_dynamic_patch in config doesn't
-        # include the thumbnail patch
-        tile_pos_identifiers = [
-            f"<tile_{i+1}>" for i in range(max_num_patches)
-        ]
-        if hf_processor.use_thumbnail and max_num_patches != 1:
-            tile_pos_identifiers += ["<tile_global_thumbnail>"]
-
-        # "<Image><tile" is tokenized as ["<Image", "><", "tile"]
-        # so we include <tile_1> in the start_str
-        start_str = "<Image>" + tile_pos_identifiers.pop(0)
-        end_str = "</Image>"
-        start_token_len = len(tokenizer.encode(start_str))
-        end_token_len = len(tokenizer.encode(end_str))
-        tile_token_len = sum(
-            len(tokenizer.encode(identifier))
-            for identifier in tile_pos_identifiers)
-        non_image_tokens_num = start_token_len + end_token_len + tile_token_len
-        return super().get_max_image_tokens() + non_image_tokens_num
-

 class NVLMDummyInputsBuilder(InternVLDummyInputsBuilder[NVLMProcessingInfo]):

@@ -177,10 +152,7 @@ class NVLMMultiModalProcessor(InternVLMultiModalProcessor[NVLMProcessingInfo]):

            repl = hf_processor.get_image_repl(feature_size, num_patches)

-            return PromptUpdateDetails(
-                full=repl.full + "\n",
-                features=repl.features + "\n",
-            )
+            return PromptUpdateDetails.select_text(repl.full + "\n", IMG_PAD)

        # See note in dummy data regarding why we have the extra newline
        return [