[1/2] Move InternVL-based processors (#37260)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2026-03-17 21:50:56 +08:00
committed by GitHub
parent 2660b9289c
commit f340324335
20 changed files with 3252 additions and 3099 deletions

View File

@@ -27,48 +27,16 @@ from vllm.multimodal.processing import (
PromptUpdate,
PromptUpdateDetails,
)
from vllm.transformers_utils.processors.nvlm_d import IMG_PAD, NVLMProcessor
from .intern_vit import InternVisionModel
from .internvl import (
BaseInternVLDummyInputsBuilder,
BaseInternVLMultiModalProcessor,
BaseInternVLProcessingInfo,
BaseInternVLProcessor,
InternVLChatModel,
)
# Placeholder token repeated once per image-feature position when expanding an
# image into the text prompt (see NVLMProcessor.get_image_repl below).
# NOTE(review): an import above also brings in a name IMG_PAD from
# vllm.transformers_utils.processors.nvlm_d — this looks like a diff artifact
# (old vs. new location of the constant); confirm which definition is canonical.
IMG_PAD = "<|vision_pad|>"
class NVLMProcessor(BaseInternVLProcessor):
    """InternVL-style HF processor specialized for NVLM-D prompts."""

    @property
    def image_token_id(self) -> int:
        """Vocabulary id of the image-padding placeholder token."""
        return self.tokenizer.get_vocab()[IMG_PAD]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the textual replacement for one image.

        Each tile gets a positional identifier tag followed by
        ``feature_size // num_patches`` copies of the pad token; the whole
        sequence is wrapped in ``<Image>``/``</Image>`` markers.
        """
        if num_patches is None:
            raise NotImplementedError("Embedding inputs are not supported")

        tags = [f"<tile_{i}>" for i in range(1, num_patches)]
        if self.use_thumbnail:
            tags.append("<tile_global_thumbnail>")

        pads_per_tile = feature_size // num_patches
        body = "".join(tag + IMG_PAD * pads_per_tile for tag in tags)

        # We include the start and end as well because "<Image><tile" is
        # tokenized as ["<Image", "><", "tile"], resulting in assertion error
        # when trying to find "<tile" as a subsequence of "<Image><tile"
        full_repl = "<Image>" + body + "</Image>"
        return PromptUpdateDetails.select_text(full_repl, IMG_PAD)
class NVLMProcessingInfo(BaseInternVLProcessingInfo):
def get_hf_processor(self, **kwargs: object) -> NVLMProcessor: