[VLM] Merged multi-modal processor for InternVL-based models (#12553)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
Author: Cyrus Leung
Date: 2025-02-04 16:44:52 +08:00
Committer: GitHub
Parent: 96b23621c1
Commit: d1ca7df84d
34 changed files with 1469 additions and 1021 deletions

vllm/model_executor/models/nvlm_d.py

@@ -6,44 +6,190 @@
 # Copyright (c) 2024 NVIDIA
 # Licensed under Apache 2.0 License [see LICENSE for details]
 # --------------------------------------------------------
-from typing import Optional
+from typing import Mapping, Optional
 
+import torch
 import torch.nn as nn
 from transformers import PretrainedConfig
 
-from vllm.inputs import INPUT_REGISTRY
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import MultiModalKwargs
+from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
+                                   MultiModalDataItems)
+from vllm.multimodal.processing import (PromptReplacement,
+                                        PromptReplacementDetails)
+from vllm.multimodal.profiling import ProcessorInputs
 
 from .intern_vit import InternVisionModel
-from .internvl import (InternVLChatModel, InternVLInputPipeline,
-                       get_max_internvl_image_tokens)
+from .internvl import (BaseInternVLProcessingInfo, BaseInternVLProcessor,
+                       InternVLChatModel, InternVLDummyInputsBuilder,
+                       InternVLMultiModalProcessor)
 
-IMG_START = '<|vision_start|>'
-IMG_END = '<|vision_end|>'
-IMG_CONTEXT = '<|vision_pad|>'
+IMG_PAD = "<|vision_pad|>"
 
 
-class NVLMInputPipeline(InternVLInputPipeline):
+class NVLMProcessor(BaseInternVLProcessor):
 
-    def _create_image_prompt(self, feature_size: int, num_patches: int) -> str:
-        tile_pos_identifiers = ([f"<tile_{i}>"
-                                 for i in range(1, num_patches)] +
-                                ["<tile_global_thumbnail>"])
+    @property
+    def image_token_id(self) -> int:
+        return self.tokenizer.get_vocab()[IMG_PAD]
+
+    def get_image_repl_features(
+        self,
+        feature_size: int,
+        num_patches: Optional[int],
+    ) -> str:
+        if num_patches is None:
+            raise NotImplementedError("Embedding inputs are not supported")
+
+        tile_pos_identifiers = [f"<tile_{i}>" for i in range(1, num_patches)]
+        if self.use_thumbnail and num_patches != 1:
+            tile_pos_identifiers += ["<tile_global_thumbnail>"]
+
         context_size = feature_size // num_patches
+        features = "".join(identifier + IMG_PAD * context_size
+                           for identifier in tile_pos_identifiers)
 
-        return '<Image>' + ''.join(
-            tile_pos_identifier + self.img_context_token * context_size
-            for tile_pos_identifier in tile_pos_identifiers) + '</Image>'
+        # We include the start and end as well because "<Image><tile" is
+        # tokenized as ["<Image", "><", "tile"], resulting in assertion error
+        # when trying to find "<tile" as a subsequence of "<Image><tile"
+        return "<Image>" + features + "</Image>"
+
+    def get_image_repl_full(
+        self,
+        feature_size: int,
+        num_patches: Optional[int],
+    ) -> str:
+        return self.get_image_repl_features(feature_size, num_patches)
 
-input_pipeline = NVLMInputPipeline(IMG_START, IMG_END, IMG_CONTEXT)
+
+class NVLMProcessingInfo(BaseInternVLProcessingInfo):
+
+    def get_hf_processor(
+        self,
+        *,
+        max_dynamic_patch: Optional[int] = None,
+        dynamic_image_size: Optional[bool] = None,
+    ) -> NVLMProcessor:
+        return NVLMProcessor(
+            self.get_hf_config(),
+            self.get_tokenizer(),
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+        )
+
+    def get_max_image_tokens(self) -> int:
+        hf_processor = self.get_hf_processor()
+        tokenizer = hf_processor.tokenizer
+
+        max_num_patches = hf_processor.max_dynamic_patch
+        # we need +1 here because max_dynamic_patch in config doesn't
+        # include the thumbnail patch
+        tile_pos_identifiers = [
+            f"<tile_{i + 1}>" for i in range(max_num_patches)
+        ]
+        if hf_processor.use_thumbnail and max_num_patches != 1:
+            tile_pos_identifiers += ["<tile_global_thumbnail>"]
+
+        # "<Image><tile" is tokenized as ["<Image", "><", "tile"]
+        # so we include <tile_1> in the start_str
+        start_str = "<Image>" + tile_pos_identifiers.pop(0)
+        end_str = "</Image>"
+        start_token_len = len(tokenizer.encode(start_str))
+        end_token_len = len(tokenizer.encode(end_str))
+        tile_token_len = sum(
+            len(tokenizer.encode(identifier))
+            for identifier in tile_pos_identifiers)
+        non_image_tokens_num = start_token_len + end_token_len + tile_token_len
+        return super().get_max_image_tokens() + non_image_tokens_num
 
-@MULTIMODAL_REGISTRY.register_image_input_mapper(input_pipeline.input_mapper)
-@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_internvl_image_tokens)
-@INPUT_REGISTRY.register_dummy_data(input_pipeline.dummy_data)
-@INPUT_REGISTRY.register_input_processor(input_pipeline.input_processor)
+
+class NVLMDummyInputsBuilder(InternVLDummyInputsBuilder[NVLMProcessingInfo]):
+
+    def get_dummy_processor_inputs(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        target_width, target_height = \
+            self.info.get_image_size_with_most_features()
+        num_images = mm_counts.get("image", 0)
+
+        mm_data = {
+            "image":
+            self._get_dummy_images(width=target_width,
+                                   height=target_height,
+                                   num_images=num_images)
+        }
+
+        return ProcessorInputs(
+            # The newline is necessary to separate ">" of the current item
+            # and "<" of the next item
+            prompt_text="<image>\n" * num_images,
+            mm_data=mm_data,
+        )
+
+
+class NVLMMultiModalProcessor(InternVLMultiModalProcessor[NVLMProcessingInfo]):
+
+    def _get_prompt_replacements(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> list[PromptReplacement]:
+        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+
+        if "image_num_patches" in out_mm_kwargs:
+            image_num_patches = out_mm_kwargs["image_num_patches"]
+            assert isinstance(image_num_patches, torch.Tensor)
+            image_num_patches = image_num_patches.tolist()
+        elif "image_embeds" in out_mm_kwargs:
+            # TODO: Use image size information in dictionary embedding inputs
+            # to compute num_patches (similar to Qwen2-VL)
+            image_num_patches = [None] * len(out_mm_kwargs["image_embeds"])
+        else:
+            image_num_patches = []
+
+        def get_replacement_nvlm(item_idx: int):
+            images = mm_items.get_items(
+                "image", (ImageEmbeddingItems, ImageProcessorItems))
+
+            if isinstance(images, ImageEmbeddingItems):
+                feature_size = images.get_feature_size(item_idx)
+            else:
+                image_size = images.get_image_size(item_idx)
+                feature_size = self.info.get_num_image_tokens(
+                    image_width=image_size.width,
+                    image_height=image_size.height,
+                    processor=hf_processor,
+                )
+
+            num_patches = image_num_patches[item_idx]
+            if num_patches is not None:
+                assert isinstance(num_patches, int)
+
+            return PromptReplacementDetails(
+                full=hf_processor.get_image_repl_full(feature_size,
+                                                      num_patches) + "\n",
+                features=hf_processor.get_image_repl_features(
+                    feature_size, num_patches) + "\n",
+            )
+
+        # See note in dummy data regarding why we have the extra newline
+        return [
+            PromptReplacement(
+                modality="image",
+                target="<image>\n",
+                replacement=get_replacement_nvlm,
+            )
+        ]
+
+
+@MULTIMODAL_REGISTRY.register_processor(NVLMMultiModalProcessor,
+                                        info=NVLMProcessingInfo,
+                                        dummy_inputs=NVLMDummyInputsBuilder)
 class NVLM_D_Model(InternVLChatModel):
 
     def _init_mlp1(self, config: PretrainedConfig) -> nn.Sequential:
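
Note: the replacement text NVLMProcessor builds for each image is a run of tile-position identifiers, each followed by a block of pad tokens, wrapped in <Image>...</Image>. A minimal standalone sketch of that layout (not part of this commit; build_image_repl is a hypothetical helper mirroring get_image_repl_features):

IMG_PAD = "<|vision_pad|>"

def build_image_repl(feature_size: int, num_patches: int,
                     use_thumbnail: bool = True) -> str:
    # num_patches counts the global thumbnail when use_thumbnail is set,
    # so there are num_patches identifiers in total.
    tile_pos_identifiers = [f"<tile_{i}>" for i in range(1, num_patches)]
    if use_thumbnail and num_patches != 1:
        tile_pos_identifiers += ["<tile_global_thumbnail>"]

    # Each tile contributes feature_size // num_patches pad tokens.
    context_size = feature_size // num_patches
    features = "".join(identifier + IMG_PAD * context_size
                       for identifier in tile_pos_identifiers)
    return "<Image>" + features + "</Image>"

repl = build_image_repl(feature_size=512, num_patches=2)
# -> "<Image><tile_1>" + 256 pads + "<tile_global_thumbnail>" + 256 pads
#    + "</Image>"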
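
The merged processor then substitutes that string for every "<image>\n" occurrence in the prompt; PromptReplacementDetails distinguishes the full inserted text from the features span so the engine can locate the image-feature positions afterwards. A toy, dependency-free illustration of the target/replacement expansion (expand_prompt is hypothetical, not vLLM's implementation):

from typing import Callable

def expand_prompt(prompt: str, target: str,
                  replacement: Callable[[int], str]) -> str:
    # Replace the i-th occurrence of `target` with replacement(i), the same
    # shape as PromptReplacement(modality=..., target=..., replacement=...).
    parts = prompt.split(target)
    out = parts[0]
    for i, rest in enumerate(parts[1:]):
        out += replacement(i) + rest
    return out

prompt = "<image>\n<image>\nDescribe both images."
expanded = expand_prompt(prompt, "<image>\n",
                         lambda i: "<Image><tile_1>...</Image>" + "\n")

The trailing "\n" on each replacement matters for the same reason given in the dummy-data builder: it keeps the ">" closing one item from fusing with the "<" opening the next during tokenization.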
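
Similarly, get_max_image_tokens adds to the base InternVL feature count the tokens spent on the wrapper strings and tile identifiers. A sketch of that bookkeeping, with encode standing in for the HF tokenizer's encode() (count_non_image_tokens is illustrative, not the commit's code):

from typing import Callable

def count_non_image_tokens(encode: Callable[[str], list],
                           max_num_patches: int,
                           use_thumbnail: bool = True) -> int:
    identifiers = [f"<tile_{i + 1}>" for i in range(max_num_patches)]
    if use_thumbnail and max_num_patches != 1:
        identifiers += ["<tile_global_thumbnail>"]
    # "<Image><tile" tokenizes as ["<Image", "><", "tile"], so <tile_1> is
    # folded into the start string rather than counted as its own piece.
    start_str = "<Image>" + identifiers.pop(0)
    end_str = "</Image>"
    return (len(encode(start_str)) + len(encode(end_str)) +
            sum(len(encode(t)) for t in identifiers))

# With a dummy one-token-per-string encoder and 6 dynamic patches:
# 1 (start) + 1 (end) + 6 remaining identifiers = 8 extra tokens.
assert count_non_image_tokens(lambda s: [0], 6) == 8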