# SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project # adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_internvl_chat.py # -------------------------------------------------------- # InternVL # Copyright (c) 2023 OpenGVLab # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- from abc import abstractmethod from collections.abc import Iterable, Mapping, Sequence from functools import cached_property from typing import Annotated, Literal, TypeAlias, TypeVar import torch import torch.nn as nn from transformers import BatchFeature, PretrainedConfig from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.inputs import MultiModalDataDict from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.models.intern_vit import ( InternVisionModel, InternVisionPatchModel, ) from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import ( BatchedTensorInputs, MultiModalFieldConfig, MultiModalKwargsItems, ) from vllm.multimodal.parse import ( ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems, ) from vllm.multimodal.processing import ( BaseDummyInputsBuilder, BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, PromptUpdate, ) from vllm.sequence import IntermediateTensors from vllm.transformers_utils.processors.internvl import ( InternVLImageProcessor, InternVLProcessor, InternVLVideoProcessor, ) from vllm.utils.tensor_schema import TensorSchema, TensorShape from .interfaces import ( MultiModalEmbeddings, SupportsLoRA, SupportsMultiModal, SupportsPP, ) from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix class InternVLImagePixelInputs(TensorSchema): """ Dimensions: - bn: Batch size * number of images - bnp: Batch size * number of images * (1 + num_patches) - c: Number of channels (3) - h: Height of each image patch - w: Width of each image patch """ type: Literal["pixel_values"] pixel_values_flat: Annotated[torch.Tensor, TensorShape("bnp", 3, "h", "w")] num_patches: Annotated[torch.Tensor, TensorShape("bn")] class InternVLImageEmbeddingInputs(TensorSchema): """ Dimensions: - n: Number of images - f: Total image feature size - h: Hidden size (must match the hidden size of language model backbone) """ type: Literal["image_embeds"] data: Annotated[torch.Tensor | list[torch.Tensor], TensorShape("n", "f", "h")] InternVLImageInputs: TypeAlias = InternVLImagePixelInputs | InternVLImageEmbeddingInputs class InternVLVideoPixelInputs(TensorSchema): """ Dimensions: - bvf: Batch size * number of videos * num_frames - bn: Batch size * number of images - c: Number of channels (3) - h: Height of each video frame - w: Width of each video frame """ type: Literal["pixel_values_videos"] pixel_values_flat: Annotated[torch.Tensor, TensorShape("bvf", 3, "h", "w")] num_patches: Annotated[torch.Tensor, TensorShape("bn")] class InternVLVideoEmbeddingInputs(TensorSchema): """ Dimensions: - n: Number of videos - f: Total video feature size - h: Hidden size (must match the hidden size of language model backbone) """ type: Literal["video_embeds"] data: Annotated[torch.Tensor | list[torch.Tensor], TensorShape("n", "f", "h")] InternVLVideoInputs: TypeAlias = InternVLVideoPixelInputs | InternVLVideoEmbeddingInputs class BaseInternVLProcessingInfo(BaseProcessingInfo): """Basic image-only ProcessingInfo for InternVL-style models.""" @abstractmethod def get_hf_processor(self, **kwargs: object) -> InternVLProcessor: raise NotImplementedError def get_supported_mm_limits(self) -> Mapping[str, int | None]: return {"image": None} def get_num_image_tokens( self, *, image_width: int, image_height: int, processor: InternVLProcessor, ) -> int: return processor.get_num_image_tokens( image_width=image_width, image_height=image_height, ) def get_image_size_with_most_features(self) -> ImageSize: processor = self.get_hf_processor() image_processor = processor.image_processor base_size = image_processor.image_size target_ratios = processor.resolve_target_ratios() largest_feature_size, largest_feature_pinpoint = 0, None for wr, hr in target_ratios: width, height = base_size * wr, base_size * hr feat_size = self.get_num_image_tokens( image_width=width, image_height=height, processor=processor, ) if feat_size > largest_feature_size: largest_feature_size = feat_size largest_feature_pinpoint = ImageSize(width=width, height=height) if largest_feature_size == 0 or largest_feature_pinpoint is None: raise ValueError("Cannot have a largest feature size of 0!") return largest_feature_pinpoint def get_max_image_tokens(self) -> int: processor = self.get_hf_processor() target_width, target_height = self.get_image_size_with_most_features() return self.get_num_image_tokens( image_width=target_width, image_height=target_height, processor=processor, ) _I = TypeVar("_I", bound=BaseInternVLProcessingInfo) class BaseInternVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]): """Basic image-only DummyInputsBuilder for InternVL-style models.""" def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: num_images = mm_counts.get("image", 0) return "" * num_images def get_dummy_mm_data( self, seq_len: int, mm_counts: Mapping[str, int], mm_options: Mapping[str, BaseDummyOptions], ) -> MultiModalDataDict: target_width, target_height = self.info.get_image_size_with_most_features() num_images = mm_counts.get("image", 0) image_overrides = mm_options.get("image") return { "image": self._get_dummy_images( width=target_width, height=target_height, num_images=num_images, overrides=image_overrides, ) } class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]): """Basic image-only MultiModalProcessor for InternVL-style models.""" def _call_hf_processor( self, prompt: str, mm_data: Mapping[str, object], mm_kwargs: Mapping[str, object], tok_kwargs: Mapping[str, object], ) -> BatchFeature: processed_outputs = super()._call_hf_processor( prompt=prompt, mm_data=mm_data, mm_kwargs=mm_kwargs, tok_kwargs=tok_kwargs, ) hf_processor = self.info.get_hf_processor(**mm_kwargs) image_token_id = hf_processor.ctx_image_token_id # Since there may be extra tokens in the feature placeholders, # we need to pass the image token ID to the model to select the # tokens to merge from the vision encoder outputs processed_outputs["image_token_id"] = torch.tensor(image_token_id) return processed_outputs def _get_image_fields_config(self, hf_inputs: BatchFeature): image_num_patches = hf_inputs.get("image_num_patches", torch.empty(0)) num_images = len(image_num_patches) return dict( pixel_values_flat=MultiModalFieldConfig.flat_from_sizes( "image", image_num_patches ), image_num_patches=MultiModalFieldConfig.batched("image"), image_embeds=MultiModalFieldConfig.batched("image"), image_token_id=MultiModalFieldConfig.shared("image", num_images), ) def _get_mm_fields_config( self, hf_inputs: BatchFeature, hf_processor_mm_kwargs: Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: return self._get_image_fields_config(hf_inputs) def _get_prompt_repl_image( self, mm_items: MultiModalDataItems, hf_processor: InternVLProcessor, out_mm_data: BatchedTensorInputs, ): if "image_num_patches" in out_mm_data: image_num_patches = out_mm_data["image_num_patches"] assert isinstance(image_num_patches, torch.Tensor) image_num_patches = image_num_patches.tolist() elif "image_embeds" in out_mm_data: # TODO: Use image size information in dictionary embedding inputs # to compute num_patches (similar to Qwen2-VL) image_num_patches = [None] * len(out_mm_data["image_embeds"]) else: image_num_patches = [] def get_replacement_internvl(item_idx: int): images = mm_items.get_items( "image", (ImageEmbeddingItems, ImageProcessorItems) ) if isinstance(images, ImageEmbeddingItems): feature_size = images.get_feature_size(item_idx) else: image_size = images.get_image_size(item_idx) feature_size = self.info.get_num_image_tokens( image_width=image_size.width, image_height=image_size.height, processor=hf_processor, ) num_patches = image_num_patches[item_idx] if num_patches is not None: assert isinstance(num_patches, int) return hf_processor.get_image_repl(num_patches, num_features=feature_size) return PromptReplacement( modality="image", target="", replacement=get_replacement_internvl, ) def _get_prompt_updates( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargsItems, ) -> Sequence[PromptUpdate]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) out_mm_data = out_mm_kwargs.get_data() return [ self._get_prompt_repl_image(mm_items, hf_processor, out_mm_data), ] class InternVLProcessingInfo(BaseInternVLProcessingInfo): """InternVL ProcessingInfo extended for video processing""" def get_image_processor(self, **kwargs): config = self.get_hf_config() vision_config = config.vision_config kwargs = self.ctx.get_merged_mm_kwargs(kwargs) kwargs.setdefault("image_size", vision_config.image_size) kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch) kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch) kwargs.setdefault("dynamic_image_size", config.dynamic_image_size) kwargs.setdefault("use_thumbnail", config.use_thumbnail) return InternVLImageProcessor(**kwargs) def get_video_processor(self, **kwargs): config = self.get_hf_config() vision_config = config.vision_config kwargs = self.ctx.get_merged_mm_kwargs(kwargs) kwargs.setdefault("image_size", vision_config.image_size) return InternVLVideoProcessor(**kwargs) @cached_property def ctx_video_token(self): text_model_type = self.get_hf_config().get_text_config().model_type ctx_video_token_map = { "qwen2": "<|video_pad|>", "qwen3": "<|video_pad|>", "qwen3_moe": "<|video_pad|>", "gpt_oss": "<|reserved_200000|>", } if text_model_type not in ctx_video_token_map: return None ctx_video_token = ctx_video_token_map[text_model_type] if ctx_video_token not in self.get_tokenizer().get_vocab(): return None return ctx_video_token def get_hf_processor(self, **kwargs: object) -> InternVLProcessor: config = self.get_hf_config() vision_config = config.vision_config image_processor = self.get_image_processor(**kwargs) image_size = image_processor.image_size patch_size = vision_config.patch_size downsample_ratio = config.downsample_ratio image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2)) ctx_video_token = self.ctx_video_token video_processor = ( self.get_video_processor(**kwargs) if ctx_video_token else None ) return InternVLProcessor( tokenizer=self.get_tokenizer(), image_processor=image_processor, video_processor=video_processor, image_seq_length=image_seq_length, ctx_video_token=ctx_video_token, ) def get_supported_mm_limits(self): video_limit = {"video": None} if self.ctx_video_token else {} return {**super().get_supported_mm_limits(), **video_limit} def get_num_frames_with_most_features( self, seq_len: int, mm_counts: Mapping[str, int], ) -> int: max_images = mm_counts.get("image", 0) max_videos = mm_counts.get("video", 0) processor = self.get_hf_processor() num_image_token = processor.image_seq_length max_image_tokens = self.get_max_image_tokens() * max_images max_total_frames = (seq_len - max_image_tokens) // num_image_token max_frames_per_video = max_total_frames // max(max_videos, 1) return max(max_frames_per_video, 1) class InternVLDummyInputsBuilder( BaseInternVLDummyInputsBuilder[InternVLProcessingInfo] ): """InternVL DummyInputsBuilder extended for video support""" def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: num_videos = mm_counts.get("video", 0) return super().get_dummy_text(mm_counts) + "