diff --git a/tests/models/multimodal/processing/test_h2ovl.py b/tests/models/multimodal/processing/test_h2ovl.py
index 19e4cb896..3ba256f3c 100644
--- a/tests/models/multimodal/processing/test_h2ovl.py
+++ b/tests/models/multimodal/processing/test_h2ovl.py
@@ -23,7 +23,7 @@ def _get_expected_num_patches(
     min_num: int,
     max_num: int,
 ):
-    from vllm.model_executor.models.h2ovl import (
+    from vllm.transformers_utils.processors.h2ovl import (
         calculate_h2ovl_targets,
         get_h2ovl_target_ratios,
     )
diff --git a/tests/models/multimodal/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py
index 437c7b682..7954dd6b5 100644
--- a/tests/models/multimodal/processing/test_internvl.py
+++ b/tests/models/multimodal/processing/test_internvl.py
@@ -23,7 +23,7 @@ def _get_expected_num_patches(
     min_num: int,
     max_num: int,
 ):
-    from vllm.model_executor.models.internvl import (
+    from vllm.transformers_utils.processors.internvl import (
         calculate_internvl_targets,
         get_internvl_target_ratios,
     )
diff --git a/tests/models/multimodal/processing/test_nemotron_vl.py b/tests/models/multimodal/processing/test_nemotron_vl.py
index d9e635dde..be5c222fd 100644
--- a/tests/models/multimodal/processing/test_nemotron_vl.py
+++ b/tests/models/multimodal/processing/test_nemotron_vl.py
@@ -23,7 +23,7 @@ def _get_expected_num_patches(
     min_num: int,
     max_num: int,
 ):
-    from vllm.model_executor.models.nemotron_vl import (
+    from vllm.transformers_utils.processors.nemotron_vl import (
         calculate_nemotron_vl_targets,
         get_nemotron_vl_target_ratios,
     )
diff --git a/vllm/model_executor/models/eagle2_5_vl.py b/vllm/model_executor/models/eagle2_5_vl.py
index 718e8bb54..3e6182db5 100644
--- a/vllm/model_executor/models/eagle2_5_vl.py
+++ b/vllm/model_executor/models/eagle2_5_vl.py
@@ -15,9 +15,8 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.models.siglip import SiglipVisionModel
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.processing import PromptUpdateDetails
 from vllm.sequence import IntermediateTensors
-from vllm.tokenizers import TokenizerLike
+from vllm.transformers_utils.processors.eagle2_5_vl import Eagle2_5_VLProcessor
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import (
@@ -27,13 +26,9 @@ from .interfaces import (
     SupportsPP,
 )
 from .internvl import (
-    IMG_CONTEXT,
-    IMG_END,
-    IMG_START,
     BaseInternVLDummyInputsBuilder,
     BaseInternVLMultiModalProcessor,
     BaseInternVLProcessingInfo,
-    BaseInternVLProcessor,
 )
 from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
 
@@ -70,81 +65,6 @@ Eagle2_5_VLImageInputs: TypeAlias = (
 )
 
-
-class Eagle2_5_VLProcessor(BaseInternVLProcessor):
-    """
-    Custom processor for Eagle2.5-VL model.
-    Extends BaseInternVLProcessor with Eagle-specific token handling.
-    """
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        tokenizer: TokenizerLike,
-        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-    ) -> None:
-        # Skip super().__init__() to avoid config manipulation
-        # Directly initialize all required attributes
-        self.config = config
-        self.tokenizer = tokenizer
-
-        # Image size with force_image_size override
-        image_size: int = config.vision_config.image_size
-        if hasattr(config, "force_image_size") and config.force_image_size:
-            image_size = config.force_image_size
-
-        patch_size: int = config.vision_config.patch_size
-        downsample_ratio: float = getattr(config, "downsample_ratio", 0.5)
-
-        # Compute num_image_token
-        self.num_image_token = int(
-            (image_size // patch_size) ** 2 * (downsample_ratio**2)
-        )
-        self.image_size = image_size
-
-        # Dynamic patch settings with defaults
-        self.min_dynamic_patch = (
-            min_dynamic_patch
-            if min_dynamic_patch is not None
-            else getattr(config, "min_dynamic_patch", 1)
-        )
-        self.max_dynamic_patch = (
-            max_dynamic_patch
-            if max_dynamic_patch is not None
-            else getattr(config, "max_dynamic_patch", 12)
-        )
-        self.dynamic_image_size = (
-            dynamic_image_size
-            if dynamic_image_size is not None
-            else getattr(config, "dynamic_image_size", True)
-        )
-        self.use_thumbnail: bool = getattr(config, "use_thumbnail", True)
-
-    @property
-    def image_token_id(self) -> int:
-        """Get the image token ID from config or tokenizer."""
-        if hasattr(self.config, "image_token_index"):
-            return self.config.image_token_index
-        # Fallback to tokenizer vocab - use <IMG_CONTEXT> (ID: 151667)
-        vocab = self.tokenizer.get_vocab()
-        if IMG_CONTEXT in vocab:
-            return vocab[IMG_CONTEXT]
-        raise ValueError(f"Cannot find image token '{IMG_CONTEXT}' in vocabulary")
-
-    def get_image_repl(
-        self,
-        feature_size: int,
-        num_patches: int | None,
-    ) -> PromptUpdateDetails[str]:
-        """Get image replacement string for prompt."""
-        repl_features = IMG_CONTEXT * feature_size
-        repl_full = IMG_START + repl_features + IMG_END
-
-        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
-
-
 class Eagle2_5_VLProcessingInfo(BaseInternVLProcessingInfo):
     """Processing info for Eagle2.5-VL model."""
 
diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py
index 0b61bd5a2..3b01985c4 100644
--- a/vllm/model_executor/models/h2ovl.py
+++ b/vllm/model_executor/models/h2ovl.py
@@ -11,7 +11,6 @@ from collections.abc import Mapping, Sequence
 
 import torch
-from PIL import Image
 from transformers import PretrainedConfig
 
 from vllm.model_executor.layers.quantization import QuantizationConfig
@@ -27,391 +26,19 @@ from vllm.multimodal.processing.processor import (
     ProcessorInputs,
     PromptReplacement,
     PromptUpdate,
-    PromptUpdateDetails,
     TimingContext,
 )
-from vllm.tokenizers import TokenizerLike
+from vllm.transformers_utils.processors.h2ovl import H2OVLProcessor
 
 from .intern_vit import InternVisionModel
 from .internvl import (
-    IMG_CONTEXT,
-    IMG_END,
-    IMG_START,
     BaseInternVLDummyInputsBuilder,
     BaseInternVLMultiModalProcessor,
     BaseInternVLProcessingInfo,
-    BaseInternVLProcessor,
     InternVLChatModel,
-    build_transform,
-    find_closest_aspect_ratio,
-    get_internvl_target_ratios,
 )
 
-
-def resolve_h2ovl_min_max_num(
-    *,
-    min_dynamic_patch: int,
-    max_dynamic_patch: int,
-    dynamic_image_size: bool,
-    use_thumbnail: bool,
-) -> tuple[int, int]:
-    min_dynamic_patch = min_dynamic_patch if dynamic_image_size else 1
-    max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
-
-    if use_thumbnail and max_dynamic_patch != 1:
-        max_dynamic_patch += 1
-
-    return min_dynamic_patch, max_dynamic_patch
-
-
-def get_h2ovl_target_ratios(
-    min_num: int,
-    max_num: int,
-    *,
-    prior_aspect_ratio: tuple[int, int] | None,
-) -> list[tuple[int, int]]:
-    target_ratios = get_internvl_target_ratios(min_num, max_num)
-
-    # if prior_aspect_ratio is provided, filter the target ratios
-    if prior_aspect_ratio is not None:
-        target_ratios = [
-            ratio
-            for ratio in target_ratios
-            if prior_aspect_ratio[0] % ratio[0] != 0
-            and prior_aspect_ratio[1] % ratio[1] != 0
-        ]
-
-    return target_ratios
-
-
-# modified to include blocks generated in second pass
-def calculate_h2ovl_targets(
-    *,
-    orig_width: int,
-    orig_height: int,
-    target_ratios: list[tuple[int, int]],
-    image_size: int,
-    use_thumbnail: bool,
-) -> tuple[int, int, int, tuple[int, int]]:
-    aspect_ratio = orig_width / orig_height
-
-    # find the closest aspect ratio to the target
-    target_aspect_ratio = find_closest_aspect_ratio(
-        aspect_ratio,
-        target_ratios,
-        width=orig_width,
-        height=orig_height,
-        image_size=image_size,
-    )
-
-    # calculate the target width and height
-    target_width = image_size * target_aspect_ratio[0]
-    target_height = image_size * target_aspect_ratio[1]
-    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
-
-    # add thumbnail image if num_blocks != 1
-    if use_thumbnail and blocks != 1:
-        blocks += 1
-
-    return blocks, target_width, target_height, target_aspect_ratio
-
-
-# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
-# refactored to handle prior_aspect_ratio
-def dynamic_preprocess_h2ovl(
-    image: Image.Image,
-    *,
-    target_ratios: list[tuple[int, int]],
-    image_size: int,
-    use_thumbnail: bool,
-) -> tuple[list[Image.Image], tuple[int, int]]:
-    orig_width, orig_height = image.size
-
-    # calculate the number of blocks without thumbnail
-    (
-        blocks,
-        target_width,
-        target_height,
-        target_aspect_ratio,
-    ) = calculate_h2ovl_targets(
-        orig_width=orig_width,
-        orig_height=orig_height,
-        target_ratios=target_ratios,
-        image_size=image_size,
-        use_thumbnail=False,
-    )
-
-    # resize the image
-    resized_img = image.resize((target_width, target_height))
-    processed_images = []
-    for i in range(blocks):
-        box = (
-            (i % (target_width // image_size)) * image_size,
-            (i // (target_width // image_size)) * image_size,
-            ((i % (target_width // image_size)) + 1) * image_size,
-            ((i // (target_width // image_size)) + 1) * image_size,
-        )
-        # split the image
-        split_img = resized_img.crop(box)
-        processed_images.append(split_img)
-
-    assert len(processed_images) == blocks
-
-    if use_thumbnail and len(processed_images) != 1:
-        thumbnail_img = image.resize((image_size, image_size))
-        processed_images.append(thumbnail_img)
-
-    return processed_images, target_aspect_ratio
-
-
-def _preprocess_image(
-    image: Image.Image,
-    *,
-    input_size: int,
-    min_num: int,
-    max_num: int,
-    use_thumbnail: bool,
-    prior_aspect_ratio: tuple[int, int] | None,
-) -> tuple[torch.Tensor, tuple[int, int]]:
-    target_ratios = get_h2ovl_target_ratios(
-        min_num,
-        max_num,
-        prior_aspect_ratio=prior_aspect_ratio,
-    )
-
-    transform = build_transform(input_size=input_size)
-    images, target_aspect_ratio = dynamic_preprocess_h2ovl(
-        image,
-        image_size=input_size,
-        use_thumbnail=use_thumbnail,
-        target_ratios=target_ratios,
-    )
-
-    pixel_values = torch.stack([transform(image) for image in images])
-    return pixel_values, target_aspect_ratio
-
-
-# refactored to use the _preprocess_image function
-def image_to_pixel_values_h2ovl(
-    image: Image.Image,
-    *,
-    input_size: int,
-    min_num: int,
-    max_num: int,
-    use_thumbnail: bool,
-    use_msac: bool,
-) -> torch.Tensor:
-    # when MSAC is turned on, we need to process the image twice
-    if use_msac:
-        # first pass
-        pixel_values1, aspect_ratio1 = _preprocess_image(
-            image,
-            input_size=input_size,
-            min_num=1,
-            max_num=max_num,
-            use_thumbnail=True,
-            prior_aspect_ratio=None,
-        )
-        # second pass
-        pixel_values2, _ = _preprocess_image(
-            image,
-            input_size=input_size,
-            min_num=3,
-            max_num=max_num,
-            use_thumbnail=True,
-            prior_aspect_ratio=aspect_ratio1,
-        )
-        # combine pixel values
-        pixel_values = torch.cat(
-            [pixel_values2[:-1], pixel_values1[:-1], pixel_values2[-1:]], 0
-        )
-
-    else:
-        pixel_values, _ = _preprocess_image(
-            image,
-            input_size=input_size,
-            min_num=min_num,
-            max_num=max_num,
-            use_thumbnail=use_thumbnail,
-            prior_aspect_ratio=None,
-        )
-
-    return pixel_values
-
-
-class H2OVLProcessor(BaseInternVLProcessor):
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        tokenizer: TokenizerLike,
-        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-        use_msac: bool | None = None,
-    ) -> None:
-        super().__init__(
-            config,
-            tokenizer,
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-        )
-
-        if use_msac is None:
-            use_msac = config.use_msac
-        assert isinstance(use_msac, bool)
-
-        self.use_msac = use_msac
-
-    @property
-    def image_token_id(self) -> int:
-        return self.tokenizer.get_vocab()[IMG_CONTEXT]
-
-    def get_image_repl(
-        self,
-        feature_size: int,
-        num_patches: int | None,
-    ) -> PromptUpdateDetails[str]:
-        repl_features = IMG_CONTEXT * feature_size
-        repl_full = IMG_START + repl_features + IMG_END
-
-        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
-
-    def resolve_min_max_num(
-        self,
-        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-        use_thumbnail: bool | None = None,
-    ) -> tuple[int, int]:
-        min_dynamic_patch = (
-            self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
-        )
-        max_dynamic_patch = (
-            self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
-        )
-        dynamic_image_size = (
-            self.dynamic_image_size
-            if dynamic_image_size is None
-            else dynamic_image_size
-        )
-        use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
-
-        return resolve_h2ovl_min_max_num(
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-            use_thumbnail=use_thumbnail,
-        )
-
-    def resolve_target_ratios(
-        self,
-        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-        use_thumbnail: bool | None = None,
-        prior_aspect_ratio: tuple[int, int] | None = None,
-        override_min_num: int | None = None,
-    ) -> list[tuple[int, int]]:
-        min_num, max_num = self.resolve_min_max_num(
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-            use_thumbnail=use_thumbnail,
-        )
-        if override_min_num is not None:
-            min_num = override_min_num
-
-        return get_h2ovl_target_ratios(
-            min_num,
-            max_num,
-            prior_aspect_ratio=prior_aspect_ratio,
-        )
-
-    def get_num_image_tokens(
-        self,
-        *,
-        image_width: int,
-        image_height: int,
-        use_msac: bool | None = None,
-    ) -> int:
-        use_msac = self.use_msac if use_msac is None else use_msac
-
-        use_thumbnail = self.use_thumbnail
-
-        if use_msac:
-            target_ratios_1 = self.resolve_target_ratios(
-                use_thumbnail=False,  # Applied in calculate_targets
-                override_min_num=1,
-            )
-            num_patches_1, _, _, aspect_ratio_1 = calculate_h2ovl_targets(
-                orig_width=image_width,
-                orig_height=image_height,
-                image_size=self.image_size,
-                target_ratios=target_ratios_1,
-                use_thumbnail=True,
-            )
-
-            target_ratios_2 = self.resolve_target_ratios(
-                use_thumbnail=False,  # Applied in calculate_targets
-                prior_aspect_ratio=aspect_ratio_1,
-                override_min_num=3,
-            )
-            num_patches_2, _, _, _ = calculate_h2ovl_targets(
-                orig_width=image_width,
-                orig_height=image_height,
-                image_size=self.image_size,
-                target_ratios=target_ratios_2,
-                use_thumbnail=True,
-            )
-
-            num_patches = num_patches_1 + num_patches_2 - 1
-        else:
-            target_ratios = self.resolve_target_ratios(
-                use_thumbnail=False,  # Applied in calculate_targets
-            )
-            num_patches, _, _, _ = calculate_h2ovl_targets(
-                orig_width=image_width,
-                orig_height=image_height,
-                image_size=self.image_size,
-                target_ratios=target_ratios,
-                use_thumbnail=use_thumbnail,
-            )
-
-        return num_patches * self.num_image_token
-
-    def _images_to_pixel_values_lst(
-        self,
-        images: list[Image.Image],
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-    ) -> list[torch.Tensor]:
-        use_msac = self.use_msac if len(images) == 1 else False
-
-        min_num, max_num = self.resolve_min_max_num(
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-            use_thumbnail=False,  # Applied in image_to_pixel_values
-        )
-
-        return [
-            image_to_pixel_values_h2ovl(
-                image,
-                input_size=self.image_size,
-                min_num=min_num,
-                max_num=max_num,
-                use_thumbnail=self.use_thumbnail,
-                use_msac=use_msac,
-            )
-            for image in images
-        ]
-
-
 class H2OVLProcessingInfo(BaseInternVLProcessingInfo):
     def get_hf_processor(self, **kwargs: object) -> H2OVLProcessor:
         return self.ctx.init_processor(
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index cdaa2b093..8126391b2 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -7,16 +7,13 @@
 # Copyright (c) 2023 OpenGVLab
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
-from abc import ABC, abstractmethod
+from abc import abstractmethod
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Annotated, Any, Literal, TypeAlias, TypeVar
+from typing import Annotated, Literal, TypeAlias, TypeVar
 
-import numpy.typing as npt
 import torch
 import torch.nn as nn
-import torchvision.transforms as T
-from PIL import Image
-from transformers import BatchFeature, PretrainedConfig, TensorType
+from transformers import BatchFeature, PretrainedConfig
 
 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
@@ -28,7 +25,6 @@ from vllm.model_executor.models.intern_vit import (
 )
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.image import convert_image_mode
 from vllm.multimodal.inputs import (
     MultiModalDataDict,
     MultiModalFieldConfig,
@@ -46,10 +42,12 @@ from vllm.multimodal.processing import (
     BaseProcessingInfo,
     PromptReplacement,
     PromptUpdate,
-    PromptUpdateDetails,
 )
 from vllm.sequence import IntermediateTensors
-from vllm.tokenizers import TokenizerLike
+from vllm.transformers_utils.processors.internvl import (
+    BaseInternVLProcessor,
+    InternVLProcessor,
+)
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import (
@@ -60,13 +58,6 @@ from .interfaces import (
 )
 from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
 
-IMG_START = "<img>"
-IMG_END = "</img>"
-IMG_CONTEXT = "<IMG_CONTEXT>"
-
-IMAGENET_MEAN = (0.485, 0.456, 0.406)
-IMAGENET_STD = (0.229, 0.224, 0.225)
-
 
 class InternVLImagePixelInputs(TensorSchema):
     """
@@ -128,568 +119,6 @@ class InternVLVideoEmbeddingInputs(TensorSchema):
 InternVLVideoInputs: TypeAlias = InternVLVideoPixelInputs | InternVLVideoEmbeddingInputs
 
-
-# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
-def build_transform(input_size: int):
-    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
-    transform = T.Compose(
-        [
-            T.Lambda(lambda img: convert_image_mode(img, "RGB")),
-            T.Resize(
-                (input_size, input_size), interpolation=T.InterpolationMode.BICUBIC
-            ),
-            T.ToTensor(),
-            T.Normalize(mean=MEAN, std=STD),
-        ]
-    )
-    return transform
-
-
-# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
-def find_closest_aspect_ratio(
-    aspect_ratio: float,
-    target_ratios: list[tuple[int, int]],
-    *,
-    width: int,
-    height: int,
-    image_size: int,
-) -> tuple[int, int]:
-    best_ratio_diff = float("inf")
-    best_ratio = (1, 1)
-    area = width * height
-    for ratio in target_ratios:
-        target_aspect_ratio = ratio[0] / ratio[1]
-        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
-        if ratio_diff < best_ratio_diff:
-            best_ratio_diff = ratio_diff
-            best_ratio = ratio
-        elif ratio_diff == best_ratio_diff:
-            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
-                best_ratio = ratio
-    return best_ratio
-
-
-def resolve_internvl_min_max_num(
-    *,
-    min_dynamic_patch: int,
-    max_dynamic_patch: int,
-    dynamic_image_size: bool,
-    use_thumbnail: bool,
-) -> tuple[int, int]:
-    min_dynamic_patch = min_dynamic_patch if dynamic_image_size else 1
-    max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
-
-    if use_thumbnail and max_dynamic_patch != 1:
-        max_dynamic_patch += 1
-
-    return min_dynamic_patch, max_dynamic_patch
-
-
-def get_internvl_target_ratios(
-    min_num: int,
-    max_num: int,
-) -> list[tuple[int, int]]:
-    target_ratios = {
-        (i, j)
-        for n in range(min_num, max_num + 1)
-        for i in range(1, n + 1)
-        for j in range(1, n + 1)
-        if min_num <= i * j <= max_num
-    }
-    return sorted(target_ratios, key=lambda x: x[0] * x[1])
-
-
-def calculate_internvl_targets(
-    *,
-    orig_width: int,
-    orig_height: int,
-    target_ratios: list[tuple[int, int]],
-    image_size: int,
-    use_thumbnail: bool,
-) -> tuple[int, int, int]:
-    aspect_ratio = orig_width / orig_height
-
-    # find the closest aspect ratio to the target
-    target_aspect_ratio = find_closest_aspect_ratio(
-        aspect_ratio,
-        target_ratios,
-        width=orig_width,
-        height=orig_height,
-        image_size=image_size,
-    )
-
-    # calculate the target width and height
-    target_width = image_size * target_aspect_ratio[0]
-    target_height = image_size * target_aspect_ratio[1]
-    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
-
-    # add thumbnail image if num_blocks != 1
-    if use_thumbnail and blocks != 1:
-        blocks += 1
-
-    return blocks, target_width, target_height
-
-
-# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
-def dynamic_preprocess_internvl(
-    image: Image.Image,
-    *,
-    target_ratios: list[tuple[int, int]],
-    image_size: int,
-    use_thumbnail: bool,
-) -> list[Image.Image]:
-    orig_width, orig_height = image.size
-
-    # calculate the number of blocks without thumbnail
-    blocks, target_width, target_height = calculate_internvl_targets(
-        orig_width=orig_width,
-        orig_height=orig_height,
-        target_ratios=target_ratios,
-        image_size=image_size,
-        use_thumbnail=False,
-    )
-
-    # resize the image
-    resized_img = image.resize((target_width, target_height))
-    processed_images = []
-    for i in range(blocks):
-        box = (
-            (i % (target_width // image_size)) * image_size,
-            (i // (target_width // image_size)) * image_size,
-            ((i % (target_width // image_size)) + 1) * image_size,
-            ((i // (target_width // image_size)) + 1) * image_size,
-        )
-        # split the image
-        split_img = resized_img.crop(box)
-        processed_images.append(split_img)
-
-    assert len(processed_images) == blocks
-
-    if use_thumbnail and len(processed_images) != 1:
-        thumbnail_img = image.resize((image_size, image_size))
-        processed_images.append(thumbnail_img)
-
-    return processed_images
-
-
-# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
-def image_to_pixel_values_internvl(
-    image: Image.Image,
-    *,
-    input_size: int,
-    min_num: int,
-    max_num: int,
-    use_thumbnail: bool,
-) -> torch.Tensor:
-    target_ratios = get_internvl_target_ratios(min_num, max_num)
-
-    transform = build_transform(input_size=input_size)
-    images = dynamic_preprocess_internvl(
-        image,
-        target_ratios=target_ratios,
-        image_size=input_size,
-        use_thumbnail=use_thumbnail,
-    )
-
-    pixel_values = torch.stack([transform(image) for image in images])
-    return pixel_values
-
-
-# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
-def video_to_pixel_values_internvl(
-    video: npt.NDArray,
-    *,
-    input_size: int,
-    min_num: int,
-    max_num: int,
-    use_thumbnail: bool,
-) -> torch.Tensor:
-    target_ratios = get_internvl_target_ratios(min_num, max_num)
-
-    transform = build_transform(input_size=input_size)
-    frames_list = list[Image.Image]()
-    for frame in video:
-        pil_frame = dynamic_preprocess_internvl(
-            Image.fromarray(frame, mode="RGB"),
-            target_ratios=target_ratios,
-            image_size=input_size,
-            use_thumbnail=use_thumbnail,
-        )
-        assert len(pil_frame) == 1
-        frames_list.extend(pil_frame)
-
-    pixel_values = torch.stack([transform(image) for image in frames_list])
-    return pixel_values
-
-
-class BaseInternVLProcessor(ABC):
-    """
-    This model doesn't define its own HF processor,
-    so we implement our own one here.
-
-    The code to insert image tokens is based on:
-    https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
-    """
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        tokenizer: TokenizerLike,
-        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-    ) -> None:
-        super().__init__()
-
-        self.config = config
-        self.tokenizer = tokenizer
-
-        image_size: int = config.vision_config.image_size
-        patch_size: int = config.vision_config.patch_size
-
-        if min_dynamic_patch is None:
-            min_dynamic_patch = config.min_dynamic_patch
-        assert isinstance(min_dynamic_patch, int)
-
-        if max_dynamic_patch is None:
-            max_dynamic_patch = config.max_dynamic_patch
-        assert isinstance(max_dynamic_patch, int)
-
-        if dynamic_image_size is None:
-            dynamic_image_size = config.dynamic_image_size
-        assert isinstance(dynamic_image_size, bool)
-
-        self.num_image_token = int(
-            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
-        )
-        self.image_size = image_size
-        self.min_dynamic_patch = min_dynamic_patch
-        self.max_dynamic_patch = max_dynamic_patch
-        self.dynamic_image_size = dynamic_image_size
-        self.use_thumbnail: bool = config.use_thumbnail
-
-    @property
-    @abstractmethod
-    def image_token_id(self) -> int:
-        raise NotImplementedError
-
-    @abstractmethod
-    def get_image_repl(
-        self,
-        feature_size: int,
-        num_patches: int | None,
-    ) -> PromptUpdateDetails[str]:
-        raise NotImplementedError
-
-    def resolve_min_max_num(
-        self,
-        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-        use_thumbnail: bool | None = None,
-    ) -> tuple[int, int]:
-        min_dynamic_patch = (
-            self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
-        )
-        max_dynamic_patch = (
-            self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
-        )
-        dynamic_image_size = (
-            self.dynamic_image_size
-            if dynamic_image_size is None
-            else dynamic_image_size
-        )
-        use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
-
-        return resolve_internvl_min_max_num(
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-            use_thumbnail=use_thumbnail,
-        )
-
-    def resolve_target_ratios(
-        self,
-        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-        use_thumbnail: bool | None = None,
-    ) -> list[tuple[int, int]]:
-        min_num, max_num = self.resolve_min_max_num(
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-            use_thumbnail=use_thumbnail,
-        )
-
-        return get_internvl_target_ratios(min_num, max_num)
-
-    def get_num_image_tokens(
-        self,
-        *,
-        image_width: int,
-        image_height: int,
-    ) -> int:
-        target_ratios = self.resolve_target_ratios(
-            use_thumbnail=False,  # Applied in calculate_targets
-        )
-
-        num_patches, _, _ = calculate_internvl_targets(
-            orig_width=image_width,
-            orig_height=image_height,
-            image_size=self.image_size,
-            target_ratios=target_ratios,
-            use_thumbnail=self.use_thumbnail,
-        )
-
-        return num_patches * self.num_image_token
-
-    def _images_to_pixel_values_lst(
-        self,
-        images: list[Image.Image],
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-    ) -> list[torch.Tensor]:
-        min_num, max_num = self.resolve_min_max_num(
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-            use_thumbnail=False,  # Applied in image_to_pixel_values
-        )
-
-        return [
-            image_to_pixel_values_internvl(
-                image,
-                input_size=self.image_size,
-                min_num=min_num,
-                max_num=max_num,
-                use_thumbnail=self.use_thumbnail,
-            )
-            for image in images
-        ]
-
-    def _preprocess_image(
-        self,
-        text: list[str],
-        images: list[Image.Image],
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-    ) -> tuple[list[str], dict[str, torch.Tensor]]:
-        if len(images) == 0:
-            image_inputs = {}
-        else:
-            pixel_values_lst = self._images_to_pixel_values_lst(
-                images,
-                min_dynamic_patch=min_dynamic_patch,
-                max_dynamic_patch=max_dynamic_patch,
-                dynamic_image_size=dynamic_image_size,
-            )
-            image_inputs = {
-                "pixel_values_flat": torch.cat(pixel_values_lst),
-                "image_num_patches": torch.tensor(
-                    [len(item) for item in pixel_values_lst]
-                ),
-            }
-
-            for pixel_values in pixel_values_lst:
-                num_patches = pixel_values.shape[0]
-                feature_size = num_patches * self.num_image_token
-
-                image_repl = self.get_image_repl(feature_size, num_patches)
-                text = [t.replace("<image>", image_repl.full, 1) for t in text]
-        return text, image_inputs
-
-    def _make_batch_input(self, input_item: Any | list[Any] | None = None):
-        if input_item is None:
-            input_item = []
-        if not isinstance(input_item, list):
-            input_item = [input_item]
-        return input_item
-
-    def __call__(
-        self,
-        text: str | list[str] | None = None,
-        images: Image.Image | list[Image.Image] | None = None,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-        return_tensors: str | TensorType | None = None,
-    ) -> BatchFeature:
-        text, images = [self._make_batch_input(x) for x in (text, images)]
-
-        text, image_inputs = self._preprocess_image(
-            text=text,
-            images=images,
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-        )
-
-        text_inputs = self.tokenizer(text)
-
-        combined_outputs = {**text_inputs, **image_inputs}
-
-        return BatchFeature(combined_outputs, tensor_type=return_tensors)
-
-
-class InternVLProcessor(BaseInternVLProcessor):
-    """
-    HF Processor for InternVLChatModel with extended video processing logic.
-
-    Code for video processing is adapted from video example:
-    https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers
-    """
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        tokenizer: TokenizerLike,
-        *,
-        min_dynamic_patch: int | None = None,
-        max_dynamic_patch: int | None = None,
-        dynamic_image_size: bool | None = None,
-        video_token: str | None = None,
-    ) -> None:
-        super().__init__(
-            config=config,
-            tokenizer=tokenizer,
-            min_dynamic_patch=min_dynamic_patch,
-            max_dynamic_patch=max_dynamic_patch,
-            dynamic_image_size=dynamic_image_size,
-        )
-        # add extra video token for video processing
-        self.video_token = video_token
-
-    @property
-    def image_token_id(self) -> int:
-        return self.tokenizer.get_vocab()[IMG_CONTEXT]
-
-    @property
-    def video_token_id(self) -> int | None:
-        if self.video_token is None:
-            return None
-        return self.tokenizer.get_vocab().get(self.video_token, None)
-
-    @property
-    def supports_video(self) -> bool:
-        return self.video_token_id is not None
-
-    def _videos_to_pixel_values_lst(
-        self,
-        videos: list[npt.NDArray],
-        dynamic_image_size: bool | None = None,
-    ) -> list[torch.Tensor]:
-        min_num, max_num = self.resolve_min_max_num(
-            min_dynamic_patch=1,
-            max_dynamic_patch=1,
-            dynamic_image_size=dynamic_image_size,
-            use_thumbnail=False,  # Applied in image_to_pixel_values
-        )
-
-        return [
-            video_to_pixel_values_internvl(
-                video,
-                input_size=self.image_size,
-                min_num=min_num,
-                max_num=max_num,
-                use_thumbnail=False,
-            )
-            for video in videos
-        ]
-
-    def _preprocess_video(
-        self,
-        text: list[str],
-        videos: list[npt.NDArray],
-        dynamic_image_size: bool | None = None,
-    ):
-        if len(videos) == 0 or not self.supports_video:
-            video_inputs = {}
-        else:
-            pixel_values_lst_video = self._videos_to_pixel_values_lst(
-                videos,
-                dynamic_image_size=dynamic_image_size,
-            )
-            video_inputs = {
-                "pixel_values_flat_video": torch.cat(pixel_values_lst_video),
-                "video_num_patches": torch.tensor(
-                    [len(item) for item in pixel_values_lst_video]
-                ),
-            }
-
-            for pixel_values in pixel_values_lst_video:
-                num_patches = pixel_values.shape[0]
-
-                video_repl = self.get_video_repl(
-                    self.num_image_token, num_patches, self.video_token
-                )
-                text = [t.replace("