[1/2] Move InternVL-based processors (#37260)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -23,7 +23,7 @@ def _get_expected_num_patches(
|
||||
min_num: int,
|
||||
max_num: int,
|
||||
):
|
||||
from vllm.model_executor.models.h2ovl import (
|
||||
from vllm.transformers_utils.processors.h2ovl import (
|
||||
calculate_h2ovl_targets,
|
||||
get_h2ovl_target_ratios,
|
||||
)
|
||||
|
||||
@@ -23,7 +23,7 @@ def _get_expected_num_patches(
|
||||
min_num: int,
|
||||
max_num: int,
|
||||
):
|
||||
from vllm.model_executor.models.internvl import (
|
||||
from vllm.transformers_utils.processors.internvl import (
|
||||
calculate_internvl_targets,
|
||||
get_internvl_target_ratios,
|
||||
)
|
||||
|
||||
@@ -23,7 +23,7 @@ def _get_expected_num_patches(
|
||||
min_num: int,
|
||||
max_num: int,
|
||||
):
|
||||
from vllm.model_executor.models.nemotron_vl import (
|
||||
from vllm.transformers_utils.processors.nemotron_vl import (
|
||||
calculate_nemotron_vl_targets,
|
||||
get_nemotron_vl_target_ratios,
|
||||
)
|
||||
|
||||
@@ -15,9 +15,8 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
||||
from vllm.model_executor.models.siglip import SiglipVisionModel
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.processing import PromptUpdateDetails
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
from vllm.transformers_utils.processors.eagle2_5_vl import Eagle2_5_VLProcessor
|
||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||
|
||||
from .interfaces import (
|
||||
@@ -27,13 +26,9 @@ from .interfaces import (
|
||||
SupportsPP,
|
||||
)
|
||||
from .internvl import (
|
||||
IMG_CONTEXT,
|
||||
IMG_END,
|
||||
IMG_START,
|
||||
BaseInternVLDummyInputsBuilder,
|
||||
BaseInternVLMultiModalProcessor,
|
||||
BaseInternVLProcessingInfo,
|
||||
BaseInternVLProcessor,
|
||||
)
|
||||
from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
|
||||
|
||||
@@ -70,81 +65,6 @@ Eagle2_5_VLImageInputs: TypeAlias = (
|
||||
)
|
||||
|
||||
|
||||
class Eagle2_5_VLProcessor(BaseInternVLProcessor):
    """
    Custom processor for the Eagle2.5-VL model.

    Extends BaseInternVLProcessor with Eagle-specific token handling.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        # Intentionally bypass super().__init__() so the base class does not
        # manipulate the config; initialize every required attribute here.
        self.config = config
        self.tokenizer = tokenizer

        # Image size, honoring an optional force_image_size override.
        image_size: int = config.vision_config.image_size
        forced_size = getattr(config, "force_image_size", None)
        if forced_size:
            image_size = forced_size

        patch_size: int = config.vision_config.patch_size
        downsample_ratio: float = getattr(config, "downsample_ratio", 0.5)

        # Tokens contributed per tile after spatial downsampling.
        self.num_image_token = int(
            (image_size // patch_size) ** 2 * (downsample_ratio**2)
        )
        self.image_size = image_size

        # Dynamic-patch settings, with config-derived defaults.
        if min_dynamic_patch is None:
            min_dynamic_patch = getattr(config, "min_dynamic_patch", 1)
        self.min_dynamic_patch = min_dynamic_patch

        if max_dynamic_patch is None:
            max_dynamic_patch = getattr(config, "max_dynamic_patch", 12)
        self.max_dynamic_patch = max_dynamic_patch

        if dynamic_image_size is None:
            dynamic_image_size = getattr(config, "dynamic_image_size", True)
        self.dynamic_image_size = dynamic_image_size

        self.use_thumbnail: bool = getattr(config, "use_thumbnail", True)

    @property
    def image_token_id(self) -> int:
        """Get the image token ID from config or tokenizer."""
        if hasattr(self.config, "image_token_index"):
            return self.config.image_token_index
        # Fallback to tokenizer vocab - use <IMG_CONTEXT> (ID: 151667)
        vocab = self.tokenizer.get_vocab()
        if IMG_CONTEXT in vocab:
            return vocab[IMG_CONTEXT]
        raise ValueError(f"Cannot find image token '{IMG_CONTEXT}' in vocabulary")

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Get image replacement string for prompt."""
        context_tokens = IMG_CONTEXT * feature_size
        full_repl = IMG_START + context_tokens + IMG_END

        return PromptUpdateDetails.select_text(full_repl, IMG_CONTEXT)
|
||||
|
||||
|
||||
class Eagle2_5_VLProcessingInfo(BaseInternVLProcessingInfo):
|
||||
"""Processing info for Eagle2.5-VL model."""
|
||||
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
from collections.abc import Mapping, Sequence
|
||||
|
||||
import torch
|
||||
from PIL import Image
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
from vllm.model_executor.layers.quantization import QuantizationConfig
|
||||
@@ -27,391 +26,19 @@ from vllm.multimodal.processing.processor import (
|
||||
ProcessorInputs,
|
||||
PromptReplacement,
|
||||
PromptUpdate,
|
||||
PromptUpdateDetails,
|
||||
TimingContext,
|
||||
)
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
from vllm.transformers_utils.processors.h2ovl import H2OVLProcessor
|
||||
|
||||
from .intern_vit import InternVisionModel
|
||||
from .internvl import (
|
||||
IMG_CONTEXT,
|
||||
IMG_END,
|
||||
IMG_START,
|
||||
BaseInternVLDummyInputsBuilder,
|
||||
BaseInternVLMultiModalProcessor,
|
||||
BaseInternVLProcessingInfo,
|
||||
BaseInternVLProcessor,
|
||||
InternVLChatModel,
|
||||
build_transform,
|
||||
find_closest_aspect_ratio,
|
||||
get_internvl_target_ratios,
|
||||
)
|
||||
|
||||
|
||||
def resolve_h2ovl_min_max_num(
    *,
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool,
    use_thumbnail: bool,
) -> tuple[int, int]:
    """Resolve the effective (min, max) dynamic-patch bounds.

    Disabling dynamic image size clamps both bounds to 1; a thumbnail
    tile reserves one extra slot in the upper bound when tiling is on.
    """
    if not dynamic_image_size:
        return 1, 1

    extra = 1 if use_thumbnail and max_dynamic_patch != 1 else 0
    return min_dynamic_patch, max_dynamic_patch + extra
|
||||
|
||||
|
||||
def get_h2ovl_target_ratios(
    min_num: int,
    max_num: int,
    *,
    prior_aspect_ratio: tuple[int, int] | None,
) -> list[tuple[int, int]]:
    """Candidate (w, h) tile grids, optionally filtered by a prior ratio.

    When ``prior_aspect_ratio`` is given (second MSAC pass), any grid
    where either component divides the corresponding prior component
    is excluded.
    """
    ratios = get_internvl_target_ratios(min_num, max_num)
    if prior_aspect_ratio is None:
        return ratios

    prior_w, prior_h = prior_aspect_ratio
    return [
        (w, h)
        for w, h in ratios
        if prior_w % w != 0 and prior_h % h != 0
    ]
|
||||
|
||||
|
||||
# modified to include blocks generated in second pass
def calculate_h2ovl_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int, tuple[int, int]]:
    """Pick the best tiling grid for an image.

    Returns ``(num_blocks, target_width, target_height, aspect_ratio)``,
    where ``aspect_ratio`` is the chosen (w, h) grid.
    """
    grid_w, grid_h = target_aspect_ratio = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )

    blocks = grid_w * grid_h
    # A thumbnail tile is added whenever more than one block is produced.
    if use_thumbnail and blocks != 1:
        blocks += 1

    return blocks, image_size * grid_w, image_size * grid_h, target_aspect_ratio
|
||||
|
||||
|
||||
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
# refactored to handle prior_aspect_ratio
def dynamic_preprocess_h2ovl(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[list[Image.Image], tuple[int, int]]:
    """Split an image into square tiles, returning the tiles and the
    chosen (w, h) grid ratio."""
    width, height = image.size

    # Tiling grid without the thumbnail; it is appended separately below.
    num_tiles, tiled_w, tiled_h, grid_ratio = calculate_h2ovl_targets(
        orig_width=width,
        orig_height=height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )

    resized = image.resize((tiled_w, tiled_h))
    cols = tiled_w // image_size
    tiles = []
    for idx in range(num_tiles):
        col, row = idx % cols, idx // cols
        # Crop out one image_size x image_size tile.
        tiles.append(
            resized.crop(
                (
                    col * image_size,
                    row * image_size,
                    (col + 1) * image_size,
                    (row + 1) * image_size,
                )
            )
        )

    assert len(tiles) == num_tiles

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))

    return tiles, grid_ratio
|
||||
|
||||
|
||||
def _preprocess_image(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
    prior_aspect_ratio: tuple[int, int] | None,
) -> tuple[torch.Tensor, tuple[int, int]]:
    """Tile one image and transform the tiles into a stacked pixel tensor,
    also returning the chosen grid ratio for MSAC's second pass."""
    ratios = get_h2ovl_target_ratios(
        min_num,
        max_num,
        prior_aspect_ratio=prior_aspect_ratio,
    )
    tiles, grid_ratio = dynamic_preprocess_h2ovl(
        image,
        image_size=input_size,
        use_thumbnail=use_thumbnail,
        target_ratios=ratios,
    )

    to_tensor = build_transform(input_size=input_size)
    return torch.stack([to_tensor(tile) for tile in tiles]), grid_ratio
|
||||
|
||||
|
||||
# refactored to use the _preprocess_image function
def image_to_pixel_values_h2ovl(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
    use_msac: bool,
) -> torch.Tensor:
    """Convert an image to pixel values, optionally via two MSAC passes."""
    if not use_msac:
        pixel_values, _ = _preprocess_image(
            image,
            input_size=input_size,
            min_num=min_num,
            max_num=max_num,
            use_thumbnail=use_thumbnail,
            prior_aspect_ratio=None,
        )
        return pixel_values

    # MSAC: a coarse first pass, then a second pass whose candidate grids
    # are filtered by the first pass's aspect ratio.
    first, ratio = _preprocess_image(
        image,
        input_size=input_size,
        min_num=1,
        max_num=max_num,
        use_thumbnail=True,
        prior_aspect_ratio=None,
    )
    second, _ = _preprocess_image(
        image,
        input_size=input_size,
        min_num=3,
        max_num=max_num,
        use_thumbnail=True,
        prior_aspect_ratio=ratio,
    )

    # Combine both passes, keeping a single shared thumbnail tile
    # (the last tile of the second pass).
    return torch.cat([second[:-1], first[:-1], second[-1:]], 0)
|
||||
|
||||
|
||||
class H2OVLProcessor(BaseInternVLProcessor):
    """Processor for H2OVL, adding multi-scale adaptive cropping (MSAC)
    on top of the base InternVL preprocessing."""

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_msac: bool | None = None,
    ) -> None:
        super().__init__(
            config,
            tokenizer,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )

        # Override defaults to the config value.
        if use_msac is None:
            use_msac = config.use_msac
        assert isinstance(use_msac, bool)
        self.use_msac = use_msac

    @property
    def image_token_id(self) -> int:
        """Vocabulary ID of the <IMG_CONTEXT> placeholder token."""
        return self.tokenizer.get_vocab()[IMG_CONTEXT]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Prompt replacement text for one image."""
        context_tokens = IMG_CONTEXT * feature_size
        return PromptUpdateDetails.select_text(
            IMG_START + context_tokens + IMG_END, IMG_CONTEXT
        )

    def resolve_min_max_num(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> tuple[int, int]:
        # Fall back to processor-level defaults for unspecified values.
        if min_dynamic_patch is None:
            min_dynamic_patch = self.min_dynamic_patch
        if max_dynamic_patch is None:
            max_dynamic_patch = self.max_dynamic_patch
        if dynamic_image_size is None:
            dynamic_image_size = self.dynamic_image_size
        if use_thumbnail is None:
            use_thumbnail = self.use_thumbnail

        return resolve_h2ovl_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )

    def resolve_target_ratios(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
        prior_aspect_ratio: tuple[int, int] | None = None,
        override_min_num: int | None = None,
    ) -> list[tuple[int, int]]:
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )
        if override_min_num is not None:
            min_num = override_min_num

        return get_h2ovl_target_ratios(
            min_num,
            max_num,
            prior_aspect_ratio=prior_aspect_ratio,
        )

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
        use_msac: bool | None = None,
    ) -> int:
        """Number of placeholder tokens for an image of the given size."""
        if use_msac is None:
            use_msac = self.use_msac

        if not use_msac:
            ratios = self.resolve_target_ratios(
                use_thumbnail=False,  # Applied in calculate_targets
            )
            num_patches, _, _, _ = calculate_h2ovl_targets(
                orig_width=image_width,
                orig_height=image_height,
                image_size=self.image_size,
                target_ratios=ratios,
                use_thumbnail=self.use_thumbnail,
            )
            return num_patches * self.num_image_token

        # MSAC: the two passes share one thumbnail tile, hence the -1.
        ratios_1 = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
            override_min_num=1,
        )
        patches_1, _, _, ratio_1 = calculate_h2ovl_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=ratios_1,
            use_thumbnail=True,
        )

        ratios_2 = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
            prior_aspect_ratio=ratio_1,
            override_min_num=3,
        )
        patches_2, _, _, _ = calculate_h2ovl_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=ratios_2,
            use_thumbnail=True,
        )

        return (patches_1 + patches_2 - 1) * self.num_image_token

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        # MSAC is only applied to single-image inputs.
        use_msac = self.use_msac if len(images) == 1 else False

        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,  # Applied in image_to_pixel_values
        )

        return [
            image_to_pixel_values_h2ovl(
                image,
                input_size=self.image_size,
                min_num=min_num,
                max_num=max_num,
                use_thumbnail=self.use_thumbnail,
                use_msac=use_msac,
            )
            for image in images
        ]
|
||||
|
||||
|
||||
class H2OVLProcessingInfo(BaseInternVLProcessingInfo):
|
||||
def get_hf_processor(self, **kwargs: object) -> H2OVLProcessor:
|
||||
return self.ctx.init_processor(
|
||||
|
||||
@@ -7,16 +7,13 @@
|
||||
# Copyright (c) 2023 OpenGVLab
|
||||
# Licensed under The MIT License [see LICENSE for details]
|
||||
# --------------------------------------------------------
|
||||
from abc import ABC, abstractmethod
|
||||
from abc import abstractmethod
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from typing import Annotated, Any, Literal, TypeAlias, TypeVar
|
||||
from typing import Annotated, Literal, TypeAlias, TypeVar
|
||||
|
||||
import numpy.typing as npt
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torchvision.transforms as T
|
||||
from PIL import Image
|
||||
from transformers import BatchFeature, PretrainedConfig, TensorType
|
||||
from transformers import BatchFeature, PretrainedConfig
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions
|
||||
@@ -28,7 +25,6 @@ from vllm.model_executor.models.intern_vit import (
|
||||
)
|
||||
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.image import convert_image_mode
|
||||
from vllm.multimodal.inputs import (
|
||||
MultiModalDataDict,
|
||||
MultiModalFieldConfig,
|
||||
@@ -46,10 +42,12 @@ from vllm.multimodal.processing import (
|
||||
BaseProcessingInfo,
|
||||
PromptReplacement,
|
||||
PromptUpdate,
|
||||
PromptUpdateDetails,
|
||||
)
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
from vllm.transformers_utils.processors.internvl import (
|
||||
BaseInternVLProcessor,
|
||||
InternVLProcessor,
|
||||
)
|
||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||
|
||||
from .interfaces import (
|
||||
@@ -60,13 +58,6 @@ from .interfaces import (
|
||||
)
|
||||
from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
|
||||
|
||||
IMG_START = "<img>"
|
||||
IMG_END = "</img>"
|
||||
IMG_CONTEXT = "<IMG_CONTEXT>"
|
||||
|
||||
IMAGENET_MEAN = (0.485, 0.456, 0.406)
|
||||
IMAGENET_STD = (0.229, 0.224, 0.225)
|
||||
|
||||
|
||||
class InternVLImagePixelInputs(TensorSchema):
|
||||
"""
|
||||
@@ -128,568 +119,6 @@ class InternVLVideoEmbeddingInputs(TensorSchema):
|
||||
InternVLVideoInputs: TypeAlias = InternVLVideoPixelInputs | InternVLVideoEmbeddingInputs
|
||||
|
||||
|
||||
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def build_transform(input_size: int):
    """Build the torchvision pipeline mapping a PIL image to a normalized
    float tensor of shape (3, input_size, input_size)."""
    return T.Compose(
        [
            T.Lambda(lambda img: convert_image_mode(img, "RGB")),
            T.Resize(
                (input_size, input_size), interpolation=T.InterpolationMode.BICUBIC
            ),
            T.ToTensor(),
            T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
        ]
    )
|
||||
|
||||
|
||||
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def find_closest_aspect_ratio(
    aspect_ratio: float,
    target_ratios: list[tuple[int, int]],
    *,
    width: int,
    height: int,
    image_size: int,
) -> tuple[int, int]:
    """Return the candidate (w, h) grid whose ratio is closest to
    ``aspect_ratio``; on ties, prefer the larger grid when the image
    area is big enough to warrant it."""
    best = (1, 1)
    best_diff = float("inf")
    area = width * height

    for candidate in target_ratios:
        diff = abs(aspect_ratio - candidate[0] / candidate[1])
        if diff < best_diff:
            best_diff = diff
            best = candidate
        elif diff == best_diff:
            # Tie-break: take the bigger grid only for large images.
            if area > 0.5 * image_size * image_size * candidate[0] * candidate[1]:
                best = candidate

    return best
|
||||
|
||||
|
||||
def resolve_internvl_min_max_num(
    *,
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool,
    use_thumbnail: bool,
) -> tuple[int, int]:
    """Resolve the effective (min, max) dynamic-patch bounds.

    Disabling dynamic image size clamps both bounds to 1; a thumbnail
    tile reserves one extra slot in the upper bound when tiling is on.
    """
    if not dynamic_image_size:
        return 1, 1

    if use_thumbnail and max_dynamic_patch != 1:
        max_dynamic_patch += 1

    return min_dynamic_patch, max_dynamic_patch
|
||||
|
||||
|
||||
def get_internvl_target_ratios(
    min_num: int,
    max_num: int,
) -> list[tuple[int, int]]:
    """All (w, h) grids whose tile count w*h lies in [min_num, max_num],
    sorted by ascending tile count."""
    seen = set()
    for total in range(min_num, max_num + 1):
        for w in range(1, total + 1):
            for h in range(1, total + 1):
                if min_num <= w * h <= max_num:
                    seen.add((w, h))
    return sorted(seen, key=lambda wh: wh[0] * wh[1])
|
||||
|
||||
|
||||
def calculate_internvl_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int]:
    """Pick the best tiling grid for an image.

    Returns ``(num_blocks, target_width, target_height)``.
    """
    grid_w, grid_h = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )

    blocks = grid_w * grid_h
    # A thumbnail tile is added whenever more than one block is produced.
    if use_thumbnail and blocks != 1:
        blocks += 1

    return blocks, image_size * grid_w, image_size * grid_h
|
||||
|
||||
|
||||
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def dynamic_preprocess_internvl(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> list[Image.Image]:
    """Split an image into square tiles of side ``image_size``."""
    width, height = image.size

    # Tiling grid without the thumbnail; it is appended separately below.
    num_tiles, tiled_w, tiled_h = calculate_internvl_targets(
        orig_width=width,
        orig_height=height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )

    resized = image.resize((tiled_w, tiled_h))
    cols = tiled_w // image_size
    tiles = []
    for idx in range(num_tiles):
        col, row = idx % cols, idx // cols
        # Crop out one image_size x image_size tile.
        tiles.append(
            resized.crop(
                (
                    col * image_size,
                    row * image_size,
                    (col + 1) * image_size,
                    (row + 1) * image_size,
                )
            )
        )

    assert len(tiles) == num_tiles

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))

    return tiles
|
||||
|
||||
|
||||
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def image_to_pixel_values_internvl(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
) -> torch.Tensor:
    """Tile an image and transform the tiles into a stacked pixel tensor."""
    ratios = get_internvl_target_ratios(min_num, max_num)
    tiles = dynamic_preprocess_internvl(
        image,
        target_ratios=ratios,
        image_size=input_size,
        use_thumbnail=use_thumbnail,
    )

    transform = build_transform(input_size=input_size)
    return torch.stack([transform(tile) for tile in tiles])
|
||||
|
||||
|
||||
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def video_to_pixel_values_internvl(
    video: npt.NDArray,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
) -> torch.Tensor:
    """Convert video frames to a stacked pixel tensor, one tile per frame."""
    target_ratios = get_internvl_target_ratios(min_num, max_num)
    transform = build_transform(input_size=input_size)

    frames: list[Image.Image] = []
    for raw_frame in video:
        tiles = dynamic_preprocess_internvl(
            Image.fromarray(raw_frame, mode="RGB"),
            target_ratios=target_ratios,
            image_size=input_size,
            use_thumbnail=use_thumbnail,
        )
        # Each frame must map to exactly one tile (no dynamic tiling).
        assert len(tiles) == 1
        frames.extend(tiles)

    return torch.stack([transform(frame) for frame in frames])
|
||||
|
||||
|
||||
class BaseInternVLProcessor(ABC):
    """
    This model doesn't define its own HF processor,
    so we implement our own one here.

    The code to insert image tokens is based on:
    https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        super().__init__()

        self.config = config
        self.tokenizer = tokenizer

        image_size: int = config.vision_config.image_size
        patch_size: int = config.vision_config.patch_size

        # Unspecified overrides default to the corresponding config value.
        if min_dynamic_patch is None:
            min_dynamic_patch = config.min_dynamic_patch
        assert isinstance(min_dynamic_patch, int)

        if max_dynamic_patch is None:
            max_dynamic_patch = config.max_dynamic_patch
        assert isinstance(max_dynamic_patch, int)

        if dynamic_image_size is None:
            dynamic_image_size = config.dynamic_image_size
        assert isinstance(dynamic_image_size, bool)

        # Tokens contributed per tile after spatial downsampling.
        self.num_image_token = int(
            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
        )
        self.image_size = image_size
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch
        self.dynamic_image_size = dynamic_image_size
        self.use_thumbnail: bool = config.use_thumbnail

    @property
    @abstractmethod
    def image_token_id(self) -> int:
        """Vocabulary ID of the image-context placeholder token."""
        raise NotImplementedError

    @abstractmethod
    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Prompt replacement text for one image."""
        raise NotImplementedError

    def resolve_min_max_num(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> tuple[int, int]:
        # Fall back to processor-level defaults for unspecified values.
        if min_dynamic_patch is None:
            min_dynamic_patch = self.min_dynamic_patch
        if max_dynamic_patch is None:
            max_dynamic_patch = self.max_dynamic_patch
        if dynamic_image_size is None:
            dynamic_image_size = self.dynamic_image_size
        if use_thumbnail is None:
            use_thumbnail = self.use_thumbnail

        return resolve_internvl_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )

    def resolve_target_ratios(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> list[tuple[int, int]]:
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )
        return get_internvl_target_ratios(min_num, max_num)

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        """Number of placeholder tokens for an image of the given size."""
        target_ratios = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
        )
        num_patches, _, _ = calculate_internvl_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=target_ratios,
            use_thumbnail=self.use_thumbnail,
        )
        return num_patches * self.num_image_token

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,  # Applied in image_to_pixel_values
        )
        return [
            image_to_pixel_values_internvl(
                image,
                input_size=self.image_size,
                min_num=min_num,
                max_num=max_num,
                use_thumbnail=self.use_thumbnail,
            )
            for image in images
        ]

    def _preprocess_image(
        self,
        text: list[str],
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> tuple[list[str], dict[str, torch.Tensor]]:
        if not images:
            return text, {}

        pixel_values_lst = self._images_to_pixel_values_lst(
            images,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )
        image_inputs = {
            "pixel_values_flat": torch.cat(pixel_values_lst),
            "image_num_patches": torch.tensor(
                [len(item) for item in pixel_values_lst]
            ),
        }

        # Replace one "<image>" placeholder per image, in input order.
        for pixel_values in pixel_values_lst:
            num_patches = pixel_values.shape[0]
            feature_size = num_patches * self.num_image_token
            repl = self.get_image_repl(feature_size, num_patches)
            text = [t.replace("<image>", repl.full, 1) for t in text]

        return text, image_inputs

    def _make_batch_input(self, input_item: Any | list[Any] | None = None):
        """Normalize an optional scalar-or-list argument to a list."""
        if input_item is None:
            return []
        if isinstance(input_item, list):
            return input_item
        return [input_item]

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        return_tensors: str | TensorType | None = None,
    ) -> BatchFeature:
        text = self._make_batch_input(text)
        images = self._make_batch_input(images)

        text, image_inputs = self._preprocess_image(
            text=text,
            images=images,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )

        text_inputs = self.tokenizer(text)

        return BatchFeature(
            {**text_inputs, **image_inputs}, tensor_type=return_tensors
        )
|
||||
|
||||
|
||||
class InternVLProcessor(BaseInternVLProcessor):
|
||||
"""
|
||||
HF Processor for InternVLChatModel with extended video processing logic.
|
||||
|
||||
Code for video processing is adapted from video example:
|
||||
https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
tokenizer: TokenizerLike,
|
||||
*,
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
video_token: str | None = None,
|
||||
) -> None:
|
||||
super().__init__(
|
||||
config=config,
|
||||
tokenizer=tokenizer,
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
)
|
||||
# add extra video token for video processing
|
||||
self.video_token = video_token
|
||||
|
||||
@property
|
||||
def image_token_id(self) -> int:
|
||||
return self.tokenizer.get_vocab()[IMG_CONTEXT]
|
||||
|
||||
@property
|
||||
def video_token_id(self) -> int | None:
|
||||
if self.video_token is None:
|
||||
return None
|
||||
return self.tokenizer.get_vocab().get(self.video_token, None)
|
||||
|
||||
@property
|
||||
def supports_video(self) -> bool:
|
||||
return self.video_token_id is not None
|
||||
|
||||
def _videos_to_pixel_values_lst(
|
||||
self,
|
||||
videos: list[npt.NDArray],
|
||||
dynamic_image_size: bool | None = None,
|
||||
) -> list[torch.Tensor]:
|
||||
min_num, max_num = self.resolve_min_max_num(
|
||||
min_dynamic_patch=1,
|
||||
max_dynamic_patch=1,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
use_thumbnail=False, # Applied in image_to_pixel_values
|
||||
)
|
||||
|
||||
return [
|
||||
video_to_pixel_values_internvl(
|
||||
video,
|
||||
input_size=self.image_size,
|
||||
min_num=min_num,
|
||||
max_num=max_num,
|
||||
use_thumbnail=False,
|
||||
)
|
||||
for video in videos
|
||||
]
|
||||
|
||||
def _preprocess_video(
|
||||
self,
|
||||
text: list[str],
|
||||
videos: list[npt.NDArray],
|
||||
dynamic_image_size: bool | None = None,
|
||||
):
|
||||
if len(videos) == 0 or not self.supports_video:
|
||||
video_inputs = {}
|
||||
else:
|
||||
pixel_values_lst_video = self._videos_to_pixel_values_lst(
|
||||
videos,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
)
|
||||
video_inputs = {
|
||||
"pixel_values_flat_video": torch.cat(pixel_values_lst_video),
|
||||
"video_num_patches": torch.tensor(
|
||||
[len(item) for item in pixel_values_lst_video]
|
||||
),
|
||||
}
|
||||
|
||||
for pixel_values in pixel_values_lst_video:
|
||||
num_patches = pixel_values.shape[0]
|
||||
|
||||
video_repl = self.get_video_repl(
|
||||
self.num_image_token, num_patches, self.video_token
|
||||
)
|
||||
text = [t.replace("<video>", video_repl.full, 1) for t in text]
|
||||
return text, video_inputs
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
text: str | list[str] | None = None,
|
||||
images: Image.Image | list[Image.Image] | None = None,
|
||||
videos: npt.NDArray | list[npt.NDArray] | None = None,
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
return_tensors: str | TensorType | None = None,
|
||||
) -> BatchFeature:
|
||||
text, images, videos = [
|
||||
self._make_batch_input(x) for x in (text, images, videos)
|
||||
]
|
||||
|
||||
text, image_inputs = self._preprocess_image(
|
||||
text=text,
|
||||
images=images,
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
)
|
||||
|
||||
text, video_inputs = self._preprocess_video(
|
||||
text=text,
|
||||
videos=videos,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
)
|
||||
|
||||
text_inputs = self.tokenizer(text)
|
||||
|
||||
combined_outputs = {**text_inputs, **image_inputs, **video_inputs}
|
||||
|
||||
return BatchFeature(combined_outputs, tensor_type=return_tensors)
|
||||
|
||||
def get_image_repl(
|
||||
self,
|
||||
feature_size: int,
|
||||
num_patches: int | None,
|
||||
) -> PromptUpdateDetails[str]:
|
||||
repl_features = IMG_CONTEXT * feature_size
|
||||
repl_full = IMG_START + repl_features + IMG_END
|
||||
|
||||
return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
|
||||
|
||||
def get_video_repl(
|
||||
self,
|
||||
feature_size: int,
|
||||
num_patches: int | None = None,
|
||||
video_context_token: str = IMG_CONTEXT,
|
||||
) -> PromptUpdateDetails[str]:
|
||||
repl_features = video_context_token * self.num_image_token
|
||||
repl_features_with_sep = IMG_START + repl_features + IMG_END
|
||||
# num_patches is equal to num_frames
|
||||
repl_full = "".join(
|
||||
[f"Frame{i + 1}: {repl_features_with_sep}" for i in range(num_patches)]
|
||||
)
|
||||
|
||||
return PromptUpdateDetails.select_text(repl_full, video_context_token)
|
||||
|
||||
|
||||
class BaseInternVLProcessingInfo(BaseProcessingInfo):
|
||||
"""Basic image-only ProcessingInfo for InternVL-style models."""
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -11,18 +11,13 @@ import math
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from typing import Annotated, Literal
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from einops import rearrange
|
||||
from PIL import Image
|
||||
from timm.data.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
|
||||
from torchvision import transforms as T
|
||||
from transformers import (
|
||||
BartConfig,
|
||||
BatchFeature,
|
||||
PretrainedConfig,
|
||||
TensorType,
|
||||
)
|
||||
|
||||
from vllm.config import CacheConfig, VllmConfig
|
||||
@@ -59,13 +54,12 @@ from vllm.multimodal.processing import (
|
||||
PromptUpdate,
|
||||
)
|
||||
from vllm.renderers import TokenizeParams
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
from vllm.transformers_utils.configs.radio import RadioConfig
|
||||
from vllm.transformers_utils.processors.nemotron_parse import NemotronParseProcessor
|
||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||
from vllm.v1.attention.backend import AttentionType
|
||||
|
||||
logger = init_logger(__name__)
|
||||
DEFAULT_FINAL_IMAGE_SIZE = (2048, 1648)
|
||||
|
||||
|
||||
class BartScaledWordEmbedding(VocabParallelEmbedding):
|
||||
@@ -372,231 +366,6 @@ class NemotronParsePixelInputs(TensorSchema):
|
||||
data: Annotated[torch.Tensor, TensorShape("b", 3, "h", "w")]
|
||||
|
||||
|
||||
class NemotronParseImageProcessor:
|
||||
"""
|
||||
NemotronParse Image Processor
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
final_size: tuple = DEFAULT_FINAL_IMAGE_SIZE,
|
||||
**kwargs,
|
||||
):
|
||||
# Ensure final_size is properly formatted
|
||||
if isinstance(final_size, (list, tuple)) and len(final_size) >= 2:
|
||||
self.final_size = (int(final_size[0]), int(final_size[1]))
|
||||
elif isinstance(final_size, (int, float)):
|
||||
self.final_size = (int(final_size), int(final_size))
|
||||
else:
|
||||
self.final_size = DEFAULT_FINAL_IMAGE_SIZE # Default fallback
|
||||
|
||||
self.norm_mean = torch.Tensor(OPENAI_CLIP_MEAN).reshape(1, 3, 1, 1)
|
||||
self.norm_std = torch.Tensor(OPENAI_CLIP_STD).reshape(1, 3, 1, 1)
|
||||
|
||||
# Create transforms
|
||||
self._create_transforms()
|
||||
|
||||
def _create_transforms(self):
|
||||
"""Create transform objects."""
|
||||
try:
|
||||
import albumentations as A
|
||||
except ImportError as err:
|
||||
raise ImportError(
|
||||
"The package `albumentations` is required to use "
|
||||
"NemotronParse model. Please install it with `pip install "
|
||||
"albumentations`."
|
||||
) from err
|
||||
|
||||
# Ensure final_size is a tuple of integers
|
||||
if isinstance(self.final_size, (list, tuple)):
|
||||
self.target_height, self.target_width = (
|
||||
int(self.final_size[0]),
|
||||
int(self.final_size[1]),
|
||||
)
|
||||
else:
|
||||
self.target_height = self.target_width = int(self.final_size)
|
||||
|
||||
import cv2
|
||||
|
||||
self.transform = A.Compose(
|
||||
[
|
||||
A.PadIfNeeded(
|
||||
min_height=self.target_height,
|
||||
min_width=self.target_width,
|
||||
border_mode=cv2.BORDER_CONSTANT,
|
||||
fill=[255, 255, 255],
|
||||
p=1.0,
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
self.torch_transform = T.Compose(
|
||||
[
|
||||
T.ToTensor(),
|
||||
]
|
||||
)
|
||||
|
||||
def _resize_with_aspect_ratio(self, image: np.ndarray) -> np.ndarray:
|
||||
"""Resize image maintaining aspect ratio (exact replica of original
|
||||
LongestMaxSizeHW)."""
|
||||
height, width = image.shape[:2]
|
||||
max_size_height = self.target_height
|
||||
max_size_width = self.target_width
|
||||
|
||||
# Original LongestMaxSizeHW algorithm from custom_augmentations.py
|
||||
aspect_ratio = width / height
|
||||
new_height = height
|
||||
new_width = width
|
||||
|
||||
# If height too big then scale image down
|
||||
if height > max_size_height:
|
||||
new_height = max_size_height
|
||||
new_width = int(new_height * aspect_ratio)
|
||||
|
||||
# If width too big, scale image down further
|
||||
if new_width > max_size_width:
|
||||
new_width = max_size_width
|
||||
new_height = int(new_width / aspect_ratio)
|
||||
|
||||
# Use cv2.INTER_LINEAR like the original
|
||||
import cv2
|
||||
|
||||
return cv2.resize(
|
||||
image, (new_width, new_height), interpolation=cv2.INTER_LINEAR
|
||||
)
|
||||
|
||||
def _pad_to_size(self, image: np.ndarray) -> np.ndarray:
|
||||
"""Pad image to target size with white padding (matches A.PadIfNeeded
|
||||
behavior)."""
|
||||
h, w = image.shape[:2]
|
||||
min_height, min_width = self.target_height, self.target_width
|
||||
|
||||
# Only pad if image is smaller than target (matches A.PadIfNeeded logic)
|
||||
pad_h = max(0, min_height - h)
|
||||
pad_w = max(0, min_width - w)
|
||||
|
||||
if pad_h == 0 and pad_w == 0:
|
||||
return image
|
||||
|
||||
# A.PadIfNeeded pads to bottom-right with constant value
|
||||
if len(image.shape) == 3:
|
||||
# Color image - pad bottom and right with white (255, 255, 255)
|
||||
padded = np.pad(
|
||||
image,
|
||||
((0, pad_h), (0, pad_w), (0, 0)),
|
||||
mode="constant",
|
||||
constant_values=255,
|
||||
)
|
||||
else:
|
||||
# Grayscale image - pad with white (255)
|
||||
padded = np.pad(
|
||||
image, ((0, pad_h), (0, pad_w)), mode="constant", constant_values=255
|
||||
)
|
||||
|
||||
return padded
|
||||
|
||||
def preprocess(
|
||||
self,
|
||||
images: Image.Image | list[Image.Image],
|
||||
**kwargs,
|
||||
) -> dict[str, torch.Tensor]:
|
||||
"""
|
||||
Preprocess an image or batch of images for the NemotronParse model.
|
||||
|
||||
Args:
|
||||
images: Input image(s)
|
||||
"""
|
||||
# Ensure images is a list
|
||||
if not isinstance(images, list):
|
||||
images = [images]
|
||||
|
||||
# Convert PIL images to numpy arrays if needed
|
||||
processed_images = []
|
||||
for image in images:
|
||||
if isinstance(image, Image.Image):
|
||||
image = np.asarray(image)
|
||||
processed_images.append(image)
|
||||
|
||||
# Apply NemotronParse-specific transforms
|
||||
pixel_values = []
|
||||
for image in processed_images:
|
||||
# Manual resize with aspect ratio preservation
|
||||
# (replaces LongestMaxSizeHW)
|
||||
processed_image = self._resize_with_aspect_ratio(image)
|
||||
|
||||
# Apply remaining albumentations transforms if available
|
||||
if self.transform is not None:
|
||||
transformed = self.transform(image=processed_image)
|
||||
processed_image = transformed["image"]
|
||||
else:
|
||||
# Fallback: just pad to target size
|
||||
processed_image = self._pad_to_size(processed_image)
|
||||
|
||||
# Convert to tensor
|
||||
pixel_values_tensor = self.torch_transform(processed_image)
|
||||
|
||||
# Handle grayscale images
|
||||
if pixel_values_tensor.shape[0] == 1:
|
||||
pixel_values_tensor = pixel_values_tensor.expand(3, -1, -1)
|
||||
|
||||
pixel_values.append(pixel_values_tensor)
|
||||
|
||||
# Stack into batch
|
||||
pixel_values = torch.stack(pixel_values)
|
||||
|
||||
# Normalize pixel values
|
||||
normalized_values = (pixel_values - self.norm_mean) / self.norm_std
|
||||
return {"pixel_values": normalized_values}
|
||||
|
||||
def __call__(
|
||||
self, images: Image.Image | list[Image.Image], **kwargs
|
||||
) -> dict[str, torch.Tensor]:
|
||||
return self.preprocess(images, **kwargs)
|
||||
|
||||
|
||||
class NemotronParseProcessor:
|
||||
"""
|
||||
NemotronParse Processor
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
tokenizer: TokenizerLike,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.config = config
|
||||
self.tokenizer = tokenizer
|
||||
|
||||
self.image_processor = NemotronParseImageProcessor(final_size=config.image_size)
|
||||
|
||||
def _make_batch_input(self, input_item=None):
|
||||
if input_item is None:
|
||||
input_item = []
|
||||
if not isinstance(input_item, list):
|
||||
input_item = [input_item]
|
||||
return input_item
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
text: str | None = None,
|
||||
images: Image.Image | list[Image.Image] | None = None,
|
||||
return_tensors: str | TensorType | None = None,
|
||||
**kwargs,
|
||||
) -> BatchFeature:
|
||||
text, images = [self._make_batch_input(x) for x in (text, images)]
|
||||
image_inputs = {} if len(images) == 0 else self.image_processor(images)
|
||||
|
||||
text_inputs = self.tokenizer(text, add_special_tokens=False, **kwargs)
|
||||
combined_outputs = BatchFeature(
|
||||
data={**text_inputs, **image_inputs},
|
||||
tensor_type=return_tensors,
|
||||
)
|
||||
return combined_outputs
|
||||
|
||||
|
||||
class NemotronParseProcessingInfo(BaseProcessingInfo):
|
||||
def get_hf_config(self):
|
||||
return self.ctx.get_hf_config()
|
||||
|
||||
@@ -1,22 +1,11 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_internvl_chat.py
|
||||
# --------------------------------------------------------
|
||||
# InternVL
|
||||
# Copyright (c) 2023 OpenGVLab
|
||||
# Licensed under The MIT License [see LICENSE for details]
|
||||
# --------------------------------------------------------
|
||||
import math
|
||||
from abc import ABC
|
||||
from collections.abc import Iterable
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torchvision.transforms as T
|
||||
from PIL import Image
|
||||
from transformers import AutoModel, PretrainedConfig
|
||||
from transformers.image_processing_utils_fast import BaseImageProcessorFast
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.model_executor.layers.linear import ReplicatedLinear
|
||||
@@ -30,16 +19,16 @@ from vllm.model_executor.models.internvl import (
|
||||
InternVLImageEmbeddingInputs,
|
||||
InternVLImageInputs,
|
||||
InternVLImagePixelInputs,
|
||||
InternVLProcessor,
|
||||
)
|
||||
from vllm.model_executor.models.module_mapping import MultiModelKeys
|
||||
from vllm.model_executor.models.siglip import SiglipVisionModel
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.image import convert_image_mode
|
||||
from vllm.multimodal.processing import PromptUpdateDetails
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
from vllm.transformers_utils.processor import cached_image_processor_from_config
|
||||
from vllm.transformers_utils.processors.nemotron_vl import (
|
||||
LlamaNemotronVLEmbedProcessor,
|
||||
NemotronVLProcessor,
|
||||
)
|
||||
from vllm.transformers_utils.repo_utils import get_hf_file_to_dict
|
||||
|
||||
from .interfaces import (
|
||||
@@ -58,310 +47,6 @@ from .utils import (
|
||||
)
|
||||
|
||||
|
||||
def build_transform(input_size: int):
|
||||
return T.Compose(
|
||||
[
|
||||
T.Lambda(lambda img: convert_image_mode(img, "RGB")),
|
||||
T.Resize(
|
||||
(input_size, input_size), interpolation=T.InterpolationMode.BICUBIC
|
||||
),
|
||||
T.ToTensor(),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
# adapted from https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1
|
||||
def find_closest_aspect_ratio(
|
||||
aspect_ratio: float,
|
||||
target_ratios: list[tuple[int, int]],
|
||||
*,
|
||||
width: int,
|
||||
height: int,
|
||||
image_size: int,
|
||||
) -> tuple[int, int]:
|
||||
best_factor = float("-inf")
|
||||
best_ratio = (1, 1)
|
||||
area = width * height
|
||||
|
||||
for rw, rh in target_ratios:
|
||||
target_aspect_ratio = rw / rh
|
||||
size_factor = min((rw * rh * image_size * image_size) / area, 0.6)
|
||||
ratio_closeness = min(
|
||||
target_aspect_ratio / aspect_ratio, aspect_ratio / target_aspect_ratio
|
||||
)
|
||||
factor = size_factor * ratio_closeness
|
||||
|
||||
if factor > best_factor:
|
||||
best_factor = factor
|
||||
best_ratio = (rw, rh)
|
||||
|
||||
return best_ratio
|
||||
|
||||
|
||||
def calculate_nemotron_vl_targets(
|
||||
*,
|
||||
orig_width: int,
|
||||
orig_height: int,
|
||||
target_ratios: list[tuple[int, int]],
|
||||
image_size: int,
|
||||
use_thumbnail: bool,
|
||||
) -> tuple[int, int, int]:
|
||||
aspect_ratio = orig_width / orig_height
|
||||
|
||||
# find the closest aspect ratio to the target
|
||||
target_aspect_ratio = find_closest_aspect_ratio(
|
||||
aspect_ratio,
|
||||
target_ratios,
|
||||
width=orig_width,
|
||||
height=orig_height,
|
||||
image_size=image_size,
|
||||
)
|
||||
|
||||
# calculate the target width and height
|
||||
target_width = image_size * target_aspect_ratio[0]
|
||||
target_height = image_size * target_aspect_ratio[1]
|
||||
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
|
||||
|
||||
# add thumbnail image if num_blocks != 1
|
||||
if use_thumbnail and blocks != 1:
|
||||
blocks += 1
|
||||
|
||||
return blocks, target_width, target_height
|
||||
|
||||
|
||||
def dynamic_preprocess_nemotron_vl(
|
||||
image: Image.Image,
|
||||
*,
|
||||
target_ratios: list[tuple[int, int]],
|
||||
image_size: int,
|
||||
use_thumbnail: bool,
|
||||
) -> list[Image.Image]:
|
||||
orig_width, orig_height = image.size
|
||||
|
||||
# calculate the number of blocks without thumbnail
|
||||
blocks, target_width, target_height = calculate_nemotron_vl_targets(
|
||||
orig_width=orig_width,
|
||||
orig_height=orig_height,
|
||||
target_ratios=target_ratios,
|
||||
image_size=image_size,
|
||||
use_thumbnail=False,
|
||||
)
|
||||
|
||||
# resize the image
|
||||
resized_img = image.resize((target_width, target_height))
|
||||
processed_images = []
|
||||
for i in range(blocks):
|
||||
box = (
|
||||
(i % (target_width // image_size)) * image_size,
|
||||
(i // (target_width // image_size)) * image_size,
|
||||
((i % (target_width // image_size)) + 1) * image_size,
|
||||
((i // (target_width // image_size)) + 1) * image_size,
|
||||
)
|
||||
# split the image
|
||||
split_img = resized_img.crop(box)
|
||||
processed_images.append(split_img)
|
||||
|
||||
assert len(processed_images) == blocks
|
||||
|
||||
if use_thumbnail and len(processed_images) != 1:
|
||||
thumbnail_img = image.resize((image_size, image_size))
|
||||
processed_images.append(thumbnail_img)
|
||||
|
||||
return processed_images
|
||||
|
||||
|
||||
def get_nemotron_vl_target_ratios(
|
||||
min_num: int,
|
||||
max_num: int,
|
||||
) -> list[tuple[int, int]]:
|
||||
target_ratios = {
|
||||
(i, j)
|
||||
for n in range(min_num, max_num + 1)
|
||||
for i in range(1, n + 1)
|
||||
for j in range(1, n + 1)
|
||||
if min_num <= i * j <= max_num
|
||||
}
|
||||
return sorted(target_ratios, key=lambda x: x[0] * x[1])
|
||||
|
||||
|
||||
def image_to_pixel_values_nemotron_vl(
|
||||
image: Image.Image,
|
||||
*,
|
||||
input_size: int,
|
||||
min_num: int,
|
||||
max_num: int,
|
||||
use_thumbnail: bool,
|
||||
transform: T.Compose | None = None,
|
||||
) -> torch.Tensor:
|
||||
target_ratios = get_nemotron_vl_target_ratios(min_num, max_num)
|
||||
|
||||
if transform is None:
|
||||
transform = build_transform(input_size=input_size)
|
||||
|
||||
images = dynamic_preprocess_nemotron_vl(
|
||||
image,
|
||||
target_ratios=target_ratios,
|
||||
image_size=input_size,
|
||||
use_thumbnail=use_thumbnail,
|
||||
)
|
||||
|
||||
pixel_values = torch.stack([transform(image) for image in images])
|
||||
return pixel_values
|
||||
|
||||
|
||||
class NemotronVLProcessor(InternVLProcessor):
|
||||
IMG_START = "<img>"
|
||||
IMG_END = "</img>"
|
||||
IMG_CONTEXT = "<image>"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
tokenizer: TokenizerLike,
|
||||
image_processor: BaseImageProcessorFast | None = None,
|
||||
*,
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
) -> None:
|
||||
ABC.__init__(self)
|
||||
self.config = config
|
||||
self.tokenizer = tokenizer
|
||||
self.image_processor = image_processor
|
||||
image_size: int = config.force_image_size
|
||||
patch_size: int = config.patch_size
|
||||
|
||||
if min_dynamic_patch is None:
|
||||
min_dynamic_patch = 1
|
||||
assert isinstance(min_dynamic_patch, int)
|
||||
|
||||
if max_dynamic_patch is None:
|
||||
max_dynamic_patch = self.image_processor.max_num_tiles
|
||||
assert isinstance(max_dynamic_patch, int)
|
||||
|
||||
if dynamic_image_size is None:
|
||||
dynamic_image_size = True
|
||||
assert isinstance(dynamic_image_size, bool)
|
||||
|
||||
self.num_image_token = int(
|
||||
(image_size // patch_size) ** 2 * (config.downsample_ratio**2)
|
||||
)
|
||||
self.image_size = image_size
|
||||
self.min_dynamic_patch = min_dynamic_patch
|
||||
self.max_dynamic_patch = max_dynamic_patch
|
||||
self.dynamic_image_size = dynamic_image_size
|
||||
|
||||
if image_processor is not None:
|
||||
self.use_thumbnail = image_processor.use_thumbnail
|
||||
else:
|
||||
self.use_thumbnail = getattr(config, "use_thumbnail", True)
|
||||
|
||||
@property
|
||||
def image_token_id(self) -> int:
|
||||
return self.tokenizer.get_vocab()[self.IMG_CONTEXT]
|
||||
|
||||
def _get_transform(self) -> T.Compose:
|
||||
return build_transform(input_size=self.image_size)
|
||||
|
||||
def get_num_image_tokens(
|
||||
self,
|
||||
*,
|
||||
image_width: int,
|
||||
image_height: int,
|
||||
) -> int:
|
||||
target_ratios = self.resolve_target_ratios(
|
||||
use_thumbnail=False, # Applied in calculate_targets
|
||||
)
|
||||
|
||||
num_patches, _, _ = calculate_nemotron_vl_targets(
|
||||
orig_width=image_width,
|
||||
orig_height=image_height,
|
||||
image_size=self.image_size,
|
||||
target_ratios=target_ratios,
|
||||
use_thumbnail=self.use_thumbnail,
|
||||
)
|
||||
|
||||
return num_patches * self.num_image_token
|
||||
|
||||
def _images_to_pixel_values_lst(
|
||||
self,
|
||||
images: list[Image.Image],
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
) -> list[torch.Tensor]:
|
||||
min_num, max_num = self.resolve_min_max_num(
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
use_thumbnail=False, # Applied in image_to_pixel_values
|
||||
)
|
||||
|
||||
return [
|
||||
image_to_pixel_values_nemotron_vl(
|
||||
image,
|
||||
input_size=self.image_size,
|
||||
min_num=min_num,
|
||||
max_num=max_num,
|
||||
use_thumbnail=self.use_thumbnail,
|
||||
transform=self._get_transform(),
|
||||
)
|
||||
for image in images
|
||||
]
|
||||
|
||||
def _replace_image_tokens(
|
||||
self,
|
||||
text: list[str],
|
||||
pixel_values_lst: list[torch.Tensor],
|
||||
) -> list[str]:
|
||||
"""Replace <image> placeholders with image tokens."""
|
||||
for pixel_values in pixel_values_lst:
|
||||
num_patches = pixel_values.shape[0]
|
||||
feature_size = num_patches * self.num_image_token
|
||||
image_repl = self.get_image_repl(feature_size, num_patches)
|
||||
# Use temporary placeholder to avoid replacing tokens we just inserted
|
||||
NVL_IMAGE_CONTEXT = image_repl.full.replace("<image>", "<NVL_IMG_CONTEXT>")
|
||||
text = [t.replace("<image>", NVL_IMAGE_CONTEXT, 1) for t in text]
|
||||
return [t.replace("<NVL_IMG_CONTEXT>", self.IMG_CONTEXT) for t in text]
|
||||
|
||||
def _preprocess_image(
|
||||
self,
|
||||
text: list[str],
|
||||
images: list[Image.Image],
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
) -> tuple[list[str], dict[str, torch.Tensor]]:
|
||||
if len(images) == 0:
|
||||
image_inputs = {}
|
||||
else:
|
||||
pixel_values_lst = self._images_to_pixel_values_lst(
|
||||
images,
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
)
|
||||
image_inputs = {
|
||||
"pixel_values_flat": torch.cat(pixel_values_lst),
|
||||
"image_num_patches": torch.tensor(
|
||||
[len(item) for item in pixel_values_lst]
|
||||
),
|
||||
}
|
||||
|
||||
text = self._replace_image_tokens(text, pixel_values_lst)
|
||||
return text, image_inputs
|
||||
|
||||
def get_image_repl(
|
||||
self,
|
||||
feature_size: int,
|
||||
num_patches: int | None,
|
||||
) -> PromptUpdateDetails[str]:
|
||||
repl_features = self.IMG_CONTEXT * feature_size
|
||||
repl_full = self.IMG_START + repl_features + self.IMG_END
|
||||
|
||||
return PromptUpdateDetails.select_text(repl_full, self.IMG_CONTEXT)
|
||||
|
||||
|
||||
class NemotronVLProcessingInfo(BaseInternVLProcessingInfo):
|
||||
"""Processing info for Nemotron VL models."""
|
||||
|
||||
@@ -700,91 +385,6 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, Suppor
|
||||
# - Pooler output instead of generative logits
|
||||
# --------------------------------------------------------
|
||||
|
||||
# SigLIP normalization constants
|
||||
SIGLIP_MEAN = (0.5, 0.5, 0.5)
|
||||
SIGLIP_STD = (0.5, 0.5, 0.5)
|
||||
|
||||
|
||||
def build_siglip_transform(input_size: int):
|
||||
"""Build transform for SigLIP vision encoder with normalization.
|
||||
|
||||
Extends the base transform from nemotron_vl with SigLIP-specific normalization.
|
||||
"""
|
||||
base_transform = build_transform(input_size=input_size)
|
||||
return T.Compose(
|
||||
[
|
||||
base_transform,
|
||||
T.Normalize(mean=SIGLIP_MEAN, std=SIGLIP_STD),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
class LlamaNemotronVLEmbedProcessor(NemotronVLProcessor):
|
||||
"""
|
||||
Processor for LlamaNemotronVL embedding model.
|
||||
|
||||
Inherits from NemotronVLProcessor and specializes it for embedding tasks:
|
||||
- Uses SigLIP transform with normalization instead of base transform
|
||||
- Uses different image context token (<IMG_CONTEXT> vs <image>)
|
||||
"""
|
||||
|
||||
IMG_CONTEXT = "<IMG_CONTEXT>"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
tokenizer: TokenizerLike,
|
||||
processor_config: dict,
|
||||
*,
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
) -> None:
|
||||
if min_dynamic_patch is None:
|
||||
min_dynamic_patch = processor_config.get(
|
||||
"min_input_tiles",
|
||||
getattr(config, "min_dynamic_patch", 1),
|
||||
)
|
||||
if max_dynamic_patch is None:
|
||||
max_dynamic_patch = processor_config.get(
|
||||
"max_input_tiles",
|
||||
getattr(config, "max_dynamic_patch", 1),
|
||||
)
|
||||
if dynamic_image_size is None:
|
||||
dynamic_image_size = processor_config.get(
|
||||
"dynamic_image_size",
|
||||
getattr(config, "dynamic_image_size", True),
|
||||
)
|
||||
super().__init__(
|
||||
config=config,
|
||||
tokenizer=tokenizer,
|
||||
image_processor=None,
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
)
|
||||
|
||||
def _get_transform(self) -> T.Compose:
|
||||
"""Override to add SigLIP normalization."""
|
||||
return build_siglip_transform(input_size=self.image_size)
|
||||
|
||||
def _replace_image_tokens(
|
||||
self,
|
||||
text: list[str],
|
||||
pixel_values_lst: list[torch.Tensor],
|
||||
) -> list[str]:
|
||||
"""Override with simpler token replacement for embedding model.
|
||||
|
||||
No temporary placeholder needed because IMG_CONTEXT is <IMG_CONTEXT>,
|
||||
not <image>, so there's no collision risk.
|
||||
"""
|
||||
for pixel_values in pixel_values_lst:
|
||||
num_patches = pixel_values.shape[0]
|
||||
feature_size = num_patches * self.num_image_token
|
||||
image_repl = self.get_image_repl(feature_size, num_patches)
|
||||
text = [t.replace("<image>", image_repl.full, 1) for t in text]
|
||||
return text
|
||||
|
||||
|
||||
class LlamaNemotronVLEmbedProcessingInfo(NemotronVLProcessingInfo):
|
||||
"""Processing info for LlamaNemotronVL embedding model."""
|
||||
|
||||
@@ -27,48 +27,16 @@ from vllm.multimodal.processing import (
|
||||
PromptUpdate,
|
||||
PromptUpdateDetails,
|
||||
)
|
||||
from vllm.transformers_utils.processors.nvlm_d import IMG_PAD, NVLMProcessor
|
||||
|
||||
from .intern_vit import InternVisionModel
|
||||
from .internvl import (
|
||||
BaseInternVLDummyInputsBuilder,
|
||||
BaseInternVLMultiModalProcessor,
|
||||
BaseInternVLProcessingInfo,
|
||||
BaseInternVLProcessor,
|
||||
InternVLChatModel,
|
||||
)
|
||||
|
||||
IMG_PAD = "<|vision_pad|>"
|
||||
|
||||
|
||||
class NVLMProcessor(BaseInternVLProcessor):
|
||||
@property
|
||||
def image_token_id(self) -> int:
|
||||
return self.tokenizer.get_vocab()[IMG_PAD]
|
||||
|
||||
def get_image_repl(
|
||||
self,
|
||||
feature_size: int,
|
||||
num_patches: int | None,
|
||||
) -> PromptUpdateDetails[str]:
|
||||
if num_patches is None:
|
||||
raise NotImplementedError("Embedding inputs are not supported")
|
||||
|
||||
tile_pos_identifiers = [f"<tile_{i}>" for i in range(1, num_patches)]
|
||||
if self.use_thumbnail:
|
||||
tile_pos_identifiers += ["<tile_global_thumbnail>"]
|
||||
|
||||
context_size = feature_size // num_patches
|
||||
features = "".join(
|
||||
identifier + IMG_PAD * context_size for identifier in tile_pos_identifiers
|
||||
)
|
||||
|
||||
# We include the start and end as well because "<Image><tile" is
|
||||
# tokenized as ["<Image", "><", "tile"], resulting in assertion error
|
||||
# when trying to find "<tile" as a subsequence of "<Image><tile"
|
||||
repl = "<Image>" + features + "</Image>"
|
||||
|
||||
return PromptUpdateDetails.select_text(repl, IMG_PAD)
|
||||
|
||||
|
||||
class NVLMProcessingInfo(BaseInternVLProcessingInfo):
|
||||
def get_hf_processor(self, **kwargs: object) -> NVLMProcessor:
|
||||
|
||||
@@ -12,9 +12,7 @@ from typing import Annotated, Literal, TypeAlias
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torchvision.transforms as T
|
||||
from PIL import Image
|
||||
from transformers import BatchFeature, PretrainedConfig, TensorType
|
||||
from transformers import BatchFeature, PretrainedConfig
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions
|
||||
@@ -26,7 +24,6 @@ from vllm.model_executor.models.intern_vit import (
|
||||
InternVisionPatchModel,
|
||||
)
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.image import convert_image_mode
|
||||
from vllm.multimodal.inputs import (
|
||||
MultiModalDataDict,
|
||||
MultiModalFieldConfig,
|
||||
@@ -44,22 +41,14 @@ from vllm.multimodal.processing import (
|
||||
BaseProcessingInfo,
|
||||
PromptReplacement,
|
||||
PromptUpdate,
|
||||
PromptUpdateDetails,
|
||||
)
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
from vllm.transformers_utils.processors.skyworkr1v import SkyworkR1VProcessor
|
||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||
|
||||
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
|
||||
from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
|
||||
|
||||
IMG_START = "<img>"
|
||||
IMG_END = "</img>"
|
||||
IMG_CONTEXT = "<IMG_CONTEXT>"
|
||||
|
||||
IMAGENET_MEAN = (0.485, 0.456, 0.406)
|
||||
IMAGENET_STD = (0.229, 0.224, 0.225)
|
||||
|
||||
|
||||
class SkyworkR1VImagePixelInputs(TensorSchema):
|
||||
"""
|
||||
@@ -106,370 +95,6 @@ SkyworkR1VImageInputs: TypeAlias = (
|
||||
)
|
||||
|
||||
|
||||
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
|
||||
def build_transform(input_size: int):
|
||||
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
|
||||
return T.Compose(
|
||||
[
|
||||
T.Lambda(lambda img: convert_image_mode(img, "RGB")),
|
||||
T.Resize(
|
||||
(input_size, input_size), interpolation=T.InterpolationMode.BICUBIC
|
||||
),
|
||||
T.ToTensor(),
|
||||
T.Normalize(mean=MEAN, std=STD),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
|
||||
def find_closest_aspect_ratio(
|
||||
aspect_ratio: float,
|
||||
target_ratios: list[tuple[int, int]],
|
||||
*,
|
||||
width: int,
|
||||
height: int,
|
||||
image_size: int,
|
||||
) -> tuple[int, int]:
|
||||
best_ratio_diff = float("inf")
|
||||
best_ratio = (1, 1)
|
||||
area = width * height
|
||||
for ratio in target_ratios:
|
||||
target_aspect_ratio = ratio[0] / ratio[1]
|
||||
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
|
||||
if ratio_diff < best_ratio_diff:
|
||||
best_ratio_diff = ratio_diff
|
||||
best_ratio = ratio
|
||||
elif ratio_diff == best_ratio_diff:
|
||||
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
|
||||
best_ratio = ratio
|
||||
return best_ratio
|
||||
|
||||
|
||||
def resolve_skyworkr1v_min_max_num(
|
||||
*,
|
||||
min_dynamic_patch: int,
|
||||
max_dynamic_patch: int,
|
||||
dynamic_image_size: bool,
|
||||
use_thumbnail: bool,
|
||||
) -> tuple[int, int]:
|
||||
min_dynamic_patch = min_dynamic_patch if dynamic_image_size else 1
|
||||
max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
|
||||
|
||||
if use_thumbnail and max_dynamic_patch != 1:
|
||||
max_dynamic_patch += 1
|
||||
|
||||
return min_dynamic_patch, max_dynamic_patch
|
||||
|
||||
|
||||
def get_skyworkr1v_target_ratios(
|
||||
min_num: int,
|
||||
max_num: int,
|
||||
) -> list[tuple[int, int]]:
|
||||
target_ratios = {
|
||||
(i, j)
|
||||
for n in range(min_num, max_num + 1)
|
||||
for i in range(1, n + 1)
|
||||
for j in range(1, n + 1)
|
||||
if min_num <= i * j <= max_num
|
||||
}
|
||||
return sorted(target_ratios, key=lambda x: x[0] * x[1])
|
||||
|
||||
|
||||
def calculate_skyworkr1v_targets(
|
||||
*,
|
||||
orig_width: int,
|
||||
orig_height: int,
|
||||
target_ratios: list[tuple[int, int]],
|
||||
image_size: int,
|
||||
use_thumbnail: bool,
|
||||
) -> tuple[int, int, int]:
|
||||
aspect_ratio = orig_width / orig_height
|
||||
|
||||
# find the closest aspect ratio to the target
|
||||
target_aspect_ratio = find_closest_aspect_ratio(
|
||||
aspect_ratio,
|
||||
target_ratios,
|
||||
width=orig_width,
|
||||
height=orig_height,
|
||||
image_size=image_size,
|
||||
)
|
||||
|
||||
# calculate the target width and height
|
||||
target_width = image_size * target_aspect_ratio[0]
|
||||
target_height = image_size * target_aspect_ratio[1]
|
||||
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
|
||||
|
||||
# add thumbnail image if num_blocks != 1
|
||||
if use_thumbnail and blocks != 1:
|
||||
blocks += 1
|
||||
|
||||
return blocks, target_width, target_height
|
||||
|
||||
|
||||
def dynamic_preprocess_skyworkr1v(
|
||||
image: Image.Image,
|
||||
*,
|
||||
target_ratios: list[tuple[int, int]],
|
||||
image_size: int,
|
||||
use_thumbnail: bool,
|
||||
) -> list[Image.Image]:
|
||||
orig_width, orig_height = image.size
|
||||
|
||||
# calculate the number of blocks without thumbnail
|
||||
blocks, target_width, target_height = calculate_skyworkr1v_targets(
|
||||
orig_width=orig_width,
|
||||
orig_height=orig_height,
|
||||
target_ratios=target_ratios,
|
||||
image_size=image_size,
|
||||
use_thumbnail=False,
|
||||
)
|
||||
|
||||
# resize the image
|
||||
resized_img = image.resize((target_width, target_height))
|
||||
processed_images = []
|
||||
for i in range(blocks):
|
||||
box = (
|
||||
(i % (target_width // image_size)) * image_size,
|
||||
(i // (target_width // image_size)) * image_size,
|
||||
((i % (target_width // image_size)) + 1) * image_size,
|
||||
((i // (target_width // image_size)) + 1) * image_size,
|
||||
)
|
||||
# split the image
|
||||
split_img = resized_img.crop(box)
|
||||
processed_images.append(split_img)
|
||||
|
||||
assert len(processed_images) == blocks
|
||||
|
||||
if use_thumbnail and len(processed_images) != 1:
|
||||
thumbnail_img = image.resize((image_size, image_size))
|
||||
processed_images.append(thumbnail_img)
|
||||
|
||||
return processed_images
|
||||
|
||||
|
||||
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B
|
||||
def image_to_pixel_values_skyworkr1v(
|
||||
image: Image.Image,
|
||||
*,
|
||||
input_size: int,
|
||||
min_num: int,
|
||||
max_num: int,
|
||||
use_thumbnail: bool,
|
||||
) -> torch.Tensor:
|
||||
target_ratios = get_skyworkr1v_target_ratios(min_num, max_num)
|
||||
|
||||
transform = build_transform(input_size=input_size)
|
||||
images = dynamic_preprocess_skyworkr1v(
|
||||
image,
|
||||
target_ratios=target_ratios,
|
||||
image_size=input_size,
|
||||
use_thumbnail=use_thumbnail,
|
||||
)
|
||||
|
||||
pixel_values = torch.stack([transform(image) for image in images])
|
||||
return pixel_values
|
||||
|
||||
|
||||
class SkyworkR1VProcessor:
|
||||
"""
|
||||
This model doesn't define its own HF processor,
|
||||
so we implement our own one here.
|
||||
|
||||
The code to insert image tokens is based on:
|
||||
https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py#L252
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
tokenizer: TokenizerLike,
|
||||
*,
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.config = config
|
||||
self.tokenizer = tokenizer
|
||||
|
||||
image_size: int = config.vision_config.image_size
|
||||
patch_size: int = config.vision_config.patch_size
|
||||
|
||||
if min_dynamic_patch is None:
|
||||
min_dynamic_patch = config.min_dynamic_patch
|
||||
assert isinstance(min_dynamic_patch, int)
|
||||
|
||||
if max_dynamic_patch is None:
|
||||
max_dynamic_patch = config.max_dynamic_patch
|
||||
assert isinstance(max_dynamic_patch, int)
|
||||
|
||||
if dynamic_image_size is None:
|
||||
dynamic_image_size = config.dynamic_image_size
|
||||
assert isinstance(dynamic_image_size, bool)
|
||||
|
||||
self.num_image_token = int(
|
||||
(image_size // patch_size) ** 2 * (config.downsample_ratio**2)
|
||||
)
|
||||
self.image_size = image_size
|
||||
self.min_dynamic_patch = min_dynamic_patch
|
||||
self.max_dynamic_patch = max_dynamic_patch
|
||||
self.dynamic_image_size = dynamic_image_size
|
||||
self.use_thumbnail: bool = config.use_thumbnail
|
||||
|
||||
@property
|
||||
def image_token_id(self) -> int:
|
||||
return self.tokenizer.get_vocab()[IMG_CONTEXT]
|
||||
|
||||
def get_image_repl(
|
||||
self,
|
||||
feature_size: int,
|
||||
num_patches: int | None,
|
||||
) -> PromptUpdateDetails[str]:
|
||||
repl_features = IMG_CONTEXT * feature_size
|
||||
repl_full = IMG_START + repl_features + IMG_END
|
||||
|
||||
return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
|
||||
|
||||
def resolve_min_max_num(
|
||||
self,
|
||||
*,
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
use_thumbnail: bool | None = None,
|
||||
) -> tuple[int, int]:
|
||||
min_dynamic_patch = (
|
||||
self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
|
||||
)
|
||||
max_dynamic_patch = (
|
||||
self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
|
||||
)
|
||||
dynamic_image_size = (
|
||||
self.dynamic_image_size
|
||||
if dynamic_image_size is None
|
||||
else dynamic_image_size
|
||||
)
|
||||
use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
|
||||
|
||||
return resolve_skyworkr1v_min_max_num(
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
use_thumbnail=use_thumbnail,
|
||||
)
|
||||
|
||||
def resolve_target_ratios(
|
||||
self,
|
||||
*,
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
use_thumbnail: bool | None = None,
|
||||
) -> list[tuple[int, int]]:
|
||||
min_num, max_num = self.resolve_min_max_num(
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
use_thumbnail=use_thumbnail,
|
||||
)
|
||||
|
||||
return get_skyworkr1v_target_ratios(min_num, max_num)
|
||||
|
||||
def get_num_image_tokens(
|
||||
self,
|
||||
*,
|
||||
image_width: int,
|
||||
image_height: int,
|
||||
) -> int:
|
||||
target_ratios = self.resolve_target_ratios(
|
||||
use_thumbnail=False, # Applied in calculate_targets
|
||||
)
|
||||
|
||||
num_patches, _, _ = calculate_skyworkr1v_targets(
|
||||
orig_width=image_width,
|
||||
orig_height=image_height,
|
||||
image_size=self.image_size,
|
||||
target_ratios=target_ratios,
|
||||
use_thumbnail=self.use_thumbnail,
|
||||
)
|
||||
|
||||
return num_patches * self.num_image_token
|
||||
|
||||
def _images_to_pixel_values_lst(
|
||||
self,
|
||||
images: list[Image.Image],
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
) -> list[torch.Tensor]:
|
||||
min_num, max_num = self.resolve_min_max_num(
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
use_thumbnail=False, # Applied in image_to_pixel_values
|
||||
)
|
||||
|
||||
return [
|
||||
image_to_pixel_values_skyworkr1v(
|
||||
image,
|
||||
input_size=self.image_size,
|
||||
min_num=min_num,
|
||||
max_num=max_num,
|
||||
use_thumbnail=self.use_thumbnail,
|
||||
)
|
||||
for image in images
|
||||
]
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
text: str | list[str] | None = None,
|
||||
images: Image.Image | list[Image.Image] | None = None,
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
return_tensors: str | TensorType | None = None,
|
||||
) -> BatchFeature:
|
||||
if text is None:
|
||||
text = []
|
||||
if not isinstance(text, list):
|
||||
text = [text]
|
||||
if images is None:
|
||||
images = []
|
||||
if not isinstance(images, list):
|
||||
images = [images]
|
||||
|
||||
if len(images) == 0:
|
||||
image_inputs = {}
|
||||
else:
|
||||
pixel_values_lst = self._images_to_pixel_values_lst(
|
||||
images,
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
)
|
||||
image_inputs = {
|
||||
"pixel_values_flat": torch.cat(pixel_values_lst),
|
||||
"image_num_patches": torch.tensor(
|
||||
[len(item) for item in pixel_values_lst]
|
||||
),
|
||||
}
|
||||
|
||||
for pixel_values in pixel_values_lst:
|
||||
num_patches = pixel_values.shape[0]
|
||||
feature_size = num_patches * self.num_image_token
|
||||
|
||||
image_repl = self.get_image_repl(feature_size, num_patches)
|
||||
|
||||
text = [t.replace("<image>", image_repl.full, 1) for t in text]
|
||||
|
||||
text_inputs = self.tokenizer(text)
|
||||
|
||||
combined_outputs = {**text_inputs, **image_inputs}
|
||||
|
||||
return BatchFeature(combined_outputs, tensor_type=return_tensors)
|
||||
|
||||
|
||||
class SkyworkR1VProcessingInfo(BaseProcessingInfo):
|
||||
def get_hf_processor(self, **kwargs: object) -> SkyworkR1VProcessor:
|
||||
return self.ctx.init_processor(
|
||||
|
||||
@@ -13,35 +13,53 @@ import importlib
|
||||
__all__ = [
|
||||
"BagelProcessor",
|
||||
"DeepseekVLV2Processor",
|
||||
"Eagle2_5_VLProcessor",
|
||||
"FireRedASR2Processor",
|
||||
"FunASRProcessor",
|
||||
"GLM4VProcessor",
|
||||
"H2OVLProcessor",
|
||||
"HunYuanVLProcessor",
|
||||
"HunYuanVLImageProcessor",
|
||||
"InternVLProcessor",
|
||||
"KimiAudioProcessor",
|
||||
"MistralCommonPixtralProcessor",
|
||||
"MistralCommonVoxtralProcessor",
|
||||
"NanoNemotronVLProcessor",
|
||||
"NemotronParseProcessor",
|
||||
"NemotronVLProcessor",
|
||||
"LlamaNemotronVLEmbedProcessor",
|
||||
"NVLMProcessor",
|
||||
"OvisProcessor",
|
||||
"Ovis2_5Processor",
|
||||
"QwenVLProcessor",
|
||||
"Qwen3ASRProcessor",
|
||||
"SkyworkR1VProcessor",
|
||||
]
|
||||
|
||||
_CLASS_TO_MODULE: dict[str, str] = {
|
||||
"BagelProcessor": "vllm.transformers_utils.processors.bagel",
|
||||
"DeepseekVLV2Processor": "vllm.transformers_utils.processors.deepseek_vl2",
|
||||
"Eagle2_5_VLProcessor": "vllm.transformers_utils.processors.eagle2_5_vl",
|
||||
"FireRedASR2Processor": "vllm.transformers_utils.processors.fireredasr2",
|
||||
"FunASRProcessor": "vllm.transformers_utils.processors.funasr",
|
||||
"GLM4VProcessor": "vllm.transformers_utils.processors.glm4v",
|
||||
"H2OVLProcessor": "vllm.transformers_utils.processors.h2ovl",
|
||||
"HunYuanVLProcessor": "vllm.transformers_utils.processors.hunyuan_vl",
|
||||
"HunYuanVLImageProcessor": "vllm.transformers_utils.processors.hunyuan_vl_image",
|
||||
"InternVLProcessor": "vllm.transformers_utils.processors.internvl",
|
||||
"KimiAudioProcessor": "vllm.transformers_utils.processors.kimi_audio",
|
||||
"MistralCommonPixtralProcessor": "vllm.transformers_utils.processors.pixtral",
|
||||
"MistralCommonVoxtralProcessor": "vllm.transformers_utils.processors.voxtral",
|
||||
"NanoNemotronVLProcessor": "vllm.transformers_utils.processors.nano_nemotron_vl",
|
||||
"NemotronParseProcessor": "vllm.transformers_utils.processors.nemotron_parse",
|
||||
"NemotronVLProcessor": "vllm.transformers_utils.processors.nemotron_vl",
|
||||
"LlamaNemotronVLEmbedProcessor": "vllm.transformers_utils.processors.nemotron_vl",
|
||||
"NVLMProcessor": "vllm.transformers_utils.processors.nvlm_d",
|
||||
"OvisProcessor": "vllm.transformers_utils.processors.ovis",
|
||||
"Ovis2_5Processor": "vllm.transformers_utils.processors.ovis2_5",
|
||||
"QwenVLProcessor": "vllm.transformers_utils.processors.qwen_vl",
|
||||
"Qwen3ASRProcessor": "vllm.transformers_utils.processors.qwen3_asr",
|
||||
"SkyworkR1VProcessor": "vllm.transformers_utils.processors.skyworkr1v",
|
||||
}
|
||||
|
||||
|
||||
|
||||
85
vllm/transformers_utils/processors/eagle2_5_vl.py
Normal file
85
vllm/transformers_utils/processors/eagle2_5_vl.py
Normal file
@@ -0,0 +1,85 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# Adapted from NVIDIA Eagle2.5-VL model
|
||||
# https://huggingface.co/nvidia/Eagle2.5-8B
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
from vllm.multimodal.processing import PromptUpdateDetails
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
|
||||
from .internvl import IMG_CONTEXT, IMG_END, IMG_START, BaseInternVLProcessor
|
||||
|
||||
|
||||
class Eagle2_5_VLProcessor(BaseInternVLProcessor):
|
||||
"""
|
||||
Custom processor for Eagle2.5-VL model.
|
||||
Extends BaseInternVLProcessor with Eagle-specific token handling.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
tokenizer: TokenizerLike,
|
||||
*,
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
) -> None:
|
||||
# Skip super().__init__() to avoid config manipulation
|
||||
# Directly initialize all required attributes
|
||||
self.config = config
|
||||
self.tokenizer = tokenizer
|
||||
|
||||
# Image size with force_image_size override
|
||||
image_size: int = config.vision_config.image_size
|
||||
if hasattr(config, "force_image_size") and config.force_image_size:
|
||||
image_size = config.force_image_size
|
||||
|
||||
patch_size: int = config.vision_config.patch_size
|
||||
downsample_ratio: float = getattr(config, "downsample_ratio", 0.5)
|
||||
|
||||
# Compute num_image_token
|
||||
self.num_image_token = int(
|
||||
(image_size // patch_size) ** 2 * (downsample_ratio**2)
|
||||
)
|
||||
self.image_size = image_size
|
||||
|
||||
# Dynamic patch settings with defaults
|
||||
self.min_dynamic_patch = (
|
||||
min_dynamic_patch
|
||||
if min_dynamic_patch is not None
|
||||
else getattr(config, "min_dynamic_patch", 1)
|
||||
)
|
||||
self.max_dynamic_patch = (
|
||||
max_dynamic_patch
|
||||
if max_dynamic_patch is not None
|
||||
else getattr(config, "max_dynamic_patch", 12)
|
||||
)
|
||||
self.dynamic_image_size = (
|
||||
dynamic_image_size
|
||||
if dynamic_image_size is not None
|
||||
else getattr(config, "dynamic_image_size", True)
|
||||
)
|
||||
self.use_thumbnail: bool = getattr(config, "use_thumbnail", True)
|
||||
|
||||
@property
|
||||
def image_token_id(self) -> int:
|
||||
"""Get the image token ID from config or tokenizer."""
|
||||
if hasattr(self.config, "image_token_index"):
|
||||
return self.config.image_token_index
|
||||
# Fallback to tokenizer vocab - use <IMG_CONTEXT> (ID: 151667)
|
||||
vocab = self.tokenizer.get_vocab()
|
||||
if IMG_CONTEXT in vocab:
|
||||
return vocab[IMG_CONTEXT]
|
||||
raise ValueError(f"Cannot find image token '{IMG_CONTEXT}' in vocabulary")
|
||||
|
||||
def get_image_repl(
|
||||
self,
|
||||
feature_size: int,
|
||||
num_patches: int | None,
|
||||
) -> PromptUpdateDetails[str]:
|
||||
"""Get image replacement string for prompt."""
|
||||
repl_features = IMG_CONTEXT * feature_size
|
||||
repl_full = IMG_START + repl_features + IMG_END
|
||||
|
||||
return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
|
||||
390
vllm/transformers_utils/processors/h2ovl.py
Normal file
390
vllm/transformers_utils/processors/h2ovl.py
Normal file
@@ -0,0 +1,390 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# adapted from https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/modeling_h2ovl_chat.py
|
||||
# https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/image_process.py
|
||||
# --------------------------------------------------------
|
||||
# H2OVL-Mississippi
|
||||
# Copyright (c) 2024 H2O.AI
|
||||
# Licensed under Apache 2.0 License [see LICENSE for details]
|
||||
# --------------------------------------------------------
|
||||
import torch
|
||||
from PIL import Image
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
from vllm.multimodal.processing import PromptUpdateDetails
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
|
||||
from .internvl import (
|
||||
IMG_CONTEXT,
|
||||
IMG_END,
|
||||
IMG_START,
|
||||
BaseInternVLProcessor,
|
||||
build_transform,
|
||||
find_closest_aspect_ratio,
|
||||
get_internvl_target_ratios,
|
||||
)
|
||||
|
||||
|
||||
def resolve_h2ovl_min_max_num(
|
||||
*,
|
||||
min_dynamic_patch: int,
|
||||
max_dynamic_patch: int,
|
||||
dynamic_image_size: bool,
|
||||
use_thumbnail: bool,
|
||||
) -> tuple[int, int]:
|
||||
min_dynamic_patch = min_dynamic_patch if dynamic_image_size else 1
|
||||
max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
|
||||
|
||||
if use_thumbnail and max_dynamic_patch != 1:
|
||||
max_dynamic_patch += 1
|
||||
|
||||
return min_dynamic_patch, max_dynamic_patch
|
||||
|
||||
|
||||
def get_h2ovl_target_ratios(
|
||||
min_num: int,
|
||||
max_num: int,
|
||||
*,
|
||||
prior_aspect_ratio: tuple[int, int] | None,
|
||||
) -> list[tuple[int, int]]:
|
||||
target_ratios = get_internvl_target_ratios(min_num, max_num)
|
||||
|
||||
# if prior_aspect_ratio is provided, filter the target ratios
|
||||
if prior_aspect_ratio is not None:
|
||||
target_ratios = [
|
||||
ratio
|
||||
for ratio in target_ratios
|
||||
if prior_aspect_ratio[0] % ratio[0] != 0
|
||||
and prior_aspect_ratio[1] % ratio[1] != 0
|
||||
]
|
||||
|
||||
return target_ratios
|
||||
|
||||
|
||||
# modified to include blocks generated in second pass
|
||||
def calculate_h2ovl_targets(
|
||||
*,
|
||||
orig_width: int,
|
||||
orig_height: int,
|
||||
target_ratios: list[tuple[int, int]],
|
||||
image_size: int,
|
||||
use_thumbnail: bool,
|
||||
) -> tuple[int, int, int, tuple[int, int]]:
|
||||
aspect_ratio = orig_width / orig_height
|
||||
|
||||
# find the closest aspect ratio to the target
|
||||
target_aspect_ratio = find_closest_aspect_ratio(
|
||||
aspect_ratio,
|
||||
target_ratios,
|
||||
width=orig_width,
|
||||
height=orig_height,
|
||||
image_size=image_size,
|
||||
)
|
||||
|
||||
# calculate the target width and height
|
||||
target_width = image_size * target_aspect_ratio[0]
|
||||
target_height = image_size * target_aspect_ratio[1]
|
||||
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
|
||||
|
||||
# add thumbnail image if num_blocks != 1
|
||||
if use_thumbnail and blocks != 1:
|
||||
blocks += 1
|
||||
|
||||
return blocks, target_width, target_height, target_aspect_ratio
|
||||
|
||||
|
||||
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
|
||||
# refactored to handle prior_aspect_ratio
|
||||
def dynamic_preprocess_h2ovl(
|
||||
image: Image.Image,
|
||||
*,
|
||||
target_ratios: list[tuple[int, int]],
|
||||
image_size: int,
|
||||
use_thumbnail: bool,
|
||||
) -> tuple[list[Image.Image], tuple[int, int]]:
|
||||
orig_width, orig_height = image.size
|
||||
|
||||
# calculate the number of blocks without thumbnail
|
||||
(
|
||||
blocks,
|
||||
target_width,
|
||||
target_height,
|
||||
target_aspect_ratio,
|
||||
) = calculate_h2ovl_targets(
|
||||
orig_width=orig_width,
|
||||
orig_height=orig_height,
|
||||
target_ratios=target_ratios,
|
||||
image_size=image_size,
|
||||
use_thumbnail=False,
|
||||
)
|
||||
|
||||
# resize the image
|
||||
resized_img = image.resize((target_width, target_height))
|
||||
processed_images = []
|
||||
for i in range(blocks):
|
||||
box = (
|
||||
(i % (target_width // image_size)) * image_size,
|
||||
(i // (target_width // image_size)) * image_size,
|
||||
((i % (target_width // image_size)) + 1) * image_size,
|
||||
((i // (target_width // image_size)) + 1) * image_size,
|
||||
)
|
||||
# split the image
|
||||
split_img = resized_img.crop(box)
|
||||
processed_images.append(split_img)
|
||||
|
||||
assert len(processed_images) == blocks
|
||||
|
||||
if use_thumbnail and len(processed_images) != 1:
|
||||
thumbnail_img = image.resize((image_size, image_size))
|
||||
processed_images.append(thumbnail_img)
|
||||
|
||||
return processed_images, target_aspect_ratio
|
||||
|
||||
|
||||
def _preprocess_image(
|
||||
image: Image.Image,
|
||||
*,
|
||||
input_size: int,
|
||||
min_num: int,
|
||||
max_num: int,
|
||||
use_thumbnail: bool,
|
||||
prior_aspect_ratio: tuple[int, int] | None,
|
||||
) -> tuple[torch.Tensor, tuple[int, int]]:
|
||||
target_ratios = get_h2ovl_target_ratios(
|
||||
min_num,
|
||||
max_num,
|
||||
prior_aspect_ratio=prior_aspect_ratio,
|
||||
)
|
||||
|
||||
transform = build_transform(input_size=input_size)
|
||||
images, target_aspect_ratio = dynamic_preprocess_h2ovl(
|
||||
image,
|
||||
image_size=input_size,
|
||||
use_thumbnail=use_thumbnail,
|
||||
target_ratios=target_ratios,
|
||||
)
|
||||
|
||||
pixel_values = torch.stack([transform(image) for image in images])
|
||||
return pixel_values, target_aspect_ratio
|
||||
|
||||
|
||||
# refactored to use the _preprocess_image function
|
||||
def image_to_pixel_values_h2ovl(
|
||||
image: Image.Image,
|
||||
*,
|
||||
input_size: int,
|
||||
min_num: int,
|
||||
max_num: int,
|
||||
use_thumbnail: bool,
|
||||
use_msac: bool,
|
||||
) -> torch.Tensor:
|
||||
# when MSAC is turned on, we need to process the image twice
|
||||
if use_msac:
|
||||
# first pass
|
||||
pixel_values1, aspect_ratio1 = _preprocess_image(
|
||||
image,
|
||||
input_size=input_size,
|
||||
min_num=1,
|
||||
max_num=max_num,
|
||||
use_thumbnail=True,
|
||||
prior_aspect_ratio=None,
|
||||
)
|
||||
# second pass
|
||||
pixel_values2, _ = _preprocess_image(
|
||||
image,
|
||||
input_size=input_size,
|
||||
min_num=3,
|
||||
max_num=max_num,
|
||||
use_thumbnail=True,
|
||||
prior_aspect_ratio=aspect_ratio1,
|
||||
)
|
||||
# combine pixel values
|
||||
pixel_values = torch.cat(
|
||||
[pixel_values2[:-1], pixel_values1[:-1], pixel_values2[-1:]], 0
|
||||
)
|
||||
|
||||
else:
|
||||
pixel_values, _ = _preprocess_image(
|
||||
image,
|
||||
input_size=input_size,
|
||||
min_num=min_num,
|
||||
max_num=max_num,
|
||||
use_thumbnail=use_thumbnail,
|
||||
prior_aspect_ratio=None,
|
||||
)
|
||||
|
||||
return pixel_values
|
||||
|
||||
|
||||
class H2OVLProcessor(BaseInternVLProcessor):
|
||||
def __init__(
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
tokenizer: TokenizerLike,
|
||||
*,
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
use_msac: bool | None = None,
|
||||
) -> None:
|
||||
super().__init__(
|
||||
config,
|
||||
tokenizer,
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
)
|
||||
|
||||
if use_msac is None:
|
||||
use_msac = config.use_msac
|
||||
assert isinstance(use_msac, bool)
|
||||
|
||||
self.use_msac = use_msac
|
||||
|
||||
@property
|
||||
def image_token_id(self) -> int:
|
||||
return self.tokenizer.get_vocab()[IMG_CONTEXT]
|
||||
|
||||
def get_image_repl(
|
||||
self,
|
||||
feature_size: int,
|
||||
num_patches: int | None,
|
||||
) -> PromptUpdateDetails[str]:
|
||||
repl_features = IMG_CONTEXT * feature_size
|
||||
repl_full = IMG_START + repl_features + IMG_END
|
||||
|
||||
return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
|
||||
|
||||
def resolve_min_max_num(
|
||||
self,
|
||||
*,
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
use_thumbnail: bool | None = None,
|
||||
) -> tuple[int, int]:
|
||||
min_dynamic_patch = (
|
||||
self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
|
||||
)
|
||||
max_dynamic_patch = (
|
||||
self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
|
||||
)
|
||||
dynamic_image_size = (
|
||||
self.dynamic_image_size
|
||||
if dynamic_image_size is None
|
||||
else dynamic_image_size
|
||||
)
|
||||
use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
|
||||
|
||||
return resolve_h2ovl_min_max_num(
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
use_thumbnail=use_thumbnail,
|
||||
)
|
||||
|
||||
def resolve_target_ratios(
|
||||
self,
|
||||
*,
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
use_thumbnail: bool | None = None,
|
||||
prior_aspect_ratio: tuple[int, int] | None = None,
|
||||
override_min_num: int | None = None,
|
||||
) -> list[tuple[int, int]]:
|
||||
min_num, max_num = self.resolve_min_max_num(
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
use_thumbnail=use_thumbnail,
|
||||
)
|
||||
if override_min_num is not None:
|
||||
min_num = override_min_num
|
||||
|
||||
return get_h2ovl_target_ratios(
|
||||
min_num,
|
||||
max_num,
|
||||
prior_aspect_ratio=prior_aspect_ratio,
|
||||
)
|
||||
|
||||
def get_num_image_tokens(
|
||||
self,
|
||||
*,
|
||||
image_width: int,
|
||||
image_height: int,
|
||||
use_msac: bool | None = None,
|
||||
) -> int:
|
||||
use_msac = self.use_msac if use_msac is None else use_msac
|
||||
|
||||
use_thumbnail = self.use_thumbnail
|
||||
|
||||
if use_msac:
|
||||
target_ratios_1 = self.resolve_target_ratios(
|
||||
use_thumbnail=False, # Applied in calculate_targets
|
||||
override_min_num=1,
|
||||
)
|
||||
num_patches_1, _, _, aspect_ratio_1 = calculate_h2ovl_targets(
|
||||
orig_width=image_width,
|
||||
orig_height=image_height,
|
||||
image_size=self.image_size,
|
||||
target_ratios=target_ratios_1,
|
||||
use_thumbnail=True,
|
||||
)
|
||||
|
||||
target_ratios_2 = self.resolve_target_ratios(
|
||||
use_thumbnail=False, # Applied in calculate_targets
|
||||
prior_aspect_ratio=aspect_ratio_1,
|
||||
override_min_num=3,
|
||||
)
|
||||
num_patches_2, _, _, _ = calculate_h2ovl_targets(
|
||||
orig_width=image_width,
|
||||
orig_height=image_height,
|
||||
image_size=self.image_size,
|
||||
target_ratios=target_ratios_2,
|
||||
use_thumbnail=True,
|
||||
)
|
||||
|
||||
num_patches = num_patches_1 + num_patches_2 - 1
|
||||
else:
|
||||
target_ratios = self.resolve_target_ratios(
|
||||
use_thumbnail=False, # Applied in calculate_targets
|
||||
)
|
||||
num_patches, _, _, _ = calculate_h2ovl_targets(
|
||||
orig_width=image_width,
|
||||
orig_height=image_height,
|
||||
image_size=self.image_size,
|
||||
target_ratios=target_ratios,
|
||||
use_thumbnail=use_thumbnail,
|
||||
)
|
||||
|
||||
return num_patches * self.num_image_token
|
||||
|
||||
def _images_to_pixel_values_lst(
|
||||
self,
|
||||
images: list[Image.Image],
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
) -> list[torch.Tensor]:
|
||||
use_msac = self.use_msac if len(images) == 1 else False
|
||||
|
||||
min_num, max_num = self.resolve_min_max_num(
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
use_thumbnail=False, # Applied in image_to_pixel_values
|
||||
)
|
||||
|
||||
return [
|
||||
image_to_pixel_values_h2ovl(
|
||||
image,
|
||||
input_size=self.image_size,
|
||||
min_num=min_num,
|
||||
max_num=max_num,
|
||||
use_thumbnail=self.use_thumbnail,
|
||||
use_msac=use_msac,
|
||||
)
|
||||
for image in images
|
||||
]
|
||||
603
vllm/transformers_utils/processors/internvl.py
Normal file
603
vllm/transformers_utils/processors/internvl.py
Normal file
@@ -0,0 +1,603 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_internvl_chat.py
|
||||
# --------------------------------------------------------
|
||||
# InternVL
|
||||
# Copyright (c) 2023 OpenGVLab
|
||||
# Licensed under The MIT License [see LICENSE for details]
|
||||
# --------------------------------------------------------
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, TypeVar
|
||||
|
||||
import numpy.typing as npt
|
||||
import torch
|
||||
import torchvision.transforms as T
|
||||
from PIL import Image
|
||||
from transformers import BatchFeature, PretrainedConfig, TensorType
|
||||
|
||||
from vllm.multimodal.image import convert_image_mode
|
||||
from vllm.multimodal.processing import PromptUpdateDetails
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
|
||||
# Generic type parameter used by batching helpers below.
_T = TypeVar("_T")

# Literal marker tokens that delimit image features in the prompt text.
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"

# Standard ImageNet channel statistics used to normalize pixel values.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
|
||||
|
||||
|
||||
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def build_transform(input_size: int):
    """Return the torchvision pipeline mapping a PIL image to a normalized
    ``input_size`` x ``input_size`` tensor (RGB, ImageNet statistics)."""
    return T.Compose(
        [
            T.Lambda(lambda img: convert_image_mode(img, "RGB")),
            T.Resize(
                (input_size, input_size),
                interpolation=T.InterpolationMode.BICUBIC,
            ),
            T.ToTensor(),
            T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
        ]
    )
|
||||
|
||||
|
||||
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def find_closest_aspect_ratio(
    aspect_ratio: float,
    target_ratios: list[tuple[int, int]],
    *,
    width: int,
    height: int,
    image_size: int,
) -> tuple[int, int]:
    """Pick the (cols, rows) grid from *target_ratios* whose aspect ratio is
    nearest to *aspect_ratio*.

    Ties are broken in favor of the larger grid when the original image area
    exceeds half the area that grid would cover at ``image_size`` per tile.
    """
    best = (1, 1)
    best_diff = float("inf")
    area = width * height
    half_tile_area = 0.5 * image_size * image_size

    for w, h in target_ratios:
        diff = abs(aspect_ratio - w / h)
        if diff < best_diff:
            best_diff = diff
            best = (w, h)
        elif diff == best_diff and area > half_tile_area * w * h:
            # Same closeness: prefer more tiles for sufficiently large images.
            best = (w, h)

    return best
|
||||
|
||||
|
||||
def resolve_internvl_min_max_num(
    *,
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool,
    use_thumbnail: bool,
) -> tuple[int, int]:
    """Resolve the effective (min, max) dynamic patch counts.

    With dynamic tiling disabled both bounds collapse to 1; when a thumbnail
    tile is appended the maximum grows by one to account for it.
    """
    if not dynamic_image_size:
        min_dynamic_patch = 1
        max_dynamic_patch = 1

    if use_thumbnail and max_dynamic_patch != 1:
        max_dynamic_patch += 1

    return min_dynamic_patch, max_dynamic_patch
|
||||
|
||||
|
||||
def get_internvl_target_ratios(
    min_num: int,
    max_num: int,
) -> list[tuple[int, int]]:
    """Enumerate candidate (cols, rows) tile grids whose tile count lies in
    ``[min_num, max_num]``, ordered by ascending tile count."""
    seen: set[tuple[int, int]] = set()
    for n in range(min_num, max_num + 1):
        for i in range(1, n + 1):
            for j in range(1, n + 1):
                if min_num <= i * j <= max_num:
                    seen.add((i, j))
    return sorted(seen, key=lambda wh: wh[0] * wh[1])
|
||||
|
||||
|
||||
def calculate_internvl_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int]:
    """Choose a tile grid for an image and return
    ``(num_blocks, target_width, target_height)``.

    ``num_blocks`` includes the extra thumbnail tile when *use_thumbnail* is
    set and the grid has more than one tile.
    """
    # find the closest aspect ratio to the target
    grid_w, grid_h = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )

    # calculate the target width and height
    target_width = image_size * grid_w
    target_height = image_size * grid_h
    blocks = grid_w * grid_h

    # add thumbnail image if num_blocks != 1
    if use_thumbnail and blocks != 1:
        blocks += 1

    return blocks, target_width, target_height
|
||||
|
||||
|
||||
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def dynamic_preprocess_internvl(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> list[Image.Image]:
    """Split *image* into a grid of ``image_size`` x ``image_size`` tiles.

    The grid shape is chosen from *target_ratios* to best match the image's
    aspect ratio; when *use_thumbnail* is set and more than one tile results,
    a downscaled copy of the whole image is appended as an extra tile.
    """
    orig_width, orig_height = image.size

    # calculate the number of blocks without thumbnail
    blocks, target_width, target_height = calculate_internvl_targets(
        orig_width=orig_width,
        orig_height=orig_height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        # Walk the grid row-major: (i % cols) is the column, (i // cols) the row.
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size,
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)

    assert len(processed_images) == blocks

    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)

    return processed_images
|
||||
|
||||
|
||||
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def image_to_pixel_values_internvl(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
) -> torch.Tensor:
    """Tile an image dynamically and stack the normalized tiles into a
    ``(num_tiles, 3, input_size, input_size)`` tensor."""
    tiles = dynamic_preprocess_internvl(
        image,
        target_ratios=get_internvl_target_ratios(min_num, max_num),
        image_size=input_size,
        use_thumbnail=use_thumbnail,
    )

    transform = build_transform(input_size=input_size)
    return torch.stack([transform(tile) for tile in tiles])
|
||||
|
||||
|
||||
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def video_to_pixel_values_internvl(
    video: npt.NDArray,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
) -> torch.Tensor:
    """Convert a video (array of RGB frames) to a stacked frame tensor.

    Each frame is expected to produce exactly one tile (callers pass
    min/max of 1), so the output is ``(num_frames, 3, input_size, input_size)``.
    """
    target_ratios = get_internvl_target_ratios(min_num, max_num)
    transform = build_transform(input_size=input_size)

    frames: list[Image.Image] = []
    for raw_frame in video:
        tiles = dynamic_preprocess_internvl(
            Image.fromarray(raw_frame, mode="RGB"),
            target_ratios=target_ratios,
            image_size=input_size,
            use_thumbnail=use_thumbnail,
        )
        # A frame must map to a single tile; videos are not dynamically tiled.
        assert len(tiles) == 1
        frames.extend(tiles)

    return torch.stack([transform(frame) for frame in frames])
|
||||
|
||||
|
||||
class BaseInternVLProcessor(ABC):
    """
    This model doesn't define its own HF processor,
    so we implement our own one here.

    The code to insert image tokens is based on:
    https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        """Initialize from the model config, with optional per-processor
        overrides of the dynamic tiling parameters (falling back to the
        corresponding ``config`` attributes when ``None``)."""
        super().__init__()

        self.config = config
        self.tokenizer = tokenizer

        image_size: int = config.vision_config.image_size
        patch_size: int = config.vision_config.patch_size

        if min_dynamic_patch is None:
            min_dynamic_patch = config.min_dynamic_patch
        assert isinstance(min_dynamic_patch, int)

        if max_dynamic_patch is None:
            max_dynamic_patch = config.max_dynamic_patch
        assert isinstance(max_dynamic_patch, int)

        if dynamic_image_size is None:
            dynamic_image_size = config.dynamic_image_size
        assert isinstance(dynamic_image_size, bool)

        # Tokens per tile: ViT patches per side squared, scaled by the
        # pixel-shuffle downsample ratio squared.
        self.num_image_token = int(
            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
        )
        self.image_size = image_size
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch
        self.dynamic_image_size = dynamic_image_size
        self.use_thumbnail: bool = config.use_thumbnail

    @property
    @abstractmethod
    def image_token_id(self) -> int:
        """Vocabulary id of the image-context placeholder token."""
        raise NotImplementedError

    @abstractmethod
    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Return the prompt-text replacement for one image placeholder."""
        raise NotImplementedError

    def resolve_min_max_num(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> tuple[int, int]:
        """Resolve effective (min, max) tile counts, defaulting each
        ``None`` argument to the processor's configured value."""
        min_dynamic_patch = (
            self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
        )
        max_dynamic_patch = (
            self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
        )
        dynamic_image_size = (
            self.dynamic_image_size
            if dynamic_image_size is None
            else dynamic_image_size
        )
        use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail

        return resolve_internvl_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )

    def resolve_target_ratios(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> list[tuple[int, int]]:
        """Candidate tile grids for the resolved (min, max) tile counts."""
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )

        return get_internvl_target_ratios(min_num, max_num)

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        """Number of placeholder tokens one image of this size expands to."""
        target_ratios = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
        )

        num_patches, _, _ = calculate_internvl_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=target_ratios,
            use_thumbnail=self.use_thumbnail,
        )

        return num_patches * self.num_image_token

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """One stacked tile tensor per input image."""
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,  # Applied in image_to_pixel_values
        )

        return [
            image_to_pixel_values_internvl(
                image,
                input_size=self.image_size,
                min_num=min_num,
                max_num=max_num,
                use_thumbnail=self.use_thumbnail,
            )
            for image in images
        ]

    def _preprocess_image(
        self,
        text: list[str],
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> tuple[list[str], dict[str, torch.Tensor]]:
        """Expand "<image>" markers in *text* and build image tensor inputs.

        Returns the updated prompt strings and a dict with the flattened
        tile tensor plus per-image tile counts (empty when no images).
        """
        if len(images) == 0:
            image_inputs = {}
        else:
            pixel_values_lst = self._images_to_pixel_values_lst(
                images,
                min_dynamic_patch=min_dynamic_patch,
                max_dynamic_patch=max_dynamic_patch,
                dynamic_image_size=dynamic_image_size,
            )
            image_inputs = {
                "pixel_values_flat": torch.cat(pixel_values_lst),
                "image_num_patches": torch.tensor(
                    [len(item) for item in pixel_values_lst]
                ),
            }

            # Replace one "<image>" occurrence per image, in order.
            for pixel_values in pixel_values_lst:
                num_patches = pixel_values.shape[0]
                feature_size = num_patches * self.num_image_token

                image_repl = self.get_image_repl(feature_size, num_patches)
                text = [t.replace("<image>", image_repl.full, 1) for t in text]
        return text, image_inputs

    def _make_batch_input(self, input_item: _T | list[_T] | None = None) -> list[_T]:
        """Normalize a single item / list / ``None`` into a list."""
        if input_item is None:
            input_item = []
        if not isinstance(input_item, list):
            input_item = [input_item]
        return input_item

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        return_tensors: str | TensorType | None = None,
        **kwargs,
    ) -> BatchFeature:
        """HF-processor style entry point: tokenize *text* with image
        placeholders expanded, and merge in the image tensor inputs."""
        text = self._make_batch_input(text)
        images = self._make_batch_input(images)

        text, image_inputs = self._preprocess_image(
            text=text,
            images=images,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )

        text_inputs = self.tokenizer(text)

        combined_outputs = {**text_inputs, **image_inputs}

        return BatchFeature(combined_outputs, tensor_type=return_tensors)
|
||||
|
||||
|
||||
class InternVLProcessor(BaseInternVLProcessor):
    """
    HF Processor for InternVLChatModel with extended video processing logic.

    Code for video processing is adapted from video example:
    https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        video_token: str | None = None,
    ) -> None:
        super().__init__(
            config=config,
            tokenizer=tokenizer,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )
        # add extra video token for video processing
        self.video_token = video_token

    @property
    def image_token_id(self) -> int:
        # Raises KeyError if IMG_CONTEXT is missing from the vocab.
        return self.tokenizer.get_vocab()[IMG_CONTEXT]

    @property
    def video_token_id(self) -> int | None:
        """Vocabulary id of the video token, or ``None`` when unconfigured
        or absent from the tokenizer's vocabulary."""
        if self.video_token is None:
            return None
        return self.tokenizer.get_vocab().get(self.video_token, None)

    @property
    def supports_video(self) -> bool:
        return self.video_token_id is not None

    def _videos_to_pixel_values_lst(
        self,
        videos: list[npt.NDArray],
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """One stacked frame tensor per video (one tile per frame, hence
        min/max dynamic patch pinned to 1)."""
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=1,
            max_dynamic_patch=1,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,  # Applied in image_to_pixel_values
        )

        return [
            video_to_pixel_values_internvl(
                video,
                input_size=self.image_size,
                min_num=min_num,
                max_num=max_num,
                use_thumbnail=False,
            )
            for video in videos
        ]

    def _preprocess_video(
        self,
        text: list[str],
        videos: list[npt.NDArray],
        dynamic_image_size: bool | None = None,
    ) -> tuple[list[str], dict[str, Any]]:
        """Expand "<video>" markers in *text* and build video tensor inputs.

        No-op (returns *text* unchanged and an empty dict) when there are no
        videos or the tokenizer has no video token.
        """
        if len(videos) == 0 or not self.supports_video:
            return text, {}

        video_token = self.video_token
        assert video_token is not None

        pixel_values_lst_video = self._videos_to_pixel_values_lst(
            videos,
            dynamic_image_size=dynamic_image_size,
        )
        video_inputs = {
            "pixel_values_flat_video": torch.cat(pixel_values_lst_video),
            "video_num_patches": torch.tensor(
                [len(item) for item in pixel_values_lst_video]
            ),
        }

        # Replace one "<video>" occurrence per video, in order.
        for pixel_values in pixel_values_lst_video:
            num_patches = pixel_values.shape[0]

            video_repl = self.get_video_repl(
                self.num_image_token, num_patches, video_token
            )
            text = [t.replace("<video>", video_repl.full, 1) for t in text]
        return text, video_inputs

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        videos: npt.NDArray | list[npt.NDArray] | None = None,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        return_tensors: str | TensorType | None = None,
        **kwargs,
    ) -> BatchFeature:
        """HF-processor entry point handling text, images, and videos."""
        text = self._make_batch_input(text)
        images = self._make_batch_input(images)
        videos = self._make_batch_input(videos)

        # Images first, then videos, so both placeholder kinds are expanded
        # before tokenization.
        text, image_inputs = self._preprocess_image(
            text=text,
            images=images,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )

        text, video_inputs = self._preprocess_video(
            text=text,
            videos=videos,
            dynamic_image_size=dynamic_image_size,
        )

        text_inputs = self.tokenizer(text)

        combined_outputs = {**text_inputs, **image_inputs, **video_inputs}

        return BatchFeature(combined_outputs, tensor_type=return_tensors)

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Prompt replacement for one image: IMG_CONTEXT repeated
        *feature_size* times, wrapped in IMG_START/IMG_END.

        NOTE: ``num_patches`` is accepted for interface compatibility but
        not used here.
        """
        repl_features = IMG_CONTEXT * feature_size
        repl_full = IMG_START + repl_features + IMG_END

        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)

    def get_video_repl(
        self,
        feature_size: int,
        num_patches: int | None,
        video_context_token: str = IMG_CONTEXT,
    ) -> PromptUpdateDetails[str]:
        """Prompt replacement for one video: a "Frame{i}: <img>...</img>"
        segment per frame.

        NOTE: the per-frame token count comes from ``self.num_image_token``;
        the ``feature_size`` parameter is accepted but not used here.
        """
        if num_patches is None:
            raise NotImplementedError("Embedding inputs are not supported")

        repl_features = video_context_token * self.num_image_token
        repl_features_with_sep = IMG_START + repl_features + IMG_END
        # num_patches is equal to num_frames
        repl_full = "".join(
            [f"Frame{i + 1}: {repl_features_with_sep}" for i in range(num_patches)]
        )

        return PromptUpdateDetails.select_text(repl_full, video_context_token)
|
||||
1032
vllm/transformers_utils/processors/nano_nemotron_vl.py
Normal file
1032
vllm/transformers_utils/processors/nano_nemotron_vl.py
Normal file
File diff suppressed because it is too large
Load Diff
245
vllm/transformers_utils/processors/nemotron_parse.py
Normal file
245
vllm/transformers_utils/processors/nemotron_parse.py
Normal file
@@ -0,0 +1,245 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
#
|
||||
# Adapted from https://github.com/amalad/vllm/blob/nemotron_parse/vllm/model_executor/models/nemotron_parse.py
|
||||
# that's based on https://huggingface.co/nvidia/NVIDIA-Nemotron-Parse-v1.1/blob/main/hf_nemotron_parse_modeling.py
|
||||
from typing import TypeVar
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from PIL import Image
|
||||
from timm.data.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
|
||||
from torchvision import transforms as T
|
||||
from transformers import BatchFeature, PretrainedConfig, TensorType
|
||||
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
|
||||
# Generic type parameter used by batching helpers below.
_T = TypeVar("_T")

# Default (height, width) the processor pads/resizes pages to.
DEFAULT_FINAL_IMAGE_SIZE = (2048, 1648)
|
||||
|
||||
|
||||
class NemotronParseImageProcessor:
    """
    NemotronParse Image Processor

    Resizes images preserving aspect ratio, pads them (bottom-right, white)
    to a fixed target size, and normalizes with OpenAI CLIP statistics.
    """

    def __init__(
        self,
        final_size: tuple = DEFAULT_FINAL_IMAGE_SIZE,
        **kwargs,
    ):
        # Ensure final_size is properly formatted
        # (accepts a (H, W) sequence, a single scalar for a square, or
        # anything else falls back to the module default).
        if isinstance(final_size, (list, tuple)) and len(final_size) >= 2:
            self.final_size = (int(final_size[0]), int(final_size[1]))
        elif isinstance(final_size, (int, float)):
            self.final_size = (int(final_size), int(final_size))
        else:
            self.final_size = DEFAULT_FINAL_IMAGE_SIZE  # Default fallback

        # Broadcastable (1, C, 1, 1) normalization constants.
        self.norm_mean = torch.Tensor(OPENAI_CLIP_MEAN).reshape(1, 3, 1, 1)
        self.norm_std = torch.Tensor(OPENAI_CLIP_STD).reshape(1, 3, 1, 1)

        # Create transforms
        self._create_transforms()

    def _create_transforms(self):
        """Create transform objects."""
        try:
            import albumentations as A
        except ImportError as err:
            raise ImportError(
                "The package `albumentations` is required to use "
                "NemotronParse model. Please install it with `pip install "
                "albumentations`."
            ) from err

        # Ensure final_size is a tuple of integers
        # NOTE: self.final_size is interpreted as (height, width).
        if isinstance(self.final_size, (list, tuple)):
            self.target_height, self.target_width = (
                int(self.final_size[0]),
                int(self.final_size[1]),
            )
        else:
            self.target_height = self.target_width = int(self.final_size)

        import cv2

        # Pad to at least target size with white; resizing is done manually
        # in _resize_with_aspect_ratio before this transform runs.
        self.transform = A.Compose(
            [
                A.PadIfNeeded(
                    min_height=self.target_height,
                    min_width=self.target_width,
                    border_mode=cv2.BORDER_CONSTANT,
                    fill=[255, 255, 255],
                    p=1.0,
                ),
            ]
        )

        # Final numpy -> float tensor conversion (scales to [0, 1]).
        self.torch_transform = T.Compose(
            [
                T.ToTensor(),
            ]
        )

    def _resize_with_aspect_ratio(self, image: np.ndarray) -> np.ndarray:
        """Resize image maintaining aspect ratio (exact replica of original
        LongestMaxSizeHW)."""
        height, width = image.shape[:2]
        max_size_height = self.target_height
        max_size_width = self.target_width

        # Original LongestMaxSizeHW algorithm from custom_augmentations.py
        aspect_ratio = width / height
        new_height = height
        new_width = width

        # If height too big then scale image down
        if height > max_size_height:
            new_height = max_size_height
            new_width = int(new_height * aspect_ratio)

        # If width too big, scale image down further
        if new_width > max_size_width:
            new_width = max_size_width
            new_height = int(new_width / aspect_ratio)

        # Use cv2.INTER_LINEAR like the original
        import cv2

        return cv2.resize(
            image, (new_width, new_height), interpolation=cv2.INTER_LINEAR
        )

    def _pad_to_size(self, image: np.ndarray) -> np.ndarray:
        """Pad image to target size with white padding (matches A.PadIfNeeded
        behavior)."""
        h, w = image.shape[:2]
        min_height, min_width = self.target_height, self.target_width

        # Only pad if image is smaller than target (matches A.PadIfNeeded logic)
        pad_h = max(0, min_height - h)
        pad_w = max(0, min_width - w)

        if pad_h == 0 and pad_w == 0:
            return image

        # A.PadIfNeeded pads to bottom-right with constant value
        if len(image.shape) == 3:
            # Color image - pad bottom and right with white (255, 255, 255)
            padded = np.pad(
                image,
                ((0, pad_h), (0, pad_w), (0, 0)),
                mode="constant",
                constant_values=255,
            )
        else:
            # Grayscale image - pad with white (255)
            padded = np.pad(
                image, ((0, pad_h), (0, pad_w)), mode="constant", constant_values=255
            )

        return padded

    def preprocess(
        self,
        images: Image.Image | list[Image.Image],
        **kwargs,
    ) -> dict[str, torch.Tensor]:
        """
        Preprocess an image or batch of images for the NemotronParse model.

        Args:
            images: Input image(s)

        Returns:
            Dict with a normalized "pixel_values" batch tensor.
        """
        # Ensure images is a list
        if not isinstance(images, list):
            images = [images]

        # Convert PIL images to numpy arrays if needed
        processed_images = []
        for image in images:
            if isinstance(image, Image.Image):
                image = np.asarray(image)
            processed_images.append(image)

        # Apply NemotronParse-specific transforms
        pixel_values = []
        for image in processed_images:
            # Manual resize with aspect ratio preservation
            # (replaces LongestMaxSizeHW)
            processed_image = self._resize_with_aspect_ratio(image)

            # Apply remaining albumentations transforms if available
            # NOTE(review): self.transform is always set by
            # _create_transforms (which raises otherwise), so the fallback
            # branch below appears unreachable — kept for safety.
            if self.transform is not None:
                transformed = self.transform(image=processed_image)
                processed_image = transformed["image"]
            else:
                # Fallback: just pad to target size
                processed_image = self._pad_to_size(processed_image)

            # Convert to tensor
            pixel_values_tensor = self.torch_transform(processed_image)

            # Handle grayscale images
            if pixel_values_tensor.shape[0] == 1:
                pixel_values_tensor = pixel_values_tensor.expand(3, -1, -1)

            pixel_values.append(pixel_values_tensor)

        # Stack into batch
        pixel_values = torch.stack(pixel_values)

        # Normalize pixel values
        normalized_values = (pixel_values - self.norm_mean) / self.norm_std
        return {"pixel_values": normalized_values}

    def __call__(
        self, images: Image.Image | list[Image.Image], **kwargs
    ) -> dict[str, torch.Tensor]:
        """Alias for :meth:`preprocess`."""
        return self.preprocess(images, **kwargs)
|
||||
|
||||
|
||||
class NemotronParseProcessor:
    """
    NemotronParse Processor

    Combines the tokenizer with a NemotronParseImageProcessor into a single
    HF-processor-style callable.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        **kwargs,
    ) -> None:
        super().__init__()

        self.config = config
        self.tokenizer = tokenizer

        # Image target size comes from the model config.
        self.image_processor = NemotronParseImageProcessor(final_size=config.image_size)

    def _make_batch_input(self, input_item: _T | list[_T] | None = None) -> list[_T]:
        """Normalize a single item / list / ``None`` into a list."""
        if input_item is None:
            input_item = []
        if not isinstance(input_item, list):
            input_item = [input_item]
        return input_item

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        return_tensors: str | TensorType | None = None,
        **kwargs,
    ) -> BatchFeature:
        """Tokenize *text* (without special tokens) and preprocess *images*,
        returning a merged BatchFeature."""
        text = self._make_batch_input(text)
        images = self._make_batch_input(images)
        image_inputs = {} if len(images) == 0 else self.image_processor(images)

        text_inputs = self.tokenizer(text, add_special_tokens=False, **kwargs)
        combined_outputs = BatchFeature(
            data={**text_inputs, **image_inputs},
            tensor_type=return_tensors,
        )
        return combined_outputs
|
||||
410
vllm/transformers_utils/processors/nemotron_vl.py
Normal file
410
vllm/transformers_utils/processors/nemotron_vl.py
Normal file
@@ -0,0 +1,410 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from abc import ABC
|
||||
|
||||
import torch
|
||||
import torchvision.transforms as T
|
||||
from PIL import Image
|
||||
from transformers import PretrainedConfig
|
||||
from transformers.image_processing_utils_fast import BaseImageProcessorFast
|
||||
|
||||
from vllm.multimodal.image import convert_image_mode
|
||||
from vllm.multimodal.processing import PromptUpdateDetails
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
|
||||
from .internvl import InternVLProcessor
|
||||
|
||||
# Configure PIL to handle large images without warnings
# This prevents DecompressionBombWarning for legitimate large images
# NOTE: disabling the limit removes PIL's decompression-bomb safeguard for
# everything processed in this interpreter, not just this module.
Image.MAX_IMAGE_PIXELS = None  # Disable the limit entirely
# Alternative: Set a specific higher limit
# Image.MAX_IMAGE_PIXELS = 300000000  # ~300M pixels
|
||||
|
||||
|
||||
def build_transform(input_size: int):
    """Return the torchvision pipeline mapping a PIL image to an
    ``input_size`` x ``input_size`` RGB tensor (no normalization)."""
    steps = [
        T.Lambda(lambda img: convert_image_mode(img, "RGB")),
        T.Resize(
            (input_size, input_size),
            interpolation=T.InterpolationMode.BICUBIC,
        ),
        T.ToTensor(),
    ]
    return T.Compose(steps)
|
||||
|
||||
|
||||
# adapted from https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1
def find_closest_aspect_ratio(
    aspect_ratio: float,
    target_ratios: list[tuple[int, int]],
    *,
    width: int,
    height: int,
    image_size: int,
) -> tuple[int, int]:
    """Pick the (cols, rows) grid maximizing coverage * aspect-closeness.

    Coverage is the grid's pixel budget relative to the image area, capped
    at 0.6 so very large grids don't dominate; closeness is the symmetric
    ratio of grid aspect to image aspect (1.0 = perfect match).
    """
    area = width * height
    best = (1, 1)
    best_score = float("-inf")

    for rw, rh in target_ratios:
        coverage = min((rw * rh * image_size * image_size) / area, 0.6)
        candidate_aspect = rw / rh
        closeness = min(
            candidate_aspect / aspect_ratio, aspect_ratio / candidate_aspect
        )
        score = coverage * closeness

        if score > best_score:
            best_score = score
            best = (rw, rh)

    return best
|
||||
|
||||
|
||||
def calculate_nemotron_vl_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int]:
    """Choose a tile grid for an image and return
    ``(num_blocks, target_width, target_height)``.

    ``num_blocks`` includes the extra thumbnail tile when *use_thumbnail*
    is set and the grid has more than one tile.
    """
    # find the closest aspect ratio to the target
    grid_w, grid_h = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )

    # calculate the target width and height
    target_width = image_size * grid_w
    target_height = image_size * grid_h
    blocks = grid_w * grid_h

    # add thumbnail image if num_blocks != 1
    if use_thumbnail and blocks != 1:
        blocks += 1

    return blocks, target_width, target_height
|
||||
|
||||
|
||||
def dynamic_preprocess_nemotron_vl(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> list[Image.Image]:
    """Split *image* into a grid of ``image_size`` x ``image_size`` tiles.

    The grid shape is chosen from *target_ratios* by the Nemotron-VL scoring
    heuristic; when *use_thumbnail* is set and more than one tile results, a
    downscaled copy of the whole image is appended as an extra tile.
    """
    orig_width, orig_height = image.size

    # calculate the number of blocks without thumbnail
    blocks, target_width, target_height = calculate_nemotron_vl_targets(
        orig_width=orig_width,
        orig_height=orig_height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        # Walk the grid row-major: (i % cols) is the column, (i // cols) the row.
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size,
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)

    assert len(processed_images) == blocks

    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)

    return processed_images
|
||||
|
||||
|
||||
def get_nemotron_vl_target_ratios(
    min_num: int,
    max_num: int,
) -> list[tuple[int, int]]:
    """Enumerate every (cols, rows) tile grid whose total tile count lies
    in ``[min_num, max_num]``, ordered by total number of tiles."""
    seen: set[tuple[int, int]] = set()
    for bound in range(min_num, max_num + 1):
        for cols in range(1, bound + 1):
            for rows in range(1, bound + 1):
                if min_num <= cols * rows <= max_num:
                    seen.add((cols, rows))
    return sorted(seen, key=lambda pair: pair[0] * pair[1])
|
||||
|
||||
|
||||
def image_to_pixel_values_nemotron_vl(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
    transform: T.Compose | None = None,
) -> torch.Tensor:
    """Tile one image and stack the transformed tiles into a single tensor.

    When ``transform`` is not supplied, the default nemotron_vl transform
    for ``input_size`` is used.
    """
    if transform is None:
        transform = build_transform(input_size=input_size)

    tiles = dynamic_preprocess_nemotron_vl(
        image,
        target_ratios=get_nemotron_vl_target_ratios(min_num, max_num),
        image_size=input_size,
        use_thumbnail=use_thumbnail,
    )

    return torch.stack([transform(tile) for tile in tiles])
|
||||
|
||||
|
||||
class NemotronVLProcessor(InternVLProcessor):
    """HF-style processor for Nemotron-VL models.

    Builds on the InternVL dynamic-tiling processor, but sets up its own
    state from Nemotron-VL config fields and handles the fact that the
    image context token here is literally ``"<image>"`` — the same string
    as the prompt placeholder — which requires a two-phase replacement
    (see ``_replace_image_tokens``).
    """

    # Tokens wrapping/filling the expanded image-feature region.
    IMG_START = "<img>"
    IMG_END = "</img>"
    IMG_CONTEXT = "<image>"

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        image_processor: BaseImageProcessorFast,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        # Deliberately bypass InternVLProcessor.__init__: Nemotron-VL reads
        # its geometry from different config fields (force_image_size,
        # image_processor.max_num_tiles), so all state is assigned here.
        ABC.__init__(self)
        self.config = config
        self.tokenizer = tokenizer
        self.image_processor = image_processor
        image_size: int = config.force_image_size
        patch_size: int = config.patch_size

        if min_dynamic_patch is None:
            min_dynamic_patch = 1
        assert isinstance(min_dynamic_patch, int)

        if max_dynamic_patch is None:
            max_dynamic_patch = self.image_processor.max_num_tiles
        assert isinstance(max_dynamic_patch, int)

        if dynamic_image_size is None:
            dynamic_image_size = True
        assert isinstance(dynamic_image_size, bool)

        # Feature tokens contributed by a single tile after the vision
        # encoder's spatial downsampling.
        self.num_image_token = int(
            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
        )
        self.image_size = image_size
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch
        self.dynamic_image_size = dynamic_image_size

        # The embedding subclass passes image_processor=None and falls back
        # to the model config for the thumbnail flag.
        if image_processor is not None:
            self.use_thumbnail = image_processor.use_thumbnail
        else:
            self.use_thumbnail = getattr(config, "use_thumbnail", True)

    @property
    def image_token_id(self) -> int:
        """Token id of the image context token in the tokenizer vocab."""
        return self.tokenizer.get_vocab()[self.IMG_CONTEXT]

    def _get_transform(self) -> T.Compose:
        """Transform applied to every tile before stacking."""
        return build_transform(input_size=self.image_size)

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        """Number of feature tokens an image of the given size expands to."""
        target_ratios = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
        )

        num_patches, _, _ = calculate_nemotron_vl_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=target_ratios,
            use_thumbnail=self.use_thumbnail,
        )

        return num_patches * self.num_image_token

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Produce one stacked tile-pixel tensor per input image."""
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,  # Applied in image_to_pixel_values
        )

        return [
            image_to_pixel_values_nemotron_vl(
                image,
                input_size=self.image_size,
                min_num=min_num,
                max_num=max_num,
                use_thumbnail=self.use_thumbnail,
                transform=self._get_transform(),
            )
            for image in images
        ]

    def _replace_image_tokens(
        self,
        text: list[str],
        pixel_values_lst: list[torch.Tensor],
    ) -> list[str]:
        """Replace <image> placeholders with image tokens."""
        for pixel_values in pixel_values_lst:
            num_patches = pixel_values.shape[0]
            feature_size = num_patches * self.num_image_token
            image_repl = self.get_image_repl(feature_size, num_patches)
            # Use temporary placeholder to avoid replacing tokens we just inserted
            # (IMG_CONTEXT is itself "<image>" for this model).
            NVL_IMAGE_CONTEXT = image_repl.full.replace("<image>", "<NVL_IMG_CONTEXT>")
            text = [t.replace("<image>", NVL_IMAGE_CONTEXT, 1) for t in text]
        # Second phase: swap the temporary marker back to the real token.
        return [t.replace("<NVL_IMG_CONTEXT>", self.IMG_CONTEXT) for t in text]

    def _preprocess_image(
        self,
        text: list[str],
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> tuple[list[str], dict[str, torch.Tensor]]:
        """Tile the images and expand the <image> placeholders in ``text``.

        Returns the updated prompts together with the flattened pixel
        values and the per-image patch counts.
        """
        if len(images) == 0:
            image_inputs = {}
        else:
            pixel_values_lst = self._images_to_pixel_values_lst(
                images,
                min_dynamic_patch=min_dynamic_patch,
                max_dynamic_patch=max_dynamic_patch,
                dynamic_image_size=dynamic_image_size,
            )
            image_inputs = {
                "pixel_values_flat": torch.cat(pixel_values_lst),
                "image_num_patches": torch.tensor(
                    [len(item) for item in pixel_values_lst]
                ),
            }

            text = self._replace_image_tokens(text, pixel_values_lst)
        return text, image_inputs

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Full replacement text for one image: IMG_START, ``feature_size``
        copies of IMG_CONTEXT, then IMG_END."""
        repl_features = self.IMG_CONTEXT * feature_size
        repl_full = self.IMG_START + repl_features + self.IMG_END

        return PromptUpdateDetails.select_text(repl_full, self.IMG_CONTEXT)
|
||||
|
||||
|
||||
# SigLIP normalization constants (per-channel mean/std consumed by
# build_siglip_transform below).
SIGLIP_MEAN = (0.5, 0.5, 0.5)
SIGLIP_STD = (0.5, 0.5, 0.5)
|
||||
|
||||
|
||||
def build_siglip_transform(input_size: int):
    """Compose the base nemotron_vl transform with SigLIP mean/std
    normalization, as required by the SigLIP vision encoder."""
    base = build_transform(input_size=input_size)
    return T.Compose([base, T.Normalize(mean=SIGLIP_MEAN, std=SIGLIP_STD)])
|
||||
|
||||
|
||||
class LlamaNemotronVLEmbedProcessor(NemotronVLProcessor):
    """
    Processor for LlamaNemotronVL embedding model.

    Inherits from NemotronVLProcessor and specializes it for embedding tasks:
    - Uses SigLIP transform with normalization instead of base transform
    - Uses different image context token (<IMG_CONTEXT> vs <image>)
    """

    IMG_CONTEXT = "<IMG_CONTEXT>"

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        processor_config: dict,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        # Any tiling option the caller did not pin explicitly falls back to
        # processor_config first, then to the model config.
        if min_dynamic_patch is None:
            min_dynamic_patch = processor_config.get(
                "min_input_tiles", getattr(config, "min_dynamic_patch", 1)
            )
        if max_dynamic_patch is None:
            max_dynamic_patch = processor_config.get(
                "max_input_tiles", getattr(config, "max_dynamic_patch", 1)
            )
        if dynamic_image_size is None:
            dynamic_image_size = processor_config.get(
                "dynamic_image_size", getattr(config, "dynamic_image_size", True)
            )

        # image_processor=None makes the parent read use_thumbnail from config.
        super().__init__(
            config=config,
            tokenizer=tokenizer,
            image_processor=None,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )

    def _get_transform(self) -> T.Compose:
        """Use the SigLIP-normalized transform for the embedding model."""
        return build_siglip_transform(input_size=self.image_size)

    def _replace_image_tokens(
        self,
        text: list[str],
        pixel_values_lst: list[torch.Tensor],
    ) -> list[str]:
        """Expand one "<image>" placeholder per image, in order.

        Unlike the parent, no temporary placeholder is required: here
        IMG_CONTEXT is "<IMG_CONTEXT>" rather than "<image>", so the tokens
        just inserted cannot collide with remaining placeholders.
        """
        for pv in pixel_values_lst:
            tile_count = pv.shape[0]
            repl = self.get_image_repl(
                tile_count * self.num_image_token, tile_count
            )
            text = [t.replace("<image>", repl.full, 1) for t in text]
        return text
|
||||
44
vllm/transformers_utils/processors/nvlm_d.py
Normal file
44
vllm/transformers_utils/processors/nvlm_d.py
Normal file
@@ -0,0 +1,44 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# adapted from https://huggingface.co/nvidia/NVLM-D-72B/blob/main/modeling_nvlm_d.py
|
||||
# --------------------------------------------------------
|
||||
# NVLM-D
|
||||
# Copyright (c) 2024 NVIDIA
|
||||
# Licensed under Apache 2.0 License [see LICENSE for details]
|
||||
# --------------------------------------------------------
|
||||
from vllm.multimodal.processing import PromptUpdateDetails
|
||||
|
||||
from .internvl import BaseInternVLProcessor
|
||||
|
||||
# Placeholder token that NVLMProcessor repeats once per image feature slot.
IMG_PAD = "<|vision_pad|>"
|
||||
|
||||
|
||||
class NVLMProcessor(BaseInternVLProcessor):
    """InternVL-style processor specialized for NVLM-D prompt building."""

    @property
    def image_token_id(self) -> int:
        """Token id of the per-patch padding token."""
        return self.tokenizer.get_vocab()[IMG_PAD]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the textual replacement for one image.

        Each tile gets a positional identifier followed by
        ``feature_size // num_patches`` copies of IMG_PAD; when thumbnails
        are enabled, a global-thumbnail identifier is appended as well.
        """
        if num_patches is None:
            raise NotImplementedError("Embedding inputs are not supported")

        identifiers = [f"<tile_{n}>" for n in range(1, num_patches)]
        if self.use_thumbnail:
            identifiers.append("<tile_global_thumbnail>")

        per_tile_pad = IMG_PAD * (feature_size // num_patches)
        features = "".join(tag + per_tile_pad for tag in identifiers)

        # We include the start and end as well because "<Image><tile" is
        # tokenized as ["<Image", "><", "tile"], resulting in assertion error
        # when trying to find "<tile" as a subsequence of "<Image><tile"
        repl = "<Image>" + features + "</Image>"

        return PromptUpdateDetails.select_text(repl, IMG_PAD)
|
||||
389
vllm/transformers_utils/processors/skyworkr1v.py
Normal file
389
vllm/transformers_utils/processors/skyworkr1v.py
Normal file
@@ -0,0 +1,389 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py
|
||||
# --------------------------------------------------------
|
||||
# SkyworkR1V
|
||||
# Copyright (c) 2025 Skywork
|
||||
# Licensed under The MIT License [see LICENSE for details]
|
||||
# --------------------------------------------------------
|
||||
|
||||
import torch
|
||||
import torchvision.transforms as T
|
||||
from PIL import Image
|
||||
from transformers import BatchFeature, PretrainedConfig, TensorType
|
||||
|
||||
from vllm.multimodal.image import convert_image_mode
|
||||
from vllm.multimodal.processing import PromptUpdateDetails
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
|
||||
# Special tokens delimiting/filling the image-feature region in the prompt.
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"

# ImageNet channel statistics used by build_transform's normalization step.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
|
||||
|
||||
|
||||
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
|
||||
def build_transform(input_size: int):
    """Build the SkyworkR1V tile-preprocessing pipeline: RGB conversion,
    bicubic resize to an ``input_size`` square, tensor conversion, and
    ImageNet normalization."""
    return T.Compose(
        [
            T.Lambda(lambda img: convert_image_mode(img, "RGB")),
            T.Resize(
                (input_size, input_size),
                interpolation=T.InterpolationMode.BICUBIC,
            ),
            T.ToTensor(),
            T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
        ]
    )
|
||||
|
||||
|
||||
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
|
||||
def find_closest_aspect_ratio(
    aspect_ratio: float,
    target_ratios: list[tuple[int, int]],
    *,
    width: int,
    height: int,
    image_size: int,
) -> tuple[int, int]:
    """Pick the candidate (w, h) ratio whose w/h is nearest ``aspect_ratio``.

    On an exact tie, the later candidate wins only when the source image
    area exceeds half the area of that candidate's tile grid; otherwise
    the earlier match is kept. Defaults to (1, 1) for empty candidates.
    """
    img_area = width * height
    chosen = (1, 1)
    smallest_diff = float("inf")
    for cand in target_ratios:
        w_units, h_units = cand
        diff = abs(aspect_ratio - w_units / h_units)
        if diff < smallest_diff:
            smallest_diff, chosen = diff, cand
        elif (
            diff == smallest_diff
            and img_area > 0.5 * image_size * image_size * w_units * h_units
        ):
            chosen = cand
    return chosen
|
||||
|
||||
|
||||
def resolve_skyworkr1v_min_max_num(
    *,
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool,
    use_thumbnail: bool,
) -> tuple[int, int]:
    """Resolve the effective (min, max) tile counts.

    Dynamic tiling disabled forces both bounds to 1; a thumbnail adds one
    extra slot to the maximum unless the maximum is already 1.
    """
    if not dynamic_image_size:
        min_dynamic_patch = 1
        max_dynamic_patch = 1

    if use_thumbnail and max_dynamic_patch != 1:
        max_dynamic_patch += 1

    return min_dynamic_patch, max_dynamic_patch
|
||||
|
||||
|
||||
def get_skyworkr1v_target_ratios(
    min_num: int,
    max_num: int,
) -> list[tuple[int, int]]:
    """Enumerate all (cols, rows) grids whose tile count falls within
    ``[min_num, max_num]``, sorted by total tile count."""
    ratios = {
        (cols, rows)
        for bound in range(min_num, max_num + 1)
        for cols in range(1, bound + 1)
        for rows in range(1, bound + 1)
        if min_num <= cols * rows <= max_num
    }
    return sorted(ratios, key=lambda rc: rc[0] * rc[1])
|
||||
|
||||
|
||||
def calculate_skyworkr1v_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int]:
    """Choose the tile grid for an image and return
    ``(num_blocks, target_width, target_height)``.

    The grid is the closest aspect-ratio match among ``target_ratios``;
    a thumbnail, when enabled, adds one extra block to any multi-block grid.
    """
    # Closest grid to the source aspect ratio.
    ratio_w, ratio_h = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )

    # Pixel footprint of the chosen grid and its block count.
    target_width = image_size * ratio_w
    target_height = image_size * ratio_h
    blocks = ratio_w * ratio_h

    if use_thumbnail and blocks != 1:
        blocks += 1

    return blocks, target_width, target_height
|
||||
|
||||
|
||||
def dynamic_preprocess_skyworkr1v(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> list[Image.Image]:
    """Split ``image`` into ``image_size``-square tiles following the best
    grid from ``target_ratios``; optionally append a thumbnail tile."""
    src_w, src_h = image.size

    # Grid dimensions are computed without the thumbnail; it is added below.
    num_tiles, grid_w, grid_h = calculate_skyworkr1v_targets(
        orig_width=src_w,
        orig_height=src_h,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )

    resized = image.resize((grid_w, grid_h))
    cols = grid_w // image_size

    tiles = []
    for idx in range(num_tiles):
        col, row = idx % cols, idx // cols
        tiles.append(
            resized.crop(
                (
                    col * image_size,
                    row * image_size,
                    (col + 1) * image_size,
                    (row + 1) * image_size,
                )
            )
        )
    assert len(tiles) == num_tiles

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))

    return tiles
|
||||
|
||||
|
||||
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B
|
||||
def image_to_pixel_values_skyworkr1v(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
) -> torch.Tensor:
    """Tile one image and stack the transformed tiles into a single tensor."""
    to_tensor = build_transform(input_size=input_size)

    tiles = dynamic_preprocess_skyworkr1v(
        image,
        target_ratios=get_skyworkr1v_target_ratios(min_num, max_num),
        image_size=input_size,
        use_thumbnail=use_thumbnail,
    )

    return torch.stack([to_tensor(tile) for tile in tiles])
|
||||
|
||||
|
||||
class SkyworkR1VProcessor:
    """
    This model doesn't define its own HF processor,
    so we implement our own one here.

    The code to insert image tokens is based on:
    https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py#L252
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        super().__init__()

        self.config = config
        self.tokenizer = tokenizer

        image_size: int = config.vision_config.image_size
        patch_size: int = config.vision_config.patch_size

        # Tiling options default to the model config when not overridden.
        if min_dynamic_patch is None:
            min_dynamic_patch = config.min_dynamic_patch
        assert isinstance(min_dynamic_patch, int)

        if max_dynamic_patch is None:
            max_dynamic_patch = config.max_dynamic_patch
        assert isinstance(max_dynamic_patch, int)

        if dynamic_image_size is None:
            dynamic_image_size = config.dynamic_image_size
        assert isinstance(dynamic_image_size, bool)

        # Feature tokens contributed by a single tile after the vision
        # encoder's spatial downsampling.
        self.num_image_token = int(
            (image_size // patch_size) ** 2 * (config.downsample_ratio**2)
        )
        self.image_size = image_size
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch
        self.dynamic_image_size = dynamic_image_size
        self.use_thumbnail: bool = config.use_thumbnail

    @property
    def image_token_id(self) -> int:
        """Token id of the image context token in the tokenizer vocab."""
        return self.tokenizer.get_vocab()[IMG_CONTEXT]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Full replacement text for one image: IMG_START, ``feature_size``
        copies of IMG_CONTEXT, then IMG_END."""
        repl_features = IMG_CONTEXT * feature_size
        repl_full = IMG_START + repl_features + IMG_END

        return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)

    def resolve_min_max_num(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> tuple[int, int]:
        """Resolve effective (min, max) tile counts, defaulting each option
        to the processor's configured value when passed as None."""
        min_dynamic_patch = (
            self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
        )
        max_dynamic_patch = (
            self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
        )
        dynamic_image_size = (
            self.dynamic_image_size
            if dynamic_image_size is None
            else dynamic_image_size
        )
        use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail

        return resolve_skyworkr1v_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )

    def resolve_target_ratios(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> list[tuple[int, int]]:
        """Candidate (cols, rows) tile grids for the resolved tile bounds."""
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )

        return get_skyworkr1v_target_ratios(min_num, max_num)

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        """Number of feature tokens an image of the given size expands to."""
        target_ratios = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
        )

        num_patches, _, _ = calculate_skyworkr1v_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=target_ratios,
            use_thumbnail=self.use_thumbnail,
        )

        return num_patches * self.num_image_token

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Produce one stacked tile-pixel tensor per input image."""
        min_num, max_num = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,  # Applied in image_to_pixel_values
        )

        return [
            image_to_pixel_values_skyworkr1v(
                image,
                input_size=self.image_size,
                min_num=min_num,
                max_num=max_num,
                use_thumbnail=self.use_thumbnail,
            )
            for image in images
        ]

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        return_tensors: str | TensorType | None = None,
    ) -> BatchFeature:
        """Tokenize ``text`` with each <image> placeholder expanded into
        image tokens, and bundle the tokenized prompts with the flattened
        pixel values and per-image patch counts into a BatchFeature."""
        # Normalize both arguments to lists.
        if text is None:
            text = []
        if not isinstance(text, list):
            text = [text]
        if images is None:
            images = []
        if not isinstance(images, list):
            images = [images]

        if len(images) == 0:
            image_inputs = {}
        else:
            pixel_values_lst = self._images_to_pixel_values_lst(
                images,
                min_dynamic_patch=min_dynamic_patch,
                max_dynamic_patch=max_dynamic_patch,
                dynamic_image_size=dynamic_image_size,
            )
            image_inputs = {
                "pixel_values_flat": torch.cat(pixel_values_lst),
                "image_num_patches": torch.tensor(
                    [len(item) for item in pixel_values_lst]
                ),
            }

            # Expand one <image> placeholder per image, in order.
            for pixel_values in pixel_values_lst:
                num_patches = pixel_values.shape[0]
                feature_size = num_patches * self.num_image_token

                image_repl = self.get_image_repl(feature_size, num_patches)

                text = [t.replace("<image>", image_repl.full, 1) for t in text]

        text_inputs = self.tokenizer(text)

        combined_outputs = {**text_inputs, **image_inputs}

        return BatchFeature(combined_outputs, tensor_type=return_tensors)
|
||||
Reference in New Issue
Block a user