[2/3] Refactor InternVL-based processors (#37324)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2026-03-18 22:22:19 +08:00
committed by GitHub
parent 525f2eeb0b
commit 99267c23ca
18 changed files with 815 additions and 1199 deletions

View File

@@ -489,13 +489,14 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self.image_size = self.vision_config.image_size
def __call__(self, text: str, images: Image | list[Image], **kwargs):
from vllm.model_executor.models.h2ovl import (
IMG_CONTEXT,
IMG_END,
IMG_START,
from vllm.transformers_utils.processors.h2ovl import (
image_to_pixel_values_h2ovl,
)
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"
images = [images] if isinstance(images, Image) else images
pixel_values = [
image_to_pixel_values_h2ovl(
@@ -751,16 +752,17 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self.image_size = self.vision_config.image_size
def __call__(self, text: str, images: Image | list[Image], **kwargs):
from vllm.model_executor.models.skyworkr1v import (
IMG_CONTEXT,
IMG_END,
IMG_START,
image_to_pixel_values_skyworkr1v,
from vllm.transformers_utils.processors.internvl import (
image_to_pixel_values_internvl,
)
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"
images = [images] if isinstance(images, Image) else images
pixel_values = [
image_to_pixel_values_skyworkr1v(
image_to_pixel_values_internvl(
image,
input_size=self.image_size,
min_num=self.min_num,
@@ -815,14 +817,15 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
videos: npt.NDArray | list[npt.NDArray] = None,
**kwargs,
):
from vllm.model_executor.models.internvl import (
IMG_CONTEXT,
IMG_END,
IMG_START,
from vllm.transformers_utils.processors.internvl import (
image_to_pixel_values_internvl,
video_to_pixel_values_internvl,
)
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"
images = [images] if isinstance(images, Image) else images
videos = [videos] if isinstance(videos, np.ndarray) else videos
if images is not None:

View File

@@ -779,7 +779,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"rednote-hilab/dots.ocr", trust_remote_code=True
),
"Eagle2_5_VLForConditionalGeneration": _HfExamplesInfo(
"nvidia/Eagle2.5-8B", trust_remote_code=True, is_available_online=False
"nvidia/Eagle2.5-8B",
trust_remote_code=True,
),
"Emu3ForConditionalGeneration": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
"Ernie4_5_VLMoeForConditionalGeneration": _HfExamplesInfo(

View File

@@ -16,7 +16,10 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.models.siglip import SiglipVisionModel
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.processors.eagle2_5_vl import Eagle2_5_VLProcessor
from vllm.transformers_utils.processors.internvl import (
InternVLImageProcessor,
InternVLProcessor,
)
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import (
@@ -68,12 +71,35 @@ Eagle2_5_VLImageInputs: TypeAlias = (
class Eagle2_5_VLProcessingInfo(BaseInternVLProcessingInfo):
"""Processing info for Eagle2.5-VL model."""
def get_hf_processor(self, **kwargs) -> Eagle2_5_VLProcessor:
return self.ctx.init_processor(
Eagle2_5_VLProcessor,
config=self.ctx.get_hf_config(),
def get_image_processor(self, **kwargs):
    """Build the InternVL image processor used for Eagle2.5-VL.

    Caller-supplied ``kwargs`` are first merged through
    ``self.ctx.get_merged_mm_kwargs``; values present after the merge win,
    and only settings that are still missing are filled in from the HF
    config.

    Optional config attributes are read with ``getattr`` fallbacks so that
    a config omitting one of them does not raise ``AttributeError``:
    ``force_image_size`` falls back to ``vision_config.image_size`` when it
    is missing or falsy, and the dynamic-patch / thumbnail settings fall
    back to 1, 12, True and True respectively. These are the same fallback
    values the model-specific processor applied before this refactor.

    Returns:
        An ``InternVLImageProcessor`` constructed from the merged kwargs.
    """
    config = self.get_hf_config()
    vision_config = config.vision_config

    kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
    kwargs.setdefault(
        "image_size",
        getattr(config, "force_image_size", None) or vision_config.image_size,
    )
    kwargs.setdefault("min_dynamic_patch", getattr(config, "min_dynamic_patch", 1))
    kwargs.setdefault("max_dynamic_patch", getattr(config, "max_dynamic_patch", 12))
    kwargs.setdefault(
        "dynamic_image_size", getattr(config, "dynamic_image_size", True)
    )
    kwargs.setdefault("use_thumbnail", getattr(config, "use_thumbnail", True))

    return InternVLImageProcessor(**kwargs)
def get_hf_processor(self, **kwargs) -> InternVLProcessor:
config = self.get_hf_config()
vision_config = config.vision_config
image_processor = self.get_image_processor(**kwargs)
image_size = image_processor.image_size
patch_size = vision_config.patch_size
downsample_ratio = config.downsample_ratio
image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
return InternVLProcessor(
tokenizer=self.get_tokenizer(),
**kwargs,
image_processor=image_processor,
image_seq_length=image_seq_length,
)

View File

@@ -395,13 +395,13 @@ class GLM4VProcessingInfo(BaseProcessingInfo):
vision_config = config.vision_config
image_size = vision_config["image_size"]
kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
kwargs.setdefault("size", {"width": image_size, "height": image_size})
return GLM4VImageProcessorFast(**kwargs)
def get_hf_processor(self, **kwargs: object) -> GLM4VProcessor:
return self.ctx.init_processor(
GLM4VProcessor,
return GLM4VProcessor(
tokenizer=self.get_tokenizer(),
image_processor=self.get_image_processor(**kwargs),
)

View File

@@ -28,7 +28,7 @@ from vllm.multimodal.processing.processor import (
PromptUpdate,
TimingContext,
)
from vllm.transformers_utils.processors.h2ovl import H2OVLProcessor
from vllm.transformers_utils.processors.h2ovl import H2OVLImageProcessor, H2OVLProcessor
from .intern_vit import InternVisionModel
from .internvl import (
@@ -40,12 +40,34 @@ from .internvl import (
class H2OVLProcessingInfo(BaseInternVLProcessingInfo):
def get_image_processor(self, **kwargs):
    """Create the H2OVL image processor.

    ``kwargs`` are merged via ``self.ctx.get_merged_mm_kwargs``; every
    option that is still unset afterwards is taken from the HF config
    (``image_size`` from its ``vision_config``, the rest from the top-level
    config).

    Returns:
        An ``H2OVLImageProcessor`` built from the resulting options.
    """
    hf_config = self.get_hf_config()
    vit_config = hf_config.vision_config

    merged = self.ctx.get_merged_mm_kwargs(kwargs)

    # (option name, value to use when the caller did not provide one)
    config_defaults = (
        ("image_size", vit_config.image_size),
        ("min_dynamic_patch", hf_config.min_dynamic_patch),
        ("max_dynamic_patch", hf_config.max_dynamic_patch),
        ("dynamic_image_size", hf_config.dynamic_image_size),
        ("use_thumbnail", hf_config.use_thumbnail),
        ("use_msac", hf_config.use_msac),
    )
    for option, fallback in config_defaults:
        merged.setdefault(option, fallback)

    return H2OVLImageProcessor(**merged)
def get_hf_processor(self, **kwargs: object) -> H2OVLProcessor:
return self.ctx.init_processor(
H2OVLProcessor,
config=self.get_hf_config(),
config = self.get_hf_config()
vision_config = config.vision_config
image_processor = self.get_image_processor(**kwargs)
image_size = image_processor.image_size
patch_size = vision_config.patch_size
downsample_ratio = config.downsample_ratio
image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
return H2OVLProcessor(
tokenizer=self.get_tokenizer(),
**kwargs,
image_processor=image_processor,
image_seq_length=image_seq_length,
)
def get_num_image_tokens(
@@ -106,7 +128,7 @@ class H2OVLMultiModalProcessor(BaseInternVLMultiModalProcessor[H2OVLProcessingIn
if num_patches is not None:
assert isinstance(num_patches, int)
return hf_processor.get_image_repl(feature_size, num_patches)
return hf_processor.get_image_repl(num_patches, num_features=feature_size)
return [
PromptReplacement(

View File

@@ -9,6 +9,7 @@
# --------------------------------------------------------
from abc import abstractmethod
from collections.abc import Iterable, Mapping, Sequence
from functools import cached_property
from typing import Annotated, Literal, TypeAlias, TypeVar
import torch
@@ -45,8 +46,9 @@ from vllm.multimodal.processing import (
)
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.processors.internvl import (
BaseInternVLProcessor,
InternVLImageProcessor,
InternVLProcessor,
InternVLVideoProcessor,
)
from vllm.utils.tensor_schema import TensorSchema, TensorShape
@@ -123,7 +125,7 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
"""Basic image-only ProcessingInfo for InternVL-style models."""
@abstractmethod
def get_hf_processor(self, **kwargs: object) -> BaseInternVLProcessor:
def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
raise NotImplementedError
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
@@ -134,7 +136,7 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
*,
image_width: int,
image_height: int,
processor: BaseInternVLProcessor,
processor: InternVLProcessor,
) -> int:
return processor.get_num_image_tokens(
image_width=image_width,
@@ -143,8 +145,9 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
def get_image_size_with_most_features(self) -> ImageSize:
processor = self.get_hf_processor()
image_processor = processor.image_processor
base_size = processor.image_size
base_size = image_processor.image_size
target_ratios = processor.resolve_target_ratios()
largest_feature_size, largest_feature_pinpoint = 0, None
@@ -226,7 +229,7 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
)
hf_processor = self.info.get_hf_processor(**mm_kwargs)
image_token_id = hf_processor.image_token_id
image_token_id = hf_processor.ctx_image_token_id
# Since there may be extra tokens in the feature placeholders,
# we need to pass the image token ID to the model to select the
@@ -291,7 +294,7 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
if num_patches is not None:
assert isinstance(num_patches, int)
return hf_processor.get_image_repl(feature_size, num_patches)
return hf_processor.get_image_repl(num_patches, num_features=feature_size)
return [
PromptReplacement(
@@ -305,23 +308,73 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
class InternVLProcessingInfo(BaseInternVLProcessingInfo):
"""InternVL ProcessingInfo extended for video processing"""
@property
def supports_video(self):
return self.get_hf_processor().supports_video
def get_image_processor(self, **kwargs):
config = self.get_hf_config()
vision_config = config.vision_config
def get_supported_mm_limits(self):
video_limit = {"video": None} if self.supports_video else {}
return {**super().get_supported_mm_limits(), **video_limit}
kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
kwargs.setdefault("image_size", vision_config.image_size)
kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch)
kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch)
kwargs.setdefault("dynamic_image_size", config.dynamic_image_size)
kwargs.setdefault("use_thumbnail", config.use_thumbnail)
def get_video_token(self) -> str | None:
return InternVLImageProcessor(**kwargs)
def get_video_processor(self, **kwargs):
    """Create the InternVL video processor.

    ``kwargs`` are merged via ``self.ctx.get_merged_mm_kwargs``; if the
    merged options carry no ``image_size``, the vision config's
    ``image_size`` is used.

    Returns:
        An ``InternVLVideoProcessor`` built from the merged options.
    """
    hf_config = self.get_hf_config()
    vit_config = hf_config.vision_config

    merged = self.ctx.get_merged_mm_kwargs(kwargs)

    # Read the config value up front so it is looked up unconditionally,
    # then apply it only when the caller left the option unset.
    config_image_size = vit_config.image_size
    if "image_size" not in merged:
        merged["image_size"] = config_image_size

    return InternVLVideoProcessor(**merged)
@cached_property
def ctx_video_token(self):
text_model_type = self.get_hf_config().get_text_config().model_type
video_token_map = {
ctx_video_token_map = {
"qwen2": "<|video_pad|>",
"qwen3": "<|video_pad|>",
"qwen3_moe": "<|video_pad|>",
"gpt_oss": "<|reserved_200000|>",
}
return video_token_map.get(text_model_type)
if text_model_type not in ctx_video_token_map:
return None
ctx_video_token = ctx_video_token_map[text_model_type]
if ctx_video_token not in self.get_tokenizer().get_vocab():
return None
return ctx_video_token
def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
    """Assemble the ``InternVLProcessor`` for this model.

    The image processor is always built from ``kwargs``. A video processor
    is built (from the same ``kwargs``) only when ``self.ctx_video_token``
    is truthy; otherwise ``None`` is passed.

    ``image_seq_length`` is derived as
    ``int((image_size // patch_size) ** 2 * downsample_ratio ** 2)`` using
    the image processor's ``image_size``, the vision config's
    ``patch_size`` and the config's ``downsample_ratio``.
    """
    hf_config = self.get_hf_config()
    vit_config = hf_config.vision_config

    img_proc = self.get_image_processor(**kwargs)

    # Patch positions along one side of the (square) processor input.
    patches_per_side = img_proc.image_size // vit_config.patch_size
    seq_length = int(patches_per_side**2 * (hf_config.downsample_ratio**2))

    video_token = self.ctx_video_token
    if video_token:
        vid_proc = self.get_video_processor(**kwargs)
    else:
        vid_proc = None

    return InternVLProcessor(
        tokenizer=self.get_tokenizer(),
        image_processor=img_proc,
        video_processor=vid_proc,
        image_seq_length=seq_length,
        ctx_video_token=video_token,
    )
def get_supported_mm_limits(self):
    """Return the parent's modality limits, plus ``"video": None`` when
    ``self.ctx_video_token`` is truthy."""
    # Resolve the video flag before consulting the parent, as the limits
    # dict is only extended when a video context token is available.
    has_video = bool(self.ctx_video_token)

    limits = dict(super().get_supported_mm_limits())
    if has_video:
        limits["video"] = None
    return limits
def get_num_frames_with_most_features(
self,
@@ -332,22 +385,14 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo):
max_videos = mm_counts.get("video", 0)
processor = self.get_hf_processor()
num_image_token = processor.image_seq_length
max_image_tokens = self.get_max_image_tokens() * max_images
max_total_frames = (seq_len - max_image_tokens) // processor.num_image_token
max_total_frames = (seq_len - max_image_tokens) // num_image_token
max_frames_per_video = max_total_frames // max(max_videos, 1)
return max(max_frames_per_video, 1)
def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
return self.ctx.init_processor(
InternVLProcessor,
config=self.get_hf_config(),
tokenizer=self.get_tokenizer(),
video_token=self.get_video_token(),
**kwargs,
)
class InternVLDummyInputsBuilder(
BaseInternVLDummyInputsBuilder[InternVLProcessingInfo]
@@ -366,7 +411,7 @@ class InternVLDummyInputsBuilder(
mm_options: Mapping[str, BaseDummyOptions],
) -> MultiModalDataDict:
dummy_image = super().get_dummy_mm_data(seq_len, mm_counts, mm_options)
if self.info.supports_video:
if self.info.ctx_video_token:
config = self.info.get_hf_config()
image_size: int = config.vision_config.image_size
target_num_frames = self.info.get_num_frames_with_most_features(
@@ -405,11 +450,9 @@ class InternVLMultiModalProcessor(
)
hf_processor = self.info.get_hf_processor(**mm_kwargs)
if (
self.info.supports_video
and (video_token_id := hf_processor.video_token_id) is not None
):
if (video_token_id := hf_processor.ctx_video_token_id) is not None:
processed_outputs["video_token_id"] = torch.tensor(video_token_id)
return processed_outputs
def _get_mm_fields_config(
@@ -418,7 +461,7 @@ class InternVLMultiModalProcessor(
hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]:
image_fields = super()._get_mm_fields_config(hf_inputs, hf_processor_mm_kwargs)
if self.info.supports_video:
if self.info.ctx_video_token:
video_num_patches = hf_inputs.get("video_num_patches", torch.empty(0))
num_videos = len(video_num_patches)
video_fields = dict(
@@ -444,6 +487,8 @@ class InternVLMultiModalProcessor(
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
out_mm_kwargs=out_mm_kwargs,
)
if self.info.ctx_video_token is None:
return prompt_repl
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
@@ -456,26 +501,20 @@ class InternVLMultiModalProcessor(
video_num_patches = []
def get_video_replacement_internvl(item_idx: int):
feature_size = hf_processor.num_image_token
num_patches = video_num_patches[item_idx]
if num_patches is not None:
assert isinstance(num_patches, int)
return hf_processor.get_video_repl(
feature_size, num_patches, video_context_token=hf_processor.video_token
)
return hf_processor.get_video_repl(num_patches)
if self.info.supports_video:
prompt_repl = [
*prompt_repl,
PromptReplacement(
modality="video",
target="<video>",
replacement=get_video_replacement_internvl,
),
]
return prompt_repl
return [
*prompt_repl,
PromptReplacement(
modality="video",
target="<video>",
replacement=get_video_replacement_internvl,
),
]
@MULTIMODAL_REGISTRY.register_processor(

View File

@@ -26,8 +26,10 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.processor import cached_image_processor_from_config
from vllm.transformers_utils.processors.nemotron_vl import (
LlamaNemotronNanoVLImageProcessor,
LlamaNemotronNanoVLProcessor,
LlamaNemotronVLEmbedImageProcessor,
LlamaNemotronVLEmbedProcessor,
NemotronVLProcessor,
)
from vllm.transformers_utils.repo_utils import get_hf_file_to_dict
@@ -50,19 +52,34 @@ from .utils import (
class NemotronVLProcessingInfo(BaseInternVLProcessingInfo):
"""Processing info for Nemotron VL models."""
def get_hf_processor(self, **kwargs: object) -> NemotronVLProcessor:
return self.ctx.init_processor(
NemotronVLProcessor,
config=self.get_hf_config(),
tokenizer=self.get_tokenizer(),
image_processor=self.get_image_processor(),
**kwargs,
def get_image_processor(self, **kwargs: object):
kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
orig_processor = cached_image_processor_from_config(
self.ctx.model_config, **kwargs
)
def get_image_processor(self, **kwargs: object):
return cached_image_processor_from_config(
self.ctx.model_config,
**kwargs,
return LlamaNemotronNanoVLImageProcessor(
image_size=orig_processor.image_size,
min_dynamic_patch=1,
max_dynamic_patch=orig_processor.max_num_tiles,
dynamic_image_size=True,
use_thumbnail=orig_processor.use_thumbnail,
)
def get_hf_processor(self, **kwargs: object) -> LlamaNemotronNanoVLProcessor:
    """Assemble the ``LlamaNemotronNanoVLProcessor`` for this model.

    The image processor comes from ``self.get_image_processor(**kwargs)``.
    ``image_seq_length`` is derived as
    ``int((image_size // patch_size) ** 2 * downsample_ratio ** 2)`` using
    the image processor's ``image_size``, the vision config's
    ``patch_size`` and the config's ``downsample_ratio``.
    """
    hf_config = self.get_hf_config()
    vit_config = hf_config.vision_config

    img_proc = self.get_image_processor(**kwargs)

    # Patch positions along one side of the (square) processor input.
    patches_per_side = img_proc.image_size // vit_config.patch_size
    seq_length = int(patches_per_side**2 * (hf_config.downsample_ratio**2))

    return LlamaNemotronNanoVLProcessor(
        tokenizer=self.get_tokenizer(),
        image_processor=img_proc,
        image_seq_length=seq_length,
    )
@@ -386,29 +403,58 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, Suppor
# --------------------------------------------------------
class LlamaNemotronVLEmbedProcessingInfo(NemotronVLProcessingInfo):
class LlamaNemotronVLEmbedProcessingInfo(BaseInternVLProcessingInfo):
"""Processing info for LlamaNemotronVL embedding model."""
def get_hf_processor(self, **kwargs: object) -> LlamaNemotronVLEmbedProcessor:
"""Override to create embedding-specific processor without image_processor."""
def get_image_processor(self, **kwargs):
model_config = self.ctx.model_config
processor_config = {}
if model_config.model is not None:
processor_config = (
get_hf_file_to_dict(
"processor_config.json",
model_config.model,
model_config.revision,
)
or {}
)
return self.ctx.init_processor(
LlamaNemotronVLEmbedProcessor,
config=self.get_hf_config(),
config = self.get_hf_config()
processor_config = (
get_hf_file_to_dict(
"processor_config.json",
model_config.model,
model_config.revision,
)
or {}
)
min_dynamic_patch = processor_config.get(
"min_input_tiles",
getattr(config, "min_dynamic_patch", 1),
)
max_dynamic_patch = processor_config.get(
"max_input_tiles",
getattr(config, "max_dynamic_patch", 1),
)
dynamic_image_size = processor_config.get(
"dynamic_image_size",
getattr(config, "dynamic_image_size", True),
)
kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
kwargs.setdefault("image_size", config.force_image_size)
kwargs.setdefault("min_dynamic_patch", min_dynamic_patch)
kwargs.setdefault("max_dynamic_patch", max_dynamic_patch)
kwargs.setdefault("dynamic_image_size", dynamic_image_size)
kwargs.setdefault("use_thumbnail", True)
return LlamaNemotronVLEmbedImageProcessor(**kwargs)
def get_hf_processor(self, **kwargs: object) -> LlamaNemotronVLEmbedProcessor:
config = self.get_hf_config()
vision_config = config.vision_config
image_processor = self.get_image_processor(**kwargs)
image_size = image_processor.image_size
patch_size = vision_config.patch_size
downsample_ratio = config.downsample_ratio
image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
return LlamaNemotronVLEmbedProcessor(
tokenizer=self.get_tokenizer(),
processor_config=processor_config,
**kwargs,
image_processor=image_processor,
image_seq_length=image_seq_length,
)

View File

@@ -27,7 +27,8 @@ from vllm.multimodal.processing import (
PromptUpdate,
PromptUpdateDetails,
)
from vllm.transformers_utils.processors.nvlm_d import IMG_PAD, NVLMProcessor
from vllm.transformers_utils.processors.internvl import InternVLImageProcessor
from vllm.transformers_utils.processors.nvlm_d import NVLMProcessor
from .intern_vit import InternVisionModel
from .internvl import (
@@ -39,12 +40,33 @@ from .internvl import (
class NVLMProcessingInfo(BaseInternVLProcessingInfo):
def get_image_processor(self, **kwargs):
    """Create the image processor for NVLM.

    ``kwargs`` are merged via ``self.ctx.get_merged_mm_kwargs``; every
    option that is still unset afterwards is taken from the HF config
    (``image_size`` from its ``vision_config``, the rest from the top-level
    config).

    Returns:
        An ``InternVLImageProcessor`` built from the resulting options.
    """
    hf_config = self.get_hf_config()
    vit_config = hf_config.vision_config

    merged = self.ctx.get_merged_mm_kwargs(kwargs)

    # (option name, value to use when the caller did not provide one)
    config_defaults = (
        ("image_size", vit_config.image_size),
        ("min_dynamic_patch", hf_config.min_dynamic_patch),
        ("max_dynamic_patch", hf_config.max_dynamic_patch),
        ("dynamic_image_size", hf_config.dynamic_image_size),
        ("use_thumbnail", hf_config.use_thumbnail),
    )
    for option, fallback in config_defaults:
        merged.setdefault(option, fallback)

    return InternVLImageProcessor(**merged)
def get_hf_processor(self, **kwargs: object) -> NVLMProcessor:
return self.ctx.init_processor(
NVLMProcessor,
config=self.get_hf_config(),
config = self.get_hf_config()
vision_config = config.vision_config
image_processor = self.get_image_processor(**kwargs)
image_size = image_processor.image_size
patch_size = vision_config.patch_size
downsample_ratio = config.downsample_ratio
image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
return NVLMProcessor(
tokenizer=self.get_tokenizer(),
**kwargs,
image_processor=image_processor,
image_seq_length=image_seq_length,
)
@@ -117,9 +139,11 @@ class NVLMMultiModalProcessor(BaseInternVLMultiModalProcessor[NVLMProcessingInfo
if num_patches is not None:
assert isinstance(num_patches, int)
repl = hf_processor.get_image_repl(feature_size, num_patches)
repl = hf_processor.get_image_repl(num_patches, num_features=feature_size)
return PromptUpdateDetails.select_text(repl.full + "\n", IMG_PAD)
return PromptUpdateDetails.select_text(
repl.full + "\n", hf_processor.ctx_image_token
)
# See note in dummy data regarding why we have the extra newline
return [

View File

@@ -440,13 +440,13 @@ class QwenVLProcessingInfo(BaseProcessingInfo):
vision_config = config.visual
image_size = vision_config["image_size"]
kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
kwargs.setdefault("size", {"width": image_size, "height": image_size})
return QwenVLImageProcessorFast(**kwargs)
def get_hf_processor(self, **kwargs: object) -> QwenVLProcessor:
return self.ctx.init_processor(
QwenVLProcessor,
return QwenVLProcessor(
tokenizer=self.get_tokenizer(),
image_processor=self.get_image_processor(**kwargs),
)

View File

@@ -43,7 +43,10 @@ from vllm.multimodal.processing import (
PromptUpdate,
)
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.processors.skyworkr1v import SkyworkR1VProcessor
from vllm.transformers_utils.processors.internvl import (
InternVLImageProcessor,
InternVLProcessor,
)
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
@@ -96,12 +99,33 @@ SkyworkR1VImageInputs: TypeAlias = (
class SkyworkR1VProcessingInfo(BaseProcessingInfo):
def get_hf_processor(self, **kwargs: object) -> SkyworkR1VProcessor:
return self.ctx.init_processor(
SkyworkR1VProcessor,
config=self.get_hf_config(),
def get_image_processor(self, **kwargs):
    """Create the image processor for Skywork-R1V.

    ``kwargs`` are merged via ``self.ctx.get_merged_mm_kwargs``; every
    option that is still unset afterwards is taken from the HF config
    (``image_size`` from its ``vision_config``, the rest from the top-level
    config).

    Returns:
        An ``InternVLImageProcessor`` built from the resulting options.
    """
    hf_config = self.get_hf_config()
    vit_config = hf_config.vision_config

    merged = self.ctx.get_merged_mm_kwargs(kwargs)

    # (option name, value to use when the caller did not provide one)
    config_defaults = (
        ("image_size", vit_config.image_size),
        ("min_dynamic_patch", hf_config.min_dynamic_patch),
        ("max_dynamic_patch", hf_config.max_dynamic_patch),
        ("dynamic_image_size", hf_config.dynamic_image_size),
        ("use_thumbnail", hf_config.use_thumbnail),
    )
    for option, fallback in config_defaults:
        merged.setdefault(option, fallback)

    return InternVLImageProcessor(**merged)
def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
config = self.get_hf_config()
vision_config = config.vision_config
image_processor = self.get_image_processor(**kwargs)
image_size = image_processor.image_size
patch_size = vision_config.patch_size
downsample_ratio = config.downsample_ratio
image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
return InternVLProcessor(
tokenizer=self.get_tokenizer(),
**kwargs,
image_processor=image_processor,
image_seq_length=image_seq_length,
)
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
@@ -112,7 +136,7 @@ class SkyworkR1VProcessingInfo(BaseProcessingInfo):
*,
image_width: int,
image_height: int,
processor: SkyworkR1VProcessor,
processor: InternVLProcessor,
) -> int:
return processor.get_num_image_tokens(
image_width=image_width,
@@ -121,8 +145,9 @@ class SkyworkR1VProcessingInfo(BaseProcessingInfo):
def get_image_size_with_most_features(self) -> ImageSize:
processor = self.get_hf_processor()
image_processor = processor.image_processor
base_size = processor.image_size
base_size = image_processor.image_size
target_ratios = processor.resolve_target_ratios()
largest_feature_size, largest_feature_pinpoint = 0, None
@@ -187,7 +212,7 @@ class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[SkyworkR1VProcessing
)
hf_processor = self.info.get_hf_processor(**mm_kwargs)
image_token_id = hf_processor.image_token_id
image_token_id = hf_processor.ctx_image_token_id
# Since there may be extra tokens in the feature placeholders,
# we need to pass the image token ID to the model to select the
@@ -252,7 +277,7 @@ class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[SkyworkR1VProcessing
if num_patches is not None:
assert isinstance(num_patches, int)
return hf_processor.get_image_repl(feature_size, num_patches)
return hf_processor.get_image_repl(num_patches, num_features=feature_size)
return [
PromptReplacement(

View File

@@ -14,7 +14,6 @@ __all__ = [
"BagelProcessor",
"CohereASRProcessor",
"DeepseekVLV2Processor",
"Eagle2_5_VLProcessor",
"FireRedASR2Processor",
"FunASRProcessor",
"GLM4VProcessor",
@@ -34,14 +33,12 @@ __all__ = [
"Ovis2_5Processor",
"QwenVLProcessor",
"Qwen3ASRProcessor",
"SkyworkR1VProcessor",
]
_CLASS_TO_MODULE: dict[str, str] = {
"BagelProcessor": "vllm.transformers_utils.processors.bagel",
"CohereASRProcessor": "vllm.transformers_utils.processors.cohere_asr",
"DeepseekVLV2Processor": "vllm.transformers_utils.processors.deepseek_vl2",
"Eagle2_5_VLProcessor": "vllm.transformers_utils.processors.eagle2_5_vl",
"FireRedASR2Processor": "vllm.transformers_utils.processors.fireredasr2",
"FunASRProcessor": "vllm.transformers_utils.processors.funasr",
"GLM4VProcessor": "vllm.transformers_utils.processors.glm4v",
@@ -61,7 +58,6 @@ _CLASS_TO_MODULE: dict[str, str] = {
"Ovis2_5Processor": "vllm.transformers_utils.processors.ovis2_5",
"QwenVLProcessor": "vllm.transformers_utils.processors.qwen_vl",
"Qwen3ASRProcessor": "vllm.transformers_utils.processors.qwen3_asr",
"SkyworkR1VProcessor": "vllm.transformers_utils.processors.skyworkr1v",
}

View File

@@ -1,85 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from NVIDIA Eagle2.5-VL model
# https://huggingface.co/nvidia/Eagle2.5-8B
from transformers import PretrainedConfig
from vllm.multimodal.processing import PromptUpdateDetails
from vllm.tokenizers import TokenizerLike
from .internvl import IMG_CONTEXT, IMG_END, IMG_START, BaseInternVLProcessor
class Eagle2_5_VLProcessor(BaseInternVLProcessor):
    """
    Processor for the Eagle2.5-VL model.

    Derives from BaseInternVLProcessor but supplies its own attribute
    setup, image-token lookup and image placeholder string.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        """Populate the processor attributes straight from ``config``.

        Args:
            config: HF model config; must expose ``vision_config`` with
                ``image_size`` and ``patch_size``.
            tokenizer: Tokenizer used for the vocabulary lookup in
                ``image_token_id``.
            min_dynamic_patch: Overrides the config value (default 1).
            max_dynamic_patch: Overrides the config value (default 12).
            dynamic_image_size: Overrides the config value (default True).
        """
        # The parent constructor is deliberately not invoked (to avoid its
        # config manipulation); every required attribute is set here.
        self.config = config
        self.tokenizer = tokenizer

        # A truthy ``force_image_size`` on the config replaces the vision
        # tower's own image size.
        image_size: int = config.vision_config.image_size
        forced_size = getattr(config, "force_image_size", None)
        if forced_size:
            image_size = forced_size

        patch_size: int = config.vision_config.patch_size
        downsample_ratio: float = getattr(config, "downsample_ratio", 0.5)

        patches_per_side = image_size // patch_size
        self.num_image_token = int(patches_per_side**2 * (downsample_ratio**2))
        self.image_size = image_size

        # Explicit arguments win; otherwise use the config value, and a
        # hard-coded default if the config does not define it.
        if min_dynamic_patch is None:
            min_dynamic_patch = getattr(config, "min_dynamic_patch", 1)
        if max_dynamic_patch is None:
            max_dynamic_patch = getattr(config, "max_dynamic_patch", 12)
        if dynamic_image_size is None:
            dynamic_image_size = getattr(config, "dynamic_image_size", True)

        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch
        self.dynamic_image_size = dynamic_image_size
        self.use_thumbnail: bool = getattr(config, "use_thumbnail", True)

    @property
    def image_token_id(self) -> int:
        """Image token ID: ``config.image_token_index`` when defined,
        otherwise the tokenizer's ID for ``IMG_CONTEXT``.

        Raises:
            ValueError: If neither source provides the token.
        """
        if hasattr(self.config, "image_token_index"):
            return self.config.image_token_index

        # No index on the config: look <IMG_CONTEXT> up in the tokenizer
        # vocabulary (noted upstream as ID 151667).
        vocab = self.tokenizer.get_vocab()
        if IMG_CONTEXT not in vocab:
            raise ValueError(f"Cannot find image token '{IMG_CONTEXT}' in vocabulary")
        return vocab[IMG_CONTEXT]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """Build the prompt replacement text for one image.

        The text is ``IMG_START``, then ``IMG_CONTEXT`` repeated
        ``feature_size`` times, then ``IMG_END``; ``IMG_CONTEXT`` is the
        text passed to ``PromptUpdateDetails.select_text`` for selection.
        ``num_patches`` is accepted but not used here.
        """
        placeholder = "".join((IMG_START, IMG_CONTEXT * feature_size, IMG_END))
        return PromptUpdateDetails.select_text(placeholder, IMG_CONTEXT)

View File

@@ -10,16 +10,12 @@
# --------------------------------------------------------
import torch
from PIL import Image
from transformers import PretrainedConfig
from vllm.multimodal.processing import PromptUpdateDetails
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.hf import HfTokenizer
from .internvl import (
IMG_CONTEXT,
IMG_END,
IMG_START,
BaseInternVLProcessor,
InternVLImageProcessor,
InternVLProcessor,
build_transform,
find_closest_aspect_ratio,
get_internvl_target_ratios,
@@ -217,45 +213,26 @@ def image_to_pixel_values_h2ovl(
return pixel_values
class H2OVLProcessor(BaseInternVLProcessor):
class H2OVLImageProcessor(InternVLImageProcessor):
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
use_msac: bool | None = None,
image_size: int,
min_dynamic_patch: int,
max_dynamic_patch: int,
dynamic_image_size: bool,
use_thumbnail: bool,
use_msac: bool,
) -> None:
super().__init__(
config,
tokenizer,
image_size=image_size,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
use_thumbnail=use_thumbnail,
)
if use_msac is None:
use_msac = config.use_msac
assert isinstance(use_msac, bool)
self.use_msac = use_msac
@property
def image_token_id(self) -> int:
return self.tokenizer.get_vocab()[IMG_CONTEXT]
def get_image_repl(
self,
feature_size: int,
num_patches: int | None,
) -> PromptUpdateDetails[str]:
repl_features = IMG_CONTEXT * feature_size
repl_full = IMG_START + repl_features + IMG_END
return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
def resolve_min_max_num(
self,
*,
@@ -264,18 +241,14 @@ class H2OVLProcessor(BaseInternVLProcessor):
dynamic_image_size: bool | None = None,
use_thumbnail: bool | None = None,
) -> tuple[int, int]:
min_dynamic_patch = (
self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
)
max_dynamic_patch = (
self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
)
dynamic_image_size = (
self.dynamic_image_size
if dynamic_image_size is None
else dynamic_image_size
)
use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
if min_dynamic_patch is None:
min_dynamic_patch = self.min_dynamic_patch
if max_dynamic_patch is None:
max_dynamic_patch = self.max_dynamic_patch
if dynamic_image_size is None:
dynamic_image_size = self.dynamic_image_size
if use_thumbnail is None:
use_thumbnail = self.use_thumbnail
return resolve_h2ovl_min_max_num(
min_dynamic_patch=min_dynamic_patch,
@@ -284,83 +257,6 @@ class H2OVLProcessor(BaseInternVLProcessor):
use_thumbnail=use_thumbnail,
)
def resolve_target_ratios(
    self,
    *,
    min_dynamic_patch: int | None = None,
    max_dynamic_patch: int | None = None,
    dynamic_image_size: bool | None = None,
    use_thumbnail: bool | None = None,
    prior_aspect_ratio: tuple[int, int] | None = None,
    override_min_num: int | None = None,
) -> list[tuple[int, int]]:
    """Return the candidate tiling ratios from ``get_h2ovl_target_ratios``.

    The min/max tile counts come from ``self.resolve_min_max_num`` using
    the four patch/thumbnail arguments. If ``override_min_num`` is given
    it replaces the resolved minimum. ``prior_aspect_ratio`` is forwarded
    unchanged.
    """
    resolved_min, resolved_max = self.resolve_min_max_num(
        min_dynamic_patch=min_dynamic_patch,
        max_dynamic_patch=max_dynamic_patch,
        dynamic_image_size=dynamic_image_size,
        use_thumbnail=use_thumbnail,
    )
    lower_bound = resolved_min if override_min_num is None else override_min_num

    return get_h2ovl_target_ratios(
        lower_bound,
        resolved_max,
        prior_aspect_ratio=prior_aspect_ratio,
    )
def get_num_image_tokens(
    self,
    *,
    image_width: int,
    image_height: int,
    use_msac: bool | None = None,
) -> int:
    """Return the number of image tokens for an image of the given size.

    Args:
        image_width: Original image width.
        image_height: Original image height.
        use_msac: Overrides ``self.use_msac`` when not ``None``.

    Returns:
        The patch count reported by ``calculate_h2ovl_targets`` multiplied
        by ``self.num_image_token``.
    """
    if use_msac is None:
        use_msac = self.use_msac
    thumbnail_setting = self.use_thumbnail

    def run_targets(ratios, with_thumbnail):
        # Shared call shape for every calculate_h2ovl_targets invocation.
        return calculate_h2ovl_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=ratios,
            use_thumbnail=with_thumbnail,
        )

    if not use_msac:
        plain_ratios = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
        )
        patch_total, _, _, _ = run_targets(plain_ratios, thumbnail_setting)
        return patch_total * self.num_image_token

    # First pass: lower bound of 1, thumbnail always on.
    first_ratios = self.resolve_target_ratios(
        use_thumbnail=False,  # Applied in calculate_targets
        override_min_num=1,
    )
    first_count, _, _, first_aspect = run_targets(first_ratios, True)

    # Second pass: lower bound of 3, passing the first pass' aspect ratio
    # as ``prior_aspect_ratio``.
    second_ratios = self.resolve_target_ratios(
        use_thumbnail=False,  # Applied in calculate_targets
        prior_aspect_ratio=first_aspect,
        override_min_num=3,
    )
    second_count, _, _, _ = run_targets(second_ratios, True)

    # NOTE(review): the "- 1" presumably avoids counting the thumbnail in
    # both passes -- confirm against calculate_h2ovl_targets.
    patch_total = first_count + second_count - 1
    return patch_total * self.num_image_token
def _images_to_pixel_values_lst(
self,
images: list[Image.Image],
@@ -388,3 +284,104 @@ class H2OVLProcessor(BaseInternVLProcessor):
)
for image in images
]
class H2OVLProcessor(InternVLProcessor):
    """``InternVLProcessor`` variant whose tiling uses the H2OVL helpers.

    Target ratios come from ``get_h2ovl_target_ratios`` and patch counts from
    ``calculate_h2ovl_targets``; image settings are read from
    ``self.image_processor``.
    """

    def __init__(
        self,
        image_processor: H2OVLImageProcessor,
        tokenizer: HfTokenizer,
        *,
        image_seq_length: int,
        start_image_token: str = "<img>",
        end_image_token: str = "</img>",
        ctx_image_token: str = "<IMG_CONTEXT>",
    ) -> None:
        """Forward every argument unchanged to ``InternVLProcessor``."""
        base_kwargs = {
            "image_processor": image_processor,
            "tokenizer": tokenizer,
            "image_seq_length": image_seq_length,
            "start_image_token": start_image_token,
            "end_image_token": end_image_token,
            "ctx_image_token": ctx_image_token,
        }
        super().__init__(**base_kwargs)

        # Annotation only: narrows the attribute type for type checkers.
        self.image_processor: H2OVLImageProcessor

    def resolve_target_ratios(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
        prior_aspect_ratio: tuple[int, int] | None = None,
        override_min_num: int | None = None,
    ) -> list[tuple[int, int]]:
        """Return the candidate tiling ratios for the given patch settings.

        Bounds are resolved by ``self.image_processor.resolve_min_max_num``;
        ``override_min_num`` replaces the resolved lower bound when given.
        """
        resolved_min, resolved_max = self.image_processor.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )
        lower_bound = resolved_min if override_min_num is None else override_min_num

        return get_h2ovl_target_ratios(
            lower_bound,
            resolved_max,
            prior_aspect_ratio=prior_aspect_ratio,
        )

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
        use_msac: bool | None = None,
    ) -> int:
        """Return the number of image tokens for an image of the given size.

        Args:
            image_width: Original image width.
            image_height: Original image height.
            use_msac: Overrides ``self.image_processor.use_msac`` when not
                ``None``.

        Returns:
            The patch count reported by ``calculate_h2ovl_targets``
            multiplied by ``self.image_seq_length``.
        """
        img_proc = self.image_processor
        if use_msac is None:
            use_msac = img_proc.use_msac
        thumbnail_setting = img_proc.use_thumbnail

        def run_targets(ratios, with_thumbnail):
            # Shared call shape for every calculate_h2ovl_targets invocation.
            return calculate_h2ovl_targets(
                orig_width=image_width,
                orig_height=image_height,
                image_size=img_proc.image_size,
                target_ratios=ratios,
                use_thumbnail=with_thumbnail,
            )

        if not use_msac:
            plain_ratios = self.resolve_target_ratios(
                use_thumbnail=False,  # Applied in calculate_targets
            )
            patch_total, _, _, _ = run_targets(plain_ratios, thumbnail_setting)
            return patch_total * self.image_seq_length

        # First pass: lower bound of 1, thumbnail always on.
        first_ratios = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
            override_min_num=1,
        )
        first_count, _, _, first_aspect = run_targets(first_ratios, True)

        # Second pass: lower bound of 3, passing the first pass' aspect
        # ratio as ``prior_aspect_ratio``.
        second_ratios = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
            prior_aspect_ratio=first_aspect,
            override_min_num=3,
        )
        second_count, _, _, _ = run_targets(second_ratios, True)

        # NOTE(review): the "- 1" presumably avoids counting the thumbnail
        # in both passes -- confirm against calculate_h2ovl_targets.
        patch_total = first_count + second_count - 1
        return patch_total * self.image_seq_length

View File

@@ -7,24 +7,17 @@
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from abc import ABC, abstractmethod
from typing import Any, TypeVar
import numpy.typing as npt
import torch
import torchvision.transforms as T
from PIL import Image
from transformers import BatchFeature, PretrainedConfig, TensorType
from transformers import BatchFeature, TensorType
from transformers.processing_utils import ProcessorMixin
from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.processing import PromptUpdateDetails
from vllm.tokenizers import TokenizerLike
_T = TypeVar("_T")
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"
from vllm.tokenizers.hf import HfTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
@@ -33,7 +26,7 @@ IMAGENET_STD = (0.229, 0.224, 0.225)
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def build_transform(input_size: int):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose(
return T.Compose(
[
T.Lambda(lambda img: convert_image_mode(img, "RGB")),
T.Resize(
@@ -43,7 +36,6 @@ def build_transform(input_size: int):
T.Normalize(mean=MEAN, std=STD),
]
)
return transform
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
@@ -223,65 +215,20 @@ def video_to_pixel_values_internvl(
return pixel_values
class BaseInternVLProcessor(ABC):
"""
This model doesn't define its own HF processor,
so we implement our own one here.
The code to insert image tokens is based on:
https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
"""
class InternVLImageProcessor:
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
image_size: int,
min_dynamic_patch: int,
max_dynamic_patch: int,
dynamic_image_size: bool,
use_thumbnail: bool,
) -> None:
super().__init__()
self.config = config
self.tokenizer = tokenizer
image_size: int = config.vision_config.image_size
patch_size: int = config.vision_config.patch_size
if min_dynamic_patch is None:
min_dynamic_patch = config.min_dynamic_patch
assert isinstance(min_dynamic_patch, int)
if max_dynamic_patch is None:
max_dynamic_patch = config.max_dynamic_patch
assert isinstance(max_dynamic_patch, int)
if dynamic_image_size is None:
dynamic_image_size = config.dynamic_image_size
assert isinstance(dynamic_image_size, bool)
self.num_image_token = int(
(image_size // patch_size) ** 2 * (config.downsample_ratio**2)
)
self.image_size = image_size
self.min_dynamic_patch = min_dynamic_patch
self.max_dynamic_patch = max_dynamic_patch
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail: bool = config.use_thumbnail
@property
@abstractmethod
def image_token_id(self) -> int:
raise NotImplementedError
@abstractmethod
def get_image_repl(
self,
feature_size: int,
num_patches: int | None,
) -> PromptUpdateDetails[str]:
raise NotImplementedError
self.use_thumbnail = use_thumbnail
def resolve_min_max_num(
self,
@@ -291,18 +238,14 @@ class BaseInternVLProcessor(ABC):
dynamic_image_size: bool | None = None,
use_thumbnail: bool | None = None,
) -> tuple[int, int]:
min_dynamic_patch = (
self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
)
max_dynamic_patch = (
self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
)
dynamic_image_size = (
self.dynamic_image_size
if dynamic_image_size is None
else dynamic_image_size
)
use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
if min_dynamic_patch is None:
min_dynamic_patch = self.min_dynamic_patch
if max_dynamic_patch is None:
max_dynamic_patch = self.max_dynamic_patch
if dynamic_image_size is None:
dynamic_image_size = self.dynamic_image_size
if use_thumbnail is None:
use_thumbnail = self.use_thumbnail
return resolve_internvl_min_max_num(
min_dynamic_patch=min_dynamic_patch,
@@ -311,43 +254,6 @@ class BaseInternVLProcessor(ABC):
use_thumbnail=use_thumbnail,
)
def resolve_target_ratios(
    self,
    *,
    min_dynamic_patch: int | None = None,
    max_dynamic_patch: int | None = None,
    dynamic_image_size: bool | None = None,
    use_thumbnail: bool | None = None,
) -> list[tuple[int, int]]:
    """Return the candidate tiling ratios for the given patch settings.

    All arguments are forwarded to ``self.resolve_min_max_num``; the
    resulting bounds are passed to ``get_internvl_target_ratios``.
    """
    bounds = self.resolve_min_max_num(
        min_dynamic_patch=min_dynamic_patch,
        max_dynamic_patch=max_dynamic_patch,
        dynamic_image_size=dynamic_image_size,
        use_thumbnail=use_thumbnail,
    )
    lower_bound, upper_bound = bounds
    return get_internvl_target_ratios(lower_bound, upper_bound)
def get_num_image_tokens(
    self,
    *,
    image_width: int,
    image_height: int,
) -> int:
    """Return the number of image tokens for an image of the given size.

    The patch count comes from ``calculate_internvl_targets`` and is
    multiplied by ``self.num_image_token``.
    """
    # The thumbnail is handled by calculate_internvl_targets, so it is
    # disabled while resolving the ratios.
    candidate_ratios = self.resolve_target_ratios(use_thumbnail=False)

    targets = calculate_internvl_targets(
        orig_width=image_width,
        orig_height=image_height,
        image_size=self.image_size,
        target_ratios=candidate_ratios,
        use_thumbnail=self.use_thumbnail,
    )
    patch_total, _, _ = targets

    return patch_total * self.num_image_token
def _images_to_pixel_values_lst(
self,
images: list[Image.Image],
@@ -355,7 +261,14 @@ class BaseInternVLProcessor(ABC):
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
) -> list[torch.Tensor]:
min_num, max_num = self.resolve_min_max_num(
if min_dynamic_patch is None:
min_dynamic_patch = self.min_dynamic_patch
if max_dynamic_patch is None:
max_dynamic_patch = self.max_dynamic_patch
if dynamic_image_size is None:
dynamic_image_size = self.dynamic_image_size
min_num, max_num = resolve_internvl_min_max_num(
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
@@ -373,49 +286,9 @@ class BaseInternVLProcessor(ABC):
for image in images
]
def _preprocess_image(
    self,
    text: list[str],
    images: list[Image.Image],
    min_dynamic_patch: int | None = None,
    max_dynamic_patch: int | None = None,
    dynamic_image_size: bool | None = None,
) -> tuple[list[str], dict[str, torch.Tensor]]:
    """Convert images to pixel values and expand ``<image>`` placeholders.

    Images are consumed in order: the first ``<image>`` placeholder found
    while walking the prompts left to right receives the first image's
    replacement text, the next placeholder the second image's, and so on.
    Placeholders left over once the images run out are kept verbatim, and
    images without a placeholder are still returned in the tensor outputs.

    Previously every image's replacement was applied to the first
    ``<image>`` of *every* prompt, so with more than one prompt the second
    prompt received the first image's expansion. For a single prompt the
    result is unchanged.

    Args:
        text: Prompts that may contain ``<image>`` placeholders.
        images: Images matching the placeholders, in order.
        min_dynamic_patch: Forwarded to ``_images_to_pixel_values_lst``.
        max_dynamic_patch: Forwarded to ``_images_to_pixel_values_lst``.
        dynamic_image_size: Forwarded to ``_images_to_pixel_values_lst``.

    Returns:
        The processed prompts and a dict with ``pixel_values_flat`` and
        ``image_num_patches`` (empty when no images were given).
    """
    if not images:
        return text, {}

    pixel_values_lst = self._images_to_pixel_values_lst(
        images,
        min_dynamic_patch=min_dynamic_patch,
        max_dynamic_patch=max_dynamic_patch,
        dynamic_image_size=dynamic_image_size,
    )
    image_inputs = {
        "pixel_values_flat": torch.cat(pixel_values_lst),
        "image_num_patches": torch.tensor(
            [len(item) for item in pixel_values_lst]
        ),
    }

    placeholder = "<image>"

    # One replacement string per image, in image order.
    replacements = []
    for pixel_values in pixel_values_lst:
        num_patches = pixel_values.shape[0]
        feature_size = num_patches * self.num_image_token
        replacements.append(self.get_image_repl(feature_size, num_patches).full)

    pending = iter(replacements)
    processed_text = []
    for prompt in text:
        # Splitting first means text inserted for an earlier image is never
        # scanned again for placeholders.
        head, *tail = prompt.split(placeholder)
        pieces = [head]
        for segment in tail:
            # Keep the literal placeholder once the images are exhausted.
            pieces.append(next(pending, placeholder))
            pieces.append(segment)
        processed_text.append("".join(pieces))

    return processed_text, image_inputs
def _make_batch_input(self, input_item: _T | list[_T] | None = None) -> list[_T]:
    """Normalise an optional single-or-list input into a list.

    ``None`` becomes an empty list, a list is returned as-is, and any other
    value is wrapped in a one-element list.
    """
    if input_item is None:
        return []
    if isinstance(input_item, list):
        return input_item
    return [input_item]
def __call__(
self,
text: str | list[str] | None = None,
images: Image.Image | list[Image.Image] | None = None,
images: Image.Image | list[Image.Image],
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
@@ -423,120 +296,173 @@ class BaseInternVLProcessor(ABC):
return_tensors: str | TensorType | None = None,
**kwargs,
) -> BatchFeature:
text = self._make_batch_input(text)
images = self._make_batch_input(images)
images_lst = [images] if not isinstance(images, list) else images
text, image_inputs = self._preprocess_image(
text=text,
images=images,
pixel_values_lst = self._images_to_pixel_values_lst(
images_lst,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
)
text_inputs = self.tokenizer(text)
combined_outputs = {**text_inputs, **image_inputs}
return BatchFeature(combined_outputs, tensor_type=return_tensors)
image_inputs = {
"pixel_values_flat": torch.cat(pixel_values_lst),
"image_num_patches": torch.tensor([len(item) for item in pixel_values_lst]),
}
return BatchFeature(image_inputs, tensor_type=return_tensors)
class InternVLProcessor(BaseInternVLProcessor):
"""
HF Processor for InternVLChatModel with extended video processing logic.
Code for video processing is adapted from video example:
https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers
"""
class InternVLVideoProcessor:
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
video_token: str | None = None,
image_size: int,
) -> None:
super().__init__(
config=config,
tokenizer=tokenizer,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
)
# add extra video token for video processing
self.video_token = video_token
@property
def image_token_id(self) -> int:
return self.tokenizer.get_vocab()[IMG_CONTEXT]
@property
def video_token_id(self) -> int | None:
if self.video_token is None:
return None
return self.tokenizer.get_vocab().get(self.video_token, None)
@property
def supports_video(self) -> bool:
return self.video_token_id is not None
self.image_size = image_size
def _videos_to_pixel_values_lst(
self,
videos: list[npt.NDArray],
dynamic_image_size: bool | None = None,
) -> list[torch.Tensor]:
min_num, max_num = self.resolve_min_max_num(
min_dynamic_patch=1,
max_dynamic_patch=1,
dynamic_image_size=dynamic_image_size,
use_thumbnail=False, # Applied in image_to_pixel_values
)
return [
video_to_pixel_values_internvl(
video,
input_size=self.image_size,
min_num=min_num,
max_num=max_num,
min_num=1,
max_num=1,
use_thumbnail=False,
)
for video in videos
]
def _preprocess_video(
def __call__(
self,
text: list[str],
videos: list[npt.NDArray],
dynamic_image_size: bool | None = None,
) -> tuple[list[str], dict[str, Any]]:
if len(videos) == 0 or not self.supports_video:
return text, {}
videos: npt.NDArray | list[npt.NDArray],
*,
return_tensors: str | TensorType | None = None,
**kwargs,
) -> BatchFeature:
videos_lst = [videos] if not isinstance(videos, list) else videos
video_token = self.video_token
assert video_token is not None
pixel_values_lst = self._videos_to_pixel_values_lst(videos_lst)
pixel_values_lst_video = self._videos_to_pixel_values_lst(
videos,
dynamic_image_size=dynamic_image_size,
)
video_inputs = {
"pixel_values_flat_video": torch.cat(pixel_values_lst_video),
"video_num_patches": torch.tensor(
[len(item) for item in pixel_values_lst_video]
),
image_inputs = {
"pixel_values_flat_video": torch.cat(pixel_values_lst),
"video_num_patches": torch.tensor([len(item) for item in pixel_values_lst]),
}
return BatchFeature(image_inputs, tensor_type=return_tensors)
for pixel_values in pixel_values_lst_video:
num_patches = pixel_values.shape[0]
video_repl = self.get_video_repl(
self.num_image_token, num_patches, video_token
)
text = [t.replace("<video>", video_repl.full, 1) for t in text]
return text, video_inputs
class InternVLProcessor(ProcessorMixin):
"""
This model doesn't define its own HF processor,
so we implement our own one here.
The code to insert image tokens is based on:
https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
Code for video processing is adapted from video example:
https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers
"""
attributes = ["image_processor", "tokenizer", "video_processor"]
def __init__(
    self,
    image_processor: InternVLImageProcessor,
    tokenizer: HfTokenizer,
    video_processor: InternVLVideoProcessor | None = None,
    *,
    image_seq_length: int,
    start_image_token: str = "<img>",
    end_image_token: str = "</img>",
    ctx_image_token: str = "<IMG_CONTEXT>",
    ctx_video_token: str | None = None,
) -> None:
    """Store the sub-processors and resolve the special-token ids.

    Each token string is kept as an attribute together with the id returned
    by ``tokenizer.convert_tokens_to_ids``. ``ctx_video_token_id`` is
    ``None`` when no ``ctx_video_token`` is given.
    """
    self.image_processor = image_processor
    self.tokenizer = tokenizer
    self.video_processor = video_processor

    self.image_seq_length = image_seq_length

    self.start_image_token = start_image_token
    self.end_image_token = end_image_token
    self.ctx_image_token = ctx_image_token
    self.ctx_video_token = ctx_video_token

    to_id = tokenizer.convert_tokens_to_ids
    self.start_image_token_id = to_id(start_image_token)
    self.end_image_token_id = to_id(end_image_token)
    self.ctx_image_token_id = to_id(ctx_image_token)

    if ctx_video_token is None:
        self.ctx_video_token_id = None
    else:
        self.ctx_video_token_id = to_id(ctx_video_token)
def resolve_target_ratios(
    self,
    *,
    min_dynamic_patch: int | None = None,
    max_dynamic_patch: int | None = None,
    dynamic_image_size: bool | None = None,
    use_thumbnail: bool | None = None,
) -> list[tuple[int, int]]:
    """Return the candidate tiling ratios for the given patch settings.

    All arguments are forwarded to
    ``self.image_processor.resolve_min_max_num``; the resulting bounds are
    passed to ``get_internvl_target_ratios``.
    """
    bounds = self.image_processor.resolve_min_max_num(
        min_dynamic_patch=min_dynamic_patch,
        max_dynamic_patch=max_dynamic_patch,
        dynamic_image_size=dynamic_image_size,
        use_thumbnail=use_thumbnail,
    )
    lower_bound, upper_bound = bounds
    return get_internvl_target_ratios(lower_bound, upper_bound)
def get_num_image_tokens(
    self,
    *,
    image_width: int,
    image_height: int,
) -> int:
    """Return the number of image tokens for an image of the given size.

    The patch count comes from ``calculate_internvl_targets`` (using the
    image processor's ``image_size`` and ``use_thumbnail``) and is
    multiplied by ``self.image_seq_length``.
    """
    img_proc = self.image_processor

    # The thumbnail is handled by calculate_internvl_targets, so it is
    # disabled while resolving the ratios.
    candidate_ratios = self.resolve_target_ratios(use_thumbnail=False)

    targets = calculate_internvl_targets(
        orig_width=image_width,
        orig_height=image_height,
        image_size=img_proc.image_size,
        target_ratios=candidate_ratios,
        use_thumbnail=img_proc.use_thumbnail,
    )
    patch_total, _, _ = targets

    return patch_total * self.image_seq_length
def get_image_repl(
    self,
    num_patches: int | None,
    num_features: int | None = None,
) -> PromptUpdateDetails[str]:
    """Build the prompt replacement text for a single image.

    When ``num_patches`` is given, the feature count is
    ``num_patches * self.image_seq_length`` and any ``num_features``
    argument is ignored. Otherwise ``num_features`` must be provided.

    The replacement is the context token repeated that many times, between
    the start and end image tokens; only the context-token text is
    selected in the returned details.
    """
    if num_patches is not None:
        num_features = num_patches * self.image_seq_length
    else:
        assert num_features is not None

    ctx_token = self.ctx_image_token
    context_run = ctx_token * num_features
    wrapped = self.start_image_token + context_run + self.end_image_token

    return PromptUpdateDetails.select_text(wrapped, ctx_token)
def get_video_repl(self, num_patches: int) -> PromptUpdateDetails[str]:
    """Build the prompt replacement text for a single video.

    Every frame contributes ``"Frame<k>: "`` (1-based) followed by the video
    context token repeated ``self.image_seq_length`` times between the
    start and end image tokens. Requires ``self.ctx_video_token`` to be set.
    """
    assert self.ctx_video_token is not None

    context_run = self.ctx_video_token * self.image_seq_length
    frame_body = self.start_image_token + context_run + self.end_image_token

    # num_patches is equal to num_frames
    frame_chunks = []
    for frame_idx in range(num_patches):
        frame_chunks.append(f"Frame{frame_idx + 1}: {frame_body}")
    joined = "".join(frame_chunks)

    return PromptUpdateDetails.select_text(joined, self.ctx_video_token)
def __call__(
self,
@@ -550,54 +476,88 @@ class InternVLProcessor(BaseInternVLProcessor):
return_tensors: str | TensorType | None = None,
**kwargs,
) -> BatchFeature:
text = self._make_batch_input(text)
images = self._make_batch_input(images)
videos = self._make_batch_input(videos)
if images is not None:
image_inputs = self.image_processor(
images=images,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
return_tensors=return_tensors,
)
image_num_patches = image_inputs["image_num_patches"]
else:
image_inputs = {}
image_num_patches = []
text, image_inputs = self._preprocess_image(
text=text,
images=images,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
)
if videos is not None:
if self.video_processor is None:
raise ValueError("This model does not support video inputs")
text, video_inputs = self._preprocess_video(
text=text,
videos=videos,
dynamic_image_size=dynamic_image_size,
)
video_inputs = self.video_processor(
videos=videos,
return_tensors=return_tensors,
)
video_num_patches = video_inputs["video_num_patches"]
else:
video_inputs = {}
video_num_patches = []
text_inputs = self.tokenizer(text)
if text is not None:
if not isinstance(text, list):
text = [text]
if image_inputs:
image_token = "<image>"
image_index = 0
processed_text = list[str]()
replace_strings = list[str]()
for prompt in text:
new_prompt = prompt
while image_token in new_prompt:
new_prompt = new_prompt.replace(image_token, "<placeholder>", 1)
image_repl = self.get_image_repl(image_num_patches[image_index])
replace_strings.append(image_repl.full)
image_index += 1
while "<placeholder>" in new_prompt:
replace_str = replace_strings.pop(0)
new_prompt = new_prompt.replace("<placeholder>", replace_str, 1)
processed_text.append(new_prompt)
text = processed_text
if video_inputs:
video_token = "<video>"
video_index = 0
processed_text = list[str]()
replace_strings = list[str]()
assert video_token is not None
for prompt in text:
new_prompt = prompt
while video_token in new_prompt:
new_prompt = new_prompt.replace(video_token, "<placeholder>", 1)
video_repl = self.get_video_repl(video_num_patches[video_index])
replace_strings.append(video_repl.full)
video_index += 1
while "<placeholder>" in new_prompt:
replace_str = replace_strings.pop(0)
new_prompt = new_prompt.replace("<placeholder>", replace_str, 1)
processed_text.append(new_prompt)
text = processed_text
text_inputs = self.tokenizer(text, return_tensors=return_tensors)
else:
text_inputs = {}
combined_outputs = {**text_inputs, **image_inputs, **video_inputs}
return BatchFeature(combined_outputs, tensor_type=return_tensors)
def get_image_repl(
    self,
    feature_size: int,
    num_patches: int | None,
) -> PromptUpdateDetails[str]:
    """Build the prompt replacement text for a single image.

    Produces ``IMG_START``, then ``IMG_CONTEXT`` repeated ``feature_size``
    times, then ``IMG_END``; only the ``IMG_CONTEXT`` text is selected in
    the returned details. ``num_patches`` is not used here.
    """
    pieces = (IMG_START, IMG_CONTEXT * feature_size, IMG_END)
    return PromptUpdateDetails.select_text("".join(pieces), IMG_CONTEXT)
def get_video_repl(
    self,
    feature_size: int,
    num_patches: int | None,
    video_context_token: str = IMG_CONTEXT,
) -> PromptUpdateDetails[str]:
    """Build the prompt replacement text for a single video.

    Every frame contributes ``"Frame<k>: "`` (1-based) followed by
    ``video_context_token`` repeated ``self.num_image_token`` times between
    ``IMG_START`` and ``IMG_END``. ``feature_size`` is not used here.

    Raises:
        NotImplementedError: If ``num_patches`` is ``None``.
    """
    if num_patches is None:
        raise NotImplementedError("Embedding inputs are not supported")

    context_run = video_context_token * self.num_image_token
    frame_body = IMG_START + context_run + IMG_END

    # num_patches is equal to num_frames
    frame_chunks = []
    for frame_idx in range(num_patches):
        frame_chunks.append(f"Frame{frame_idx + 1}: {frame_body}")

    return PromptUpdateDetails.select_text("".join(frame_chunks), video_context_token)

View File

@@ -25,7 +25,7 @@ from vllm.model_executor.models.parakeet import ParakeetExtractor
from vllm.multimodal.evs import compute_retained_tokens_count
from vllm.multimodal.inputs import AudioItem
from vllm.multimodal.processing.processor import PromptUpdateDetails, _seq2tokens
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.hf import HfTokenizer
from .internvl import calculate_internvl_targets, get_internvl_target_ratios
@@ -508,7 +508,7 @@ class BaseNanoNemotronVLProcessor(ABC):
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
tokenizer: HfTokenizer,
*args,
max_model_len: int,
max_num_tiles: int | None = None,
@@ -689,7 +689,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
tokenizer: HfTokenizer,
*,
max_model_len: int,
max_num_tiles: int | None = None,
@@ -961,7 +961,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
tokens_per_frame: list[int],
frames_indices: list[int],
frame_duration_ms: int,
tokenizer: TokenizerLike,
tokenizer: HfTokenizer,
img_start_token_ids: list[int],
img_end_token_ids: list[int],
img_context_token_ids: list[int],
@@ -986,7 +986,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
tokens_per_frame (list[int]): number of tokens per frame
frames_indices (list[int]): frame indices
frame_duration_ms (int): duration of each frame in milliseconds
tokenizer (TokenizerLike): tokenizer to use for tokenizing frame separators
tokenizer (HfTokenizer): tokenizer to use for tokenizing frame separators
img_start_token_ids (list[int]): pre-tokenized IMG_START tokens
img_end_token_ids (list[int]): pre-tokenized IMG_END tokens
img_context_token_ids (list[int]): pre-tokenized IMG_CONTEXT tokens

View File

@@ -1,18 +1,14 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC
import torch
import torchvision.transforms as T
from PIL import Image
from transformers import PretrainedConfig
from transformers.image_processing_utils_fast import BaseImageProcessorFast
from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.processing import PromptUpdateDetails
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers.hf import HfTokenizer
from .internvl import InternVLProcessor
from .internvl import InternVLImageProcessor, InternVLProcessor
# Configure PIL to handle large images without warnings
# This prevents DecompressionBombWarning for legitimate large images
@@ -172,80 +168,7 @@ def image_to_pixel_values_nemotron_vl(
return pixel_values
class NemotronVLProcessor(InternVLProcessor):
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<image>"
def __init__(
    self,
    config: PretrainedConfig,
    tokenizer: TokenizerLike,
    image_processor: BaseImageProcessorFast,
    *,
    min_dynamic_patch: int | None = None,
    max_dynamic_patch: int | None = None,
    dynamic_image_size: bool | None = None,
) -> None:
    """Initialise the processor from the model config and image processor.

    Defaults when an argument is ``None``: ``min_dynamic_patch`` is 1,
    ``max_dynamic_patch`` is ``image_processor.max_num_tiles`` and
    ``dynamic_image_size`` is ``True``. The image and patch sizes come from
    ``config.force_image_size`` and ``config.patch_size``.
    """
    # Calls ABC.__init__ directly rather than super().__init__().
    ABC.__init__(self)

    self.config = config
    self.tokenizer = tokenizer
    self.image_processor = image_processor

    tile_size: int = config.force_image_size
    patch_size: int = config.patch_size

    min_dynamic_patch = 1 if min_dynamic_patch is None else min_dynamic_patch
    assert isinstance(min_dynamic_patch, int)

    # NOTE(review): this reads image_processor.max_num_tiles without a None
    # check, while use_thumbnail below tolerates image_processor being None.
    max_dynamic_patch = (
        self.image_processor.max_num_tiles
        if max_dynamic_patch is None
        else max_dynamic_patch
    )
    assert isinstance(max_dynamic_patch, int)

    dynamic_image_size = True if dynamic_image_size is None else dynamic_image_size
    assert isinstance(dynamic_image_size, bool)

    patches_per_side = tile_size // patch_size
    self.num_image_token = int(patches_per_side**2 * (config.downsample_ratio**2))

    self.image_size = tile_size
    self.min_dynamic_patch = min_dynamic_patch
    self.max_dynamic_patch = max_dynamic_patch
    self.dynamic_image_size = dynamic_image_size

    # Without an image processor, fall back to the config (default True).
    self.use_thumbnail = (
        getattr(config, "use_thumbnail", True)
        if image_processor is None
        else image_processor.use_thumbnail
    )
@property
def image_token_id(self) -> int:
    """Vocabulary id of ``self.IMG_CONTEXT`` according to the tokenizer."""
    vocab = self.tokenizer.get_vocab()
    return vocab[self.IMG_CONTEXT]
def _get_transform(self) -> T.Compose:
    """Return the transform built by ``build_transform`` for ``self.image_size``."""
    side_length = self.image_size
    return build_transform(input_size=side_length)
def get_num_image_tokens(
    self,
    *,
    image_width: int,
    image_height: int,
) -> int:
    """Return the number of image tokens for an image of the given size.

    The patch count comes from ``calculate_nemotron_vl_targets`` and is
    multiplied by ``self.num_image_token``.
    """
    # The thumbnail is handled by calculate_nemotron_vl_targets, so it is
    # disabled while resolving the ratios.
    candidate_ratios = self.resolve_target_ratios(use_thumbnail=False)

    targets = calculate_nemotron_vl_targets(
        orig_width=image_width,
        orig_height=image_height,
        image_size=self.image_size,
        target_ratios=candidate_ratios,
        use_thumbnail=self.use_thumbnail,
    )
    patch_total, _, _ = targets

    return patch_total * self.num_image_token
class LlamaNemotronNanoVLImageProcessor(InternVLImageProcessor):
def _images_to_pixel_values_lst(
self,
images: list[Image.Image],
@@ -267,62 +190,60 @@ class NemotronVLProcessor(InternVLProcessor):
min_num=min_num,
max_num=max_num,
use_thumbnail=self.use_thumbnail,
transform=self._get_transform(),
transform=build_transform(self.image_size),
)
for image in images
]
def _replace_image_tokens(
class LlamaNemotronNanoVLProcessor(InternVLProcessor):
"""
This model doesn't define its own HF processor,
so we implement our own one here.
The image processor is given by:
https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1/blob/main/image_processing.py
"""
def __init__(
self,
text: list[str],
pixel_values_lst: list[torch.Tensor],
) -> list[str]:
"""Replace <image> placeholders with image tokens."""
for pixel_values in pixel_values_lst:
num_patches = pixel_values.shape[0]
feature_size = num_patches * self.num_image_token
image_repl = self.get_image_repl(feature_size, num_patches)
# Use temporary placeholder to avoid replacing tokens we just inserted
NVL_IMAGE_CONTEXT = image_repl.full.replace("<image>", "<NVL_IMG_CONTEXT>")
text = [t.replace("<image>", NVL_IMAGE_CONTEXT, 1) for t in text]
return [t.replace("<NVL_IMG_CONTEXT>", self.IMG_CONTEXT) for t in text]
image_processor: LlamaNemotronNanoVLImageProcessor,
tokenizer: HfTokenizer,
*,
image_seq_length: int,
start_image_token: str = "<img>",
end_image_token: str = "</img>",
ctx_image_token: str = "<image>",
) -> None:
super().__init__(
image_processor=image_processor,
tokenizer=tokenizer,
image_seq_length=image_seq_length,
start_image_token=start_image_token,
end_image_token=end_image_token,
ctx_image_token=ctx_image_token,
)
def _preprocess_image(
def get_num_image_tokens(
self,
text: list[str],
images: list[Image.Image],
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
) -> tuple[list[str], dict[str, torch.Tensor]]:
if len(images) == 0:
image_inputs = {}
else:
pixel_values_lst = self._images_to_pixel_values_lst(
images,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
)
image_inputs = {
"pixel_values_flat": torch.cat(pixel_values_lst),
"image_num_patches": torch.tensor(
[len(item) for item in pixel_values_lst]
),
}
*,
image_width: int,
image_height: int,
) -> int:
image_processor = self.image_processor
target_ratios = self.resolve_target_ratios(
use_thumbnail=False, # Applied in calculate_targets
)
text = self._replace_image_tokens(text, pixel_values_lst)
return text, image_inputs
num_patches, _, _ = calculate_nemotron_vl_targets(
orig_width=image_width,
orig_height=image_height,
image_size=image_processor.image_size,
target_ratios=target_ratios,
use_thumbnail=image_processor.use_thumbnail,
)
def get_image_repl(
self,
feature_size: int,
num_patches: int | None,
) -> PromptUpdateDetails[str]:
repl_features = self.IMG_CONTEXT * feature_size
repl_full = self.IMG_START + repl_features + self.IMG_END
return PromptUpdateDetails.select_text(repl_full, self.IMG_CONTEXT)
return num_patches * self.image_seq_length
# SigLIP normalization constants
@@ -343,7 +264,35 @@ def build_siglip_transform(input_size: int):
)
class LlamaNemotronVLEmbedProcessor(NemotronVLProcessor):
class LlamaNemotronVLEmbedImageProcessor(InternVLImageProcessor):
    """Image processor that tiles images using the SigLIP transform."""

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Convert each image to pixel values, one tensor per image.

        The tile bounds are resolved once via ``resolve_min_max_num`` with
        the thumbnail disabled; the thumbnail itself is requested through
        ``self.use_thumbnail`` in ``image_to_pixel_values_nemotron_vl``.
        """
        tile_bounds = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,  # Applied in image_to_pixel_values
        )
        lower_bound, upper_bound = tile_bounds

        pixel_values_per_image = []
        for img in images:
            # A new transform is built for every image.
            siglip_transform = build_siglip_transform(self.image_size)
            pixel_values_per_image.append(
                image_to_pixel_values_nemotron_vl(
                    img,
                    input_size=self.image_size,
                    min_num=lower_bound,
                    max_num=upper_bound,
                    use_thumbnail=self.use_thumbnail,
                    transform=siglip_transform,
                )
            )
        return pixel_values_per_image
class LlamaNemotronVLEmbedProcessor(InternVLProcessor):
"""
Processor for LlamaNemotronVL embedding model.
@@ -352,59 +301,44 @@ class LlamaNemotronVLEmbedProcessor(NemotronVLProcessor):
- Uses different image context token (<IMG_CONTEXT> vs <image>)
"""
IMG_CONTEXT = "<IMG_CONTEXT>"
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
processor_config: dict,
image_processor: LlamaNemotronVLEmbedImageProcessor,
tokenizer: HfTokenizer,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
dynamic_image_size: bool | None = None,
image_seq_length: int,
start_image_token: str = "<img>",
end_image_token: str = "</img>",
ctx_image_token: str = "<IMG_CONTEXT>",
) -> None:
if min_dynamic_patch is None:
min_dynamic_patch = processor_config.get(
"min_input_tiles",
getattr(config, "min_dynamic_patch", 1),
)
if max_dynamic_patch is None:
max_dynamic_patch = processor_config.get(
"max_input_tiles",
getattr(config, "max_dynamic_patch", 1),
)
if dynamic_image_size is None:
dynamic_image_size = processor_config.get(
"dynamic_image_size",
getattr(config, "dynamic_image_size", True),
)
super().__init__(
config=config,
image_processor=image_processor,
tokenizer=tokenizer,
image_processor=None,
min_dynamic_patch=min_dynamic_patch,
max_dynamic_patch=max_dynamic_patch,
dynamic_image_size=dynamic_image_size,
image_seq_length=image_seq_length,
start_image_token=start_image_token,
end_image_token=end_image_token,
ctx_image_token=ctx_image_token,
)
def _get_transform(self) -> T.Compose:
"""Override to add SigLIP normalization."""
return build_siglip_transform(input_size=self.image_size)
self.image_processor: LlamaNemotronVLEmbedImageProcessor
def _replace_image_tokens(
def get_num_image_tokens(
self,
text: list[str],
pixel_values_lst: list[torch.Tensor],
) -> list[str]:
"""Override with simpler token replacement for embedding model.
*,
image_width: int,
image_height: int,
) -> int:
image_processor = self.image_processor
target_ratios = self.resolve_target_ratios(
use_thumbnail=False, # Applied in calculate_targets
)
No temporary placeholder needed because IMG_CONTEXT is <IMG_CONTEXT>,
not <image>, so there's no collision risk.
"""
for pixel_values in pixel_values_lst:
num_patches = pixel_values.shape[0]
feature_size = num_patches * self.num_image_token
image_repl = self.get_image_repl(feature_size, num_patches)
text = [t.replace("<image>", image_repl.full, 1) for t in text]
return text
num_patches, _, _ = calculate_nemotron_vl_targets(
orig_width=image_width,
orig_height=image_height,
image_size=image_processor.image_size,
target_ratios=target_ratios,
use_thumbnail=image_processor.use_thumbnail,
)
return num_patches * self.image_seq_length

View File

@@ -8,37 +8,54 @@
# Licensed under Apache 2.0 License [see LICENSE for details]
# --------------------------------------------------------
from vllm.multimodal.processing import PromptUpdateDetails
from vllm.tokenizers.hf import HfTokenizer
from .internvl import BaseInternVLProcessor
IMG_PAD = "<|vision_pad|>"
from .internvl import InternVLImageProcessor, InternVLProcessor
class NVLMProcessor(BaseInternVLProcessor):
@property
def image_token_id(self) -> int:
return self.tokenizer.get_vocab()[IMG_PAD]
class NVLMProcessor(InternVLProcessor):
def __init__(
    self,
    image_processor: InternVLImageProcessor,
    tokenizer: HfTokenizer,
    *,
    image_seq_length: int,
    start_image_token: str = "<Image>",
    end_image_token: str = "</Image>",
    ctx_image_token: str = "<|vision_pad|>",
) -> None:
    """
    Initialize the processor by forwarding every argument to the parent
    constructor.

    The only thing this override contributes is the set of default
    marker strings: `<Image>` / `</Image>` as the image delimiters and
    `<|vision_pad|>` as the per-feature context token.
    """
    # Group the three marker strings so the forwarding call reads as
    # "components, sequence length, markers".
    marker_tokens = {
        "start_image_token": start_image_token,
        "end_image_token": end_image_token,
        "ctx_image_token": ctx_image_token,
    }
    super().__init__(
        image_processor=image_processor,
        tokenizer=tokenizer,
        image_seq_length=image_seq_length,
        **marker_tokens,
    )
def get_image_repl(
self,
feature_size: int,
num_patches: int | None,
num_features: int | None = None,
) -> PromptUpdateDetails[str]:
if num_patches is None:
raise NotImplementedError("Embedding inputs are not supported")
num_features = num_patches * self.image_seq_length
tile_pos_identifiers = [f"<tile_{i}>" for i in range(1, num_patches)]
if self.use_thumbnail:
if self.image_processor.use_thumbnail:
tile_pos_identifiers += ["<tile_global_thumbnail>"]
context_size = feature_size // num_patches
context_size = num_features // num_patches
features = "".join(
identifier + IMG_PAD * context_size for identifier in tile_pos_identifiers
(identifier + self.ctx_image_token * context_size)
for identifier in tile_pos_identifiers
)
# We include the start and end as well because "<Image><tile" is
# tokenized as ["<Image", "><", "tile"], resulting in assertion error
# when trying to find "<tile" as a subsequence of "<Image><tile"
repl = "<Image>" + features + "</Image>"
repl = self.start_image_token + features + self.end_image_token
return PromptUpdateDetails.select_text(repl, IMG_PAD)
return PromptUpdateDetails.select_text(repl, self.ctx_image_token)

View File

@@ -1,389 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py
# --------------------------------------------------------
# SkyworkR1V
# Copyright (c) 2025 Skywork
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import torch
import torchvision.transforms as T
from PIL import Image
from transformers import BatchFeature, PretrainedConfig, TensorType
from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.processing import PromptUpdateDetails
from vllm.tokenizers import TokenizerLike
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<IMG_CONTEXT>"
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
def build_transform(input_size: int):
    """
    Build the preprocessing pipeline applied to every image tile.

    The returned `T.Compose` runs, in order: conversion to RGB via
    `convert_image_mode`, a bicubic resize to a square of
    `input_size` x `input_size`, `T.ToTensor()`, and normalization with
    `IMAGENET_MEAN` / `IMAGENET_STD`.
    """
    to_rgb = T.Lambda(lambda img: convert_image_mode(img, "RGB"))
    to_square = T.Resize(
        (input_size, input_size),
        interpolation=T.InterpolationMode.BICUBIC,
    )
    normalize = T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
    return T.Compose([to_rgb, to_square, T.ToTensor(), normalize])
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
def find_closest_aspect_ratio(
    aspect_ratio: float,
    target_ratios: list[tuple[int, int]],
    *,
    width: int,
    height: int,
    image_size: int,
) -> tuple[int, int]:
    """
    Pick the tiling grid whose aspect ratio is closest to `aspect_ratio`.

    Candidates are scanned in the order given. A strictly closer
    candidate always wins. A candidate that exactly ties the current
    best distance replaces it only when the image area (`width * height`)
    exceeds half of the area that grid would cover at `image_size` per
    tile. Returns `(1, 1)` when `target_ratios` is empty.
    """
    image_area = width * height
    chosen = (1, 1)
    smallest_gap = float("inf")

    for candidate in target_ratios:
        cols = candidate[0]
        rows = candidate[1]
        gap = abs(aspect_ratio - cols / rows)

        if gap < smallest_gap:
            smallest_gap, chosen = gap, candidate
            continue

        # Exact tie: take the later candidate only if the image has enough
        # pixels to fill more than half of that grid.
        if (
            gap == smallest_gap
            and image_area > 0.5 * image_size * image_size * cols * rows
        ):
            chosen = candidate

    return chosen
def resolve_skyworkr1v_min_max_num(
    *,
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: bool,
    use_thumbnail: bool,
) -> tuple[int, int]:
    """
    Resolve the effective `(min, max)` number of tiles per image.

    With dynamic sizing disabled both bounds collapse to 1. Otherwise the
    given bounds are used, and the maximum is raised by one when a
    thumbnail is requested and the maximum is not already 1.
    """
    if not dynamic_image_size:
        # Both bounds are 1, so the thumbnail bump can never apply.
        return 1, 1

    if use_thumbnail and max_dynamic_patch != 1:
        return min_dynamic_patch, max_dynamic_patch + 1

    return min_dynamic_patch, max_dynamic_patch
def get_skyworkr1v_target_ratios(
    min_num: int,
    max_num: int,
) -> list[tuple[int, int]]:
    """
    Enumerate every `(i, j)` grid whose tile count `i * j` lies in
    `[min_num, max_num]`, ordered by ascending tile count.

    Returns an empty list when `min_num > max_num`.
    """

    def tile_count(grid: tuple[int, int]) -> int:
        return grid[0] * grid[1]

    # NOTE: the three-level loop nest is deliberately left exactly as in
    # the reference code so that the set is built in the same way; only
    # the tile count is used as the sort key, so the relative order of
    # grids with equal tile count comes from the set itself.
    grids = {
        (cols, rows)
        for total in range(min_num, max_num + 1)
        for cols in range(1, total + 1)
        for rows in range(1, total + 1)
        if min_num <= cols * rows <= max_num
    }
    return sorted(grids, key=tile_count)
def calculate_skyworkr1v_targets(
    *,
    orig_width: int,
    orig_height: int,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> tuple[int, int, int]:
    """
    Work out how an image of the given size will be tiled.

    Returns `(blocks, target_width, target_height)`: the number of tiles
    (plus one for the thumbnail when `use_thumbnail` is set and the grid
    has more than one tile) and the pixel size the image is resized to
    before being cut into `image_size` squares.
    """
    # Grid (columns, rows) whose aspect ratio best matches the image.
    grid = find_closest_aspect_ratio(
        orig_width / orig_height,
        target_ratios,
        width=orig_width,
        height=orig_height,
        image_size=image_size,
    )

    tile_total = grid[0] * grid[1]
    # A thumbnail is only counted when the image is actually split.
    if use_thumbnail and tile_total != 1:
        tile_total += 1

    return tile_total, image_size * grid[0], image_size * grid[1]
def dynamic_preprocess_skyworkr1v(
    image: Image.Image,
    *,
    target_ratios: list[tuple[int, int]],
    image_size: int,
    use_thumbnail: bool,
) -> list[Image.Image]:
    """
    Resize `image` to the best-matching grid and cut it into square tiles.

    Tiles are produced row by row, each `image_size` x `image_size`.
    When `use_thumbnail` is set and more than one tile was produced, a
    copy of the whole image resized to a single tile is appended last.
    """
    width, height = image.size

    # The thumbnail is handled at the end of this function, so the tile
    # count is requested without it.
    tile_total, canvas_width, canvas_height = calculate_skyworkr1v_targets(
        orig_width=width,
        orig_height=height,
        target_ratios=target_ratios,
        image_size=image_size,
        use_thumbnail=False,
    )

    canvas = image.resize((canvas_width, canvas_height))
    tiles_per_row = canvas_width // image_size

    tiles = []
    for index in range(tile_total):
        row, col = divmod(index, tiles_per_row)
        left = col * image_size
        top = row * image_size
        tiles.append(canvas.crop((left, top, left + image_size, top + image_size)))

    assert len(tiles) == tile_total

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))

    return tiles
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B
def image_to_pixel_values_skyworkr1v(
    image: Image.Image,
    *,
    input_size: int,
    min_num: int,
    max_num: int,
    use_thumbnail: bool,
) -> torch.Tensor:
    """
    Turn one image into a stacked tensor of preprocessed tiles.

    The image is split by `dynamic_preprocess_skyworkr1v` using the grids
    allowed by `[min_num, max_num]`; every resulting tile goes through
    `build_transform(input_size)` and the results are stacked along a new
    leading dimension.
    """
    allowed_grids = get_skyworkr1v_target_ratios(min_num, max_num)
    to_tensor = build_transform(input_size=input_size)

    tiles = dynamic_preprocess_skyworkr1v(
        image,
        target_ratios=allowed_grids,
        image_size=input_size,
        use_thumbnail=use_thumbnail,
    )

    return torch.stack(list(map(to_tensor, tiles)))
class SkyworkR1VProcessor:
    """
    Processor for SkyworkR1V, implemented here because the model
    doesn't define its own HF processor.

    The code to insert image tokens is based on:
    https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py#L252
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        """
        Record the tiling settings, reading any that are left as `None`
        from the same-named attribute on `config`.
        """
        super().__init__()

        self.config = config
        self.tokenizer = tokenizer

        image_size: int = config.vision_config.image_size
        patch_size: int = config.vision_config.patch_size

        min_dynamic_patch = (
            config.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
        )
        assert isinstance(min_dynamic_patch, int)

        max_dynamic_patch = (
            config.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
        )
        assert isinstance(max_dynamic_patch, int)

        dynamic_image_size = (
            config.dynamic_image_size
            if dynamic_image_size is None
            else dynamic_image_size
        )
        assert isinstance(dynamic_image_size, bool)

        # Context tokens emitted per tile: patches along one side, squared,
        # scaled by the squared downsample ratio and truncated to an int.
        patches_per_side = image_size // patch_size
        self.num_image_token = int(
            patches_per_side**2 * (config.downsample_ratio**2)
        )
        self.image_size = image_size
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch
        self.dynamic_image_size = dynamic_image_size
        self.use_thumbnail: bool = config.use_thumbnail

    @property
    def image_token_id(self) -> int:
        """Vocabulary id of the `IMG_CONTEXT` token."""
        vocab = self.tokenizer.get_vocab()
        return vocab[IMG_CONTEXT]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """
        Build the text that replaces one `<image>` placeholder:
        `IMG_START`, then `feature_size` copies of `IMG_CONTEXT`, then
        `IMG_END`, with the `IMG_CONTEXT` tokens selected.

        `num_patches` is accepted but not used here.
        """
        context_run = IMG_CONTEXT * feature_size
        return PromptUpdateDetails.select_text(
            f"{IMG_START}{context_run}{IMG_END}",
            IMG_CONTEXT,
        )

    def resolve_min_max_num(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> tuple[int, int]:
        """
        Resolve the `(min, max)` tile bounds, using this processor's own
        setting for every argument left as `None`.
        """
        if min_dynamic_patch is None:
            min_dynamic_patch = self.min_dynamic_patch
        if max_dynamic_patch is None:
            max_dynamic_patch = self.max_dynamic_patch
        if dynamic_image_size is None:
            dynamic_image_size = self.dynamic_image_size
        if use_thumbnail is None:
            use_thumbnail = self.use_thumbnail

        return resolve_skyworkr1v_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )

    def resolve_target_ratios(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
    ) -> list[tuple[int, int]]:
        """Return the candidate tiling grids for the resolved tile bounds."""
        lower, upper = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )
        return get_skyworkr1v_target_ratios(lower, upper)

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        """
        Number of context tokens an image of the given size expands to:
        its tile count (thumbnail included when enabled) multiplied by
        `num_image_token`.
        """
        # The thumbnail is accounted for by calculate_skyworkr1v_targets,
        # so it must not also widen the set of candidate grids.
        candidate_grids = self.resolve_target_ratios(use_thumbnail=False)

        tile_total = calculate_skyworkr1v_targets(
            orig_width=image_width,
            orig_height=image_height,
            image_size=self.image_size,
            target_ratios=candidate_grids,
            use_thumbnail=self.use_thumbnail,
        )[0]

        return tile_total * self.num_image_token

    def _images_to_pixel_values_lst(
        self,
        images: list[Image.Image],
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> list[torch.Tensor]:
        """Convert each image into its own stacked tensor of tiles."""
        # The thumbnail is appended inside image_to_pixel_values_skyworkr1v,
        # so the bounds are resolved without it.
        lower, upper = self.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=False,
        )

        per_image_tiles = []
        for picture in images:
            per_image_tiles.append(
                image_to_pixel_values_skyworkr1v(
                    picture,
                    input_size=self.image_size,
                    min_num=lower,
                    max_num=upper,
                    use_thumbnail=self.use_thumbnail,
                )
            )
        return per_image_tiles

    def __call__(
        self,
        text: str | list[str] | None = None,
        images: Image.Image | list[Image.Image] | None = None,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        return_tensors: str | TensorType | None = None,
    ) -> BatchFeature:
        """
        Tokenize `text` and preprocess `images` into one `BatchFeature`.

        When images are given, the result additionally carries
        `pixel_values_flat` (all tiles concatenated) and
        `image_num_patches` (tile count per image), and for every image
        the first remaining `<image>` placeholder in each text is replaced
        by that image's expanded token string.
        """
        # Normalize both inputs to lists.
        if text is None:
            prompts = []
        elif isinstance(text, list):
            prompts = text
        else:
            prompts = [text]

        if images is None:
            image_list = []
        elif isinstance(images, list):
            image_list = images
        else:
            image_list = [images]

        image_inputs = {}
        if image_list:
            per_image_tiles = self._images_to_pixel_values_lst(
                image_list,
                min_dynamic_patch=min_dynamic_patch,
                max_dynamic_patch=max_dynamic_patch,
                dynamic_image_size=dynamic_image_size,
            )
            image_inputs = {
                "pixel_values_flat": torch.cat(per_image_tiles),
                "image_num_patches": torch.tensor(
                    [len(tiles) for tiles in per_image_tiles]
                ),
            }

            # Consume one "<image>" placeholder per image, in order.
            for tiles in per_image_tiles:
                tile_total = tiles.shape[0]
                repl = self.get_image_repl(
                    tile_total * self.num_image_token,
                    tile_total,
                )
                prompts = [
                    prompt.replace("<image>", repl.full, 1) for prompt in prompts
                ]

        text_inputs = self.tokenizer(prompts)

        return BatchFeature(
            {**text_inputs, **image_inputs},
            tensor_type=return_tensors,
        )