[2/3] Refactor InternVL-based processors (#37324)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -489,13 +489,14 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
self.image_size = self.vision_config.image_size
|
||||
|
||||
def __call__(self, text: str, images: Image | list[Image], **kwargs):
|
||||
from vllm.model_executor.models.h2ovl import (
|
||||
IMG_CONTEXT,
|
||||
IMG_END,
|
||||
IMG_START,
|
||||
from vllm.transformers_utils.processors.h2ovl import (
|
||||
image_to_pixel_values_h2ovl,
|
||||
)
|
||||
|
||||
IMG_START = "<img>"
|
||||
IMG_END = "</img>"
|
||||
IMG_CONTEXT = "<IMG_CONTEXT>"
|
||||
|
||||
images = [images] if isinstance(images, Image) else images
|
||||
pixel_values = [
|
||||
image_to_pixel_values_h2ovl(
|
||||
@@ -751,16 +752,17 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
self.image_size = self.vision_config.image_size
|
||||
|
||||
def __call__(self, text: str, images: Image | list[Image], **kwargs):
|
||||
from vllm.model_executor.models.skyworkr1v import (
|
||||
IMG_CONTEXT,
|
||||
IMG_END,
|
||||
IMG_START,
|
||||
image_to_pixel_values_skyworkr1v,
|
||||
from vllm.transformers_utils.processors.internvl import (
|
||||
image_to_pixel_values_internvl,
|
||||
)
|
||||
|
||||
IMG_START = "<img>"
|
||||
IMG_END = "</img>"
|
||||
IMG_CONTEXT = "<IMG_CONTEXT>"
|
||||
|
||||
images = [images] if isinstance(images, Image) else images
|
||||
pixel_values = [
|
||||
image_to_pixel_values_skyworkr1v(
|
||||
image_to_pixel_values_internvl(
|
||||
image,
|
||||
input_size=self.image_size,
|
||||
min_num=self.min_num,
|
||||
@@ -815,14 +817,15 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
videos: npt.NDArray | list[npt.NDArray] = None,
|
||||
**kwargs,
|
||||
):
|
||||
from vllm.model_executor.models.internvl import (
|
||||
IMG_CONTEXT,
|
||||
IMG_END,
|
||||
IMG_START,
|
||||
from vllm.transformers_utils.processors.internvl import (
|
||||
image_to_pixel_values_internvl,
|
||||
video_to_pixel_values_internvl,
|
||||
)
|
||||
|
||||
IMG_START = "<img>"
|
||||
IMG_END = "</img>"
|
||||
IMG_CONTEXT = "<IMG_CONTEXT>"
|
||||
|
||||
images = [images] if isinstance(images, Image) else images
|
||||
videos = [videos] if isinstance(videos, np.ndarray) else videos
|
||||
if images is not None:
|
||||
|
||||
@@ -779,7 +779,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
||||
"rednote-hilab/dots.ocr", trust_remote_code=True
|
||||
),
|
||||
"Eagle2_5_VLForConditionalGeneration": _HfExamplesInfo(
|
||||
"nvidia/Eagle2.5-8B", trust_remote_code=True, is_available_online=False
|
||||
"nvidia/Eagle2.5-8B",
|
||||
trust_remote_code=True,
|
||||
),
|
||||
"Emu3ForConditionalGeneration": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
|
||||
"Ernie4_5_VLMoeForConditionalGeneration": _HfExamplesInfo(
|
||||
|
||||
@@ -16,7 +16,10 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
|
||||
from vllm.model_executor.models.siglip import SiglipVisionModel
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.transformers_utils.processors.eagle2_5_vl import Eagle2_5_VLProcessor
|
||||
from vllm.transformers_utils.processors.internvl import (
|
||||
InternVLImageProcessor,
|
||||
InternVLProcessor,
|
||||
)
|
||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||
|
||||
from .interfaces import (
|
||||
@@ -68,12 +71,35 @@ Eagle2_5_VLImageInputs: TypeAlias = (
|
||||
class Eagle2_5_VLProcessingInfo(BaseInternVLProcessingInfo):
|
||||
"""Processing info for Eagle2.5-VL model."""
|
||||
|
||||
def get_hf_processor(self, **kwargs) -> Eagle2_5_VLProcessor:
|
||||
return self.ctx.init_processor(
|
||||
Eagle2_5_VLProcessor,
|
||||
config=self.ctx.get_hf_config(),
|
||||
def get_image_processor(self, **kwargs):
|
||||
config = self.get_hf_config()
|
||||
vision_config = config.vision_config
|
||||
|
||||
kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
|
||||
kwargs.setdefault(
|
||||
"image_size", config.force_image_size or vision_config.image_size
|
||||
)
|
||||
kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch)
|
||||
kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch)
|
||||
kwargs.setdefault("dynamic_image_size", config.dynamic_image_size)
|
||||
kwargs.setdefault("use_thumbnail", config.use_thumbnail)
|
||||
|
||||
return InternVLImageProcessor(**kwargs)
|
||||
|
||||
def get_hf_processor(self, **kwargs) -> InternVLProcessor:
|
||||
config = self.get_hf_config()
|
||||
vision_config = config.vision_config
|
||||
|
||||
image_processor = self.get_image_processor(**kwargs)
|
||||
image_size = image_processor.image_size
|
||||
patch_size = vision_config.patch_size
|
||||
downsample_ratio = config.downsample_ratio
|
||||
image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
|
||||
|
||||
return InternVLProcessor(
|
||||
tokenizer=self.get_tokenizer(),
|
||||
**kwargs,
|
||||
image_processor=image_processor,
|
||||
image_seq_length=image_seq_length,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -395,13 +395,13 @@ class GLM4VProcessingInfo(BaseProcessingInfo):
|
||||
vision_config = config.vision_config
|
||||
|
||||
image_size = vision_config["image_size"]
|
||||
kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
|
||||
kwargs.setdefault("size", {"width": image_size, "height": image_size})
|
||||
|
||||
return GLM4VImageProcessorFast(**kwargs)
|
||||
|
||||
def get_hf_processor(self, **kwargs: object) -> GLM4VProcessor:
|
||||
return self.ctx.init_processor(
|
||||
GLM4VProcessor,
|
||||
return GLM4VProcessor(
|
||||
tokenizer=self.get_tokenizer(),
|
||||
image_processor=self.get_image_processor(**kwargs),
|
||||
)
|
||||
|
||||
@@ -28,7 +28,7 @@ from vllm.multimodal.processing.processor import (
|
||||
PromptUpdate,
|
||||
TimingContext,
|
||||
)
|
||||
from vllm.transformers_utils.processors.h2ovl import H2OVLProcessor
|
||||
from vllm.transformers_utils.processors.h2ovl import H2OVLImageProcessor, H2OVLProcessor
|
||||
|
||||
from .intern_vit import InternVisionModel
|
||||
from .internvl import (
|
||||
@@ -40,12 +40,34 @@ from .internvl import (
|
||||
|
||||
|
||||
class H2OVLProcessingInfo(BaseInternVLProcessingInfo):
|
||||
def get_image_processor(self, **kwargs):
|
||||
config = self.get_hf_config()
|
||||
vision_config = config.vision_config
|
||||
|
||||
kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
|
||||
kwargs.setdefault("image_size", vision_config.image_size)
|
||||
kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch)
|
||||
kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch)
|
||||
kwargs.setdefault("dynamic_image_size", config.dynamic_image_size)
|
||||
kwargs.setdefault("use_thumbnail", config.use_thumbnail)
|
||||
kwargs.setdefault("use_msac", config.use_msac)
|
||||
|
||||
return H2OVLImageProcessor(**kwargs)
|
||||
|
||||
def get_hf_processor(self, **kwargs: object) -> H2OVLProcessor:
|
||||
return self.ctx.init_processor(
|
||||
H2OVLProcessor,
|
||||
config=self.get_hf_config(),
|
||||
config = self.get_hf_config()
|
||||
vision_config = config.vision_config
|
||||
|
||||
image_processor = self.get_image_processor(**kwargs)
|
||||
image_size = image_processor.image_size
|
||||
patch_size = vision_config.patch_size
|
||||
downsample_ratio = config.downsample_ratio
|
||||
image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
|
||||
|
||||
return H2OVLProcessor(
|
||||
tokenizer=self.get_tokenizer(),
|
||||
**kwargs,
|
||||
image_processor=image_processor,
|
||||
image_seq_length=image_seq_length,
|
||||
)
|
||||
|
||||
def get_num_image_tokens(
|
||||
@@ -106,7 +128,7 @@ class H2OVLMultiModalProcessor(BaseInternVLMultiModalProcessor[H2OVLProcessingIn
|
||||
if num_patches is not None:
|
||||
assert isinstance(num_patches, int)
|
||||
|
||||
return hf_processor.get_image_repl(feature_size, num_patches)
|
||||
return hf_processor.get_image_repl(num_patches, num_features=feature_size)
|
||||
|
||||
return [
|
||||
PromptReplacement(
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
# --------------------------------------------------------
|
||||
from abc import abstractmethod
|
||||
from collections.abc import Iterable, Mapping, Sequence
|
||||
from functools import cached_property
|
||||
from typing import Annotated, Literal, TypeAlias, TypeVar
|
||||
|
||||
import torch
|
||||
@@ -45,8 +46,9 @@ from vllm.multimodal.processing import (
|
||||
)
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.transformers_utils.processors.internvl import (
|
||||
BaseInternVLProcessor,
|
||||
InternVLImageProcessor,
|
||||
InternVLProcessor,
|
||||
InternVLVideoProcessor,
|
||||
)
|
||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||
|
||||
@@ -123,7 +125,7 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
|
||||
"""Basic image-only ProcessingInfo for InternVL-style models."""
|
||||
|
||||
@abstractmethod
|
||||
def get_hf_processor(self, **kwargs: object) -> BaseInternVLProcessor:
|
||||
def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
|
||||
raise NotImplementedError
|
||||
|
||||
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
|
||||
@@ -134,7 +136,7 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
|
||||
*,
|
||||
image_width: int,
|
||||
image_height: int,
|
||||
processor: BaseInternVLProcessor,
|
||||
processor: InternVLProcessor,
|
||||
) -> int:
|
||||
return processor.get_num_image_tokens(
|
||||
image_width=image_width,
|
||||
@@ -143,8 +145,9 @@ class BaseInternVLProcessingInfo(BaseProcessingInfo):
|
||||
|
||||
def get_image_size_with_most_features(self) -> ImageSize:
|
||||
processor = self.get_hf_processor()
|
||||
image_processor = processor.image_processor
|
||||
|
||||
base_size = processor.image_size
|
||||
base_size = image_processor.image_size
|
||||
target_ratios = processor.resolve_target_ratios()
|
||||
|
||||
largest_feature_size, largest_feature_pinpoint = 0, None
|
||||
@@ -226,7 +229,7 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
|
||||
)
|
||||
|
||||
hf_processor = self.info.get_hf_processor(**mm_kwargs)
|
||||
image_token_id = hf_processor.image_token_id
|
||||
image_token_id = hf_processor.ctx_image_token_id
|
||||
|
||||
# Since there may be extra tokens in the feature placeholders,
|
||||
# we need to pass the image token ID to the model to select the
|
||||
@@ -291,7 +294,7 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
|
||||
if num_patches is not None:
|
||||
assert isinstance(num_patches, int)
|
||||
|
||||
return hf_processor.get_image_repl(feature_size, num_patches)
|
||||
return hf_processor.get_image_repl(num_patches, num_features=feature_size)
|
||||
|
||||
return [
|
||||
PromptReplacement(
|
||||
@@ -305,23 +308,73 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
|
||||
class InternVLProcessingInfo(BaseInternVLProcessingInfo):
|
||||
"""InternVL ProcessingInfo extended for video processing"""
|
||||
|
||||
@property
|
||||
def supports_video(self):
|
||||
return self.get_hf_processor().supports_video
|
||||
def get_image_processor(self, **kwargs):
|
||||
config = self.get_hf_config()
|
||||
vision_config = config.vision_config
|
||||
|
||||
def get_supported_mm_limits(self):
|
||||
video_limit = {"video": None} if self.supports_video else {}
|
||||
return {**super().get_supported_mm_limits(), **video_limit}
|
||||
kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
|
||||
kwargs.setdefault("image_size", vision_config.image_size)
|
||||
kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch)
|
||||
kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch)
|
||||
kwargs.setdefault("dynamic_image_size", config.dynamic_image_size)
|
||||
kwargs.setdefault("use_thumbnail", config.use_thumbnail)
|
||||
|
||||
def get_video_token(self) -> str | None:
|
||||
return InternVLImageProcessor(**kwargs)
|
||||
|
||||
def get_video_processor(self, **kwargs):
|
||||
config = self.get_hf_config()
|
||||
vision_config = config.vision_config
|
||||
|
||||
kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
|
||||
kwargs.setdefault("image_size", vision_config.image_size)
|
||||
|
||||
return InternVLVideoProcessor(**kwargs)
|
||||
|
||||
@cached_property
|
||||
def ctx_video_token(self):
|
||||
text_model_type = self.get_hf_config().get_text_config().model_type
|
||||
video_token_map = {
|
||||
ctx_video_token_map = {
|
||||
"qwen2": "<|video_pad|>",
|
||||
"qwen3": "<|video_pad|>",
|
||||
"qwen3_moe": "<|video_pad|>",
|
||||
"gpt_oss": "<|reserved_200000|>",
|
||||
}
|
||||
return video_token_map.get(text_model_type)
|
||||
|
||||
if text_model_type not in ctx_video_token_map:
|
||||
return None
|
||||
|
||||
ctx_video_token = ctx_video_token_map[text_model_type]
|
||||
if ctx_video_token not in self.get_tokenizer().get_vocab():
|
||||
return None
|
||||
|
||||
return ctx_video_token
|
||||
|
||||
def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
|
||||
config = self.get_hf_config()
|
||||
vision_config = config.vision_config
|
||||
|
||||
image_processor = self.get_image_processor(**kwargs)
|
||||
image_size = image_processor.image_size
|
||||
patch_size = vision_config.patch_size
|
||||
downsample_ratio = config.downsample_ratio
|
||||
image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
|
||||
|
||||
ctx_video_token = self.ctx_video_token
|
||||
video_processor = (
|
||||
self.get_video_processor(**kwargs) if ctx_video_token else None
|
||||
)
|
||||
|
||||
return InternVLProcessor(
|
||||
tokenizer=self.get_tokenizer(),
|
||||
image_processor=image_processor,
|
||||
video_processor=video_processor,
|
||||
image_seq_length=image_seq_length,
|
||||
ctx_video_token=ctx_video_token,
|
||||
)
|
||||
|
||||
def get_supported_mm_limits(self):
|
||||
video_limit = {"video": None} if self.ctx_video_token else {}
|
||||
return {**super().get_supported_mm_limits(), **video_limit}
|
||||
|
||||
def get_num_frames_with_most_features(
|
||||
self,
|
||||
@@ -332,22 +385,14 @@ class InternVLProcessingInfo(BaseInternVLProcessingInfo):
|
||||
max_videos = mm_counts.get("video", 0)
|
||||
|
||||
processor = self.get_hf_processor()
|
||||
num_image_token = processor.image_seq_length
|
||||
|
||||
max_image_tokens = self.get_max_image_tokens() * max_images
|
||||
max_total_frames = (seq_len - max_image_tokens) // processor.num_image_token
|
||||
max_total_frames = (seq_len - max_image_tokens) // num_image_token
|
||||
max_frames_per_video = max_total_frames // max(max_videos, 1)
|
||||
|
||||
return max(max_frames_per_video, 1)
|
||||
|
||||
def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
|
||||
return self.ctx.init_processor(
|
||||
InternVLProcessor,
|
||||
config=self.get_hf_config(),
|
||||
tokenizer=self.get_tokenizer(),
|
||||
video_token=self.get_video_token(),
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
class InternVLDummyInputsBuilder(
|
||||
BaseInternVLDummyInputsBuilder[InternVLProcessingInfo]
|
||||
@@ -366,7 +411,7 @@ class InternVLDummyInputsBuilder(
|
||||
mm_options: Mapping[str, BaseDummyOptions],
|
||||
) -> MultiModalDataDict:
|
||||
dummy_image = super().get_dummy_mm_data(seq_len, mm_counts, mm_options)
|
||||
if self.info.supports_video:
|
||||
if self.info.ctx_video_token:
|
||||
config = self.info.get_hf_config()
|
||||
image_size: int = config.vision_config.image_size
|
||||
target_num_frames = self.info.get_num_frames_with_most_features(
|
||||
@@ -405,11 +450,9 @@ class InternVLMultiModalProcessor(
|
||||
)
|
||||
|
||||
hf_processor = self.info.get_hf_processor(**mm_kwargs)
|
||||
if (
|
||||
self.info.supports_video
|
||||
and (video_token_id := hf_processor.video_token_id) is not None
|
||||
):
|
||||
if (video_token_id := hf_processor.ctx_video_token_id) is not None:
|
||||
processed_outputs["video_token_id"] = torch.tensor(video_token_id)
|
||||
|
||||
return processed_outputs
|
||||
|
||||
def _get_mm_fields_config(
|
||||
@@ -418,7 +461,7 @@ class InternVLMultiModalProcessor(
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
) -> Mapping[str, MultiModalFieldConfig]:
|
||||
image_fields = super()._get_mm_fields_config(hf_inputs, hf_processor_mm_kwargs)
|
||||
if self.info.supports_video:
|
||||
if self.info.ctx_video_token:
|
||||
video_num_patches = hf_inputs.get("video_num_patches", torch.empty(0))
|
||||
num_videos = len(video_num_patches)
|
||||
video_fields = dict(
|
||||
@@ -444,6 +487,8 @@ class InternVLMultiModalProcessor(
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
out_mm_kwargs=out_mm_kwargs,
|
||||
)
|
||||
if self.info.ctx_video_token is None:
|
||||
return prompt_repl
|
||||
|
||||
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
|
||||
|
||||
@@ -456,26 +501,20 @@ class InternVLMultiModalProcessor(
|
||||
video_num_patches = []
|
||||
|
||||
def get_video_replacement_internvl(item_idx: int):
|
||||
feature_size = hf_processor.num_image_token
|
||||
num_patches = video_num_patches[item_idx]
|
||||
if num_patches is not None:
|
||||
assert isinstance(num_patches, int)
|
||||
|
||||
return hf_processor.get_video_repl(
|
||||
feature_size, num_patches, video_context_token=hf_processor.video_token
|
||||
)
|
||||
return hf_processor.get_video_repl(num_patches)
|
||||
|
||||
if self.info.supports_video:
|
||||
prompt_repl = [
|
||||
*prompt_repl,
|
||||
PromptReplacement(
|
||||
modality="video",
|
||||
target="<video>",
|
||||
replacement=get_video_replacement_internvl,
|
||||
),
|
||||
]
|
||||
|
||||
return prompt_repl
|
||||
return [
|
||||
*prompt_repl,
|
||||
PromptReplacement(
|
||||
modality="video",
|
||||
target="<video>",
|
||||
replacement=get_video_replacement_internvl,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@MULTIMODAL_REGISTRY.register_processor(
|
||||
|
||||
@@ -26,8 +26,10 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.transformers_utils.processor import cached_image_processor_from_config
|
||||
from vllm.transformers_utils.processors.nemotron_vl import (
|
||||
LlamaNemotronNanoVLImageProcessor,
|
||||
LlamaNemotronNanoVLProcessor,
|
||||
LlamaNemotronVLEmbedImageProcessor,
|
||||
LlamaNemotronVLEmbedProcessor,
|
||||
NemotronVLProcessor,
|
||||
)
|
||||
from vllm.transformers_utils.repo_utils import get_hf_file_to_dict
|
||||
|
||||
@@ -50,19 +52,34 @@ from .utils import (
|
||||
class NemotronVLProcessingInfo(BaseInternVLProcessingInfo):
|
||||
"""Processing info for Nemotron VL models."""
|
||||
|
||||
def get_hf_processor(self, **kwargs: object) -> NemotronVLProcessor:
|
||||
return self.ctx.init_processor(
|
||||
NemotronVLProcessor,
|
||||
config=self.get_hf_config(),
|
||||
tokenizer=self.get_tokenizer(),
|
||||
image_processor=self.get_image_processor(),
|
||||
**kwargs,
|
||||
def get_image_processor(self, **kwargs: object):
|
||||
kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
|
||||
orig_processor = cached_image_processor_from_config(
|
||||
self.ctx.model_config, **kwargs
|
||||
)
|
||||
|
||||
def get_image_processor(self, **kwargs: object):
|
||||
return cached_image_processor_from_config(
|
||||
self.ctx.model_config,
|
||||
**kwargs,
|
||||
return LlamaNemotronNanoVLImageProcessor(
|
||||
image_size=orig_processor.image_size,
|
||||
min_dynamic_patch=1,
|
||||
max_dynamic_patch=orig_processor.max_num_tiles,
|
||||
dynamic_image_size=True,
|
||||
use_thumbnail=orig_processor.use_thumbnail,
|
||||
)
|
||||
|
||||
def get_hf_processor(self, **kwargs: object) -> LlamaNemotronNanoVLProcessor:
|
||||
config = self.get_hf_config()
|
||||
vision_config = config.vision_config
|
||||
|
||||
image_processor = self.get_image_processor(**kwargs)
|
||||
image_size = image_processor.image_size
|
||||
patch_size = vision_config.patch_size
|
||||
downsample_ratio = config.downsample_ratio
|
||||
image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
|
||||
|
||||
return LlamaNemotronNanoVLProcessor(
|
||||
tokenizer=self.get_tokenizer(),
|
||||
image_processor=image_processor,
|
||||
image_seq_length=image_seq_length,
|
||||
)
|
||||
|
||||
|
||||
@@ -386,29 +403,58 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, Suppor
|
||||
# --------------------------------------------------------
|
||||
|
||||
|
||||
class LlamaNemotronVLEmbedProcessingInfo(NemotronVLProcessingInfo):
|
||||
class LlamaNemotronVLEmbedProcessingInfo(BaseInternVLProcessingInfo):
|
||||
"""Processing info for LlamaNemotronVL embedding model."""
|
||||
|
||||
def get_hf_processor(self, **kwargs: object) -> LlamaNemotronVLEmbedProcessor:
|
||||
"""Override to create embedding-specific processor without image_processor."""
|
||||
def get_image_processor(self, **kwargs):
|
||||
model_config = self.ctx.model_config
|
||||
processor_config = {}
|
||||
if model_config.model is not None:
|
||||
processor_config = (
|
||||
get_hf_file_to_dict(
|
||||
"processor_config.json",
|
||||
model_config.model,
|
||||
model_config.revision,
|
||||
)
|
||||
or {}
|
||||
)
|
||||
|
||||
return self.ctx.init_processor(
|
||||
LlamaNemotronVLEmbedProcessor,
|
||||
config=self.get_hf_config(),
|
||||
config = self.get_hf_config()
|
||||
processor_config = (
|
||||
get_hf_file_to_dict(
|
||||
"processor_config.json",
|
||||
model_config.model,
|
||||
model_config.revision,
|
||||
)
|
||||
or {}
|
||||
)
|
||||
|
||||
min_dynamic_patch = processor_config.get(
|
||||
"min_input_tiles",
|
||||
getattr(config, "min_dynamic_patch", 1),
|
||||
)
|
||||
max_dynamic_patch = processor_config.get(
|
||||
"max_input_tiles",
|
||||
getattr(config, "max_dynamic_patch", 1),
|
||||
)
|
||||
dynamic_image_size = processor_config.get(
|
||||
"dynamic_image_size",
|
||||
getattr(config, "dynamic_image_size", True),
|
||||
)
|
||||
|
||||
kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
|
||||
kwargs.setdefault("image_size", config.force_image_size)
|
||||
kwargs.setdefault("min_dynamic_patch", min_dynamic_patch)
|
||||
kwargs.setdefault("max_dynamic_patch", max_dynamic_patch)
|
||||
kwargs.setdefault("dynamic_image_size", dynamic_image_size)
|
||||
kwargs.setdefault("use_thumbnail", True)
|
||||
|
||||
return LlamaNemotronVLEmbedImageProcessor(**kwargs)
|
||||
|
||||
def get_hf_processor(self, **kwargs: object) -> LlamaNemotronVLEmbedProcessor:
|
||||
config = self.get_hf_config()
|
||||
vision_config = config.vision_config
|
||||
|
||||
image_processor = self.get_image_processor(**kwargs)
|
||||
image_size = image_processor.image_size
|
||||
patch_size = vision_config.patch_size
|
||||
downsample_ratio = config.downsample_ratio
|
||||
image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
|
||||
|
||||
return LlamaNemotronVLEmbedProcessor(
|
||||
tokenizer=self.get_tokenizer(),
|
||||
processor_config=processor_config,
|
||||
**kwargs,
|
||||
image_processor=image_processor,
|
||||
image_seq_length=image_seq_length,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -27,7 +27,8 @@ from vllm.multimodal.processing import (
|
||||
PromptUpdate,
|
||||
PromptUpdateDetails,
|
||||
)
|
||||
from vllm.transformers_utils.processors.nvlm_d import IMG_PAD, NVLMProcessor
|
||||
from vllm.transformers_utils.processors.internvl import InternVLImageProcessor
|
||||
from vllm.transformers_utils.processors.nvlm_d import NVLMProcessor
|
||||
|
||||
from .intern_vit import InternVisionModel
|
||||
from .internvl import (
|
||||
@@ -39,12 +40,33 @@ from .internvl import (
|
||||
|
||||
|
||||
class NVLMProcessingInfo(BaseInternVLProcessingInfo):
|
||||
def get_image_processor(self, **kwargs):
|
||||
config = self.get_hf_config()
|
||||
vision_config = config.vision_config
|
||||
|
||||
kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
|
||||
kwargs.setdefault("image_size", vision_config.image_size)
|
||||
kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch)
|
||||
kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch)
|
||||
kwargs.setdefault("dynamic_image_size", config.dynamic_image_size)
|
||||
kwargs.setdefault("use_thumbnail", config.use_thumbnail)
|
||||
|
||||
return InternVLImageProcessor(**kwargs)
|
||||
|
||||
def get_hf_processor(self, **kwargs: object) -> NVLMProcessor:
|
||||
return self.ctx.init_processor(
|
||||
NVLMProcessor,
|
||||
config=self.get_hf_config(),
|
||||
config = self.get_hf_config()
|
||||
vision_config = config.vision_config
|
||||
|
||||
image_processor = self.get_image_processor(**kwargs)
|
||||
image_size = image_processor.image_size
|
||||
patch_size = vision_config.patch_size
|
||||
downsample_ratio = config.downsample_ratio
|
||||
image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
|
||||
|
||||
return NVLMProcessor(
|
||||
tokenizer=self.get_tokenizer(),
|
||||
**kwargs,
|
||||
image_processor=image_processor,
|
||||
image_seq_length=image_seq_length,
|
||||
)
|
||||
|
||||
|
||||
@@ -117,9 +139,11 @@ class NVLMMultiModalProcessor(BaseInternVLMultiModalProcessor[NVLMProcessingInfo
|
||||
if num_patches is not None:
|
||||
assert isinstance(num_patches, int)
|
||||
|
||||
repl = hf_processor.get_image_repl(feature_size, num_patches)
|
||||
repl = hf_processor.get_image_repl(num_patches, num_features=feature_size)
|
||||
|
||||
return PromptUpdateDetails.select_text(repl.full + "\n", IMG_PAD)
|
||||
return PromptUpdateDetails.select_text(
|
||||
repl.full + "\n", hf_processor.ctx_image_token
|
||||
)
|
||||
|
||||
# See note in dummy data regarding why we have the extra newline
|
||||
return [
|
||||
|
||||
@@ -440,13 +440,13 @@ class QwenVLProcessingInfo(BaseProcessingInfo):
|
||||
vision_config = config.visual
|
||||
|
||||
image_size = vision_config["image_size"]
|
||||
kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
|
||||
kwargs.setdefault("size", {"width": image_size, "height": image_size})
|
||||
|
||||
return QwenVLImageProcessorFast(**kwargs)
|
||||
|
||||
def get_hf_processor(self, **kwargs: object) -> QwenVLProcessor:
|
||||
return self.ctx.init_processor(
|
||||
QwenVLProcessor,
|
||||
return QwenVLProcessor(
|
||||
tokenizer=self.get_tokenizer(),
|
||||
image_processor=self.get_image_processor(**kwargs),
|
||||
)
|
||||
|
||||
@@ -43,7 +43,10 @@ from vllm.multimodal.processing import (
|
||||
PromptUpdate,
|
||||
)
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.transformers_utils.processors.skyworkr1v import SkyworkR1VProcessor
|
||||
from vllm.transformers_utils.processors.internvl import (
|
||||
InternVLImageProcessor,
|
||||
InternVLProcessor,
|
||||
)
|
||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||
|
||||
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
|
||||
@@ -96,12 +99,33 @@ SkyworkR1VImageInputs: TypeAlias = (
|
||||
|
||||
|
||||
class SkyworkR1VProcessingInfo(BaseProcessingInfo):
|
||||
def get_hf_processor(self, **kwargs: object) -> SkyworkR1VProcessor:
|
||||
return self.ctx.init_processor(
|
||||
SkyworkR1VProcessor,
|
||||
config=self.get_hf_config(),
|
||||
def get_image_processor(self, **kwargs):
|
||||
config = self.get_hf_config()
|
||||
vision_config = config.vision_config
|
||||
|
||||
kwargs = self.ctx.get_merged_mm_kwargs(kwargs)
|
||||
kwargs.setdefault("image_size", vision_config.image_size)
|
||||
kwargs.setdefault("min_dynamic_patch", config.min_dynamic_patch)
|
||||
kwargs.setdefault("max_dynamic_patch", config.max_dynamic_patch)
|
||||
kwargs.setdefault("dynamic_image_size", config.dynamic_image_size)
|
||||
kwargs.setdefault("use_thumbnail", config.use_thumbnail)
|
||||
|
||||
return InternVLImageProcessor(**kwargs)
|
||||
|
||||
def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
|
||||
config = self.get_hf_config()
|
||||
vision_config = config.vision_config
|
||||
|
||||
image_processor = self.get_image_processor(**kwargs)
|
||||
image_size = image_processor.image_size
|
||||
patch_size = vision_config.patch_size
|
||||
downsample_ratio = config.downsample_ratio
|
||||
image_seq_length = int((image_size // patch_size) ** 2 * (downsample_ratio**2))
|
||||
|
||||
return InternVLProcessor(
|
||||
tokenizer=self.get_tokenizer(),
|
||||
**kwargs,
|
||||
image_processor=image_processor,
|
||||
image_seq_length=image_seq_length,
|
||||
)
|
||||
|
||||
def get_supported_mm_limits(self) -> Mapping[str, int | None]:
|
||||
@@ -112,7 +136,7 @@ class SkyworkR1VProcessingInfo(BaseProcessingInfo):
|
||||
*,
|
||||
image_width: int,
|
||||
image_height: int,
|
||||
processor: SkyworkR1VProcessor,
|
||||
processor: InternVLProcessor,
|
||||
) -> int:
|
||||
return processor.get_num_image_tokens(
|
||||
image_width=image_width,
|
||||
@@ -121,8 +145,9 @@ class SkyworkR1VProcessingInfo(BaseProcessingInfo):
|
||||
|
||||
def get_image_size_with_most_features(self) -> ImageSize:
|
||||
processor = self.get_hf_processor()
|
||||
image_processor = processor.image_processor
|
||||
|
||||
base_size = processor.image_size
|
||||
base_size = image_processor.image_size
|
||||
target_ratios = processor.resolve_target_ratios()
|
||||
|
||||
largest_feature_size, largest_feature_pinpoint = 0, None
|
||||
@@ -187,7 +212,7 @@ class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[SkyworkR1VProcessing
|
||||
)
|
||||
|
||||
hf_processor = self.info.get_hf_processor(**mm_kwargs)
|
||||
image_token_id = hf_processor.image_token_id
|
||||
image_token_id = hf_processor.ctx_image_token_id
|
||||
|
||||
# Since there may be extra tokens in the feature placeholders,
|
||||
# we need to pass the image token ID to the model to select the
|
||||
@@ -252,7 +277,7 @@ class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[SkyworkR1VProcessing
|
||||
if num_patches is not None:
|
||||
assert isinstance(num_patches, int)
|
||||
|
||||
return hf_processor.get_image_repl(feature_size, num_patches)
|
||||
return hf_processor.get_image_repl(num_patches, num_features=feature_size)
|
||||
|
||||
return [
|
||||
PromptReplacement(
|
||||
|
||||
@@ -14,7 +14,6 @@ __all__ = [
|
||||
"BagelProcessor",
|
||||
"CohereASRProcessor",
|
||||
"DeepseekVLV2Processor",
|
||||
"Eagle2_5_VLProcessor",
|
||||
"FireRedASR2Processor",
|
||||
"FunASRProcessor",
|
||||
"GLM4VProcessor",
|
||||
@@ -34,14 +33,12 @@ __all__ = [
|
||||
"Ovis2_5Processor",
|
||||
"QwenVLProcessor",
|
||||
"Qwen3ASRProcessor",
|
||||
"SkyworkR1VProcessor",
|
||||
]
|
||||
|
||||
_CLASS_TO_MODULE: dict[str, str] = {
|
||||
"BagelProcessor": "vllm.transformers_utils.processors.bagel",
|
||||
"CohereASRProcessor": "vllm.transformers_utils.processors.cohere_asr",
|
||||
"DeepseekVLV2Processor": "vllm.transformers_utils.processors.deepseek_vl2",
|
||||
"Eagle2_5_VLProcessor": "vllm.transformers_utils.processors.eagle2_5_vl",
|
||||
"FireRedASR2Processor": "vllm.transformers_utils.processors.fireredasr2",
|
||||
"FunASRProcessor": "vllm.transformers_utils.processors.funasr",
|
||||
"GLM4VProcessor": "vllm.transformers_utils.processors.glm4v",
|
||||
@@ -61,7 +58,6 @@ _CLASS_TO_MODULE: dict[str, str] = {
|
||||
"Ovis2_5Processor": "vllm.transformers_utils.processors.ovis2_5",
|
||||
"QwenVLProcessor": "vllm.transformers_utils.processors.qwen_vl",
|
||||
"Qwen3ASRProcessor": "vllm.transformers_utils.processors.qwen3_asr",
|
||||
"SkyworkR1VProcessor": "vllm.transformers_utils.processors.skyworkr1v",
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -1,85 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# Adapted from NVIDIA Eagle2.5-VL model
|
||||
# https://huggingface.co/nvidia/Eagle2.5-8B
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
from vllm.multimodal.processing import PromptUpdateDetails
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
|
||||
from .internvl import IMG_CONTEXT, IMG_END, IMG_START, BaseInternVLProcessor
|
||||
|
||||
|
||||
class Eagle2_5_VLProcessor(BaseInternVLProcessor):
    """
    Processor for the Eagle2.5-VL model.

    Subclasses BaseInternVLProcessor, but assigns every attribute itself
    and looks up the image token ID in the config before falling back to
    the tokenizer vocabulary.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        """
        Initialize the processor attributes directly from `config`.

        Each keyword override is used when it is not None; otherwise the
        matching attribute of `config` is read, with a built-in default
        when the config does not define it.
        """
        # The parent initializer is deliberately not called (to avoid
        # config manipulation); all required attributes are set below.
        self.config = config
        self.tokenizer = tokenizer

        # A truthy `force_image_size` on the config replaces the vision
        # config's image size.
        image_size: int = config.vision_config.image_size
        forced_size = getattr(config, "force_image_size", None)
        if forced_size:
            image_size = forced_size

        patch_size: int = config.vision_config.patch_size
        downsample_ratio: float = getattr(config, "downsample_ratio", 0.5)

        # Number of context tokens emitted per image tile.
        patches_per_side = image_size // patch_size
        self.num_image_token = int(patches_per_side**2 * (downsample_ratio**2))
        self.image_size = image_size

        # Dynamic patch settings: explicit argument first, then config,
        # then the hard-coded default.
        if min_dynamic_patch is None:
            min_dynamic_patch = getattr(config, "min_dynamic_patch", 1)
        self.min_dynamic_patch = min_dynamic_patch

        if max_dynamic_patch is None:
            max_dynamic_patch = getattr(config, "max_dynamic_patch", 12)
        self.max_dynamic_patch = max_dynamic_patch

        if dynamic_image_size is None:
            dynamic_image_size = getattr(config, "dynamic_image_size", True)
        self.dynamic_image_size = dynamic_image_size

        self.use_thumbnail: bool = getattr(config, "use_thumbnail", True)

    @property
    def image_token_id(self) -> int:
        """
        Return the image token ID.

        Uses `config.image_token_index` when the config has it; otherwise
        looks up IMG_CONTEXT in the tokenizer vocabulary.

        Raises:
            ValueError: If IMG_CONTEXT is missing from the vocabulary.
        """
        if hasattr(self.config, "image_token_index"):
            return self.config.image_token_index

        # Fallback: resolve the context token through the tokenizer.
        vocab = self.tokenizer.get_vocab()
        if IMG_CONTEXT not in vocab:
            raise ValueError(f"Cannot find image token '{IMG_CONTEXT}' in vocabulary")
        return vocab[IMG_CONTEXT]

    def get_image_repl(
        self,
        feature_size: int,
        num_patches: int | None,
    ) -> PromptUpdateDetails[str]:
        """
        Build the prompt replacement text for one image.

        The text is IMG_CONTEXT repeated `feature_size` times, wrapped in
        IMG_START / IMG_END. `num_patches` is accepted but not used here.
        """
        context_run = IMG_CONTEXT * feature_size
        wrapped = IMG_START + context_run + IMG_END

        return PromptUpdateDetails.select_text(wrapped, IMG_CONTEXT)
|
||||
@@ -10,16 +10,12 @@
|
||||
# --------------------------------------------------------
|
||||
import torch
|
||||
from PIL import Image
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
from vllm.multimodal.processing import PromptUpdateDetails
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
from vllm.tokenizers.hf import HfTokenizer
|
||||
|
||||
from .internvl import (
|
||||
IMG_CONTEXT,
|
||||
IMG_END,
|
||||
IMG_START,
|
||||
BaseInternVLProcessor,
|
||||
InternVLImageProcessor,
|
||||
InternVLProcessor,
|
||||
build_transform,
|
||||
find_closest_aspect_ratio,
|
||||
get_internvl_target_ratios,
|
||||
@@ -217,45 +213,26 @@ def image_to_pixel_values_h2ovl(
|
||||
return pixel_values
|
||||
|
||||
|
||||
class H2OVLProcessor(BaseInternVLProcessor):
|
||||
class H2OVLImageProcessor(InternVLImageProcessor):
|
||||
def __init__(
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
tokenizer: TokenizerLike,
|
||||
*,
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
use_msac: bool | None = None,
|
||||
image_size: int,
|
||||
min_dynamic_patch: int,
|
||||
max_dynamic_patch: int,
|
||||
dynamic_image_size: bool,
|
||||
use_thumbnail: bool,
|
||||
use_msac: bool,
|
||||
) -> None:
|
||||
super().__init__(
|
||||
config,
|
||||
tokenizer,
|
||||
image_size=image_size,
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
use_thumbnail=use_thumbnail,
|
||||
)
|
||||
|
||||
if use_msac is None:
|
||||
use_msac = config.use_msac
|
||||
assert isinstance(use_msac, bool)
|
||||
|
||||
self.use_msac = use_msac
|
||||
|
||||
@property
|
||||
def image_token_id(self) -> int:
|
||||
return self.tokenizer.get_vocab()[IMG_CONTEXT]
|
||||
|
||||
def get_image_repl(
|
||||
self,
|
||||
feature_size: int,
|
||||
num_patches: int | None,
|
||||
) -> PromptUpdateDetails[str]:
|
||||
repl_features = IMG_CONTEXT * feature_size
|
||||
repl_full = IMG_START + repl_features + IMG_END
|
||||
|
||||
return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
|
||||
|
||||
def resolve_min_max_num(
|
||||
self,
|
||||
*,
|
||||
@@ -264,18 +241,14 @@ class H2OVLProcessor(BaseInternVLProcessor):
|
||||
dynamic_image_size: bool | None = None,
|
||||
use_thumbnail: bool | None = None,
|
||||
) -> tuple[int, int]:
|
||||
min_dynamic_patch = (
|
||||
self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
|
||||
)
|
||||
max_dynamic_patch = (
|
||||
self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
|
||||
)
|
||||
dynamic_image_size = (
|
||||
self.dynamic_image_size
|
||||
if dynamic_image_size is None
|
||||
else dynamic_image_size
|
||||
)
|
||||
use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
|
||||
if min_dynamic_patch is None:
|
||||
min_dynamic_patch = self.min_dynamic_patch
|
||||
if max_dynamic_patch is None:
|
||||
max_dynamic_patch = self.max_dynamic_patch
|
||||
if dynamic_image_size is None:
|
||||
dynamic_image_size = self.dynamic_image_size
|
||||
if use_thumbnail is None:
|
||||
use_thumbnail = self.use_thumbnail
|
||||
|
||||
return resolve_h2ovl_min_max_num(
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
@@ -284,83 +257,6 @@ class H2OVLProcessor(BaseInternVLProcessor):
|
||||
use_thumbnail=use_thumbnail,
|
||||
)
|
||||
|
||||
def resolve_target_ratios(
|
||||
self,
|
||||
*,
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
use_thumbnail: bool | None = None,
|
||||
prior_aspect_ratio: tuple[int, int] | None = None,
|
||||
override_min_num: int | None = None,
|
||||
) -> list[tuple[int, int]]:
|
||||
min_num, max_num = self.resolve_min_max_num(
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
use_thumbnail=use_thumbnail,
|
||||
)
|
||||
if override_min_num is not None:
|
||||
min_num = override_min_num
|
||||
|
||||
return get_h2ovl_target_ratios(
|
||||
min_num,
|
||||
max_num,
|
||||
prior_aspect_ratio=prior_aspect_ratio,
|
||||
)
|
||||
|
||||
def get_num_image_tokens(
|
||||
self,
|
||||
*,
|
||||
image_width: int,
|
||||
image_height: int,
|
||||
use_msac: bool | None = None,
|
||||
) -> int:
|
||||
use_msac = self.use_msac if use_msac is None else use_msac
|
||||
|
||||
use_thumbnail = self.use_thumbnail
|
||||
|
||||
if use_msac:
|
||||
target_ratios_1 = self.resolve_target_ratios(
|
||||
use_thumbnail=False, # Applied in calculate_targets
|
||||
override_min_num=1,
|
||||
)
|
||||
num_patches_1, _, _, aspect_ratio_1 = calculate_h2ovl_targets(
|
||||
orig_width=image_width,
|
||||
orig_height=image_height,
|
||||
image_size=self.image_size,
|
||||
target_ratios=target_ratios_1,
|
||||
use_thumbnail=True,
|
||||
)
|
||||
|
||||
target_ratios_2 = self.resolve_target_ratios(
|
||||
use_thumbnail=False, # Applied in calculate_targets
|
||||
prior_aspect_ratio=aspect_ratio_1,
|
||||
override_min_num=3,
|
||||
)
|
||||
num_patches_2, _, _, _ = calculate_h2ovl_targets(
|
||||
orig_width=image_width,
|
||||
orig_height=image_height,
|
||||
image_size=self.image_size,
|
||||
target_ratios=target_ratios_2,
|
||||
use_thumbnail=True,
|
||||
)
|
||||
|
||||
num_patches = num_patches_1 + num_patches_2 - 1
|
||||
else:
|
||||
target_ratios = self.resolve_target_ratios(
|
||||
use_thumbnail=False, # Applied in calculate_targets
|
||||
)
|
||||
num_patches, _, _, _ = calculate_h2ovl_targets(
|
||||
orig_width=image_width,
|
||||
orig_height=image_height,
|
||||
image_size=self.image_size,
|
||||
target_ratios=target_ratios,
|
||||
use_thumbnail=use_thumbnail,
|
||||
)
|
||||
|
||||
return num_patches * self.num_image_token
|
||||
|
||||
def _images_to_pixel_values_lst(
|
||||
self,
|
||||
images: list[Image.Image],
|
||||
@@ -388,3 +284,104 @@ class H2OVLProcessor(BaseInternVLProcessor):
|
||||
)
|
||||
for image in images
|
||||
]
|
||||
|
||||
|
||||
class H2OVLProcessor(InternVLProcessor):
    """
    InternVLProcessor variant that resolves target ratios and image token
    counts with the H2OVL helpers, including the optional MSAC two-pass
    computation.
    """

    def __init__(
        self,
        image_processor: H2OVLImageProcessor,
        tokenizer: HfTokenizer,
        *,
        image_seq_length: int,
        start_image_token: str = "<img>",
        end_image_token: str = "</img>",
        ctx_image_token: str = "<IMG_CONTEXT>",
    ) -> None:
        """Forward all arguments unchanged to the parent initializer."""
        super().__init__(
            image_processor=image_processor,
            tokenizer=tokenizer,
            image_seq_length=image_seq_length,
            start_image_token=start_image_token,
            end_image_token=end_image_token,
            ctx_image_token=ctx_image_token,
        )

        # Annotation only: records the narrower image processor type.
        self.image_processor: H2OVLImageProcessor

    def resolve_target_ratios(
        self,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
        use_thumbnail: bool | None = None,
        prior_aspect_ratio: tuple[int, int] | None = None,
        override_min_num: int | None = None,
    ) -> list[tuple[int, int]]:
        """
        Return the H2OVL target ratios for the given settings.

        The min/max bounds come from the image processor's
        `resolve_min_max_num`; a non-None `override_min_num` replaces the
        resolved minimum before the ratios are computed.
        """
        resolved_min, resolved_max = self.image_processor.resolve_min_max_num(
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
            use_thumbnail=use_thumbnail,
        )
        min_num = resolved_min if override_min_num is None else override_min_num

        return get_h2ovl_target_ratios(
            min_num,
            resolved_max,
            prior_aspect_ratio=prior_aspect_ratio,
        )

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
        use_msac: bool | None = None,
    ) -> int:
        """
        Return the number of image tokens for an image of the given size.

        The result is the patch count multiplied by `image_seq_length`.
        When `use_msac` is None the image processor's `use_msac` setting
        is used.
        """
        processor = self.image_processor
        msac_enabled = processor.use_msac if use_msac is None else use_msac

        thumbnail_setting = processor.use_thumbnail

        def _targets(ratios, *, thumbnail):
            # Shared call shape for every calculate_h2ovl_targets use below.
            return calculate_h2ovl_targets(
                orig_width=image_width,
                orig_height=image_height,
                image_size=processor.image_size,
                target_ratios=ratios,
                use_thumbnail=thumbnail,
            )

        if not msac_enabled:
            # Single pass; the thumbnail is applied in calculate_targets.
            ratios = self.resolve_target_ratios(use_thumbnail=False)
            patch_count, _, _, _ = _targets(ratios, thumbnail=thumbnail_setting)
            return patch_count * self.image_seq_length

        # First pass: minimum of 1, thumbnail always on.
        first_ratios = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
            override_min_num=1,
        )
        first_count, _, _, first_aspect = _targets(first_ratios, thumbnail=True)

        # Second pass: minimum of 3, seeded with the first pass's ratio.
        second_ratios = self.resolve_target_ratios(
            use_thumbnail=False,  # Applied in calculate_targets
            prior_aspect_ratio=first_aspect,
            override_min_num=3,
        )
        second_count, _, _, _ = _targets(second_ratios, thumbnail=True)

        # NOTE(review): the "- 1" presumably avoids counting a thumbnail
        # in both passes; confirm against calculate_h2ovl_targets.
        patch_count = first_count + second_count - 1
        return patch_count * self.image_seq_length
|
||||
|
||||
@@ -7,24 +7,17 @@
|
||||
# Copyright (c) 2023 OpenGVLab
|
||||
# Licensed under The MIT License [see LICENSE for details]
|
||||
# --------------------------------------------------------
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, TypeVar
|
||||
|
||||
import numpy.typing as npt
|
||||
import torch
|
||||
import torchvision.transforms as T
|
||||
from PIL import Image
|
||||
from transformers import BatchFeature, PretrainedConfig, TensorType
|
||||
from transformers import BatchFeature, TensorType
|
||||
from transformers.processing_utils import ProcessorMixin
|
||||
|
||||
from vllm.multimodal.image import convert_image_mode
|
||||
from vllm.multimodal.processing import PromptUpdateDetails
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
|
||||
_T = TypeVar("_T")
|
||||
|
||||
IMG_START = "<img>"
|
||||
IMG_END = "</img>"
|
||||
IMG_CONTEXT = "<IMG_CONTEXT>"
|
||||
from vllm.tokenizers.hf import HfTokenizer
|
||||
|
||||
IMAGENET_MEAN = (0.485, 0.456, 0.406)
|
||||
IMAGENET_STD = (0.229, 0.224, 0.225)
|
||||
@@ -33,7 +26,7 @@ IMAGENET_STD = (0.229, 0.224, 0.225)
|
||||
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
|
||||
def build_transform(input_size: int):
|
||||
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
|
||||
transform = T.Compose(
|
||||
return T.Compose(
|
||||
[
|
||||
T.Lambda(lambda img: convert_image_mode(img, "RGB")),
|
||||
T.Resize(
|
||||
@@ -43,7 +36,6 @@ def build_transform(input_size: int):
|
||||
T.Normalize(mean=MEAN, std=STD),
|
||||
]
|
||||
)
|
||||
return transform
|
||||
|
||||
|
||||
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
|
||||
@@ -223,65 +215,20 @@ def video_to_pixel_values_internvl(
|
||||
return pixel_values
|
||||
|
||||
|
||||
class BaseInternVLProcessor(ABC):
|
||||
"""
|
||||
This model doesn't define its own HF processor,
|
||||
so we implement our own one here.
|
||||
|
||||
The code to insert image tokens is based on:
|
||||
https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
|
||||
"""
|
||||
|
||||
class InternVLImageProcessor:
|
||||
def __init__(
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
tokenizer: TokenizerLike,
|
||||
*,
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
image_size: int,
|
||||
min_dynamic_patch: int,
|
||||
max_dynamic_patch: int,
|
||||
dynamic_image_size: bool,
|
||||
use_thumbnail: bool,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.config = config
|
||||
self.tokenizer = tokenizer
|
||||
|
||||
image_size: int = config.vision_config.image_size
|
||||
patch_size: int = config.vision_config.patch_size
|
||||
|
||||
if min_dynamic_patch is None:
|
||||
min_dynamic_patch = config.min_dynamic_patch
|
||||
assert isinstance(min_dynamic_patch, int)
|
||||
|
||||
if max_dynamic_patch is None:
|
||||
max_dynamic_patch = config.max_dynamic_patch
|
||||
assert isinstance(max_dynamic_patch, int)
|
||||
|
||||
if dynamic_image_size is None:
|
||||
dynamic_image_size = config.dynamic_image_size
|
||||
assert isinstance(dynamic_image_size, bool)
|
||||
|
||||
self.num_image_token = int(
|
||||
(image_size // patch_size) ** 2 * (config.downsample_ratio**2)
|
||||
)
|
||||
self.image_size = image_size
|
||||
self.min_dynamic_patch = min_dynamic_patch
|
||||
self.max_dynamic_patch = max_dynamic_patch
|
||||
self.dynamic_image_size = dynamic_image_size
|
||||
self.use_thumbnail: bool = config.use_thumbnail
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def image_token_id(self) -> int:
|
||||
raise NotImplementedError
|
||||
|
||||
@abstractmethod
|
||||
def get_image_repl(
|
||||
self,
|
||||
feature_size: int,
|
||||
num_patches: int | None,
|
||||
) -> PromptUpdateDetails[str]:
|
||||
raise NotImplementedError
|
||||
self.use_thumbnail = use_thumbnail
|
||||
|
||||
def resolve_min_max_num(
|
||||
self,
|
||||
@@ -291,18 +238,14 @@ class BaseInternVLProcessor(ABC):
|
||||
dynamic_image_size: bool | None = None,
|
||||
use_thumbnail: bool | None = None,
|
||||
) -> tuple[int, int]:
|
||||
min_dynamic_patch = (
|
||||
self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
|
||||
)
|
||||
max_dynamic_patch = (
|
||||
self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
|
||||
)
|
||||
dynamic_image_size = (
|
||||
self.dynamic_image_size
|
||||
if dynamic_image_size is None
|
||||
else dynamic_image_size
|
||||
)
|
||||
use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
|
||||
if min_dynamic_patch is None:
|
||||
min_dynamic_patch = self.min_dynamic_patch
|
||||
if max_dynamic_patch is None:
|
||||
max_dynamic_patch = self.max_dynamic_patch
|
||||
if dynamic_image_size is None:
|
||||
dynamic_image_size = self.dynamic_image_size
|
||||
if use_thumbnail is None:
|
||||
use_thumbnail = self.use_thumbnail
|
||||
|
||||
return resolve_internvl_min_max_num(
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
@@ -311,43 +254,6 @@ class BaseInternVLProcessor(ABC):
|
||||
use_thumbnail=use_thumbnail,
|
||||
)
|
||||
|
||||
def resolve_target_ratios(
|
||||
self,
|
||||
*,
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
use_thumbnail: bool | None = None,
|
||||
) -> list[tuple[int, int]]:
|
||||
min_num, max_num = self.resolve_min_max_num(
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
use_thumbnail=use_thumbnail,
|
||||
)
|
||||
|
||||
return get_internvl_target_ratios(min_num, max_num)
|
||||
|
||||
def get_num_image_tokens(
|
||||
self,
|
||||
*,
|
||||
image_width: int,
|
||||
image_height: int,
|
||||
) -> int:
|
||||
target_ratios = self.resolve_target_ratios(
|
||||
use_thumbnail=False, # Applied in calculate_targets
|
||||
)
|
||||
|
||||
num_patches, _, _ = calculate_internvl_targets(
|
||||
orig_width=image_width,
|
||||
orig_height=image_height,
|
||||
image_size=self.image_size,
|
||||
target_ratios=target_ratios,
|
||||
use_thumbnail=self.use_thumbnail,
|
||||
)
|
||||
|
||||
return num_patches * self.num_image_token
|
||||
|
||||
def _images_to_pixel_values_lst(
|
||||
self,
|
||||
images: list[Image.Image],
|
||||
@@ -355,7 +261,14 @@ class BaseInternVLProcessor(ABC):
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
) -> list[torch.Tensor]:
|
||||
min_num, max_num = self.resolve_min_max_num(
|
||||
if min_dynamic_patch is None:
|
||||
min_dynamic_patch = self.min_dynamic_patch
|
||||
if max_dynamic_patch is None:
|
||||
max_dynamic_patch = self.max_dynamic_patch
|
||||
if dynamic_image_size is None:
|
||||
dynamic_image_size = self.dynamic_image_size
|
||||
|
||||
min_num, max_num = resolve_internvl_min_max_num(
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
@@ -373,49 +286,9 @@ class BaseInternVLProcessor(ABC):
|
||||
for image in images
|
||||
]
|
||||
|
||||
def _preprocess_image(
|
||||
self,
|
||||
text: list[str],
|
||||
images: list[Image.Image],
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
) -> tuple[list[str], dict[str, torch.Tensor]]:
|
||||
if len(images) == 0:
|
||||
image_inputs = {}
|
||||
else:
|
||||
pixel_values_lst = self._images_to_pixel_values_lst(
|
||||
images,
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
)
|
||||
image_inputs = {
|
||||
"pixel_values_flat": torch.cat(pixel_values_lst),
|
||||
"image_num_patches": torch.tensor(
|
||||
[len(item) for item in pixel_values_lst]
|
||||
),
|
||||
}
|
||||
|
||||
for pixel_values in pixel_values_lst:
|
||||
num_patches = pixel_values.shape[0]
|
||||
feature_size = num_patches * self.num_image_token
|
||||
|
||||
image_repl = self.get_image_repl(feature_size, num_patches)
|
||||
text = [t.replace("<image>", image_repl.full, 1) for t in text]
|
||||
return text, image_inputs
|
||||
|
||||
def _make_batch_input(self, input_item: _T | list[_T] | None = None) -> list[_T]:
|
||||
if input_item is None:
|
||||
input_item = []
|
||||
if not isinstance(input_item, list):
|
||||
input_item = [input_item]
|
||||
return input_item
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
text: str | list[str] | None = None,
|
||||
images: Image.Image | list[Image.Image] | None = None,
|
||||
images: Image.Image | list[Image.Image],
|
||||
*,
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
@@ -423,120 +296,173 @@ class BaseInternVLProcessor(ABC):
|
||||
return_tensors: str | TensorType | None = None,
|
||||
**kwargs,
|
||||
) -> BatchFeature:
|
||||
text = self._make_batch_input(text)
|
||||
images = self._make_batch_input(images)
|
||||
images_lst = [images] if not isinstance(images, list) else images
|
||||
|
||||
text, image_inputs = self._preprocess_image(
|
||||
text=text,
|
||||
images=images,
|
||||
pixel_values_lst = self._images_to_pixel_values_lst(
|
||||
images_lst,
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
)
|
||||
|
||||
text_inputs = self.tokenizer(text)
|
||||
|
||||
combined_outputs = {**text_inputs, **image_inputs}
|
||||
|
||||
return BatchFeature(combined_outputs, tensor_type=return_tensors)
|
||||
image_inputs = {
|
||||
"pixel_values_flat": torch.cat(pixel_values_lst),
|
||||
"image_num_patches": torch.tensor([len(item) for item in pixel_values_lst]),
|
||||
}
|
||||
return BatchFeature(image_inputs, tensor_type=return_tensors)
|
||||
|
||||
|
||||
class InternVLProcessor(BaseInternVLProcessor):
|
||||
"""
|
||||
HF Processor for InternVLChatModel with extended video processing logic.
|
||||
|
||||
Code for video processing is adapted from video example:
|
||||
https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers
|
||||
"""
|
||||
|
||||
class InternVLVideoProcessor:
|
||||
def __init__(
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
tokenizer: TokenizerLike,
|
||||
*,
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
video_token: str | None = None,
|
||||
image_size: int,
|
||||
) -> None:
|
||||
super().__init__(
|
||||
config=config,
|
||||
tokenizer=tokenizer,
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
)
|
||||
# add extra video token for video processing
|
||||
self.video_token = video_token
|
||||
|
||||
@property
|
||||
def image_token_id(self) -> int:
|
||||
return self.tokenizer.get_vocab()[IMG_CONTEXT]
|
||||
|
||||
@property
|
||||
def video_token_id(self) -> int | None:
|
||||
if self.video_token is None:
|
||||
return None
|
||||
return self.tokenizer.get_vocab().get(self.video_token, None)
|
||||
|
||||
@property
|
||||
def supports_video(self) -> bool:
|
||||
return self.video_token_id is not None
|
||||
self.image_size = image_size
|
||||
|
||||
def _videos_to_pixel_values_lst(
|
||||
self,
|
||||
videos: list[npt.NDArray],
|
||||
dynamic_image_size: bool | None = None,
|
||||
) -> list[torch.Tensor]:
|
||||
min_num, max_num = self.resolve_min_max_num(
|
||||
min_dynamic_patch=1,
|
||||
max_dynamic_patch=1,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
use_thumbnail=False, # Applied in image_to_pixel_values
|
||||
)
|
||||
|
||||
return [
|
||||
video_to_pixel_values_internvl(
|
||||
video,
|
||||
input_size=self.image_size,
|
||||
min_num=min_num,
|
||||
max_num=max_num,
|
||||
min_num=1,
|
||||
max_num=1,
|
||||
use_thumbnail=False,
|
||||
)
|
||||
for video in videos
|
||||
]
|
||||
|
||||
def _preprocess_video(
|
||||
def __call__(
|
||||
self,
|
||||
text: list[str],
|
||||
videos: list[npt.NDArray],
|
||||
dynamic_image_size: bool | None = None,
|
||||
) -> tuple[list[str], dict[str, Any]]:
|
||||
if len(videos) == 0 or not self.supports_video:
|
||||
return text, {}
|
||||
videos: npt.NDArray | list[npt.NDArray],
|
||||
*,
|
||||
return_tensors: str | TensorType | None = None,
|
||||
**kwargs,
|
||||
) -> BatchFeature:
|
||||
videos_lst = [videos] if not isinstance(videos, list) else videos
|
||||
|
||||
video_token = self.video_token
|
||||
assert video_token is not None
|
||||
pixel_values_lst = self._videos_to_pixel_values_lst(videos_lst)
|
||||
|
||||
pixel_values_lst_video = self._videos_to_pixel_values_lst(
|
||||
videos,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
)
|
||||
video_inputs = {
|
||||
"pixel_values_flat_video": torch.cat(pixel_values_lst_video),
|
||||
"video_num_patches": torch.tensor(
|
||||
[len(item) for item in pixel_values_lst_video]
|
||||
),
|
||||
image_inputs = {
|
||||
"pixel_values_flat_video": torch.cat(pixel_values_lst),
|
||||
"video_num_patches": torch.tensor([len(item) for item in pixel_values_lst]),
|
||||
}
|
||||
return BatchFeature(image_inputs, tensor_type=return_tensors)
|
||||
|
||||
for pixel_values in pixel_values_lst_video:
|
||||
num_patches = pixel_values.shape[0]
|
||||
|
||||
video_repl = self.get_video_repl(
|
||||
self.num_image_token, num_patches, video_token
|
||||
)
|
||||
text = [t.replace("<video>", video_repl.full, 1) for t in text]
|
||||
return text, video_inputs
|
||||
class InternVLProcessor(ProcessorMixin):
|
||||
"""
|
||||
This model doesn't define its own HF processor,
|
||||
so we implement our own one here.
|
||||
|
||||
The code to insert image tokens is based on:
|
||||
https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py#L252
|
||||
|
||||
Code for video processing is adapted from video example:
|
||||
https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers
|
||||
"""
|
||||
|
||||
attributes = ["image_processor", "tokenizer", "video_processor"]
|
||||
|
||||
def __init__(
    self,
    image_processor: InternVLImageProcessor,
    tokenizer: HfTokenizer,
    video_processor: InternVLVideoProcessor | None = None,
    *,
    image_seq_length: int,
    start_image_token: str = "<img>",
    end_image_token: str = "</img>",
    ctx_image_token: str = "<IMG_CONTEXT>",
    ctx_video_token: str | None = None,
) -> None:
    """
    Store the sub-processors and special tokens, and resolve token IDs.

    Each token string is kept as given and also converted to an ID with
    `tokenizer.convert_tokens_to_ids`. `ctx_video_token_id` is None when
    no `ctx_video_token` is supplied.
    """
    self.image_processor = image_processor
    self.tokenizer = tokenizer
    self.video_processor = video_processor

    self.image_seq_length = image_seq_length
    self.start_image_token = start_image_token
    self.end_image_token = end_image_token
    self.ctx_image_token = ctx_image_token
    self.ctx_video_token = ctx_video_token

    to_id = tokenizer.convert_tokens_to_ids
    self.start_image_token_id = to_id(start_image_token)
    self.end_image_token_id = to_id(end_image_token)
    self.ctx_image_token_id = to_id(ctx_image_token)

    # The video context token is optional; only look it up when present.
    if ctx_video_token is None:
        self.ctx_video_token_id = None
    else:
        self.ctx_video_token_id = to_id(ctx_video_token)
|
||||
|
||||
def resolve_target_ratios(
    self,
    *,
    min_dynamic_patch: int | None = None,
    max_dynamic_patch: int | None = None,
    dynamic_image_size: bool | None = None,
    use_thumbnail: bool | None = None,
) -> list[tuple[int, int]]:
    """
    Return the InternVL target ratios for the given settings.

    The arguments are passed to the image processor's
    `resolve_min_max_num`; the resulting bounds are then handed to
    `get_internvl_target_ratios`.
    """
    resolve_bounds = self.image_processor.resolve_min_max_num
    lower, upper = resolve_bounds(
        min_dynamic_patch=min_dynamic_patch,
        max_dynamic_patch=max_dynamic_patch,
        dynamic_image_size=dynamic_image_size,
        use_thumbnail=use_thumbnail,
    )

    return get_internvl_target_ratios(lower, upper)
|
||||
|
||||
def get_num_image_tokens(
    self,
    *,
    image_width: int,
    image_height: int,
) -> int:
    """
    Return the number of image tokens for an image of the given size.

    The patch count from `calculate_internvl_targets` is multiplied by
    `image_seq_length`.
    """
    processor = self.image_processor

    # The thumbnail is applied in calculate_targets, so it is switched
    # off while resolving the candidate ratios.
    candidate_ratios = self.resolve_target_ratios(use_thumbnail=False)

    targets = calculate_internvl_targets(
        orig_width=image_width,
        orig_height=image_height,
        image_size=processor.image_size,
        target_ratios=candidate_ratios,
        use_thumbnail=processor.use_thumbnail,
    )
    patch_count, _, _ = targets

    return patch_count * self.image_seq_length
|
||||
|
||||
def get_image_repl(
    self,
    num_patches: int | None,
    num_features: int | None = None,
) -> PromptUpdateDetails[str]:
    """
    Build the prompt replacement text for one image.

    When `num_patches` is given, the feature count is
    `num_patches * image_seq_length` and any `num_features` argument is
    ignored. When `num_patches` is None, `num_features` must be supplied.
    The text is the context token repeated that many times, wrapped in
    the start and end image tokens.
    """
    if num_patches is not None:
        num_features = num_patches * self.image_seq_length
    else:
        assert num_features is not None

    context_run = self.ctx_image_token * num_features
    wrapped = self.start_image_token + context_run + self.end_image_token

    return PromptUpdateDetails.select_text(wrapped, self.ctx_image_token)
|
||||
|
||||
def get_video_repl(self, num_patches: int) -> PromptUpdateDetails[str]:
    """
    Build the prompt replacement text for one video.

    Emits one "Frame{k}: " segment per patch (k starting at 1). Each
    segment holds the video context token repeated `image_seq_length`
    times, wrapped in the start and end image tokens. Requires
    `ctx_video_token` to be set.
    """
    assert self.ctx_video_token is not None

    context_run = self.ctx_video_token * self.image_seq_length
    framed_run = self.start_image_token + context_run + self.end_image_token

    # num_patches is equal to num_frames
    segments = [f"Frame{idx + 1}: {framed_run}" for idx in range(num_patches)]
    full_text = "".join(segments)

    return PromptUpdateDetails.select_text(full_text, self.ctx_video_token)
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
@@ -550,54 +476,88 @@ class InternVLProcessor(BaseInternVLProcessor):
|
||||
return_tensors: str | TensorType | None = None,
|
||||
**kwargs,
|
||||
) -> BatchFeature:
|
||||
text = self._make_batch_input(text)
|
||||
images = self._make_batch_input(images)
|
||||
videos = self._make_batch_input(videos)
|
||||
if images is not None:
|
||||
image_inputs = self.image_processor(
|
||||
images=images,
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
return_tensors=return_tensors,
|
||||
)
|
||||
image_num_patches = image_inputs["image_num_patches"]
|
||||
else:
|
||||
image_inputs = {}
|
||||
image_num_patches = []
|
||||
|
||||
text, image_inputs = self._preprocess_image(
|
||||
text=text,
|
||||
images=images,
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
)
|
||||
if videos is not None:
|
||||
if self.video_processor is None:
|
||||
raise ValueError("This model does not support video inputs")
|
||||
|
||||
text, video_inputs = self._preprocess_video(
|
||||
text=text,
|
||||
videos=videos,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
)
|
||||
video_inputs = self.video_processor(
|
||||
videos=videos,
|
||||
return_tensors=return_tensors,
|
||||
)
|
||||
video_num_patches = video_inputs["video_num_patches"]
|
||||
else:
|
||||
video_inputs = {}
|
||||
video_num_patches = []
|
||||
|
||||
text_inputs = self.tokenizer(text)
|
||||
if text is not None:
|
||||
if not isinstance(text, list):
|
||||
text = [text]
|
||||
|
||||
if image_inputs:
|
||||
image_token = "<image>"
|
||||
image_index = 0
|
||||
processed_text = list[str]()
|
||||
replace_strings = list[str]()
|
||||
|
||||
for prompt in text:
|
||||
new_prompt = prompt
|
||||
|
||||
while image_token in new_prompt:
|
||||
new_prompt = new_prompt.replace(image_token, "<placeholder>", 1)
|
||||
image_repl = self.get_image_repl(image_num_patches[image_index])
|
||||
replace_strings.append(image_repl.full)
|
||||
image_index += 1
|
||||
|
||||
while "<placeholder>" in new_prompt:
|
||||
replace_str = replace_strings.pop(0)
|
||||
new_prompt = new_prompt.replace("<placeholder>", replace_str, 1)
|
||||
|
||||
processed_text.append(new_prompt)
|
||||
|
||||
text = processed_text
|
||||
|
||||
if video_inputs:
|
||||
video_token = "<video>"
|
||||
video_index = 0
|
||||
processed_text = list[str]()
|
||||
replace_strings = list[str]()
|
||||
|
||||
assert video_token is not None
|
||||
|
||||
for prompt in text:
|
||||
new_prompt = prompt
|
||||
|
||||
while video_token in new_prompt:
|
||||
new_prompt = new_prompt.replace(video_token, "<placeholder>", 1)
|
||||
video_repl = self.get_video_repl(video_num_patches[video_index])
|
||||
replace_strings.append(video_repl.full)
|
||||
video_index += 1
|
||||
|
||||
while "<placeholder>" in new_prompt:
|
||||
replace_str = replace_strings.pop(0)
|
||||
new_prompt = new_prompt.replace("<placeholder>", replace_str, 1)
|
||||
|
||||
processed_text.append(new_prompt)
|
||||
|
||||
text = processed_text
|
||||
|
||||
text_inputs = self.tokenizer(text, return_tensors=return_tensors)
|
||||
else:
|
||||
text_inputs = {}
|
||||
|
||||
combined_outputs = {**text_inputs, **image_inputs, **video_inputs}
|
||||
|
||||
return BatchFeature(combined_outputs, tensor_type=return_tensors)
|
||||
|
||||
def get_image_repl(
|
||||
self,
|
||||
feature_size: int,
|
||||
num_patches: int | None,
|
||||
) -> PromptUpdateDetails[str]:
|
||||
repl_features = IMG_CONTEXT * feature_size
|
||||
repl_full = IMG_START + repl_features + IMG_END
|
||||
|
||||
return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
|
||||
|
||||
def get_video_repl(
|
||||
self,
|
||||
feature_size: int,
|
||||
num_patches: int | None,
|
||||
video_context_token: str = IMG_CONTEXT,
|
||||
) -> PromptUpdateDetails[str]:
|
||||
if num_patches is None:
|
||||
raise NotImplementedError("Embedding inputs are not supported")
|
||||
|
||||
repl_features = video_context_token * self.num_image_token
|
||||
repl_features_with_sep = IMG_START + repl_features + IMG_END
|
||||
# num_patches is equal to num_frames
|
||||
repl_full = "".join(
|
||||
[f"Frame{i + 1}: {repl_features_with_sep}" for i in range(num_patches)]
|
||||
)
|
||||
|
||||
return PromptUpdateDetails.select_text(repl_full, video_context_token)
|
||||
|
||||
@@ -25,7 +25,7 @@ from vllm.model_executor.models.parakeet import ParakeetExtractor
|
||||
from vllm.multimodal.evs import compute_retained_tokens_count
|
||||
from vllm.multimodal.inputs import AudioItem
|
||||
from vllm.multimodal.processing.processor import PromptUpdateDetails, _seq2tokens
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
from vllm.tokenizers.hf import HfTokenizer
|
||||
|
||||
from .internvl import calculate_internvl_targets, get_internvl_target_ratios
|
||||
|
||||
@@ -508,7 +508,7 @@ class BaseNanoNemotronVLProcessor(ABC):
|
||||
def __init__(
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
tokenizer: TokenizerLike,
|
||||
tokenizer: HfTokenizer,
|
||||
*args,
|
||||
max_model_len: int,
|
||||
max_num_tiles: int | None = None,
|
||||
@@ -689,7 +689,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
|
||||
def __init__(
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
tokenizer: TokenizerLike,
|
||||
tokenizer: HfTokenizer,
|
||||
*,
|
||||
max_model_len: int,
|
||||
max_num_tiles: int | None = None,
|
||||
@@ -961,7 +961,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
|
||||
tokens_per_frame: list[int],
|
||||
frames_indices: list[int],
|
||||
frame_duration_ms: int,
|
||||
tokenizer: TokenizerLike,
|
||||
tokenizer: HfTokenizer,
|
||||
img_start_token_ids: list[int],
|
||||
img_end_token_ids: list[int],
|
||||
img_context_token_ids: list[int],
|
||||
@@ -986,7 +986,7 @@ class NanoNemotronVLProcessor(BaseNanoNemotronVLProcessor):
|
||||
tokens_per_frame (list[int]): number of tokens per frame
|
||||
frames_indices (list[int]): frame indices
|
||||
frame_duration_ms (int): duration of each frame in milliseconds
|
||||
tokenizer (TokenizerLike): tokenizer to use for tokenizing frame separators
|
||||
tokenizer (HfTokenizer): tokenizer to use for tokenizing frame separators
|
||||
img_start_token_ids (list[int]): pre-tokenized IMG_START tokens
|
||||
img_end_token_ids (list[int]): pre-tokenized IMG_END tokens
|
||||
img_context_token_ids (list[int]): pre-tokenized IMG_CONTEXT tokens
|
||||
|
||||
@@ -1,18 +1,14 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from abc import ABC
|
||||
|
||||
import torch
|
||||
import torchvision.transforms as T
|
||||
from PIL import Image
|
||||
from transformers import PretrainedConfig
|
||||
from transformers.image_processing_utils_fast import BaseImageProcessorFast
|
||||
|
||||
from vllm.multimodal.image import convert_image_mode
|
||||
from vllm.multimodal.processing import PromptUpdateDetails
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
from vllm.tokenizers.hf import HfTokenizer
|
||||
|
||||
from .internvl import InternVLProcessor
|
||||
from .internvl import InternVLImageProcessor, InternVLProcessor
|
||||
|
||||
# Configure PIL to handle large images without warnings
|
||||
# This prevents DecompressionBombWarning for legitimate large images
|
||||
@@ -172,80 +168,7 @@ def image_to_pixel_values_nemotron_vl(
|
||||
return pixel_values
|
||||
|
||||
|
||||
class NemotronVLProcessor(InternVLProcessor):
|
||||
IMG_START = "<img>"
|
||||
IMG_END = "</img>"
|
||||
IMG_CONTEXT = "<image>"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
tokenizer: TokenizerLike,
|
||||
image_processor: BaseImageProcessorFast,
|
||||
*,
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
) -> None:
|
||||
ABC.__init__(self)
|
||||
self.config = config
|
||||
self.tokenizer = tokenizer
|
||||
self.image_processor = image_processor
|
||||
image_size: int = config.force_image_size
|
||||
patch_size: int = config.patch_size
|
||||
|
||||
if min_dynamic_patch is None:
|
||||
min_dynamic_patch = 1
|
||||
assert isinstance(min_dynamic_patch, int)
|
||||
|
||||
if max_dynamic_patch is None:
|
||||
max_dynamic_patch = self.image_processor.max_num_tiles
|
||||
assert isinstance(max_dynamic_patch, int)
|
||||
|
||||
if dynamic_image_size is None:
|
||||
dynamic_image_size = True
|
||||
assert isinstance(dynamic_image_size, bool)
|
||||
|
||||
self.num_image_token = int(
|
||||
(image_size // patch_size) ** 2 * (config.downsample_ratio**2)
|
||||
)
|
||||
self.image_size = image_size
|
||||
self.min_dynamic_patch = min_dynamic_patch
|
||||
self.max_dynamic_patch = max_dynamic_patch
|
||||
self.dynamic_image_size = dynamic_image_size
|
||||
|
||||
if image_processor is not None:
|
||||
self.use_thumbnail = image_processor.use_thumbnail
|
||||
else:
|
||||
self.use_thumbnail = getattr(config, "use_thumbnail", True)
|
||||
|
||||
@property
|
||||
def image_token_id(self) -> int:
|
||||
return self.tokenizer.get_vocab()[self.IMG_CONTEXT]
|
||||
|
||||
def _get_transform(self) -> T.Compose:
|
||||
return build_transform(input_size=self.image_size)
|
||||
|
||||
def get_num_image_tokens(
|
||||
self,
|
||||
*,
|
||||
image_width: int,
|
||||
image_height: int,
|
||||
) -> int:
|
||||
target_ratios = self.resolve_target_ratios(
|
||||
use_thumbnail=False, # Applied in calculate_targets
|
||||
)
|
||||
|
||||
num_patches, _, _ = calculate_nemotron_vl_targets(
|
||||
orig_width=image_width,
|
||||
orig_height=image_height,
|
||||
image_size=self.image_size,
|
||||
target_ratios=target_ratios,
|
||||
use_thumbnail=self.use_thumbnail,
|
||||
)
|
||||
|
||||
return num_patches * self.num_image_token
|
||||
|
||||
class LlamaNemotronNanoVLImageProcessor(InternVLImageProcessor):
|
||||
def _images_to_pixel_values_lst(
|
||||
self,
|
||||
images: list[Image.Image],
|
||||
@@ -267,62 +190,60 @@ class NemotronVLProcessor(InternVLProcessor):
|
||||
min_num=min_num,
|
||||
max_num=max_num,
|
||||
use_thumbnail=self.use_thumbnail,
|
||||
transform=self._get_transform(),
|
||||
transform=build_transform(self.image_size),
|
||||
)
|
||||
for image in images
|
||||
]
|
||||
|
||||
def _replace_image_tokens(
|
||||
|
||||
class LlamaNemotronNanoVLProcessor(InternVLProcessor):
|
||||
"""
|
||||
This model doesn't define its own HF processor,
|
||||
so we implement our own one here.
|
||||
|
||||
The image processor is given by:
|
||||
https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1/blob/main/image_processing.py
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
text: list[str],
|
||||
pixel_values_lst: list[torch.Tensor],
|
||||
) -> list[str]:
|
||||
"""Replace <image> placeholders with image tokens."""
|
||||
for pixel_values in pixel_values_lst:
|
||||
num_patches = pixel_values.shape[0]
|
||||
feature_size = num_patches * self.num_image_token
|
||||
image_repl = self.get_image_repl(feature_size, num_patches)
|
||||
# Use temporary placeholder to avoid replacing tokens we just inserted
|
||||
NVL_IMAGE_CONTEXT = image_repl.full.replace("<image>", "<NVL_IMG_CONTEXT>")
|
||||
text = [t.replace("<image>", NVL_IMAGE_CONTEXT, 1) for t in text]
|
||||
return [t.replace("<NVL_IMG_CONTEXT>", self.IMG_CONTEXT) for t in text]
|
||||
image_processor: LlamaNemotronNanoVLImageProcessor,
|
||||
tokenizer: HfTokenizer,
|
||||
*,
|
||||
image_seq_length: int,
|
||||
start_image_token: str = "<img>",
|
||||
end_image_token: str = "</img>",
|
||||
ctx_image_token: str = "<image>",
|
||||
) -> None:
|
||||
super().__init__(
|
||||
image_processor=image_processor,
|
||||
tokenizer=tokenizer,
|
||||
image_seq_length=image_seq_length,
|
||||
start_image_token=start_image_token,
|
||||
end_image_token=end_image_token,
|
||||
ctx_image_token=ctx_image_token,
|
||||
)
|
||||
|
||||
def _preprocess_image(
|
||||
def get_num_image_tokens(
|
||||
self,
|
||||
text: list[str],
|
||||
images: list[Image.Image],
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
) -> tuple[list[str], dict[str, torch.Tensor]]:
|
||||
if len(images) == 0:
|
||||
image_inputs = {}
|
||||
else:
|
||||
pixel_values_lst = self._images_to_pixel_values_lst(
|
||||
images,
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
)
|
||||
image_inputs = {
|
||||
"pixel_values_flat": torch.cat(pixel_values_lst),
|
||||
"image_num_patches": torch.tensor(
|
||||
[len(item) for item in pixel_values_lst]
|
||||
),
|
||||
}
|
||||
*,
|
||||
image_width: int,
|
||||
image_height: int,
|
||||
) -> int:
|
||||
image_processor = self.image_processor
|
||||
target_ratios = self.resolve_target_ratios(
|
||||
use_thumbnail=False, # Applied in calculate_targets
|
||||
)
|
||||
|
||||
text = self._replace_image_tokens(text, pixel_values_lst)
|
||||
return text, image_inputs
|
||||
num_patches, _, _ = calculate_nemotron_vl_targets(
|
||||
orig_width=image_width,
|
||||
orig_height=image_height,
|
||||
image_size=image_processor.image_size,
|
||||
target_ratios=target_ratios,
|
||||
use_thumbnail=image_processor.use_thumbnail,
|
||||
)
|
||||
|
||||
def get_image_repl(
|
||||
self,
|
||||
feature_size: int,
|
||||
num_patches: int | None,
|
||||
) -> PromptUpdateDetails[str]:
|
||||
repl_features = self.IMG_CONTEXT * feature_size
|
||||
repl_full = self.IMG_START + repl_features + self.IMG_END
|
||||
|
||||
return PromptUpdateDetails.select_text(repl_full, self.IMG_CONTEXT)
|
||||
return num_patches * self.image_seq_length
|
||||
|
||||
|
||||
# SigLIP normalization constants
|
||||
@@ -343,7 +264,35 @@ def build_siglip_transform(input_size: int):
|
||||
)
|
||||
|
||||
|
||||
class LlamaNemotronVLEmbedProcessor(NemotronVLProcessor):
|
||||
class LlamaNemotronVLEmbedImageProcessor(InternVLImageProcessor):
|
||||
def _images_to_pixel_values_lst(
|
||||
self,
|
||||
images: list[Image.Image],
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
) -> list[torch.Tensor]:
|
||||
min_num, max_num = self.resolve_min_max_num(
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
use_thumbnail=False, # Applied in image_to_pixel_values
|
||||
)
|
||||
|
||||
return [
|
||||
image_to_pixel_values_nemotron_vl(
|
||||
image,
|
||||
input_size=self.image_size,
|
||||
min_num=min_num,
|
||||
max_num=max_num,
|
||||
use_thumbnail=self.use_thumbnail,
|
||||
transform=build_siglip_transform(self.image_size),
|
||||
)
|
||||
for image in images
|
||||
]
|
||||
|
||||
|
||||
class LlamaNemotronVLEmbedProcessor(InternVLProcessor):
|
||||
"""
|
||||
Processor for LlamaNemotronVL embedding model.
|
||||
|
||||
@@ -352,59 +301,44 @@ class LlamaNemotronVLEmbedProcessor(NemotronVLProcessor):
|
||||
- Uses different image context token (<IMG_CONTEXT> vs <image>)
|
||||
"""
|
||||
|
||||
IMG_CONTEXT = "<IMG_CONTEXT>"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
tokenizer: TokenizerLike,
|
||||
processor_config: dict,
|
||||
image_processor: LlamaNemotronVLEmbedImageProcessor,
|
||||
tokenizer: HfTokenizer,
|
||||
*,
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
image_seq_length: int,
|
||||
start_image_token: str = "<img>",
|
||||
end_image_token: str = "</img>",
|
||||
ctx_image_token: str = "<IMG_CONTEXT>",
|
||||
) -> None:
|
||||
if min_dynamic_patch is None:
|
||||
min_dynamic_patch = processor_config.get(
|
||||
"min_input_tiles",
|
||||
getattr(config, "min_dynamic_patch", 1),
|
||||
)
|
||||
if max_dynamic_patch is None:
|
||||
max_dynamic_patch = processor_config.get(
|
||||
"max_input_tiles",
|
||||
getattr(config, "max_dynamic_patch", 1),
|
||||
)
|
||||
if dynamic_image_size is None:
|
||||
dynamic_image_size = processor_config.get(
|
||||
"dynamic_image_size",
|
||||
getattr(config, "dynamic_image_size", True),
|
||||
)
|
||||
super().__init__(
|
||||
config=config,
|
||||
image_processor=image_processor,
|
||||
tokenizer=tokenizer,
|
||||
image_processor=None,
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
image_seq_length=image_seq_length,
|
||||
start_image_token=start_image_token,
|
||||
end_image_token=end_image_token,
|
||||
ctx_image_token=ctx_image_token,
|
||||
)
|
||||
|
||||
def _get_transform(self) -> T.Compose:
|
||||
"""Override to add SigLIP normalization."""
|
||||
return build_siglip_transform(input_size=self.image_size)
|
||||
self.image_processor: LlamaNemotronVLEmbedImageProcessor
|
||||
|
||||
def _replace_image_tokens(
|
||||
def get_num_image_tokens(
|
||||
self,
|
||||
text: list[str],
|
||||
pixel_values_lst: list[torch.Tensor],
|
||||
) -> list[str]:
|
||||
"""Override with simpler token replacement for embedding model.
|
||||
*,
|
||||
image_width: int,
|
||||
image_height: int,
|
||||
) -> int:
|
||||
image_processor = self.image_processor
|
||||
target_ratios = self.resolve_target_ratios(
|
||||
use_thumbnail=False, # Applied in calculate_targets
|
||||
)
|
||||
|
||||
No temporary placeholder needed because IMG_CONTEXT is <IMG_CONTEXT>,
|
||||
not <image>, so there's no collision risk.
|
||||
"""
|
||||
for pixel_values in pixel_values_lst:
|
||||
num_patches = pixel_values.shape[0]
|
||||
feature_size = num_patches * self.num_image_token
|
||||
image_repl = self.get_image_repl(feature_size, num_patches)
|
||||
text = [t.replace("<image>", image_repl.full, 1) for t in text]
|
||||
return text
|
||||
num_patches, _, _ = calculate_nemotron_vl_targets(
|
||||
orig_width=image_width,
|
||||
orig_height=image_height,
|
||||
image_size=image_processor.image_size,
|
||||
target_ratios=target_ratios,
|
||||
use_thumbnail=image_processor.use_thumbnail,
|
||||
)
|
||||
|
||||
return num_patches * self.image_seq_length
|
||||
|
||||
@@ -8,37 +8,54 @@
|
||||
# Licensed under Apache 2.0 License [see LICENSE for details]
|
||||
# --------------------------------------------------------
|
||||
from vllm.multimodal.processing import PromptUpdateDetails
|
||||
from vllm.tokenizers.hf import HfTokenizer
|
||||
|
||||
from .internvl import BaseInternVLProcessor
|
||||
|
||||
IMG_PAD = "<|vision_pad|>"
|
||||
from .internvl import InternVLImageProcessor, InternVLProcessor
|
||||
|
||||
|
||||
class NVLMProcessor(BaseInternVLProcessor):
|
||||
@property
|
||||
def image_token_id(self) -> int:
|
||||
return self.tokenizer.get_vocab()[IMG_PAD]
|
||||
class NVLMProcessor(InternVLProcessor):
|
||||
def __init__(
|
||||
self,
|
||||
image_processor: InternVLImageProcessor,
|
||||
tokenizer: HfTokenizer,
|
||||
*,
|
||||
image_seq_length: int,
|
||||
start_image_token: str = "<Image>",
|
||||
end_image_token: str = "</Image>",
|
||||
ctx_image_token: str = "<|vision_pad|>",
|
||||
) -> None:
|
||||
super().__init__(
|
||||
image_processor=image_processor,
|
||||
tokenizer=tokenizer,
|
||||
image_seq_length=image_seq_length,
|
||||
start_image_token=start_image_token,
|
||||
end_image_token=end_image_token,
|
||||
ctx_image_token=ctx_image_token,
|
||||
)
|
||||
|
||||
def get_image_repl(
|
||||
self,
|
||||
feature_size: int,
|
||||
num_patches: int | None,
|
||||
num_features: int | None = None,
|
||||
) -> PromptUpdateDetails[str]:
|
||||
if num_patches is None:
|
||||
raise NotImplementedError("Embedding inputs are not supported")
|
||||
|
||||
num_features = num_patches * self.image_seq_length
|
||||
|
||||
tile_pos_identifiers = [f"<tile_{i}>" for i in range(1, num_patches)]
|
||||
if self.use_thumbnail:
|
||||
if self.image_processor.use_thumbnail:
|
||||
tile_pos_identifiers += ["<tile_global_thumbnail>"]
|
||||
|
||||
context_size = feature_size // num_patches
|
||||
context_size = num_features // num_patches
|
||||
features = "".join(
|
||||
identifier + IMG_PAD * context_size for identifier in tile_pos_identifiers
|
||||
(identifier + self.ctx_image_token * context_size)
|
||||
for identifier in tile_pos_identifiers
|
||||
)
|
||||
|
||||
# We include the start and end as well because "<Image><tile" is
|
||||
# tokenized as ["<Image", "><", "tile"], resulting in assertion error
|
||||
# when trying to find "<tile" as a subsequence of "<Image><tile"
|
||||
repl = "<Image>" + features + "</Image>"
|
||||
repl = self.start_image_token + features + self.end_image_token
|
||||
|
||||
return PromptUpdateDetails.select_text(repl, IMG_PAD)
|
||||
return PromptUpdateDetails.select_text(repl, self.ctx_image_token)
|
||||
|
||||
@@ -1,389 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py
|
||||
# --------------------------------------------------------
|
||||
# SkyworkR1V
|
||||
# Copyright (c) 2025 Skywork
|
||||
# Licensed under The MIT License [see LICENSE for details]
|
||||
# --------------------------------------------------------
|
||||
|
||||
import torch
|
||||
import torchvision.transforms as T
|
||||
from PIL import Image
|
||||
from transformers import BatchFeature, PretrainedConfig, TensorType
|
||||
|
||||
from vllm.multimodal.image import convert_image_mode
|
||||
from vllm.multimodal.processing import PromptUpdateDetails
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
|
||||
IMG_START = "<img>"
|
||||
IMG_END = "</img>"
|
||||
IMG_CONTEXT = "<IMG_CONTEXT>"
|
||||
|
||||
IMAGENET_MEAN = (0.485, 0.456, 0.406)
|
||||
IMAGENET_STD = (0.229, 0.224, 0.225)
|
||||
|
||||
|
||||
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
|
||||
def build_transform(input_size: int):
|
||||
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
|
||||
return T.Compose(
|
||||
[
|
||||
T.Lambda(lambda img: convert_image_mode(img, "RGB")),
|
||||
T.Resize(
|
||||
(input_size, input_size), interpolation=T.InterpolationMode.BICUBIC
|
||||
),
|
||||
T.ToTensor(),
|
||||
T.Normalize(mean=MEAN, std=STD),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B/
|
||||
def find_closest_aspect_ratio(
|
||||
aspect_ratio: float,
|
||||
target_ratios: list[tuple[int, int]],
|
||||
*,
|
||||
width: int,
|
||||
height: int,
|
||||
image_size: int,
|
||||
) -> tuple[int, int]:
|
||||
best_ratio_diff = float("inf")
|
||||
best_ratio = (1, 1)
|
||||
area = width * height
|
||||
for ratio in target_ratios:
|
||||
target_aspect_ratio = ratio[0] / ratio[1]
|
||||
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
|
||||
if ratio_diff < best_ratio_diff:
|
||||
best_ratio_diff = ratio_diff
|
||||
best_ratio = ratio
|
||||
elif ratio_diff == best_ratio_diff:
|
||||
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
|
||||
best_ratio = ratio
|
||||
return best_ratio
|
||||
|
||||
|
||||
def resolve_skyworkr1v_min_max_num(
|
||||
*,
|
||||
min_dynamic_patch: int,
|
||||
max_dynamic_patch: int,
|
||||
dynamic_image_size: bool,
|
||||
use_thumbnail: bool,
|
||||
) -> tuple[int, int]:
|
||||
min_dynamic_patch = min_dynamic_patch if dynamic_image_size else 1
|
||||
max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1
|
||||
|
||||
if use_thumbnail and max_dynamic_patch != 1:
|
||||
max_dynamic_patch += 1
|
||||
|
||||
return min_dynamic_patch, max_dynamic_patch
|
||||
|
||||
|
||||
def get_skyworkr1v_target_ratios(
|
||||
min_num: int,
|
||||
max_num: int,
|
||||
) -> list[tuple[int, int]]:
|
||||
target_ratios = {
|
||||
(i, j)
|
||||
for n in range(min_num, max_num + 1)
|
||||
for i in range(1, n + 1)
|
||||
for j in range(1, n + 1)
|
||||
if min_num <= i * j <= max_num
|
||||
}
|
||||
return sorted(target_ratios, key=lambda x: x[0] * x[1])
|
||||
|
||||
|
||||
def calculate_skyworkr1v_targets(
|
||||
*,
|
||||
orig_width: int,
|
||||
orig_height: int,
|
||||
target_ratios: list[tuple[int, int]],
|
||||
image_size: int,
|
||||
use_thumbnail: bool,
|
||||
) -> tuple[int, int, int]:
|
||||
aspect_ratio = orig_width / orig_height
|
||||
|
||||
# find the closest aspect ratio to the target
|
||||
target_aspect_ratio = find_closest_aspect_ratio(
|
||||
aspect_ratio,
|
||||
target_ratios,
|
||||
width=orig_width,
|
||||
height=orig_height,
|
||||
image_size=image_size,
|
||||
)
|
||||
|
||||
# calculate the target width and height
|
||||
target_width = image_size * target_aspect_ratio[0]
|
||||
target_height = image_size * target_aspect_ratio[1]
|
||||
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
|
||||
|
||||
# add thumbnail image if num_blocks != 1
|
||||
if use_thumbnail and blocks != 1:
|
||||
blocks += 1
|
||||
|
||||
return blocks, target_width, target_height
|
||||
|
||||
|
||||
def dynamic_preprocess_skyworkr1v(
|
||||
image: Image.Image,
|
||||
*,
|
||||
target_ratios: list[tuple[int, int]],
|
||||
image_size: int,
|
||||
use_thumbnail: bool,
|
||||
) -> list[Image.Image]:
|
||||
orig_width, orig_height = image.size
|
||||
|
||||
# calculate the number of blocks without thumbnail
|
||||
blocks, target_width, target_height = calculate_skyworkr1v_targets(
|
||||
orig_width=orig_width,
|
||||
orig_height=orig_height,
|
||||
target_ratios=target_ratios,
|
||||
image_size=image_size,
|
||||
use_thumbnail=False,
|
||||
)
|
||||
|
||||
# resize the image
|
||||
resized_img = image.resize((target_width, target_height))
|
||||
processed_images = []
|
||||
for i in range(blocks):
|
||||
box = (
|
||||
(i % (target_width // image_size)) * image_size,
|
||||
(i // (target_width // image_size)) * image_size,
|
||||
((i % (target_width // image_size)) + 1) * image_size,
|
||||
((i // (target_width // image_size)) + 1) * image_size,
|
||||
)
|
||||
# split the image
|
||||
split_img = resized_img.crop(box)
|
||||
processed_images.append(split_img)
|
||||
|
||||
assert len(processed_images) == blocks
|
||||
|
||||
if use_thumbnail and len(processed_images) != 1:
|
||||
thumbnail_img = image.resize((image_size, image_size))
|
||||
processed_images.append(thumbnail_img)
|
||||
|
||||
return processed_images
|
||||
|
||||
|
||||
# adapted from https://huggingface.co/Skywork/Skywork-R1V-38B
|
||||
def image_to_pixel_values_skyworkr1v(
|
||||
image: Image.Image,
|
||||
*,
|
||||
input_size: int,
|
||||
min_num: int,
|
||||
max_num: int,
|
||||
use_thumbnail: bool,
|
||||
) -> torch.Tensor:
|
||||
target_ratios = get_skyworkr1v_target_ratios(min_num, max_num)
|
||||
|
||||
transform = build_transform(input_size=input_size)
|
||||
images = dynamic_preprocess_skyworkr1v(
|
||||
image,
|
||||
target_ratios=target_ratios,
|
||||
image_size=input_size,
|
||||
use_thumbnail=use_thumbnail,
|
||||
)
|
||||
|
||||
pixel_values = torch.stack([transform(image) for image in images])
|
||||
return pixel_values
|
||||
|
||||
|
||||
class SkyworkR1VProcessor:
|
||||
"""
|
||||
This model doesn't define its own HF processor,
|
||||
so we implement our own one here.
|
||||
|
||||
The code to insert image tokens is based on:
|
||||
https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/modeling_skywork_chat.py#L252
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
tokenizer: TokenizerLike,
|
||||
*,
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.config = config
|
||||
self.tokenizer = tokenizer
|
||||
|
||||
image_size: int = config.vision_config.image_size
|
||||
patch_size: int = config.vision_config.patch_size
|
||||
|
||||
if min_dynamic_patch is None:
|
||||
min_dynamic_patch = config.min_dynamic_patch
|
||||
assert isinstance(min_dynamic_patch, int)
|
||||
|
||||
if max_dynamic_patch is None:
|
||||
max_dynamic_patch = config.max_dynamic_patch
|
||||
assert isinstance(max_dynamic_patch, int)
|
||||
|
||||
if dynamic_image_size is None:
|
||||
dynamic_image_size = config.dynamic_image_size
|
||||
assert isinstance(dynamic_image_size, bool)
|
||||
|
||||
self.num_image_token = int(
|
||||
(image_size // patch_size) ** 2 * (config.downsample_ratio**2)
|
||||
)
|
||||
self.image_size = image_size
|
||||
self.min_dynamic_patch = min_dynamic_patch
|
||||
self.max_dynamic_patch = max_dynamic_patch
|
||||
self.dynamic_image_size = dynamic_image_size
|
||||
self.use_thumbnail: bool = config.use_thumbnail
|
||||
|
||||
@property
|
||||
def image_token_id(self) -> int:
|
||||
return self.tokenizer.get_vocab()[IMG_CONTEXT]
|
||||
|
||||
def get_image_repl(
|
||||
self,
|
||||
feature_size: int,
|
||||
num_patches: int | None,
|
||||
) -> PromptUpdateDetails[str]:
|
||||
repl_features = IMG_CONTEXT * feature_size
|
||||
repl_full = IMG_START + repl_features + IMG_END
|
||||
|
||||
return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
|
||||
|
||||
def resolve_min_max_num(
|
||||
self,
|
||||
*,
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
use_thumbnail: bool | None = None,
|
||||
) -> tuple[int, int]:
|
||||
min_dynamic_patch = (
|
||||
self.min_dynamic_patch if min_dynamic_patch is None else min_dynamic_patch
|
||||
)
|
||||
max_dynamic_patch = (
|
||||
self.max_dynamic_patch if max_dynamic_patch is None else max_dynamic_patch
|
||||
)
|
||||
dynamic_image_size = (
|
||||
self.dynamic_image_size
|
||||
if dynamic_image_size is None
|
||||
else dynamic_image_size
|
||||
)
|
||||
use_thumbnail = self.use_thumbnail if use_thumbnail is None else use_thumbnail
|
||||
|
||||
return resolve_skyworkr1v_min_max_num(
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
use_thumbnail=use_thumbnail,
|
||||
)
|
||||
|
||||
def resolve_target_ratios(
|
||||
self,
|
||||
*,
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
use_thumbnail: bool | None = None,
|
||||
) -> list[tuple[int, int]]:
|
||||
min_num, max_num = self.resolve_min_max_num(
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
use_thumbnail=use_thumbnail,
|
||||
)
|
||||
|
||||
return get_skyworkr1v_target_ratios(min_num, max_num)
|
||||
|
||||
def get_num_image_tokens(
|
||||
self,
|
||||
*,
|
||||
image_width: int,
|
||||
image_height: int,
|
||||
) -> int:
|
||||
target_ratios = self.resolve_target_ratios(
|
||||
use_thumbnail=False, # Applied in calculate_targets
|
||||
)
|
||||
|
||||
num_patches, _, _ = calculate_skyworkr1v_targets(
|
||||
orig_width=image_width,
|
||||
orig_height=image_height,
|
||||
image_size=self.image_size,
|
||||
target_ratios=target_ratios,
|
||||
use_thumbnail=self.use_thumbnail,
|
||||
)
|
||||
|
||||
return num_patches * self.num_image_token
|
||||
|
||||
def _images_to_pixel_values_lst(
|
||||
self,
|
||||
images: list[Image.Image],
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
) -> list[torch.Tensor]:
|
||||
min_num, max_num = self.resolve_min_max_num(
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
use_thumbnail=False, # Applied in image_to_pixel_values
|
||||
)
|
||||
|
||||
return [
|
||||
image_to_pixel_values_skyworkr1v(
|
||||
image,
|
||||
input_size=self.image_size,
|
||||
min_num=min_num,
|
||||
max_num=max_num,
|
||||
use_thumbnail=self.use_thumbnail,
|
||||
)
|
||||
for image in images
|
||||
]
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
text: str | list[str] | None = None,
|
||||
images: Image.Image | list[Image.Image] | None = None,
|
||||
min_dynamic_patch: int | None = None,
|
||||
max_dynamic_patch: int | None = None,
|
||||
dynamic_image_size: bool | None = None,
|
||||
return_tensors: str | TensorType | None = None,
|
||||
) -> BatchFeature:
|
||||
if text is None:
|
||||
text = []
|
||||
if not isinstance(text, list):
|
||||
text = [text]
|
||||
if images is None:
|
||||
images = []
|
||||
if not isinstance(images, list):
|
||||
images = [images]
|
||||
|
||||
if len(images) == 0:
|
||||
image_inputs = {}
|
||||
else:
|
||||
pixel_values_lst = self._images_to_pixel_values_lst(
|
||||
images,
|
||||
min_dynamic_patch=min_dynamic_patch,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
)
|
||||
image_inputs = {
|
||||
"pixel_values_flat": torch.cat(pixel_values_lst),
|
||||
"image_num_patches": torch.tensor(
|
||||
[len(item) for item in pixel_values_lst]
|
||||
),
|
||||
}
|
||||
|
||||
for pixel_values in pixel_values_lst:
|
||||
num_patches = pixel_values.shape[0]
|
||||
feature_size = num_patches * self.num_image_token
|
||||
|
||||
image_repl = self.get_image_repl(feature_size, num_patches)
|
||||
|
||||
text = [t.replace("<image>", image_repl.full, 1) for t in text]
|
||||
|
||||
text_inputs = self.tokenizer(text)
|
||||
|
||||
combined_outputs = {**text_inputs, **image_inputs}
|
||||
|
||||
return BatchFeature(combined_outputs, tensor_type=return_tensors)
|
||||
Reference in New Issue
Block a user