Remove all references to yapf as it's no longer used (#26251)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
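The yapf pragmas deleted in the hunks below are formatter-suppression comments; once the project formats with ruff format alone (as the `# ruff: noqa` pragmas in these files suggest), they have no effect and only add noise. A minimal sketch of the pattern, using a hypothetical mapping name rather than anything taken from the diff:

# Before: yapf had to be told not to re-flow the hand-formatted mapping.
# yapf: disable
_EXAMPLE_TEMPLATE_FALLBACKS = {
    "blip-2": "template_blip2.jinja",
    "clip": "template_basic.jinja",
}
# yapf: enable

# After: the markers are dropped outright. If suppression is ever needed again,
# ruff format honours the black-style "# fmt: off" / "# fmt: on" comments.
_EXAMPLE_TEMPLATE_FALLBACKS = {
    "blip-2": "template_blip2.jinja",
    "clip": "template_basic.jinja",
}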
@@ -28,7 +28,6 @@ def _get_minicpmv_chat_template_fallback(tokenizer_name_or_path: str) -> Optiona
     return CHAT_TEMPLATES_DIR / "template_chatml.jinja"


-# yapf: disable
 _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = {
     "blip-2": CHAT_TEMPLATES_DIR / "template_blip2.jinja",
     "clip": CHAT_TEMPLATES_DIR / "template_basic.jinja",
@@ -39,7 +38,6 @@ _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = {
     "paligemma": CHAT_TEMPLATES_DIR / "template_basic.jinja",
     "qwen": _get_qwen_chat_template_fallback,
 }
-# yapf: enable


 def register_chat_template_fallback_path(
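Beyond deleting the markers, most of the hunks that follow are the mechanical restyling ruff format applies once it is allowed to touch these files: single quotes become double quotes, argument lists gain a trailing comma and are exploded one per line, and backslash-continued assert messages are wrapped in parentheses. A hand-written before/after sketch of those shapes (hypothetical function, not code from this commit, assuming ruff format's black-style defaults):

# Before (yapf-era style)
def load_config(path, *, strict=True,
                encoding='utf-8'):
    assert path.endswith('.json'), \
        f"expect a .json config, got {path!r} which this loader cannot parse"
    return path, strict, encoding


# After (ruff format style)
def load_config(
    path,
    *,
    strict=True,
    encoding="utf-8",
):
    assert path.endswith(".json"), (
        f"expect a .json config, got {path!r} which this loader cannot parse"
    )
    return path, strict, encoding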
@@ -1,12 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-# yapf: disable
 # ruff: noqa: E501
 # coding=utf-8
 # Copied from
 # https://huggingface.co/Snowflake/snowflake-arctic-instruct/blob/main/configuration_arctic.py
-""" Arctic model configuration"""
+"""Arctic model configuration"""

 from dataclasses import asdict, dataclass
 from typing import Any

@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-# yapf: disable
 # ruff: noqa: E501
 # Adapted from
 # https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1/blob/main/configuration.py
@@ -16,7 +15,7 @@ from transformers.dynamic_module_utils import get_class_from_dynamic_module


 class Nemotron_Nano_VL_Config(PretrainedConfig):
-    model_type = 'Llama_Nemotron_Nano_VL'
+    model_type = "Llama_Nemotron_Nano_VL"
     is_composition = True

     def __init__(
@@ -26,17 +25,22 @@ class Nemotron_Nano_VL_Config(PretrainedConfig):
         force_image_size=None,
         downsample_ratio=0.5,
         template=None,
-        ps_version='v1',
+        ps_version="v1",
         image_tag_type="internvl",
         projector_hidden_size=4096,
         vit_hidden_size=1280,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(**kwargs)

         if vision_config is not None:
-            assert "auto_map" in vision_config and "AutoConfig" in vision_config["auto_map"]
-            vision_auto_config = get_class_from_dynamic_module(*vision_config["auto_map"]["AutoConfig"].split("--")[::-1])
+            assert (
+                "auto_map" in vision_config
+                and "AutoConfig" in vision_config["auto_map"]
+            )
+            vision_auto_config = get_class_from_dynamic_module(
+                *vision_config["auto_map"]["AutoConfig"].split("--")[::-1]
+            )
             self.vision_config = vision_auto_config(**vision_config)
         else:
             self.vision_config = PretrainedConfig()
@@ -51,6 +55,6 @@ class Nemotron_Nano_VL_Config(PretrainedConfig):
         self.downsample_ratio = downsample_ratio
         self.template = template # TODO move out of here and into the tokenizer
         self.ps_version = ps_version # Pixel shuffle version
-        self.image_tag_type = image_tag_type # TODO: into the tokenizer too?
+        self.image_tag_type = image_tag_type  # TODO: into the tokenizer too?
         self.projector_hidden_size = projector_hidden_size
         self.vit_hidden_size = vit_hidden_size

@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-# yapf: disable
 # ruff: noqa: E501
 # adapted from https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_aimv2.py
 # and https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_ovis.py
@@ -70,34 +69,37 @@ class AIMv2Config(PretrainedConfig):
 # Visual Tokenizer Configuration
 # ----------------------------------------------------------------------
 class BaseVisualTokenizerConfig(PretrainedConfig):
-
-    def __init__(self,
-                 vocab_size=16384,
-                 tokenize_function="softmax",
-                 tau=1.0,
-                 depths=None,
-                 drop_cls_token=False,
-                 backbone_config: Optional[Union[PretrainedConfig,
-                                                 dict]] = None,
-                 hidden_stride: int = 1,
-                 **kwargs):
+    def __init__(
+        self,
+        vocab_size=16384,
+        tokenize_function="softmax",
+        tau=1.0,
+        depths=None,
+        drop_cls_token=False,
+        backbone_config: Optional[Union[PretrainedConfig, dict]] = None,
+        hidden_stride: int = 1,
+        **kwargs,
+    ):
         super().__init__(**kwargs)
         self.vocab_size = vocab_size
         self.tokenize_function = tokenize_function
         self.tau = tau
         if isinstance(depths, str):
-            depths = [int(x) for x in depths.split('|')]
+            depths = [int(x) for x in depths.split("|")]
         self.depths = depths
         self.backbone_kwargs = dict[str, Any]()
         self.drop_cls_token = drop_cls_token
         if backbone_config is not None:
-            assert isinstance(backbone_config, (PretrainedConfig, dict)), \
+            assert isinstance(backbone_config, (PretrainedConfig, dict)), (
                 f"expect `backbone_config` to be instance of PretrainedConfig or dict, but got {type(backbone_config)} type"
+            )
             if not isinstance(backbone_config, PretrainedConfig):
-                model_type = backbone_config['model_type']
+                model_type = backbone_config["model_type"]
                 if model_type != "aimv2":
-                    backbone_config.pop('model_type')
-                    backbone_config = AutoConfig.for_model(model_type, **backbone_config)
+                    backbone_config.pop("model_type")
+                    backbone_config = AutoConfig.for_model(
+                        model_type, **backbone_config
+                    )
                 else:
                     backbone_config = AIMv2Config(**backbone_config)
             self.backbone_config = backbone_config
@@ -113,7 +115,7 @@ class Aimv2VisualTokenizerConfig(BaseVisualTokenizerConfig):
         self.drop_cls_token = False
         if self.depths:
             assert len(self.depths) == 1
-            self.backbone_kwargs['num_hidden_layers'] = self.depths[0]
+            self.backbone_kwargs["num_hidden_layers"] = self.depths[0]


 class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig):
@@ -125,7 +127,7 @@ class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig):
         self.drop_cls_token = False
         if self.depths:
             assert len(self.depths) == 1
-            self.backbone_kwargs['num_hidden_layers'] = self.depths[0]
+            self.backbone_kwargs["num_hidden_layers"] = self.depths[0]


 AutoConfig.register("siglip_visual_tokenizer", SiglipVisualTokenizerConfig)
@@ -138,35 +140,39 @@ AutoConfig.register("aimv2_visual_tokenizer", Aimv2VisualTokenizerConfig)
 class OvisConfig(PretrainedConfig):
     model_type = "ovis"

-    def __init__(self,
-                 llm_config: Optional[Union[PretrainedConfig, dict]] = None,
-                 visual_tokenizer_config: Optional[Union[PretrainedConfig,
-                                                         dict]] = None,
-                 multimodal_max_length=8192,
-                 hidden_size=None,
-                 conversation_formatter_class=None,
-                 llm_attn_implementation=None,
-                 disable_tie_weight=False,
-                 **kwargs):
+    def __init__(
+        self,
+        llm_config: Optional[Union[PretrainedConfig, dict]] = None,
+        visual_tokenizer_config: Optional[Union[PretrainedConfig, dict]] = None,
+        multimodal_max_length=8192,
+        hidden_size=None,
+        conversation_formatter_class=None,
+        llm_attn_implementation=None,
+        disable_tie_weight=False,
+        **kwargs,
+    ):
         super().__init__(**kwargs)
         if llm_config is not None:
-            assert isinstance(llm_config, (PretrainedConfig, dict)), \
+            assert isinstance(llm_config, (PretrainedConfig, dict)), (
                 f"expect `llm_config` to be instance of PretrainedConfig or dict, but got {type(llm_config)} type"
+            )
             if not isinstance(llm_config, PretrainedConfig):
-                model_type = llm_config['model_type']
-                llm_config.pop('model_type')
+                model_type = llm_config["model_type"]
+                llm_config.pop("model_type")
                 llm_config = AutoConfig.for_model(model_type, **llm_config)

         # map llm_config to text_config
         self.text_config = llm_config
         if visual_tokenizer_config is not None:
-            assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), \
+            assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), (
                 f"expect `visual_tokenizer_config` to be instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)} type"
+            )
             if not isinstance(visual_tokenizer_config, PretrainedConfig):
-                model_type = visual_tokenizer_config['model_type']
-                visual_tokenizer_config.pop('model_type')
+                model_type = visual_tokenizer_config["model_type"]
+                visual_tokenizer_config.pop("model_type")
                 visual_tokenizer_config = AutoConfig.for_model(
-                    model_type, **visual_tokenizer_config)
+                    model_type, **visual_tokenizer_config
+                )

         self.visual_tokenizer_config = visual_tokenizer_config
         self.multimodal_max_length = multimodal_max_length

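The OvisConfig and BaseVisualTokenizerConfig hunks above keep repeating one idiom: a nested config may arrive either as a ready-made PretrainedConfig or as a plain dict, and the dict form is normalised by popping its model_type and handing the rest to AutoConfig.for_model. A condensed sketch of that idiom as a standalone helper (hypothetical function, not part of the commit):

from typing import Optional, Union

from transformers import AutoConfig, PretrainedConfig


def normalize_sub_config(
    sub_config: Optional[Union[PretrainedConfig, dict]],
) -> Optional[PretrainedConfig]:
    # An already-built config object (or None) passes through untouched.
    if sub_config is None or isinstance(sub_config, PretrainedConfig):
        return sub_config
    assert isinstance(sub_config, dict), (
        f"expect PretrainedConfig or dict, got {type(sub_config)}"
    )
    # A plain dict names its own model_type; pop it and let AutoConfig
    # pick the matching config class for the remaining keyword arguments.
    model_type = sub_config.pop("model_type")
    return AutoConfig.for_model(model_type, **sub_config)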
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-# yapf: disable
 # ruff: noqa: E501
 # coding=utf-8
 # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/ff23960c5cf9e6874b44be38af930cfb0ccbb620/deepseek_vl2/models/processing_deepseek_vl_v2.py
@@ -35,11 +34,12 @@ from transformers.processing_utils import ProcessorMixin


 class ImageTransform:
-
-    def __init__(self,
-                 mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
-                 std: tuple[float, float, float] = (0.5, 0.5, 0.5),
-                 normalize: bool = True):
+    def __init__(
+        self,
+        mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
+        std: tuple[float, float, float] = (0.5, 0.5, 0.5),
+        normalize: bool = True,
+    ):
         self.mean = mean
         self.std = std
         self.normalize = normalize
@@ -77,7 +77,6 @@ class DeepseekVLV2Processor(ProcessorMixin):
         ignore_id: int = -100,
         **kwargs,
     ):
-
         self.candidate_resolutions = candidate_resolutions
         self.image_size = candidate_resolutions[0][0]
         self.patch_size = patch_size
@@ -86,13 +85,15 @@ class DeepseekVLV2Processor(ProcessorMixin):
         self.normalize = normalize
         self.downsample_ratio = downsample_ratio

-        self.image_transform = ImageTransform(mean=image_mean, std=image_std, normalize=normalize)
+        self.image_transform = ImageTransform(
+            mean=image_mean, std=image_std, normalize=normalize
+        )
         self.tokenizer = tokenizer
-        self.tokenizer.padding_side = 'left' # must set this,padding side with make a difference in batch inference
+        self.tokenizer.padding_side = "left"  # must set this,padding side with make a difference in batch inference

         # add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id'
         if tokenizer.pad_token is None:
-            self.tokenizer.add_special_tokens({'pad_token': pad_token})
+            self.tokenizer.add_special_tokens({"pad_token": pad_token})

         # add image token
         image_token_id = self.tokenizer.vocab.get(image_token)
@@ -104,7 +105,7 @@ class DeepseekVLV2Processor(ProcessorMixin):

         # add five special tokens for grounding-related tasks
         # <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|>
-        special_tokens = ['<|ref|>', '<|/ref|>', '<|det|>', '<|/det|>', '<|grounding|>']
+        special_tokens = ["<|ref|>", "<|/ref|>", "<|det|>", "<|/det|>", "<|grounding|>"]
         special_tokens_dict = {"additional_special_tokens": special_tokens}
         self.tokenizer.add_special_tokens(special_tokens_dict)

@@ -134,15 +135,19 @@ class DeepseekVLV2Processor(ProcessorMixin):

         for width, height in self.candidate_resolutions:
             scale = min(width / original_width, height / original_height)
-            downscaled_width, downscaled_height = int(
-                original_width * scale), int(original_height * scale)
-            effective_resolution = min(downscaled_width * downscaled_height,
-                                       original_width * original_height)
+            downscaled_width, downscaled_height = (
+                int(original_width * scale),
+                int(original_height * scale),
+            )
+            effective_resolution = min(
+                downscaled_width * downscaled_height, original_width * original_height
+            )
             wasted_resolution = (width * height) - effective_resolution

             if effective_resolution > max_effective_resolution or (
-                    effective_resolution == max_effective_resolution
-                    and wasted_resolution < min_wasted_resolution):
+                effective_resolution == max_effective_resolution
+                and wasted_resolution < min_wasted_resolution
+            ):
                 max_effective_resolution = effective_resolution
                 min_wasted_resolution = wasted_resolution
                 best_fit = (width, height)
@@ -198,12 +203,20 @@ class DeepseekVLV2Processor(ProcessorMixin):
            - num_image_tokens (list[int]): the number of image tokens
         """

-        assert (prompt is not None and images is not None
-                ), "prompt and images must be used at the same time."
+        assert prompt is not None and images is not None, (
+            "prompt and images must be used at the same time."
+        )

         sft_format = prompt
-        tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens = self.tokenize_with_images(
-            sft_format, images, bos=True, eos=True, cropping=len(images) <= 2)
+        (
+            tokenized_str,
+            images_list,
+            images_seq_mask,
+            images_spatial_crop,
+            num_image_tokens,
+        ) = self.tokenize_with_images(
+            sft_format, images, bos=True, eos=True, cropping=len(images) <= 2
+        )
         masked_tokenized_str = []
         for token_index in tokenized_str:
             if token_index != self.image_token_id:
@@ -211,17 +224,21 @@ class DeepseekVLV2Processor(ProcessorMixin):
             else:
                 masked_tokenized_str.append(self.ignore_id)

-        assert len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str), \
-            (f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
-             f"imags_seq_mask's length {len(images_seq_mask)}, are not equal")
+        assert (
+            len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str)
+        ), (
+            f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
+            f"imags_seq_mask's length {len(images_seq_mask)}, are not equal"
+        )

         input_ids = torch.LongTensor(tokenized_str)
         target_ids = torch.LongTensor(masked_tokenized_str)
         images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)

         # set input_ids < 0 | input_ids == self.image_token_id as ignore_id
-        target_ids[(input_ids < 0) |
-                   (input_ids == self.image_token_id)] = self.ignore_id
+        target_ids[(input_ids < 0) | (input_ids == self.image_token_id)] = (
+            self.ignore_id
+        )
         input_ids[input_ids < 0] = self.pad_id

         if inference_mode:
@@ -311,30 +328,50 @@ class DeepseekVLV2Processor(ProcessorMixin):
             best_width, best_height = self.image_size, self.image_size

             """process the global view"""
-            global_view = ImageOps.pad(image, (self.image_size, self.image_size),
-                                       color=tuple(int(x * 255) for x in self.image_transform.mean))
+            global_view = ImageOps.pad(
+                image,
+                (self.image_size, self.image_size),
+                color=tuple(int(x * 255) for x in self.image_transform.mean),
+            )
             images_list.append(self.image_transform(global_view))

             """process the local views"""
-            local_view = ImageOps.pad(image, (best_width, best_height),
-                                      color=tuple(int(x * 255) for x in self.image_transform.mean))
+            local_view = ImageOps.pad(
+                image,
+                (best_width, best_height),
+                color=tuple(int(x * 255) for x in self.image_transform.mean),
+            )
             for i in range(0, best_height, self.image_size):
                 for j in range(0, best_width, self.image_size):
                     images_list.append(
-                        self.image_transform(local_view.crop((j, i, j + self.image_size, i + self.image_size))))
+                        self.image_transform(
+                            local_view.crop(
+                                (j, i, j + self.image_size, i + self.image_size)
+                            )
+                        )
+                    )

             """record height / width crop num"""
-            num_width_tiles, num_height_tiles = best_width // self.image_size, best_height // self.image_size
+            num_width_tiles, num_height_tiles = (
+                best_width // self.image_size,
+                best_height // self.image_size,
+            )
             images_spatial_crop.append([num_width_tiles, num_height_tiles])

             """add image tokens"""
-            h = w = math.ceil((self.image_size // self.patch_size) / self.downsample_ratio)
+            h = w = math.ceil(
+                (self.image_size // self.patch_size) / self.downsample_ratio
+            )
             # global views tokens h * (w + 1), 1 is for line separator
             tokenized_image = [self.image_token_id] * h * (w + 1)
             # add a separator between global and local views
             tokenized_image += [self.image_token_id]
             # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
-            tokenized_image += [self.image_token_id] * (num_height_tiles * h) * (num_width_tiles * w + 1)
+            tokenized_image += (
+                [self.image_token_id]
+                * (num_height_tiles * h)
+                * (num_width_tiles * w + 1)
+            )

             tokenized_str += tokenized_image
             images_seq_mask += [True] * len(tokenized_image)
@@ -353,10 +390,17 @@ class DeepseekVLV2Processor(ProcessorMixin):
             tokenized_str = tokenized_str + [self.eos_id]
             images_seq_mask = images_seq_mask + [False]

-        assert len(tokenized_str) == len(
-            images_seq_mask), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
+        assert len(tokenized_str) == len(images_seq_mask), (
+            f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
+        )

-        return tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens
+        return (
+            tokenized_str,
+            images_list,
+            images_seq_mask,
+            images_spatial_crop,
+            num_image_tokens,
+        )


 AutoProcessor.register("DeepseekVLV2Processor", DeepseekVLV2Processor)

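The image-token arithmetic in tokenize_with_images above is easy to misread, so here is a toy calculation of how many placeholder tokens one image contributes (the parameter values are invented for illustration and are not DeepSeek-VL2's real defaults):

import math

image_size, patch_size, downsample_ratio = 384, 32, 2  # invented example values
num_width_tiles, num_height_tiles = 2, 1  # from the chosen best-fit resolution

h = w = math.ceil((image_size // patch_size) / downsample_ratio)  # 6
global_tokens = h * (w + 1)  # 6 * 7 = 42; the "+ 1" is the per-row line separator
separator = 1  # one token between the global view and the local views
local_tokens = (num_height_tiles * h) * (num_width_tiles * w + 1)  # 6 * 13 = 78
print(global_tokens + separator + local_tokens)  # 121 image tokens for this image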
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-# yapf: disable
 # ruff: noqa: E501
 # coding=utf-8
 # adapted from https://github.com/AIDC-AI/Ovis/blob/35ab51a1a1e3542fa6db260a1084cefbc8f164bb/ovis/vllm/processing_ovis.py
@@ -35,23 +34,24 @@ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput

 from vllm.multimodal.image import convert_image_mode

-__all__ = ['OvisProcessor']
+__all__ = ["OvisProcessor"]
 IGNORE_ID = -100

-class OvisProcessorKwargs(ProcessingKwargs, total=False): # type: ignore[call-arg]
-
+class OvisProcessorKwargs(ProcessingKwargs, total=False):  # type: ignore[call-arg]
     _defaults = {
         "text_kwargs": {
             "padding": False,
         },
         "images_kwargs": {
-            'max_partition':9,
-            'covering_threshold':0.9,
-            'convert_to_rgb':True,
-            'return_tensors':'pt'},
+            "max_partition": 9,
+            "covering_threshold": 0.9,
+            "convert_to_rgb": True,
+            "return_tensors": "pt",
+        },
     }


 class OvisProcessor(ProcessorMixin):
     r"""
     Constructs an Ovis processor which wraps an Ovis image processor and a Qwen2 tokenizer into a single processor.
@@ -97,14 +97,16 @@ class OvisProcessor(ProcessorMixin):
             "image_col_sep": -303,
             "image_row_sep": -304,
             "image_end": -305,
-            'image_pad': image_pad_token_id,
+            "image_pad": image_pad_token_id,
         }
         return extra_special_tokens

     def __call__(
         self,
         images: ImageInput = None,
-        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        text: Union[
+            TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]
+        ] = None,
         **kwargs: Unpack[OvisProcessorKwargs],
     ) -> BatchFeature:
         """
@@ -169,7 +171,6 @@ class OvisProcessor(ProcessorMixin):

         # Process text input
         if text is not None:
-
             if not isinstance(text, list):
                 text = [text]

@@ -178,7 +179,10 @@ class OvisProcessor(ProcessorMixin):
             replaced_ids_list = []
             idx = 0
             for ids_tensor in tokenized_batched_text:
-                if image_token_id in ids_tensor and "image_placeholders" in image_features:
+                if (
+                    image_token_id in ids_tensor
+                    and "image_placeholders" in image_features
+                ):
                     if idx < len(image_features["image_placeholders"]):
                         # Converts in list for ease of use
                         ids_list = ids_tensor.tolist()
@@ -188,7 +192,9 @@ class OvisProcessor(ProcessorMixin):
                         # replace placeholders
                         for i, token_id in enumerate(ids_list):
                             if token_id == image_token_id:
-                                placeholder_ids = image_features["image_placeholders"][idx]
+                                placeholder_ids = image_features["image_placeholders"][
+                                    idx
+                                ]
                                 new_ids.extend(placeholder_ids)
                                 idx += 1
                             else:
@@ -198,7 +204,8 @@ class OvisProcessor(ProcessorMixin):
                         ids_tensor = torch.tensor(new_ids, dtype=torch.long)
                     else:
                         raise RuntimeError(
-                            'Mismatch between the images you provided and the number of placeholder present in the text')
+                            "Mismatch between the images you provided and the number of placeholder present in the text"
+                        )

                 replaced_ids_list.append(ids_tensor)

@@ -217,7 +224,7 @@ class OvisProcessor(ProcessorMixin):
         # Add image features if present
         if image_features:
             output["pixel_values"] = processed_images
-            output['grids'] = grids
+            output["grids"] = grids

         return output

@@ -227,8 +234,10 @@ class OvisProcessor(ProcessorMixin):
     def _tokenize_with_image_symbol(self, text_list: list[str]) -> torch.LongTensor:
         batch_token_ids = []
         for text in text_list:
-            text_chunks = [self.tokenizer(chunk, add_special_tokens=False).input_ids for chunk in
-                           text.split(self.image_token)]
+            text_chunks = [
+                self.tokenizer(chunk, add_special_tokens=False).input_ids
+                for chunk in text.split(self.image_token)
+            ]
             token_ids = []
             num_chuck = len(text_chunks)
             for i, chunk in enumerate(text_chunks):
@@ -240,50 +249,60 @@ class OvisProcessor(ProcessorMixin):

     def get_image_size(self):
         size = self.image_processor.size
-        if 'shortest_edge' in size:
-            width = height = size['shortest_edge']
+        if "shortest_edge" in size:
+            width = height = size["shortest_edge"]
         elif "height" in size and "width" in size:
-            width = size['width']
-            height = size['height']
+            width = size["width"]
+            height = size["height"]
         else:
-            raise ValueError( "Can't parse image size from image_processor config.")
+            raise ValueError("Can't parse image size from image_processor config.")
         return height, width

     def get_token_value(self, tok):
         return self.extra_special_tokens[tok]

     def construct_image_indicators(self, grid):
-        image_placeholders = [self.get_token_value('image_start'),
-                              self.get_token_value('image_atom'),
-                              self.get_token_value('image_prefix')]
+        image_placeholders = [
+            self.get_token_value("image_start"),
+            self.get_token_value("image_atom"),
+            self.get_token_value("image_prefix"),
+        ]
         if grid[0] * grid[1] > 1:
             for r in range(grid[0]):
                 for c in range(grid[1]):
-                    image_placeholders.append(self.get_token_value('image_atom') )
+                    image_placeholders.append(self.get_token_value("image_atom"))
                     if c < grid[1] - 1:
-                        image_placeholders.append(self.get_token_value('image_col_sep'))
+                        image_placeholders.append(self.get_token_value("image_col_sep"))
                 if r < grid[0] - 1:
-                    image_placeholders.append(self.get_token_value('image_row_sep'))
-        image_placeholders.append(self.get_token_value('image_end'))
+                    image_placeholders.append(self.get_token_value("image_row_sep"))
+        image_placeholders.append(self.get_token_value("image_end"))
         return image_placeholders

     def construct_image_placeholders(self, grid):
-
         image_placeholders = self.construct_image_indicators(grid)

-        image_atom_token_id = self.get_token_value('image_atom')
+        image_atom_token_id = self.get_token_value("image_atom")
         # Extract the padding token ID from tokenizer
-        image_padding_token_id = self.get_token_value('image_pad')
+        image_padding_token_id = self.get_token_value("image_pad")

         # Create a new list with padding tokens inserted
         padded_placeholder_tokens = []
         for token in image_placeholders:
             padded_placeholder_tokens.append(image_padding_token_id)
             if token == image_atom_token_id:
-                padded_placeholder_tokens.extend([image_padding_token_id] * self.image_segment_len)
+                padded_placeholder_tokens.extend(
+                    [image_padding_token_id] * self.image_segment_len
+                )
         return padded_placeholder_tokens

-    def preprocess_image(self, image: PIL.Image.Image, max_partition, covering_threshold, convert_to_rgb, return_tensors):
+    def preprocess_image(
+        self,
+        image: PIL.Image.Image,
+        max_partition,
+        covering_threshold,
+        convert_to_rgb,
+        return_tensors,
+    ):
         def _preprocess(img: PIL.Image.Image, side):
             # first resize and preprocess
             w, h = img.size
@@ -296,19 +315,27 @@ class OvisProcessor(ProcessorMixin):
                 new_height = side
                 new_width = int(w / h * new_height)
             new_size = dict(height=new_height, width=new_width)
-            pixel_values = self.image_processor.preprocess(img, size=new_size, return_tensors=return_tensors)['pixel_values']
+            pixel_values = self.image_processor.preprocess(
+                img, size=new_size, return_tensors=return_tensors
+            )["pixel_values"]

             # then pad to square
-            square_values = torch.zeros([1, 3, side, side], dtype=pixel_values.dtype, device=pixel_values.device)
+            square_values = torch.zeros(
+                [1, 3, side, side], dtype=pixel_values.dtype, device=pixel_values.device
+            )
             new_height, new_width = pixel_values.shape[2:]
             if new_height == new_width:
                 square_values[:, :, :, :] = pixel_values
             elif new_height > new_width:
                 from_index = (side - new_width) // 2
-                square_values[:, :, :, from_index:from_index + new_width] = pixel_values
+                square_values[:, :, :, from_index : from_index + new_width] = (
+                    pixel_values
+                )
             else:
                 from_index = (side - new_height) // 2
-                square_values[:, :, from_index:from_index + new_height, :] = pixel_values
+                square_values[:, :, from_index : from_index + new_height, :] = (
+                    pixel_values
+                )

             return square_values

@@ -350,7 +377,9 @@ class OvisProcessor(ProcessorMixin):
            good_grids = []
            for grid in candidate_grids:
                partition = _partition(img, grid)
-                covering_ratio = sum([_covering_area(*p, side) for p in partition]) / img_area
+                covering_ratio = (
+                    sum([_covering_area(*p, side) for p in partition]) / img_area
+                )
                assert covering_ratio <= 1.0
                all_grids.append((grid, covering_ratio))
                if covering_ratio > covering_threshold:
@@ -358,18 +387,19 @@ class OvisProcessor(ProcessorMixin):

            if len(good_grids) > 0:
                # pick the good partition with minimum #sub_images and break the tie using covering_ratio
-                return sorted(good_grids, key=lambda x: (x[0][0] * x[0][1], -x[1]))[0][0]
+                return sorted(good_grids, key=lambda x: (x[0][0] * x[0][1], -x[1]))[0][
+                    0
+                ]
            else:
                # pick the partition with maximum covering_ratio and break the tie using #sub_images
                return sorted(all_grids, key=lambda x: (-x[1], x[0][0] * x[0][1]))[0][0]

         if convert_to_rgb:
-            image = convert_image_mode(image, 'RGB')
-
+            image = convert_image_mode(image, "RGB")

         sides = self.get_image_size()
         if sides[0] != sides[1]:
-            raise ValueError('get_image_size() returns non-square size')
+            raise ValueError("get_image_size() returns non-square size")
         side = sides[0]
         grid = _get_best_grid(image, side)
         partition = _partition(image, grid)
@@ -405,14 +435,18 @@ class OvisProcessor(ProcessorMixin):
                 `list[str]`: The decoded text.
         """
         return self.tokenizer.batch_decode(
-            generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False
+            generated_outputs,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
         )

     @property
     def model_input_names(self):
         tokenizer_input_names = self.tokenizer.model_input_names
         image_processor_input_names = self.image_processor.model_input_names
-        names_from_processor = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+        names_from_processor = list(
+            dict.fromkeys(tokenizer_input_names + image_processor_input_names)
+        )
         return names_from_processor + ["second_per_grid_ts"]
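To make construct_image_indicators above concrete, the sketch below reproduces its layout for a 2x2 grid, using token names in place of the negative sentinel IDs the real processor uses (hypothetical standalone function, not part of the commit):

def indicators_for_grid(rows: int, cols: int) -> list[str]:
    # Leading start/atom/prefix block, one atom per tile with column and row
    # separators between tiles, and a closing end marker.
    out = ["image_start", "image_atom", "image_prefix"]
    if rows * cols > 1:
        for r in range(rows):
            for c in range(cols):
                out.append("image_atom")
                if c < cols - 1:
                    out.append("image_col_sep")
            if r < rows - 1:
                out.append("image_row_sep")
    out.append("image_end")
    return out


print(indicators_for_grid(2, 2))
# ['image_start', 'image_atom', 'image_prefix',
#  'image_atom', 'image_col_sep', 'image_atom', 'image_row_sep',
#  'image_atom', 'image_col_sep', 'image_atom', 'image_end']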