Remove all references to yapf as it's no longer used (#26251)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
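The yapf pragmas deleted in the hunks below are formatter-suppression comments; once the project formats with ruff format alone (as the `# ruff: noqa` pragmas in these files suggest), they have no effect and only add noise. A minimal sketch of the pattern, using a hypothetical mapping name rather than anything taken from the diff:

# Before: yapf had to be told not to re-flow the hand-formatted mapping.
# yapf: disable
_EXAMPLE_TEMPLATE_FALLBACKS = {
    "blip-2": "template_blip2.jinja",
    "clip": "template_basic.jinja",
}
# yapf: enable

# After: the markers are dropped outright. If suppression is ever needed again,
# ruff format honours the black-style "# fmt: off" / "# fmt: on" comments.
_EXAMPLE_TEMPLATE_FALLBACKS = {
    "blip-2": "template_blip2.jinja",
    "clip": "template_basic.jinja",
}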
@@ -28,7 +28,6 @@ def _get_minicpmv_chat_template_fallback(tokenizer_name_or_path: str) -> Optiona
     return CHAT_TEMPLATES_DIR / "template_chatml.jinja"


-# yapf: disable
 _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = {
     "blip-2": CHAT_TEMPLATES_DIR / "template_blip2.jinja",
     "clip": CHAT_TEMPLATES_DIR / "template_basic.jinja",
@@ -39,7 +38,6 @@ _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = {
     "paligemma": CHAT_TEMPLATES_DIR / "template_basic.jinja",
     "qwen": _get_qwen_chat_template_fallback,
 }
-# yapf: enable


 def register_chat_template_fallback_path(
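Beyond deleting the markers, most of the hunks that follow are the mechanical restyling ruff format applies once it is allowed to touch these files: single quotes become double quotes, argument lists gain a trailing comma and are exploded one per line, and backslash-continued assert messages are wrapped in parentheses. A hand-written before/after sketch of those shapes (hypothetical function, not code from this commit, assuming ruff format's black-style defaults):

# Before (yapf-era style)
def load_config(path, *, strict=True,
                encoding='utf-8'):
    assert path.endswith('.json'), \
        f"expect a .json config, got {path!r} which this loader cannot parse"
    return path, strict, encoding


# After (ruff format style)
def load_config(
    path,
    *,
    strict=True,
    encoding="utf-8",
):
    assert path.endswith(".json"), (
        f"expect a .json config, got {path!r} which this loader cannot parse"
    )
    return path, strict, encoding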
@@ -1,12 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-# yapf: disable
 # ruff: noqa: E501
 # coding=utf-8
 # Copied from
 # https://huggingface.co/Snowflake/snowflake-arctic-instruct/blob/main/configuration_arctic.py
-""" Arctic model configuration"""
+"""Arctic model configuration"""

 from dataclasses import asdict, dataclass
 from typing import Any

@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-# yapf: disable
 # ruff: noqa: E501
 # Adapted from
 # https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1/blob/main/configuration.py
@@ -16,7 +15,7 @@ from transformers.dynamic_module_utils import get_class_from_dynamic_module


 class Nemotron_Nano_VL_Config(PretrainedConfig):
-    model_type = 'Llama_Nemotron_Nano_VL'
+    model_type = "Llama_Nemotron_Nano_VL"
     is_composition = True

     def __init__(
@@ -26,17 +25,22 @@ class Nemotron_Nano_VL_Config(PretrainedConfig):
         force_image_size=None,
         downsample_ratio=0.5,
         template=None,
-        ps_version='v1',
+        ps_version="v1",
         image_tag_type="internvl",
         projector_hidden_size=4096,
         vit_hidden_size=1280,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(**kwargs)

         if vision_config is not None:
-            assert "auto_map" in vision_config and "AutoConfig" in vision_config["auto_map"]
-            vision_auto_config = get_class_from_dynamic_module(*vision_config["auto_map"]["AutoConfig"].split("--")[::-1])
+            assert (
+                "auto_map" in vision_config
+                and "AutoConfig" in vision_config["auto_map"]
+            )
+            vision_auto_config = get_class_from_dynamic_module(
+                *vision_config["auto_map"]["AutoConfig"].split("--")[::-1]
+            )
             self.vision_config = vision_auto_config(**vision_config)
         else:
             self.vision_config = PretrainedConfig()
@@ -51,6 +55,6 @@ class Nemotron_Nano_VL_Config(PretrainedConfig):
         self.downsample_ratio = downsample_ratio
         self.template = template # TODO move out of here and into the tokenizer
         self.ps_version = ps_version # Pixel shuffle version
-        self.image_tag_type = image_tag_type # TODO: into the tokenizer too?
+        self.image_tag_type = image_tag_type  # TODO: into the tokenizer too?
         self.projector_hidden_size = projector_hidden_size
         self.vit_hidden_size = vit_hidden_size

@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-# yapf: disable
 # ruff: noqa: E501
 # adapted from https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_aimv2.py
 # and https://huggingface.co/AIDC-AI/Ovis2-1B/blob/main/configuration_ovis.py
@@ -70,34 +69,37 @@ class AIMv2Config(PretrainedConfig):
 # Visual Tokenizer Configuration
 # ----------------------------------------------------------------------
 class BaseVisualTokenizerConfig(PretrainedConfig):
-
-    def __init__(self,
-                 vocab_size=16384,
-                 tokenize_function="softmax",
-                 tau=1.0,
-                 depths=None,
-                 drop_cls_token=False,
-                 backbone_config: Optional[Union[PretrainedConfig,
-                                                 dict]] = None,
-                 hidden_stride: int = 1,
-                 **kwargs):
+    def __init__(
+        self,
+        vocab_size=16384,
+        tokenize_function="softmax",
+        tau=1.0,
+        depths=None,
+        drop_cls_token=False,
+        backbone_config: Optional[Union[PretrainedConfig, dict]] = None,
+        hidden_stride: int = 1,
+        **kwargs,
+    ):
         super().__init__(**kwargs)
         self.vocab_size = vocab_size
         self.tokenize_function = tokenize_function
         self.tau = tau
         if isinstance(depths, str):
-            depths = [int(x) for x in depths.split('|')]
+            depths = [int(x) for x in depths.split("|")]
         self.depths = depths
         self.backbone_kwargs = dict[str, Any]()
         self.drop_cls_token = drop_cls_token
         if backbone_config is not None:
-            assert isinstance(backbone_config, (PretrainedConfig, dict)), \
+            assert isinstance(backbone_config, (PretrainedConfig, dict)), (
                 f"expect `backbone_config` to be instance of PretrainedConfig or dict, but got {type(backbone_config)} type"
+            )
             if not isinstance(backbone_config, PretrainedConfig):
-                model_type = backbone_config['model_type']
+                model_type = backbone_config["model_type"]
                 if model_type != "aimv2":
-                    backbone_config.pop('model_type')
-                    backbone_config = AutoConfig.for_model(model_type, **backbone_config)
+                    backbone_config.pop("model_type")
+                    backbone_config = AutoConfig.for_model(
+                        model_type, **backbone_config
+                    )
                 else:
                     backbone_config = AIMv2Config(**backbone_config)
             self.backbone_config = backbone_config
@@ -113,7 +115,7 @@ class Aimv2VisualTokenizerConfig(BaseVisualTokenizerConfig):
         self.drop_cls_token = False
         if self.depths:
             assert len(self.depths) == 1
-            self.backbone_kwargs['num_hidden_layers'] = self.depths[0]
+            self.backbone_kwargs["num_hidden_layers"] = self.depths[0]


 class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig):
@@ -125,7 +127,7 @@ class SiglipVisualTokenizerConfig(BaseVisualTokenizerConfig):
         self.drop_cls_token = False
         if self.depths:
             assert len(self.depths) == 1
-            self.backbone_kwargs['num_hidden_layers'] = self.depths[0]
+            self.backbone_kwargs["num_hidden_layers"] = self.depths[0]


 AutoConfig.register("siglip_visual_tokenizer", SiglipVisualTokenizerConfig)
@@ -138,35 +140,39 @@ AutoConfig.register("aimv2_visual_tokenizer", Aimv2VisualTokenizerConfig)
 class OvisConfig(PretrainedConfig):
     model_type = "ovis"

-    def __init__(self,
-                 llm_config: Optional[Union[PretrainedConfig, dict]] = None,
-                 visual_tokenizer_config: Optional[Union[PretrainedConfig,
-                                                         dict]] = None,
-                 multimodal_max_length=8192,
-                 hidden_size=None,
-                 conversation_formatter_class=None,
-                 llm_attn_implementation=None,
-                 disable_tie_weight=False,
-                 **kwargs):
+    def __init__(
+        self,
+        llm_config: Optional[Union[PretrainedConfig, dict]] = None,
+        visual_tokenizer_config: Optional[Union[PretrainedConfig, dict]] = None,
+        multimodal_max_length=8192,
+        hidden_size=None,
+        conversation_formatter_class=None,
+        llm_attn_implementation=None,
+        disable_tie_weight=False,
+        **kwargs,
+    ):
         super().__init__(**kwargs)
         if llm_config is not None:
-            assert isinstance(llm_config, (PretrainedConfig, dict)), \
+            assert isinstance(llm_config, (PretrainedConfig, dict)), (
                 f"expect `llm_config` to be instance of PretrainedConfig or dict, but got {type(llm_config)} type"
+            )
             if not isinstance(llm_config, PretrainedConfig):
-                model_type = llm_config['model_type']
-                llm_config.pop('model_type')
+                model_type = llm_config["model_type"]
+                llm_config.pop("model_type")
                 llm_config = AutoConfig.for_model(model_type, **llm_config)

         # map llm_config to text_config
         self.text_config = llm_config
         if visual_tokenizer_config is not None:
-            assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), \
+            assert isinstance(visual_tokenizer_config, (PretrainedConfig, dict)), (
                 f"expect `visual_tokenizer_config` to be instance of PretrainedConfig or dict, but got {type(visual_tokenizer_config)} type"
+            )
             if not isinstance(visual_tokenizer_config, PretrainedConfig):
-                model_type = visual_tokenizer_config['model_type']
-                visual_tokenizer_config.pop('model_type')
+                model_type = visual_tokenizer_config["model_type"]
+                visual_tokenizer_config.pop("model_type")
                 visual_tokenizer_config = AutoConfig.for_model(
-                    model_type, **visual_tokenizer_config)
+                    model_type, **visual_tokenizer_config
+                )

         self.visual_tokenizer_config = visual_tokenizer_config
         self.multimodal_max_length = multimodal_max_length

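The OvisConfig and BaseVisualTokenizerConfig hunks above keep repeating one idiom: a nested config may arrive either as a ready-made PretrainedConfig or as a plain dict, and the dict form is normalised by popping its model_type and handing the rest to AutoConfig.for_model. A condensed sketch of that idiom as a standalone helper (hypothetical function, not part of the commit):

from typing import Optional, Union

from transformers import AutoConfig, PretrainedConfig


def normalize_sub_config(
    sub_config: Optional[Union[PretrainedConfig, dict]],
) -> Optional[PretrainedConfig]:
    # An already-built config object (or None) passes through untouched.
    if sub_config is None or isinstance(sub_config, PretrainedConfig):
        return sub_config
    assert isinstance(sub_config, dict), (
        f"expect PretrainedConfig or dict, got {type(sub_config)}"
    )
    # A plain dict names its own model_type; pop it and let AutoConfig
    # pick the matching config class for the remaining keyword arguments.
    model_type = sub_config.pop("model_type")
    return AutoConfig.for_model(model_type, **sub_config)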
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-# yapf: disable
 # ruff: noqa: E501
 # coding=utf-8
 # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/ff23960c5cf9e6874b44be38af930cfb0ccbb620/deepseek_vl2/models/processing_deepseek_vl_v2.py
@@ -35,11 +34,12 @@ from transformers.processing_utils import ProcessorMixin


 class ImageTransform:
-
-    def __init__(self,
-                 mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
-                 std: tuple[float, float, float] = (0.5, 0.5, 0.5),
-                 normalize: bool = True):
+    def __init__(
+        self,
+        mean: tuple[float, float, float] = (0.5, 0.5, 0.5),
+        std: tuple[float, float, float] = (0.5, 0.5, 0.5),
+        normalize: bool = True,
+    ):
         self.mean = mean
         self.std = std
         self.normalize = normalize
@@ -77,7 +77,6 @@ class DeepseekVLV2Processor(ProcessorMixin):
         ignore_id: int = -100,
         **kwargs,
     ):
-
         self.candidate_resolutions = candidate_resolutions
         self.image_size = candidate_resolutions[0][0]
         self.patch_size = patch_size
@@ -86,13 +85,15 @@ class DeepseekVLV2Processor(ProcessorMixin):
         self.normalize = normalize
         self.downsample_ratio = downsample_ratio

-        self.image_transform = ImageTransform(mean=image_mean, std=image_std, normalize=normalize)
+        self.image_transform = ImageTransform(
+            mean=image_mean, std=image_std, normalize=normalize
+        )
         self.tokenizer = tokenizer
-        self.tokenizer.padding_side = 'left' # must set this,padding side with make a difference in batch inference
+        self.tokenizer.padding_side = "left"  # must set this,padding side with make a difference in batch inference

         # add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id'
         if tokenizer.pad_token is None:
-            self.tokenizer.add_special_tokens({'pad_token': pad_token})
+            self.tokenizer.add_special_tokens({"pad_token": pad_token})

         # add image token
         image_token_id = self.tokenizer.vocab.get(image_token)
@@ -104,7 +105,7 @@ class DeepseekVLV2Processor(ProcessorMixin):

         # add five special tokens for grounding-related tasks
         # <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|>
-        special_tokens = ['<|ref|>', '<|/ref|>', '<|det|>', '<|/det|>', '<|grounding|>']
+        special_tokens = ["<|ref|>", "<|/ref|>", "<|det|>", "<|/det|>", "<|grounding|>"]
         special_tokens_dict = {"additional_special_tokens": special_tokens}
         self.tokenizer.add_special_tokens(special_tokens_dict)

@@ -134,15 +135,19 @@ class DeepseekVLV2Processor(ProcessorMixin):

         for width, height in self.candidate_resolutions:
             scale = min(width / original_width, height / original_height)
-            downscaled_width, downscaled_height = int(
-                original_width * scale), int(original_height * scale)
-            effective_resolution = min(downscaled_width * downscaled_height,
-                                       original_width * original_height)
+            downscaled_width, downscaled_height = (
+                int(original_width * scale),
+                int(original_height * scale),
+            )
+            effective_resolution = min(
+                downscaled_width * downscaled_height, original_width * original_height
+            )
             wasted_resolution = (width * height) - effective_resolution

             if effective_resolution > max_effective_resolution or (
-                    effective_resolution == max_effective_resolution
-                    and wasted_resolution < min_wasted_resolution):
+                effective_resolution == max_effective_resolution
+                and wasted_resolution < min_wasted_resolution
+            ):
                 max_effective_resolution = effective_resolution
                 min_wasted_resolution = wasted_resolution
                 best_fit = (width, height)
@@ -198,12 +203,20 @@ class DeepseekVLV2Processor(ProcessorMixin):
            - num_image_tokens (list[int]): the number of image tokens
         """

-        assert (prompt is not None and images is not None
-                ), "prompt and images must be used at the same time."
+        assert prompt is not None and images is not None, (
+            "prompt and images must be used at the same time."
+        )

         sft_format = prompt
-        tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens = self.tokenize_with_images(
-            sft_format, images, bos=True, eos=True, cropping=len(images) <= 2)
+        (
+            tokenized_str,
+            images_list,
+            images_seq_mask,
+            images_spatial_crop,
+            num_image_tokens,
+        ) = self.tokenize_with_images(
+            sft_format, images, bos=True, eos=True, cropping=len(images) <= 2
+        )
         masked_tokenized_str = []
         for token_index in tokenized_str:
             if token_index != self.image_token_id:
@@ -211,17 +224,21 @@ class DeepseekVLV2Processor(ProcessorMixin):
             else:
                 masked_tokenized_str.append(self.ignore_id)

-        assert len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str), \
-            (f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
-             f"imags_seq_mask's length {len(images_seq_mask)}, are not equal")
+        assert (
+            len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str)
+        ), (
+            f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, "
+            f"imags_seq_mask's length {len(images_seq_mask)}, are not equal"
+        )

         input_ids = torch.LongTensor(tokenized_str)
         target_ids = torch.LongTensor(masked_tokenized_str)
         images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)

         # set input_ids < 0 | input_ids == self.image_token_id as ignore_id
-        target_ids[(input_ids < 0) |
-                   (input_ids == self.image_token_id)] = self.ignore_id
+        target_ids[(input_ids < 0) | (input_ids == self.image_token_id)] = (
+            self.ignore_id
+        )
         input_ids[input_ids < 0] = self.pad_id

         if inference_mode:
@@ -311,30 +328,50 @@ class DeepseekVLV2Processor(ProcessorMixin):
             best_width, best_height = self.image_size, self.image_size

             """process the global view"""
-            global_view = ImageOps.pad(image, (self.image_size, self.image_size),
-                                       color=tuple(int(x * 255) for x in self.image_transform.mean))
+            global_view = ImageOps.pad(
+                image,
+                (self.image_size, self.image_size),
+                color=tuple(int(x * 255) for x in self.image_transform.mean),
+            )
             images_list.append(self.image_transform(global_view))

             """process the local views"""
-            local_view = ImageOps.pad(image, (best_width, best_height),
-                                      color=tuple(int(x * 255) for x in self.image_transform.mean))
+            local_view = ImageOps.pad(
+                image,
+                (best_width, best_height),
+                color=tuple(int(x * 255) for x in self.image_transform.mean),
+            )
             for i in range(0, best_height, self.image_size):
                 for j in range(0, best_width, self.image_size):
                     images_list.append(
-                        self.image_transform(local_view.crop((j, i, j + self.image_size, i + self.image_size))))
+                        self.image_transform(
+                            local_view.crop(
+                                (j, i, j + self.image_size, i + self.image_size)
+                            )
+                        )
+                    )

             """record height / width crop num"""
-            num_width_tiles, num_height_tiles = best_width // self.image_size, best_height // self.image_size
+            num_width_tiles, num_height_tiles = (
+                best_width // self.image_size,
+                best_height // self.image_size,
+            )
             images_spatial_crop.append([num_width_tiles, num_height_tiles])

             """add image tokens"""
-            h = w = math.ceil((self.image_size // self.patch_size) / self.downsample_ratio)
+            h = w = math.ceil(
+                (self.image_size // self.patch_size) / self.downsample_ratio
+            )
             # global views tokens h * (w + 1), 1 is for line separator
             tokenized_image = [self.image_token_id] * h * (w + 1)
             # add a separator between global and local views
             tokenized_image += [self.image_token_id]
             # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1)
-            tokenized_image += [self.image_token_id] * (num_height_tiles * h) * (num_width_tiles * w + 1)
+            tokenized_image += (
+                [self.image_token_id]
+                * (num_height_tiles * h)
+                * (num_width_tiles * w + 1)
+            )

             tokenized_str += tokenized_image
             images_seq_mask += [True] * len(tokenized_image)
@@ -353,10 +390,17 @@ class DeepseekVLV2Processor(ProcessorMixin):
             tokenized_str = tokenized_str + [self.eos_id]
             images_seq_mask = images_seq_mask + [False]

-        assert len(tokenized_str) == len(
-            images_seq_mask), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
+        assert len(tokenized_str) == len(images_seq_mask), (
+            f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}"
+        )

-        return tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens
+        return (
+            tokenized_str,
+            images_list,
+            images_seq_mask,
+            images_spatial_crop,
+            num_image_tokens,
+        )


 AutoProcessor.register("DeepseekVLV2Processor", DeepseekVLV2Processor)

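The image-token arithmetic in tokenize_with_images above is easy to misread, so here is a toy calculation of how many placeholder tokens one image contributes (the parameter values are invented for illustration and are not DeepSeek-VL2's real defaults):

import math

image_size, patch_size, downsample_ratio = 384, 32, 2  # invented example values
num_width_tiles, num_height_tiles = 2, 1  # from the chosen best-fit resolution

h = w = math.ceil((image_size // patch_size) / downsample_ratio)  # 6
global_tokens = h * (w + 1)  # 6 * 7 = 42; the "+ 1" is the per-row line separator
separator = 1  # one token between the global view and the local views
local_tokens = (num_height_tiles * h) * (num_width_tiles * w + 1)  # 6 * 13 = 78
print(global_tokens + separator + local_tokens)  # 121 image tokens for this image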
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-# yapf: disable
 # ruff: noqa: E501
 # coding=utf-8
 # adapted from https://github.com/AIDC-AI/Ovis/blob/35ab51a1a1e3542fa6db260a1084cefbc8f164bb/ovis/vllm/processing_ovis.py
@@ -35,23 +34,24 @@ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput

 from vllm.multimodal.image import convert_image_mode

-__all__ = ['OvisProcessor']
+__all__ = ["OvisProcessor"]
 IGNORE_ID = -100

-class OvisProcessorKwargs(ProcessingKwargs, total=False): # type: ignore[call-arg]
-
+class OvisProcessorKwargs(ProcessingKwargs, total=False):  # type: ignore[call-arg]
     _defaults = {
         "text_kwargs": {
             "padding": False,
         },
         "images_kwargs": {
-            'max_partition':9,
-            'covering_threshold':0.9,
-            'convert_to_rgb':True,
-            'return_tensors':'pt'},
+            "max_partition": 9,
+            "covering_threshold": 0.9,
+            "convert_to_rgb": True,
+            "return_tensors": "pt",
+        },
     }


 class OvisProcessor(ProcessorMixin):
     r"""
     Constructs an Ovis processor which wraps an Ovis image processor and a Qwen2 tokenizer into a single processor.
@@ -97,14 +97,16 @@ class OvisProcessor(ProcessorMixin):
             "image_col_sep": -303,
             "image_row_sep": -304,
             "image_end": -305,
-            'image_pad': image_pad_token_id,
+            "image_pad": image_pad_token_id,
         }
         return extra_special_tokens

     def __call__(
         self,
         images: ImageInput = None,
-        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        text: Union[
+            TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]
+        ] = None,
         **kwargs: Unpack[OvisProcessorKwargs],
     ) -> BatchFeature:
         """
@@ -169,7 +171,6 @@ class OvisProcessor(ProcessorMixin):

         # Process text input
         if text is not None:
-
             if not isinstance(text, list):
                 text = [text]

@@ -178,7 +179,10 @@ class OvisProcessor(ProcessorMixin):
             replaced_ids_list = []
             idx = 0
             for ids_tensor in tokenized_batched_text:
-                if image_token_id in ids_tensor and "image_placeholders" in image_features:
+                if (
+                    image_token_id in ids_tensor
+                    and "image_placeholders" in image_features
+                ):
                     if idx < len(image_features["image_placeholders"]):
                         # Converts in list for ease of use
                         ids_list = ids_tensor.tolist()
@@ -188,7 +192,9 @@ class OvisProcessor(ProcessorMixin):
                         # replace placeholders
                         for i, token_id in enumerate(ids_list):
                             if token_id == image_token_id:
-                                placeholder_ids = image_features["image_placeholders"][idx]
+                                placeholder_ids = image_features["image_placeholders"][
+                                    idx
+                                ]
                                 new_ids.extend(placeholder_ids)
                                 idx += 1
                             else:
@@ -198,7 +204,8 @@ class OvisProcessor(ProcessorMixin):
                         ids_tensor = torch.tensor(new_ids, dtype=torch.long)
                     else:
                         raise RuntimeError(
-                            'Mismatch between the images you provided and the number of placeholder present in the text')
+                            "Mismatch between the images you provided and the number of placeholder present in the text"
+                        )

                 replaced_ids_list.append(ids_tensor)

@@ -217,7 +224,7 @@ class OvisProcessor(ProcessorMixin):
         # Add image features if present
         if image_features:
             output["pixel_values"] = processed_images
-            output['grids'] = grids
+            output["grids"] = grids

         return output

@@ -227,8 +234,10 @@ class OvisProcessor(ProcessorMixin):
     def _tokenize_with_image_symbol(self, text_list: list[str]) -> torch.LongTensor:
         batch_token_ids = []
         for text in text_list:
-            text_chunks = [self.tokenizer(chunk, add_special_tokens=False).input_ids for chunk in
-                           text.split(self.image_token)]
+            text_chunks = [
+                self.tokenizer(chunk, add_special_tokens=False).input_ids
+                for chunk in text.split(self.image_token)
+            ]
             token_ids = []
             num_chuck = len(text_chunks)
             for i, chunk in enumerate(text_chunks):
@@ -240,50 +249,60 @@ class OvisProcessor(ProcessorMixin):

     def get_image_size(self):
         size = self.image_processor.size
-        if 'shortest_edge' in size:
-            width = height = size['shortest_edge']
+        if "shortest_edge" in size:
+            width = height = size["shortest_edge"]
         elif "height" in size and "width" in size:
-            width = size['width']
-            height = size['height']
+            width = size["width"]
+            height = size["height"]
         else:
-            raise ValueError( "Can't parse image size from image_processor config.")
+            raise ValueError("Can't parse image size from image_processor config.")
         return height, width

     def get_token_value(self, tok):
         return self.extra_special_tokens[tok]

     def construct_image_indicators(self, grid):
-        image_placeholders = [self.get_token_value('image_start'),
-                              self.get_token_value('image_atom'),
-                              self.get_token_value('image_prefix')]
+        image_placeholders = [
+            self.get_token_value("image_start"),
+            self.get_token_value("image_atom"),
+            self.get_token_value("image_prefix"),
+        ]
         if grid[0] * grid[1] > 1:
             for r in range(grid[0]):
                 for c in range(grid[1]):
-                    image_placeholders.append(self.get_token_value('image_atom') )
+                    image_placeholders.append(self.get_token_value("image_atom"))
                     if c < grid[1] - 1:
-                        image_placeholders.append(self.get_token_value('image_col_sep'))
+                        image_placeholders.append(self.get_token_value("image_col_sep"))
                 if r < grid[0] - 1:
-                    image_placeholders.append(self.get_token_value('image_row_sep'))
-        image_placeholders.append(self.get_token_value('image_end'))
+                    image_placeholders.append(self.get_token_value("image_row_sep"))
+        image_placeholders.append(self.get_token_value("image_end"))
         return image_placeholders

     def construct_image_placeholders(self, grid):
-
         image_placeholders = self.construct_image_indicators(grid)

-        image_atom_token_id = self.get_token_value('image_atom')
+        image_atom_token_id = self.get_token_value("image_atom")
         # Extract the padding token ID from tokenizer
-        image_padding_token_id = self.get_token_value('image_pad')
+        image_padding_token_id = self.get_token_value("image_pad")

         # Create a new list with padding tokens inserted
         padded_placeholder_tokens = []
         for token in image_placeholders:
             padded_placeholder_tokens.append(image_padding_token_id)
             if token == image_atom_token_id:
-                padded_placeholder_tokens.extend([image_padding_token_id] * self.image_segment_len)
+                padded_placeholder_tokens.extend(
+                    [image_padding_token_id] * self.image_segment_len
+                )
         return padded_placeholder_tokens

-    def preprocess_image(self, image: PIL.Image.Image, max_partition, covering_threshold, convert_to_rgb, return_tensors):
+    def preprocess_image(
+        self,
+        image: PIL.Image.Image,
+        max_partition,
+        covering_threshold,
+        convert_to_rgb,
+        return_tensors,
+    ):
         def _preprocess(img: PIL.Image.Image, side):
             # first resize and preprocess
             w, h = img.size
@@ -296,19 +315,27 @@ class OvisProcessor(ProcessorMixin):
                 new_height = side
                 new_width = int(w / h * new_height)
             new_size = dict(height=new_height, width=new_width)
-            pixel_values = self.image_processor.preprocess(img, size=new_size, return_tensors=return_tensors)['pixel_values']
+            pixel_values = self.image_processor.preprocess(
+                img, size=new_size, return_tensors=return_tensors
+            )["pixel_values"]

             # then pad to square
-            square_values = torch.zeros([1, 3, side, side], dtype=pixel_values.dtype, device=pixel_values.device)
+            square_values = torch.zeros(
+                [1, 3, side, side], dtype=pixel_values.dtype, device=pixel_values.device
+            )
             new_height, new_width = pixel_values.shape[2:]
             if new_height == new_width:
                 square_values[:, :, :, :] = pixel_values
             elif new_height > new_width:
                 from_index = (side - new_width) // 2
-                square_values[:, :, :, from_index:from_index + new_width] = pixel_values
+                square_values[:, :, :, from_index : from_index + new_width] = (
+                    pixel_values
+                )
             else:
                 from_index = (side - new_height) // 2
-                square_values[:, :, from_index:from_index + new_height, :] = pixel_values
+                square_values[:, :, from_index : from_index + new_height, :] = (
+                    pixel_values
+                )

             return square_values

@@ -350,7 +377,9 @@ class OvisProcessor(ProcessorMixin):
            good_grids = []
            for grid in candidate_grids:
                partition = _partition(img, grid)
-                covering_ratio = sum([_covering_area(*p, side) for p in partition]) / img_area
+                covering_ratio = (
+                    sum([_covering_area(*p, side) for p in partition]) / img_area
+                )
                assert covering_ratio <= 1.0
                all_grids.append((grid, covering_ratio))
                if covering_ratio > covering_threshold:
@@ -358,18 +387,19 @@ class OvisProcessor(ProcessorMixin):

            if len(good_grids) > 0:
                # pick the good partition with minimum #sub_images and break the tie using covering_ratio
-                return sorted(good_grids, key=lambda x: (x[0][0] * x[0][1], -x[1]))[0][0]
+                return sorted(good_grids, key=lambda x: (x[0][0] * x[0][1], -x[1]))[0][
+                    0
+                ]
            else:
                # pick the partition with maximum covering_ratio and break the tie using #sub_images
                return sorted(all_grids, key=lambda x: (-x[1], x[0][0] * x[0][1]))[0][0]

         if convert_to_rgb:
-            image = convert_image_mode(image, 'RGB')
-
+            image = convert_image_mode(image, "RGB")

         sides = self.get_image_size()
         if sides[0] != sides[1]:
-            raise ValueError('get_image_size() returns non-square size')
+            raise ValueError("get_image_size() returns non-square size")
         side = sides[0]
         grid = _get_best_grid(image, side)
         partition = _partition(image, grid)
@@ -405,14 +435,18 @@ class OvisProcessor(ProcessorMixin):
                 `list[str]`: The decoded text.
         """
         return self.tokenizer.batch_decode(
-            generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False
+            generated_outputs,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
         )

     @property
     def model_input_names(self):
         tokenizer_input_names = self.tokenizer.model_input_names
         image_processor_input_names = self.image_processor.model_input_names
-        names_from_processor = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+        names_from_processor = list(
+            dict.fromkeys(tokenizer_input_names + image_processor_input_names)
+        )
         return names_from_processor + ["second_per_grid_ts"]
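To make construct_image_indicators above concrete, the sketch below reproduces its layout for a 2x2 grid, using token names in place of the negative sentinel IDs the real processor uses (hypothetical standalone function, not part of the commit):

def indicators_for_grid(rows: int, cols: int) -> list[str]:
    # Leading start/atom/prefix block, one atom per tile with column and row
    # separators between tiles, and a closing end marker.
    out = ["image_start", "image_atom", "image_prefix"]
    if rows * cols > 1:
        for r in range(rows):
            for c in range(cols):
                out.append("image_atom")
                if c < cols - 1:
                    out.append("image_col_sep")
            if r < rows - 1:
                out.append("image_row_sep")
    out.append("image_end")
    return out


print(indicators_for_grid(2, 2))
# ['image_start', 'image_atom', 'image_prefix',
#  'image_atom', 'image_col_sep', 'image_atom', 'image_row_sep',
#  'image_atom', 'image_col_sep', 'image_atom', 'image_end']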