Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-05 15:06:22 +01:00
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions
--- a/tests/models/multimodal/generation/vlm_utils/model_utils.py
+++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py
@@ -4,6 +4,7 @@
 for manipulating the input / output of HF & vLLM test runners, which are
 typically specific to a small subset of models.
 """
+
 import types
 from pathlib import PosixPath
 from typing import Optional, Union
@@ -15,8 +16,13 @@ import pytest
 import regex as re
 import torch
 from PIL.Image import Image
-from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
-                          GenerationConfig, GenerationMixin)
+from transformers import (
+    AutoConfig,
+    AutoTokenizer,
+    BatchFeature,
+    GenerationConfig,
+    GenerationMixin,
+)
 from transformers.video_utils import VideoMetadata

 from vllm.logprobs import SampleLogprobs
@@ -27,8 +33,7 @@ from .types import RunnerOutput


 ####### vLLM output processors functions
-def blip2_vllm_to_hf_output(vllm_output: RunnerOutput,
-                            model: str) -> RunnerOutput:
+def blip2_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
    """Sanitize vllm output [blip2 models] to be comparable with hf output."""
    _, output_str, out_logprobs = vllm_output

@@ -42,8 +47,7 @@ def blip2_vllm_to_hf_output(vllm_output: RunnerOutput,
    return hf_output_ids, hf_output_str, out_logprobs


-def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput,
-                           model: str) -> RunnerOutput:
+def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
    """Sanitize vllm output [fuyu models] to be comparable with hf output."""
    output_ids, output_str, out_logprobs = vllm_output

@@ -53,8 +57,8 @@ def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput,


 def qwen_vllm_to_hf_output(
-        vllm_output: RunnerOutput,
-        model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
+    vllm_output: RunnerOutput, model: str
+) -> tuple[list[int], str, Optional[SampleLogprobs]]:
    """Sanitize vllm output [qwen models] to be comparable with hf output."""
    output_ids, output_str, out_logprobs = vllm_output

@@ -64,8 +68,8 @@ def qwen_vllm_to_hf_output(


 def qwen2_vllm_to_hf_output(
-        vllm_output: RunnerOutput,
-        model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
+    vllm_output: RunnerOutput, model: str
+) -> tuple[list[int], str, Optional[SampleLogprobs]]:
    """Sanitize vllm output [qwen2 models] to be comparable with hf output."""
    output_ids, output_str, out_logprobs = vllm_output

@@ -75,8 +79,8 @@ def qwen2_vllm_to_hf_output(


 def kimiv_vl_vllm_to_hf_output(
-        vllm_output: RunnerOutput,
-        model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
+    vllm_output: RunnerOutput, model: str
+) -> tuple[list[int], str, Optional[SampleLogprobs]]:
    """Sanitize vllm output [kimi_vl models] to be comparable with hf output."""
    output_ids, output_str, out_logprobs = vllm_output

@@ -85,23 +89,25 @@ def kimiv_vl_vllm_to_hf_output(
    return output_ids, hf_output_str, out_logprobs


-def llava_image_vllm_to_hf_output(vllm_output: RunnerOutput,
-                                  model: str) -> RunnerOutput:
+def llava_image_vllm_to_hf_output(
+    vllm_output: RunnerOutput, model: str
+) -> RunnerOutput:
    config = AutoConfig.from_pretrained(model)
    mm_token_id = config.image_token_index
    return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)


 def llava_video_vllm_to_hf_output(
-        vllm_output: RunnerOutput,
-        model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
+    vllm_output: RunnerOutput, model: str
+) -> tuple[list[int], str, Optional[SampleLogprobs]]:
    config = AutoConfig.from_pretrained(model)
    mm_token_id = config.video_token_index
    return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)


-def _llava_vllm_to_hf_output(vllm_output: RunnerOutput, model: str,
-                             mm_token_id: int) -> RunnerOutput:
+def _llava_vllm_to_hf_output(
+    vllm_output: RunnerOutput, model: str, mm_token_id: int
+) -> RunnerOutput:
    """Sanitize vllm output [Llava models] to be comparable with hf output."""
    output_ids, output_str, out_logprobs = vllm_output

@@ -109,7 +115,8 @@ def _llava_vllm_to_hf_output(vllm_output: RunnerOutput, model: str,
    eos_token_id = tokenizer.eos_token_id

    hf_output_ids = [
-        token_id for idx, token_id in enumerate(output_ids)
+        token_id
+        for idx, token_id in enumerate(output_ids)
        if token_id != mm_token_id or output_ids[idx - 1] != mm_token_id
    ]

@@ -128,8 +135,9 @@ def llava_onevision_hf_model_kwargs(model: str) -> dict:
    return config.to_dict()


-def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
-                                      model: str) -> RunnerOutput:
+def llava_onevision_vllm_to_hf_output(
+    vllm_output: RunnerOutput, model: str
+) -> RunnerOutput:
    """Sanitize vllm output [llava-onevision] to compare with hf output."""
    output_ids, output_str, out_logprobs = vllm_output

@@ -140,7 +148,8 @@ def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
    eos_token_id = tokenizer.eos_token_id

    hf_output_ids = [
-        token_id for idx, token_id in enumerate(output_ids)
+        token_id
+        for idx, token_id in enumerate(output_ids)
        if token_id != video_token_id or output_ids[idx - 1] != video_token_id
    ]

@@ -151,8 +160,7 @@ def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
    return hf_output_ids, hf_output_str, out_logprobs


-def mantis_vllm_to_hf_output(vllm_output: RunnerOutput,
-                             model: str) -> RunnerOutput:
+def mantis_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
    """Sanitize vllm output [mantis] to compare with hf output."""
    output_ids, output_str, out_logprobs = vllm_output

@@ -161,8 +169,7 @@ def mantis_vllm_to_hf_output(vllm_output: RunnerOutput,
    return output_ids, hf_output_str, out_logprobs


-def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput,
-                            model: str) -> RunnerOutput:
+def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
    """Sanitize vllm output [phi3v] to be comparable with hf output."""
    _, output_str, out_logprobs = vllm_output

@@ -180,8 +187,7 @@ def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput,
    return hf_output_ids, hf_output_str, out_logprobs


-def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
-                                model: str) -> RunnerOutput:
+def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
    """Sanitize vllm output to be comparable with hf output."""
    output_ids, output_str, out_logprobs = vllm_output

@@ -192,7 +198,8 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
    eos_token_id = tokenizer.eos_token_id

    hf_output_ids = [
-        token_id for idx, token_id in enumerate(output_ids)
+        token_id
+        for idx, token_id in enumerate(output_ids)
        if token_id != image_token_id or output_ids[idx - 1] != image_token_id
    ]

@@ -205,46 +212,40 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,


 ####### Post-processors for HF outputs
-def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput,
-                                model: str) -> RunnerOutput:
+def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
    output_ids, output_str, out_logprobs = hf_output
    if output_str.endswith("<｜end▁of▁sentence｜>"):
        output_str = output_str.split("<｜end▁of▁sentence｜>")[0]
    return output_ids, output_str, out_logprobs


-def idefics3_trunc_hf_output(hf_output: RunnerOutput,
-                             model: str) -> RunnerOutput:
+def idefics3_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
    output_ids, output_str, out_logprobs = hf_output
    if output_str.endswith("<end_of_utterance>"):
        output_str = output_str.split("<end_of_utterance>")[0]
    return output_ids, output_str, out_logprobs


-def smolvlm_trunc_hf_output(hf_output: RunnerOutput,
-                            model: str) -> RunnerOutput:
+def smolvlm_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
    # Based on Idefics3
    return idefics3_trunc_hf_output(hf_output, model)


-def minicpmv_trunc_hf_output(hf_output: RunnerOutput,
-                             model: str) -> RunnerOutput:
+def minicpmv_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
    output_ids, output_str, out_logprobs = hf_output
    if output_str.endswith("<|eot_id|>"):
        output_str = output_str.split("<|eot_id|>")[0]
    return output_ids, output_str, out_logprobs


-def minimax_vl_01_hf_output(hf_output: RunnerOutput,
-                            model: str) -> RunnerOutput:
+def minimax_vl_01_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
    output_ids, output_str, out_logprobs = hf_output
    if output_str.endswith("<end_of_sentence>"):
        output_str = output_str.split("<end_of_sentence>")[0]
    return output_ids, output_str, out_logprobs


-def ultravox_trunc_hf_output(hf_output: RunnerOutput,
-                             model: str) -> RunnerOutput:
+def ultravox_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
    output_ids, output_str, out_logprobs = hf_output

    tokenizer = AutoTokenizer.from_pretrained(model)
@@ -262,8 +263,8 @@ def get_llava_embeddings(image_assets: ImageTestAssets):

 ####### Prompt path encoders for models that need models on disk
 def qwen_prompt_path_encoder(
-        tmp_path: PosixPath, prompt: str,
-        assets: Union[list[ImageAsset], ImageTestAssets]) -> str:
+    tmp_path: PosixPath, prompt: str, assets: Union[list[ImageAsset], ImageTestAssets]
+) -> str:
    """Given a temporary dir path, export one or more image assets into the
    tempdir & replace its contents with the local path to the string so that
    the HF version of Qwen-VL can resolve the path and load the image in its
@@ -313,8 +314,9 @@ def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
        return BatchFeature(data=inputs, tensor_type="pt")

    hf_model.processor = processor
-    hf_model.model.get_output_embeddings = lambda: \
-        hf_model.model.language.model.embed_tokens
+    hf_model.model.get_output_embeddings = (
+        lambda: hf_model.model.language.model.embed_tokens
+    )
    return hf_model


@@ -357,11 +359,10 @@ def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
        assert len(contents) == len(images)

        return hf_processor.apply_chat_template(
-            [{
-                "role": "user",
-                "image": image,
-                "content": content
-            } for image, content in zip(images, contents)],
+            [
+                {"role": "user", "image": image, "content": content}
+                for image, content in zip(images, contents)
+            ],
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
@@ -369,8 +370,9 @@ def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
        )

    hf_model.processor = processor
-    hf_model.model.get_output_embeddings = lambda: \
-        hf_model.model.transformer.output_layer
+    hf_model.model.get_output_embeddings = (
+        lambda: hf_model.model.transformer.output_layer
+    )
    return hf_model


@@ -387,10 +389,9 @@ def glm4_1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
        else:
            video_metadata = None

-        return hf_processor(*args,
-                            videos=videos,
-                            video_metadata=video_metadata,
-                            **kwargs)
+        return hf_processor(
+            *args, videos=videos, video_metadata=video_metadata, **kwargs
+        )

    hf_model.processor = processor
    return hf_model
@@ -406,8 +407,9 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
            self.num_image_token = hf_runner.model.num_image_token
            self.tokenizer = hf_runner.tokenizer

-            self.config = AutoConfig.from_pretrained(hf_runner.model_name,
-                                                     trust_remote_code=True)
+            self.config = AutoConfig.from_pretrained(
+                hf_runner.model_name, trust_remote_code=True
+            )
            self.vision_config = self.config.vision_config
            self.use_thumbnail = self.config.use_thumbnail
            self.use_msac = self.config.use_msac
@@ -415,11 +417,14 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
            self.max_num = self.config.max_dynamic_patch
            self.image_size = self.vision_config.image_size

-        def __call__(self, text: str, images: Union[Image, list[Image]],
-                     **kwargs):
+        def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs):
            # yapf: disable
            from vllm.model_executor.models.h2ovl import (
-                IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values_h2ovl)
+                IMG_CONTEXT,
+                IMG_END,
+                IMG_START,
+                image_to_pixel_values_h2ovl,
+            )

            # yapf: enable
            images = [images] if isinstance(images, Image) else images
@@ -431,29 +436,26 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
                    max_num=self.max_num,
                    use_thumbnail=self.use_thumbnail,
                    use_msac=self.use_msac,
-                ) for image in images
-            ]
-            num_patches_list = [
-                pixel_value.shape[0] for pixel_value in pixel_values
+                )
+                for image in images
            ]
+            num_patches_list = [pixel_value.shape[0] for pixel_value in pixel_values]
            pixel_values = torch.cat(pixel_values, dim=0)
            for num_patches in num_patches_list:
-                context_tokens = IMG_CONTEXT * self.num_image_token \
-                    * num_patches
+                context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
                image_tokens = IMG_START + context_tokens + IMG_END
-                text = text.replace('<image>', image_tokens, 1)
+                text = text.replace("<image>", image_tokens, 1)
            prompt = self.tokenizer(text, return_tensors="pt")
            prompt.update({"pixel_values": pixel_values})
            return prompt

-    img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
-        "<IMG_CONTEXT>")
+    img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
    hf_model.model.img_context_token_id = img_context_token_id
    hf_model.processor = H2OVLProcessor(hf_model)
-    hf_model.model.get_output_embeddings = lambda: \
-        hf_model.model.language_model.get_output_embeddings()
-    hf_model.model.generate = types.MethodType(_internvl_generate,
-                                               hf_model.model)
+    hf_model.model.get_output_embeddings = (
+        lambda: hf_model.model.language_model.get_output_embeddings()
+    )
+    hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
    return hf_model


@@ -467,19 +469,23 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
            self.num_image_token = hf_runner.model.num_image_token
            self.tokenizer = hf_runner.tokenizer

-            self.config = AutoConfig.from_pretrained(hf_runner.model_name,
-                                                     trust_remote_code=True)
+            self.config = AutoConfig.from_pretrained(
+                hf_runner.model_name, trust_remote_code=True
+            )
            self.vision_config = self.config.vision_config
            self.use_thumbnail = self.config.use_thumbnail
            self.min_num = self.config.min_dynamic_patch
            self.max_num = self.config.max_dynamic_patch
            self.image_size = self.vision_config.image_size

-        def __call__(self, text: str, images: Union[Image, list[Image]],
-                     **kwargs):
+        def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs):
            from vllm.model_executor.models.skyworkr1v import (
-                IMG_CONTEXT, IMG_END, IMG_START,
-                image_to_pixel_values_skyworkr1v)
+                IMG_CONTEXT,
+                IMG_END,
+                IMG_START,
+                image_to_pixel_values_skyworkr1v,
+            )
+
            images = [images] if isinstance(images, Image) else images
            pixel_values = [
                image_to_pixel_values_skyworkr1v(
@@ -488,29 +494,26 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
                    min_num=self.min_num,
                    max_num=self.max_num,
                    use_thumbnail=self.use_thumbnail,
-                ) for image in images
-            ]
-            num_patches_list = [
-                pixel_value.shape[0] for pixel_value in pixel_values
+                )
+                for image in images
            ]
+            num_patches_list = [pixel_value.shape[0] for pixel_value in pixel_values]
            pixel_values = torch.cat(pixel_values, dim=0)
            for num_patches in num_patches_list:
-                context_tokens = IMG_CONTEXT * self.num_image_token \
-                    * num_patches
+                context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
                image_tokens = IMG_START + context_tokens + IMG_END
-                text = text.replace('<image>', image_tokens, 1)
+                text = text.replace("<image>", image_tokens, 1)
            prompt = self.tokenizer(text, return_tensors="pt")
            prompt.update({"pixel_values": pixel_values})
            return prompt

-    img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
-        "<IMG_CONTEXT>")
+    img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
    hf_model.model.img_context_token_id = img_context_token_id
    hf_model.processor = SkyworkR1VProcessor(hf_model)
-    hf_model.model.get_output_embeddings = lambda: \
-        hf_model.model.language_model.get_output_embeddings()
-    hf_model.model.generate = types.MethodType(_internvl_generate,
-                                               hf_model.model)
+    hf_model.model.get_output_embeddings = (
+        lambda: hf_model.model.language_model.get_output_embeddings()
+    )
+    hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
    return hf_model


@@ -524,8 +527,9 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
            self.num_image_token = hf_runner.model.num_image_token
            self.tokenizer = hf_runner.tokenizer

-            self.config = AutoConfig.from_pretrained(hf_runner.model_name,
-                                                     trust_remote_code=True)
+            self.config = AutoConfig.from_pretrained(
+                hf_runner.model_name, trust_remote_code=True
+            )
            self.vision_config = self.config.vision_config
            self.use_thumbnail = self.config.use_thumbnail
            self.min_num = self.config.min_dynamic_patch
@@ -540,8 +544,13 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
            **kwargs,
        ):
            from vllm.model_executor.models.internvl import (
-                IMG_CONTEXT, IMG_END, IMG_START,
-                image_to_pixel_values_internvl, video_to_pixel_values_internvl)
+                IMG_CONTEXT,
+                IMG_END,
+                IMG_START,
+                image_to_pixel_values_internvl,
+                video_to_pixel_values_internvl,
+            )
+
            images = [images] if isinstance(images, Image) else images
            videos = [videos] if isinstance(videos, np.ndarray) else videos
            if images is not None:
@@ -552,7 +561,8 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
                        min_num=self.min_num,
                        max_num=self.max_num,
                        use_thumbnail=self.use_thumbnail,
-                    ) for image in images
+                    )
+                    for image in images
                ]
                num_patches_images = [
                    pixel_value.shape[0] for pixel_value in pixel_values_images
@@ -568,7 +578,8 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
                        min_num=1,
                        max_num=1,
                        use_thumbnail=False,
-                    ) for video in videos
+                    )
+                    for video in videos
                ]
                num_patches_videos = [
                    pixel_value.shape[0] for pixel_value in pixel_values_videos
@@ -580,38 +591,37 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
            while ("<image>" in text) or ("<video>" in text):
                image_index = text.find("<image>")
                video_index = text.find("<video>")
-                if image_index == -1 or (video_index > -1
-                                         and video_index < image_index):
+                if image_index == -1 or (
+                    video_index > -1 and video_index < image_index
+                ):
                    num_patches = num_patches_videos.pop(0)
                    pixel_values.append(pixel_values_videos.pop(0))
-                    context_tokens = IMG_START + \
-                        IMG_CONTEXT * self.num_image_token + IMG_END
-                    video_tokens = ''.join([
-                        f'Frame{i+1}: {context_tokens}'
-                        for i in range(num_patches)
-                    ])
-                    text = text.replace('<video>', video_tokens, 1)
+                    context_tokens = (
+                        IMG_START + IMG_CONTEXT * self.num_image_token + IMG_END
+                    )
+                    video_tokens = "".join(
+                        [f"Frame{i + 1}: {context_tokens}" for i in range(num_patches)]
+                    )
+                    text = text.replace("<video>", video_tokens, 1)
                else:
                    num_patches = num_patches_images.pop(0)
                    pixel_values.append(pixel_values_images.pop(0))
-                    context_tokens = IMG_CONTEXT * self.num_image_token \
-                        * num_patches
+                    context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
                    image_tokens = IMG_START + context_tokens + IMG_END
-                    text = text.replace('<image>', image_tokens, 1)
+                    text = text.replace("<image>", image_tokens, 1)
            pixel_values = torch.cat(pixel_values, dim=0)

            prompt = self.tokenizer(text, return_tensors="pt")
            prompt.update({"pixel_values": pixel_values})
            return prompt

-    img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
-        "<IMG_CONTEXT>")
+    img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
    hf_model.model.img_context_token_id = img_context_token_id
    hf_model.processor = InternVLProcessor(hf_model)
-    hf_model.model.get_output_embeddings = lambda: \
-        hf_model.model.language_model.get_output_embeddings()
-    hf_model.model.generate = types.MethodType(_internvl_generate,
-                                               hf_model.model)
+    hf_model.model.get_output_embeddings = (
+        lambda: hf_model.model.language_model.get_output_embeddings()
+    )
+    hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
    return hf_model


@@ -631,7 +641,7 @@ def _internvl_generate(
    input_embeds = input_embeds.reshape(B * N, C)

    input_ids = input_ids.reshape(B * N)
-    selected = (input_ids == self.img_context_token_id)
+    selected = input_ids == self.img_context_token_id
    assert selected.sum() != 0
    input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)

@@ -778,8 +788,9 @@ def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:

 def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patches and returns an instance of the HfRunner to use for Ovis2."""
-    hf_model.model.get_output_embeddings = lambda: \
-        hf_model.model.llm.get_output_embeddings()
+    hf_model.model.get_output_embeddings = (
+        lambda: hf_model.model.llm.get_output_embeddings()
+    )

    def processor(*args, text="", images=None, **kwargs):
        text_tokenizer = hf_model.model.get_text_tokenizer()
@@ -787,8 +798,7 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:

        prompt_start_and_end = {
            "qwen2": ("<|im_start|>user\n", "<|im_end|>\n"),
-            "llama":
-            ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
+            "llama": ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
            "gemma2": ("<start_of_turn>user\n", "<end_of_turn>\n"),
        }
        for start, end in prompt_start_and_end.values():
@@ -797,7 +807,8 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
                break

        prompt, input_ids, pixel_values = hf_model.model.preprocess_inputs(
-            text_or_conversations=text, images=images)
+            text_or_conversations=text, images=images
+        )
        attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id)

        inputs = {
@@ -813,8 +824,9 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:

 def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patches and returns an instance of the HfRunner to use for Ovis2."""
-    hf_model.model.get_output_embeddings = lambda: \
-        hf_model.model.llm.get_output_embeddings()
+    hf_model.model.get_output_embeddings = (
+        lambda: hf_model.model.llm.get_output_embeddings()
+    )

    def processor(*args, text="", images=None, videos=None, **kwargs):
        if images is None:
@@ -825,13 +837,11 @@ def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
            videos = []
        else:
            videos = [videos] if isinstance(videos, np.ndarray) else videos
-            videos = [[PIL.Image.fromarray(frame) for frame in vid]
-                      for vid in videos]
+            videos = [[PIL.Image.fromarray(frame) for frame in vid] for vid in videos]

        prompt_start_and_end = {
            "qwen2": ("<|im_start|>user\n", "<|im_end|>\n"),
-            "llama":
-            ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
+            "llama": ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
            "gemma2": ("<start_of_turn>user\n", "<end_of_turn>\n"),
        }
        for start, end in prompt_start_and_end.values():
@@ -842,21 +852,20 @@ def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
        images_message = [{"type": "image", "image": img} for img in images]
        videos_message = [{"type": "video", "video": vid} for vid in videos]

-        messages = [{
-            "role":
-            "user",
-            "content": [
-                *images_message,
-                *videos_message,
-                {
-                    "type": "text",
-                    "text": text
-                },
-            ],
-        }]
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    *images_message,
+                    *videos_message,
+                    {"type": "text", "text": text},
+                ],
+            }
+        ]

        input_ids, pixel_values, grid_thws = hf_model.model.preprocess_inputs(
-            messages=messages, enable_thinking=True)
+            messages=messages, enable_thinking=True
+        )
        inputs = {
            "inputs": input_ids,
            "pixel_values": pixel_values,