Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
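
The recurring pattern in every hunk below: yapf aligned wrapped arguments under the opening parenthesis, while ruff's formatter (which is Black-compatible) either keeps a call on one line when it fits the configured line length, or breaks each argument onto its own indented line with a trailing comma and a dedented closing parenthesis. A minimal sketch using a call site taken from the hunks below (the ruff configuration itself, presumably in pyproject.toml, is not part of this excerpt):

    # Before (yapf + isort): continuation lines aligned under the open paren
    feature_size = info.get_num_image_tokens(image_width=image_size.width,
                                             image_height=image_size.height)

    # After (ruff format, Black-style): arguments on their own line inside
    # dedented parentheses, or on one line when they fit
    feature_size = info.get_num_image_tokens(
        image_width=image_size.width, image_height=image_size.height
    )

The import hunks follow the same rule: ruff's isort-compatible `I` lint rules replace isort, collapsing short parenthesized imports onto one line and exploding long ones to one name per line with a trailing comma.
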
@@ -6,22 +6,27 @@ from typing import Optional, Union

 import numpy as np
 import pytest
-from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk,
-                                                       UserMessage)
+from mistral_common.protocol.instruct.messages import ImageChunk, TextChunk, UserMessage
 from mistral_common.protocol.instruct.request import ChatCompletionRequest
 from PIL import Image

 from vllm.config import ModelConfig
-from vllm.config.multimodal import (AudioDummyOptions, BaseDummyOptions,
-                                    ImageDummyOptions, VideoDummyOptions)
+from vllm.config.multimodal import (
+    AudioDummyOptions,
+    BaseDummyOptions,
+    ImageDummyOptions,
+    VideoDummyOptions,
+)
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
 from vllm.multimodal.cache import MultiModalProcessorOnlyCache
 from vllm.multimodal.inputs import MultiModalInputs
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        InputProcessingContext)
-from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
-                                               cached_tokenizer_from_config,
-                                               encode_tokens)
+from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
+from vllm.transformers_utils.tokenizer import (
+    AnyTokenizer,
+    MistralTokenizer,
+    cached_tokenizer_from_config,
+    encode_tokens,
+)

 from ....multimodal.utils import random_audio, random_image, random_video
 from ...registry import HF_EXAMPLE_MODELS
@@ -36,14 +41,17 @@ def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
     # GLM4.1V doesn't support multiple videos
     video = mm_data["video"]
     num_frames = len(video)
-    mm_data["video"] = (video, {
-        "total_num_frames": num_frames,
-        "fps": num_frames,
-        "duration": 1,
-        "frames_indices": [i for i in range(num_frames)],
-        "video_backend": "opencv",
-        "do_sample_frames": True,
-    })
+    mm_data["video"] = (
+        video,
+        {
+            "total_num_frames": num_frames,
+            "fps": num_frames,
+            "duration": 1,
+            "frames_indices": [i for i in range(num_frames)],
+            "video_backend": "opencv",
+            "do_sample_frames": True,
+        },
+    )
     return mm_data


@@ -102,7 +110,8 @@ def _test_processing_correctness(
         mm_processor_cache_gb=2048,
         skip_tokenizer_init=model_info.skip_tokenizer_init,
         enforce_eager=model_info.enforce_eager,
-        dtype=model_info.dtype)
+        dtype=model_info.dtype,
+    )

     model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
     factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
@@ -145,27 +154,22 @@ def _test_processing_correctness(
     input_to_hit = {
         "image": Image.new("RGB", size=(128, 128)),
         "video": np.zeros((4, 128, 128, 3), dtype=np.uint8),
-        "audio": (np.zeros((512, )), 16000),
+        "audio": (np.zeros((512,)), 16000),
     }
     input_factory = {
-        "image":
-        partial(random_image, rng, min_wh=128, max_wh=256),
-        "video":
-        partial(random_video,
-                rng,
-                min_frames=2,
-                max_frames=16,
-                min_wh=128,
-                max_wh=256),
-        "audio":
-        partial(random_audio, rng, min_len=512, max_len=1024, sr=16000),
+        "image": partial(random_image, rng, min_wh=128, max_wh=256),
+        "video": partial(
+            random_video, rng, min_frames=2, max_frames=16, min_wh=128, max_wh=256
+        ),
+        "audio": partial(random_audio, rng, min_len=512, max_len=1024, sr=16000),
     }

     for batch_idx in range(num_batches):
         mm_data = {
-            k:
-            [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
-             for _ in range(rng.randint(limit + 1))]
+            k: [
+                (input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
+                for _ in range(rng.randint(limit + 1))
+            ]
             for k, limit in limit_mm_per_prompt_ints.items()
         }

@@ -174,12 +178,16 @@ def _test_processing_correctness(
         # Mistral chat outputs tokens directly, rather than text prompts
         if isinstance(tokenizer, MistralTokenizer):
             images = mm_data.get("image", [])
-            request = ChatCompletionRequest(messages=[
-                UserMessage(content=[
-                    TextChunk(text=""),
-                    *(ImageChunk(image=image) for image in images),
-                ]),
-            ])
+            request = ChatCompletionRequest(
+                messages=[
+                    UserMessage(
+                        content=[
+                            TextChunk(text=""),
+                            *(ImageChunk(image=image) for image in images),
+                        ]
+                    ),
+                ]
+            )
             res = tokenizer.mistral.encode_chat_completion(request)
             prompt = res.tokens
         else:
@@ -303,16 +311,14 @@ def _test_processing_correctness_one(
         baseline_text_result,
         baseline_tokenized_result,
         ignore_mm_keys=ignore_mm_keys,
-        msg=f"Failed ({batch_idx=}, {text_prompt=}, "
-        f"{token_prompt=}, {mm_data=})",
+        msg=f"Failed ({batch_idx=}, {text_prompt=}, {token_prompt=}, {mm_data=})",
     )

     _assert_inputs_equal(
         cached_text_result,
         cached_tokenized_result,
         ignore_mm_keys=ignore_mm_keys,
-        msg=f"Failed ({batch_idx=}, {text_prompt=}, "
-        f"{token_prompt=}, {mm_data=})",
+        msg=f"Failed ({batch_idx=}, {text_prompt=}, {token_prompt=}, {mm_data=})",
     )


@@ -24,7 +24,8 @@ from ...utils import build_model_context
         # post-sampled frames (expected behavior)
         (-1, 1, 5),
         (-1, 2, 10),
-    ])
+    ],
+)
 def test_processor_override(
     model_id: str,
     expected_toks_per_frame: int,
@@ -55,10 +56,8 @@ def test_processor_override(
     # Ensure we have the right number of placeholders per num_crops size
     hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
     video_token_id = tokenizer.convert_tokens_to_ids(hf_processor.video_token)
-    video_tok_count = processed_inputs["prompt_token_ids"].count(
-        video_token_id)
-    grid_t, _, _ = processed_inputs["mm_kwargs"].get_data(
-    )["video_grid_thw"][0]
+    video_tok_count = processed_inputs["prompt_token_ids"].count(video_token_id)
+    grid_t, _, _ = processed_inputs["mm_kwargs"].get_data()["video_grid_thw"][0]

     assert grid_t == expected_grid_t
     assert video_tok_count == expected_toks_per_frame * grid_t
@@ -71,7 +70,7 @@ def test_video_loader_consistency(
     fps: int,
 ):
     """
-    Ensure dynamic video loader (pre-sampled by loader) and normal video
+    Ensure dynamic video loader (pre-sampled by loader) and normal video
     loader (post-sampled by processor) produce same video processing outputs.
     """
     ctx = build_model_context(
@@ -91,7 +90,8 @@ def test_video_loader_consistency(

     static_video, static_metadata = OpenCVVideoBackend.load_bytes(video_bytes)
     dynamic_video, dynamic_metadata = OpenCVDynamicVideoBackend.load_bytes(
-        video_bytes, fps=fps)
+        video_bytes, fps=fps
+    )

     # pre-sampled loader shouldn't read all frames
     assert len(dynamic_video) < len(static_video)
@@ -99,12 +99,11 @@ def test_video_loader_consistency(
     static_mm_data = {"video": [(static_video, static_metadata)]}
     dynamic_mm_data = {"video": [(dynamic_video, dynamic_metadata)]}

-    static_outputs = processor.apply(prompt, static_mm_data,
-                                     hf_processor_mm_kwargs)
-    dynamic_outputs = processor.apply(prompt, dynamic_mm_data,
-                                      hf_processor_mm_kwargs)
+    static_outputs = processor.apply(prompt, static_mm_data, hf_processor_mm_kwargs)
+    dynamic_outputs = processor.apply(prompt, dynamic_mm_data, hf_processor_mm_kwargs)

-    assert static_outputs["prompt_token_ids"] == dynamic_outputs[
-        "prompt_token_ids"]
-    assert static_outputs["mm_kwargs"].get_data(
-    ) == dynamic_outputs["mm_kwargs"].get_data()
+    assert static_outputs["prompt_token_ids"] == dynamic_outputs["prompt_token_ids"]
+    assert (
+        static_outputs["mm_kwargs"].get_data()
+        == dynamic_outputs["mm_kwargs"].get_data()
+    )

@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for H2OVL's multimodal preprocessing kwargs."""
+
 from collections.abc import Mapping
 from typing import Optional

@@ -23,8 +24,10 @@ def _get_expected_num_patches(
     min_num: int,
     max_num: int,
 ):
-    from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets,
-                                                  get_h2ovl_target_ratios)
+    from vllm.model_executor.models.h2ovl import (
+        calculate_h2ovl_targets,
+        get_h2ovl_target_ratios,
+    )

     width, height = image.size

@@ -101,24 +104,27 @@ def _run_check(

     total_expected_num_patches = sum(
         _get_expected_num_patches(config, image, len(images), min_num, max_num)
-        for image in images)
+        for image in images
+    )

     processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)

     # Ensure we have the right number of placeholders per num_crops size
     image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
     img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
-    pixel_shape = processed_inputs["mm_kwargs"].get_data(
-    )["pixel_values_flat"].shape
+    pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values_flat"].shape

     assert img_tok_count == 256 * total_expected_num_patches
     assert pixel_shape[0] == total_expected_num_patches


-@pytest.mark.parametrize("model_id", [
-    "h2oai/h2ovl-mississippi-800m",
-    "h2oai/h2ovl-mississippi-2b",
-])
+@pytest.mark.parametrize(
+    "model_id",
+    [
+        "h2oai/h2ovl-mississippi-800m",
+        "h2oai/h2ovl-mississippi-2b",
+    ],
+)
 @pytest.mark.parametrize(
     "size_factors",
     [
@@ -165,10 +171,7 @@ def test_processor_override(

     _run_check(
         processor,
-        [
-            rescale_image_size(image_assets[0].pil_image, f)
-            for f in size_factors
-        ],
+        [rescale_image_size(image_assets[0].pil_image, f) for f in size_factors],
         min_num,
         max_num,
         hf_processor_mm_kwargs,
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for Idefics3's multimodal preprocessing kwargs."""
+
 import pytest
 from transformers import Idefics3Config

@@ -17,7 +18,8 @@ from ...utils import build_model_context
     [
         ({"size": {"longest_edge": 364}}, 169),
         ({"size": {"longest_edge": 728}}, 169 * (2**2 + 1)),
-    ])
+    ],
+)
 # yapf: enable
 @pytest.mark.parametrize("num_imgs", [1, 2])
 @pytest.mark.parametrize("kwargs_on_init", [True, False])
@@ -42,8 +44,11 @@ def test_processor_override(
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

     # Build the image str / prompt based on the number of images we pass
-    placeholders = "<image>" if num_imgs == 1 else "\n".join(
-        f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
+    placeholders = (
+        "<image>"
+        if num_imgs == 1
+        else "\n".join(f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
+    )
     prompt = f"<|begin_of_text|>User:{placeholders}\n<end_of_utterance>\nAssistant:"  # noqa: E501

     # Build mm_data
@@ -57,8 +62,7 @@ def test_processor_override(
     # Ensure the placeholders format are correct
     hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
     hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
-    assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[
-        "input_ids"][0]
+    assert processed_inputs["prompt_token_ids"] == hf_processed_inputs["input_ids"][0]

     # Ensure we have the right number of placeholders per num_crops size
     image_token_id = ctx.get_hf_config().image_token_id
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for InternVL's multimodal preprocessing kwargs."""
+
 from collections.abc import Mapping
 from typing import Optional

@@ -24,7 +25,9 @@ def _get_expected_num_patches(
     max_num: int,
 ):
     from vllm.model_executor.models.internvl import (
-        calculate_internvl_targets, get_internvl_target_ratios)
+        calculate_internvl_targets,
+        get_internvl_target_ratios,
+    )

     width, height = image.size

@@ -61,15 +64,15 @@ def _run_check(

     total_expected_num_patches = sum(
         _get_expected_num_patches(config, image, len(images), min_num, max_num)
-        for image in images)
+        for image in images
+    )

     processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)

     # Ensure we have the right number of placeholders per num_crops size
     image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
     img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
-    pixel_shape = processed_inputs["mm_kwargs"].get_data(
-    )["pixel_values_flat"].shape
+    pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values_flat"].shape

     assert img_tok_count == 256 * total_expected_num_patches
     assert pixel_shape[0] == total_expected_num_patches
@@ -122,10 +125,7 @@ def test_processor_override(

     _run_check(
         processor,
-        [
-            rescale_image_size(image_assets[0].pil_image, f)
-            for f in size_factors
-        ],
+        [rescale_image_size(image_assets[0].pil_image, f) for f in size_factors],
         min_num,
         max_num,
         hf_processor_mm_kwargs,
@@ -11,8 +11,7 @@ from ....conftest import ImageTestAssets
 from ...utils import build_model_context


-@pytest.mark.parametrize("model_id",
-                         ["meta-llama/Llama-4-Scout-17B-16E-Instruct"])
+@pytest.mark.parametrize("model_id", ["meta-llama/Llama-4-Scout-17B-16E-Instruct"])
 @pytest.mark.parametrize("mm_processor_kwargs", [{}])
 @pytest.mark.parametrize("num_imgs", [1, 5])
 @pytest.mark.parametrize("mm_processor_cache_gb", [0, 4])
@@ -38,13 +37,14 @@ def test_processor_override(
     hf_processor = processor.info.get_hf_processor()
     vocab = tokenizer.get_vocab()

-    prompt = "<|begin_of_text|><|header_start|>user<|header_end|>" \
-        + "<|image|>" * num_imgs \
-        + "<|eot|><|header_start|>assistant<|header_end|>"
+    prompt = (
+        "<|begin_of_text|><|header_start|>user<|header_end|>"
+        + "<|image|>" * num_imgs
+        + "<|eot|><|header_start|>assistant<|header_end|>"
+    )
     mm_data = {
         "image": [
-            image_assets[(i % len(image_assets))].pil_image
-            for i in range(num_imgs)
+            image_assets[(i % len(image_assets))].pil_image for i in range(num_imgs)
         ]
     }
     if tokenized_prompt:
@@ -64,22 +64,23 @@ def test_processor_override(
         if tiles_x * tiles_y > 1:
             num_x_separators += (tiles_x - 1) * tiles_y
             num_y_separators += tiles_y
-    assert prompt_token_ids.count(vocab[hf_processor.tile_token]) \
-        == num_x_separators
-    assert prompt_token_ids.count(vocab[hf_processor.tile_global_token]) \
-        == num_y_separators
+    assert prompt_token_ids.count(vocab[hf_processor.tile_token]) == num_x_separators
+    assert (
+        prompt_token_ids.count(vocab[hf_processor.tile_global_token])
+        == num_y_separators
+    )

     # image token offsets
     img_locs = processed_inputs["mm_placeholders"].get("image", [])
     assert len(img_locs) == num_imgs
-    assert [img_loc.offset for img_loc in img_locs] == \
-        [i for i, v in enumerate(prompt_token_ids) \
-         if v == config.boi_token_index]
+    assert [img_loc.offset for img_loc in img_locs] == [
+        i for i, v in enumerate(prompt_token_ids) if v == config.boi_token_index
+    ]

     # patch sizes and masks
-    num_patches_per_chunk = processor.info.get_patch_per_chunk(
-        config.vision_config)
-    assert prompt_token_ids.count(config.image_token_index) \
-        == sum(mm_data["patches_per_image"]) * num_patches_per_chunk
-    assert len(mm_data["pixel_values"]) \
-        == sum(mm_data["patches_per_image"])
+    num_patches_per_chunk = processor.info.get_patch_per_chunk(config.vision_config)
+    assert (
+        prompt_token_ids.count(config.image_token_index)
+        == sum(mm_data["patches_per_image"]) * num_patches_per_chunk
+    )
+    assert len(mm_data["pixel_values"]) == sum(mm_data["patches_per_image"])
@@ -22,8 +22,9 @@ def _validate_image_max_tokens_one(
     image_size: ImageSize,
 ) -> None:
     info = processor.info
-    feature_size = info.get_num_image_tokens(image_width=image_size.width,
-                                             image_height=image_size.height)
+    feature_size = info.get_num_image_tokens(
+        image_width=image_size.width, image_height=image_size.height
+    )

     try:
         assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
@@ -31,8 +32,9 @@ def _validate_image_max_tokens_one(
         failed_size_excs.append((image_size, exc))


-@pytest.mark.skip("This test takes around 5 minutes to run. "
-                  "Comment this out to run it manually.")
+@pytest.mark.skip(
+    "This test takes around 5 minutes to run. Comment this out to run it manually."
+)
 @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
 def test_processor_max_tokens(model_id):
     ctx = build_model_context(
@@ -66,9 +68,9 @@ def test_processor_max_tokens(model_id):
     pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")

     if failed_size_excs:
-        msg = "Found failing image sizes:" \
-            + "\n========\n".join(f"[{size}]\n{exc}"
-                                  for size, exc in failed_size_excs)
+        msg = "Found failing image sizes:" + "\n========\n".join(
+            f"[{size}]\n{exc}" for size, exc in failed_size_excs
+        )
         raise AssertionError(msg)


@@ -94,8 +96,10 @@ def _validate_image_prompt_replacements_one(

         # NOTE: There is a BOS token
         assert first_placeholder.offset == 1
-        assert first_placeholder.length == (
-            len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs
+        assert (
+            first_placeholder.length
+            == (len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs
+        )

     except Exception as exc:
         failed_size_excs.append((image_size, exc))
@@ -122,9 +126,9 @@ def _test_image_prompt_replacements(
     pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")

     if failed_size_excs:
-        msg = "Found failing image sizes:" \
-            + "\n========\n".join(f"[{size}]\n{exc}"
-                                  for size, exc in failed_size_excs)
+        msg = "Found failing image sizes:" + "\n========\n".join(
+            f"[{size}]\n{exc}" for size, exc in failed_size_excs
+        )
         raise AssertionError(msg)


@@ -138,11 +142,17 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
     )
     processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)

-    image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
-                    (488, 183), (2560, 1669)]
+    image_ratios = [
+        (171, 152),
+        (184, 161),
+        (198, 176),
+        (333, 296),
+        (369, 328),
+        (488, 183),
+        (2560, 1669),
+    ]
     image_sizes = [
-        size for w, h in image_ratios
-        for size in [ImageSize(w, h), ImageSize(h, w)]
+        size for w, h in image_ratios for size in [ImageSize(w, h), ImageSize(h, w)]
     ]

     _test_image_prompt_replacements(
@@ -152,8 +162,9 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
     )


-@pytest.mark.skip("This test takes around 2 hours to run. "
-                  "Comment this out to run it manually.")
+@pytest.mark.skip(
+    "This test takes around 2 hours to run. Comment this out to run it manually."
+)
 @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
 @pytest.mark.parametrize("num_imgs", [1])
 def test_processor_prompt_replacements_all(model_id, num_imgs):
@@ -22,8 +22,9 @@ def _validate_image_max_tokens_one(
     image_size: ImageSize,
 ) -> None:
     info = processor.info
-    feature_size = info.get_num_image_tokens(image_width=image_size.width,
-                                             image_height=image_size.height)
+    feature_size = info.get_num_image_tokens(
+        image_width=image_size.width, image_height=image_size.height
+    )

     try:
         assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
@@ -31,10 +32,10 @@ def _validate_image_max_tokens_one(
         failed_size_excs.append((image_size, exc))


-@pytest.mark.skip("This test takes around 5 minutes to run. "
-                  "Comment this out to run it manually.")
-@pytest.mark.parametrize("model_id",
-                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
+@pytest.mark.skip(
+    "This test takes around 5 minutes to run. Comment this out to run it manually."
+)
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
 def test_processor_max_tokens(model_id):
     ctx = build_model_context(
         model_id,
@@ -67,9 +68,9 @@ def test_processor_max_tokens(model_id):
     pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")

     if failed_size_excs:
-        msg = "Found failing image sizes:" \
-            + "\n========\n".join(f"[{size}]\n{exc}"
-                                  for size, exc in failed_size_excs)
+        msg = "Found failing image sizes:" + "\n========\n".join(
+            f"[{size}]\n{exc}" for size, exc in failed_size_excs
+        )
         raise AssertionError(msg)


@@ -94,8 +95,10 @@ def _validate_image_prompt_replacements_one(
         first_placeholder = image_placeholders[0]

         assert first_placeholder.offset == 0
-        assert first_placeholder.length == len(
-            processed_inputs["prompt_token_ids"]) // num_imgs
+        assert (
+            first_placeholder.length
+            == len(processed_inputs["prompt_token_ids"]) // num_imgs
+        )
     except Exception as exc:
         failed_size_excs.append((image_size, exc))

@@ -121,14 +124,13 @@ def _test_image_prompt_replacements(
     pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")

     if failed_size_excs:
-        msg = "Found failing image sizes:" \
-            + "\n========\n".join(f"[{size}]\n{exc}"
-                                  for size, exc in failed_size_excs)
+        msg = "Found failing image sizes:" + "\n========\n".join(
+            f"[{size}]\n{exc}" for size, exc in failed_size_excs
+        )
         raise AssertionError(msg)


-@pytest.mark.parametrize("model_id",
-                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
 @pytest.mark.parametrize("num_imgs", [1, 2])
 def test_processor_prompt_replacements_regression(model_id, num_imgs):
     ctx = build_model_context(
@@ -138,11 +140,17 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
     )
     processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)

-    image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
-                    (488, 183), (2560, 1669)]
+    image_ratios = [
+        (171, 152),
+        (184, 161),
+        (198, 176),
+        (333, 296),
+        (369, 328),
+        (488, 183),
+        (2560, 1669),
+    ]
     image_sizes = [
-        size for w, h in image_ratios
-        for size in [ImageSize(w, h), ImageSize(h, w)]
+        size for w, h in image_ratios for size in [ImageSize(w, h), ImageSize(h, w)]
     ]

     _test_image_prompt_replacements(
@@ -152,10 +160,10 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
     )


-@pytest.mark.skip("This test takes around 2 hours to run. "
-                  "Comment this out to run it manually.")
-@pytest.mark.parametrize("model_id",
-                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
+@pytest.mark.skip(
+    "This test takes around 2 hours to run. Comment this out to run it manually."
+)
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
 @pytest.mark.parametrize("num_imgs", [1])
 def test_processor_prompt_replacements_all(model_id, num_imgs):
     ctx = build_model_context(
@@ -61,17 +61,17 @@ def _test_image_prompt_replacements(
     num_imgs: int,
     image_sizes: list[ImageSize],
 ) -> None:
-
     failed_size_excs = list[tuple[ImageSize, Exception]]()

     for size in image_sizes:
-        _validate_image_prompt_replacements_one(processor, num_imgs,
-                                                failed_size_excs, size)
+        _validate_image_prompt_replacements_one(
+            processor, num_imgs, failed_size_excs, size
+        )

     if failed_size_excs:
-        msg = "Found failing image sizes:" \
-            + "\n========\n".join(f"[{size}]\n{exc}"
-                                  for size, exc in failed_size_excs)
+        msg = "Found failing image sizes:" + "\n========\n".join(
+            f"[{size}]\n{exc}" for size, exc in failed_size_excs
+        )
         raise AssertionError(msg)


@@ -85,11 +85,17 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
     )
     processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)

-    image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
-                    (488, 183), (2560, 1669)]
+    image_ratios = [
+        (171, 152),
+        (184, 161),
+        (198, 176),
+        (333, 296),
+        (369, 328),
+        (488, 183),
+        (2560, 1669),
+    ]
     image_sizes = [
-        size for w, h in image_ratios
-        for size in [ImageSize(w, h), ImageSize(h, w)]
+        size for w, h in image_ratios for size in [ImageSize(w, h), ImageSize(h, w)]
     ]

     _test_image_prompt_replacements(
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for mllama's multimodal preprocessing and profiling."""
+
 import pytest
 from torch import prod
 from transformers import Llama4Config
@@ -47,14 +48,17 @@ def test_profiling(model_id: str, max_model_len: int):
     image_size = hf_config.vision_config.image_size
     patch_size = hf_config.vision_config.patch_size
     downsample_ratio = int(
-        round(1.0 / (hf_config.vision_config.pixel_shuffle_ratio**2)))
-    tokens_per_patch = ((image_size // patch_size)**2) // downsample_ratio
+        round(1.0 / (hf_config.vision_config.pixel_shuffle_ratio**2))
+    )
+    tokens_per_patch = ((image_size // patch_size) ** 2) // downsample_ratio
     chunks_per_image = prod(mm_data["patches_per_image"])
     total_num_patches = chunks_per_image * tokens_per_patch
-    num_tiles = mm_data["aspect_ratios"][0][0] * mm_data["aspect_ratios"][0][
-        1]  # x-y separator tokens
-    total_tokens = total_num_patches.item() + num_tiles.item(
-    ) + 3  # image start, image, image end
+    num_tiles = (
+        mm_data["aspect_ratios"][0][0] * mm_data["aspect_ratios"][0][1]
+    )  # x-y separator tokens
+    total_tokens = (
+        total_num_patches.item() + num_tiles.item() + 3
+    )  # image start, image, image end

     profiled_tokens = profiler.get_mm_max_contiguous_tokens(
         max_model_len,
@@ -63,5 +67,6 @@ def test_profiling(model_id: str, max_model_len: int):

     assert total_tokens == profiled_tokens["image"]
     assert total_tokens == sum(
-        placeholder.length for placeholder in
-        decoder_dummy_data.multi_modal_placeholders["image"])
+        placeholder.length
+        for placeholder in decoder_dummy_data.multi_modal_placeholders["image"]
+    )

@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for Nemotron-Nano-VL's multimodal preprocessing kwargs."""
+
 from collections.abc import Mapping
 from typing import Optional

@@ -24,7 +25,9 @@ def _get_expected_num_patches(
     max_num: int,
 ):
     from vllm.model_executor.models.nemotron_vl import (
-        calculate_nemotron_vl_targets, get_nemotron_vl_target_ratios)
+        calculate_nemotron_vl_targets,
+        get_nemotron_vl_target_ratios,
+    )

     width, height = image.size

@@ -63,22 +66,21 @@ def _run_check(

     total_expected_num_patches = sum(
         _get_expected_num_patches(config, image, len(images), min_num, max_num)
-        for image in images)
+        for image in images
+    )
     print(total_expected_num_patches)
     processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)

     # Ensure we have the right number of placeholders per num_crops size
     image_token_id = tokenizer.convert_tokens_to_ids("<image>")
     img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
-    pixel_shape = processed_inputs["mm_kwargs"].get_data(
-    )["pixel_values_flat"].shape
+    pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values_flat"].shape
     print("Image token count:", img_tok_count, "Pixel shape:", pixel_shape)
     assert img_tok_count == 256 * total_expected_num_patches
     assert pixel_shape[0] == total_expected_num_patches


-@pytest.mark.parametrize("model_id",
-                         ["nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"])
+@pytest.mark.parametrize("model_id", ["nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"])
 @pytest.mark.parametrize(
     "size_factors",
     [
@@ -125,10 +127,7 @@ def test_processor_override(

     _run_check(
         processor,
-        [
-            rescale_image_size(image_assets[0].pil_image, f)
-            for f in size_factors
-        ],
+        [rescale_image_size(image_assets[0].pil_image, f) for f in size_factors],
         min_num,
         max_num,
         hf_processor_mm_kwargs,
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for phi3v's multimodal preprocessing kwargs."""
+
 import pytest

 from vllm.multimodal import MULTIMODAL_REGISTRY
@@ -18,7 +19,8 @@ from ...utils import build_model_context
         ({"num_crops": 16}, 1921),
         # the default num_crops of phi-3.5-vision is 4
         ({}, 757),
-    ])
+    ],
+)
 # yapf: enable
 @pytest.mark.parametrize("num_imgs", [1, 2])
 @pytest.mark.parametrize("kwargs_on_init", [True, False])

@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for phi4mm's multimodal preprocessing kwargs."""
+
 import pytest

 from vllm.multimodal import MULTIMODAL_REGISTRY
@@ -18,7 +19,8 @@ from ...utils import build_model_context
         ({"dynamic_hd": 16}, 4433),
         # the default num_crops of phi-4-multimodal is 36
         ({}, 9585),
-    ])
+    ],
+)
 # yapf: enable
 @pytest.mark.parametrize("num_imgs", [1, 2])
 @pytest.mark.parametrize("kwargs_on_init", [True, False])
@@ -46,8 +48,7 @@ def test_processor_override(
     img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
     prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"

-    image_size = ctx.get_hf_config(
-    ).embd_layer["image_embd_layer"]["crop_size"]
+    image_size = ctx.get_hf_config().embd_layer["image_embd_layer"]["crop_size"]
     dummy_image_size = (image_size * 7, image_size * 7)
     dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
     mm_data = {"image": [dummy_image] * num_imgs}
@@ -56,5 +57,6 @@ def test_processor_override(

     # Ensure we have the right number of placeholders per num_crops size
     img_tok_count = processed_inputs["prompt_token_ids"].count(
-        _IMAGE_PLACEHOLDER_TOKEN_ID)
+        _IMAGE_PLACEHOLDER_TOKEN_ID
+    )
     assert img_tok_count == expected_toks_per_img * num_imgs

@@ -12,10 +12,12 @@ from ...utils import build_model_context
 @pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])
 # yapf: disable
 @pytest.mark.parametrize(
-    ("mm_processor_kwargs", "expected_toks_per_img", "expected_pixels_shape"), [
+    ("mm_processor_kwargs", "expected_toks_per_img", "expected_pixels_shape"),
+    [
         ({}, 1426, (5704, 1176)),
         ({"min_pixels": 64**2, "max_pixels": 512**2}, 330, (1320, 1176)),
-    ])
+    ],
+)
 # yapf: enable
 @pytest.mark.parametrize("num_imgs", [1, 2])
 @pytest.mark.parametrize("kwargs_on_init", [True, False])
@@ -48,8 +50,7 @@ def test_processor_override(
     hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
     image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token)
     img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
-    pixel_shape = processed_inputs["mm_kwargs"].get_data(
-    )["pixel_values"].shape
+    pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values"].shape

     assert img_tok_count == expected_toks_per_img * num_imgs
     assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs

@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for smolvlm's multimodal preprocessing kwargs."""
+
 import pytest
 from transformers import SmolVLMConfig

@@ -17,7 +18,8 @@ from ...utils import build_model_context
     [
         ({"max_image_size": {"longest_edge": 384}}, 1377),
         ({"max_image_size": {"longest_edge": 768}}, 405),
-    ])
+    ],
+)
 # yapf: enable
 @pytest.mark.parametrize("num_imgs", [1, 2])
 @pytest.mark.parametrize("kwargs_on_init", [True, False])
@@ -42,8 +44,11 @@ def test_processor_override(
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

     # Build the image str / prompt based on the number of images we pass
-    placeholders = "<image>" if num_imgs == 1 else "\n".join(
-        f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
+    placeholders = (
+        "<image>"
+        if num_imgs == 1
+        else "\n".join(f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
+    )
     prompt = f"<|im_start|>User:{placeholders}\n<end_of_utterance>\nAssistant:"  # noqa: E501

     # Build mm_data
@@ -57,8 +62,7 @@ def test_processor_override(
     # Ensure the placeholders format are correct
     hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
     hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
-    assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[
-        "input_ids"][0]
+    assert processed_inputs["prompt_token_ids"] == hf_processed_inputs["input_ids"][0]

     # Ensure we have the right number of placeholders per num_crops size
     image_token_id = ctx.get_hf_config().image_token_id

@@ -9,23 +9,29 @@ from typing import Any, Union
 import numpy as np
 import pytest
 import torch.nn as nn
-from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk,
-                                                       UserMessage)
+from mistral_common.protocol.instruct.messages import ImageChunk, TextChunk, UserMessage
 from mistral_common.protocol.instruct.request import ChatCompletionRequest
 from PIL import Image

 from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
-from vllm.config.multimodal import (AudioDummyOptions, BaseDummyOptions,
-                                    ImageDummyOptions, VideoDummyOptions)
-from vllm.distributed import (cleanup_dist_env_and_memory,
-                              init_distributed_environment,
-                              initialize_model_parallel)
+from vllm.config.multimodal import (
+    AudioDummyOptions,
+    BaseDummyOptions,
+    ImageDummyOptions,
+    VideoDummyOptions,
+)
+from vllm.distributed import (
+    cleanup_dist_env_and_memory,
+    init_distributed_environment,
+    initialize_model_parallel,
+)
 from vllm.model_executor.model_loader.utils import set_default_torch_dtype
-from vllm.model_executor.models.interfaces import (SupportsMultiModal,
-                                                   supports_multimodal)
+from vllm.model_executor.models.interfaces import (
+    SupportsMultiModal,
+    supports_multimodal,
+)
 from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        InputProcessingContext)
+from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
 from vllm.multimodal.utils import group_mm_kwargs_by_modality
 from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
 from vllm.utils import is_list_of
@@ -48,13 +54,15 @@ REPO_ID_TO_SKIP = {
 }

 ImageInput = list[Image.Image]
-VideoInput = Union[list[Image.Image], list[np.ndarray],
-                   list[tuple[np.ndarray, dict[str, Any]]]]
+VideoInput = Union[
+    list[Image.Image], list[np.ndarray], list[tuple[np.ndarray, dict[str, Any]]]
+]
 AudioInput = list[tuple[np.ndarray, int]]


-def _resize_data(_data: Union[Image.Image, np.ndarray],
-                 size_factor: float) -> Union[Image.Image, np.ndarray]:
+def _resize_data(
+    _data: Union[Image.Image, np.ndarray], size_factor: float
+) -> Union[Image.Image, np.ndarray]:
     assert size_factor <= 1, "Size factor must be less than 1"
     # Image input
     if isinstance(_data, Image.Image):
@@ -74,20 +82,18 @@ def _resize_data(
         return _data[..., :T, :H, :W, :C]
     # Audio input
     elif isinstance(_data, np.ndarray) and _data.ndim == 1:
-        return _data[:int(len(_data) * size_factor)]
+        return _data[: int(len(_data) * size_factor)]
     raise AssertionError("This line should be unreachable.")


 def resize_mm_data(
-        data: Union[ImageInput, VideoInput, AudioInput],
-        size_factors: tuple[float,
-                            ...]) -> Union[ImageInput, VideoInput, AudioInput]:
-    size_factors = size_factors[:len(data)]
+    data: Union[ImageInput, VideoInput, AudioInput], size_factors: tuple[float, ...]
+) -> Union[ImageInput, VideoInput, AudioInput]:
+    size_factors = size_factors[: len(data)]
     if is_list_of(data, (Image.Image, np.ndarray, list)):
         return [_resize_data(d, s) for d, s in zip(data, size_factors)]
     elif is_list_of(data, tuple):
-        return [(_resize_data(d, s), meta)
-                for (d, meta), s in zip(data, size_factors)]
+        return [(_resize_data(d, s), meta) for (d, meta), s in zip(data, size_factors)]
     raise ValueError("Unsupported multimodal data type.")


@@ -116,12 +122,16 @@ def create_batched_mm_kwargs(
     # Mistral chat outputs tokens directly, rather than text prompts
     if model_config.tokenizer_mode == "mistral":
         images = resized_mm_data.get("image", [])
-        request = ChatCompletionRequest(messages=[
-            UserMessage(content=[
-                TextChunk(text=""),
-                *(ImageChunk(image=image) for image in images),
-            ]),
-        ])
+        request = ChatCompletionRequest(
+            messages=[
+                UserMessage(
+                    content=[
+                        TextChunk(text=""),
+                        *(ImageChunk(image=image) for image in images),
+                    ]
+                ),
+            ]
+        )
         tokenizer = processing_info.get_tokenizer()
         res = tokenizer.mistral.encode_chat_completion(request)
         prompt = res.tokens
@@ -133,10 +143,7 @@ def create_batched_mm_kwargs(
         hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
         tokenization_kwargs=processor_inputs.tokenization_kwargs,
     )["mm_kwargs"].require_data()
-    items = [
-        item for modality in supported_mm_limits
-        for item in mm_kwargs[modality]
-    ]
+    items = [item for modality in supported_mm_limits for item in mm_kwargs[modality]]
     return group_mm_kwargs_by_modality(
         items,
         merge_by_field_config=model_cls.merge_by_field_config,
@@ -167,15 +174,17 @@ def initialize_dummy_model(
     cleanup_dist_env_and_memory()


-def get_model_id_to_test(
-        model_arch_list: Iterable[str]) -> list[tuple[str, str]]:
+def get_model_id_to_test(model_arch_list: Iterable[str]) -> list[tuple[str, str]]:
     filtered_results = []
     for model_arch in model_arch_list:
         model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
         if model_info.extras and model_arch in ARCH_NEEDS_EXTRAS:
             available_repos = list(
-                map(lambda model_id: (model_arch, model_id),
-                    [model_info.default, *model_info.extras.values()]))
+                map(
+                    lambda model_id: (model_arch, model_id),
+                    [model_info.default, *model_info.extras.values()],
+                )
+            )
             filtered_results.extend(available_repos)
         else:
             filtered_results.append((model_arch, model_info.default))
@@ -183,8 +192,8 @@ def get_model_id_to_test(


 @pytest.mark.parametrize(
-    "model_arch, model_id",
-    get_model_id_to_test(_MULTIMODAL_EXAMPLE_MODELS.keys()))
+    "model_arch, model_id", get_model_id_to_test(_MULTIMODAL_EXAMPLE_MODELS.keys())
+)
 def test_model_tensor_schema(model_arch: str, model_id: str):
     if model_arch in ARCH_TO_SKIP:
         pytest.skip(f"Skipping {model_arch} due to {ARCH_TO_SKIP[model_arch]}")
@@ -193,12 +202,13 @@ def test_model_tensor_schema(model_arch: str, model_id: str):

     model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
     model_info.check_available_online(on_fail="skip")
-    model_info.check_transformers_version(on_fail="skip",
-                                          check_max_version=False)
+    model_info.check_transformers_version(on_fail="skip", check_max_version=False)

-    hf_overrides_fn = partial(dummy_hf_overrides,
-                              model_arch=model_arch,
-                              exist_overrides=model_info.hf_overrides)
+    hf_overrides_fn = partial(
+        dummy_hf_overrides,
+        model_arch=model_arch,
+        exist_overrides=model_info.hf_overrides,
+    )

     model_config = ModelConfig(
         model_id,
@@ -256,8 +266,11 @@ def test_model_tensor_schema(model_arch: str, model_id: str):

     with initialize_dummy_model(model_cls, model_config) as model:
         for modality, _, mm_kwargs in create_batched_mm_kwargs(
-                model_cls, model_config, processor):
+            model_cls, model_config, processor
+        ):
             for method_name in inputs_parse_methods:
-                print(f"Testing `{method_name}` with modality={modality} "
-                      f"and mm_kwargs{list(mm_kwargs.keys())}")
+                print(
+                    f"Testing `{method_name}` with modality={modality} "
+                    f"and mm_kwargs{list(mm_kwargs.keys())}"
+                )
                 getattr(model, method_name)(modality=modality, **mm_kwargs)