Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-05 15:06:22 +01:00
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -3,27 +3,40 @@
 """Common tests for testing .generate() functionality for single / multiple
 image, embedding, and video support for different VLMs in vLLM.
 """
+
 import math
 import os
 from collections import defaultdict
 from pathlib import PosixPath

 import pytest
-from transformers import (AutoModel, AutoModelForImageTextToText,
-                          AutoModelForTextToWaveform)
+from transformers import (
+    AutoModel,
+    AutoModelForImageTextToText,
+    AutoModelForTextToWaveform,
+)

 from vllm.platforms import current_platform
 from vllm.utils import identity

-from ....conftest import (IMAGE_ASSETS, AudioTestAssets, HfRunner,
-                          ImageTestAssets, VideoTestAssets, VllmRunner)
-from ....utils import (create_new_process_for_each_test, large_gpu_mark,
-                       multi_gpu_marks)
+from ....conftest import (
+    IMAGE_ASSETS,
+    AudioTestAssets,
+    HfRunner,
+    ImageTestAssets,
+    VideoTestAssets,
+    VllmRunner,
+)
+from ....utils import create_new_process_for_each_test, large_gpu_mark, multi_gpu_marks
 from ...utils import check_outputs_equal
 from .vlm_utils import custom_inputs, model_utils, runners
 from .vlm_utils.case_filtering import get_parametrized_options
-from .vlm_utils.types import (CustomTestOptions, ExpandableVLMTestArgs,
-                              VLMTestInfo, VLMTestType)
+from .vlm_utils.types import (
+    CustomTestOptions,
+    ExpandableVLMTestArgs,
+    VLMTestInfo,
+    VLMTestType,
+)

 # This hack is needed for phi3v & paligemma models
 # ROCm Triton FA can run into shared memory issues with these models,
@@ -828,7 +841,7 @@ def _mark_splits(
    new_test_settings = dict[str, VLMTestInfo]()

    for i in range(num_groups):
-        models_in_group = models[i * split_size:(i + 1) * split_size]
+        models_in_group = models[i * split_size : (i + 1) * split_size]

        for model in models_in_group:
            for info in test_infos_by_model[model]:
@@ -859,7 +872,8 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.IMAGE,
        create_new_process_for_each_test=False,
-    ))
+    ),
+)
 def test_single_image_models(
    tmp_path: PosixPath,
    model_type: str,
@@ -885,7 +899,8 @@ def test_single_image_models(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.MULTI_IMAGE,
        create_new_process_for_each_test=False,
-    ))
+    ),
+)
 def test_multi_image_models(
    tmp_path: PosixPath,
    model_type: str,
@@ -911,7 +926,8 @@ def test_multi_image_models(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.EMBEDDING,
        create_new_process_for_each_test=False,
-    ))
+    ),
+)
 def test_image_embedding_models(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
@@ -935,7 +951,8 @@ def test_image_embedding_models(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.VIDEO,
        create_new_process_for_each_test=False,
-    ))
+    ),
+)
 def test_video_models(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
@@ -959,7 +976,8 @@ def test_video_models(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.AUDIO,
        create_new_process_for_each_test=False,
-    ))
+    ),
+)
 def test_audio_models(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
@@ -983,7 +1001,8 @@ def test_audio_models(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.CUSTOM_INPUTS,
        create_new_process_for_each_test=False,
-    ))
+    ),
+)
 def test_custom_inputs_models(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
@@ -1006,7 +1025,8 @@ def test_custom_inputs_models(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.IMAGE,
        create_new_process_for_each_test=True,
-    ))
+    ),
+)
@create_new_process_for_each_test()
 def test_single_image_models_heavy(
    tmp_path: PosixPath,
@@ -1033,7 +1053,8 @@ def test_single_image_models_heavy(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.MULTI_IMAGE,
        create_new_process_for_each_test=True,
-    ))
+    ),
+)
@create_new_process_for_each_test()
 def test_multi_image_models_heavy(
    tmp_path: PosixPath,
@@ -1060,7 +1081,8 @@ def test_multi_image_models_heavy(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.EMBEDDING,
        create_new_process_for_each_test=True,
-    ))
+    ),
+)
@create_new_process_for_each_test()
 def test_image_embedding_models_heavy(
    model_type: str,
@@ -1085,7 +1107,8 @@ def test_image_embedding_models_heavy(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.VIDEO,
        create_new_process_for_each_test=True,
-    ))
+    ),
+)
 def test_video_models_heavy(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
@@ -1109,7 +1132,8 @@ def test_video_models_heavy(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.AUDIO,
        create_new_process_for_each_test=True,
-    ))
+    ),
+)
 def test_audio_models_heavy(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
@@ -1133,7 +1157,8 @@ def test_audio_models_heavy(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.CUSTOM_INPUTS,
        create_new_process_for_each_test=True,
-    ))
+    ),
+)
@create_new_process_for_each_test()
 def test_custom_inputs_models_heavy(
    model_type: str,
--- a/tests/models/multimodal/generation/test_granite_speech.py
+++ b/tests/models/multimodal/generation/test_granite_speech.py
@@ -10,8 +10,7 @@ from transformers import AutoModelForSpeechSeq2Seq
 from vllm.logprobs import SampleLogprobs
 from vllm.lora.request import LoRARequest

-from ....conftest import (AudioTestAssets, HfRunner, PromptAudioInput,
-                          VllmRunner)
+from ....conftest import AudioTestAssets, HfRunner, PromptAudioInput, VllmRunner
 from ...registry import HF_EXAMPLE_MODELS
 from ...utils import check_logprobs_close

@@ -64,50 +63,49 @@ def run_test(
    # will hurt multiprocessing backend with fork method (the default method).
    # max_model_len should be greater than image_feature_size
    with vllm_runner(
-            model,
-            runner="generate",
-            max_model_len=max_model_len,
-            max_num_seqs=1,
-            dtype=dtype,
-            limit_mm_per_prompt={"audio": 1},
-            tensor_parallel_size=tensor_parallel_size,
-            distributed_executor_backend=distributed_executor_backend,
-            enable_lora=True,
-            max_lora_rank=64,
-            enforce_eager=True,
+        model,
+        runner="generate",
+        max_model_len=max_model_len,
+        max_num_seqs=1,
+        dtype=dtype,
+        limit_mm_per_prompt={"audio": 1},
+        tensor_parallel_size=tensor_parallel_size,
+        distributed_executor_backend=distributed_executor_backend,
+        enable_lora=True,
+        max_lora_rank=64,
+        enforce_eager=True,
    ) as vllm_model:
        lora_request = LoRARequest("audio", 1, audio_lora_path)
        vllm_outputs_per_case = [
-            vllm_model.generate_greedy_logprobs(prompts,
-                                                max_tokens,
-                                                num_logprobs=num_logprobs,
-                                                audios=audios,
-                                                lora_request=lora_request)
+            vllm_model.generate_greedy_logprobs(
+                prompts,
+                max_tokens,
+                num_logprobs=num_logprobs,
+                audios=audios,
+                lora_request=lora_request,
+            )
            for prompts, audios in inputs
        ]

-    with hf_runner(model, dtype=dtype,
-                   auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
-
+    with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
        hf_processor = hf_model.processor
        eos_token_id = hf_processor.tokenizer.eos_token_id

        hf_outputs_per_case = [
-            hf_model.generate_greedy_logprobs_limit(prompts,
-                                                    max_tokens,
-                                                    num_logprobs=num_logprobs,
-                                                    audios=[audios],
-                                                    eos_token_id=eos_token_id)
+            hf_model.generate_greedy_logprobs_limit(
+                prompts,
+                max_tokens,
+                num_logprobs=num_logprobs,
+                audios=[audios],
+                eos_token_id=eos_token_id,
+            )
            for prompts, audios in inputs
        ]

-    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
-                                        vllm_outputs_per_case):
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
-            outputs_1_lst=[
-                vllm_to_hf_output(output) for output in vllm_outputs
-            ],
+            outputs_1_lst=[vllm_to_hf_output(output) for output in vllm_outputs],
            name_0="hf",
            name_1="vllm",
        )
@@ -118,9 +116,16 @@ def run_test(
@pytest.mark.parametrize("max_model_len", [2048])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
-def test_models(hf_runner, vllm_runner, model: str,
-                audio_assets: AudioTestAssets, dtype: str, max_model_len: int,
-                max_tokens: int, num_logprobs: int) -> None:
+def test_models(
+    hf_runner,
+    vllm_runner,
+    model: str,
+    audio_assets: AudioTestAssets,
+    dtype: str,
+    max_model_len: int,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")
--- a/tests/models/multimodal/generation/test_interleaved.py
+++ b/tests/models/multimodal/generation/test_interleaved.py
@@ -28,8 +28,7 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
    give the same result.
    """

-    image_cherry = convert_image_mode(
-        ImageAsset("cherry_blossom").pil_image, "RGB")
+    image_cherry = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
    image_stop = convert_image_mode(ImageAsset("stop_sign").pil_image, "RGB")
    images = [image_cherry, image_stop]
    video = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays
@@ -47,29 +46,30 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
        ),
    ]

-    with vllm_runner(model,
-                     runner="generate",
-                     dtype=dtype,
-                     limit_mm_per_prompt={"image": 2},
-                     max_model_len=32768,
-                     max_num_seqs=2,
-                     tensor_parallel_size=1,
-                     enforce_eager=True) as vllm_model:
+    with vllm_runner(
+        model,
+        runner="generate",
+        dtype=dtype,
+        limit_mm_per_prompt={"image": 2},
+        max_model_len=32768,
+        max_num_seqs=2,
+        tensor_parallel_size=1,
+        enforce_eager=True,
+    ) as vllm_model:
        vllm_outputs_per_case = [
-            vllm_model.generate_greedy(prompts,
-                                       max_tokens,
-                                       images=images,
-                                       videos=videos)
+            vllm_model.generate_greedy(
+                prompts, max_tokens, images=images, videos=videos
+            )
            for prompts, images, videos in inputs
        ]

    all_results = [output[0][1] for output in vllm_outputs_per_case]
-    outputs = [(total_str, total_str.find("assistant\n") + len("assistant\n"))
-               for total_str in all_results]
-    prompt_lengths = [prompt_len for _, prompt_len in outputs]
-    generated_strs = [
-        total_str[prompt_len:] for total_str, prompt_len in outputs
+    outputs = [
+        (total_str, total_str.find("assistant\n") + len("assistant\n"))
+        for total_str in all_results
    ]
+    prompt_lengths = [prompt_len for _, prompt_len in outputs]
+    generated_strs = [total_str[prompt_len:] for total_str, prompt_len in outputs]
    interleaved_prompt_len, noninterleaved_prompt_len = prompt_lengths
    interleaved_output_str, noninterleaved_output_str = generated_strs

--- a/tests/models/multimodal/generation/test_maverick.py
+++ b/tests/models/multimodal/generation/test_maverick.py
@@ -18,13 +18,11 @@ from typing import Any
 import pytest
 import torch
 from safetensors.torch import save_file
-from transformers import (AutoConfig, AutoProcessor, AutoTokenizer,
-                          GenerationConfig)
+from transformers import AutoConfig, AutoProcessor, AutoTokenizer, GenerationConfig

 from vllm import LLM, SamplingParams
 from vllm.v1.executor.abstract import Executor
-from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec,
-                                        FullAttentionSpec)
+from vllm.v1.kv_cache_interface import ChunkedLocalAttentionSpec, FullAttentionSpec

 from ....utils import multi_gpu_test

@@ -93,8 +91,7 @@ def get_rope_layers_config(model_path: str) -> list[int]:


 def create_reduced_maverick_model(
-    original_model_name:
-    str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
+    original_model_name: str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
    output_dir: str = "/tmp/reduced_maverick",
    text_layers: int = 4,
    num_experts: int = 4,
@@ -118,7 +115,8 @@ def create_reduced_maverick_model(

    print(
        f"Creating reduced Maverick model with {text_layers} text layers and "
-        f"{vision_layers} vision layers...")
+        f"{vision_layers} vision layers..."
+    )

    # Create output directory
    output_path = Path(output_dir)
@@ -126,19 +124,23 @@ def create_reduced_maverick_model(
        if force_recreate:
            shutil.rmtree(output_path)
        else:
-            print(f"Output directory {output_dir} already exists. "
-                  "Use --force-recreate to overwrite.")
+            print(
+                f"Output directory {output_dir} already exists. "
+                "Use --force-recreate to overwrite."
+            )
            return str(output_path)

    output_path.mkdir(parents=True, exist_ok=True)

    try:
        print("Loading original model configuration...")
-        original_config = AutoConfig.from_pretrained(original_model_name,
-                                                     trust_remote_code=True)
+        original_config = AutoConfig.from_pretrained(
+            original_model_name, trust_remote_code=True
+        )
        print("Creating reduced configuration...")
-        reduced_config = create_reduced_config(original_config, text_layers,
-                                               num_experts, vision_layers)
+        reduced_config = create_reduced_config(
+            original_config, text_layers, num_experts, vision_layers
+        )

        config_path = output_path / "config.json"
        with open(config_path, "w") as f:
@@ -149,8 +151,7 @@ def create_reduced_maverick_model(
        copy_tokenizer_files(original_model_name, output_path)

        print("Creating reduced safetensors files...")
-        create_reduced_safetensors(original_config, reduced_config,
-                                   output_path)
+        create_reduced_safetensors(original_config, reduced_config, output_path)

        print("Creating preprocessor config...")
        create_preprocessor_config(original_config, output_path)
@@ -173,9 +174,9 @@ def create_reduced_maverick_model(
        raise


-def create_reduced_config(original_config: Any, text_layers: int,
-                          num_experts: int,
-                          vision_layers: int) -> dict[str, Any]:
+def create_reduced_config(
+    original_config: Any, text_layers: int, num_experts: int, vision_layers: int
+) -> dict[str, Any]:
    """Create a reduced configuration based on the original."""

    # Convert config to dictionary
@@ -185,23 +186,18 @@ def create_reduced_config(original_config: Any, text_layers: int,
    if "text_config" in config_dict:
        original_text_layers = config_dict["text_config"]["num_hidden_layers"]
        config_dict["text_config"]["num_hidden_layers"] = text_layers
-        print(
-            f"Reduced text layers from {original_text_layers} to {text_layers}"
-        )
+        print(f"Reduced text layers from {original_text_layers} to {text_layers}")

        original_num_experts = config_dict["text_config"]["num_local_experts"]
        config_dict["text_config"]["num_local_experts"] = num_experts
-        print(
-            f"Reduced num experts from {original_num_experts} to {num_experts}"
-        )
+        print(f"Reduced num experts from {original_num_experts} to {num_experts}")

        hidden_dim_divisor = 4

        original_hidden_size = config_dict["text_config"]["hidden_size"]
        new_hidden_size = original_hidden_size // hidden_dim_divisor
        config_dict["text_config"]["hidden_size"] = new_hidden_size
-        print(f"Reduced hidden size from {original_hidden_size} to "
-              f"{new_hidden_size}")
+        print(f"Reduced hidden size from {original_hidden_size} to {new_hidden_size}")

        original_head_dim = config_dict["text_config"]["head_dim"]
        new_head_dim = original_head_dim // hidden_dim_divisor
@@ -210,15 +206,12 @@ def create_reduced_config(original_config: Any, text_layers: int,

    # Reduce vision layers
    if "vision_config" in config_dict:
-        original_vision_layers = config_dict["vision_config"][
-            "num_hidden_layers"]
+        original_vision_layers = config_dict["vision_config"]["num_hidden_layers"]
        config_dict["vision_config"]["num_hidden_layers"] = vision_layers
-        print(f"Reduced vision layers from {original_vision_layers} "
-              f"to {vision_layers}")
+        print(f"Reduced vision layers from {original_vision_layers} to {vision_layers}")

    # Update model name to indicate it's a reduced version
-    config_dict["_name_or_path"] = (
-        f"reduced_maverick_{text_layers}t_{vision_layers}v")
+    config_dict["_name_or_path"] = f"reduced_maverick_{text_layers}t_{vision_layers}v"

    return config_dict

@@ -227,16 +220,16 @@ def copy_tokenizer_files(original_model_name: str, output_path: Path) -> None:
    """Copy tokenizer files from the original model."""

    try:
-        tokenizer = AutoTokenizer.from_pretrained(original_model_name,
-                                                  trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained(
+            original_model_name, trust_remote_code=True
+        )
        tokenizer.save_pretrained(output_path)
        print("Tokenizer files copied successfully")
    except Exception as e:
        print(f"Warning: Could not copy tokenizer files: {e}")


-def create_preprocessor_config(original_config: Any,
-                               output_path: Path) -> None:
+def create_preprocessor_config(original_config: Any, output_path: Path) -> None:
    """Create preprocessor_config.json for multimodal model."""

    # Try to load the original preprocessor config
@@ -254,9 +247,9 @@ def create_preprocessor_config(original_config: Any,
        raise


-def create_reduced_safetensors(original_config: Any, reduced_config: dict[str,
-                                                                          Any],
-                               output_path: Path) -> None:
+def create_reduced_safetensors(
+    original_config: Any, reduced_config: dict[str, Any], output_path: Path
+) -> None:
    """Create safetensors files with weights for the reduced model."""

    print("Generating synthetic weights for reduced model...")
@@ -279,8 +272,7 @@ def create_reduced_safetensors(original_config: Any, reduced_config: dict[str,
    save_weights_to_safetensors(weights, output_path)


-def create_text_model_weights(
-        text_config: dict[str, Any]) -> dict[str, torch.Tensor]:
+def create_text_model_weights(text_config: dict[str, Any]) -> dict[str, torch.Tensor]:
    """Create synthetic weights for the text model with MoE structure."""

    weights = {}
@@ -291,19 +283,18 @@ def create_text_model_weights(
    intermediate_size_mlp = text_config["intermediate_size_mlp"]
    num_layers = text_config["num_hidden_layers"]
    num_attention_heads = text_config["num_attention_heads"]
-    num_key_value_heads = text_config.get("num_key_value_heads",
-                                          num_attention_heads)
+    num_key_value_heads = text_config.get("num_key_value_heads", num_attention_heads)

    # MoE specific parameters
    num_experts = text_config.get("num_local_experts")
-    assert (num_experts
-            is not None), "num_local_experts must be specified for MoE"
+    assert num_experts is not None, "num_local_experts must be specified for MoE"

    head_dim = hidden_size // num_attention_heads

    # Embedding layers
    weights["language_model.model.embed_tokens.weight"] = torch.randn(
-        vocab_size, hidden_size, dtype=torch.float16)
+        vocab_size, hidden_size, dtype=torch.float16
+    )

    # Transformer layers
    for layer_idx in range(num_layers):
@@ -312,95 +303,105 @@ def create_text_model_weights(

        # Self-attention weights (separate q, k, v projections)
        weights[f"{layer_prefix}.self_attn.q_proj.weight"] = torch.randn(
-            hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16)
+            hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16
+        )
        weights[f"{layer_prefix}.self_attn.k_proj.weight"] = torch.randn(
-            hidden_size, num_key_value_heads * head_dim, dtype=torch.bfloat16)
+            hidden_size, num_key_value_heads * head_dim, dtype=torch.bfloat16
+        )
        weights[f"{layer_prefix}.self_attn.v_proj.weight"] = torch.randn(
-            num_key_value_heads * head_dim, hidden_size, dtype=torch.bfloat16)
+            num_key_value_heads * head_dim, hidden_size, dtype=torch.bfloat16
+        )
        weights[f"{layer_prefix}.self_attn.o_proj.weight"] = torch.randn(
-            hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16)
+            hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16
+        )
        print("Self-attention weights created.")

        # Feed-forward weights - MoE pattern based on interleave_moe_layer_step
        # For interleave_moe_layer_step=2: layers 1,3,5,... are MoE, layers
        # 0,2,4,... are dense
        interleave_step = text_config.get("interleave_moe_layer_step", 1)
-        is_moe_layer = (interleave_step > 0
-                        and (layer_idx + 1) % interleave_step == 0)
+        is_moe_layer = interleave_step > 0 and (layer_idx + 1) % interleave_step == 0

        if is_moe_layer:
            # MoE layer structure
            # 1. Router weights
-            weights[
-                f"{layer_prefix}.feed_forward.router.weight"] = torch.randn(
-                    num_experts, hidden_size, dtype=torch.float16)
+            weights[f"{layer_prefix}.feed_forward.router.weight"] = torch.randn(
+                num_experts, hidden_size, dtype=torch.float16
+            )

            # 2. Individual expert weights (not fused)
            for expert_idx in range(num_experts):
-                expert_prefix = (
-                    f"{layer_prefix}.feed_forward.experts.{expert_idx}")
+                expert_prefix = f"{layer_prefix}.feed_forward.experts.{expert_idx}"

                weights[f"{expert_prefix}.gate_proj.weight"] = torch.randn(
-                    intermediate_size, hidden_size, dtype=torch.bfloat16)
+                    intermediate_size, hidden_size, dtype=torch.bfloat16
+                )
                weights[f"{expert_prefix}.up_proj.weight"] = torch.randn(
-                    intermediate_size, hidden_size, dtype=torch.bfloat16)
+                    intermediate_size, hidden_size, dtype=torch.bfloat16
+                )
                weights[f"{expert_prefix}.down_proj.weight"] = torch.randn(
-                    hidden_size, intermediate_size, dtype=torch.bfloat16)
+                    hidden_size, intermediate_size, dtype=torch.bfloat16
+                )

                # Expert weight scales (FP8 quantization)
-                weights[
-                    f"{expert_prefix}.gate_proj.weight_scale"] = torch.ones(
-                        intermediate_size, 1, dtype=torch.bfloat16)
+                weights[f"{expert_prefix}.gate_proj.weight_scale"] = torch.ones(
+                    intermediate_size, 1, dtype=torch.bfloat16
+                )
                weights[f"{expert_prefix}.up_proj.weight_scale"] = torch.ones(
-                    intermediate_size, 1, dtype=torch.bfloat16)
-                weights[
-                    f"{expert_prefix}.down_proj.weight_scale"] = torch.ones(
-                        hidden_size, 1, dtype=torch.bfloat16)
+                    intermediate_size, 1, dtype=torch.bfloat16
+                )
+                weights[f"{expert_prefix}.down_proj.weight_scale"] = torch.ones(
+                    hidden_size, 1, dtype=torch.bfloat16
+                )

            # 3. Shared expert weights
            shared_expert_prefix = f"{layer_prefix}.feed_forward.shared_expert"
            weights[f"{shared_expert_prefix}.gate_proj.weight"] = torch.randn(
-                intermediate_size, hidden_size, dtype=torch.bfloat16)
+                intermediate_size, hidden_size, dtype=torch.bfloat16
+            )
            weights[f"{shared_expert_prefix}.up_proj.weight"] = torch.randn(
-                intermediate_size, hidden_size, dtype=torch.bfloat16)
+                intermediate_size, hidden_size, dtype=torch.bfloat16
+            )
            weights[f"{shared_expert_prefix}.down_proj.weight"] = torch.randn(
-                hidden_size, intermediate_size, dtype=torch.bfloat16)
+                hidden_size, intermediate_size, dtype=torch.bfloat16
+            )
            print(f"MoE feed-forward weights created for layer {layer_idx}.")
        else:
            # Dense layer structure
-            weights[f"{layer_prefix}.feed_forward.gate_proj.weight"] = (
-                torch.randn(intermediate_size_mlp,
-                            hidden_size,
-                            dtype=torch.bfloat16))
-            weights[f"{layer_prefix}.feed_forward.up_proj.weight"] = (
-                torch.randn(intermediate_size_mlp,
-                            hidden_size,
-                            dtype=torch.bfloat16))
-            weights[f"{layer_prefix}.feed_forward.down_proj.weight"] = (
-                torch.randn(hidden_size,
-                            intermediate_size_mlp,
-                            dtype=torch.bfloat16))
+            weights[f"{layer_prefix}.feed_forward.gate_proj.weight"] = torch.randn(
+                intermediate_size_mlp, hidden_size, dtype=torch.bfloat16
+            )
+            weights[f"{layer_prefix}.feed_forward.up_proj.weight"] = torch.randn(
+                intermediate_size_mlp, hidden_size, dtype=torch.bfloat16
+            )
+            weights[f"{layer_prefix}.feed_forward.down_proj.weight"] = torch.randn(
+                hidden_size, intermediate_size_mlp, dtype=torch.bfloat16
+            )
            print(f"Dense feed-forward weights created for layer {layer_idx}.")

        # Layer norms
        weights[f"{layer_prefix}.input_layernorm.weight"] = torch.ones(
-            hidden_size, dtype=torch.bfloat16)
-        weights[
-            f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
-                hidden_size, dtype=torch.bfloat16)
+            hidden_size, dtype=torch.bfloat16
+        )
+        weights[f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
+            hidden_size, dtype=torch.bfloat16
+        )
        print("Layer norms created.")

    # Final layer norm and output projection
    weights["language_model.model.norm.weight"] = torch.ones(
-        hidden_size, dtype=torch.bfloat16)
+        hidden_size, dtype=torch.bfloat16
+    )
    weights["language_model.lm_head.weight"] = torch.randn(
-        vocab_size, hidden_size, dtype=torch.bfloat16)
+        vocab_size, hidden_size, dtype=torch.bfloat16
+    )

    return weights


 def create_vision_model_weights(
-        vision_config: dict[str, Any]) -> dict[str, torch.Tensor]:
+    vision_config: dict[str, Any],
+) -> dict[str, torch.Tensor]:
    """Create synthetic weights for the vision model."""

    weights = {}
@@ -414,47 +415,62 @@ def create_vision_model_weights(
        layer_prefix = f"vision_model.model.layers.{layer_idx}"

        weights[f"{layer_prefix}.self_attn.q_proj.weight"] = torch.randn(
-            hidden_size, hidden_size, dtype=torch.bfloat16)
+            hidden_size, hidden_size, dtype=torch.bfloat16
+        )
        weights[f"{layer_prefix}.self_attn.q_proj.bias"] = torch.zeros(
-            hidden_size, dtype=torch.bfloat16)
+            hidden_size, dtype=torch.bfloat16
+        )
        weights[f"{layer_prefix}.self_attn.k_proj.weight"] = torch.randn(
-            hidden_size, hidden_size, dtype=torch.bfloat16)
+            hidden_size, hidden_size, dtype=torch.bfloat16
+        )
        weights[f"{layer_prefix}.self_attn.k_proj.bias"] = torch.zeros(
-            hidden_size, dtype=torch.bfloat16)
+            hidden_size, dtype=torch.bfloat16
+        )
        weights[f"{layer_prefix}.self_attn.v_proj.weight"] = torch.randn(
-            hidden_size, hidden_size, dtype=torch.bfloat16)
+            hidden_size, hidden_size, dtype=torch.bfloat16
+        )
        weights[f"{layer_prefix}.self_attn.v_proj.bias"] = torch.zeros(
-            hidden_size, dtype=torch.bfloat16)
+            hidden_size, dtype=torch.bfloat16
+        )
        weights[f"{layer_prefix}.self_attn.o_proj.weight"] = torch.randn(
-            hidden_size, hidden_size, dtype=torch.bfloat16)
+            hidden_size, hidden_size, dtype=torch.bfloat16
+        )
        weights[f"{layer_prefix}.self_attn.o_proj.bias"] = torch.zeros(
-            hidden_size, dtype=torch.bfloat16)
+            hidden_size, dtype=torch.bfloat16
+        )

        weights[f"{layer_prefix}.mlp.fc1.weight"] = torch.randn(
-            intermediate_size, hidden_size, dtype=torch.bfloat16)
+            intermediate_size, hidden_size, dtype=torch.bfloat16
+        )
        weights[f"{layer_prefix}.mlp.fc1.bias"] = torch.zeros(
-            intermediate_size, dtype=torch.bfloat16)
+            intermediate_size, dtype=torch.bfloat16
+        )
        weights[f"{layer_prefix}.mlp.fc2.weight"] = torch.randn(
-            hidden_size, intermediate_size, dtype=torch.bfloat16)
+            hidden_size, intermediate_size, dtype=torch.bfloat16
+        )
        weights[f"{layer_prefix}.mlp.fc2.bias"] = torch.zeros(
-            hidden_size, dtype=torch.bfloat16)
+            hidden_size, dtype=torch.bfloat16
+        )

        weights[f"{layer_prefix}.input_layernorm.weight"] = torch.ones(
-            hidden_size, dtype=torch.bfloat16)
+            hidden_size, dtype=torch.bfloat16
+        )
        weights[f"{layer_prefix}.input_layernorm.bias"] = torch.zeros(
-            hidden_size, dtype=torch.bfloat16)
-        weights[
-            f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
-                hidden_size, dtype=torch.bfloat16)
+            hidden_size, dtype=torch.bfloat16
+        )
+        weights[f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
+            hidden_size, dtype=torch.bfloat16
+        )
        weights[f"{layer_prefix}.post_attention_layernorm.bias"] = torch.zeros(
-            hidden_size, dtype=torch.bfloat16)
+            hidden_size, dtype=torch.bfloat16
+        )

    return weights


 def create_shared_weights(
-        text_config: dict[str, Any],
-        vision_config: dict[str, Any]) -> dict[str, torch.Tensor]:
+    text_config: dict[str, Any], vision_config: dict[str, Any]
+) -> dict[str, torch.Tensor]:
    """Create weights for shared components (vision-language connector)"""

    weights = {}
@@ -464,13 +480,15 @@ def create_shared_weights(

    # Vision-language connector (projects vision features to text space)
    weights["multi_modal_projector.linear_1.weight"] = torch.randn(
-        text_hidden_size, projector_input_dim, dtype=torch.bfloat16)
+        text_hidden_size, projector_input_dim, dtype=torch.bfloat16
+    )

    return weights


-def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
-                                output_path: Path) -> None:
+def save_weights_to_safetensors(
+    weights: dict[str, torch.Tensor], output_path: Path
+) -> None:
    """Save weights to safetensors files and create index."""

    # Determine how to shard the weights
@@ -507,18 +525,18 @@ def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
    else:
        # Multiple shards
        for i, shard in enumerate(shards):
-            filename = f"model-{i+1:05d}-of-{len(shards):05d}.safetensors"
+            filename = f"model-{i + 1:05d}-of-{len(shards):05d}.safetensors"
            save_file(shard, output_path / filename)
            for name in shard:
                weight_map[name] = filename
-            print(f"Saved shard {i+1}/{len(shards)}: {filename}")
+            print(f"Saved shard {i + 1}/{len(shards)}: {filename}")

    # Create index file
    index_data = {
        "metadata": {
-            "total_size":
-            sum(tensor.numel() * tensor.element_size()
-                for tensor in weights.values())
+            "total_size": sum(
+                tensor.numel() * tensor.element_size() for tensor in weights.values()
+            )
        },
        "weight_map": weight_map,
    }
@@ -528,8 +546,9 @@ def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
        json.dump(index_data, f, indent=2)

    print(f"Created index file: {index_path}")
-    print(f"Total model size: "
-          f"{index_data['metadata']['total_size'] / (1024**3):.2f} GB")
+    print(
+        f"Total model size: {index_data['metadata']['total_size'] / (1024**3):.2f} GB"
+    )


 def check_attention_spec_interleaved_rope(
@@ -540,8 +559,7 @@ def check_attention_spec_interleaved_rope(
 ):
    """Check that the attention spec is correct."""
    assert isinstance(llm.llm_engine.model_executor, Executor)
-    kv_cache_specs_per_rank = llm.llm_engine.model_executor.get_kv_cache_specs(
-    )
+    kv_cache_specs_per_rank = llm.llm_engine.model_executor.get_kv_cache_specs()
    for rank in range(num_ranks):
        kv_cache_specs = kv_cache_specs_per_rank[rank]
        assert len(kv_cache_specs.keys()) == num_attention_layers
@@ -551,16 +569,14 @@ def check_attention_spec_interleaved_rope(
            else:
                expected_spec = ChunkedLocalAttentionSpec
            assert isinstance(
-                kv_cache_specs[
-                    f"language_model.model.layers.{i}.self_attn.attn"],
-                expected_spec)
+                kv_cache_specs[f"language_model.model.layers.{i}.self_attn.attn"],
+                expected_spec,
+            )


 def run_reduced_model(llm: LLM, should_profile: bool = False) -> None:
    """Test the created reduced model with vLLM."""
-    sampling_params = SamplingParams(temperature=0.8,
-                                     top_p=0.95,
-                                     max_tokens=50)
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=50)

    if should_profile:
        llm.start_profile()
@@ -571,15 +587,15 @@ def run_reduced_model(llm: LLM, should_profile: bool = False) -> None:
    print("Test generation successful!")
    for output in outputs:
        print(f"Prompt: {output.prompt}")
-        print(f"Output: "
-              f"{output.outputs[0].text}")
+        print(f"Output: {output.outputs[0].text}")
        print("-" * 40)


@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
    "original_model_name,text_layers,num_experts,vision_layers,",
-    [("meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", 4, 4, 2)])
+    [("meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", 4, 4, 2)],
+)
@pytest.mark.parametrize("enforce_eager", [True, False])
@pytest.mark.parametrize("tp,ep", [(2, True)])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@@ -640,7 +656,8 @@ def main():
    import argparse

    parser = argparse.ArgumentParser(
-        description="Create a reduced-layer Maverick model")
+        description="Create a reduced-layer Maverick model"
+    )
    parser.add_argument(
        "--output-dir",
        default="/tmp/reduced_maverick",
@@ -652,10 +669,7 @@ def main():
        default=4,
        help="Number of text transformer layers",
    )
-    parser.add_argument("--num-experts",
-                        type=int,
-                        default=4,
-                        help="Number of experts")
+    parser.add_argument("--num-experts", type=int, default=4, help="Number of experts")
    parser.add_argument(
        "--vision-layers",
        type=int,
@@ -667,12 +681,12 @@ def main():
        action="store_true",
        help="Force recreation if output directory exists",
    )
-    parser.add_argument("--test",
-                        action="store_true",
-                        help="Test the created model with vLLM")
-    parser.add_argument("--profile",
-                        action="store_true",
-                        help="Profile the created model with vLLM")
+    parser.add_argument(
+        "--test", action="store_true", help="Test the created model with vLLM"
+    )
+    parser.add_argument(
+        "--profile", action="store_true", help="Profile the created model with vLLM"
+    )
    parser.add_argument(
        "--test-original",
        action="store_true",
@@ -687,16 +701,18 @@ def main():
    args = parser.parse_args()

    if args.test:
-        test_dummy_maverick(original_model_name=args.original_model,
-                            output_dir=args.output_dir,
-                            text_layers=args.text_layers,
-                            num_experts=args.num_experts,
-                            vision_layers=args.vision_layers,
-                            force_recreate=args.force_recreate,
-                            tp=2,
-                            ep=True,
-                            enforce_eager=True,
-                            profile=args.profile)
+        test_dummy_maverick(
+            original_model_name=args.original_model,
+            output_dir=args.output_dir,
+            text_layers=args.text_layers,
+            num_experts=args.num_experts,
+            vision_layers=args.vision_layers,
+            force_recreate=args.force_recreate,
+            tp=2,
+            ep=True,
+            enforce_eager=True,
+            profile=args.profile,
+        )

    if args.test_original:
        run_maverick_serving(args.original_model)
--- a/tests/models/multimodal/generation/test_phi4_multimodal.py
+++ b/tests/models/multimodal/generation/test_phi4_multimodal.py
@@ -14,26 +14,35 @@ from vllm.lora.request import LoRARequest
 from vllm.multimodal.image import rescale_image_size
 from vllm.platforms import current_platform

-from ....conftest import (IMAGE_ASSETS, HfRunner, PromptAudioInput,
-                          PromptImageInput, VllmRunner)
+from ....conftest import (
+    IMAGE_ASSETS,
+    HfRunner,
+    PromptAudioInput,
+    PromptImageInput,
+    VllmRunner,
+)
 from ....utils import large_gpu_test
 from ...utils import check_logprobs_close

-HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
-    "stop_sign":
-    "<|user|>\n<|image|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n",  # noqa: E501
-    "cherry_blossom":
-    "<|user|>\n<|image|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n",  # noqa: E501
-})
-HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image|>\n<|image|>\nDescribe these images.<|end|>\n<|assistant|>\n"  # noqa: E501
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
+    {
+        "stop_sign": "<|user|>\n<|image|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n",  # noqa: E501
+        "cherry_blossom": "<|user|>\n<|image|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n",  # noqa: E501
+    }
+)
+HF_MULTIIMAGE_IMAGE_PROMPT = (
+    "<|user|>\n<|image|>\n<|image|>\nDescribe these images.<|end|>\n<|assistant|>\n"  # noqa: E501
+)

-model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct",
-                               revision="refs/pr/70")
+model_path = snapshot_download(
+    "microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
+)
 # Since the vision-lora and speech-lora co-exist with the base model,
 # we have to manually specify the path of the lora weights.
 vision_lora_path = os.path.join(model_path, "vision-lora")
-speech_question = os.path.join(model_path, "examples",
-                               "what_is_shown_in_this_image.wav")
+speech_question = os.path.join(
+    model_path, "examples", "what_is_shown_in_this_image.wav"
+)
 models = [model_path]

 target_dtype = "half"
@@ -48,8 +57,7 @@ if current_platform.is_rocm():
 def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
-    inputs: Sequence[tuple[list[str], PromptImageInput,
-                           Optional[PromptAudioInput]]],
+    inputs: Sequence[tuple[list[str], PromptImageInput, Optional[PromptAudioInput]]],
    model: str,
    *,
    max_model_len: int,
@@ -75,28 +83,30 @@ def run_test(
    # will hurt multiprocessing backend with fork method (the default method).
    # max_model_len should be greater than image_feature_size
    with vllm_runner(
-            model,
-            task="generate",
-            max_model_len=max_model_len,
-            max_num_seqs=2,
-            dtype=dtype,
-            limit_mm_per_prompt={"image": mm_limit},
-            tensor_parallel_size=tensor_parallel_size,
-            distributed_executor_backend=distributed_executor_backend,
-            enable_lora=True,
-            max_lora_rank=320,
-            gpu_memory_utilization=0.8,  # set to 0.8 to avoid OOM in CI
-            enforce_eager=True,
-            trust_remote_code=False,
+        model,
+        task="generate",
+        max_model_len=max_model_len,
+        max_num_seqs=2,
+        dtype=dtype,
+        limit_mm_per_prompt={"image": mm_limit},
+        tensor_parallel_size=tensor_parallel_size,
+        distributed_executor_backend=distributed_executor_backend,
+        enable_lora=True,
+        max_lora_rank=320,
+        gpu_memory_utilization=0.8,  # set to 0.8 to avoid OOM in CI
+        enforce_eager=True,
+        trust_remote_code=False,
    ) as vllm_model:
        lora_request = LoRARequest("vision", 1, vision_lora_path)
        vllm_outputs_per_case = [
-            vllm_model.generate_greedy_logprobs(prompts,
-                                                max_tokens,
-                                                num_logprobs=num_logprobs,
-                                                images=images,
-                                                audios=audios,
-                                                lora_request=lora_request)
+            vllm_model.generate_greedy_logprobs(
+                prompts,
+                max_tokens,
+                num_logprobs=num_logprobs,
+                images=images,
+                audios=audios,
+                lora_request=lora_request,
+            )
            for prompts, images, audios in inputs
        ]

@@ -108,17 +118,18 @@ def run_test(
        hf_processor = hf_model.processor
        eos_token_id = hf_processor.tokenizer.eos_token_id
        hf_outputs_per_case = [
-            hf_model.generate_greedy_logprobs_limit(prompts,
-                                                    max_tokens,
-                                                    num_logprobs=num_logprobs,
-                                                    images=images,
-                                                    audios=audios,
-                                                    eos_token_id=eos_token_id)
+            hf_model.generate_greedy_logprobs_limit(
+                prompts,
+                max_tokens,
+                num_logprobs=num_logprobs,
+                images=images,
+                audios=audios,
+                eos_token_id=eos_token_id,
+            )
            for prompts, images, audios in inputs
        ]

-    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
-                                        vllm_outputs_per_case):
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
@@ -145,16 +156,27 @@ def run_test(
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
-def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
-                dtype: str, max_model_len: int, max_tokens: int,
-                num_logprobs: int) -> None:
+def test_models(
+    hf_runner,
+    vllm_runner,
+    image_assets,
+    model,
+    size_factors,
+    dtype: str,
+    max_model_len: int,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
    images = [asset.pil_image for asset in image_assets]

-    inputs_per_image = [(
-        [prompt for _ in size_factors],
-        [rescale_image_size(image, factor) for factor in size_factors],
-        None,
-    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
+    inputs_per_image = [
+        (
+            [prompt for _ in size_factors],
+            [rescale_image_size(image, factor) for factor in size_factors],
+            None,
+        )
+        for image, prompt in zip(images, HF_IMAGE_PROMPTS)
+    ]

    run_test(
        hf_runner,
@@ -189,16 +211,26 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
@pytest.mark.parametrize("max_model_len", [25600])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
-def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
-                             size_factors, dtype: str, max_model_len: int,
-                             max_tokens: int, num_logprobs: int) -> None:
+def test_multi_images_models(
+    hf_runner,
+    vllm_runner,
+    image_assets,
+    model,
+    size_factors,
+    dtype: str,
+    max_model_len: int,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
    images = [asset.pil_image for asset in image_assets]

    inputs_per_case = [
        (
            [HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
-            [[rescale_image_size(image, factor) for image in images]
-             for factor in size_factors],
+            [
+                [rescale_image_size(image, factor) for image in images]
+                for factor in size_factors
+            ],
            None,
        ),
    ]
@@ -222,10 +254,15 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
-def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,
-                              max_model_len: int, max_tokens: int,
-                              num_logprobs: int) -> None:
-
+def test_vision_speech_models(
+    hf_runner,
+    vllm_runner,
+    model,
+    dtype: str,
+    max_model_len: int,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
    # use the example speech question so that the model outputs are reasonable
    audio = librosa.load(speech_question, sr=16000)
    image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
--- a/tests/models/multimodal/generation/test_phi4mm.py
+++ b/tests/models/multimodal/generation/test_phi4mm.py
@@ -17,31 +17,39 @@ from vllm.lora.request import LoRARequest
 from vllm.multimodal.image import convert_image_mode, rescale_image_size
 from vllm.platforms import current_platform

-from ....conftest import (IMAGE_ASSETS, HfRunner, PromptAudioInput,
-                          PromptImageInput, VllmRunner)
+from ....conftest import (
+    IMAGE_ASSETS,
+    HfRunner,
+    PromptAudioInput,
+    PromptImageInput,
+    VllmRunner,
+)
 from ....utils import large_gpu_test
 from ...utils import check_logprobs_close

-HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
-    "stop_sign":
-    "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n",  # noqa: E501
-    "cherry_blossom":
-    "<|user|>\n<|image_1|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n",  # noqa: E501
-})
-HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n"  # noqa: E501
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
+    {
+        "stop_sign": "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n",  # noqa: E501
+        "cherry_blossom": "<|user|>\n<|image_1|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n",  # noqa: E501
+    }
+)
+HF_MULTIIMAGE_IMAGE_PROMPT = (
+    "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n"  # noqa: E501
+)

 model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
 # Since the vision-lora and speech-lora co-exist with the base model,
 # we have to manually specify the path of the lora weights.
 vision_lora_path = os.path.join(model_path, "vision-lora")
-speech_question = os.path.join(model_path, "examples",
-                               "what_is_shown_in_this_image.wav")
+speech_question = os.path.join(
+    model_path, "examples", "what_is_shown_in_this_image.wav"
+)
 models = [model_path]


-def vllm_to_hf_output(vllm_output: tuple[list[int], str,
-                                         Optional[SampleLogprobs]],
-                      model: str):
+def vllm_to_hf_output(
+    vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], model: str
+):
    """Sanitize vllm output to be comparable with hf output."""
    _, output_str, out_logprobs = vllm_output

@@ -71,8 +79,7 @@ if current_platform.is_rocm():
 def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
-    inputs: Sequence[tuple[list[str], PromptImageInput,
-                           Optional[PromptAudioInput]]],
+    inputs: Sequence[tuple[list[str], PromptImageInput, Optional[PromptAudioInput]]],
    model: str,
    *,
    max_model_len: int,
@@ -98,27 +105,29 @@ def run_test(
    # will hurt multiprocessing backend with fork method (the default method).
    # max_model_len should be greater than image_feature_size
    with vllm_runner(
-            model,
-            runner="generate",
-            max_model_len=max_model_len,
-            max_num_seqs=2,
-            dtype=dtype,
-            limit_mm_per_prompt={"image": mm_limit},
-            tensor_parallel_size=tensor_parallel_size,
-            distributed_executor_backend=distributed_executor_backend,
-            enable_lora=True,
-            max_lora_rank=320,
-            gpu_memory_utilization=0.8,  # set to 0.8 to avoid OOM in CI
-            enforce_eager=True,
+        model,
+        runner="generate",
+        max_model_len=max_model_len,
+        max_num_seqs=2,
+        dtype=dtype,
+        limit_mm_per_prompt={"image": mm_limit},
+        tensor_parallel_size=tensor_parallel_size,
+        distributed_executor_backend=distributed_executor_backend,
+        enable_lora=True,
+        max_lora_rank=320,
+        gpu_memory_utilization=0.8,  # set to 0.8 to avoid OOM in CI
+        enforce_eager=True,
    ) as vllm_model:
        lora_request = LoRARequest("vision", 1, vision_lora_path)
        vllm_outputs_per_case = [
-            vllm_model.generate_greedy_logprobs(prompts,
-                                                max_tokens,
-                                                num_logprobs=num_logprobs,
-                                                images=images,
-                                                audios=audios,
-                                                lora_request=lora_request)
+            vllm_model.generate_greedy_logprobs(
+                prompts,
+                max_tokens,
+                num_logprobs=num_logprobs,
+                images=images,
+                audios=audios,
+                lora_request=lora_request,
+            )
            for prompts, images, audios in inputs
        ]

@@ -127,42 +136,36 @@ def run_test(
    pytest.skip("HF impl is not compatible with current transformers")

    hf_model_kwargs = {"_attn_implementation": "sdpa"}
-    with hf_runner(model, dtype=dtype,
-                   model_kwargs=hf_model_kwargs) as hf_model:
-
+    with hf_runner(model, dtype=dtype, model_kwargs=hf_model_kwargs) as hf_model:
        hf_processor = hf_model.processor
        eos_token_id = hf_processor.tokenizer.eos_token_id

-        def patch_hf_processor(*args,
-                               text="",
-                               images=None,
-                               audio=None,
-                               sampling_rate=None,
-                               **kwargs):
+        def patch_hf_processor(
+            *args, text="", images=None, audio=None, sampling_rate=None, **kwargs
+        ):
            audios = None
            if audio is not None and sampling_rate is not None:
                audios = [(audio, sampling_rate)]
-            return hf_processor(*args,
-                                text=text,
-                                images=images,
-                                audios=audios,
-                                **kwargs)
+            return hf_processor(
+                *args, text=text, images=images, audios=audios, **kwargs
+            )

        hf_model.processor = patch_hf_processor

        hf_outputs_per_case = [
-            hf_model.generate_greedy_logprobs_limit(prompts,
-                                                    max_tokens,
-                                                    num_logprobs=num_logprobs,
-                                                    images=images,
-                                                    audios=audios,
-                                                    eos_token_id=eos_token_id,
-                                                    num_logits_to_keep=0)
+            hf_model.generate_greedy_logprobs_limit(
+                prompts,
+                max_tokens,
+                num_logprobs=num_logprobs,
+                images=images,
+                audios=audios,
+                eos_token_id=eos_token_id,
+                num_logits_to_keep=0,
+            )
            for prompts, images, audios in inputs
        ]

-    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
-                                        vllm_outputs_per_case):
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
@@ -189,16 +192,27 @@ def run_test(
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
-def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
-                dtype: str, max_model_len: int, max_tokens: int,
-                num_logprobs: int) -> None:
+def test_models(
+    hf_runner,
+    vllm_runner,
+    image_assets,
+    model,
+    size_factors,
+    dtype: str,
+    max_model_len: int,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
    images = [asset.pil_image for asset in image_assets]

-    inputs_per_image = [(
-        [prompt for _ in size_factors],
-        [rescale_image_size(image, factor) for factor in size_factors],
-        None,
-    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
+    inputs_per_image = [
+        (
+            [prompt for _ in size_factors],
+            [rescale_image_size(image, factor) for factor in size_factors],
+            None,
+        )
+        for image, prompt in zip(images, HF_IMAGE_PROMPTS)
+    ]

    run_test(
        hf_runner,
@@ -233,16 +247,26 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
@pytest.mark.parametrize("max_model_len", [25600])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
-def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
-                             size_factors, dtype: str, max_model_len: int,
-                             max_tokens: int, num_logprobs: int) -> None:
+def test_multi_images_models(
+    hf_runner,
+    vllm_runner,
+    image_assets,
+    model,
+    size_factors,
+    dtype: str,
+    max_model_len: int,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
    images = [asset.pil_image for asset in image_assets]

    inputs_per_case = [
        (
            [HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
-            [[rescale_image_size(image, factor) for image in images]
-             for factor in size_factors],
+            [
+                [rescale_image_size(image, factor) for image in images]
+                for factor in size_factors
+            ],
            None,
        ),
    ]
@@ -266,10 +290,15 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
-def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,
-                              max_model_len: int, max_tokens: int,
-                              num_logprobs: int) -> None:
-
+def test_vision_speech_models(
+    hf_runner,
+    vllm_runner,
+    model,
+    dtype: str,
+    max_model_len: int,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
    # use the example speech question so that the model outputs are reasonable
    audio = librosa.load(speech_question, sr=None)
    image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
--- a/tests/models/multimodal/generation/test_pixtral.py
+++ b/tests/models/multimodal/generation/test_pixtral.py
@@ -37,33 +37,33 @@ PROMPT = "Describe each image in one short sentence."


 def _create_msg_format(urls: list[str]) -> list[dict[str, Any]]:
-    return [{
-        "role":
-        "user",
-        "content": [{
-            "type": "text",
-            "text": PROMPT,
-        }] + [{
-            "type": "image_url",
-            "image_url": {
-                "url": url
-            }
-        } for url in urls],
-    }]
+    return [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": PROMPT,
+                }
+            ]
+            + [{"type": "image_url", "image_url": {"url": url}} for url in urls],
+        }
+    ]


 def _create_msg_format_hf(urls: list[str]) -> list[dict[str, Any]]:
-    return [{
-        "role":
-        "user",
-        "content": [{
-            "type": "text",
-            "content": PROMPT,
-        }, *({
-            "type": "image",
-            "image": download_image(url)
-        } for url in urls)],
-    }]
+    return [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "content": PROMPT,
+                },
+                *({"type": "image", "image": download_image(url)} for url in urls),
+            ],
+        }
+    ]


 def _create_engine_inputs(urls: list[str]) -> TokensPrompt:
@@ -125,11 +125,17 @@ def _dump_outputs_w_logprobs(
    outputs: OutputsLogprobs,
    filename: "StrPath",
 ) -> None:
-    json_data = [(tokens, text, [{
-        k: asdict(v)
-        for k, v in token_logprobs.items()
-    } for token_logprobs in (logprobs or [])])
-                 for tokens, text, logprobs in outputs]
+    json_data = [
+        (
+            tokens,
+            text,
+            [
+                {k: asdict(v) for k, v in token_logprobs.items()}
+                for token_logprobs in (logprobs or [])
+            ],
+        )
+        for tokens, text, logprobs in outputs
+    ]

    with open(filename, "w") as f:
        json.dump(json_data, f)
@@ -139,28 +145,35 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
    with open(filename, "rb") as f:
        json_data = json.load(f)

-    return [(tokens, text, [{
-        int(k): Logprob(**v)
-        for k, v in token_logprobs.items()
-    } for token_logprobs in logprobs]) for tokens, text, logprobs in json_data]
+    return [
+        (
+            tokens,
+            text,
+            [
+                {int(k): Logprob(**v) for k, v in token_logprobs.items()}
+                for token_logprobs in logprobs
+            ],
+        )
+        for tokens, text, logprobs in json_data
+    ]


@large_gpu_test(min_gb=80)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN)
@pytest.mark.parametrize("dtype", ["bfloat16"])
-def test_chat(vllm_runner, max_model_len: int, model: str, dtype: str,
-              local_asset_server) -> None:
-    EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(
-        FIXTURE_LOGPROBS_CHAT[model])
+def test_chat(
+    vllm_runner, max_model_len: int, model: str, dtype: str, local_asset_server
+) -> None:
+    EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT[model])
    with vllm_runner(
-            model,
-            dtype=dtype,
-            tokenizer_mode="mistral",
-            load_format="mistral",
-            config_format="mistral",
-            max_model_len=max_model_len,
-            limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
+        model,
+        dtype=dtype,
+        tokenizer_mode="mistral",
+        load_format="mistral",
+        config_format="mistral",
+        max_model_len=max_model_len,
+        limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
    ) as vllm_model:
        outputs = []

@@ -180,7 +193,9 @@ def test_chat(vllm_runner, max_model_len: int, model: str, dtype: str,
    for i in range(len(logprobs)):
        assert logprobs[i][-1] is None
        logprobs[i] = logprobs[i][:-1]
-    check_logprobs_close(outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
-                         outputs_1_lst=logprobs,
-                         name_0="h100_ref",
-                         name_1="output")
+    check_logprobs_close(
+        outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
+        outputs_1_lst=logprobs,
+        name_0="h100_ref",
+        name_1="output",
+    )
--- a/tests/models/multimodal/generation/test_qwen2_5_vl.py
+++ b/tests/models/multimodal/generation/test_qwen2_5_vl.py
@@ -17,14 +17,15 @@ def qwen2_5_vl_chat_template(*query):
    return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n"  # noqa: E501


-VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
-    "baby_reading":
-    qwen2_5_vl_chat_template(
-        VIDEO_PLACEHOLDER,
-        "Describe this video with a short sentence ",
-        "(no more than 20 words)",
-    ),
-})
+VIDEO_PROMPTS = VIDEO_ASSETS.prompts(
+    {
+        "baby_reading": qwen2_5_vl_chat_template(
+            VIDEO_PLACEHOLDER,
+            "Describe this video with a short sentence ",
+            "(no more than 20 words)",
+        ),
+    }
+)


@pytest.mark.core_model
@@ -33,10 +34,15 @@ VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
@pytest.mark.parametrize("num_frames", [16])
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
-def test_qwen2_5_vl_evs_functionality(vllm_runner, video_assets, model,
-                                      video_pruning_rate: float,
-                                      num_frames: int, dtype: str,
-                                      max_tokens: int) -> None:
+def test_qwen2_5_vl_evs_functionality(
+    vllm_runner,
+    video_assets,
+    model,
+    video_pruning_rate: float,
+    num_frames: int,
+    dtype: str,
+    max_tokens: int,
+) -> None:
    """Test EVS (Efficient Video Sampling) functionality with different
    pruning rates.
    """
@@ -51,19 +57,18 @@ def test_qwen2_5_vl_evs_functionality(vllm_runner, video_assets, model,
    videos = [sampled_vids[0]]

    # Initialize model with EVS configuration
-    with vllm_runner(model,
-                     runner="generate",
-                     max_model_len=4000,
-                     max_num_seqs=1,
-                     dtype=dtype,
-                     limit_mm_per_prompt={"video": 1},
-                     tensor_parallel_size=1,
-                     video_pruning_rate=video_pruning_rate) as vllm_model:
-
+    with vllm_runner(
+        model,
+        runner="generate",
+        max_model_len=4000,
+        max_num_seqs=1,
+        dtype=dtype,
+        limit_mm_per_prompt={"video": 1},
+        tensor_parallel_size=1,
+        video_pruning_rate=video_pruning_rate,
+    ) as vllm_model:
        # Generate output - this should not crash
-        outputs = vllm_model.generate_greedy(prompts,
-                                             max_tokens,
-                                             videos=videos)
+        outputs = vllm_model.generate_greedy(prompts, max_tokens, videos=videos)

        # Basic validation that we got a response
        assert len(outputs) == 1
@@ -83,10 +88,15 @@ def test_qwen2_5_vl_evs_functionality(vllm_runner, video_assets, model,
@pytest.mark.parametrize("num_frames", [16])
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
-def test_qwen2_5_vl_evs_batched_videos(vllm_runner, video_assets, model,
-                                       video_pruning_rate: float,
-                                       num_frames: int, dtype: str,
-                                       max_tokens: int) -> None:
+def test_qwen2_5_vl_evs_batched_videos(
+    vllm_runner,
+    video_assets,
+    model,
+    video_pruning_rate: float,
+    num_frames: int,
+    dtype: str,
+    max_tokens: int,
+) -> None:
    """Test EVS functionality with batched videos.

    This test validates that:
@@ -102,23 +112,21 @@ def test_qwen2_5_vl_evs_batched_videos(vllm_runner, video_assets, model,

    # Test batched videos
    prompts = [VIDEO_PROMPTS[0], VIDEO_PROMPTS[0]]
-    videos = [sampled_vids[0],
-              sampled_vids[0]]  # Use same video twice for testing
+    videos = [sampled_vids[0], sampled_vids[0]]  # Use same video twice for testing

    # Initialize model with EVS configuration
-    with vllm_runner(model,
-                     runner="generate",
-                     max_model_len=4000,
-                     max_num_seqs=2,
-                     dtype=dtype,
-                     limit_mm_per_prompt={"video": 2},
-                     tensor_parallel_size=1,
-                     video_pruning_rate=video_pruning_rate) as vllm_model:
-
+    with vllm_runner(
+        model,
+        runner="generate",
+        max_model_len=4000,
+        max_num_seqs=2,
+        dtype=dtype,
+        limit_mm_per_prompt={"video": 2},
+        tensor_parallel_size=1,
+        video_pruning_rate=video_pruning_rate,
+    ) as vllm_model:
        # Generate output - this should not crash
-        outputs = vllm_model.generate_greedy(prompts,
-                                             max_tokens,
-                                             videos=videos)
+        outputs = vllm_model.generate_greedy(prompts, max_tokens, videos=videos)

        # Basic validation that we got responses for both videos
        assert len(outputs) == 2
--- a/tests/models/multimodal/generation/test_qwen2_vl.py
+++ b/tests/models/multimodal/generation/test_qwen2_vl.py
@@ -11,8 +11,13 @@ from PIL import Image
 from vllm.multimodal.image import rescale_image_size
 from vllm.multimodal.video import rescale_video_size, sample_frames_from_video

-from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput,
-                          PromptVideoInput, VllmRunner)
+from ....conftest import (
+    IMAGE_ASSETS,
+    VIDEO_ASSETS,
+    PromptImageInput,
+    PromptVideoInput,
+    VllmRunner,
+)
 from ...utils import check_logprobs_close


@@ -34,28 +39,29 @@ def qwen2_vl_chat_template(*query):
    return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n"  # noqa: E501


-IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
-    "stop_sign":
-    qwen2_vl_chat_template(
-        IMAGE_PLACEHOLDER,
-        "What is the biggest text's content in this image?",
-    ),
-    "cherry_blossom":
-    qwen2_vl_chat_template(
-        IMAGE_PLACEHOLDER,
-        "What is the season shown in this image? ",
-        "Reply with a short sentence (no more than 20 words)",
-    ),
-})
+IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
+    {
+        "stop_sign": qwen2_vl_chat_template(
+            IMAGE_PLACEHOLDER,
+            "What is the biggest text's content in this image?",
+        ),
+        "cherry_blossom": qwen2_vl_chat_template(
+            IMAGE_PLACEHOLDER,
+            "What is the season shown in this image? ",
+            "Reply with a short sentence (no more than 20 words)",
+        ),
+    }
+)

-VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
-    "baby_reading":
-    qwen2_vl_chat_template(
-        VIDEO_PLACEHOLDER,
-        "Describe this video with a short sentence ",
-        "(no more than 20 words)",
-    ),
-})
+VIDEO_PROMPTS = VIDEO_ASSETS.prompts(
+    {
+        "baby_reading": qwen2_vl_chat_template(
+            VIDEO_PLACEHOLDER,
+            "Describe this video with a short sentence ",
+            "(no more than 20 words)",
+        ),
+    }
+)

 MULTIIMAGE_PROMPT = qwen2_vl_chat_template(
    IMAGE_PLACEHOLDER,
@@ -77,17 +83,19 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict):


 def batch_make_image_embeddings(
-        image_batches: list[Union[Image.Image, list[Image.Image]]], processor,
-        llm: VllmRunner) -> list[Qwen2VLPromptImageEmbeddingInput]:
+    image_batches: list[Union[Image.Image, list[Image.Image]]],
+    processor,
+    llm: VllmRunner,
+) -> list[Qwen2VLPromptImageEmbeddingInput]:
    """batched image embeddings for Qwen2-VL

-    This will infer all images' embeddings in a single batch, 
+    This will infer all images' embeddings in a single batch,
      and split the result according to input batches.

    image_batches:
      - Single-image batches: `list[Image.Image]`
      - Multiple-image batches: `list[list[Image.Image]]]`
-    
+
    returns: `list[Qwen2VLPromptImageEmbeddingInput]`
    """

@@ -108,9 +116,9 @@ def batch_make_image_embeddings(
    # image to pixel values
    image_processor = processor.image_processor

-    preprocess_result = image_processor \
-        .preprocess(images=images, return_tensors="pt") \
-        .data
+    preprocess_result = image_processor.preprocess(
+        images=images, return_tensors="pt"
+    ).data
    pixel_values = preprocess_result["pixel_values"]
    image_grid_thw = preprocess_result["image_grid_thw"]

@@ -119,12 +127,13 @@ def batch_make_image_embeddings(
        with torch.no_grad():
            visual = model.visual

-            pixel_values_on_device = pixel_values.to(visual.device,
-                                                     dtype=visual.dtype)
-            image_grid_thw_on_device = image_grid_thw.to(visual.device,
-                                                         dtype=torch.int64)
-            return visual(pixel_values_on_device,
-                          grid_thw=image_grid_thw_on_device).cpu()
+            pixel_values_on_device = pixel_values.to(visual.device, dtype=visual.dtype)
+            image_grid_thw_on_device = image_grid_thw.to(
+                visual.device, dtype=torch.int64
+            )
+            return visual(
+                pixel_values_on_device, grid_thw=image_grid_thw_on_device
+            ).cpu()

    image_embeds = torch.concat(llm.apply_model(get_image_embeds))

@@ -137,16 +146,21 @@ def batch_make_image_embeddings(
        merge_size = image_processor.merge_size
        cur_batch_embed_len = sum(
            grid_thw.prod(-1) // merge_size // merge_size
-            for grid_thw in image_grid_thw[image_counter:image_counter +
-                                           cur_batch_image_count])
+            for grid_thw in image_grid_thw[
+                image_counter : image_counter + cur_batch_image_count
+            ]
+        )

-        result.append({
-            "image_embeds":
-            image_embeds[embed_counter:embed_counter + cur_batch_embed_len],
-            "image_grid_thw":
-            image_grid_thw[image_counter:image_counter +
-                           cur_batch_image_count],
-        })
+        result.append(
+            {
+                "image_embeds": image_embeds[
+                    embed_counter : embed_counter + cur_batch_embed_len
+                ],
+                "image_grid_thw": image_grid_thw[
+                    image_counter : image_counter + cur_batch_image_count
+                ],
+            }
+        )

        embed_counter += cur_batch_embed_len
        image_counter += cur_batch_image_count
@@ -160,13 +174,13 @@ def batch_make_image_embeddings(


 def batch_make_video_embeddings(
-        video_batches: PromptVideoInput, processor,
-        llm: VllmRunner) -> list[Qwen2VLPromptVideoEmbeddingInput]:
+    video_batches: PromptVideoInput, processor, llm: VllmRunner
+) -> list[Qwen2VLPromptVideoEmbeddingInput]:
    """batched video embeddings for Qwen2-VL

    A NDArray represents a single video's all frames.

-    This will infer all videos' embeddings in a single batch, 
+    This will infer all videos' embeddings in a single batch,
      and split the result according to input batches.

    video_batches:
@@ -191,9 +205,9 @@ def batch_make_video_embeddings(
    # video to pixel values
    image_processor = processor.image_processor

-    preprocess_result = image_processor \
-        .preprocess(images=None, videos=videos, return_tensors="pt") \
-        .data
+    preprocess_result = image_processor.preprocess(
+        images=None, videos=videos, return_tensors="pt"
+    ).data
    pixel_values = preprocess_result["pixel_values_videos"]
    video_grid_thw = preprocess_result["video_grid_thw"]

@@ -202,12 +216,13 @@ def batch_make_video_embeddings(
        with torch.no_grad():
            visual = model.visual

-            pixel_values_on_device = pixel_values.to(visual.device,
-                                                     dtype=visual.dtype)
-            video_grid_thw_on_device = video_grid_thw.to(visual.device,
-                                                         dtype=torch.int64)
-            return visual(pixel_values_on_device,
-                          grid_thw=video_grid_thw_on_device).cpu()
+            pixel_values_on_device = pixel_values.to(visual.device, dtype=visual.dtype)
+            video_grid_thw_on_device = video_grid_thw.to(
+                visual.device, dtype=torch.int64
+            )
+            return visual(
+                pixel_values_on_device, grid_thw=video_grid_thw_on_device
+            ).cpu()

    video_embeds = torch.concat(llm.apply_model(get_image_embeds))

@@ -220,16 +235,21 @@ def batch_make_video_embeddings(
        merge_size = image_processor.merge_size
        cur_batch_embed_len = sum(
            grid_thw.prod(-1) // merge_size // merge_size
-            for grid_thw in video_grid_thw[video_counter:video_counter +
-                                           cur_batch_video_count])
+            for grid_thw in video_grid_thw[
+                video_counter : video_counter + cur_batch_video_count
+            ]
+        )

-        result.append({
-            "video_embeds":
-            video_embeds[embed_counter:embed_counter + cur_batch_embed_len],
-            "video_grid_thw":
-            video_grid_thw[video_counter:video_counter +
-                           cur_batch_video_count],
-        })
+        result.append(
+            {
+                "video_embeds": video_embeds[
+                    embed_counter : embed_counter + cur_batch_embed_len
+                ],
+                "video_grid_thw": video_grid_thw[
+                    video_counter : video_counter + cur_batch_video_count
+                ],
+            }
+        )

        embed_counter += cur_batch_embed_len
        video_counter += cur_batch_video_count
@@ -263,25 +283,24 @@ def run_embedding_input_test(

    # max_model_len should be greater than image_feature_size
    with vllm_runner(
-            model,
-            runner="generate",
-            max_model_len=4000,
-            max_num_seqs=3,
-            dtype=dtype,
-            limit_mm_per_prompt={
-                "image": mm_limit,
-                "video": mm_limit
-            },
-            tensor_parallel_size=tensor_parallel_size,
-            distributed_executor_backend=distributed_executor_backend,
-            default_torch_num_threads=1,
+        model,
+        runner="generate",
+        max_model_len=4000,
+        max_num_seqs=3,
+        dtype=dtype,
+        limit_mm_per_prompt={"image": mm_limit, "video": mm_limit},
+        tensor_parallel_size=tensor_parallel_size,
+        distributed_executor_backend=distributed_executor_backend,
+        default_torch_num_threads=1,
    ) as vllm_model:
        outputs_per_case_for_original_input = [
-            vllm_model.generate_greedy_logprobs(prompts,
-                                                max_tokens,
-                                                num_logprobs=num_logprobs,
-                                                images=images or None,
-                                                videos=videos or None)
+            vllm_model.generate_greedy_logprobs(
+                prompts,
+                max_tokens,
+                num_logprobs=num_logprobs,
+                images=images or None,
+                videos=videos or None,
+            )
            for prompts, images, videos in inputs
        ]

@@ -290,17 +309,19 @@ def run_embedding_input_test(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
-                images=batch_make_image_embeddings(
-                    images, processor, vllm_model) if images else None,
-                videos=batch_make_video_embeddings(
-                    videos, processor, vllm_model) if videos else None)
+                images=batch_make_image_embeddings(images, processor, vllm_model)
+                if images
+                else None,
+                videos=batch_make_video_embeddings(videos, processor, vllm_model)
+                if videos
+                else None,
+            )
            for prompts, images, videos in inputs
        ]

-    for outputs_for_original_input, \
-        outputs_for_embeddings_input \
-        in zip(outputs_per_case_for_original_input,
-            outputs_per_case_for_embeddings_input):
+    for outputs_for_original_input, outputs_for_embeddings_input in zip(
+        outputs_per_case_for_original_input, outputs_per_case_for_embeddings_input
+    ):
        check_logprobs_close(
            outputs_0_lst=outputs_for_original_input,
            outputs_1_lst=outputs_for_embeddings_input,
@@ -325,17 +346,26 @@ def run_embedding_input_test(
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
-def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model,
-                                         size_factors, dtype, max_tokens,
-                                         num_logprobs, monkeypatch) -> None:
+def test_qwen2_vl_image_embeddings_input(
+    vllm_runner,
+    image_assets,
+    model,
+    size_factors,
+    dtype,
+    max_tokens,
+    num_logprobs,
+    monkeypatch,
+) -> None:
    images = [asset.pil_image for asset in image_assets]

-    inputs_per_case: list[tuple[
-        list[str], PromptImageInput, PromptVideoInput]] = [(
+    inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
+        (
            [prompt for _ in size_factors],
            [rescale_image_size(image, factor) for factor in size_factors],
            [],
-        ) for image, prompt in zip(images, IMAGE_PROMPTS)]
+        )
+        for image, prompt in zip(images, IMAGE_PROMPTS)
+    ]

    run_embedding_input_test(
        vllm_runner,
@@ -366,21 +396,27 @@ def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model,
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
-def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets,
-                                                  model, size_factors,
-                                                  dtype: str, max_tokens: int,
-                                                  num_logprobs: int) -> None:
+def test_qwen2_vl_multiple_image_embeddings_input(
+    vllm_runner,
+    image_assets,
+    model,
+    size_factors,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
    images = [asset.pil_image for asset in image_assets]

-    inputs_per_case: list[tuple[list[str], PromptImageInput,
-                                PromptVideoInput]] = [(
-                                    [MULTIIMAGE_PROMPT for _ in size_factors],
-                                    [[
-                                        rescale_image_size(image, factor)
-                                        for image in images
-                                    ] for factor in size_factors],
-                                    [],
-                                )]
+    inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
+        (
+            [MULTIIMAGE_PROMPT for _ in size_factors],
+            [
+                [rescale_image_size(image, factor) for image in images]
+                for factor in size_factors
+            ],
+            [],
+        )
+    ]

    run_embedding_input_test(
        vllm_runner,
@@ -410,22 +446,29 @@ def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets,
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
-def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model,
-                                         size_factors, dtype: str,
-                                         max_tokens: int,
-                                         num_logprobs: int) -> None:
+def test_qwen2_vl_video_embeddings_input(
+    vllm_runner,
+    video_assets,
+    model,
+    size_factors,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
    num_frames = 4
    sampled_vids = [
        sample_frames_from_video(asset.np_ndarrays, num_frames)
        for asset in video_assets
    ]

-    inputs_per_case: list[tuple[
-        list[str], PromptImageInput, PromptVideoInput]] = [(
+    inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
+        (
            [prompt for _ in size_factors],
            [],
            [rescale_video_size(video, factor) for factor in size_factors],
-        ) for video, prompt in zip(sampled_vids, VIDEO_PROMPTS)]
+        )
+        for video, prompt in zip(sampled_vids, VIDEO_PROMPTS)
+    ]

    run_embedding_input_test(
        vllm_runner,
--- a/tests/models/multimodal/generation/test_ultravox.py
+++ b/tests/models/multimodal/generation/test_ultravox.py
@@ -15,12 +15,12 @@ from ...registry import HF_EXAMPLE_MODELS

 MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"

-AUDIO_PROMPTS = AUDIO_ASSETS.prompts({
-    "mary_had_lamb":
-    "Transcribe this into English.",
-    "winning_call":
-    "What is happening in this audio clip?",
-})
+AUDIO_PROMPTS = AUDIO_ASSETS.prompts(
+    {
+        "mary_had_lamb": "Transcribe this into English.",
+        "winning_call": "What is happening in this audio clip?",
+    }
+)

 MULTI_AUDIO_PROMPT = "Describe each of the audios above."

@@ -33,7 +33,7 @@ CHUNKED_PREFILL_KWARGS = {
    "enable_chunked_prefill": True,
    "max_num_seqs": 2,
    # Use a very small limit to exercise chunked prefill.
-    "max_num_batched_tokens": 16
+    "max_num_batched_tokens": 16,
 }


@@ -43,27 +43,33 @@ def params_kwargs_to_cli_args(params_kwargs: dict[str, Any]) -> list[str]:
    for key, value in params_kwargs.items():
        if isinstance(value, bool):
            if value:
-                args.append(f"--{key.replace('_','-')}")
+                args.append(f"--{key.replace('_', '-')}")
        else:
-            args.append(f"--{key.replace('_','-')}={value}")
+            args.append(f"--{key.replace('_', '-')}={value}")
    return args


-@pytest.fixture(params=[
-    pytest.param({}, marks=pytest.mark.cpu_model),
-    pytest.param(CHUNKED_PREFILL_KWARGS),
-])
+@pytest.fixture(
+    params=[
+        pytest.param({}, marks=pytest.mark.cpu_model),
+        pytest.param(CHUNKED_PREFILL_KWARGS),
+    ]
+)
 def server(request, audio_assets: AudioTestAssets):
    args = [
-        "--dtype", "bfloat16", "--max-model-len", "4096", "--enforce-eager",
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "4096",
+        "--enforce-eager",
        "--limit-mm-per-prompt",
-        json.dumps({"audio": len(audio_assets)}), "--trust-remote-code"
+        json.dumps({"audio": len(audio_assets)}),
+        "--trust-remote-code",
    ] + params_kwargs_to_cli_args(request.param)

-    with RemoteOpenAIServer(MODEL_NAME,
-                            args,
-                            env_dict={"VLLM_AUDIO_FETCH_TIMEOUT":
-                                      "30"}) as remote_server:
+    with RemoteOpenAIServer(
+        MODEL_NAME, args, env_dict={"VLLM_AUDIO_FETCH_TIMEOUT": "30"}
+    ) as remote_server:
        yield remote_server


@@ -77,12 +83,11 @@ def _get_prompt(audio_count, question, placeholder):
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    placeholder = f"{placeholder}\n" * audio_count

-    return tokenizer.apply_chat_template([{
-        'role': 'user',
-        'content': f"{placeholder}{question}"
-    }],
-                                         tokenize=False,
-                                         add_generation_prompt=True)
+    return tokenizer.apply_chat_template(
+        [{"role": "user", "content": f"{placeholder}{question}"}],
+        tokenize=False,
+        add_generation_prompt=True,
+    )


 def run_multi_audio_test(
@@ -99,19 +104,21 @@ def run_multi_audio_test(
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")

-    with vllm_runner(model,
-                     dtype=dtype,
-                     enforce_eager=True,
-                     limit_mm_per_prompt={
-                         "audio":
-                         max((len(audio) for _, audio in prompts_and_audios))
-                     },
-                     **kwargs) as vllm_model:
+    with vllm_runner(
+        model,
+        dtype=dtype,
+        enforce_eager=True,
+        limit_mm_per_prompt={
+            "audio": max((len(audio) for _, audio in prompts_and_audios))
+        },
+        **kwargs,
+    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            [prompt for prompt, _ in prompts_and_audios],
            max_tokens,
            num_logprobs=num_logprobs,
-            audios=[audios for _, audios in prompts_and_audios])
+            audios=[audios for _, audios in prompts_and_audios],
+        )

    # The HuggingFace model doesn't support multiple audios yet, so
    # just assert that some tokens were generated.
@@ -122,21 +129,25 @@ def run_multi_audio_test(
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
-@pytest.mark.parametrize("vllm_kwargs", [
-    pytest.param({}, marks=pytest.mark.cpu_model),
-    pytest.param(CHUNKED_PREFILL_KWARGS),
-])
-def test_models_with_multiple_audios(vllm_runner,
-                                     audio_assets: AudioTestAssets, dtype: str,
-                                     max_tokens: int, num_logprobs: int,
-                                     vllm_kwargs: dict) -> None:
-
-    vllm_prompt = _get_prompt(len(audio_assets), MULTI_AUDIO_PROMPT,
-                              VLLM_PLACEHOLDER)
+@pytest.mark.parametrize(
+    "vllm_kwargs",
+    [
+        pytest.param({}, marks=pytest.mark.cpu_model),
+        pytest.param(CHUNKED_PREFILL_KWARGS),
+    ],
+)
+def test_models_with_multiple_audios(
+    vllm_runner,
+    audio_assets: AudioTestAssets,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    vllm_kwargs: dict,
+) -> None:
+    vllm_prompt = _get_prompt(len(audio_assets), MULTI_AUDIO_PROMPT, VLLM_PLACEHOLDER)
    run_multi_audio_test(
        vllm_runner,
-        [(vllm_prompt, [audio.audio_and_sample_rate
-                        for audio in audio_assets])],
+        [(vllm_prompt, [audio.audio_and_sample_rate for audio in audio_assets])],
        MODEL_NAME,
        dtype=dtype,
        max_tokens=max_tokens,
@@ -149,28 +160,25 @@ def test_models_with_multiple_audios(vllm_runner,
 async def test_online_serving(client, audio_assets: AudioTestAssets):
    """Exercises online serving with/without chunked prefill enabled."""

-    messages = [{
-        "role":
-        "user",
-        "content": [
-            *[{
-                "type": "audio_url",
-                "audio_url": {
-                    "url": audio.url
-                }
-            } for audio in audio_assets],
-            {
-                "type":
-                "text",
-                "text":
-                f"What's happening in these {len(audio_assets)} audio clips?"
-            },
-        ],
-    }]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *[
+                    {"type": "audio_url", "audio_url": {"url": audio.url}}
+                    for audio in audio_assets
+                ],
+                {
+                    "type": "text",
+                    "text": f"What's happening in these {len(audio_assets)} audio clips?",
+                },
+            ],
+        }
+    ]

-    chat_completion = await client.chat.completions.create(model=MODEL_NAME,
-                                                           messages=messages,
-                                                           max_tokens=10)
+    chat_completion = await client.chat.completions.create(
+        model=MODEL_NAME, messages=messages, max_tokens=10
+    )

    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
--- a/tests/models/multimodal/generation/test_voxtral.py
+++ b/tests/models/multimodal/generation/test_voxtral.py
@@ -6,8 +6,12 @@ import json
 import pytest
 import pytest_asyncio
 from mistral_common.audio import Audio
-from mistral_common.protocol.instruct.messages import (AudioChunk, RawAudio,
-                                                       TextChunk, UserMessage)
+from mistral_common.protocol.instruct.messages import (
+    AudioChunk,
+    RawAudio,
+    TextChunk,
+    UserMessage,
+)

 from vllm.transformers_utils.tokenizer import MistralTokenizer

@@ -17,8 +21,12 @@ from .test_ultravox import MULTI_AUDIO_PROMPT, run_multi_audio_test

 MODEL_NAME = "mistralai/Voxtral-Mini-3B-2507"
 MISTRAL_FORMAT_ARGS = [
-    "--tokenizer_mode", "mistral", "--config_format", "mistral",
-    "--load_format", "mistral"
+    "--tokenizer_mode",
+    "mistral",
+    "--config_format",
+    "mistral",
+    "--load_format",
+    "mistral",
 ]


@@ -30,10 +38,9 @@ def server(request, audio_assets: AudioTestAssets):
        json.dumps({"audio": len(audio_assets)}),
    ] + MISTRAL_FORMAT_ARGS

-    with RemoteOpenAIServer(MODEL_NAME,
-                            args,
-                            env_dict={"VLLM_AUDIO_FETCH_TIMEOUT":
-                                      "30"}) as remote_server:
+    with RemoteOpenAIServer(
+        MODEL_NAME, args, env_dict={"VLLM_AUDIO_FETCH_TIMEOUT": "30"}
+    ) as remote_server:
        yield remote_server


@@ -64,15 +71,17 @@ def _get_prompt(audio_assets, question):
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
-def test_models_with_multiple_audios(vllm_runner,
-                                     audio_assets: AudioTestAssets, dtype: str,
-                                     max_tokens: int,
-                                     num_logprobs: int) -> None:
+def test_models_with_multiple_audios(
+    vllm_runner,
+    audio_assets: AudioTestAssets,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
    vllm_prompt = _get_prompt(audio_assets, MULTI_AUDIO_PROMPT)
    run_multi_audio_test(
        vllm_runner,
-        [(vllm_prompt, [audio.audio_and_sample_rate
-                        for audio in audio_assets])],
+        [(vllm_prompt, [audio.audio_and_sample_rate for audio in audio_assets])],
        MODEL_NAME,
        dtype=dtype,
        max_tokens=max_tokens,
@@ -92,23 +101,22 @@ async def test_online_serving(client, audio_assets: AudioTestAssets):
        return audio_dict

    audio_chunks = [asset_to_chunk(asset) for asset in audio_assets]
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            *audio_chunks,
-            {
-                "type":
-                "text",
-                "text":
-                f"What's happening in these {len(audio_assets)} audio clips?"
-            },
-        ],
-    }]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *audio_chunks,
+                {
+                    "type": "text",
+                    "text": f"What's happening in these {len(audio_assets)} audio clips?",
+                },
+            ],
+        }
+    ]

-    chat_completion = await client.chat.completions.create(model=MODEL_NAME,
-                                                           messages=messages,
-                                                           max_tokens=10)
+    chat_completion = await client.chat.completions.create(
+        model=MODEL_NAME, messages=messages, max_tokens=10
+    )

    assert len(chat_completion.choices) == 1
    choice = chat_completion.choices[0]
--- a/tests/models/multimodal/generation/test_whisper.py
+++ b/tests/models/multimodal/generation/test_whisper.py
@@ -12,8 +12,7 @@ from ....utils import create_new_process_for_each_test, multi_gpu_test

 PROMPTS = [
    {
-        "prompt":
-        "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
+        "prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
        "multi_modal_data": {
            "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
        },
@@ -25,9 +24,8 @@ PROMPTS = [
                "audio": AudioAsset("winning_call").audio_and_sample_rate,
            },
        },
-        "decoder_prompt":
-        "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
-    }
+        "decoder_prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
+    },
 ]

 EXPECTED = {
@@ -41,7 +39,7 @@ EXPECTED = {
        " is June and the third base. They're going to wave him in. The throw"
        " to the plate will be late. The Mariners are going to play for the"
        " American League Championship. I don't believe it. It just continues"
-        " by all five."
+        " by all five.",
    ],
    "openai/whisper-small": [
        " The first words I spoke in the original pornograph. A little piece"
@@ -51,7 +49,7 @@ EXPECTED = {
        " comes joy. Here is Junior to third base. They're gonna wave him"
        " in. The throw to the plate will be late. The Mariners are going to"
        " play for the American League Championship. I don't believe it. It"
-        " just continues. My, oh my."
+        " just continues. My, oh my.",
    ],
    "openai/whisper-medium": [
        " The first words I spoke in the original phonograph, a little piece"
@@ -62,7 +60,7 @@ EXPECTED = {
        " Jorgen at third base. They're going to wave him in. The throw to the"
        " plate will be late. The Mariners are going to play for the American"
        " League Championship. I don't believe it. It just continues. My, oh"
-        " my."
+        " my.",
    ],
    "openai/whisper-large-v3": [
        " The first words I spoke in the original phonograph, a little piece"
@@ -73,7 +71,7 @@ EXPECTED = {
        " Junior to third base. They're going to wave him in. The throw to the"
        " plate will be late. The Mariners are going to play for the American"
        " League Championship. I don't believe it. It just continues. My, oh,"
-        " my."
+        " my.",
    ],
    "openai/whisper-large-v3-turbo": [
        " The first words I spoke in the original phonograph, a little piece"
@@ -84,8 +82,8 @@ EXPECTED = {
        " Junior to third base. They're going to wave him in. The throw to the"
        " plate will be late. The Mariners are going to play for the American"
        " League Championship. I don't believe it. It just continues. My, oh,"
-        " my."
-    ]
+        " my.",
+    ],
 }


@@ -100,11 +98,11 @@ def run_test(
    expected_list = EXPECTED[model] * 10

    with vllm_runner(
-            model,
-            dtype="half",
-            max_model_len=448,
-            tensor_parallel_size=tensor_parallel_size,
-            distributed_executor_backend=distributed_executor_backend,
+        model,
+        dtype="half",
+        max_model_len=448,
+        tensor_parallel_size=tensor_parallel_size,
+        distributed_executor_backend=distributed_executor_backend,
    ) as vllm_model:
        llm = vllm_model.llm

--- a/tests/models/multimodal/generation/vlm_utils/builders.py
+++ b/tests/models/multimodal/generation/vlm_utils/builders.py
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Helpers for building inputs that can be leveraged for different test types.
-"""
+"""Helpers for building inputs that can be leveraged for different test types."""
+
 from collections.abc import Iterable
 from pathlib import PosixPath
 from typing import Callable, Optional, Union
@@ -10,20 +10,30 @@ import torch

 from vllm.multimodal.audio import AudioResampler
 from vllm.multimodal.image import rescale_image_size
-from vllm.multimodal.video import (rescale_video_size, resize_video,
-                                   sample_frames_from_video)
+from vllm.multimodal.video import (
+    rescale_video_size,
+    resize_video,
+    sample_frames_from_video,
+)

 from .....conftest import AudioTestAssets, ImageTestAssets, VideoTestAssets
-from .types import (SINGLE_AUDIO_BASE_PROMPT, SINGLE_IMAGE_BASE_PROMPTS,
-                    TEST_AUDIO_PLACEHOLDER, TEST_IMG_PLACEHOLDER,
-                    TEST_VIDEO_PLACEHOLDER, VIDEO_BASE_PROMPT,
-                    ImageSizeWrapper, PromptWithMultiModalInput, SizeType,
-                    VLMTestInfo)
+from .types import (
+    SINGLE_AUDIO_BASE_PROMPT,
+    SINGLE_IMAGE_BASE_PROMPTS,
+    TEST_AUDIO_PLACEHOLDER,
+    TEST_IMG_PLACEHOLDER,
+    TEST_VIDEO_PLACEHOLDER,
+    VIDEO_BASE_PROMPT,
+    ImageSizeWrapper,
+    PromptWithMultiModalInput,
+    SizeType,
+    VLMTestInfo,
+)


-def replace_test_placeholder(prompt: str, mm_idx_to_prompt: Callable[[int],
-                                                                     str],
-                             test_placeholder: str) -> str:
+def replace_test_placeholder(
+    prompt: str, mm_idx_to_prompt: Callable[[int], str], test_placeholder: str
+) -> str:
    """Given a prompt, replaces each test placeholder with the
    model-specific tag.
    """
@@ -35,11 +45,13 @@ def replace_test_placeholder(prompt: str, mm_idx_to_prompt: Callable[[int],
    return img_prompt


-def get_model_prompts(base_prompts: Iterable[str],
-                      img_idx_to_prompt: Optional[Callable[[int], str]],
-                      video_idx_to_prompt: Optional[Callable[[int], str]],
-                      audio_idx_to_prompt: Optional[Callable[[int], str]],
-                      prompt_formatter: Callable[[str], str]) -> list[str]:
+def get_model_prompts(
+    base_prompts: Iterable[str],
+    img_idx_to_prompt: Optional[Callable[[int], str]],
+    video_idx_to_prompt: Optional[Callable[[int], str]],
+    audio_idx_to_prompt: Optional[Callable[[int], str]],
+    prompt_formatter: Callable[[str], str],
+) -> list[str]:
    """Given a model-agnostic base prompt and test configuration for a model(s)
    to be tested, update the media placeholders and apply the prompt formatting
    to get the test prompt string for this model.
@@ -56,19 +68,19 @@ def get_model_prompts(base_prompts: Iterable[str],
        # Replace the multimodal placeholders in the base prompt with
        # the correct ones for the model that we are testing
        if img_idx_to_prompt:
-            base_prompt = replace_test_placeholder(base_prompt,
-                                                   img_idx_to_prompt,
-                                                   TEST_IMG_PLACEHOLDER)
+            base_prompt = replace_test_placeholder(
+                base_prompt, img_idx_to_prompt, TEST_IMG_PLACEHOLDER
+            )

        if video_idx_to_prompt:
-            base_prompt = replace_test_placeholder(base_prompt,
-                                                   video_idx_to_prompt,
-                                                   TEST_VIDEO_PLACEHOLDER)
+            base_prompt = replace_test_placeholder(
+                base_prompt, video_idx_to_prompt, TEST_VIDEO_PLACEHOLDER
+            )

        if audio_idx_to_prompt:
-            base_prompt = replace_test_placeholder(base_prompt,
-                                                   audio_idx_to_prompt,
-                                                   TEST_AUDIO_PLACEHOLDER)
+            base_prompt = replace_test_placeholder(
+                base_prompt, audio_idx_to_prompt, TEST_AUDIO_PLACEHOLDER
+            )

        # Apply the prompt formatter to wrap the base prompt with
        # the correct media placeholders to get the model test prompt
@@ -84,14 +96,15 @@ def build_single_image_inputs_from_test_info(
    tmp_path: Optional[PosixPath] = None,
 ) -> list[PromptWithMultiModalInput]:
    if test_info.prompt_formatter is None:
-        raise ValueError(
-            "Prompt formatter must be set to build single image inputs")
+        raise ValueError("Prompt formatter must be set to build single image inputs")

-    model_prompts = get_model_prompts(test_info.single_image_prompts,
-                                      test_info.img_idx_to_prompt,
-                                      test_info.video_idx_to_prompt,
-                                      test_info.audio_idx_to_prompt,
-                                      test_info.prompt_formatter)
+    model_prompts = get_model_prompts(
+        test_info.single_image_prompts,
+        test_info.img_idx_to_prompt,
+        test_info.video_idx_to_prompt,
+        test_info.audio_idx_to_prompt,
+        test_info.prompt_formatter,
+    )

    # For models that require a local path / URL encoded in the image; export
    # assets and encode into tmp_path for this test. This should be avoided
@@ -110,8 +123,8 @@ def build_single_image_inputs_from_test_info(


 def build_single_image_inputs(
-        images, model_prompts,
-        size_wrapper: ImageSizeWrapper) -> list[PromptWithMultiModalInput]:
+    images, model_prompts, size_wrapper: ImageSizeWrapper
+) -> list[PromptWithMultiModalInput]:
    # For every image / prompt pair, get a pair containing two lists of
    # length size_factors, where the first contains duplicates of the model
    # prompt [str], and the second contains copies of the image after being
@@ -125,7 +138,8 @@ def build_single_image_inputs(
                apply_image_size_scaling(image, size, size_wrapper.type)
                for size in size_wrapper.data
            ],
-        ) for image, prompt in zip(images, model_prompts)
+        )
+        for image, prompt in zip(images, model_prompts)
    ]


@@ -136,14 +150,15 @@ def build_multi_image_inputs_from_test_info(
    tmp_path: Optional[PosixPath] = None,
 ) -> list[PromptWithMultiModalInput]:
    if test_info.prompt_formatter is None:
-        raise ValueError(
-            "Prompt formatter must be set to build multi image inputs")
+        raise ValueError("Prompt formatter must be set to build multi image inputs")

-    model_prompts = get_model_prompts([test_info.multi_image_prompt],
-                                      test_info.img_idx_to_prompt,
-                                      test_info.video_idx_to_prompt,
-                                      test_info.audio_idx_to_prompt,
-                                      test_info.prompt_formatter)
+    model_prompts = get_model_prompts(
+        [test_info.multi_image_prompt],
+        test_info.img_idx_to_prompt,
+        test_info.video_idx_to_prompt,
+        test_info.audio_idx_to_prompt,
+        test_info.prompt_formatter,
+    )

    if test_info.prompt_path_encoder is not None:
        if tmp_path is None:
@@ -164,16 +179,20 @@ def build_multi_image_inputs_from_test_info(


 def build_multi_image_inputs(
-        image_lists, model_prompts,
-        size_wrapper: ImageSizeWrapper) -> list[PromptWithMultiModalInput]:
+    image_lists, model_prompts, size_wrapper: ImageSizeWrapper
+) -> list[PromptWithMultiModalInput]:
    return [
        PromptWithMultiModalInput(
            prompts=[prompt for _ in size_wrapper.data],
-            image_data=[[
-                apply_image_size_scaling(image, size, size_wrapper.type)
-                for image in images
-            ] for size in size_wrapper.data],
-        ) for images, prompt in zip(image_lists, model_prompts)
+            image_data=[
+                [
+                    apply_image_size_scaling(image, size, size_wrapper.type)
+                    for image in images
+                ]
+                for size in size_wrapper.data
+            ],
+        )
+        for images, prompt in zip(image_lists, model_prompts)
    ]


@@ -185,10 +204,10 @@ def build_embedding_inputs_from_test_info(
    # These conditions will always be true if invoked through filtering,
    # but we still check them in case this is ever called directly
    if test_info.prompt_formatter is None:
-        raise ValueError(
-            "Prompt formatter must be set to build image embedding inputs")
-    if size_wrapper.type != SizeType.SIZE_FACTOR or not \
-            all(factor == 1.0 for factor in size_wrapper.data):
+        raise ValueError("Prompt formatter must be set to build image embedding inputs")
+    if size_wrapper.type != SizeType.SIZE_FACTOR or not all(
+        factor == 1.0 for factor in size_wrapper.data
+    ):
        raise ValueError("Embedding tests require constant (1.0) size factors")
    if test_info.convert_assets_to_embeddings is None:
        raise ValueError("No conversion func for getting embeddings found")
@@ -209,8 +228,7 @@ def build_embedding_inputs_from_test_info(
    assert len(images) == len(model_prompts)

    inputs = build_single_image_inputs(images, model_prompts, size_wrapper)
-    vllm_embeddings = build_single_image_inputs(embeds, model_prompts,
-                                                size_wrapper)
+    vllm_embeddings = build_single_image_inputs(embeds, model_prompts, size_wrapper)
    return inputs, vllm_embeddings


@@ -235,21 +253,22 @@ def build_video_inputs_from_test_info(
        for asset in video_assets
    ]

-    video_scaler = (resize_video if size_wrapper.type == SizeType.FIXED_SIZE
-                    else rescale_video_size)
+    video_scaler = (
+        resize_video if size_wrapper.type == SizeType.FIXED_SIZE else rescale_video_size
+    )

    return [
        PromptWithMultiModalInput(
            prompts=[prompt for _ in size_wrapper.data],
-            video_data=[
-                video_scaler(video, size) for size in size_wrapper.data
-            ],
-        ) for video, prompt in zip(sampled_vids, model_prompts)
+            video_data=[video_scaler(video, size) for size in size_wrapper.data],
+        )
+        for video, prompt in zip(sampled_vids, model_prompts)
    ]


-def apply_image_size_scaling(image, size: Union[float, tuple[int, int]],
-                             size_type: SizeType):
+def apply_image_size_scaling(
+    image, size: Union[float, tuple[int, int]], size_type: SizeType
+):
    """Applies a size scaler to one image; this can be an image size factor,
    which scales the image while maintaining the aspect ratio"""
    # Special case for embeddings; if it's a tensor, it's only valid if we
@@ -285,13 +304,16 @@ def build_audio_inputs_from_test_info(
        method="librosa",
    )
    audios = [asset.audio_and_sample_rate for asset in audio_assets]
-    resampled_audios = [(
-        resampler.resample(
-            audio,
-            orig_sr=sr,
-        ),
-        int(resampler.target_sr),
-    ) for audio, sr in audios]
+    resampled_audios = [
+        (
+            resampler.resample(
+                audio,
+                orig_sr=sr,
+            ),
+            int(resampler.target_sr),
+        )
+        for audio, sr in audios
+    ]

    return [
        PromptWithMultiModalInput(
--- a/tests/models/multimodal/generation/vlm_utils/case_filtering.py
+++ b/tests/models/multimodal/generation/vlm_utils/case_filtering.py
@@ -4,19 +4,28 @@
 modality, getting all combinations (similar to pytest's parametrization),
 handling multimodal placeholder substitution, and so on.
 """
+
 import itertools
 from collections import OrderedDict
 from collections.abc import Iterable

 import pytest

-from .types import (EMBEDDING_SIZE_FACTORS, ExpandableVLMTestArgs,
-                    ImageSizeWrapper, SizeType, VLMTestInfo, VLMTestType)
+from .types import (
+    EMBEDDING_SIZE_FACTORS,
+    ExpandableVLMTestArgs,
+    ImageSizeWrapper,
+    SizeType,
+    VLMTestInfo,
+    VLMTestType,
+)


 def get_filtered_test_settings(
-        test_settings: dict[str, VLMTestInfo], test_type: VLMTestType,
-        new_proc_per_test: bool) -> dict[str, VLMTestInfo]:
+    test_settings: dict[str, VLMTestInfo],
+    test_type: VLMTestType,
+    new_proc_per_test: bool,
+) -> dict[str, VLMTestInfo]:
    """Given the dict of potential test settings to run, return a subdict
    of tests who have the current test type enabled with the matching val for
    fork_per_test.
@@ -25,7 +34,8 @@ def get_filtered_test_settings(
    def matches_test_type(test_info: VLMTestInfo, test_type: VLMTestType):
        return test_info.test_type == test_type or (
            isinstance(test_info.test_type, Iterable)
-            and test_type in test_info.test_type)
+            and test_type in test_info.test_type
+        )

    matching_tests = {}
    for test_name, test_info in test_settings.items():
@@ -36,62 +46,69 @@ def get_filtered_test_settings(
                assert test_info.convert_assets_to_embeddings is not None
            # Custom test inputs need to explicitly define the mm limit/inputs
            if matches_test_type(test_info, VLMTestType.CUSTOM_INPUTS):
-                assert (test_info.custom_test_opts is not None
-                        and isinstance(test_info.custom_test_opts, Iterable))
+                assert test_info.custom_test_opts is not None and isinstance(
+                    test_info.custom_test_opts, Iterable
+                )
            # For all types besides custom inputs, we need a prompt formatter
            else:
                assert test_info.prompt_formatter is not None

            # Everything looks okay; keep if this is correct proc handling
-            if (test_info.distributed_executor_backend
-                    is not None) == new_proc_per_test:
+            if (
+                test_info.distributed_executor_backend is not None
+            ) == new_proc_per_test:
                matching_tests[test_name] = test_info

    return matching_tests


-def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
-                             test_type: VLMTestType,
-                             create_new_process_for_each_test: bool):
+def get_parametrized_options(
+    test_settings: dict[str, VLMTestInfo],
+    test_type: VLMTestType,
+    create_new_process_for_each_test: bool,
+):
    """Converts all of our VLMTestInfo into an expanded list of parameters.
    This is similar to nesting pytest parametrize calls, but done directly
    through an itertools product so that each test can set things like
    size factors etc, while still running in isolated test cases.
    """
    matching_tests = get_filtered_test_settings(
-        test_settings, test_type, create_new_process_for_each_test)
+        test_settings, test_type, create_new_process_for_each_test
+    )

    # Ensure that something is wrapped as an iterable it's not already
-    ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e, )
+    ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e,)

    def get_model_type_cases(model_type: str, test_info: VLMTestInfo):
        # This is essentially the same as nesting a bunch of mark.parametrize
        # decorators, but we do it programmatically to allow overrides for on
        # a per-model basis, while still being able to execute each of these
        # as individual test cases in pytest.
-        iter_kwargs = OrderedDict([
-            ("model", ensure_wrapped(test_info.models)),
-            ("max_tokens", ensure_wrapped(test_info.max_tokens)),
-            ("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
-            ("dtype", ensure_wrapped(test_info.dtype)),
-            ("distributed_executor_backend",
-             ensure_wrapped(test_info.distributed_executor_backend)),
-        ])
+        iter_kwargs = OrderedDict(
+            [
+                ("model", ensure_wrapped(test_info.models)),
+                ("max_tokens", ensure_wrapped(test_info.max_tokens)),
+                ("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
+                ("dtype", ensure_wrapped(test_info.dtype)),
+                (
+                    "distributed_executor_backend",
+                    ensure_wrapped(test_info.distributed_executor_backend),
+                ),
+            ]
+        )

        # num_frames is video only
        if test_type == VLMTestType.VIDEO:
-            iter_kwargs["num_video_frames"] = ensure_wrapped(
-                test_info.num_video_frames)
+            iter_kwargs["num_video_frames"] = ensure_wrapped(test_info.num_video_frames)

        # No sizes passed for custom inputs, since inputs are directly provided
        if test_type not in (VLMTestType.CUSTOM_INPUTS, VLMTestType.AUDIO):
            wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
            if wrapped_sizes is None:
-                raise ValueError(
-                    f"Sizes must be set for test type {test_type}")
+                raise ValueError(f"Sizes must be set for test type {test_type}")
            iter_kwargs["size_wrapper"] = wrapped_sizes

-        #Otherwise expand the custom test options instead
+        # Otherwise expand the custom test options instead
        elif test_type == VLMTestType.CUSTOM_INPUTS:
            if test_info.custom_test_opts is None:
                raise ValueError("Test has type CUSTOM_INPUTS, but none given")
@@ -121,8 +138,8 @@ def get_parametrized_options(test_settings: dict[str, VLMTestInfo],


 def get_wrapped_test_sizes(
-        test_info: VLMTestInfo,
-        test_type: VLMTestType) -> tuple[ImageSizeWrapper, ...]:
+    test_info: VLMTestInfo, test_type: VLMTestType
+) -> tuple[ImageSizeWrapper, ...]:
    """Given a test info which may have size factors or fixed sizes, wrap them
    and combine them into an iterable, each of which will be used in parameter
    expansion.
@@ -133,18 +150,18 @@ def get_wrapped_test_sizes(
    """
    # If it is an embedding test, we always use the EMBEDDING_SIZE_FACTORS
    if test_type == VLMTestType.EMBEDDING:
-        return tuple([
-            ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
-            for factor in EMBEDDING_SIZE_FACTORS
-        ])
+        return tuple(
+            [
+                ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
+                for factor in EMBEDDING_SIZE_FACTORS
+            ]
+        )
    # Audio and Custom inputs have preprocessed inputs
    elif test_type in (VLMTestType.AUDIO, VLMTestType.CUSTOM_INPUTS):
        return tuple()

-    size_factors = test_info.image_size_factors \
-        if test_info.image_size_factors else []
-    fixed_sizes = test_info.image_sizes \
-        if test_info.image_sizes else []
+    size_factors = test_info.image_size_factors if test_info.image_size_factors else []
+    fixed_sizes = test_info.image_sizes if test_info.image_sizes else []

    wrapped_factors = [
        ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
@@ -152,8 +169,7 @@ def get_wrapped_test_sizes(
    ]

    wrapped_sizes = [
-        ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size)
-        for size in fixed_sizes
+        ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size) for size in fixed_sizes
    ]

    return tuple(wrapped_factors + wrapped_sizes)
--- a/tests/models/multimodal/generation/vlm_utils/core.py
+++ b/tests/models/multimodal/generation/vlm_utils/core.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Core test implementation to be shared across modalities."""
+
 from typing import Any, Callable, Optional

 import torch
@@ -70,22 +71,23 @@ def run_test(
    if model_info.hf_overrides:
        vllm_runner_kwargs_["hf_overrides"] = model_info.hf_overrides
    if model_info.skip_tokenizer_init:
-        vllm_runner_kwargs_[
-            "skip_tokenizer_init"] = model_info.skip_tokenizer_init
+        vllm_runner_kwargs_["skip_tokenizer_init"] = model_info.skip_tokenizer_init

    if vllm_runner_kwargs:
        vllm_runner_kwargs_.update(vllm_runner_kwargs)

-    with vllm_runner(model,
-                     max_model_len=max_model_len,
-                     max_num_seqs=max_num_seqs,
-                     dtype=dtype,
-                     limit_mm_per_prompt=limit_mm_per_prompt,
-                     tensor_parallel_size=tensor_parallel_size,
-                     distributed_executor_backend=distributed_executor_backend,
-                     enforce_eager=enforce_eager,
-                     runner=runner,
-                     **vllm_runner_kwargs_) as vllm_model:
+    with vllm_runner(
+        model,
+        max_model_len=max_model_len,
+        max_num_seqs=max_num_seqs,
+        dtype=dtype,
+        limit_mm_per_prompt=limit_mm_per_prompt,
+        tensor_parallel_size=tensor_parallel_size,
+        distributed_executor_backend=distributed_executor_backend,
+        enforce_eager=enforce_eager,
+        runner=runner,
+        **vllm_runner_kwargs_,
+    ) as vllm_model:
        tokenizer = vllm_model.llm.get_tokenizer()

        vllm_kwargs: dict[str, Any] = {}
@@ -95,21 +97,19 @@ def run_test(
            vllm_kwargs["stop"] = stop_str

        for prompts, image_data, video_data, audio_data in vllm_inputs:
-            mm_data = dict(images=image_data,
-                           videos=video_data,
-                           audios=audio_data)
+            mm_data = dict(images=image_data, videos=video_data, audios=audio_data)
            vllm_kwargs_with_mm_data = vllm_kwargs | mm_data
            vllm_output = vllm_model.generate_greedy_logprobs(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
-                **vllm_kwargs_with_mm_data)
+                **vllm_kwargs_with_mm_data,
+            )
            vllm_outputs_per_mm.append(vllm_output)

-    hf_model = hf_runner(model,
-                         dtype=dtype,
-                         auto_cls=auto_cls,
-                         model_kwargs=hf_model_kwargs)
+    hf_model = hf_runner(
+        model, dtype=dtype, auto_cls=auto_cls, model_kwargs=hf_model_kwargs
+    )

    # Some models need to patch things like the model processor, e.g., internvl
    if patch_hf_runner is not None:
@@ -129,16 +129,15 @@ def run_test(
            hf_kwargs["stop_strings"] = stop_str

        for prompts, image_data, video_data, audio_data in inputs:
-            mm_data = dict(images=image_data,
-                           videos=video_data,
-                           audios=audio_data)
+            mm_data = dict(images=image_data, videos=video_data, audios=audio_data)
            hf_kwargs_with_mm_data = hf_kwargs | mm_data
            hf_output = hf_model.generate_greedy_logprobs_limit(
                prompts,
                max_tokens,
                num_logprobs=num_logprobs,
                tokenizer=tokenizer,
-                **hf_kwargs_with_mm_data)
+                **hf_kwargs_with_mm_data,
+            )
            hf_outputs_per_mm.append(hf_output)

    # Apply output processing / sanitation to the vLLM and HF runner results
@@ -150,8 +149,7 @@ def run_test(
        second_runner_processor=vllm_output_post_proc,
    )

-    for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm,
-                                        vllm_outputs_per_mm):
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm, vllm_outputs_per_mm):
        # This is usually check_logprobs_close, but it's passed through to
        # allow things like check_outputs_equal where needed
        comparator(
@@ -171,15 +169,19 @@ def process_runner_outputs(
 ):
    """Applies the runner processor(s) to the runner outputs, if any."""
    if first_runner_processor is not None:
-        first_runner_outputs = process_outputs(first_runner_processor, model,
-                                               first_runner_outputs)
+        first_runner_outputs = process_outputs(
+            first_runner_processor, model, first_runner_outputs
+        )
    if second_runner_processor is not None:
-        second_runner_outputs = process_outputs(second_runner_processor, model,
-                                                second_runner_outputs)
+        second_runner_outputs = process_outputs(
+            second_runner_processor, model, second_runner_outputs
+        )
    return first_runner_outputs, second_runner_outputs


 def process_outputs(output_processor, model, outputs_per_image):
    """Applies a model specific post-processor function to a runner's output"""
-    return [[output_processor(res, model) for res in outputs]
-            for outputs in outputs_per_image]
+    return [
+        [output_processor(res, model) for res in outputs]
+        for outputs in outputs_per_image
+    ]
--- a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py
+++ b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py
@@ -1,12 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Custom input builders for edge-cases in different models."""
+
 from typing import Callable

 from vllm.assets.image import ImageAsset
 from vllm.multimodal.image import rescale_image_size
-from vllm.multimodal.video import (rescale_video_size, resize_video,
-                                   sample_frames_from_video)
+from vllm.multimodal.video import (
+    rescale_video_size,
+    resize_video,
+    sample_frames_from_video,
+)

 from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS
 from .builders import build_multi_image_inputs, build_single_image_inputs
@@ -15,7 +19,7 @@ from .types import ImageSizeWrapper, PromptWithMultiModalInput, SizeType

 def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
    """Builds inputs for multi-image (varied sizes/aspect ratio) testing.
-    
+
    Args:
        formatter: model-specific prompt formatter.
    """
@@ -41,7 +45,7 @@ def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
            stop_sign,
            rescale_image_size(stop_sign, 0.25),
            cherry_blossom.resize((183, 488)),
-            cherry_blossom.resize((488, 183))
+            cherry_blossom.resize((488, 183)),
        ],
        cherry_blossom,
    ]
@@ -54,10 +58,11 @@ def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
    ]


-def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
-                                          num_frames: int = 16):
+def multi_video_multi_aspect_ratio_inputs(
+    formatter: Callable[[str], str], num_frames: int = 16
+):
    """Builds inputs for multi-video (varied sizes/aspect ratio) testing.
-    
+
    Args:
        formatter: model-specific prompt formatter.
    """
@@ -81,7 +86,7 @@ def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
            video,
            rescale_video_size(video, 0.25),
            resize_video(video, (183, 488)),
-            resize_video(video, (488, 183))
+            resize_video(video, (488, 183)),
        ],
        video,
    ]
@@ -96,7 +101,9 @@ def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],

 def different_patch_input_cases_internvl():
    images = [asset.pil_image.resize((896, 896)) for asset in IMAGE_ASSETS]
-    formatter = lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n"  # noqa: E501
+    formatter = (
+        lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n"
+    )  # noqa: E501
    single_img_prompts = [
        "<image>\nWhat's the content in the center of the image?",
        "<image>\nWhat is the season?",
@@ -115,14 +122,14 @@ def different_patch_input_cases_internvl():


 def windows_attention_image_qwen2_5_vl():
-
    # image from regression issue: https://github.com/vllm-project/vllm/issues/15122 # noqa: E501
    image = ImageAsset("hato").pil_image

    question = "Describe the image."
    img_prompt = "<|vision_start|><|image_pad|><|vision_end|>"
-    prompt = (f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n"
-              "<|im_start|>assistant\n")
+    prompt = (
+        f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n<|im_start|>assistant\n"
+    )

    wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5])
    return build_single_image_inputs([image], [prompt], wrapped_sf)
@@ -136,8 +143,9 @@ def video_with_metadata_glm4_1v():
    formatted_prompt = f"<|user|>\n{video_prompt}{question}<|assistant|>\n"

    scales = [0.1, 0.2, 0.25]
-    video_input = [[(rescale_video_size(video_array, scale), metadata)]
-                   for scale in scales]
+    video_input = [
+        [(rescale_video_size(video_array, scale), metadata)] for scale in scales
+    ]
    prompts = [formatted_prompt] * len(video_input)

    return [
--- a/tests/models/multimodal/generation/vlm_utils/model_utils.py
+++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py
@@ -4,6 +4,7 @@
 for manipulating the input / output of HF & vLLM test runners, which are
 typically specific to a small subset of models.
 """
+
 import types
 from pathlib import PosixPath
 from typing import Optional, Union
@@ -15,8 +16,13 @@ import pytest
 import regex as re
 import torch
 from PIL.Image import Image
-from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
-                          GenerationConfig, GenerationMixin)
+from transformers import (
+    AutoConfig,
+    AutoTokenizer,
+    BatchFeature,
+    GenerationConfig,
+    GenerationMixin,
+)
 from transformers.video_utils import VideoMetadata

 from vllm.logprobs import SampleLogprobs
@@ -27,8 +33,7 @@ from .types import RunnerOutput


 ####### vLLM output processors functions
-def blip2_vllm_to_hf_output(vllm_output: RunnerOutput,
-                            model: str) -> RunnerOutput:
+def blip2_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
    """Sanitize vllm output [blip2 models] to be comparable with hf output."""
    _, output_str, out_logprobs = vllm_output

@@ -42,8 +47,7 @@ def blip2_vllm_to_hf_output(vllm_output: RunnerOutput,
    return hf_output_ids, hf_output_str, out_logprobs


-def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput,
-                           model: str) -> RunnerOutput:
+def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
    """Sanitize vllm output [fuyu models] to be comparable with hf output."""
    output_ids, output_str, out_logprobs = vllm_output

@@ -53,8 +57,8 @@ def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput,


 def qwen_vllm_to_hf_output(
-        vllm_output: RunnerOutput,
-        model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
+    vllm_output: RunnerOutput, model: str
+) -> tuple[list[int], str, Optional[SampleLogprobs]]:
    """Sanitize vllm output [qwen models] to be comparable with hf output."""
    output_ids, output_str, out_logprobs = vllm_output

@@ -64,8 +68,8 @@ def qwen_vllm_to_hf_output(


 def qwen2_vllm_to_hf_output(
-        vllm_output: RunnerOutput,
-        model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
+    vllm_output: RunnerOutput, model: str
+) -> tuple[list[int], str, Optional[SampleLogprobs]]:
    """Sanitize vllm output [qwen2 models] to be comparable with hf output."""
    output_ids, output_str, out_logprobs = vllm_output

@@ -75,8 +79,8 @@ def qwen2_vllm_to_hf_output(


 def kimiv_vl_vllm_to_hf_output(
-        vllm_output: RunnerOutput,
-        model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
+    vllm_output: RunnerOutput, model: str
+) -> tuple[list[int], str, Optional[SampleLogprobs]]:
    """Sanitize vllm output [kimi_vl models] to be comparable with hf output."""
    output_ids, output_str, out_logprobs = vllm_output

@@ -85,23 +89,25 @@ def kimiv_vl_vllm_to_hf_output(
    return output_ids, hf_output_str, out_logprobs


-def llava_image_vllm_to_hf_output(vllm_output: RunnerOutput,
-                                  model: str) -> RunnerOutput:
+def llava_image_vllm_to_hf_output(
+    vllm_output: RunnerOutput, model: str
+) -> RunnerOutput:
    config = AutoConfig.from_pretrained(model)
    mm_token_id = config.image_token_index
    return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)


 def llava_video_vllm_to_hf_output(
-        vllm_output: RunnerOutput,
-        model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
+    vllm_output: RunnerOutput, model: str
+) -> tuple[list[int], str, Optional[SampleLogprobs]]:
    config = AutoConfig.from_pretrained(model)
    mm_token_id = config.video_token_index
    return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)


-def _llava_vllm_to_hf_output(vllm_output: RunnerOutput, model: str,
-                             mm_token_id: int) -> RunnerOutput:
+def _llava_vllm_to_hf_output(
+    vllm_output: RunnerOutput, model: str, mm_token_id: int
+) -> RunnerOutput:
    """Sanitize vllm output [Llava models] to be comparable with hf output."""
    output_ids, output_str, out_logprobs = vllm_output

@@ -109,7 +115,8 @@ def _llava_vllm_to_hf_output(vllm_output: RunnerOutput, model: str,
    eos_token_id = tokenizer.eos_token_id

    hf_output_ids = [
-        token_id for idx, token_id in enumerate(output_ids)
+        token_id
+        for idx, token_id in enumerate(output_ids)
        if token_id != mm_token_id or output_ids[idx - 1] != mm_token_id
    ]

@@ -128,8 +135,9 @@ def llava_onevision_hf_model_kwargs(model: str) -> dict:
    return config.to_dict()


-def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
-                                      model: str) -> RunnerOutput:
+def llava_onevision_vllm_to_hf_output(
+    vllm_output: RunnerOutput, model: str
+) -> RunnerOutput:
    """Sanitize vllm output [llava-onevision] to compare with hf output."""
    output_ids, output_str, out_logprobs = vllm_output

@@ -140,7 +148,8 @@ def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
    eos_token_id = tokenizer.eos_token_id

    hf_output_ids = [
-        token_id for idx, token_id in enumerate(output_ids)
+        token_id
+        for idx, token_id in enumerate(output_ids)
        if token_id != video_token_id or output_ids[idx - 1] != video_token_id
    ]

@@ -151,8 +160,7 @@ def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
    return hf_output_ids, hf_output_str, out_logprobs


-def mantis_vllm_to_hf_output(vllm_output: RunnerOutput,
-                             model: str) -> RunnerOutput:
+def mantis_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
    """Sanitize vllm output [mantis] to compare with hf output."""
    output_ids, output_str, out_logprobs = vllm_output

@@ -161,8 +169,7 @@ def mantis_vllm_to_hf_output(vllm_output: RunnerOutput,
    return output_ids, hf_output_str, out_logprobs


-def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput,
-                            model: str) -> RunnerOutput:
+def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
    """Sanitize vllm output [phi3v] to be comparable with hf output."""
    _, output_str, out_logprobs = vllm_output

@@ -180,8 +187,7 @@ def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput,
    return hf_output_ids, hf_output_str, out_logprobs


-def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
-                                model: str) -> RunnerOutput:
+def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
    """Sanitize vllm output to be comparable with hf output."""
    output_ids, output_str, out_logprobs = vllm_output

@@ -192,7 +198,8 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
    eos_token_id = tokenizer.eos_token_id

    hf_output_ids = [
-        token_id for idx, token_id in enumerate(output_ids)
+        token_id
+        for idx, token_id in enumerate(output_ids)
        if token_id != image_token_id or output_ids[idx - 1] != image_token_id
    ]

@@ -205,46 +212,40 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,


 ####### Post-processors for HF outputs
-def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput,
-                                model: str) -> RunnerOutput:
+def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
    output_ids, output_str, out_logprobs = hf_output
    if output_str.endswith("<｜end▁of▁sentence｜>"):
        output_str = output_str.split("<｜end▁of▁sentence｜>")[0]
    return output_ids, output_str, out_logprobs


-def idefics3_trunc_hf_output(hf_output: RunnerOutput,
-                             model: str) -> RunnerOutput:
+def idefics3_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
    output_ids, output_str, out_logprobs = hf_output
    if output_str.endswith("<end_of_utterance>"):
        output_str = output_str.split("<end_of_utterance>")[0]
    return output_ids, output_str, out_logprobs


-def smolvlm_trunc_hf_output(hf_output: RunnerOutput,
-                            model: str) -> RunnerOutput:
+def smolvlm_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
    # Based on Idefics3
    return idefics3_trunc_hf_output(hf_output, model)


-def minicpmv_trunc_hf_output(hf_output: RunnerOutput,
-                             model: str) -> RunnerOutput:
+def minicpmv_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
    output_ids, output_str, out_logprobs = hf_output
    if output_str.endswith("<|eot_id|>"):
        output_str = output_str.split("<|eot_id|>")[0]
    return output_ids, output_str, out_logprobs


-def minimax_vl_01_hf_output(hf_output: RunnerOutput,
-                            model: str) -> RunnerOutput:
+def minimax_vl_01_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
    output_ids, output_str, out_logprobs = hf_output
    if output_str.endswith("<end_of_sentence>"):
        output_str = output_str.split("<end_of_sentence>")[0]
    return output_ids, output_str, out_logprobs


-def ultravox_trunc_hf_output(hf_output: RunnerOutput,
-                             model: str) -> RunnerOutput:
+def ultravox_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
    output_ids, output_str, out_logprobs = hf_output

    tokenizer = AutoTokenizer.from_pretrained(model)
@@ -262,8 +263,8 @@ def get_llava_embeddings(image_assets: ImageTestAssets):

 ####### Prompt path encoders for models that need models on disk
 def qwen_prompt_path_encoder(
-        tmp_path: PosixPath, prompt: str,
-        assets: Union[list[ImageAsset], ImageTestAssets]) -> str:
+    tmp_path: PosixPath, prompt: str, assets: Union[list[ImageAsset], ImageTestAssets]
+) -> str:
    """Given a temporary dir path, export one or more image assets into the
    tempdir & replace its contents with the local path to the string so that
    the HF version of Qwen-VL can resolve the path and load the image in its
@@ -313,8 +314,9 @@ def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
        return BatchFeature(data=inputs, tensor_type="pt")

    hf_model.processor = processor
-    hf_model.model.get_output_embeddings = lambda: \
-        hf_model.model.language.model.embed_tokens
+    hf_model.model.get_output_embeddings = (
+        lambda: hf_model.model.language.model.embed_tokens
+    )
    return hf_model


@@ -357,11 +359,10 @@ def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
        assert len(contents) == len(images)

        return hf_processor.apply_chat_template(
-            [{
-                "role": "user",
-                "image": image,
-                "content": content
-            } for image, content in zip(images, contents)],
+            [
+                {"role": "user", "image": image, "content": content}
+                for image, content in zip(images, contents)
+            ],
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
@@ -369,8 +370,9 @@ def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
        )

    hf_model.processor = processor
-    hf_model.model.get_output_embeddings = lambda: \
-        hf_model.model.transformer.output_layer
+    hf_model.model.get_output_embeddings = (
+        lambda: hf_model.model.transformer.output_layer
+    )
    return hf_model


@@ -387,10 +389,9 @@ def glm4_1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
        else:
            video_metadata = None

-        return hf_processor(*args,
-                            videos=videos,
-                            video_metadata=video_metadata,
-                            **kwargs)
+        return hf_processor(
+            *args, videos=videos, video_metadata=video_metadata, **kwargs
+        )

    hf_model.processor = processor
    return hf_model
@@ -406,8 +407,9 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
            self.num_image_token = hf_runner.model.num_image_token
            self.tokenizer = hf_runner.tokenizer

-            self.config = AutoConfig.from_pretrained(hf_runner.model_name,
-                                                     trust_remote_code=True)
+            self.config = AutoConfig.from_pretrained(
+                hf_runner.model_name, trust_remote_code=True
+            )
            self.vision_config = self.config.vision_config
            self.use_thumbnail = self.config.use_thumbnail
            self.use_msac = self.config.use_msac
@@ -415,11 +417,14 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
            self.max_num = self.config.max_dynamic_patch
            self.image_size = self.vision_config.image_size

-        def __call__(self, text: str, images: Union[Image, list[Image]],
-                     **kwargs):
+        def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs):
            # yapf: disable
            from vllm.model_executor.models.h2ovl import (
-                IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values_h2ovl)
+                IMG_CONTEXT,
+                IMG_END,
+                IMG_START,
+                image_to_pixel_values_h2ovl,
+            )

            # yapf: enable
            images = [images] if isinstance(images, Image) else images
@@ -431,29 +436,26 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
                    max_num=self.max_num,
                    use_thumbnail=self.use_thumbnail,
                    use_msac=self.use_msac,
-                ) for image in images
-            ]
-            num_patches_list = [
-                pixel_value.shape[0] for pixel_value in pixel_values
+                )
+                for image in images
            ]
+            num_patches_list = [pixel_value.shape[0] for pixel_value in pixel_values]
            pixel_values = torch.cat(pixel_values, dim=0)
            for num_patches in num_patches_list:
-                context_tokens = IMG_CONTEXT * self.num_image_token \
-                    * num_patches
+                context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
                image_tokens = IMG_START + context_tokens + IMG_END
-                text = text.replace('<image>', image_tokens, 1)
+                text = text.replace("<image>", image_tokens, 1)
            prompt = self.tokenizer(text, return_tensors="pt")
            prompt.update({"pixel_values": pixel_values})
            return prompt

-    img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
-        "<IMG_CONTEXT>")
+    img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
    hf_model.model.img_context_token_id = img_context_token_id
    hf_model.processor = H2OVLProcessor(hf_model)
-    hf_model.model.get_output_embeddings = lambda: \
-        hf_model.model.language_model.get_output_embeddings()
-    hf_model.model.generate = types.MethodType(_internvl_generate,
-                                               hf_model.model)
+    hf_model.model.get_output_embeddings = (
+        lambda: hf_model.model.language_model.get_output_embeddings()
+    )
+    hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
    return hf_model


@@ -467,19 +469,23 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
            self.num_image_token = hf_runner.model.num_image_token
            self.tokenizer = hf_runner.tokenizer

-            self.config = AutoConfig.from_pretrained(hf_runner.model_name,
-                                                     trust_remote_code=True)
+            self.config = AutoConfig.from_pretrained(
+                hf_runner.model_name, trust_remote_code=True
+            )
            self.vision_config = self.config.vision_config
            self.use_thumbnail = self.config.use_thumbnail
            self.min_num = self.config.min_dynamic_patch
            self.max_num = self.config.max_dynamic_patch
            self.image_size = self.vision_config.image_size

-        def __call__(self, text: str, images: Union[Image, list[Image]],
-                     **kwargs):
+        def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs):
            from vllm.model_executor.models.skyworkr1v import (
-                IMG_CONTEXT, IMG_END, IMG_START,
-                image_to_pixel_values_skyworkr1v)
+                IMG_CONTEXT,
+                IMG_END,
+                IMG_START,
+                image_to_pixel_values_skyworkr1v,
+            )
+
            images = [images] if isinstance(images, Image) else images
            pixel_values = [
                image_to_pixel_values_skyworkr1v(
@@ -488,29 +494,26 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
                    min_num=self.min_num,
                    max_num=self.max_num,
                    use_thumbnail=self.use_thumbnail,
-                ) for image in images
-            ]
-            num_patches_list = [
-                pixel_value.shape[0] for pixel_value in pixel_values
+                )
+                for image in images
            ]
+            num_patches_list = [pixel_value.shape[0] for pixel_value in pixel_values]
            pixel_values = torch.cat(pixel_values, dim=0)
            for num_patches in num_patches_list:
-                context_tokens = IMG_CONTEXT * self.num_image_token \
-                    * num_patches
+                context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
                image_tokens = IMG_START + context_tokens + IMG_END
-                text = text.replace('<image>', image_tokens, 1)
+                text = text.replace("<image>", image_tokens, 1)
            prompt = self.tokenizer(text, return_tensors="pt")
            prompt.update({"pixel_values": pixel_values})
            return prompt

-    img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
-        "<IMG_CONTEXT>")
+    img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
    hf_model.model.img_context_token_id = img_context_token_id
    hf_model.processor = SkyworkR1VProcessor(hf_model)
-    hf_model.model.get_output_embeddings = lambda: \
-        hf_model.model.language_model.get_output_embeddings()
-    hf_model.model.generate = types.MethodType(_internvl_generate,
-                                               hf_model.model)
+    hf_model.model.get_output_embeddings = (
+        lambda: hf_model.model.language_model.get_output_embeddings()
+    )
+    hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
    return hf_model


@@ -524,8 +527,9 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
            self.num_image_token = hf_runner.model.num_image_token
            self.tokenizer = hf_runner.tokenizer

-            self.config = AutoConfig.from_pretrained(hf_runner.model_name,
-                                                     trust_remote_code=True)
+            self.config = AutoConfig.from_pretrained(
+                hf_runner.model_name, trust_remote_code=True
+            )
            self.vision_config = self.config.vision_config
            self.use_thumbnail = self.config.use_thumbnail
            self.min_num = self.config.min_dynamic_patch
@@ -540,8 +544,13 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
            **kwargs,
        ):
            from vllm.model_executor.models.internvl import (
-                IMG_CONTEXT, IMG_END, IMG_START,
-                image_to_pixel_values_internvl, video_to_pixel_values_internvl)
+                IMG_CONTEXT,
+                IMG_END,
+                IMG_START,
+                image_to_pixel_values_internvl,
+                video_to_pixel_values_internvl,
+            )
+
            images = [images] if isinstance(images, Image) else images
            videos = [videos] if isinstance(videos, np.ndarray) else videos
            if images is not None:
@@ -552,7 +561,8 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
                        min_num=self.min_num,
                        max_num=self.max_num,
                        use_thumbnail=self.use_thumbnail,
-                    ) for image in images
+                    )
+                    for image in images
                ]
                num_patches_images = [
                    pixel_value.shape[0] for pixel_value in pixel_values_images
@@ -568,7 +578,8 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
                        min_num=1,
                        max_num=1,
                        use_thumbnail=False,
-                    ) for video in videos
+                    )
+                    for video in videos
                ]
                num_patches_videos = [
                    pixel_value.shape[0] for pixel_value in pixel_values_videos
@@ -580,38 +591,37 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
            while ("<image>" in text) or ("<video>" in text):
                image_index = text.find("<image>")
                video_index = text.find("<video>")
-                if image_index == -1 or (video_index > -1
-                                         and video_index < image_index):
+                if image_index == -1 or (
+                    video_index > -1 and video_index < image_index
+                ):
                    num_patches = num_patches_videos.pop(0)
                    pixel_values.append(pixel_values_videos.pop(0))
-                    context_tokens = IMG_START + \
-                        IMG_CONTEXT * self.num_image_token + IMG_END
-                    video_tokens = ''.join([
-                        f'Frame{i+1}: {context_tokens}'
-                        for i in range(num_patches)
-                    ])
-                    text = text.replace('<video>', video_tokens, 1)
+                    context_tokens = (
+                        IMG_START + IMG_CONTEXT * self.num_image_token + IMG_END
+                    )
+                    video_tokens = "".join(
+                        [f"Frame{i + 1}: {context_tokens}" for i in range(num_patches)]
+                    )
+                    text = text.replace("<video>", video_tokens, 1)
                else:
                    num_patches = num_patches_images.pop(0)
                    pixel_values.append(pixel_values_images.pop(0))
-                    context_tokens = IMG_CONTEXT * self.num_image_token \
-                        * num_patches
+                    context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
                    image_tokens = IMG_START + context_tokens + IMG_END
-                    text = text.replace('<image>', image_tokens, 1)
+                    text = text.replace("<image>", image_tokens, 1)
            pixel_values = torch.cat(pixel_values, dim=0)

            prompt = self.tokenizer(text, return_tensors="pt")
            prompt.update({"pixel_values": pixel_values})
            return prompt

-    img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
-        "<IMG_CONTEXT>")
+    img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
    hf_model.model.img_context_token_id = img_context_token_id
    hf_model.processor = InternVLProcessor(hf_model)
-    hf_model.model.get_output_embeddings = lambda: \
-        hf_model.model.language_model.get_output_embeddings()
-    hf_model.model.generate = types.MethodType(_internvl_generate,
-                                               hf_model.model)
+    hf_model.model.get_output_embeddings = (
+        lambda: hf_model.model.language_model.get_output_embeddings()
+    )
+    hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
    return hf_model


@@ -631,7 +641,7 @@ def _internvl_generate(
    input_embeds = input_embeds.reshape(B * N, C)

    input_ids = input_ids.reshape(B * N)
-    selected = (input_ids == self.img_context_token_id)
+    selected = input_ids == self.img_context_token_id
    assert selected.sum() != 0
    input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)

@@ -778,8 +788,9 @@ def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:

 def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patches and returns an instance of the HfRunner to use for Ovis2."""
-    hf_model.model.get_output_embeddings = lambda: \
-        hf_model.model.llm.get_output_embeddings()
+    hf_model.model.get_output_embeddings = (
+        lambda: hf_model.model.llm.get_output_embeddings()
+    )

    def processor(*args, text="", images=None, **kwargs):
        text_tokenizer = hf_model.model.get_text_tokenizer()
@@ -787,8 +798,7 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:

        prompt_start_and_end = {
            "qwen2": ("<|im_start|>user\n", "<|im_end|>\n"),
-            "llama":
-            ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
+            "llama": ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
            "gemma2": ("<start_of_turn>user\n", "<end_of_turn>\n"),
        }
        for start, end in prompt_start_and_end.values():
@@ -797,7 +807,8 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
                break

        prompt, input_ids, pixel_values = hf_model.model.preprocess_inputs(
-            text_or_conversations=text, images=images)
+            text_or_conversations=text, images=images
+        )
        attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id)

        inputs = {
@@ -813,8 +824,9 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:

 def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patches and returns an instance of the HfRunner to use for Ovis2."""
-    hf_model.model.get_output_embeddings = lambda: \
-        hf_model.model.llm.get_output_embeddings()
+    hf_model.model.get_output_embeddings = (
+        lambda: hf_model.model.llm.get_output_embeddings()
+    )

    def processor(*args, text="", images=None, videos=None, **kwargs):
        if images is None:
@@ -825,13 +837,11 @@ def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
            videos = []
        else:
            videos = [videos] if isinstance(videos, np.ndarray) else videos
-            videos = [[PIL.Image.fromarray(frame) for frame in vid]
-                      for vid in videos]
+            videos = [[PIL.Image.fromarray(frame) for frame in vid] for vid in videos]

        prompt_start_and_end = {
            "qwen2": ("<|im_start|>user\n", "<|im_end|>\n"),
-            "llama":
-            ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
+            "llama": ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
            "gemma2": ("<start_of_turn>user\n", "<end_of_turn>\n"),
        }
        for start, end in prompt_start_and_end.values():
@@ -842,21 +852,20 @@ def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
        images_message = [{"type": "image", "image": img} for img in images]
        videos_message = [{"type": "video", "video": vid} for vid in videos]

-        messages = [{
-            "role":
-            "user",
-            "content": [
-                *images_message,
-                *videos_message,
-                {
-                    "type": "text",
-                    "text": text
-                },
-            ],
-        }]
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    *images_message,
+                    *videos_message,
+                    {"type": "text", "text": text},
+                ],
+            }
+        ]

        input_ids, pixel_values, grid_thws = hf_model.model.preprocess_inputs(
-            messages=messages, enable_thinking=True)
+            messages=messages, enable_thinking=True
+        )
        inputs = {
            "inputs": input_ids,
            "pixel_values": pixel_values,
--- a/tests/models/multimodal/generation/vlm_utils/runners.py
+++ b/tests/models/multimodal/generation/vlm_utils/runners.py
@@ -3,23 +3,34 @@
 """Entrypoints for wrapping the core run_test implementation for specific test
 types / modalities.
 """
+
 from pathlib import PosixPath

-from .....conftest import (AudioTestAssets, HfRunner, ImageTestAssets,
-                           VideoTestAssets, VllmRunner)
+from .....conftest import (
+    AudioTestAssets,
+    HfRunner,
+    ImageTestAssets,
+    VideoTestAssets,
+    VllmRunner,
+)
 from . import builders, core
 from .types import ExpandableVLMTestArgs, VLMTestInfo


 ####### Entrypoints for running different test types
-def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
-                          test_case: ExpandableVLMTestArgs,
-                          hf_runner: type[HfRunner],
-                          vllm_runner: type[VllmRunner],
-                          image_assets: ImageTestAssets):
+def run_single_image_test(
+    *,
+    tmp_path: PosixPath,
+    model_test_info: VLMTestInfo,
+    test_case: ExpandableVLMTestArgs,
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+    image_assets: ImageTestAssets,
+):
    assert test_case.size_wrapper is not None
    inputs = builders.build_single_image_inputs_from_test_info(
-        model_test_info, image_assets, test_case.size_wrapper, tmp_path)
+        model_test_info, image_assets, test_case.size_wrapper, tmp_path
+    )

    core.run_test(
        hf_runner=hf_runner,
@@ -31,17 +42,23 @@ def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt={"image": 1},
        distributed_executor_backend=test_case.distributed_executor_backend,
-        **model_test_info.get_non_parametrized_runner_kwargs())
+        **model_test_info.get_non_parametrized_runner_kwargs(),
+    )


-def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
-                         test_case: ExpandableVLMTestArgs,
-                         hf_runner: type[HfRunner],
-                         vllm_runner: type[VllmRunner],
-                         image_assets: ImageTestAssets):
+def run_multi_image_test(
+    *,
+    tmp_path: PosixPath,
+    model_test_info: VLMTestInfo,
+    test_case: ExpandableVLMTestArgs,
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+    image_assets: ImageTestAssets,
+):
    assert test_case.size_wrapper is not None
    inputs = builders.build_multi_image_inputs_from_test_info(
-        model_test_info, image_assets, test_case.size_wrapper, tmp_path)
+        model_test_info, image_assets, test_case.size_wrapper, tmp_path
+    )

    core.run_test(
        hf_runner=hf_runner,
@@ -53,17 +70,22 @@ def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt={"image": len(image_assets)},
        distributed_executor_backend=test_case.distributed_executor_backend,
-        **model_test_info.get_non_parametrized_runner_kwargs())
+        **model_test_info.get_non_parametrized_runner_kwargs(),
+    )


-def run_embedding_test(*, model_test_info: VLMTestInfo,
-                       test_case: ExpandableVLMTestArgs,
-                       hf_runner: type[HfRunner],
-                       vllm_runner: type[VllmRunner],
-                       image_assets: ImageTestAssets):
+def run_embedding_test(
+    *,
+    model_test_info: VLMTestInfo,
+    test_case: ExpandableVLMTestArgs,
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+    image_assets: ImageTestAssets,
+):
    assert test_case.size_wrapper is not None
    inputs, vllm_embeddings = builders.build_embedding_inputs_from_test_info(
-        model_test_info, image_assets, test_case.size_wrapper)
+        model_test_info, image_assets, test_case.size_wrapper
+    )

    core.run_test(
        hf_runner=hf_runner,
@@ -76,7 +98,8 @@ def run_embedding_test(*, model_test_info: VLMTestInfo,
        limit_mm_per_prompt={"image": 1},
        vllm_embeddings=vllm_embeddings,
        distributed_executor_backend=test_case.distributed_executor_backend,
-        **model_test_info.get_non_parametrized_runner_kwargs())
+        **model_test_info.get_non_parametrized_runner_kwargs(),
+    )


 def run_video_test(
@@ -90,8 +113,11 @@ def run_video_test(
    assert test_case.size_wrapper is not None
    assert test_case.num_video_frames is not None
    inputs = builders.build_video_inputs_from_test_info(
-        model_test_info, video_assets, test_case.size_wrapper,
-        test_case.num_video_frames)
+        model_test_info,
+        video_assets,
+        test_case.size_wrapper,
+        test_case.num_video_frames,
+    )

    core.run_test(
        hf_runner=hf_runner,
@@ -103,7 +129,8 @@ def run_video_test(
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt={"video": len(video_assets)},
        distributed_executor_backend=test_case.distributed_executor_backend,
-        **model_test_info.get_non_parametrized_runner_kwargs())
+        **model_test_info.get_non_parametrized_runner_kwargs(),
+    )


 def run_audio_test(
@@ -114,8 +141,7 @@ def run_audio_test(
    vllm_runner: type[VllmRunner],
    audio_assets: AudioTestAssets,
 ):
-    inputs = builders.build_audio_inputs_from_test_info(
-        model_test_info, audio_assets)
+    inputs = builders.build_audio_inputs_from_test_info(model_test_info, audio_assets)

    core.run_test(
        hf_runner=hf_runner,
@@ -127,13 +153,17 @@ def run_audio_test(
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt={"audio": 1},
        distributed_executor_backend=test_case.distributed_executor_backend,
-        **model_test_info.get_non_parametrized_runner_kwargs())
+        **model_test_info.get_non_parametrized_runner_kwargs(),
+    )


-def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
-                           test_case: ExpandableVLMTestArgs,
-                           hf_runner: type[HfRunner],
-                           vllm_runner: type[VllmRunner]):
+def run_custom_inputs_test(
+    *,
+    model_test_info: VLMTestInfo,
+    test_case: ExpandableVLMTestArgs,
+    hf_runner: type[HfRunner],
+    vllm_runner: type[VllmRunner],
+):
    # Custom test cases can provide inputs directly, but they need to
    # explicitly provided a CustomTestConfig, which wraps the inputs and
    # the limit_mm_per_prompt
@@ -155,4 +185,5 @@ def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
        num_logprobs=test_case.num_logprobs,
        limit_mm_per_prompt=limit_mm_per_prompt,
        distributed_executor_backend=test_case.distributed_executor_backend,
-        **model_test_info.get_non_parametrized_runner_kwargs())
+        **model_test_info.get_non_parametrized_runner_kwargs(),
+    )
--- a/tests/models/multimodal/generation/vlm_utils/types.py
+++ b/tests/models/multimodal/generation/vlm_utils/types.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Types for writing multimodal model tests."""
+
 from collections.abc import Iterable
 from enum import Enum
 from pathlib import PosixPath
@@ -15,9 +16,16 @@ from vllm.config import RunnerOption
 from vllm.logprobs import SampleLogprobs
 from vllm.transformers_utils.tokenizer import AnyTokenizer

-from .....conftest import (AUDIO_ASSETS, IMAGE_ASSETS, HfRunner, ImageAsset,
-                           ImageTestAssets, PromptAudioInput, PromptImageInput,
-                           PromptVideoInput)
+from .....conftest import (
+    AUDIO_ASSETS,
+    IMAGE_ASSETS,
+    HfRunner,
+    ImageAsset,
+    ImageTestAssets,
+    PromptAudioInput,
+    PromptImageInput,
+    PromptVideoInput,
+)
 from ....utils import check_logprobs_close

 # meta image tag; will be replaced by the appropriate tag for the model
@@ -47,6 +55,7 @@ RunnerOutput = tuple[list[int], str, Optional[SampleLogprobs]]

 class PromptWithMultiModalInput(NamedTuple):
    """Holds the multimodal input for a single test case."""
+
    prompts: list[str]
    image_data: Optional[PromptImageInput] = None
    video_data: Optional[PromptVideoInput] = None
@@ -100,8 +109,9 @@ class VLMTestInfo(NamedTuple):

    # Function for converting ImageAssets to image embeddings;
    # We need to define this explicitly for embedding tests
-    convert_assets_to_embeddings: Optional[Callable[[ImageTestAssets],
-                                                    list[torch.Tensor]]] = None
+    convert_assets_to_embeddings: Optional[
+        Callable[[ImageTestAssets], list[torch.Tensor]]
+    ] = None

    # Exposed options for vLLM runner; we change these in a several tests,
    # but the defaults are derived from VllmRunner & the engine defaults
@@ -156,8 +166,8 @@ class VLMTestInfo(NamedTuple):
    # for Qwen-VL, which requires encoding the image path / url into the prompt
    # for HF runner
    prompt_path_encoder: Optional[
-        Callable[[PosixPath, str, Union[list[ImageAsset], ImageTestAssets]],
-                 str]] = None  # noqa: E501
+        Callable[[PosixPath, str, Union[list[ImageAsset], ImageTestAssets]], str]
+    ] = None  # noqa: E501

    # Allows configuring a test to run with custom inputs
    custom_test_opts: Optional[list[CustomTestOptions]] = None
@@ -190,6 +200,7 @@ class VLMTestInfo(NamedTuple):

 class ExpandableVLMTestArgs(NamedTuple):
    """The expanded kwargs which correspond to a single test case."""
+
    model: str
    max_tokens: int
    num_logprobs: int