Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-05 15:06:22 +01:00
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions
--- a/tests/models/quantization/test_awq.py
+++ b/tests/models/quantization/test_awq.py
@@ -11,12 +11,12 @@ from vllm.multimodal.image import rescale_image_size
 from ...conftest import IMAGE_ASSETS, ImageTestAssets, VllmRunner
 from ..utils import check_logprobs_close

-HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
-    "stop_sign":
-    "<|im_start|>User\n<image>\nWhat's the content in the center of the image?<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
-    "cherry_blossom":
-    "<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
-})
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
+    {
+        "stop_sign": "<|im_start|>User\n<image>\nWhat's the content in the center of the image?<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
+        "cherry_blossom": "<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
+    }
+)


 def run_awq_test(
@@ -34,10 +34,13 @@ def run_awq_test(
 ):
    images = [asset.pil_image for asset in image_assets]

-    inputs_per_image = [(
-        [prompt for _ in size_factors],
-        [rescale_image_size(image, factor) for factor in size_factors],
-    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
+    inputs_per_image = [
+        (
+            [prompt for _ in size_factors],
+            [rescale_image_size(image, factor) for factor in size_factors],
+        )
+        for image, prompt in zip(images, HF_IMAGE_PROMPTS)
+    ]

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
@@ -46,42 +49,41 @@ def run_awq_test(

    # max_model_len should be greater than image_feature_size
    with vllm_runner(
-            source_model,
-            max_model_len=4096,
-            dtype=dtype,
-            tensor_parallel_size=tensor_parallel_size,
-            distributed_executor_backend=distributed_executor_backend,
-            enforce_eager=True,
-            default_torch_num_threads=1,
+        source_model,
+        max_model_len=4096,
+        dtype=dtype,
+        tensor_parallel_size=tensor_parallel_size,
+        distributed_executor_backend=distributed_executor_backend,
+        enforce_eager=True,
+        default_torch_num_threads=1,
    ) as vllm_model:
        source_outputs_per_image = [
-            vllm_model.generate_greedy_logprobs(prompts,
-                                                max_tokens,
-                                                num_logprobs=num_logprobs,
-                                                images=images)
+            vllm_model.generate_greedy_logprobs(
+                prompts, max_tokens, num_logprobs=num_logprobs, images=images
+            )
            for prompts, images in inputs_per_image
        ]

    with vllm_runner(
-            quant_model,
-            quantization="awq",
-            max_model_len=4096,
-            dtype=dtype,
-            tensor_parallel_size=tensor_parallel_size,
-            distributed_executor_backend=distributed_executor_backend,
-            enforce_eager=True,
-            default_torch_num_threads=1,
+        quant_model,
+        quantization="awq",
+        max_model_len=4096,
+        dtype=dtype,
+        tensor_parallel_size=tensor_parallel_size,
+        distributed_executor_backend=distributed_executor_backend,
+        enforce_eager=True,
+        default_torch_num_threads=1,
    ) as vllm_model:
        quant_outputs_per_image = [
-            vllm_model.generate_greedy_logprobs(prompts,
-                                                max_tokens,
-                                                num_logprobs=num_logprobs,
-                                                images=images)
+            vllm_model.generate_greedy_logprobs(
+                prompts, max_tokens, num_logprobs=num_logprobs, images=images
+            )
            for prompts, images in inputs_per_image
        ]

-    for source_outputs, quant_outputs in zip(source_outputs_per_image,
-                                             quant_outputs_per_image):
+    for source_outputs, quant_outputs in zip(
+        source_outputs_per_image, quant_outputs_per_image
+    ):
        # TODO: Check whether using original CLIPVisionModel can improve
        # consistency against HF
        check_logprobs_close(
@@ -113,9 +115,16 @@ def run_awq_test(
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@torch.inference_mode()
-def test_awq_models(vllm_runner, image_assets, source_model, quant_model,
-                    size_factors, dtype, max_tokens, num_logprobs) -> None:
-
+def test_awq_models(
+    vllm_runner,
+    image_assets,
+    source_model,
+    quant_model,
+    size_factors,
+    dtype,
+    max_tokens,
+    num_logprobs,
+) -> None:
    run_awq_test(
        vllm_runner,
        image_assets,
--- a/tests/models/quantization/test_bitblas.py
+++ b/tests/models/quantization/test_bitblas.py
@@ -7,9 +7,10 @@ As a result, in this test, we just confirm that the top selected tokens of the
 bitblas/GPTQ models are in the top 3 selections of each other.

 Note: bitblas internally uses locks to synchronize the threads. This can
-result in very slight nondeterminism for bitblas. As a result, we re-run the 
+result in very slight nondeterminism for bitblas. As a result, we re-run the
 test up to 3 times to see if we pass.
 """
+
 from dataclasses import dataclass

 import pytest
@@ -24,8 +25,10 @@ class ModelPair:


 model_pairs = [
-    ModelPair(model_bitblas="hxbgsyxh/opt-125m-4bit-128g-bitblas",
-              model_gptq="hxbgsyxh/opt-125m-4bit-128g"),
+    ModelPair(
+        model_bitblas="hxbgsyxh/opt-125m-4bit-128g-bitblas",
+        model_gptq="hxbgsyxh/opt-125m-4bit-128g",
+    ),
 ]


@@ -43,16 +46,19 @@ def test_models(
    max_tokens: int,
    num_logprobs: int,
 ) -> None:
-    with vllm_runner(model_pair.model_bitblas,
-                     dtype=dtype,
-                     quantization="bitblas") as bitblas_model:
+    with vllm_runner(
+        model_pair.model_bitblas, dtype=dtype, quantization="bitblas"
+    ) as bitblas_model:
        bitblas_outputs = bitblas_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )

-    with vllm_runner(model_pair.model_gptq, dtype=dtype,
-                     quantization="gptq") as gptq_model:
+    with vllm_runner(
+        model_pair.model_gptq, dtype=dtype, quantization="gptq"
+    ) as gptq_model:
        gptq_outputs = gptq_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )

    check_logprobs_close(
        outputs_0_lst=gptq_outputs,
--- a/tests/models/quantization/test_bitsandbytes.py
+++ b/tests/models/quantization/test_bitsandbytes.py
@@ -1,9 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-'''Tests whether bitsandbytes computation is enabled correctly.
+"""Tests whether bitsandbytes computation is enabled correctly.

 Run `pytest tests/quantization/test_bitsandbytes.py`.
-'''
+"""

 import pytest
 from transformers import BitsAndBytesConfig
@@ -15,8 +15,10 @@ from ..utils import check_embeddings_close, check_logprobs_close

 models_4bit_to_test = [
    ("facebook/opt-125m", "quantize opt model inflight"),
-    ("mistralai/Mistral-7B-Instruct-v0.3",
-     "quantize inflight model with both HF and Mistral format weights")
+    (
+        "mistralai/Mistral-7B-Instruct-v0.3",
+        "quantize inflight model with both HF and Mistral format weights",
+    ),
 ]

 models_4bit_to_embedding_test = [
@@ -28,72 +30,84 @@ models_4bit_to_moe_test = [
 ]

 models_pre_qaunt_4bit_to_test = [
-    ('PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed',
-     'read pre-quantized 4-bit FP4 model'),
-    ('poedator/opt-125m-bnb-4bit', 'read pre-quantized 4-bit NF4 opt model'),
+    (
+        "PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed",
+        "read pre-quantized 4-bit FP4 model",
+    ),
+    ("poedator/opt-125m-bnb-4bit", "read pre-quantized 4-bit NF4 opt model"),
 ]

 models_pre_quant_8bit_to_test = [
-    ('meta-llama/Llama-Guard-3-8B-INT8',
-     'read pre-quantized llama 8-bit model'),
+    ("meta-llama/Llama-Guard-3-8B-INT8", "read pre-quantized llama 8-bit model"),
    ("yec019/fbopt-350m-8bit", "read pre-quantized 8-bit opt model"),
 ]


-@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
-                    reason='bitsandbytes is not supported on this GPU type.')
+@pytest.mark.skipif(
+    not is_quant_method_supported("bitsandbytes"),
+    reason="bitsandbytes is not supported on this GPU type.",
+)
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
-def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
-                             model_name, description) -> None:
-
-    hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
-        load_in_4bit=True))
-    validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1],
-                             model_name, False, hf_model_kwargs)
+def test_load_4bit_bnb_model(
+    hf_runner, vllm_runner, example_prompts, model_name, description
+) -> None:
+    hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(load_in_4bit=True))
+    validate_generated_texts(
+        hf_runner, vllm_runner, example_prompts[:1], model_name, False, hf_model_kwargs
+    )


-@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
-                    reason='bitsandbytes is not supported on this GPU type.')
-@pytest.mark.parametrize("model_name, description",
-                         models_pre_qaunt_4bit_to_test)
-def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
-                                       model_name, description) -> None:
-
-    validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1],
-                             model_name, True)
+@pytest.mark.skipif(
+    not is_quant_method_supported("bitsandbytes"),
+    reason="bitsandbytes is not supported on this GPU type.",
+)
+@pytest.mark.parametrize("model_name, description", models_pre_qaunt_4bit_to_test)
+def test_load_pre_quant_4bit_bnb_model(
+    hf_runner, vllm_runner, example_prompts, model_name, description
+) -> None:
+    validate_generated_texts(
+        hf_runner, vllm_runner, example_prompts[:1], model_name, True
+    )


-@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
-                    reason='bitsandbytes is not supported on this GPU type.')
-@pytest.mark.parametrize("model_name, description",
-                         models_pre_quant_8bit_to_test)
-def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
-                             model_name, description) -> None:
-
-    validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1],
-                             model_name, True)
+@pytest.mark.skipif(
+    not is_quant_method_supported("bitsandbytes"),
+    reason="bitsandbytes is not supported on this GPU type.",
+)
+@pytest.mark.parametrize("model_name, description", models_pre_quant_8bit_to_test)
+def test_load_8bit_bnb_model(
+    hf_runner, vllm_runner, example_prompts, model_name, description
+) -> None:
+    validate_generated_texts(
+        hf_runner, vllm_runner, example_prompts[:1], model_name, True
+    )


-@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
-                    reason='bitsandbytes is not supported on this GPU type.')
+@pytest.mark.skipif(
+    not is_quant_method_supported("bitsandbytes"),
+    reason="bitsandbytes is not supported on this GPU type.",
+)
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@multi_gpu_test(num_gpus=2)
-def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
-                                model_name, description) -> None:
-
-    hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
-        load_in_4bit=True))
-    validate_generated_texts(hf_runner,
-                             vllm_runner,
-                             example_prompts[:1],
-                             model_name,
-                             False,
-                             hf_model_kwargs,
-                             vllm_tp_size=2)
+def test_load_tp_4bit_bnb_model(
+    hf_runner, vllm_runner, example_prompts, model_name, description
+) -> None:
+    hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(load_in_4bit=True))
+    validate_generated_texts(
+        hf_runner,
+        vllm_runner,
+        example_prompts[:1],
+        model_name,
+        False,
+        hf_model_kwargs,
+        vllm_tp_size=2,
+    )


-@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
-                    reason='bitsandbytes is not supported on this GPU type.')
+@pytest.mark.skipif(
+    not is_quant_method_supported("bitsandbytes"),
+    reason="bitsandbytes is not supported on this GPU type.",
+)
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@multi_gpu_test(num_gpus=2)
 def test_load_pp_4bit_bnb_model(model_name, description) -> None:
@@ -115,30 +129,37 @@ def test_load_pp_4bit_bnb_model(model_name, description) -> None:
    compare_two_settings(model_name, common_args, pp_args)


-@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
-                    reason='bitsandbytes is not supported on this GPU type.')
+@pytest.mark.skipif(
+    not is_quant_method_supported("bitsandbytes"),
+    reason="bitsandbytes is not supported on this GPU type.",
+)
@pytest.mark.parametrize("model_name, description", models_4bit_to_moe_test)
-def test_4bit_bnb_moe_model(hf_runner, vllm_runner, example_prompts,
-                            model_name, description) -> None:
+def test_4bit_bnb_moe_model(
+    hf_runner, vllm_runner, example_prompts, model_name, description
+) -> None:
+    hf_model_kwargs = dict(
+        quantization_config=BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_use_double_quant=True,
+        )
+    )
+    with vllm_runner(
+        model_name,
+        quantization="bitsandbytes",
+        enforce_eager=False,
+        default_torch_num_threads=1,
+    ) as llm:
+        vllm_outputs = llm.generate_greedy_logprobs(
+            example_prompts, max_tokens=32, num_logprobs=5
+        )

-    hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_quant_type="nf4",
-        bnb_4bit_use_double_quant=True,
-    ))
-    with vllm_runner(model_name,
-                     quantization='bitsandbytes',
-                     enforce_eager=False,
-                     default_torch_num_threads=1) as llm:
-        vllm_outputs = llm.generate_greedy_logprobs(example_prompts,
-                                                    max_tokens=32,
-                                                    num_logprobs=5)
-
-    with hf_runner(model_name,
-                   model_kwargs=hf_model_kwargs,
-                   default_torch_num_threads=1) as llm:
+    with hf_runner(
+        model_name, model_kwargs=hf_model_kwargs, default_torch_num_threads=1
+    ) as llm:
        transformers_outputs = llm.generate_greedy_logprobs_limit(
-            example_prompts, max_tokens=32, num_logprobs=5)
+            example_prompts, max_tokens=32, num_logprobs=5
+        )
    check_logprobs_close(
        outputs_0_lst=transformers_outputs,
        outputs_1_lst=vllm_outputs,
@@ -147,10 +168,11 @@ def test_4bit_bnb_moe_model(hf_runner, vllm_runner, example_prompts,
    )


-@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
-                    reason='bitsandbytes is not supported on this GPU type.')
-@pytest.mark.parametrize("model_name, description",
-                         models_4bit_to_embedding_test)
+@pytest.mark.skipif(
+    not is_quant_method_supported("bitsandbytes"),
+    reason="bitsandbytes is not supported on this GPU type.",
+)
+@pytest.mark.parametrize("model_name, description", models_4bit_to_embedding_test)
@pytest.mark.parametrize("dtype", ["half"])
 def test_4bit_bnb_embedding_model(
    model_name,
@@ -160,7 +182,6 @@ def test_4bit_bnb_embedding_model(
    example_prompts,
    dtype: str,
 ) -> None:
-
    # The example_prompts has ending "\n", for example:
    # "Write a short story about a robot that dreams for the first time.\n"
    # sentence_transformers will strip the input texts, see:
@@ -170,22 +191,23 @@ def test_4bit_bnb_embedding_model(
    example_prompts = [str(s).strip() for s in example_prompts]

    # Inflight 4bit quantization
-    with vllm_runner(model_name,
-                     runner="pooling",
-                     dtype=dtype,
-                     gpu_memory_utilization=0.5,
-                     quantization="bitsandbytes",
-                     default_torch_num_threads=1) as vllm_model:
+    with vllm_runner(
+        model_name,
+        runner="pooling",
+        dtype=dtype,
+        gpu_memory_utilization=0.5,
+        quantization="bitsandbytes",
+        default_torch_num_threads=1,
+    ) as vllm_model:
        vllm_outputs = vllm_model.embed(example_prompts)

-    hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
-        load_in_4bit=True))
+    hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(load_in_4bit=True))
    with hf_runner(
-            model_name,
-            dtype=dtype,
-            model_kwargs=hf_model_kwargs,
-            is_sentence_transformer=True,
-            default_torch_num_threads=1,
+        model_name,
+        dtype=dtype,
+        model_kwargs=hf_model_kwargs,
+        is_sentence_transformer=True,
+        default_torch_num_threads=1,
    ) as hf_model:
        hf_outputs = hf_model.encode(example_prompts)

@@ -210,23 +232,25 @@ def log_generated_texts(prompts, outputs, runner_name):
    return logged_texts


-def validate_generated_texts(hf_runner,
-                             vllm_runner,
-                             prompts,
-                             model_name,
-                             pre_quant=False,
-                             hf_model_kwargs=None,
-                             vllm_tp_size=1,
-                             max_tokens=8):
-
+def validate_generated_texts(
+    hf_runner,
+    vllm_runner,
+    prompts,
+    model_name,
+    pre_quant=False,
+    hf_model_kwargs=None,
+    vllm_tp_size=1,
+    max_tokens=8,
+):
    # NOTE: run vLLM first, as it requires a clean process
    # when using distributed inference
-    with vllm_runner(model_name,
-                     quantization=None if pre_quant else 'bitsandbytes',
-                     tensor_parallel_size=vllm_tp_size,
-                     enforce_eager=False,
-                     default_torch_num_threads=1) as llm:
-
+    with vllm_runner(
+        model_name,
+        quantization=None if pre_quant else "bitsandbytes",
+        tensor_parallel_size=vllm_tp_size,
+        enforce_eager=False,
+        default_torch_num_threads=1,
+    ) as llm:
        vllm_outputs = llm.generate_greedy(prompts, max_tokens)
        vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")

@@ -234,9 +258,9 @@ def validate_generated_texts(hf_runner,
        hf_model_kwargs = {}

    # Run with HF runner
-    with hf_runner(model_name,
-                   model_kwargs=hf_model_kwargs,
-                   default_torch_num_threads=1) as llm:
+    with hf_runner(
+        model_name, model_kwargs=hf_model_kwargs, default_torch_num_threads=1
+    ) as llm:
        hf_outputs = llm.generate_greedy(prompts, max_tokens)
        hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")

@@ -245,8 +269,10 @@ def validate_generated_texts(hf_runner,
        hf_str = hf_log["generated_text"]
        vllm_str = vllm_log["generated_text"]
        prompt = hf_log["prompt"]
-        assert hf_str == vllm_str, (f"Model: {model_name}"
-                                    f"Mismatch between HF and vLLM outputs:\n"
-                                    f"Prompt: {prompt}\n"
-                                    f"HF Output: '{hf_str}'\n"
-                                    f"vLLM Output: '{vllm_str}'")
+        assert hf_str == vllm_str, (
+            f"Model: {model_name}"
+            f"Mismatch between HF and vLLM outputs:\n"
+            f"Prompt: {prompt}\n"
+            f"HF Output: '{hf_str}'\n"
+            f"vLLM Output: '{vllm_str}'"
+        )
--- a/tests/models/quantization/test_fp8.py
+++ b/tests/models/quantization/test_fp8.py
@@ -5,6 +5,7 @@
 """Tests fp8 models against ground truth generation
 Note: these tests will only pass on L4 GPU.
 """
+
 import pytest

 from tests.quantization.utils import is_quant_method_supported
@@ -14,21 +15,33 @@ from vllm.utils import STR_BACKEND_ENV_VAR
 from ..utils import check_logprobs_close


-@pytest.mark.skipif(not is_quant_method_supported("fp8"),
-                    reason="fp8 is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not is_quant_method_supported("fp8"),
+    reason="fp8 is not supported on this GPU type.",
+)
@pytest.mark.parametrize(
    "kv_cache_dtype,base_model,test_model",
    [
        # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
-        ("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
-         "nm-testing/Llama-3.2-1B-Instruct-FP8-KV"),
+        (
+            "fp8_e4m3",
+            "meta-llama/Llama-3.2-1B-Instruct",
+            "nm-testing/Llama-3.2-1B-Instruct-FP8-KV",
+        ),
        # Test BF16 checkpoint w. fp8_e5m2 kv-cache.
-        ("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
-         "meta-llama/Llama-3.2-1B-Instruct"),
+        (
+            "fp8_e5m2",
+            "meta-llama/Llama-3.2-1B-Instruct",
+            "meta-llama/Llama-3.2-1B-Instruct",
+        ),
        # Test BF16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
-        ("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
-         "meta-llama/Llama-3.2-1B-Instruct")
-    ])
+        (
+            "fp8_e4m3",
+            "meta-llama/Llama-3.2-1B-Instruct",
+            "meta-llama/Llama-3.2-1B-Instruct",
+        ),
+    ],
+)
 # Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4])
@pytest.mark.parametrize("enforce_eager", [True])
@@ -54,38 +67,39 @@ def test_models(
    """

    if kv_cache_dtype == "fp8_e5m2" and current_platform.is_rocm():
-        pytest.skip(
-            f"{kv_cache_dtype} is currently not supported on ROCm/HIP.")
+        pytest.skip(f"{kv_cache_dtype} is currently not supported on ROCm/HIP.")

    if not current_platform.is_kv_cache_dtype_supported(kv_cache_dtype, None):
        pytest.skip(f"{kv_cache_dtype} is not supported on this platform.")

    with monkeypatch.context() as m:
-        m.setenv("TOKENIZERS_PARALLELISM", 'true')
+        m.setenv("TOKENIZERS_PARALLELISM", "true")
        m.setenv(STR_BACKEND_ENV_VAR, backend)

        MAX_MODEL_LEN = 1024
        NUM_LOG_PROBS = 8

        with vllm_runner(
-                base_model,
-                max_model_len=MAX_MODEL_LEN,
-                tensor_parallel_size=tensor_parallel_size,
-                enforce_eager=enforce_eager,
-                kv_cache_dtype="auto",
+            base_model,
+            max_model_len=MAX_MODEL_LEN,
+            tensor_parallel_size=tensor_parallel_size,
+            enforce_eager=enforce_eager,
+            kv_cache_dtype="auto",
        ) as vllm_model:
            baseline_outputs = vllm_model.generate_greedy_logprobs(
-                example_prompts, max_tokens, NUM_LOG_PROBS)
+                example_prompts, max_tokens, NUM_LOG_PROBS
+            )

        with vllm_runner(
-                test_model,
-                max_model_len=MAX_MODEL_LEN,
-                tensor_parallel_size=tensor_parallel_size,
-                enforce_eager=enforce_eager,
-                kv_cache_dtype=kv_cache_dtype,
+            test_model,
+            max_model_len=MAX_MODEL_LEN,
+            tensor_parallel_size=tensor_parallel_size,
+            enforce_eager=enforce_eager,
+            kv_cache_dtype=kv_cache_dtype,
        ) as vllm_model:
            test_outputs = vllm_model.generate_greedy_logprobs(
-                example_prompts, max_tokens, NUM_LOG_PROBS)
+                example_prompts, max_tokens, NUM_LOG_PROBS
+            )

        check_logprobs_close(
            outputs_0_lst=baseline_outputs,
@@ -96,15 +110,18 @@ def test_models(


@pytest.mark.cpu_model
-@pytest.mark.skipif(not current_platform.is_cpu(),
-                    reason="test for the CPU backend.")
+@pytest.mark.skipif(not current_platform.is_cpu(), reason="test for the CPU backend.")
@pytest.mark.parametrize(
    "kv_cache_dtype,base_model,test_model",
    [
        # Test BF16 checkpoint w. fp8_e5m2 kv-cache.
-        ("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
-         "meta-llama/Llama-3.2-1B-Instruct"),
-    ])
+        (
+            "fp8_e5m2",
+            "meta-llama/Llama-3.2-1B-Instruct",
+            "meta-llama/Llama-3.2-1B-Instruct",
+        ),
+    ],
+)
 # Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4])
 def test_cpu_models(
@@ -121,28 +138,30 @@ def test_cpu_models(
    numerical sensitive kernels.
    """
    with monkeypatch.context() as m:
-        m.setenv("TOKENIZERS_PARALLELISM", 'true')
+        m.setenv("TOKENIZERS_PARALLELISM", "true")

        MAX_MODEL_LEN = 1024
        NUM_LOG_PROBS = 8

        with vllm_runner(
-                base_model,
-                max_model_len=MAX_MODEL_LEN,
-                dtype="bfloat16",
-                kv_cache_dtype="auto",
+            base_model,
+            max_model_len=MAX_MODEL_LEN,
+            dtype="bfloat16",
+            kv_cache_dtype="auto",
        ) as vllm_model:
            baseline_outputs = vllm_model.generate_greedy_logprobs(
-                example_prompts, max_tokens, NUM_LOG_PROBS)
+                example_prompts, max_tokens, NUM_LOG_PROBS
+            )

        with vllm_runner(
-                test_model,
-                max_model_len=MAX_MODEL_LEN,
-                dtype="bfloat16",
-                kv_cache_dtype=kv_cache_dtype,
+            test_model,
+            max_model_len=MAX_MODEL_LEN,
+            dtype="bfloat16",
+            kv_cache_dtype=kv_cache_dtype,
        ) as vllm_model:
            test_outputs = vllm_model.generate_greedy_logprobs(
-                example_prompts, max_tokens, NUM_LOG_PROBS)
+                example_prompts, max_tokens, NUM_LOG_PROBS
+            )

        check_logprobs_close(
            outputs_0_lst=baseline_outputs,
--- a/tests/models/quantization/test_gguf.py
+++ b/tests/models/quantization/test_gguf.py
@@ -100,35 +100,37 @@ def check_model_outputs(
 ):
    tokenizer = AutoTokenizer.from_pretrained(model.original_model)
    if tokenizer.chat_template is not None:
-        messages = [[{
-            'role': 'user',
-            'content': prompt
-        }] for prompt in prompts]
-        prompts = tokenizer.apply_chat_template(messages,
-                                                tokenize=False,
-                                                add_generation_prompt=True)
+        messages = [[{"role": "user", "content": prompt}] for prompt in prompts]
+        prompts = tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )

    # Run gguf model.
-    with vllm_runner(model_name=model.gguf_model,
-                     enforce_eager=True,
-                     tokenizer_name=model.original_model,
-                     dtype=dtype,
-                     max_model_len=MAX_MODEL_LEN,
-                     tensor_parallel_size=tp_size) as gguf_model:
+    with vllm_runner(
+        model_name=model.gguf_model,
+        enforce_eager=True,
+        tokenizer_name=model.original_model,
+        dtype=dtype,
+        max_model_len=MAX_MODEL_LEN,
+        tensor_parallel_size=tp_size,
+    ) as gguf_model:
        gguf_outputs = gguf_model.generate_greedy_logprobs(
-            prompts[:-1], max_tokens, num_logprobs)
+            prompts[:-1], max_tokens, num_logprobs
+        )

    # Run unquantized model.
    # Should run with tp=1, otherwise the test will stuck at
    # nccl initialization.
    with vllm_runner(
-            model_name=model.original_model,
-            enforce_eager=True,  # faster tests
-            dtype=dtype,
-            max_model_len=MAX_MODEL_LEN,
-            tensor_parallel_size=1) as original_model:
+        model_name=model.original_model,
+        enforce_eager=True,  # faster tests
+        dtype=dtype,
+        max_model_len=MAX_MODEL_LEN,
+        tensor_parallel_size=1,
+    ) as original_model:
        original_outputs = original_model.generate_greedy_logprobs(
-            prompts[:-1], max_tokens, num_logprobs)
+            prompts[:-1], max_tokens, num_logprobs
+        )

    check_logprobs_close(
        outputs_0_lst=original_outputs,
@@ -138,12 +140,14 @@ def check_model_outputs(
    )


-@pytest.mark.skipif(not is_quant_method_supported("gguf"),
-                    reason="gguf is not supported on this GPU type.")
-@pytest.mark.parametrize("model", [
-    pytest.param(test_config, marks=test_config.marks)
-    for test_config in MODELS
-])
+@pytest.mark.skipif(
+    not is_quant_method_supported("gguf"),
+    reason="gguf is not supported on this GPU type.",
+)
+@pytest.mark.parametrize(
+    "model",
+    [pytest.param(test_config, marks=test_config.marks) for test_config in MODELS],
+)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
@@ -157,12 +161,15 @@ def test_models(
    num_logprobs: int,
    tp_size: int,
 ) -> None:
-    check_model_outputs(vllm_runner, example_prompts, model, dtype, max_tokens,
-                        num_logprobs, tp_size)
+    check_model_outputs(
+        vllm_runner, example_prompts, model, dtype, max_tokens, num_logprobs, tp_size
+    )


-@pytest.mark.skipif(not is_quant_method_supported("gguf"),
-                    reason="gguf is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not is_quant_method_supported("gguf"),
+    reason="gguf is not supported on this GPU type.",
+)
@pytest.mark.parametrize("model", [LLAMA_CONFIG])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [8])
@@ -178,5 +185,6 @@ def test_distributed(
    num_logprobs: int,
    tp_size: int,
 ) -> None:
-    check_model_outputs(vllm_runner, example_prompts, model, dtype, max_tokens,
-                        num_logprobs, tp_size)
+    check_model_outputs(
+        vllm_runner, example_prompts, model, dtype, max_tokens, num_logprobs, tp_size
+    )
--- a/tests/models/quantization/test_gptq_bitblas.py
+++ b/tests/models/quantization/test_gptq_bitblas.py
@@ -7,9 +7,10 @@ As a result, in this test, we just confirm that the top selected tokens of the
 bitblas/GPTQ models are in the top 3 selections of each other.

 Note: bitblas internally uses locks to synchronize the threads. This can
-result in very slight nondeterminism for bitblas. As a result, we re-run the 
+result in very slight nondeterminism for bitblas. As a result, we re-run the
 test up to 3 times to see if we pass.
 """
+
 from dataclasses import dataclass

 import pytest
@@ -41,16 +42,19 @@ def test_models(
    max_tokens: int,
    num_logprobs: int,
 ) -> None:
-    with vllm_runner(model_pair.model_gptq,
-                     dtype=dtype,
-                     quantization="bitblas") as bitblas_model:
+    with vllm_runner(
+        model_pair.model_gptq, dtype=dtype, quantization="bitblas"
+    ) as bitblas_model:
        bitblas_outputs = bitblas_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )

-    with vllm_runner(model_pair.model_gptq, dtype=dtype,
-                     quantization="gptq") as gptq_model:
+    with vllm_runner(
+        model_pair.model_gptq, dtype=dtype, quantization="gptq"
+    ) as gptq_model:
        gptq_outputs = gptq_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )

    check_logprobs_close(
        outputs_0_lst=gptq_outputs,
--- a/tests/models/quantization/test_gptq_marlin.py
+++ b/tests/models/quantization/test_gptq_marlin.py
@@ -9,6 +9,7 @@ Note: Marlin internally uses locks to synchronize the threads. This can
 result in very slight nondeterminism for Marlin. As a result, we re-run the test
 up to 3 times to see if we pass.
 """
+
 import os

 import pytest
@@ -26,20 +27,20 @@ MAX_MODEL_LEN = 1024
 MODELS = [
    # act_order==True, group_size=128
    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"),
-
    # 8-bit, act_order==True, group_size=channelwise
    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"),
-
    # 4-bit, act_order==True, group_size=128
-    ("TechxGenus/gemma-1.1-2b-it-GPTQ", "main")
+    ("TechxGenus/gemma-1.1-2b-it-GPTQ", "main"),
 ]


@pytest.mark.flaky(reruns=3)
-@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin")
-                    or current_platform.is_rocm()
-                    or not current_platform.is_cuda(),
-                    reason="gptq_marlin is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not is_quant_method_supported("gptq_marlin")
+    or current_platform.is_rocm()
+    or not current_platform.is_cuda(),
+    reason="gptq_marlin is not supported on this GPU type.",
+)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half", "bfloat16"])
@pytest.mark.parametrize("max_tokens", [32])
@@ -55,29 +56,34 @@ def test_models(
    model_name, revision = model

    # Run marlin.
-    with vllm_runner(model_name=model_name,
-                     revision=revision,
-                     dtype=dtype,
-                     quantization="marlin",
-                     max_model_len=MAX_MODEL_LEN,
-                     tensor_parallel_size=1) as gptq_marlin_model:
-
+    with vllm_runner(
+        model_name=model_name,
+        revision=revision,
+        dtype=dtype,
+        quantization="marlin",
+        max_model_len=MAX_MODEL_LEN,
+        tensor_parallel_size=1,
+    ) as gptq_marlin_model:
        gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs(
-            example_prompts[:-1], max_tokens, num_logprobs)
+            example_prompts[:-1], max_tokens, num_logprobs
+        )
    _ROPE_DICT.clear()  # clear rope cache to avoid rope dtype error

    # Run gptq.
    # The naive gptq kernel doesn't support bf16 yet.
    # Here we always compare fp16/bf16 gpt marlin kernel
    # to fp16 gptq kernel.
-    with vllm_runner(model_name=model_name,
-                     revision=revision,
-                     dtype="half",
-                     quantization="gptq",
-                     max_model_len=MAX_MODEL_LEN,
-                     tensor_parallel_size=1) as gptq_model:
+    with vllm_runner(
+        model_name=model_name,
+        revision=revision,
+        dtype="half",
+        quantization="gptq",
+        max_model_len=MAX_MODEL_LEN,
+        tensor_parallel_size=1,
+    ) as gptq_model:
        gptq_outputs = gptq_model.generate_greedy_logprobs(
-            example_prompts[:-1], max_tokens, num_logprobs)
+            example_prompts[:-1], max_tokens, num_logprobs
+        )

    check_logprobs_close(
        outputs_0_lst=gptq_outputs,
--- a/tests/models/quantization/test_gptq_marlin_24.py
+++ b/tests/models/quantization/test_gptq_marlin_24.py
@@ -6,6 +6,7 @@ Note: GPTQ and Marlin_24 do not have bitwise correctness.
 As a result, in this test, we just confirm that the top selected tokens of the
 Marlin/GPTQ models are in the top 3 selections of each other.
 """
+
 from dataclasses import dataclass

 import pytest
@@ -24,15 +25,18 @@ class ModelPair:

 model_pairs = [
    # 4-bit, group_size == 128
-    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128",
-              model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128"),
+    ModelPair(
+        model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128",
+        model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128",
+    ),
    # # 4-bit, group_size == channelwise
    # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
    #           model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),
-
    # 8-bit, group_size == 128
-    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128",
-              model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128"),
+    ModelPair(
+        model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128",
+        model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128",
+    ),
    # # 8-bit, group_size == channelwise
    # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
    #           model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
@@ -40,10 +44,12 @@ model_pairs = [


@pytest.mark.flaky(reruns=2)
-@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24")
-                    or current_platform.is_rocm()
-                    or not current_platform.is_cuda(),
-                    reason="Marlin24 is not supported on this GPU type.")
+@pytest.mark.skipif(
+    not is_quant_method_supported("gptq_marlin_24")
+    or current_platform.is_rocm()
+    or not current_platform.is_cuda(),
+    reason="Marlin24 is not supported on this GPU type.",
+)
@pytest.mark.parametrize("model_pair", model_pairs)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [8])
@@ -56,16 +62,19 @@ def test_models(
    max_tokens: int,
    num_logprobs: int,
 ) -> None:
-    with vllm_runner(model_pair.model_marlin,
-                     dtype=dtype,
-                     quantization="gptq_marlin_24") as marlin_24_model:
+    with vllm_runner(
+        model_pair.model_marlin, dtype=dtype, quantization="gptq_marlin_24"
+    ) as marlin_24_model:
        marlin_24_outputs = marlin_24_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )

-    with vllm_runner(model_pair.model_gptq, dtype=dtype,
-                     quantization="gptq") as gptq_model:
+    with vllm_runner(
+        model_pair.model_gptq, dtype=dtype, quantization="gptq"
+    ) as gptq_model:
        gptq_outputs = gptq_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+            example_prompts, max_tokens, num_logprobs
+        )

    check_logprobs_close(
        outputs_0_lst=gptq_outputs,
--- a/tests/models/quantization/test_modelopt.py
+++ b/tests/models/quantization/test_modelopt.py
@@ -5,6 +5,7 @@
 """Tests Model Optimizer fp8 models against ground truth generation
 Note: these tests will only pass on H100
 """
+
 import os

 import pytest
@@ -22,13 +23,13 @@ MODELS = ["nvidia/Llama-3.1-8B-Instruct-FP8"]
 EXPECTED_STRS_MAP = {
    "nvidia/Llama-3.1-8B-Instruct-FP8": [
        "You're referring to VLLM, a high-performance Large Language Model (LLM) inference and",
-        'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
-        'The comparison between artificial intelligence (AI) and human intelligence in terms of processing information is a complex and',
+        "Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ",
+        "The comparison between artificial intelligence (AI) and human intelligence in terms of processing information is a complex and",
        'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne',
-        '**The Spark of Imagination**\n\nZeta-5, a sleek and efficient robot, whir',
-        'The COVID-19 pandemic has had a profound impact on global economic structures and business models, leading to',
-        'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
-        'Here are the translations:\n\n**Japanese:** 「早起きは早く獲物をとる'
+        "**The Spark of Imagination**\n\nZeta-5, a sleek and efficient robot, whir",
+        "The COVID-19 pandemic has had a profound impact on global economic structures and business models, leading to",
+        "The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of",
+        "Here are the translations:\n\n**Japanese:** 「早起きは早く獲物をとる",
    ]
 }

@@ -39,10 +40,12 @@ EXPECTED_STRS_MAP = {
 # the hardware being run on.
 # Disabled to prevent it from breaking the build
@pytest.mark.skip(
-    reason=
-    "Prevent unstable test based on golden strings from breaking the build.")
-@pytest.mark.skipif(not is_quant_method_supported("fp8"),
-                    reason="fp8 is not supported on this GPU type.")
+    reason="Prevent unstable test based on golden strings from breaking the build."
+)
+@pytest.mark.skipif(
+    not is_quant_method_supported("fp8"),
+    reason="fp8 is not supported on this GPU type.",
+)
@pytest.mark.parametrize("model_name", MODELS)
 def test_models(example_prompts, model_name) -> None:
    llm = LLM(
@@ -55,12 +58,11 @@ def test_models(example_prompts, model_name) -> None:

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    formatted_prompts = [
-        tokenizer.apply_chat_template([{
-            "role": "user",
-            "content": prompt
-        }],
-                                      tokenize=False,
-                                      add_generation_prompt=True)
+        tokenizer.apply_chat_template(
+            [{"role": "user", "content": prompt}],
+            tokenize=False,
+            add_generation_prompt=True,
+        )
        for prompt in example_prompts
    ]
    params = SamplingParams(max_tokens=20, temperature=0)
@@ -78,4 +80,5 @@ def test_models(example_prompts, model_name) -> None:
        generated_str = generations[i]
        expected_str = expected_strs[i]
        assert expected_str == generated_str, (
-            f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")
+            f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}"
+        )
--- a/tests/models/quantization/test_mxfp4.py
+++ b/tests/models/quantization/test_mxfp4.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # flake8: noqa
-"""Tests Quark mxfp4 models against ground truth generation
-"""
+"""Tests Quark mxfp4 models against ground truth generation"""
+
 import pytest

 from vllm import LLM, SamplingParams
@@ -11,13 +11,13 @@ MODELS = ["amd/Llama-2-7b-chat-hf-wmxfp4-amxfp4-kvfp8-scale-uint8"]

 EXPECTED_STRS_MAP = {
    "amd/Llama-2-7b-chat-hf-wmxfp4-amxfp4-kvfp8-scale-uint8": [
-        '\n### Key Features\n\n* **High-throughput Inference**: vLL',
-        '\nArtificial intelligence (AI) has evolved significantly since its inception in the 1',
-        'Artificial intelligence (AI) and human intelligence (HI) are two distinct concepts that have been',
-        'A neural network is a machine learning model inspired by the structure of the human brain. It consists of',
-        '\nTitle: The Dreaming Robot\n\nAs the sun set on the bustling metropol',
-        '\nThe COVID-19 pandemic has had a profound impact on global economic structures and business',
-        'The Mona Lisa painting, created by Leonardo da Vinci in the early 16th',
+        "\n### Key Features\n\n* **High-throughput Inference**: vLL",
+        "\nArtificial intelligence (AI) has evolved significantly since its inception in the 1",
+        "Artificial intelligence (AI) and human intelligence (HI) are two distinct concepts that have been",
+        "A neural network is a machine learning model inspired by the structure of the human brain. It consists of",
+        "\nTitle: The Dreaming Robot\n\nAs the sun set on the bustling metropol",
+        "\nThe COVID-19 pandemic has had a profound impact on global economic structures and business",
+        "The Mona Lisa painting, created by Leonardo da Vinci in the early 16th",
        " everybody knows this proverbial saying, but did you know that it's not entirely accurate?",
    ]
 }
@@ -38,4 +38,5 @@ def test_models(example_prompts, model_name) -> None:
        output_str = output.outputs[0].text
        expected_str = EXPECTED_STRS_MAP[model_name][i]
        assert expected_str == output_str, (
-            f"Expected: {expected_str!r}\nvLLM: {output_str!r}")
+            f"Expected: {expected_str!r}\nvLLM: {output_str!r}"
+        )
--- a/tests/models/quantization/test_nvfp4.py
+++ b/tests/models/quantization/test_nvfp4.py
@@ -4,6 +4,7 @@
 """Tests Model Optimizer nvfp4 models against ground truth generation
 Note: these tests will only pass on B200
 """
+
 import os
 from typing import List

@@ -21,14 +22,14 @@ MODELS = ["nvidia/Llama-3.3-70B-Instruct-FP4"]

 EXPECTED_STRS_MAP = {
    "nvidia/Llama-3.3-70B-Instruct-FP4": [
-        'vLLM (Vectorized Large Language Model) is indeed a high-throughput and memory-efficient inference',
-        'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
-        'Artificial intelligence (AI) and human intelligence (HI) are two distinct forms of intelligence that process',
-        'A neural network is a type of machine learning model inspired by the structure and function of the human brain',
-        'In the heart of a cutting-edge robotics lab, a team of engineers had been working tirelessly to push',
-        'The COVID-19 pandemic has had a profound impact on global economic structures and future business models, leading',
-        'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
-        'Here are the translations:\n\n* Japanese: (Sasuga no tori ga miwa o ts'
+        "vLLM (Vectorized Large Language Model) is indeed a high-throughput and memory-efficient inference",
+        "Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ",
+        "Artificial intelligence (AI) and human intelligence (HI) are two distinct forms of intelligence that process",
+        "A neural network is a type of machine learning model inspired by the structure and function of the human brain",
+        "In the heart of a cutting-edge robotics lab, a team of engineers had been working tirelessly to push",
+        "The COVID-19 pandemic has had a profound impact on global economic structures and future business models, leading",
+        "The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of",
+        "Here are the translations:\n\n* Japanese: (Sasuga no tori ga miwa o ts",
    ]
 }

@@ -39,11 +40,13 @@ EXPECTED_STRS_MAP = {
 # the hardware being run on.
 # Disabled to prevent it from breaking the build
@pytest.mark.skip(
-    reason=
-    "Prevent unstable test based on golden strings from breaking the build "
-    " and test input model being too large and hanging the system.")
-@pytest.mark.skipif(not is_quant_method_supported("modelopt_fp4"),
-                    reason="modelopt_fp4 is not supported on this GPU type.")
+    reason="Prevent unstable test based on golden strings from breaking the build "
+    " and test input model being too large and hanging the system."
+)
+@pytest.mark.skipif(
+    not is_quant_method_supported("modelopt_fp4"),
+    reason="modelopt_fp4 is not supported on this GPU type.",
+)
@pytest.mark.parametrize("model_name", MODELS)
 def test_models(example_prompts, model_name) -> None:
    llm = LLM(
@@ -56,12 +59,11 @@ def test_models(example_prompts, model_name) -> None:

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    formatted_prompts = [
-        tokenizer.apply_chat_template([{
-            "role": "user",
-            "content": prompt
-        }],
-                                      tokenize=False,
-                                      add_generation_prompt=True)
+        tokenizer.apply_chat_template(
+            [{"role": "user", "content": prompt}],
+            tokenize=False,
+            add_generation_prompt=True,
+        )
        for prompt in example_prompts
    ]
    params = SamplingParams(max_tokens=20, temperature=0)
@@ -79,4 +81,5 @@ def test_models(example_prompts, model_name) -> None:
        generated_str = generations[i]
        expected_str = expected_strs[i]
        assert expected_str == generated_str, (
-            f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")
+            f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}"
+        )