Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-05 15:06:22 +01:00
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -10,12 +10,16 @@ import torch
 import torch.nn as nn
 from huggingface_hub import snapshot_download

-from vllm.distributed import (cleanup_dist_env_and_memory,
-                              init_distributed_environment,
-                              initialize_model_parallel)
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
-                                               MergedColumnParallelLinear,
-                                               RowParallelLinear)
+from vllm.distributed import (
+    cleanup_dist_env_and_memory,
+    init_distributed_environment,
+    initialize_model_parallel,
+)
+from vllm.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    MergedColumnParallelLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.models.interfaces import SupportsLoRA
@@ -47,11 +51,13 @@ def dist_init():
    if current_platform.is_cpu() or current_platform.is_tpu():
        backend = "gloo"

-    init_distributed_environment(world_size=1,
-                                 rank=0,
-                                 distributed_init_method=f"file://{temp_file}",
-                                 local_rank=0,
-                                 backend=backend)
+    init_distributed_environment(
+        world_size=1,
+        rank=0,
+        distributed_init_method=f"file://{temp_file}",
+        local_rank=0,
+        backend=backend,
+    )
    initialize_model_parallel(1, 1)
    yield
    cleanup_dist_env_and_memory(shutdown_ray=True)
@@ -66,10 +72,9 @@ def dist_init_torch_only():
        backend = "gloo"

    temp_file = tempfile.mkstemp()[1]
-    torch.distributed.init_process_group(world_size=1,
-                                         rank=0,
-                                         init_method=f"file://{temp_file}",
-                                         backend=backend)
+    torch.distributed.init_process_group(
+        world_size=1, rank=0, init_method=f"file://{temp_file}", backend=backend
+    )


 class DummyLoRAModel(nn.Sequential, SupportsLoRA):
@@ -79,24 +84,30 @@ class DummyLoRAModel(nn.Sequential, SupportsLoRA):
@pytest.fixture
 def dummy_model() -> nn.Module:
    model = DummyLoRAModel(
-        OrderedDict([
-            ("dense1", ColumnParallelLinear(764, 100)),
-            ("dense2", RowParallelLinear(100, 50)),
-            (
-                "layer1",
-                nn.Sequential(
-                    OrderedDict([
-                        ("dense1", ColumnParallelLinear(100, 10)),
-                        ("dense2", RowParallelLinear(10, 50)),
-                    ])),
-            ),
-            ("act2", nn.ReLU()),
-            ("output", ColumnParallelLinear(50, 10)),
-            ("outact", nn.Sigmoid()),
-            # Special handling for lm_head & sampler
-            ("lm_head", ParallelLMHead(512, 10)),
-            ("logits_processor", LogitsProcessor(512)),
-        ]))
+        OrderedDict(
+            [
+                ("dense1", ColumnParallelLinear(764, 100)),
+                ("dense2", RowParallelLinear(100, 50)),
+                (
+                    "layer1",
+                    nn.Sequential(
+                        OrderedDict(
+                            [
+                                ("dense1", ColumnParallelLinear(100, 10)),
+                                ("dense2", RowParallelLinear(10, 50)),
+                            ]
+                        )
+                    ),
+                ),
+                ("act2", nn.ReLU()),
+                ("output", ColumnParallelLinear(50, 10)),
+                ("outact", nn.Sigmoid()),
+                # Special handling for lm_head & sampler
+                ("lm_head", ParallelLMHead(512, 10)),
+                ("logits_processor", LogitsProcessor(512)),
+            ]
+        )
+    )
    model.config = MagicMock()
    model.embedding_modules = {"lm_head": "lm_head"}
    model.unpadded_vocab_size = 32000
@@ -106,24 +117,30 @@ def dummy_model() -> nn.Module:
@pytest.fixture
 def dummy_model_gate_up() -> nn.Module:
    model = DummyLoRAModel(
-        OrderedDict([
-            ("dense1", ColumnParallelLinear(764, 100)),
-            ("dense2", RowParallelLinear(100, 50)),
-            (
-                "layer1",
-                nn.Sequential(
-                    OrderedDict([
-                        ("dense1", ColumnParallelLinear(100, 10)),
-                        ("dense2", RowParallelLinear(10, 50)),
-                    ])),
-            ),
-            ("act2", nn.ReLU()),
-            ("gate_up_proj", MergedColumnParallelLinear(50, [5, 5])),
-            ("outact", nn.Sigmoid()),
-            # Special handling for lm_head & sampler
-            ("lm_head", ParallelLMHead(512, 10)),
-            ("logits_processor", LogitsProcessor(512)),
-        ]))
+        OrderedDict(
+            [
+                ("dense1", ColumnParallelLinear(764, 100)),
+                ("dense2", RowParallelLinear(100, 50)),
+                (
+                    "layer1",
+                    nn.Sequential(
+                        OrderedDict(
+                            [
+                                ("dense1", ColumnParallelLinear(100, 10)),
+                                ("dense2", RowParallelLinear(10, 50)),
+                            ]
+                        )
+                    ),
+                ),
+                ("act2", nn.ReLU()),
+                ("gate_up_proj", MergedColumnParallelLinear(50, [5, 5])),
+                ("outact", nn.Sigmoid()),
+                # Special handling for lm_head & sampler
+                ("lm_head", ParallelLMHead(512, 10)),
+                ("logits_processor", LogitsProcessor(512)),
+            ]
+        )
+    )
    model.config = MagicMock()
    model.packed_modules_mapping = {
        "gate_up_proj": [
--- a/tests/lora/test_add_lora.py
+++ b/tests/lora/test_add_lora.py
@@ -7,7 +7,8 @@ import pytest

 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.entrypoints.openai.api_server import (
-    build_async_engine_client_from_engine_args)
+    build_async_engine_client_from_engine_args,
+)
 from vllm.inputs import TextPrompt
 from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingParams
@@ -26,14 +27,10 @@ def get_lora_requests(lora_path) -> list[LoRARequest]:
    return lora_requests


-async def requests_processing_time(llm,
-                                   lora_requests: list[LoRARequest]) -> float:
-
-    sampling_params = SamplingParams(n=1,
-                                     temperature=0.0,
-                                     top_p=1.0,
-                                     ignore_eos=True,
-                                     max_tokens=1)
+async def requests_processing_time(llm, lora_requests: list[LoRARequest]) -> float:
+    sampling_params = SamplingParams(
+        n=1, temperature=0.0, top_p=1.0, ignore_eos=True, max_tokens=1
+    )

    generators = []
    start = time.perf_counter()
@@ -41,11 +38,11 @@ async def requests_processing_time(llm,
    for lora_request in lora_requests:
        lora_int_id = lora_request.lora_int_id
        generator = llm.generate(
-            prompt=TextPrompt(prompt=f"hello {lora_int_id}",
-                              multi_modal_data=None),  # type: ignore 
+            prompt=TextPrompt(prompt=f"hello {lora_int_id}", multi_modal_data=None),  # type: ignore
            sampling_params=sampling_params,
            lora_request=lora_request,
-            request_id=f"test{lora_int_id}")
+            request_id=f"test{lora_int_id}",
+        )
        generators.append(generator)

    all_gens = merge_async_iterators(*generators)
@@ -58,13 +55,13 @@ async def requests_processing_time(llm,

@pytest.mark.asyncio
 async def test_add_lora(chatglm3_lora_files):
-    """ 
+    """
    The add_lora function is used to preload some LoRA adapters into the
    engine in anticipation of future requests using these adapters. To test
    this functionality, we use the async engine to process some requests - We
    do it twice, once with add_lora() preloading and once without.

-    We measure the request processing time in both cases and expect the time 
+    We measure the request processing time in both cases and expect the time
    to be lesser in the case with add_lora() calls.
    """
    lora_requests: list[LoRARequest] = get_lora_requests(chatglm3_lora_files)
@@ -78,18 +75,18 @@ async def test_add_lora(chatglm3_lora_files):
        max_loras=max_loras,
        max_lora_rank=LORA_RANK,
        max_model_len=128,
-        gpu_memory_utilization=0.8,  #avoid OOM
+        gpu_memory_utilization=0.8,  # avoid OOM
        trust_remote_code=True,
-        enforce_eager=True)
+        enforce_eager=True,
+    )

    # split lora_requests into 3 parts
    part_size = len(lora_requests) // 3
    dummy_run_requests = lora_requests[:part_size]
-    warmup_run_requests = lora_requests[part_size:part_size * 2]
-    cold_run_requests = lora_requests[part_size * 2:]
+    warmup_run_requests = lora_requests[part_size : part_size * 2]
+    cold_run_requests = lora_requests[part_size * 2 :]

    async with build_async_engine_client_from_engine_args(engine_args) as llm:
-
        # Dummy run - So any 1-time functionality like triton kernel compilation
        # is complete here.
        await requests_processing_time(llm, dummy_run_requests)
@@ -101,18 +98,16 @@ async def test_add_lora(chatglm3_lora_files):
        # Test that all all_lora calls are successful.
        assert all(add_lora_results)

-        time_with_add_lora = await requests_processing_time(
-            llm, warmup_run_requests)
+        time_with_add_lora = await requests_processing_time(llm, warmup_run_requests)

        # Run without any warmup
-        time_cold_start = await requests_processing_time(
-            llm, cold_run_requests)
+        time_cold_start = await requests_processing_time(llm, cold_run_requests)

-    print(f"time hot-start {time_with_add_lora} vs "
-          f"time cold-start {time_cold_start} ")
+    print(f"time hot-start {time_with_add_lora} vs time cold-start {time_cold_start} ")

    assert time_with_add_lora < time_cold_start, (
        f"time_with_add_lora={time_with_add_lora}, "
        f"time_cold_start={time_cold_start}"
        "The engine request processing time with LoRA pre-loading "
-        "must be less than the version that does on-demand LoRA loading.")
+        "must be less than the version that does on-demand LoRA loading."
+    )
--- a/tests/lora/test_chatglm3_tp.py
+++ b/tests/lora/test_chatglm3_tp.py
@@ -21,20 +21,18 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    prompts = [
        PROMPT_TEMPLATE.format(query="How many singers do we have?"),
        PROMPT_TEMPLATE.format(
-            query=
-            "What is the average, minimum, and maximum age of all singers from France?"  # noqa: E501
+            query="What is the average, minimum, and maximum age of all singers from France?"  # noqa: E501
        ),
        PROMPT_TEMPLATE.format(
-            query=
-            "Show name, country, age for all singers ordered by age from the oldest to the youngest."  # noqa: E501
+            query="Show name, country, age for all singers ordered by age from the oldest to the youngest."  # noqa: E501
        ),
    ]
    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
    outputs = llm.generate(
        prompts,
        sampling_params,
-        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
-        if lora_id else None)
+        lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
+    )
    # Print the outputs.
    generated_texts: list[str] = []
    for output in outputs:
@@ -47,13 +45,15 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:

@create_new_process_for_each_test()
 def test_chatglm3_lora(chatglm3_lora_files):
-    llm = vllm.LLM(MODEL_PATH,
-                   max_model_len=1024,
-                   enable_lora=True,
-                   max_loras=4,
-                   max_lora_rank=64,
-                   trust_remote_code=True,
-                   enable_chunked_prefill=True)
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_model_len=1024,
+        enable_lora=True,
+        max_loras=4,
+        max_lora_rank=64,
+        trust_remote_code=True,
+        enable_chunked_prefill=True,
+    )

    output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
    for i in range(len(EXPECTED_LORA_OUTPUT)):
@@ -66,15 +66,17 @@ def test_chatglm3_lora(chatglm3_lora_files):
@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
 def test_chatglm3_lora_tp4(chatglm3_lora_files):
-    llm = vllm.LLM(MODEL_PATH,
-                   max_model_len=1024,
-                   enable_lora=True,
-                   max_loras=4,
-                   max_lora_rank=64,
-                   tensor_parallel_size=4,
-                   trust_remote_code=True,
-                   fully_sharded_loras=False,
-                   enable_chunked_prefill=True)
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_model_len=1024,
+        enable_lora=True,
+        max_loras=4,
+        max_lora_rank=64,
+        tensor_parallel_size=4,
+        trust_remote_code=True,
+        fully_sharded_loras=False,
+        enable_chunked_prefill=True,
+    )

    output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
    for i in range(len(EXPECTED_LORA_OUTPUT)):
@@ -90,16 +92,18 @@ def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
    # https://github.com/NVIDIA/nccl/issues/1790, set a lower value for
    # gpu_memory_utilization here because NCCL >= 2.26.3 seems to use
    # more GPU memory causing vLLM to OOM
-    llm = vllm.LLM(MODEL_PATH,
-                   max_model_len=1024,
-                   enable_lora=True,
-                   max_loras=4,
-                   max_lora_rank=64,
-                   tensor_parallel_size=4,
-                   trust_remote_code=True,
-                   fully_sharded_loras=True,
-                   enable_chunked_prefill=True,
-                   gpu_memory_utilization=0.85)
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_model_len=1024,
+        enable_lora=True,
+        max_loras=4,
+        max_lora_rank=64,
+        tensor_parallel_size=4,
+        trust_remote_code=True,
+        fully_sharded_loras=True,
+        enable_chunked_prefill=True,
+        gpu_memory_utilization=0.85,
+    )
    output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
    for i in range(len(EXPECTED_LORA_OUTPUT)):
        assert output1[i] == EXPECTED_LORA_OUTPUT[i]
--- a/tests/lora/test_default_mm_loras.py
+++ b/tests/lora/test_default_mm_loras.py
@@ -32,15 +32,12 @@ VLLM_RUNNER_BASE_KWARGS = {
    "max_lora_rank": 320,
    "max_model_len": 12800,
    "gpu_memory_utilization": 0.8,
-    "limit_mm_per_prompt": {
-        "audio": 1
-    },
+    "limit_mm_per_prompt": {"audio": 1},
    "enforce_eager": True,
 }


-def run_test(vllm_runner, audio_assets, lora_request, expected_suffix,
-             **kwargs):
+def run_test(vllm_runner, audio_assets, lora_request, expected_suffix, **kwargs):
    inputs = [([AUDIO_PROMPT], [audio_assets[0].audio_and_sample_rate[0]])]

    # Apply any additional kwargs as overrides to the base kwargs
@@ -53,11 +50,11 @@ def run_test(vllm_runner, audio_assets, lora_request, expected_suffix,
                max_tokens=128,
                audios=audios,
                lora_request=lora_request,
-            ) for prompts, audios in inputs
+            )
+            for prompts, audios in inputs
        ]

-        assert vllm_outputs_with_default_lora[-1][-1][-1].endswith(
-            expected_suffix)
+        assert vllm_outputs_with_default_lora[-1][-1][-1].endswith(expected_suffix)


 def test_active_default_mm_lora(
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
--- a/tests/lora/test_llama_tp.py
+++ b/tests/lora/test_llama_tp.py
@@ -19,27 +19,28 @@ EXPECTED_LORA_OUTPUT = [
    "  SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ",  # noqa: E501
    "  SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ",  # noqa: E501
    "  SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ",  # noqa: E501
-    "  SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' "  # noqa: E501
+    "  SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' ",  # noqa: E501
 ]


-def do_sample(llm: vllm.LLM,
-              lora_path: str,
-              lora_id: int,
-              tensorizer_config_dict: Union[dict, None] = None) -> list[str]:
+def do_sample(
+    llm: vllm.LLM,
+    lora_path: str,
+    lora_id: int,
+    tensorizer_config_dict: Union[dict, None] = None,
+) -> list[str]:
    prompts = [
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]",  # noqa: E501
-        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]"  # noqa: E501
+        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]",  # noqa: E501
    ]

-    sampling_params = vllm.SamplingParams(temperature=0,
-                                          max_tokens=256,
-                                          skip_special_tokens=False,
-                                          stop=["[/assistant]"])
+    sampling_params = vllm.SamplingParams(
+        temperature=0, max_tokens=256, skip_special_tokens=False, stop=["[/assistant]"]
+    )

    if tensorizer_config_dict is not None:
        outputs = llm.generate(
@@ -49,14 +50,19 @@ def do_sample(llm: vllm.LLM,
                str(lora_id),
                lora_id,
                lora_path,
-                tensorizer_config_dict=tensorizer_config_dict)
-            if lora_id else None)
+                tensorizer_config_dict=tensorizer_config_dict,
+            )
+            if lora_id
+            else None,
+        )
    else:
        outputs = llm.generate(
            prompts,
            sampling_params,
            lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
-            if lora_id else None)
+            if lora_id
+            else None,
+        )
    # Print the outputs.
    generated_texts: list[str] = []
    for output in outputs:
@@ -67,42 +73,51 @@ def do_sample(llm: vllm.LLM,
    return generated_texts


-def generate_and_test(llm,
-                      sql_lora_files,
-                      tensorizer_config_dict: Union[dict, None] = None):
+def generate_and_test(
+    llm, sql_lora_files, tensorizer_config_dict: Union[dict, None] = None
+):
    print("lora adapter created")
    print("lora 1")
-    assert do_sample(llm,
-                     sql_lora_files,
-                     tensorizer_config_dict=tensorizer_config_dict,
-                     lora_id=1) == EXPECTED_LORA_OUTPUT
+    assert (
+        do_sample(
+            llm,
+            sql_lora_files,
+            tensorizer_config_dict=tensorizer_config_dict,
+            lora_id=1,
+        )
+        == EXPECTED_LORA_OUTPUT
+    )

    print("lora 2")
-    assert do_sample(llm,
-                     sql_lora_files,
-                     tensorizer_config_dict=tensorizer_config_dict,
-                     lora_id=2) == EXPECTED_LORA_OUTPUT
+    assert (
+        do_sample(
+            llm,
+            sql_lora_files,
+            tensorizer_config_dict=tensorizer_config_dict,
+            lora_id=2,
+        )
+        == EXPECTED_LORA_OUTPUT
+    )

    print("removing lora")


@create_new_process_for_each_test()
 def test_llama_lora(sql_lora_files):
-
    llm = vllm.LLM(
        MODEL_PATH,
        tokenizer=sql_lora_files,
        enable_lora=True,
        # also test odd max_num_seqs
        max_num_seqs=13,
-        max_loras=4)
+        max_loras=4,
+    )
    generate_and_test(llm, sql_lora_files)


@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
 def test_llama_lora_tp4(sql_lora_files):
-
    llm = vllm.LLM(
        MODEL_PATH,
        tokenizer=sql_lora_files,
@@ -117,7 +132,6 @@ def test_llama_lora_tp4(sql_lora_files):
@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
 def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
-
    llm = vllm.LLM(
        MODEL_PATH,
        tokenizer=sql_lora_files,
@@ -132,9 +146,9 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):

@multi_gpu_test(num_gpus=2)
@create_new_process_for_each_test()
-def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
-                                            sql_lora_huggingface_id):
-
+def test_tp2_serialize_and_deserialize_lora(
+    tmp_path, sql_lora_files, sql_lora_huggingface_id
+):
    # Run the tensorizing of the LoRA adapter and the model in a subprocess
    # to guarantee cleanup

@@ -145,17 +159,28 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
    lora_path = sql_lora_huggingface_id
    suffix = "test"
    try:
-        result = subprocess.run([
-            sys.executable,
-            f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py", "--model",
-            MODEL_PATH, "--lora-path", lora_path, "--tensor-parallel-size",
-            str(tp_size), "serialize", "--serialized-directory",
-            str(tmp_path), "--suffix", suffix, "--serialization-kwargs",
-            '{"limit_cpu_concurrency": 4}'
-        ],
-                                check=True,
-                                capture_output=True,
-                                text=True)
+        result = subprocess.run(
+            [
+                sys.executable,
+                f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py",
+                "--model",
+                MODEL_PATH,
+                "--lora-path",
+                lora_path,
+                "--tensor-parallel-size",
+                str(tp_size),
+                "serialize",
+                "--serialized-directory",
+                str(tmp_path),
+                "--suffix",
+                suffix,
+                "--serialization-kwargs",
+                '{"limit_cpu_concurrency": 4}',
+            ],
+            check=True,
+            capture_output=True,
+            text=True,
+        )
    except subprocess.CalledProcessError as e:
        print("Tensorizing failed.")
        print("STDOUT:\n", e.stdout)
@@ -167,21 +192,25 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
    model_uri = tmp_path / "vllm" / model_ref / suffix / model_name
    tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri))

-    loaded_llm = LLM(model=model_ref,
-                     tokenizer=sql_lora_files,
-                     load_format="tensorizer",
-                     enable_lora=True,
-                     enforce_eager=True,
-                     model_loader_extra_config=tensorizer_config,
-                     max_num_seqs=13,
-                     tensor_parallel_size=2,
-                     max_loras=2)
+    loaded_llm = LLM(
+        model=model_ref,
+        tokenizer=sql_lora_files,
+        load_format="tensorizer",
+        enable_lora=True,
+        enforce_eager=True,
+        model_loader_extra_config=tensorizer_config,
+        max_num_seqs=13,
+        tensor_parallel_size=2,
+        max_loras=2,
+    )

    tc_as_dict = tensorizer_config.to_serializable()

    print("lora adapter created")
    print("lora 1")
-    assert do_sample(loaded_llm,
-                     sql_lora_files,
-                     tensorizer_config_dict=tc_as_dict,
-                     lora_id=1) == EXPECTED_LORA_OUTPUT
+    assert (
+        do_sample(
+            loaded_llm, sql_lora_files, tensorizer_config_dict=tc_as_dict, lora_id=1
+        )
+        == EXPECTED_LORA_OUTPUT
+    )
--- a/tests/lora/test_llm_with_multi_loras.py
+++ b/tests/lora/test_llm_with_multi_loras.py
@@ -5,6 +5,7 @@ This script contains:
 1. test multi loras service with tp >= 2
 2. test multi loras request
 """
+
 import pytest

 from tests.utils import multi_gpu_test
@@ -31,14 +32,8 @@ LORA_TEST_EXPECTED = [

 def format_chatml_messages(prompt: str):
    return [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant."
-        },
-        {
-            "role": "user",
-            "content": prompt
-        },
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": prompt},
    ]


@@ -57,7 +52,6 @@ def make_add_lora_request(name: str, path: str):

@multi_gpu_test(num_gpus=2)
 def test_multi_loras_with_tp_sync():
-
    llm = LLM(
        model=MODEL_PATH,
        enable_lora=True,
@@ -116,15 +110,17 @@ def test_multi_loras_with_tp_sync():

    def reload_lora(name: str):
        """
-        reload a lora to simulate the case: 
-        setting `VLLM_ALLOW_RUNTIME_LORA_UPDATING=true` 
+        reload a lora to simulate the case:
+        setting `VLLM_ALLOW_RUNTIME_LORA_UPDATING=true`
        for dynamic lora loading and unloading
        """
        remove_lora_response = llm.llm_engine.remove_lora(
-            lora_id=LORA_NAME_ID_MAP[name])
+            lora_id=LORA_NAME_ID_MAP[name]
+        )

        add_lora_response = llm.llm_engine.add_lora(
-            make_add_lora_request(name, LORA_NAME_PATH_MAP[name]))
+            make_add_lora_request(name, LORA_NAME_PATH_MAP[name])
+        )

        print(f"{remove_lora_response=}, {add_lora_response=}")

@@ -134,7 +130,6 @@ def test_multi_loras_with_tp_sync():
        assert outputs == expected

    for prompt, expected_output in zip(LORA_TEST_PROMPTS, LORA_TEST_EXPECTED):
-
        output_text = call_llm_get_outputs(prompt, "Alice")
        check_outputs(output_text, expected_output)

@@ -175,8 +170,7 @@ def test_multiple_lora_requests():
    PROMPTS = ["Hello, my name is"] * 2
    LORA_NAME = "Alice"
    lora_request = [
-        LoRARequest(LORA_NAME + str(idx), idx + 1,
-                    LORA_NAME_PATH_MAP[LORA_NAME])
+        LoRARequest(LORA_NAME + str(idx), idx + 1, LORA_NAME_PATH_MAP[LORA_NAME])
        for idx in range(len(PROMPTS))
    ]
    # Multiple SamplingParams should be matched with each prompt
--- a/tests/lora/test_lora_checkpoints.py
+++ b/tests/lora/test_lora_checkpoints.py
@@ -8,9 +8,7 @@ from vllm.lora.peft_helper import PEFTHelper
 from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM
 from vllm.model_executor.models.utils import WeightsMapper

-lora_lst = [
-    "baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b"
-]
+lora_lst = ["baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b"]
 BAICHUAN_LORA_MODULES = [
    "W_pack",
    "o_proj",
@@ -37,8 +35,9 @@ def test_load_checkpoints(
        else:
            expected_lora_modules.append(module)
    if lora_name == "baichuan7B":
-        peft_helper = PEFTHelper.from_local_dir(baichuan_lora_files,
-                                                max_position_embeddings=4096)
+        peft_helper = PEFTHelper.from_local_dir(
+            baichuan_lora_files, max_position_embeddings=4096
+        )
        # For the baichuan7B model, load it's LoRA,
        # and the test should pass.
        LoRAModel.from_local_checkpoint(
@@ -48,13 +47,15 @@ def test_load_checkpoints(
            lora_model_id=1,
            device="cpu",
            embedding_modules=embedding_modules,
-            embedding_padding_modules=embed_padding_modules)
+            embedding_padding_modules=embed_padding_modules,
+        )
    elif lora_name == "baichuan7B-zero":
        # Test that the target_modules contain prefix
        # such as "model.layers.0.self_atten.W_pack", and
        # the test should pass.
-        peft_helper = PEFTHelper.from_local_dir(baichuan_zero_lora_files,
-                                                max_position_embeddings=4096)
+        peft_helper = PEFTHelper.from_local_dir(
+            baichuan_zero_lora_files, max_position_embeddings=4096
+        )
        LoRAModel.from_local_checkpoint(
            baichuan_zero_lora_files,
            expected_lora_modules,
@@ -62,12 +63,14 @@ def test_load_checkpoints(
            lora_model_id=1,
            device="cpu",
            embedding_modules=embedding_modules,
-            embedding_padding_modules=embed_padding_modules)
+            embedding_padding_modules=embed_padding_modules,
+        )
    elif lora_name == "baichuan7B-zero-regex":
        # Test that the `target_modules` in the form of regular expressions,
        # such as `model\\..*(W_pack|o_proj)`, and the test should pass.
-        peft_helper = PEFTHelper.from_local_dir(baichuan_regex_lora_files,
-                                                max_position_embeddings=4096)
+        peft_helper = PEFTHelper.from_local_dir(
+            baichuan_regex_lora_files, max_position_embeddings=4096
+        )
        LoRAModel.from_local_checkpoint(
            baichuan_regex_lora_files,
            expected_lora_modules,
@@ -75,13 +78,15 @@ def test_load_checkpoints(
            lora_model_id=1,
            device="cpu",
            embedding_modules=embedding_modules,
-            embedding_padding_modules=embed_padding_modules)
+            embedding_padding_modules=embed_padding_modules,
+        )
    else:
        # For the baichuan7B model, load chatglm3-6b's LoRA,
        # and the test should raise the following error.
        expected_error = "Please verify that the loaded LoRA module is correct"  # noqa: E501
-        peft_helper = PEFTHelper.from_local_dir(chatglm3_lora_files,
-                                                max_position_embeddings=4096)
+        peft_helper = PEFTHelper.from_local_dir(
+            chatglm3_lora_files, max_position_embeddings=4096
+        )
        with pytest.raises(ValueError, match=expected_error):
            LoRAModel.from_local_checkpoint(
                chatglm3_lora_files,
@@ -90,11 +95,11 @@ def test_load_checkpoints(
                lora_model_id=1,
                device="cpu",
                embedding_modules=embedding_modules,
-                embedding_padding_modules=embed_padding_modules)
+                embedding_padding_modules=embed_padding_modules,
+            )


 def test_lora_weights_mapping(baichuan_lora_files):
-
    packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
    embedding_modules = BaiChuanBaseForCausalLM.embedding_modules
    embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules
@@ -113,8 +118,9 @@ def test_lora_weights_mapping(baichuan_lora_files):
            ".layers.": ".baichuan_layers.",
        },
    )
-    peft_helper = PEFTHelper.from_local_dir(baichuan_lora_files,
-                                            max_position_embeddings=4096)
+    peft_helper = PEFTHelper.from_local_dir(
+        baichuan_lora_files, max_position_embeddings=4096
+    )
    lora_model = LoRAModel.from_local_checkpoint(
        baichuan_lora_files,
        expected_lora_modules,
--- a/tests/lora/test_lora_functions.py
+++ b/tests/lora/test_lora_functions.py
@@ -3,11 +3,13 @@
 """
 Script to test add_lora, remove_lora, pin_lora, list_loras functions.
 """
+
 import pytest

 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.entrypoints.openai.api_server import (
-    build_async_engine_client_from_engine_args)
+    build_async_engine_client_from_engine_args,
+)
 from vllm.lora.request import LoRARequest
 from vllm.v1.engine.llm_engine import LLMEngine

@@ -17,23 +19,24 @@ LORA_RANK = 8


 def make_lora_request(lora_id: int):
-    return LoRARequest(lora_name=f"{lora_id}",
-                       lora_int_id=lora_id,
-                       lora_path=LORA_MODULE_PATH)
+    return LoRARequest(
+        lora_name=f"{lora_id}", lora_int_id=lora_id, lora_path=LORA_MODULE_PATH
+    )


 def test_lora_functions_sync():
-
    max_loras = 4
    # Create engine in eager-mode. Due to high max_loras, the CI can
    # OOM during cuda-graph capture.
-    engine_args = EngineArgs(model=MODEL_PATH,
-                             enable_lora=True,
-                             max_loras=max_loras,
-                             max_lora_rank=LORA_RANK,
-                             max_model_len=128,
-                             gpu_memory_utilization=0.8,
-                             enforce_eager=True)
+    engine_args = EngineArgs(
+        model=MODEL_PATH,
+        enable_lora=True,
+        max_loras=max_loras,
+        max_lora_rank=LORA_RANK,
+        max_model_len=128,
+        gpu_memory_utilization=0.8,
+        enforce_eager=True,
+    )

    llm = LLMEngine.from_engine_args(engine_args)

@@ -70,15 +73,16 @@ def test_lora_functions_sync():

@pytest.mark.asyncio
 async def test_lora_functions_async():
-
    max_loras = 4
-    engine_args = AsyncEngineArgs(model=MODEL_PATH,
-                                  enable_lora=True,
-                                  max_loras=max_loras,
-                                  max_lora_rank=LORA_RANK,
-                                  max_model_len=128,
-                                  gpu_memory_utilization=0.8,
-                                  enforce_eager=True)
+    engine_args = AsyncEngineArgs(
+        model=MODEL_PATH,
+        enable_lora=True,
+        max_loras=max_loras,
+        max_lora_rank=LORA_RANK,
+        max_model_len=128,
+        gpu_memory_utilization=0.8,
+        enforce_eager=True,
+    )

    async def run_check(fn, args, expected: list):
        await fn(args)
--- a/tests/lora/test_lora_huggingface.py
+++ b/tests/lora/test_lora_huggingface.py
@@ -11,8 +11,12 @@ from vllm.model_executor.models.llama import LlamaForCausalLM
 # Provide absolute path and huggingface lora ids
 lora_fixture_name = ["sql_lora_files", "sql_lora_huggingface_id"]
 LLAMA_LORA_MODULES = [
-    "qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens",
-    "lm_head"
+    "qkv_proj",
+    "o_proj",
+    "gate_up_proj",
+    "down_proj",
+    "embed_tokens",
+    "lm_head",
 ]


@@ -40,7 +44,8 @@ def test_load_checkpoints_from_huggingface(lora_fixture_name, request):
        lora_model_id=1,
        device="cpu",
        embedding_modules=embedding_modules,
-        embedding_padding_modules=embed_padding_modules)
+        embedding_padding_modules=embed_padding_modules,
+    )

    # Assertions to ensure the model is loaded correctly
    assert lora_model is not None, "LoRAModel is not loaded correctly"
--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -10,16 +10,21 @@ from torch import nn

 from vllm.config import ModelConfig, VllmConfig
 from vllm.config.lora import LoRAConfig
-from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
-                              MergedColumnParallelLinearWithLoRA,
-                              RowParallelLinearWithLoRA)
+from vllm.lora.layers import (
+    ColumnParallelLinearWithLoRA,
+    MergedColumnParallelLinearWithLoRA,
+    RowParallelLinearWithLoRA,
+)
 from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
-from vllm.lora.models import (LoRAMapping, LoRAModel, LoRAModelManager,
-                              LRUCacheLoRAModelManager)
+from vllm.lora.models import (
+    LoRAMapping,
+    LoRAModel,
+    LoRAModelManager,
+    LRUCacheLoRAModelManager,
+)
 from vllm.lora.peft_helper import PEFTHelper
 from vllm.lora.request import LoRARequest
-from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager,
-                                      WorkerLoRAManager)
+from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager, WorkerLoRAManager
 from vllm.platforms import current_platform

 from .utils import create_peft_lora
@@ -31,22 +36,25 @@ EMBEDDING_MODULES = {

 EMBEDDING_PADDING_MODULES = ["lm_head"]

-DEVICES = ([
-    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
-] if current_platform.is_cuda_alike() else ["cpu"])
+DEVICES = (
+    [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
+    if current_platform.is_cuda_alike()
+    else ["cpu"]
+)

 DEFAULT_DTYPE = torch.get_default_dtype()


@pytest.mark.parametrize("device", DEVICES)
 def test_from_lora_tensors(sql_lora_files, device):
-    tensors = load_file(
-        os.path.join(sql_lora_files, "adapter_model.safetensors"))
+    tensors = load_file(os.path.join(sql_lora_files, "adapter_model.safetensors"))
    new_embeddings = load_file(
-        os.path.join(sql_lora_files, "new_embeddings.safetensors"))
+        os.path.join(sql_lora_files, "new_embeddings.safetensors")
+    )

-    peft_helper = PEFTHelper.from_local_dir(sql_lora_files,
-                                            max_position_embeddings=4096)
+    peft_helper = PEFTHelper.from_local_dir(
+        sql_lora_files, max_position_embeddings=4096
+    )
    lora_model = LoRAModel.from_lora_tensors(
        1,
        tensors,
@@ -54,7 +62,8 @@ def test_from_lora_tensors(sql_lora_files, device):
        device=device,
        embeddings=new_embeddings,
        embedding_modules=EMBEDDING_MODULES,
-        embedding_padding_modules=EMBEDDING_PADDING_MODULES)
+        embedding_padding_modules=EMBEDDING_PADDING_MODULES,
+    )
    for module_name, lora in lora_model.loras.items():
        assert lora.module_name == module_name
        assert lora.rank == 8
@@ -63,22 +72,27 @@ def test_from_lora_tensors(sql_lora_files, device):
        assert lora.lora_b is not None
        assert lora.lora_a.device == torch.device(device)
        assert lora.lora_b.device == torch.device(device)
-        assert (lora.lora_a.shape[0] == lora.lora_b.shape[1]
-                ), f"{lora.lora_a.shape=}, {lora.lora_b.shape=}"
+        assert lora.lora_a.shape[0] == lora.lora_b.shape[1], (
+            f"{lora.lora_a.shape=}, {lora.lora_b.shape=}"
+        )
        assert lora.lora_a.shape[0] == 8
        embeddings_module = next(
-            (k for k in EMBEDDING_MODULES if k in module_name), None)
+            (k for k in EMBEDDING_MODULES if k in module_name), None
+        )
        if embeddings_module:
            assert torch.equal(
                lora.embeddings_tensor,
                new_embeddings[EMBEDDING_MODULES[embeddings_module]].to(
-                    device=lora.embeddings_tensor.device))
+                    device=lora.embeddings_tensor.device
+                ),
+            )
        else:
            assert lora.embeddings_tensor is None


-def create_lora(lora_id: int, model: nn.Module, sub_modules: list[str],
-                device: torch.device) -> LoRAModel:
+def create_lora(
+    lora_id: int, model: nn.Module, sub_modules: list[str], device: torch.device
+) -> LoRAModel:
    loras: dict[str, LoRALayerWeights] = {}
    for name in sub_modules:
        w = model.get_submodule(name).weight
@@ -110,8 +124,7 @@ def create_packed_lora(
            8,
            16,
            torch.rand([8, w.shape[1]], device=device),
-            torch.rand([w.shape[0] // len(replaced_module_names), 8],
-                       device=device),
+            torch.rand([w.shape[0] // len(replaced_module_names), 8], device=device),
        )
    return LoRAModel(lora_id, 8, loras)

@@ -119,42 +132,42 @@ def create_packed_lora(
 def test_replace_submodules(dist_init, dummy_model):
    model = dummy_model
    manager = LoRAModelManager(
-        model, 1, 1, 1,
-        LoRAConfig(max_lora_rank=8,
-                   max_cpu_loras=8,
-                   max_loras=8,
-                   lora_dtype=DEFAULT_DTYPE), torch.device(DEVICES[0]))
+        model,
+        1,
+        1,
+        1,
+        LoRAConfig(
+            max_lora_rank=8, max_cpu_loras=8, max_loras=8, lora_dtype=DEFAULT_DTYPE
+        ),
+        torch.device(DEVICES[0]),
+    )
    model = manager.model
-    assert isinstance(model.get_submodule("dense1"),
-                      ColumnParallelLinearWithLoRA)
-    assert isinstance(model.get_submodule("layer1.dense1"),
-                      ColumnParallelLinearWithLoRA)
+    assert isinstance(model.get_submodule("dense1"), ColumnParallelLinearWithLoRA)
+    assert isinstance(
+        model.get_submodule("layer1.dense1"), ColumnParallelLinearWithLoRA
+    )
    assert isinstance(model.get_submodule("dense2"), RowParallelLinearWithLoRA)
-    assert isinstance(model.get_submodule("layer1.dense2"),
-                      RowParallelLinearWithLoRA)
+    assert isinstance(model.get_submodule("layer1.dense2"), RowParallelLinearWithLoRA)


@pytest.mark.parametrize("device", DEVICES)
 def test_lora_model_manager(dist_init, dummy_model, device):
    model = dummy_model
-    model_lora1 = create_lora(1,
-                              model, ["layer1.dense1", "dense2", "lm_head"],
-                              device=device)
-    model_lora2 = create_lora(2,
-                              model, ["dense1", "dense2", "lm_head"],
-                              device=device)
-    model_lora3 = create_lora(3,
-                              model, ["dense1", "dense2", "lm_head"],
-                              device=device)
-    manager = LoRAModelManager(model,
-                               2,
-                               2,
-                               2,
-                               LoRAConfig(max_lora_rank=8,
-                                          max_cpu_loras=3,
-                                          max_loras=2,
-                                          lora_dtype=DEFAULT_DTYPE),
-                               device=device)
+    model_lora1 = create_lora(
+        1, model, ["layer1.dense1", "dense2", "lm_head"], device=device
+    )
+    model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"], device=device)
+    model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"], device=device)
+    manager = LoRAModelManager(
+        model,
+        2,
+        2,
+        2,
+        LoRAConfig(
+            max_lora_rank=8, max_cpu_loras=3, max_loras=2, lora_dtype=DEFAULT_DTYPE
+        ),
+        device=device,
+    )
    assert all(x is None for x in manager.lora_index_to_id)
    assert manager.add_adapter(model_lora1)
    assert manager.activate_adapter(1)
@@ -204,24 +217,21 @@ def test_lora_model_manager(dist_init, dummy_model, device):
@pytest.mark.parametrize("device", DEVICES)
 def test_lora_lru_cache_model_manager(dist_init, dummy_model, device):
    model = dummy_model
-    model_lora1 = create_lora(1,
-                              model, ["layer1.dense1", "dense2", "lm_head"],
-                              device=device)
-    model_lora2 = create_lora(2,
-                              model, ["dense1", "dense2", "lm_head"],
-                              device=device)
-    model_lora3 = create_lora(3,
-                              model, ["dense1", "dense2", "lm_head"],
-                              device=device)
-    manager = LRUCacheLoRAModelManager(model,
-                                       2,
-                                       2,
-                                       2,
-                                       LoRAConfig(max_lora_rank=8,
-                                                  max_cpu_loras=3,
-                                                  max_loras=2,
-                                                  lora_dtype=DEFAULT_DTYPE),
-                                       device=device)
+    model_lora1 = create_lora(
+        1, model, ["layer1.dense1", "dense2", "lm_head"], device=device
+    )
+    model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"], device=device)
+    model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"], device=device)
+    manager = LRUCacheLoRAModelManager(
+        model,
+        2,
+        2,
+        2,
+        LoRAConfig(
+            max_lora_rank=8, max_cpu_loras=3, max_loras=2, lora_dtype=DEFAULT_DTYPE
+        ),
+        device=device,
+    )
    assert all(x is None for x in manager.lora_index_to_id)
    assert manager.add_adapter(model_lora1)
    assert manager.activate_adapter(1)
@@ -297,27 +307,22 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
    # This tests just the LRU cache functionality, everything else is
    # tested in test_lora_model_manager
    model = dummy_model
-    model_lora1 = create_lora(1,
-                              model, ["layer1.dense1", "dense2", "lm_head"],
-                              device=device)
-    model_lora2 = create_lora(2,
-                              model, ["dense1", "dense2", "lm_head"],
-                              device=device)
-    model_lora3 = create_lora(3,
-                              model, ["dense1", "dense2", "lm_head"],
-                              device=device)
-    model_lora4 = create_lora(4,
-                              model, ["dense1", "dense2", "lm_head"],
-                              device=device)
-    manager = LRUCacheLoRAModelManager(model,
-                                       2,
-                                       2,
-                                       2,
-                                       LoRAConfig(max_lora_rank=8,
-                                                  max_cpu_loras=2,
-                                                  max_loras=2,
-                                                  lora_dtype=DEFAULT_DTYPE),
-                                       device=device)
+    model_lora1 = create_lora(
+        1, model, ["layer1.dense1", "dense2", "lm_head"], device=device
+    )
+    model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"], device=device)
+    model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"], device=device)
+    model_lora4 = create_lora(4, model, ["dense1", "dense2", "lm_head"], device=device)
+    manager = LRUCacheLoRAModelManager(
+        model,
+        2,
+        2,
+        2,
+        LoRAConfig(
+            max_lora_rank=8, max_cpu_loras=2, max_loras=2, lora_dtype=DEFAULT_DTYPE
+        ),
+        device=device,
+    )
    assert all(x is None for x in manager.lora_index_to_id)

    # Add up to capacity
@@ -421,12 +426,10 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):


@pytest.mark.parametrize("device", DEVICES)
-def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device,
-                                          tmp_path):
-    lora_config = LoRAConfig(max_lora_rank=8,
-                             max_cpu_loras=4,
-                             max_loras=4,
-                             lora_dtype=DEFAULT_DTYPE)
+def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_path):
+    lora_config = LoRAConfig(
+        max_lora_rank=8, max_cpu_loras=4, max_loras=4, lora_dtype=DEFAULT_DTYPE
+    )

    dummy_lora_files = f"{tmp_path}/lora_adapter"
    os.makedirs(dummy_lora_files, exist_ok=True)
@@ -438,13 +441,13 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device,
    )

    model_config = ModelConfig(max_model_len=16)
-    vllm_config = VllmConfig(model_config=model_config,
-                             lora_config=lora_config)
+    vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config)

    vllm_config.scheduler_config.max_num_seqs = 4
    vllm_config.scheduler_config.max_num_batched_tokens = 2
    worker_adapter_manager = LRUCacheWorkerLoRAManager(
-        vllm_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
+        vllm_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES
+    )

    worker_adapter_manager.max_num_seqs = 4
    worker_adapter_manager.max_num_batched_tokens = 2
@@ -452,52 +455,64 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device,
    worker_adapter_manager.create_lora_manager(dummy_model)

    mapping = LoRAMapping([], [])
-    worker_adapter_manager.set_active_adapters([
-        LoRARequest("1", 1, dummy_lora_files),
-        LoRARequest("2", 2, dummy_lora_files)
-    ], mapping)
+    worker_adapter_manager.set_active_adapters(
+        [LoRARequest("1", 1, dummy_lora_files), LoRARequest("2", 2, dummy_lora_files)],
+        mapping,
+    )
    assert worker_adapter_manager.list_adapters() == {1, 2}
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2

-    worker_adapter_manager.set_active_adapters([
-        LoRARequest("1", 1, dummy_lora_files),
-        LoRARequest("3", 3, dummy_lora_files),
-        LoRARequest("4", 4, dummy_lora_files)
-    ], mapping)
+    worker_adapter_manager.set_active_adapters(
+        [
+            LoRARequest("1", 1, dummy_lora_files),
+            LoRARequest("3", 3, dummy_lora_files),
+            LoRARequest("4", 4, dummy_lora_files),
+        ],
+        mapping,
+    )
    assert worker_adapter_manager.list_adapters() == {1, 2, 3, 4}
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 3
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4

-    worker_adapter_manager.set_active_adapters([
-        LoRARequest("1", 1, dummy_lora_files),
-        LoRARequest("2", 2, dummy_lora_files),
-        LoRARequest("5", 5, dummy_lora_files)
-    ], mapping)
+    worker_adapter_manager.set_active_adapters(
+        [
+            LoRARequest("1", 1, dummy_lora_files),
+            LoRARequest("2", 2, dummy_lora_files),
+            LoRARequest("5", 5, dummy_lora_files),
+        ],
+        mapping,
+    )
    assert worker_adapter_manager.list_adapters() == {1, 2, 4, 5}
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4

-    worker_adapter_manager.set_active_adapters([
-        LoRARequest("1", 1, dummy_lora_files),
-        LoRARequest("1", 1, dummy_lora_files),
-        LoRARequest("1", 1, dummy_lora_files)
-    ], mapping)
+    worker_adapter_manager.set_active_adapters(
+        [
+            LoRARequest("1", 1, dummy_lora_files),
+            LoRARequest("1", 1, dummy_lora_files),
+            LoRARequest("1", 1, dummy_lora_files),
+        ],
+        mapping,
+    )
    assert worker_adapter_manager.list_adapters() == {1, 2, 4, 5}
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4

-    worker_adapter_manager.set_active_adapters([
-        LoRARequest("6", 6, dummy_lora_files),
-        LoRARequest("7", 7, dummy_lora_files),
-        LoRARequest("8", 8, dummy_lora_files)
-    ], mapping)
+    worker_adapter_manager.set_active_adapters(
+        [
+            LoRARequest("6", 6, dummy_lora_files),
+            LoRARequest("7", 7, dummy_lora_files),
+            LoRARequest("8", 8, dummy_lora_files),
+        ],
+        mapping,
+    )
    assert worker_adapter_manager.list_adapters() == {1, 6, 7, 8}
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 7
@@ -506,41 +521,40 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device,

    # Over capacity
    with pytest.raises(RuntimeError):
-        worker_adapter_manager.set_active_adapters([
-            LoRARequest("10", 10, dummy_lora_files),
-            LoRARequest("11", 11, dummy_lora_files),
-            LoRARequest("12", 12, dummy_lora_files),
-            LoRARequest("13", 13, dummy_lora_files),
-            LoRARequest("14", 14, dummy_lora_files)
-        ], mapping)
+        worker_adapter_manager.set_active_adapters(
+            [
+                LoRARequest("10", 10, dummy_lora_files),
+                LoRARequest("11", 11, dummy_lora_files),
+                LoRARequest("12", 12, dummy_lora_files),
+                LoRARequest("13", 13, dummy_lora_files),
+                LoRARequest("14", 14, dummy_lora_files),
+            ],
+            mapping,
+        )

    assert worker_adapter_manager.device == device
-    assert (worker_adapter_manager._adapter_manager.punica_wrapper.device ==
-            device)
+    assert worker_adapter_manager._adapter_manager.punica_wrapper.device == device


@pytest.mark.parametrize("device", DEVICES)
-def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device,
-                                tmp_path):
+def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path):
    # Should remove every LoRA not specified in the request.
-    lora_config = LoRAConfig(max_lora_rank=8,
-                             max_cpu_loras=4,
-                             max_loras=4,
-                             lora_dtype=DEFAULT_DTYPE)
+    lora_config = LoRAConfig(
+        max_lora_rank=8, max_cpu_loras=4, max_loras=4, lora_dtype=DEFAULT_DTYPE
+    )

    model_config = ModelConfig(max_model_len=16)
-    vllm_config = VllmConfig(model_config=model_config,
-                             lora_config=lora_config)
+    vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config)

    vllm_config.scheduler_config.max_num_seqs = 4
    vllm_config.scheduler_config.max_num_batched_tokens = 2

-    worker_adapter_manager = WorkerLoRAManager(vllm_config, device,
-                                               EMBEDDING_MODULES,
-                                               EMBEDDING_PADDING_MODULES)
+    worker_adapter_manager = WorkerLoRAManager(
+        vllm_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES
+    )
    worker_adapter_manager.vocab_size = (
-        dummy_model_gate_up.unpadded_vocab_size -
-        lora_config.lora_extra_vocab_size)
+        dummy_model_gate_up.unpadded_vocab_size - lora_config.lora_extra_vocab_size
+    )
    worker_adapter_manager.create_lora_manager(dummy_model_gate_up)

    dummy_lora_files = f"{tmp_path}/lora_adapter"
@@ -553,49 +567,61 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device,
    )

    mapping = LoRAMapping([], [])
-    worker_adapter_manager.set_active_adapters([
-        LoRARequest("1", 1, dummy_lora_files),
-        LoRARequest("2", 2, dummy_lora_files)
-    ], mapping)
+    worker_adapter_manager.set_active_adapters(
+        [LoRARequest("1", 1, dummy_lora_files), LoRARequest("2", 2, dummy_lora_files)],
+        mapping,
+    )
    assert worker_adapter_manager.list_adapters() == {1, 2}
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2

-    worker_adapter_manager.set_active_adapters([
-        LoRARequest("1", 1, dummy_lora_files),
-        LoRARequest("3", 3, dummy_lora_files),
-        LoRARequest("4", 4, dummy_lora_files)
-    ], mapping)
+    worker_adapter_manager.set_active_adapters(
+        [
+            LoRARequest("1", 1, dummy_lora_files),
+            LoRARequest("3", 3, dummy_lora_files),
+            LoRARequest("4", 4, dummy_lora_files),
+        ],
+        mapping,
+    )
    assert worker_adapter_manager.list_adapters() == {1, 3, 4}
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 3
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 4

-    worker_adapter_manager.set_active_adapters([
-        LoRARequest("1", 1, dummy_lora_files),
-        LoRARequest("2", 2, dummy_lora_files),
-        LoRARequest("5", 5, dummy_lora_files)
-    ], mapping)
+    worker_adapter_manager.set_active_adapters(
+        [
+            LoRARequest("1", 1, dummy_lora_files),
+            LoRARequest("2", 2, dummy_lora_files),
+            LoRARequest("5", 5, dummy_lora_files),
+        ],
+        mapping,
+    )
    assert worker_adapter_manager.list_adapters() == {1, 2, 5}
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5

-    worker_adapter_manager.set_active_adapters([
-        LoRARequest("1", 1, dummy_lora_files),
-        LoRARequest("1", 1, dummy_lora_files),
-        LoRARequest("1", 1, dummy_lora_files)
-    ], mapping)
+    worker_adapter_manager.set_active_adapters(
+        [
+            LoRARequest("1", 1, dummy_lora_files),
+            LoRARequest("1", 1, dummy_lora_files),
+            LoRARequest("1", 1, dummy_lora_files),
+        ],
+        mapping,
+    )
    assert worker_adapter_manager.list_adapters() == {1}
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] is None
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] is None

-    worker_adapter_manager.set_active_adapters([
-        LoRARequest("6", 6, dummy_lora_files),
-        LoRARequest("7", 7, dummy_lora_files),
-        LoRARequest("8", 8, dummy_lora_files)
-    ], mapping)
+    worker_adapter_manager.set_active_adapters(
+        [
+            LoRARequest("6", 6, dummy_lora_files),
+            LoRARequest("7", 7, dummy_lora_files),
+            LoRARequest("8", 8, dummy_lora_files),
+        ],
+        mapping,
+    )
    assert worker_adapter_manager.list_adapters() == {6, 7, 8}
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 8
    assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 6
@@ -603,17 +629,19 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device,

    # Over capacity
    with pytest.raises(RuntimeError):
-        worker_adapter_manager.set_active_adapters([
-            LoRARequest("10", 10, dummy_lora_files),
-            LoRARequest("11", 11, dummy_lora_files),
-            LoRARequest("12", 12, dummy_lora_files),
-            LoRARequest("13", 13, dummy_lora_files),
-            LoRARequest("14", 14, dummy_lora_files)
-        ], mapping)
+        worker_adapter_manager.set_active_adapters(
+            [
+                LoRARequest("10", 10, dummy_lora_files),
+                LoRARequest("11", 11, dummy_lora_files),
+                LoRARequest("12", 12, dummy_lora_files),
+                LoRARequest("13", 13, dummy_lora_files),
+                LoRARequest("14", 14, dummy_lora_files),
+            ],
+            mapping,
+        )

    assert worker_adapter_manager.device == device
-    assert (worker_adapter_manager._adapter_manager.punica_wrapper.device ==
-            device)
+    assert worker_adapter_manager._adapter_manager.punica_wrapper.device == device


@pytest.mark.parametrize("device", DEVICES)
@@ -624,7 +652,8 @@ def test_packed_loras(dist_init, dummy_model_gate_up, device):
        model,
        module_name="gate_up_proj",
        replaced_module_names=["gate_proj", "up_proj"],
-        device=device)
+        device=device,
+    )
    model_lora1 = create_packed_lora(
        2,
        model,
@@ -634,19 +663,21 @@ def test_packed_loras(dist_init, dummy_model_gate_up, device):
        empty_replaced_module_name="gate_proj",
    )

-    manager = LoRAModelManager(model,
-                               2,
-                               2,
-                               2,
-                               LoRAConfig(max_lora_rank=8,
-                                          max_cpu_loras=2,
-                                          max_loras=2,
-                                          lora_dtype=DEFAULT_DTYPE),
-                               device=device)
+    manager = LoRAModelManager(
+        model,
+        2,
+        2,
+        2,
+        LoRAConfig(
+            max_lora_rank=8, max_cpu_loras=2, max_loras=2, lora_dtype=DEFAULT_DTYPE
+        ),
+        device=device,
+    )
    model = manager.model

-    assert isinstance(model.get_submodule("gate_up_proj"),
-                      MergedColumnParallelLinearWithLoRA)
+    assert isinstance(
+        model.get_submodule("gate_up_proj"), MergedColumnParallelLinearWithLoRA
+    )
    # Verify packed lora is correct
    model_lora_clone = model_lora.clone(1)
    model_lora_clone1 = model_lora1.clone(1)
@@ -659,21 +690,27 @@ def test_packed_loras(dist_init, dummy_model_gate_up, device):
    packed_lora = model_lora.get_lora("gate_up_proj")
    assert packed_lora and isinstance(packed_lora, PackedLoRALayerWeights)

-    torch.testing.assert_close(packed_lora.lora_a[0],
-                               model_lora_clone.get_lora("gate_proj").lora_a)
-    torch.testing.assert_close(packed_lora.lora_b[0],
-                               model_lora_clone.get_lora("gate_proj").lora_b)
-    torch.testing.assert_close(packed_lora.lora_a[1],
-                               model_lora_clone.get_lora("up_proj").lora_a)
-    torch.testing.assert_close(packed_lora.lora_b[1],
-                               model_lora_clone.get_lora("up_proj").lora_b)
+    torch.testing.assert_close(
+        packed_lora.lora_a[0], model_lora_clone.get_lora("gate_proj").lora_a
+    )
+    torch.testing.assert_close(
+        packed_lora.lora_b[0], model_lora_clone.get_lora("gate_proj").lora_b
+    )
+    torch.testing.assert_close(
+        packed_lora.lora_a[1], model_lora_clone.get_lora("up_proj").lora_a
+    )
+    torch.testing.assert_close(
+        packed_lora.lora_b[1], model_lora_clone.get_lora("up_proj").lora_b
+    )

    packed_lora1 = model_lora1.get_lora("gate_up_proj")
    assert packed_lora1 and isinstance(packed_lora1, PackedLoRALayerWeights)

    assert packed_lora1.lora_a[0] is None
    assert packed_lora1.lora_b[0] is None
-    torch.testing.assert_close(packed_lora1.lora_a[1],
-                               model_lora_clone1.get_lora("up_proj").lora_a)
-    torch.testing.assert_close(packed_lora1.lora_b[1],
-                               model_lora_clone1.get_lora("up_proj").lora_b)
+    torch.testing.assert_close(
+        packed_lora1.lora_a[1], model_lora_clone1.get_lora("up_proj").lora_a
+    )
+    torch.testing.assert_close(
+        packed_lora1.lora_b[1], model_lora_clone1.get_lora("up_proj").lora_b
+    )
--- a/tests/lora/test_minicpmv_tp.py
+++ b/tests/lora/test_minicpmv_tp.py
@@ -15,7 +15,8 @@ MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"
 PROMPT_TEMPLATE = (
    "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
    "(<image>./</image>)\nWhat is in the image?<|eot_id|>"
-    "<|start_header_id|>assistant<|end_header_id|>\n\n")
+    "<|start_header_id|>assistant<|end_header_id|>\n\n"
+)

 IMAGE_ASSETS = [
    ImageAsset("stop_sign"),
@@ -34,18 +35,18 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
        stop_token_ids=[128001, 128009],  # eos_id, eot_id
    )

-    inputs = [{
-        "prompt": PROMPT_TEMPLATE,
-        "multi_modal_data": {
-            "image": asset.pil_image
-        },
-    } for asset in IMAGE_ASSETS]
+    inputs = [
+        {
+            "prompt": PROMPT_TEMPLATE,
+            "multi_modal_data": {"image": asset.pil_image},
+        }
+        for asset in IMAGE_ASSETS
+    ]

    outputs = llm.generate(
        inputs,
        sampling_params,
-        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
-        if lora_id else None,
+        lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
    )
    # Print the outputs.
    generated_texts: list[str] = []
@@ -58,7 +59,8 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:

@pytest.mark.xfail(
    current_platform.is_rocm(),
-    reason="MiniCPM-V dependency xformers incompatible with ROCm")
+    reason="MiniCPM-V dependency xformers incompatible with ROCm",
+)
 def test_minicpmv_lora(minicpmv_lora_files):
    llm = vllm.LLM(
        MODEL_PATH,
@@ -68,10 +70,7 @@ def test_minicpmv_lora(minicpmv_lora_files):
        max_lora_rank=8,
        enforce_eager=True,
        max_model_len=2048,
-        limit_mm_per_prompt={
-            "image": 2,
-            "video": 0
-        },
+        limit_mm_per_prompt={"image": 2, "video": 0},
        trust_remote_code=True,
    )
    output1 = do_sample(llm, minicpmv_lora_files, lora_id=1)
@@ -82,11 +81,13 @@ def test_minicpmv_lora(minicpmv_lora_files):
        assert EXPECTED_OUTPUT[i].startswith(output2[i])


-@pytest.mark.skipif(current_platform.is_cuda_alike(),
-                    reason="Skipping to avoid redundant model tests")
+@pytest.mark.skipif(
+    current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
+)
@pytest.mark.xfail(
    current_platform.is_rocm(),
-    reason="MiniCPM-V dependency xformers incompatible with ROCm")
+    reason="MiniCPM-V dependency xformers incompatible with ROCm",
+)
@create_new_process_for_each_test()
 def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
    llm = vllm.LLM(
@@ -96,10 +97,7 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
        max_loras=4,
        max_lora_rank=64,
        tensor_parallel_size=4,
-        limit_mm_per_prompt={
-            "image": 2,
-            "video": 0
-        },
+        limit_mm_per_prompt={"image": 2, "video": 0},
        trust_remote_code=True,
    )
    output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1)
@@ -107,11 +105,13 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
        assert EXPECTED_OUTPUT[i].startswith(output_tp[i])


-@pytest.mark.skipif(current_platform.is_cuda_alike(),
-                    reason="Skipping to avoid redundant model tests")
+@pytest.mark.skipif(
+    current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
+)
@pytest.mark.xfail(
    current_platform.is_rocm(),
-    reason="MiniCPM-V dependency xformers incompatible with ROCm")
+    reason="MiniCPM-V dependency xformers incompatible with ROCm",
+)
@create_new_process_for_each_test()
 def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files):
    llm = vllm.LLM(
@@ -122,10 +122,7 @@ def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files):
        max_lora_rank=8,
        tensor_parallel_size=4,
        trust_remote_code=True,
-        limit_mm_per_prompt={
-            "image": 1,
-            "video": 0
-        },
+        limit_mm_per_prompt={"image": 1, "video": 0},
        fully_sharded_loras=True,
    )
    output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1)
--- a/tests/lora/test_mixtral.py
+++ b/tests/lora/test_mixtral.py
@@ -11,15 +11,15 @@ from vllm.platforms import current_platform
 MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1"


-def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int,
-              prompts: list[str]) -> list[str]:
-
+def do_sample(
+    llm: vllm.LLM, lora_path: str, lora_id: int, prompts: list[str]
+) -> list[str]:
    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256)
    outputs = llm.generate(
        prompts,
        sampling_params,
-        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
-        if lora_id else None)
+        lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
+    )
    # Print the outputs.
    generated_texts: list[str] = []
    for output in outputs:
@@ -33,8 +33,11 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int,
@pytest.mark.parametrize("tp_size", [4])
 def test_mixtral_lora(mixtral_lora_files, tp_size):
    """Original test, the LoRA model has the common target modules, not all"""
-    if torch.cuda.device_count(
-    ) < tp_size and tp_size > 1 and current_platform.is_cuda_alike():
+    if (
+        torch.cuda.device_count() < tp_size
+        and tp_size > 1
+        and current_platform.is_cuda_alike()
+    ):
        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")

    prompts = [
@@ -57,7 +60,11 @@ def test_mixtral_lora(mixtral_lora_files, tp_size):
        "give_opinion(name[SpellForce 3], developer[Grimlore Games], release_year[2017], rating[poor])",  # noqa: E501
        "inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])",  # noqa: E501
    ]
-    assert do_sample(llm, mixtral_lora_files, lora_id=1,
-                     prompts=prompts) == expected_lora_output
-    assert do_sample(llm, mixtral_lora_files, lora_id=2,
-                     prompts=prompts) == expected_lora_output
+    assert (
+        do_sample(llm, mixtral_lora_files, lora_id=1, prompts=prompts)
+        == expected_lora_output
+    )
+    assert (
+        do_sample(llm, mixtral_lora_files, lora_id=2, prompts=prompts)
+        == expected_lora_output
+    )
--- a/tests/lora/test_peft_helper.py
+++ b/tests/lora/test_peft_helper.py
@@ -13,34 +13,27 @@ from vllm.lora.peft_helper import PEFTHelper
 ERROR_CASES = [
    (
        "test_rank",
-        {
-            "r": 1024
-        },
+        {"r": 1024},
        "is greater than max_lora_rank",
    ),
    (
        "test_bias",
-        {
-            "bias": "all"
-        },
+        {"bias": "all"},
        "Adapter bias cannot be used without bias_enabled",
    ),
-    ("test_dora", {
-        "use_dora": True
-    }, "does not yet support DoRA"),
+    ("test_dora", {"use_dora": True}, "does not yet support DoRA"),
    (
        "test_modules_to_save",
-        {
-            "modules_to_save": ["lm_head"]
-        },
+        {"modules_to_save": ["lm_head"]},
        "only supports modules_to_save being None",
    ),
 ]


 def test_peft_helper_pass(sql_lora_files, tmp_path):
-    peft_helper = PEFTHelper.from_local_dir(sql_lora_files,
-                                            max_position_embeddings=4096)
+    peft_helper = PEFTHelper.from_local_dir(
+        sql_lora_files, max_position_embeddings=4096
+    )
    lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2)
    peft_helper.validate_legal(lora_config)
    assert peft_helper.r == 8
@@ -74,8 +67,7 @@ def test_peft_helper_pass(sql_lora_files, tmp_path):
    with open(config_path, "w") as f:
        json.dump(adapter_config, f)

-    peft_helper = PEFTHelper.from_local_dir(test_dir,
-                                            max_position_embeddings=4096)
+    peft_helper = PEFTHelper.from_local_dir(test_dir, max_position_embeddings=4096)
    peft_helper.validate_legal(lora_config)
    scaling = peft_helper.lora_alpha / math.sqrt(peft_helper.r)
    assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3
@@ -106,4 +98,5 @@ def test_peft_helper_error(
    # Test loading the adapter
    with pytest.raises(ValueError, match=expected_error):
        PEFTHelper.from_local_dir(
-            test_dir, max_position_embeddings=4096).validate_legal(lora_config)
+            test_dir, max_position_embeddings=4096
+        ).validate_legal(lora_config)
--- a/tests/lora/test_punica_ops.py
+++ b/tests/lora/test_punica_ops.py
@@ -21,11 +21,18 @@ def reset_device(reset_default_device):

 # Utility shrink and expand operations used as reference implementations.
 def sgmv_shrink_for_nslices(
-        nslices: int, inputs_tensor: torch.Tensor,
-        lora_weights_lst: list[torch.Tensor], out_tensor: torch.Tensor,
-        b_seq_start_loc: torch.Tensor, seq_len_tensor: torch.Tensor,
-        prompt_lora_mapping: torch.Tensor, batches: int, max_seq_length: int,
-        num_tokens: int, scaling: float):
+    nslices: int,
+    inputs_tensor: torch.Tensor,
+    lora_weights_lst: list[torch.Tensor],
+    out_tensor: torch.Tensor,
+    b_seq_start_loc: torch.Tensor,
+    seq_len_tensor: torch.Tensor,
+    prompt_lora_mapping: torch.Tensor,
+    batches: int,
+    max_seq_length: int,
+    num_tokens: int,
+    scaling: float,
+):
    """
    Wrapper around torch_ops.sgmv_shrink that handles any nslices.
    """
@@ -44,15 +51,20 @@ def sgmv_shrink_for_nslices(
        )


-def sgmv_expand_for_nslices(nslices: int, hidden_size: int,
-                            inputs_tensor: torch.Tensor,
-                            lora_weights_lst: list[torch.Tensor],
-                            out_tensor: torch.Tensor,
-                            b_seq_start_loc: torch.Tensor,
-                            seq_len_tensor: torch.Tensor,
-                            prompt_lora_mapping: torch.Tensor, batches: int,
-                            max_seq_length: int, num_tokens: int,
-                            add_inputs: bool) -> None:
+def sgmv_expand_for_nslices(
+    nslices: int,
+    hidden_size: int,
+    inputs_tensor: torch.Tensor,
+    lora_weights_lst: list[torch.Tensor],
+    out_tensor: torch.Tensor,
+    b_seq_start_loc: torch.Tensor,
+    seq_len_tensor: torch.Tensor,
+    prompt_lora_mapping: torch.Tensor,
+    batches: int,
+    max_seq_length: int,
+    num_tokens: int,
+    add_inputs: bool,
+) -> None:
    """
    Wrapper around torch_ops.sgmv_expand that handles any nslices.
    """
@@ -94,10 +106,17 @@ def sgmv_expand_for_nslices(nslices: int, hidden_size: int,
 _dict_lock = Lock()


-def check_lora_shrink_kernel(batches: int, num_loras: int, rank: int,
-                             hidden_size: int, nslices: int,
-                             dtype: torch.dtype, device: str, seq_length: int,
-                             scaling: float):
+def check_lora_shrink_kernel(
+    batches: int,
+    num_loras: int,
+    rank: int,
+    hidden_size: int,
+    nslices: int,
+    dtype: torch.dtype,
+    device: str,
+    seq_length: int,
+    scaling: float,
+):
    """
    Compare outputs of torch_ops.sgmv_shrink and triton_ops.lora_shrink
    kernels.
@@ -116,14 +135,19 @@ def check_lora_shrink_kernel(batches: int, num_loras: int, rank: int,
    max_seq_length, token_nums = data.meta()

    # Setup metadata information for SGMV and reference kernels
-    sgmv_meta_args = (data.b_seq_start_loc, data.seq_len_tensor,
-                      data.prompt_lora_mapping, batches, max_seq_length,
-                      token_nums)
+    sgmv_meta_args = (
+        data.b_seq_start_loc,
+        data.seq_len_tensor,
+        data.prompt_lora_mapping,
+        batches,
+        max_seq_length,
+        token_nums,
+    )

    # Setup metadata information for the LoRA kernel.
-    lora_meta = LoRAKernelMeta.make(max_loras=num_loras,
-                                    max_num_tokens=token_nums,
-                                    device='cuda')
+    lora_meta = LoRAKernelMeta.make(
+        max_loras=num_loras, max_num_tokens=token_nums, device="cuda"
+    )
    lora_meta.prepare_tensors(data.token_lora_mapping)

    ref_out_tensor = data.ref_out_tensor
@@ -154,10 +178,17 @@ def check_lora_shrink_kernel(batches: int, num_loras: int, rank: int,
    assert_close(out_tensor, ref_out_tensor)


-def check_lora_expand_kernel(batches: int, num_loras: int, rank: int,
-                             hidden_size: int, nslices: int,
-                             dtype: torch.dtype, device: str, seq_length: int,
-                             add_inputs: bool):
+def check_lora_expand_kernel(
+    batches: int,
+    num_loras: int,
+    rank: int,
+    hidden_size: int,
+    nslices: int,
+    dtype: torch.dtype,
+    device: str,
+    seq_length: int,
+    add_inputs: bool,
+):
    """
    Compare outputs of torch_ops.sgmv_expand and triton_ops.lora_expand
    kernels.
@@ -177,14 +208,19 @@ def check_lora_expand_kernel(batches: int, num_loras: int, rank: int,
    max_seq_length, token_nums = data.meta()

    # Setup metadata information for SGMV and reference kernels
-    sgmv_meta_args = (data.b_seq_start_loc, data.seq_len_tensor,
-                      data.prompt_lora_mapping, batches, max_seq_length,
-                      token_nums)
+    sgmv_meta_args = (
+        data.b_seq_start_loc,
+        data.seq_len_tensor,
+        data.prompt_lora_mapping,
+        batches,
+        max_seq_length,
+        token_nums,
+    )

    # Setup metadata information for the LoRA kernel.
-    lora_meta = LoRAKernelMeta.make(max_loras=num_loras,
-                                    max_num_tokens=token_nums,
-                                    device='cuda')
+    lora_meta = LoRAKernelMeta.make(
+        max_loras=num_loras, max_num_tokens=token_nums, device="cuda"
+    )
    lora_meta.prepare_tensors(data.token_lora_mapping)

    # Setup output tensors
@@ -194,21 +230,25 @@ def check_lora_expand_kernel(batches: int, num_loras: int, rank: int,
    with _dict_lock:
        # lora_expand kernel
        _LORA_B_PTR_DICT.clear()
-        triton_ops.lora_expand(data.inputs_tensor,
-                               data.lora_weights,
-                               out_tensor,
-                               *lora_meta.meta_args(token_nums=token_nums),
-                               offset_start=0,
-                               add_inputs=add_inputs)
+        triton_ops.lora_expand(
+            data.inputs_tensor,
+            data.lora_weights,
+            out_tensor,
+            *lora_meta.meta_args(token_nums=token_nums),
+            offset_start=0,
+            add_inputs=add_inputs,
+        )

    # Reference
-    sgmv_expand_for_nslices(nslices,
-                            hidden_size,
-                            data.inputs_tensor,
-                            data.lora_weights,
-                            ref_out_tensor,
-                            *sgmv_meta_args,
-                            add_inputs=add_inputs)
+    sgmv_expand_for_nslices(
+        nslices,
+        hidden_size,
+        data.inputs_tensor,
+        data.lora_weights,
+        ref_out_tensor,
+        *sgmv_meta_args,
+        add_inputs=add_inputs,
+    )

    assert_close(out_tensor, ref_out_tensor)

@@ -299,7 +339,7 @@ HIDDEN_SIZES = [
    128000,
    128256,
 ]
-#The size of TP
+# The size of TP
 divisibility = [1, 2, 8, 16, 64]

 all_hidden_size = []
@@ -331,10 +371,10 @@ DEVICES = [f"cuda:{0}"]
 SEED = [0]


-@pytest.mark.parametrize("batches", test_params['batches'])
-@pytest.mark.parametrize("num_loras", test_params['num_loras'])
-@pytest.mark.parametrize("rank", test_params['max_ranks'])
-@pytest.mark.parametrize("hidden_size", test_params['hidden_sizes'])
+@pytest.mark.parametrize("batches", test_params["batches"])
+@pytest.mark.parametrize("num_loras", test_params["num_loras"])
+@pytest.mark.parametrize("rank", test_params["max_ranks"])
+@pytest.mark.parametrize("hidden_size", test_params["hidden_sizes"])
@pytest.mark.parametrize("nslices", [1, 2, 3])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", DEVICES)
@@ -358,31 +398,35 @@ def test_kernels(
    current_platform.seed_everything(seed)

    if op_type == "shrink":
-        check_lora_shrink_kernel(batches=batches,
-                                 num_loras=num_loras,
-                                 rank=rank,
-                                 hidden_size=hidden_size,
-                                 nslices=nslices,
-                                 dtype=dtype,
-                                 device=device,
-                                 seq_length=128,
-                                 scaling=0.5)
+        check_lora_shrink_kernel(
+            batches=batches,
+            num_loras=num_loras,
+            rank=rank,
+            hidden_size=hidden_size,
+            nslices=nslices,
+            dtype=dtype,
+            device=device,
+            seq_length=128,
+            scaling=0.5,
+        )
    else:
-        check_lora_expand_kernel(batches=batches,
-                                 num_loras=num_loras,
-                                 rank=rank,
-                                 hidden_size=hidden_size,
-                                 nslices=nslices,
-                                 dtype=dtype,
-                                 device=device,
-                                 seq_length=128,
-                                 add_inputs=True)
+        check_lora_expand_kernel(
+            batches=batches,
+            num_loras=num_loras,
+            rank=rank,
+            hidden_size=hidden_size,
+            nslices=nslices,
+            dtype=dtype,
+            device=device,
+            seq_length=128,
+            add_inputs=True,
+        )


-@pytest.mark.parametrize("batches", hs_test_params['batches'])
-@pytest.mark.parametrize("num_loras", hs_test_params['num_loras'])
-@pytest.mark.parametrize("rank", hs_test_params['max_ranks'])
-@pytest.mark.parametrize("hidden_size", hs_test_params['hidden_sizes'])
+@pytest.mark.parametrize("batches", hs_test_params["batches"])
+@pytest.mark.parametrize("num_loras", hs_test_params["num_loras"])
+@pytest.mark.parametrize("rank", hs_test_params["max_ranks"])
+@pytest.mark.parametrize("hidden_size", hs_test_params["hidden_sizes"])
@pytest.mark.parametrize("nslices", [1, 2, 3])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", DEVICES)
@@ -406,22 +450,26 @@ def test_kernels_hidden_size(
    current_platform.seed_everything(seed)

    if op_type == "shrink":
-        check_lora_shrink_kernel(batches=batches,
-                                 num_loras=num_loras,
-                                 rank=rank,
-                                 hidden_size=hidden_size,
-                                 nslices=nslices,
-                                 dtype=dtype,
-                                 device=device,
-                                 seq_length=128,
-                                 scaling=0.5)
+        check_lora_shrink_kernel(
+            batches=batches,
+            num_loras=num_loras,
+            rank=rank,
+            hidden_size=hidden_size,
+            nslices=nslices,
+            dtype=dtype,
+            device=device,
+            seq_length=128,
+            scaling=0.5,
+        )
    else:
-        check_lora_expand_kernel(batches=batches,
-                                 num_loras=num_loras,
-                                 rank=rank,
-                                 hidden_size=hidden_size,
-                                 nslices=nslices,
-                                 dtype=dtype,
-                                 device=device,
-                                 seq_length=128,
-                                 add_inputs=True)
+        check_lora_expand_kernel(
+            batches=batches,
+            num_loras=num_loras,
+            rank=rank,
+            hidden_size=hidden_size,
+            nslices=nslices,
+            dtype=dtype,
+            device=device,
+            seq_length=128,
+            add_inputs=True,
+        )
--- a/tests/lora/test_quant_model.py
+++ b/tests/lora/test_quant_model.py
@@ -20,28 +20,27 @@ class ModelWithQuantization:


 MODELS: list[ModelWithQuantization]
-#AWQ quantization is currently not supported in ROCm.
+# AWQ quantization is currently not supported in ROCm.
 if current_platform.is_rocm():
    MODELS = [
        ModelWithQuantization(
-            model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
-            quantization="gptq"),
+            model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", quantization="gptq"
+        ),
    ]
 else:
    MODELS = [
        ModelWithQuantization(
-            model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
-            quantization="awq"),
+            model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", quantization="awq"
+        ),
        ModelWithQuantization(
-            model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
-            quantization="gptq"),
+            model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", quantization="gptq"
+        ),
    ]


-def do_sample(llm: vllm.LLM,
-              lora_path: str,
-              lora_id: int,
-              max_tokens: int = 256) -> list[str]:
+def do_sample(
+    llm: vllm.LLM, lora_path: str, lora_id: int, max_tokens: int = 256
+) -> list[str]:
    raw_prompts = [
        "Give me an orange-ish brown color",
        "Give me a neon pink color",
@@ -52,14 +51,14 @@ def do_sample(llm: vllm.LLM,

    prompts = [format_prompt_tuples(p) for p in raw_prompts]

-    sampling_params = vllm.SamplingParams(temperature=0,
-                                          max_tokens=max_tokens,
-                                          stop=["<|im_end|>"])
+    sampling_params = vllm.SamplingParams(
+        temperature=0, max_tokens=max_tokens, stop=["<|im_end|>"]
+    )
    outputs = llm.generate(
        prompts,
        sampling_params,
-        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
-        if lora_id else None)
+        lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
+    )
    # Print the outputs.
    generated_texts: list[str] = []
    for output in outputs:
@@ -72,18 +71,18 @@ def do_sample(llm: vllm.LLM,

@pytest.mark.parametrize("model", MODELS)
 def test_quant_model_lora(tinyllama_lora_files, model):
-
    llm = vllm.LLM(
        model=model.model_path,
        enable_lora=True,
        max_num_seqs=16,
        max_loras=4,
        max_model_len=400,
-        gpu_memory_utilization=0.2,  #avoid OOM
+        gpu_memory_utilization=0.2,  # avoid OOM
        quantization=model.quantization,
        trust_remote_code=True,
        enable_chunked_prefill=True,
-        tokenizer=tinyllama_lora_files)
+        tokenizer=tinyllama_lora_files,
+    )

    if model.quantization is None:
        expected_lora_output = [
@@ -104,11 +103,11 @@ def test_quant_model_lora(tinyllama_lora_files, model):
    def expect_match(output, expected_output):
        # HACK: GPTQ lora outputs are just incredibly unstable.
        # Assert that the outputs changed.
-        if (model.quantization == "gptq"
-                and expected_output is expected_lora_output):
+        if model.quantization == "gptq" and expected_output is expected_lora_output:
            for i, o in enumerate(output):
-                assert o.startswith(
-                    '#'), f"Expected example {i} to start with # but got {o}"
+                assert o.startswith("#"), (
+                    f"Expected example {i} to start with # but got {o}"
+                )
            return
        assert output == expected_output

@@ -116,17 +115,11 @@ def test_quant_model_lora(tinyllama_lora_files, model):

    print("lora adapter created")
    print("lora 1")
-    output = do_sample(llm,
-                       tinyllama_lora_files,
-                       lora_id=1,
-                       max_tokens=max_tokens)
+    output = do_sample(llm, tinyllama_lora_files, lora_id=1, max_tokens=max_tokens)
    expect_match(output, expected_lora_output)

    print("lora 2")
-    output = do_sample(llm,
-                       tinyllama_lora_files,
-                       lora_id=2,
-                       max_tokens=max_tokens)
+    output = do_sample(llm, tinyllama_lora_files, lora_id=2, max_tokens=max_tokens)
    expect_match(output, expected_lora_output)

    print("removing lora")
@@ -136,8 +129,7 @@ def test_quant_model_lora(tinyllama_lora_files, model):


@pytest.mark.parametrize("model", MODELS)
-def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
-                                 model):
+def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available, model):
    if num_gpus_available < 2:
        pytest.skip(f"Not enough GPUs for tensor parallelism {2}")
    if model.quantization == "gptq":
@@ -147,10 +139,11 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
        enable_lora=True,
        max_num_seqs=16,
        max_loras=4,
-        gpu_memory_utilization=0.2,  #avoid OOM
+        gpu_memory_utilization=0.2,  # avoid OOM
        quantization=model.quantization,
        trust_remote_code=True,
-        enable_chunked_prefill=True)
+        enable_chunked_prefill=True,
+    )
    output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1)

    del llm_tp1
@@ -162,9 +155,10 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
        max_num_seqs=16,
        max_loras=4,
        tensor_parallel_size=2,
-        gpu_memory_utilization=0.2,  #avoid OOM
+        gpu_memory_utilization=0.2,  # avoid OOM
        quantization=model.quantization,
-        enable_chunked_prefill=True)
+        enable_chunked_prefill=True,
+    )
    output_tp2 = do_sample(llm_tp2, tinyllama_lora_files, lora_id=1)

    del llm_tp2
--- a/tests/lora/test_qwen2vl.py
+++ b/tests/lora/test_qwen2vl.py
@@ -37,7 +37,8 @@ class Qwen2VLTester:
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
        "\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
        "What is in the image?<|im_end|>\n"
-        "<|im_start|>assistant\n")
+        "<|im_start|>assistant\n"
+    )

    def __init__(self, config: TestConfig):
        self.config = config
@@ -56,68 +57,68 @@ class Qwen2VLTester:
            max_model_len=self.config.max_model_len,
        )

-    def run_test(self,
-                 images: list[ImageAsset],
-                 expected_outputs: list[str],
-                 lora_id: Optional[int] = None,
-                 temperature: float = 0,
-                 max_tokens: int = 5):
-
+    def run_test(
+        self,
+        images: list[ImageAsset],
+        expected_outputs: list[str],
+        lora_id: Optional[int] = None,
+        temperature: float = 0,
+        max_tokens: int = 5,
+    ):
        sampling_params = vllm.SamplingParams(
            temperature=temperature,
            max_tokens=max_tokens,
        )
-        inputs = [{
-            "prompt": self.PROMPT_TEMPLATE,
-            "multi_modal_data": {
-                "image": asset.pil_image
-            },
-        } for asset in images]
-
-        lora_request = LoRARequest(str(lora_id), lora_id,
-                                   self.config.lora_path)
-        outputs = self.llm.generate(inputs,
-                                    sampling_params,
-                                    lora_request=lora_request)
-        generated_texts = [
-            output.outputs[0].text.strip() for output in outputs
+        inputs = [
+            {
+                "prompt": self.PROMPT_TEMPLATE,
+                "multi_modal_data": {"image": asset.pil_image},
+            }
+            for asset in images
        ]

+        lora_request = LoRARequest(str(lora_id), lora_id, self.config.lora_path)
+        outputs = self.llm.generate(inputs, sampling_params, lora_request=lora_request)
+        generated_texts = [output.outputs[0].text.strip() for output in outputs]
+
        # Validate outputs
        for generated, expected in zip(generated_texts, expected_outputs):
-            assert expected.startswith(
-                generated), f"Generated text {generated} doesn't "
+            assert expected.startswith(generated), (
+                f"Generated text {generated} doesn't "
+            )
            f"match expected pattern {expected}"

-    def run_beam_search_test(self,
-                             images: list[ImageAsset],
-                             expected_outputs: list[list[str]],
-                             lora_id: Optional[int] = None,
-                             temperature: float = 0,
-                             beam_width: int = 2,
-                             max_tokens: int = 5):
+    def run_beam_search_test(
+        self,
+        images: list[ImageAsset],
+        expected_outputs: list[list[str]],
+        lora_id: Optional[int] = None,
+        temperature: float = 0,
+        beam_width: int = 2,
+        max_tokens: int = 5,
+    ):
+        beam_search_params = BeamSearchParams(
+            beam_width=beam_width, max_tokens=max_tokens, temperature=temperature
+        )

-        beam_search_params = BeamSearchParams(beam_width=beam_width,
-                                              max_tokens=max_tokens,
-                                              temperature=temperature)
+        inputs = [
+            {
+                "prompt": self.PROMPT_TEMPLATE,
+                "multi_modal_data": {"image": asset.pil_image},
+            }
+            for asset in images
+        ]

-        inputs = [{
-            "prompt": self.PROMPT_TEMPLATE,
-            "multi_modal_data": {
-                "image": asset.pil_image
-            },
-        } for asset in images]
-
-        lora_request = LoRARequest(str(lora_id), lora_id,
-                                   self.config.lora_path)
-        outputs = self.llm.beam_search(inputs,
-                                       beam_search_params,
-                                       lora_request=lora_request)
+        lora_request = LoRARequest(str(lora_id), lora_id, self.config.lora_path)
+        outputs = self.llm.beam_search(
+            inputs, beam_search_params, lora_request=lora_request
+        )

        for output_obj, expected_outs in zip(outputs, expected_outputs):
            output_texts = [seq.text for seq in output_obj.sequences]
-            assert output_texts == expected_outs, \
-                f"Generated texts {output_texts} do not match expected {expected_outs}"  # noqa: E501
+            assert output_texts == expected_outs, (
+                f"Generated texts {output_texts} do not match expected {expected_outs}"
+            )  # noqa: E501


 TEST_IMAGES = [
@@ -144,27 +145,25 @@ QWEN25VL_MODEL_PATH = "Qwen/Qwen2.5-VL-3B-Instruct"

@pytest.mark.xfail(
    current_platform.is_rocm(),
-    reason="Qwen2-VL dependency xformers incompatible with ROCm")
+    reason="Qwen2-VL dependency xformers incompatible with ROCm",
+)
 def test_qwen2vl_lora(qwen2vl_lora_files):
    """Test Qwen 2.0 VL model with LoRA"""
-    config = TestConfig(model_path=QWEN2VL_MODEL_PATH,
-                        lora_path=qwen2vl_lora_files)
+    config = TestConfig(model_path=QWEN2VL_MODEL_PATH, lora_path=qwen2vl_lora_files)
    tester = Qwen2VLTester(config)

    # Test with different LoRA IDs
    for lora_id in [1, 2]:
-        tester.run_test(TEST_IMAGES,
-                        expected_outputs=EXPECTED_OUTPUTS,
-                        lora_id=lora_id)
+        tester.run_test(TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=lora_id)


@pytest.mark.xfail(
    current_platform.is_rocm(),
-    reason="Qwen2-VL dependency xformers incompatible with ROCm")
+    reason="Qwen2-VL dependency xformers incompatible with ROCm",
+)
 def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
    """Test Qwen 2.0 VL model with LoRA through beam search."""
-    config = TestConfig(model_path=QWEN2VL_MODEL_PATH,
-                        lora_path=qwen2vl_lora_files)
+    config = TestConfig(model_path=QWEN2VL_MODEL_PATH, lora_path=qwen2vl_lora_files)
    tester = Qwen2VLTester(config)

    # Test with different LoRA IDs
@@ -176,7 +175,8 @@ def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
        tester.run_beam_search_test(
            [ImageAsset("cherry_blossom")],
            expected_outputs=EXPECTED_BEAM_SEARCH_OUTPUTS,
-            lora_id=lora_id)
+            lora_id=lora_id,
+        )


@pytest.mark.xfail(
@@ -185,12 +185,9 @@ def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
 )
 def test_qwen25vl_lora(qwen25vl_lora_files):
    """Test Qwen 2.5 VL model with LoRA"""
-    config = TestConfig(model_path=QWEN25VL_MODEL_PATH,
-                        lora_path=qwen25vl_lora_files)
+    config = TestConfig(model_path=QWEN25VL_MODEL_PATH, lora_path=qwen25vl_lora_files)
    tester = Qwen2VLTester(config)

    # Test with different LoRA IDs
    for lora_id in [1, 2]:
-        tester.run_test(TEST_IMAGES,
-                        expected_outputs=EXPECTED_OUTPUTS,
-                        lora_id=lora_id)
+        tester.run_test(TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=lora_id)
--- a/tests/lora/test_resolver.py
+++ b/tests/lora/test_resolver.py
@@ -12,13 +12,15 @@ from vllm.lora.resolver import LoRAResolver, LoRAResolverRegistry
 class DummyLoRAResolver(LoRAResolver):
    """A dummy LoRA resolver for testing."""

-    async def resolve_lora(self, base_model_name: str,
-                           lora_name: str) -> Optional[LoRARequest]:
+    async def resolve_lora(
+        self, base_model_name: str, lora_name: str
+    ) -> Optional[LoRARequest]:
        if lora_name == "test_lora":
            return LoRARequest(
                lora_name=lora_name,
                lora_path=f"/dummy/path/{base_model_name}/{lora_name}",
-                lora_int_id=abs(hash(lora_name)))
+                lora_int_id=abs(hash(lora_name)),
+            )
        return None


@@ -70,6 +72,5 @@ async def test_dummy_resolver_resolve():
    assert result.lora_path == f"/dummy/path/{base_model_name}/{lora_name}"

    # Test failed resolution
-    result = await dummy_resolver.resolve_lora(base_model_name,
-                                               "nonexistent_lora")
+    result = await dummy_resolver.resolve_lora(base_model_name, "nonexistent_lora")
    assert result is None
--- a/tests/lora/test_transformers_model.py
+++ b/tests/lora/test_transformers_model.py
@@ -24,20 +24,18 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    prompts = [
        PROMPT_TEMPLATE.format(query="How many singers do we have?"),
        PROMPT_TEMPLATE.format(
-            query=
-            "What is the average, minimum, and maximum age of all singers from France?"  # noqa: E501
+            query="What is the average, minimum, and maximum age of all singers from France?"  # noqa: E501
        ),
        PROMPT_TEMPLATE.format(
-            query=
-            "What are all distinct countries where singers above age 20 are from?"  # noqa: E501
+            query="What are all distinct countries where singers above age 20 are from?"  # noqa: E501
        ),
    ]
    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
    outputs = llm.generate(
        prompts,
        sampling_params,
-        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
-        if lora_id else None)
+        lora_request=LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None,
+    )
    # Print the outputs.
    generated_texts: list[str] = []
    for output in outputs:
@@ -49,13 +47,15 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:


 def test_ilama_lora(ilama_lora_files):
-    llm = vllm.LLM(MODEL_PATH,
-                   max_model_len=1024,
-                   enable_lora=True,
-                   max_loras=4,
-                   max_lora_rank=16,
-                   trust_remote_code=True,
-                   enable_chunked_prefill=True)
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_model_len=1024,
+        enable_lora=True,
+        max_loras=4,
+        max_lora_rank=16,
+        trust_remote_code=True,
+        enable_chunked_prefill=True,
+    )

    output1 = do_sample(llm, ilama_lora_files, lora_id=1)
    for i in range(len(EXPECTED_LORA_OUTPUT)):
@@ -65,20 +65,23 @@ def test_ilama_lora(ilama_lora_files):
        assert output2[i] == EXPECTED_LORA_OUTPUT[i]


-@pytest.mark.skipif(current_platform.is_cuda_alike(),
-                    reason="Skipping to avoid redundant model tests")
+@pytest.mark.skipif(
+    current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
+)
@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
 def test_ilama_lora_tp4(ilama_lora_files):
-    llm = vllm.LLM(MODEL_PATH,
-                   max_model_len=1024,
-                   enable_lora=True,
-                   max_loras=4,
-                   max_lora_rank=16,
-                   tensor_parallel_size=4,
-                   trust_remote_code=True,
-                   fully_sharded_loras=False,
-                   enable_chunked_prefill=True)
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_model_len=1024,
+        enable_lora=True,
+        max_loras=4,
+        max_lora_rank=16,
+        tensor_parallel_size=4,
+        trust_remote_code=True,
+        fully_sharded_loras=False,
+        enable_chunked_prefill=True,
+    )

    output1 = do_sample(llm, ilama_lora_files, lora_id=1)
    for i in range(len(EXPECTED_LORA_OUTPUT)):
@@ -88,20 +91,23 @@ def test_ilama_lora_tp4(ilama_lora_files):
        assert output2[i] == EXPECTED_LORA_OUTPUT[i]


-@pytest.mark.skipif(current_platform.is_cuda_alike(),
-                    reason="Skipping to avoid redundant model tests")
+@pytest.mark.skipif(
+    current_platform.is_cuda_alike(), reason="Skipping to avoid redundant model tests"
+)
@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
 def test_ilama_lora_tp4_fully_sharded_loras(ilama_lora_files):
-    llm = vllm.LLM(MODEL_PATH,
-                   max_model_len=1024,
-                   enable_lora=True,
-                   max_loras=4,
-                   max_lora_rank=16,
-                   tensor_parallel_size=4,
-                   trust_remote_code=True,
-                   fully_sharded_loras=True,
-                   enable_chunked_prefill=True)
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_model_len=1024,
+        enable_lora=True,
+        max_loras=4,
+        max_lora_rank=16,
+        tensor_parallel_size=4,
+        trust_remote_code=True,
+        fully_sharded_loras=True,
+        enable_chunked_prefill=True,
+    )
    output1 = do_sample(llm, ilama_lora_files, lora_id=1)
    for i in range(len(EXPECTED_LORA_OUTPUT)):
        assert output1[i] == EXPECTED_LORA_OUTPUT[i]
--- a/tests/lora/test_utils.py
+++ b/tests/lora/test_utils.py
@@ -9,8 +9,11 @@ import pytest
 from huggingface_hub.utils import HfHubHTTPError
 from torch import nn

-from vllm.lora.utils import (get_adapter_absolute_path,
-                             parse_fine_tuned_lora_name, replace_submodule)
+from vllm.lora.utils import (
+    get_adapter_absolute_path,
+    parse_fine_tuned_lora_name,
+    replace_submodule,
+)
 from vllm.model_executor.models.utils import WeightsMapper


@@ -24,10 +27,12 @@ class LoRANameParserTestConfig(NamedTuple):

 def test_parse_fine_tuned_lora_name_valid():
    fixture = [
-        LoRANameParserTestConfig("base_model.model.lm_head.lora_A.weight",
-                                 "lm_head", True, False),
-        LoRANameParserTestConfig("base_model.model.lm_head.lora_B.weight",
-                                 "lm_head", False, False),
+        LoRANameParserTestConfig(
+            "base_model.model.lm_head.lora_A.weight", "lm_head", True, False
+        ),
+        LoRANameParserTestConfig(
+            "base_model.model.lm_head.lora_B.weight", "lm_head", False, False
+        ),
        LoRANameParserTestConfig(
            "base_model.model.model.embed_tokens.lora_embedding_A",
            "model.embed_tokens",
@@ -71,7 +76,8 @@ def test_parse_fine_tuned_lora_name_valid():
            True,
            False,
            weights_mapper=WeightsMapper(
-                orig_to_new_prefix={"model.": "language_model.model."}),
+                orig_to_new_prefix={"model.": "language_model.model."}
+            ),
        ),
        LoRANameParserTestConfig(
            "base_model.model.model.layers.9.mlp.down_proj.lora_B.weight",
@@ -79,7 +85,8 @@ def test_parse_fine_tuned_lora_name_valid():
            False,
            False,
            weights_mapper=WeightsMapper(
-                orig_to_new_prefix={"model.": "language_model.model."}),
+                orig_to_new_prefix={"model.": "language_model.model."}
+            ),
        ),
        LoRANameParserTestConfig(
            "model.layers.9.mlp.down_proj.lora_A.weight",
@@ -87,7 +94,8 @@ def test_parse_fine_tuned_lora_name_valid():
            True,
            False,
            weights_mapper=WeightsMapper(
-                orig_to_new_prefix={"model.": "language_model.model."}),
+                orig_to_new_prefix={"model.": "language_model.model."}
+            ),
        ),
        LoRANameParserTestConfig(
            "model.layers.9.mlp.down_proj.lora_B.weight",
@@ -95,12 +103,14 @@ def test_parse_fine_tuned_lora_name_valid():
            False,
            False,
            weights_mapper=WeightsMapper(
-                orig_to_new_prefix={"model.": "language_model.model."}),
+                orig_to_new_prefix={"model.": "language_model.model."}
+            ),
        ),
    ]
    for name, module_name, is_lora_a, is_bias, weights_mapper in fixture:
-        assert (module_name, is_lora_a,
-                is_bias) == parse_fine_tuned_lora_name(name, weights_mapper)
+        assert (module_name, is_lora_a, is_bias) == parse_fine_tuned_lora_name(
+            name, weights_mapper
+        )


 def test_parse_fine_tuned_lora_name_invalid():
@@ -115,22 +125,28 @@ def test_parse_fine_tuned_lora_name_invalid():

 def test_replace_submodule():
    model = nn.Sequential(
-        OrderedDict([
-            ("dense1", nn.Linear(764, 100)),
-            ("act1", nn.ReLU()),
-            ("dense2", nn.Linear(100, 50)),
-            (
-                "seq1",
-                nn.Sequential(
-                    OrderedDict([
-                        ("dense1", nn.Linear(100, 10)),
-                        ("dense2", nn.Linear(10, 50)),
-                    ])),
-            ),
-            ("act2", nn.ReLU()),
-            ("output", nn.Linear(50, 10)),
-            ("outact", nn.Sigmoid()),
-        ]))
+        OrderedDict(
+            [
+                ("dense1", nn.Linear(764, 100)),
+                ("act1", nn.ReLU()),
+                ("dense2", nn.Linear(100, 50)),
+                (
+                    "seq1",
+                    nn.Sequential(
+                        OrderedDict(
+                            [
+                                ("dense1", nn.Linear(100, 10)),
+                                ("dense2", nn.Linear(10, 50)),
+                            ]
+                        )
+                    ),
+                ),
+                ("act2", nn.ReLU()),
+                ("output", nn.Linear(50, 10)),
+                ("outact", nn.Sigmoid()),
+            ]
+        )
+    )

    sigmoid = nn.Sigmoid()

@@ -143,52 +159,51 @@ def test_replace_submodule():


 # Unit tests for get_adapter_absolute_path
-@patch('os.path.isabs')
+@patch("os.path.isabs")
 def test_get_adapter_absolute_path_absolute(mock_isabs):
-    path = '/absolute/path/to/lora'
+    path = "/absolute/path/to/lora"
    mock_isabs.return_value = True
    assert get_adapter_absolute_path(path) == path


-@patch('os.path.expanduser')
+@patch("os.path.expanduser")
 def test_get_adapter_absolute_path_expanduser(mock_expanduser):
    # Path with ~ that needs to be expanded
-    path = '~/relative/path/to/lora'
-    absolute_path = '/home/user/relative/path/to/lora'
+    path = "~/relative/path/to/lora"
+    absolute_path = "/home/user/relative/path/to/lora"
    mock_expanduser.return_value = absolute_path
    assert get_adapter_absolute_path(path) == absolute_path


-@patch('os.path.exists')
-@patch('os.path.abspath')
+@patch("os.path.exists")
+@patch("os.path.abspath")
 def test_get_adapter_absolute_path_local_existing(mock_abspath, mock_exist):
    # Relative path that exists locally
-    path = 'relative/path/to/lora'
-    absolute_path = '/absolute/path/to/lora'
+    path = "relative/path/to/lora"
+    absolute_path = "/absolute/path/to/lora"
    mock_exist.return_value = True
    mock_abspath.return_value = absolute_path
    assert get_adapter_absolute_path(path) == absolute_path


-@patch('huggingface_hub.snapshot_download')
-@patch('os.path.exists')
-def test_get_adapter_absolute_path_huggingface(mock_exist,
-                                               mock_snapshot_download):
+@patch("huggingface_hub.snapshot_download")
+@patch("os.path.exists")
+def test_get_adapter_absolute_path_huggingface(mock_exist, mock_snapshot_download):
    # Hugging Face model identifier
-    path = 'org/repo'
-    absolute_path = '/mock/snapshot/path'
+    path = "org/repo"
+    absolute_path = "/mock/snapshot/path"
    mock_exist.return_value = False
    mock_snapshot_download.return_value = absolute_path
    assert get_adapter_absolute_path(path) == absolute_path


-@patch('huggingface_hub.snapshot_download')
-@patch('os.path.exists')
-def test_get_adapter_absolute_path_huggingface_error(mock_exist,
-                                                     mock_snapshot_download):
+@patch("huggingface_hub.snapshot_download")
+@patch("os.path.exists")
+def test_get_adapter_absolute_path_huggingface_error(
+    mock_exist, mock_snapshot_download
+):
    # Hugging Face model identifier with download error
-    path = 'org/repo'
+    path = "org/repo"
    mock_exist.return_value = False
-    mock_snapshot_download.side_effect = HfHubHTTPError(
-        "failed to query model info")
+    mock_snapshot_download.side_effect = HfHubHTTPError("failed to query model info")
    assert get_adapter_absolute_path(path) == path
--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -6,8 +6,14 @@ import random
 import tempfile
 from unittest.mock import patch

-from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, VllmConfig)
+from vllm.config import (
+    CacheConfig,
+    DeviceConfig,
+    ModelConfig,
+    ParallelConfig,
+    SchedulerConfig,
+    VllmConfig,
+)
 from vllm.config.load import LoadConfig
 from vllm.config.lora import LoRAConfig
 from vllm.lora.models import LoRAMapping
@@ -19,12 +25,12 @@ NUM_LORAS = 16

@patch.dict(os.environ, {"RANK": "0"})
 def test_worker_apply_lora(sql_lora_files):
-
    def set_active_loras(worker: Worker, lora_requests: list[LoRARequest]):
        lora_mapping = LoRAMapping([], [])

        worker.model_runner.lora_manager.set_active_adapters(
-            lora_requests, lora_mapping)
+            lora_requests, lora_mapping
+        )

    vllm_config = VllmConfig(
        model_config=ModelConfig(
@@ -49,9 +55,9 @@ def test_worker_apply_lora(sql_lora_files):
            swap_space=0,
            cache_dtype="auto",
        ),
-        lora_config=LoRAConfig(max_lora_rank=8,
-                               max_cpu_loras=NUM_LORAS,
-                               max_loras=NUM_LORAS),
+        lora_config=LoRAConfig(
+            max_lora_rank=8, max_cpu_loras=NUM_LORAS, max_loras=NUM_LORAS
+        ),
    )
    worker = Worker(
        vllm_config=vllm_config,
@@ -67,23 +73,22 @@ def test_worker_apply_lora(sql_lora_files):
    assert worker.list_loras() == set()

    lora_requests = [
-        LoRARequest(str(i + 1), i + 1, sql_lora_files)
-        for i in range(NUM_LORAS)
+        LoRARequest(str(i + 1), i + 1, sql_lora_files) for i in range(NUM_LORAS)
    ]

    set_active_loras(worker, lora_requests)
    assert worker.list_loras() == {
-        lora_request.lora_int_id
-        for lora_request in lora_requests
+        lora_request.lora_int_id for lora_request in lora_requests
    }

    for i in range(NUM_LORAS):
        random.seed(i)
-        iter_lora_requests = random.choices(lora_requests,
-                                            k=random.randint(1, NUM_LORAS))
+        iter_lora_requests = random.choices(
+            lora_requests, k=random.randint(1, NUM_LORAS)
+        )
        random.shuffle(iter_lora_requests)
-        iter_lora_requests = iter_lora_requests[:-random.randint(0, NUM_LORAS)]
+        iter_lora_requests = iter_lora_requests[: -random.randint(0, NUM_LORAS)]
        set_active_loras(worker, lora_requests)
        assert worker.list_loras().issuperset(
-            {lora_request.lora_int_id
-             for lora_request in iter_lora_requests})
+            {lora_request.lora_int_id for lora_request in iter_lora_requests}
+        )
--- a/tests/lora/utils.py
+++ b/tests/lora/utils.py
@@ -13,7 +13,6 @@ from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights


 class DummyLoRAManager:
-
    def __init__(self, device: torch.device = "cuda:0"):
        super().__init__()
        self._loras: dict[str, LoRALayerWeights] = {}
@@ -36,12 +35,12 @@ class DummyLoRAManager:
            module_name,
            rank=rank,
            lora_alpha=1,
-            lora_a=torch.rand([rank, weight.shape[1]],
-                              dtype=weight.dtype,
-                              device=self._device),
-            lora_b=torch.rand([weight.shape[0], rank],
-                              dtype=weight.dtype,
-                              device=self._device),
+            lora_a=torch.rand(
+                [rank, weight.shape[1]], dtype=weight.dtype, device=self._device
+            ),
+            lora_b=torch.rand(
+                [weight.shape[0], rank], dtype=weight.dtype, device=self._device
+            ),
        )
        if generate_embeddings_tensor:
            lora.embeddings_tensor = torch.rand(
@@ -146,27 +145,26 @@ def generate_data(
    op_type,
    device,
 ) -> PunicaTensors:
-    seq_len_tensor = torch.randint(seq_length, seq_length + 1,
-                                   (batches, )).to(device)
+    seq_len_tensor = torch.randint(seq_length, seq_length + 1, (batches,)).to(device)
    b_seq_start_loc = torch.cumsum(
        torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
        dim=0,
    ).to(device)
    total_tokens = seq_len_tensor.sum()
    if op_type == "shrink":
-        inputs_tensor = torch.rand((total_tokens, hidden_size),
-                                   dtype=dtype).to(device)
+        inputs_tensor = torch.rand((total_tokens, hidden_size), dtype=dtype).to(device)
        lora_weights = torch.rand(
            (lora_nums, max_rank, hidden_size),  # col-major
            dtype=dtype,
        ).to(device)
        # shrink op need atomic_add, so output is initinized by 0
-        ref_out_tensor = torch.zeros((total_tokens, max_rank),
-                                     dtype=dtype,
-                                     device=inputs_tensor.device)
+        ref_out_tensor = torch.zeros(
+            (total_tokens, max_rank), dtype=dtype, device=inputs_tensor.device
+        )
        # NOTE  shrink kernel using torch.float32 as output type
-        our_out_tensor = torch.zeros((total_tokens, max_rank),
-                                     dtype=torch.float32).to(device)
+        our_out_tensor = torch.zeros((total_tokens, max_rank), dtype=torch.float32).to(
+            device
+        )
    else:
        inputs_tensor = torch.rand(
            (total_tokens, max_rank),
@@ -184,15 +182,16 @@ def generate_data(
        ).to(device)
        # Ensure the same input.
        our_out_tensor = ref_out_tensor.clone()
-    lora_indices_tensor = torch.randint(0,
-                                        lora_nums - 1 if lora_nums > 1 else 1,
-                                        (batches, )).to(device)
+    lora_indices_tensor = torch.randint(
+        0, lora_nums - 1 if lora_nums > 1 else 1, (batches,)
+    ).to(device)
    indices = torch.zeros((total_tokens), dtype=torch.long).to(device)
    current_offset = 0
    for b_id in range(batches):
        lora_index = lora_indices_tensor[b_id]
-        indices[current_offset:current_offset +
-                seq_len_tensor[b_id]].copy_(lora_index)
+        indices[current_offset : current_offset + seq_len_tensor[b_id]].copy_(
+            lora_index
+        )
        current_offset += seq_len_tensor[b_id].item()

    return PunicaTensors(
@@ -217,8 +216,7 @@ def generate_data_for_expand_nslices(
    nslices,
    device,
 ) -> PunicaTensors:
-    seq_len_tensor = torch.randint(seq_length, seq_length + 1,
-                                   (batches, )).to(device)
+    seq_len_tensor = torch.randint(seq_length, seq_length + 1, (batches,)).to(device)
    b_seq_start_loc = torch.cumsum(
        torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
        dim=0,
@@ -234,22 +232,25 @@ def generate_data_for_expand_nslices(
            torch.rand(
                (lora_nums, hidden_size, max_rank),  # col-major
                dtype=dtype,
-            ).to(device))
+            ).to(device)
+        )
    # expand op needs to complete y+=a@lora_b, so output is
    # initinized randomly
-    ref_out_tensor = torch.rand((total_tokens, hidden_size * nslices),
-                                dtype=dtype).to(device)
+    ref_out_tensor = torch.rand((total_tokens, hidden_size * nslices), dtype=dtype).to(
+        device
+    )
    # Ensure the same input.
    our_out_tensor = ref_out_tensor.clone()
-    lora_indices_tensor = torch.randint(0,
-                                        lora_nums - 1 if lora_nums > 1 else 1,
-                                        (batches, ))
+    lora_indices_tensor = torch.randint(
+        0, lora_nums - 1 if lora_nums > 1 else 1, (batches,)
+    )
    indices = torch.zeros((total_tokens), dtype=torch.long).to(device)
    current_offset = 0
    for b_id in range(batches):
        lora_index = lora_indices_tensor[b_id]
-        indices[current_offset:current_offset +
-                seq_len_tensor[b_id]] = (lora_index.item())
+        indices[current_offset : current_offset + seq_len_tensor[b_id]] = (
+            lora_index.item()
+        )
        current_offset += seq_len_tensor[b_id].item()

    lora_indices_tensor = lora_indices_tensor.to(device)
@@ -276,8 +277,7 @@ def generate_data_for_nslices(
    op_type,
    device,
 ) -> PunicaTensors:
-    seq_len_tensor = torch.randint(seq_length, seq_length + 1,
-                                   (batches, )).to(device)
+    seq_len_tensor = torch.randint(seq_length, seq_length + 1, (batches,)).to(device)
    b_seq_start_loc = torch.cumsum(
        torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
        dim=0,
@@ -286,9 +286,7 @@ def generate_data_for_nslices(

    lora_weights_lst = []
    if op_type == "shrink":
-
-        inputs_tensor = torch.rand((total_tokens, hidden_size),
-                                   dtype=dtype).to(device)
+        inputs_tensor = torch.rand((total_tokens, hidden_size), dtype=dtype).to(device)

        for _ in range(nslices):
            if op_type == "shrink":
@@ -296,7 +294,8 @@ def generate_data_for_nslices(
                    torch.rand(
                        (lora_nums, max_rank, hidden_size),  # col-major
                        dtype=dtype,
-                    ).to(device))
+                    ).to(device)
+                )
        # NOTE  shrink kernel using torch.float32 as output type
        # shrink op need atomic_add, so output is initinized by 0
        our_out_tensor = torch.zeros(
@@ -313,23 +312,26 @@ def generate_data_for_nslices(
                torch.rand(
                    (lora_nums, hidden_size, max_rank),  # col-major
                    dtype=dtype,
-                ).to(device))
+                ).to(device)
+            )
        # expand op needs to complete y+=a@lora_b, so output is
        # initinized randomly
-        our_out_tensor = torch.rand((total_tokens, hidden_size * nslices),
-                                    dtype=dtype).to(device)
+        our_out_tensor = torch.rand(
+            (total_tokens, hidden_size * nslices), dtype=dtype
+        ).to(device)

    # Ensure the same input.
    ref_out_tensor = our_out_tensor.clone()
-    lora_indices_tensor = torch.randint(0,
-                                        lora_nums - 1 if lora_nums > 1 else 1,
-                                        (batches, ))
+    lora_indices_tensor = torch.randint(
+        0, lora_nums - 1 if lora_nums > 1 else 1, (batches,)
+    )
    indices = torch.zeros((total_tokens), dtype=torch.long).to(device)
    current_offset = 0
    for b_id in range(batches):
        lora_index = lora_indices_tensor[b_id]
-        indices[current_offset:current_offset +
-                seq_len_tensor[b_id]] = (lora_index.item())
+        indices[current_offset : current_offset + seq_len_tensor[b_id]] = (
+            lora_index.item()
+        )
        current_offset += seq_len_tensor[b_id].item()

    lora_indices_tensor = lora_indices_tensor.to(device)
@@ -379,24 +381,20 @@ def create_peft_lora(
    }

    for module_name in target_modules:
-
        module = model
        for attr in module_name.split("."):
            module = getattr(module, attr)

        if hasattr(module, "input_size") and hasattr(module, "output_size"):
-
            in_features = module.input_size
            out_features = module.output_size

-        elif hasattr(module, "embedding_dim") and hasattr(
-                module, "num_embeddings"):
+        elif hasattr(module, "embedding_dim") and hasattr(module, "num_embeddings"):
            # ParallelLMHead
            in_features = module.embedding_dim
            out_features = module.num_embeddings
        else:
-            raise ValueError(
-                f"Unable to determine dimensions for module {module_name}")
+            raise ValueError(f"Unable to determine dimensions for module {module_name}")

        lora_A = torch.randn(rank, in_features, dtype=lora_dtype)