Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
commit d6953beb91 (parent 17edd8a807)
Author: Harry Mellor
Date:   2025-10-05 15:06:22 +01:00
Committed by: GitHub

1508 changed files with 115244 additions and 94146 deletions
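In practice, a conversion like this collapses the old two-tool pipeline into two ruff passes. The sketch below is an assumption about that workflow rather than anything taken from this commit, and it presumes ruff's isort-compatible "I" rules are selected in the project's pyproject.toml:

# Hypothetical helper, not part of this commit: run the ruff equivalents
# of the old yapf + isort passes over the repository.
import subprocess

# "ruff check --fix" applies lint autofixes, including import sorting when
# the isort-compatible "I" rules are enabled (this replaces isort).
subprocess.run(["ruff", "check", "--fix", "."], check=True)

# "ruff format" rewrites code layout (this replaces yapf).
subprocess.run(["ruff", "format", "."], check=True)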


@@ -7,7 +7,8 @@ import pytest
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.entrypoints.openai.api_server import (
-    build_async_engine_client_from_engine_args)
+    build_async_engine_client_from_engine_args,
+)
 from vllm.inputs import TextPrompt
 from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingParams
@@ -26,14 +27,10 @@ def get_lora_requests(lora_path) -> list[LoRARequest]:
     return lora_requests
 
 
-async def requests_processing_time(llm,
-                                    lora_requests: list[LoRARequest]) -> float:
-
-    sampling_params = SamplingParams(n=1,
-                                     temperature=0.0,
-                                     top_p=1.0,
-                                     ignore_eos=True,
-                                     max_tokens=1)
+async def requests_processing_time(llm, lora_requests: list[LoRARequest]) -> float:
+    sampling_params = SamplingParams(
+        n=1, temperature=0.0, top_p=1.0, ignore_eos=True, max_tokens=1
+    )
 
     generators = []
     start = time.perf_counter()
@@ -41,11 +38,11 @@ async def requests_processing_time(llm,
     for lora_request in lora_requests:
         lora_int_id = lora_request.lora_int_id
         generator = llm.generate(
-            prompt=TextPrompt(prompt=f"hello {lora_int_id}",
-                              multi_modal_data=None),  # type: ignore
+            prompt=TextPrompt(prompt=f"hello {lora_int_id}", multi_modal_data=None),  # type: ignore
             sampling_params=sampling_params,
             lora_request=lora_request,
-            request_id=f"test{lora_int_id}")
+            request_id=f"test{lora_int_id}",
+        )
         generators.append(generator)
 
     all_gens = merge_async_iterators(*generators)
@@ -58,13 +55,13 @@ async def requests_processing_time(llm,
 @pytest.mark.asyncio
 async def test_add_lora(chatglm3_lora_files):
-    """ 
+    """
     The add_lora function is used to preload some LoRA adapters into the
     engine in anticipation of future requests using these adapters. To test
     this functionality, we use the async engine to process some requests - We
     do it twice, once with add_lora() preloading and once without.
 
-    We measure the request processing time in both cases and expect the time 
+    We measure the request processing time in both cases and expect the time
     to be lesser in the case with add_lora() calls.
     """
 
     lora_requests: list[LoRARequest] = get_lora_requests(chatglm3_lora_files)
@@ -78,18 +75,18 @@ async def test_add_lora(chatglm3_lora_files):
         max_loras=max_loras,
         max_lora_rank=LORA_RANK,
         max_model_len=128,
-        gpu_memory_utilization=0.8,  #avoid OOM
+        gpu_memory_utilization=0.8,  # avoid OOM
         trust_remote_code=True,
-        enforce_eager=True)
+        enforce_eager=True,
+    )
 
     # split lora_requests into 3 parts
     part_size = len(lora_requests) // 3
     dummy_run_requests = lora_requests[:part_size]
-    warmup_run_requests = lora_requests[part_size:part_size * 2]
-    cold_run_requests = lora_requests[part_size * 2:]
+    warmup_run_requests = lora_requests[part_size : part_size * 2]
+    cold_run_requests = lora_requests[part_size * 2 :]
 
     async with build_async_engine_client_from_engine_args(engine_args) as llm:
-
         # Dummy run - So any 1-time functionality like triton kernel compilation
         # is complete here.
         await requests_processing_time(llm, dummy_run_requests)
@@ -101,18 +98,16 @@ async def test_add_lora(chatglm3_lora_files):
         # Test that all all_lora calls are successful.
         assert all(add_lora_results)
 
-        time_with_add_lora = await requests_processing_time(
-            llm, warmup_run_requests)
+        time_with_add_lora = await requests_processing_time(llm, warmup_run_requests)
 
         # Run without any warmup
-        time_cold_start = await requests_processing_time(
-            llm, cold_run_requests)
+        time_cold_start = await requests_processing_time(llm, cold_run_requests)
 
-        print(f"time hot-start {time_with_add_lora} vs "
-              f"time cold-start {time_cold_start} ")
+        print(f"time hot-start {time_with_add_lora} vs time cold-start {time_cold_start} ")
 
         assert time_with_add_lora < time_cold_start, (
             f"time_with_add_lora={time_with_add_lora}, "
             f"time_cold_start={time_cold_start}"
             "The engine request processing time with LoRA pre-loading "
-            "must be less than the version that does on-demand LoRA loading.")
+            "must be less than the version that does on-demand LoRA loading."
+        )
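
For orientation, the add_lora_results asserted in the last hunk come from a preloading step that this formatting commit leaves untouched, so the diff does not show it. A rough sketch of that warm-up path, assuming the async engine client's add_lora() coroutine that this test refers to, might look as follows; the helper name preload_adapters is illustrative only:

import asyncio

async def preload_adapters(llm, warmup_run_requests):
    # Illustrative sketch: register each LoRA adapter with the engine before
    # any generate() call, so the first request using it avoids the load cost.
    add_lora_results = await asyncio.gather(
        *(llm.add_lora(lora_request) for lora_request in warmup_run_requests)
    )
    # Mirrors the test's check that every preload call reported success.
    assert all(add_lora_results)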