Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-05 15:06:22 +01:00
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -4,6 +4,7 @@

 Run `pytest tests/basic_correctness/test_basic_correctness.py`.
 """
+
 import os
 import weakref
 from unittest.mock import Mock
@@ -37,16 +38,21 @@ def test_vllm_gc_ed():


 def _fix_prompt_embed_outputs(
-        vllm_outputs: list[tuple[list[int], str]], hf_model: HfRunner,
-        example_prompts: list[str]) -> list[tuple[list[int], str]]:
+    vllm_outputs: list[tuple[list[int], str]],
+    hf_model: HfRunner,
+    example_prompts: list[str],
+) -> list[tuple[list[int], str]]:
    fixed_vllm_outputs = []
    for vllm_output, hf_input, prompt in zip(
-            vllm_outputs, hf_model.get_inputs(example_prompts),
-            example_prompts):
+        vllm_outputs, hf_model.get_inputs(example_prompts), example_prompts
+    ):
        hf_input_ids = hf_input["input_ids"].tolist()[0]
        fixed_vllm_outputs.append(
-            (hf_input_ids + vllm_output[0][len(hf_input_ids):],
-             prompt + vllm_output[1]))
+            (
+                hf_input_ids + vllm_output[0][len(hf_input_ids) :],
+                prompt + vllm_output[1],
+            )
+        )
    return fixed_vllm_outputs


@@ -69,8 +75,7 @@ def test_models(
    enable_prompt_embeds: bool,
 ) -> None:
    if backend == "XFORMERS" and model == "google/gemma-2-2b-it":
-        pytest.skip(
-            f"{backend} does not support gemma2 with full context length.")
+        pytest.skip(f"{backend} does not support gemma2 with full context length.")

    with monkeypatch.context() as m:
        m.setenv("VLLM_ATTENTION_BACKEND", backend)
@@ -78,34 +83,35 @@ def test_models(
        # 5042 tokens for gemma2
        # gemma2 has alternating sliding window size of 4096
        # we need a prompt with more than 4096 tokens to test the sliding window
-        prompt = "The following numbers of the sequence " + ", ".join(
-            str(i) for i in range(1024)) + " are:"
+        prompt = (
+            "The following numbers of the sequence "
+            + ", ".join(str(i) for i in range(1024))
+            + " are:"
+        )
        example_prompts = [prompt]

        with hf_runner(model) as hf_model:
            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
            if enable_prompt_embeds:
                with torch.no_grad():
-                    prompt_embeds = hf_model.get_prompt_embeddings(
-                        example_prompts)
+                    prompt_embeds = hf_model.get_prompt_embeddings(example_prompts)

        with VllmRunner(
-                model,
-                max_model_len=8192,
-                enforce_eager=enforce_eager,
-                enable_prompt_embeds=enable_prompt_embeds,
-                gpu_memory_utilization=0.7,
-                async_scheduling=async_scheduling,
-                distributed_executor_backend=model_executor,
+            model,
+            max_model_len=8192,
+            enforce_eager=enforce_eager,
+            enable_prompt_embeds=enable_prompt_embeds,
+            gpu_memory_utilization=0.7,
+            async_scheduling=async_scheduling,
+            distributed_executor_backend=model_executor,
        ) as vllm_model:
            if enable_prompt_embeds:
-                vllm_outputs = vllm_model.generate_greedy(
-                    prompt_embeds, max_tokens)
+                vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens)
                vllm_outputs = _fix_prompt_embed_outputs(
-                    vllm_outputs, hf_model, example_prompts)
+                    vllm_outputs, hf_model, example_prompts
+                )
            else:
-                vllm_outputs = vllm_model.generate_greedy(
-                    example_prompts, max_tokens)
+                vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

        check_outputs_equal(
            outputs_0_lst=hf_outputs,
@@ -117,21 +123,18 @@ def test_models(

@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
-    "model, distributed_executor_backend, attention_backend, "
-    "test_suite, extra_env", [
+    "model, distributed_executor_backend, attention_backend, test_suite, extra_env",
+    [
        ("distilbert/distilgpt2", "ray", "", "L4", {}),
        ("distilbert/distilgpt2", "mp", "", "L4", {}),
-        ("distilbert/distilgpt2", "ray", "", "L4", {
-            "VLLM_SLEEP_WHEN_IDLE": "1"
-        }),
-        ("distilbert/distilgpt2", "mp", "", "L4", {
-            "VLLM_SLEEP_WHEN_IDLE": "1"
-        }),
+        ("distilbert/distilgpt2", "ray", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
+        ("distilbert/distilgpt2", "mp", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
        ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4", {}),
        ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4", {}),
        ("distilbert/distilgpt2", "ray", "", "A100", {}),
        ("distilbert/distilgpt2", "mp", "", "A100", {}),
-    ])
+    ],
+)
@pytest.mark.parametrize("enable_prompt_embeds", [True, False])
 def test_models_distributed(
    monkeypatch: pytest.MonkeyPatch,
@@ -149,11 +152,14 @@ def test_models_distributed(
        pytest.skip(f"Skip test for {test_suite}")

    with monkeypatch.context() as monkeypatch_context:
-        if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
+        if (
+            model == "meta-llama/Llama-3.2-1B-Instruct"
+            and distributed_executor_backend == "ray"
+            and attention_backend == ""
+            and test_suite == "L4"
+        ):  # noqa
            if enable_prompt_embeds:
-                pytest.skip(
-                    "enable_prompt_embeds does not work with ray compiled dag."
-                )
+                pytest.skip("enable_prompt_embeds does not work with ray compiled dag.")
            monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
            monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")

@@ -175,30 +181,26 @@ def test_models_distributed(
        # will hurt multiprocessing backend with fork method
        # (the default method).
        with vllm_runner(
-                model,
-                dtype=dtype,
-                tensor_parallel_size=2,
-                distributed_executor_backend=distributed_executor_backend,
-                enable_prompt_embeds=enable_prompt_embeds,
-                gpu_memory_utilization=0.7,
+            model,
+            dtype=dtype,
+            tensor_parallel_size=2,
+            distributed_executor_backend=distributed_executor_backend,
+            enable_prompt_embeds=enable_prompt_embeds,
+            gpu_memory_utilization=0.7,
        ) as vllm_model:
            if enable_prompt_embeds:
                with hf_runner(model, dtype=dtype) as hf_model:
                    with torch.no_grad():
-                        prompt_embeds = hf_model.get_prompt_embeddings(
-                            example_prompts)
-                    vllm_outputs = vllm_model.generate_greedy(
-                        prompt_embeds, max_tokens)
+                        prompt_embeds = hf_model.get_prompt_embeddings(example_prompts)
+                    vllm_outputs = vllm_model.generate_greedy(prompt_embeds, max_tokens)
                    vllm_outputs = _fix_prompt_embed_outputs(
-                        vllm_outputs, hf_model, example_prompts)
-                    hf_outputs = hf_model.generate_greedy(
-                        example_prompts, max_tokens)
+                        vllm_outputs, hf_model, example_prompts
+                    )
+                    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
            else:
-                vllm_outputs = vllm_model.generate_greedy(
-                    example_prompts, max_tokens)
+                vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
                with hf_runner(model, dtype=dtype) as hf_model:
-                    hf_outputs = hf_model.generate_greedy(
-                        example_prompts, max_tokens)
+                    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=hf_outputs,
@@ -209,27 +211,23 @@ def test_models_distributed(


 def test_failed_model_execution(vllm_runner, monkeypatch) -> None:
-
    from vllm.envs import VLLM_USE_V1

    if not VLLM_USE_V1:
        pytest.skip("Skipping V0 test, dump input not supported")

    # Needed to mock an error in the same process
-    monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0')
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

-    with vllm_runner('facebook/opt-125m', enforce_eager=True) as vllm_model:
+    with vllm_runner("facebook/opt-125m", enforce_eager=True) as vllm_model:
        if isinstance(vllm_model.llm.llm_engine, LLMEngineV1):
            v1_test_failed_model_execution(vllm_model)


 def v1_test_failed_model_execution(vllm_model):
-
    engine = vllm_model.llm.llm_engine
-    mocked_execute_model = Mock(
-        side_effect=RuntimeError("Mocked Critical Error"))
-    engine.engine_core.engine_core.model_executor.execute_model =\
-                mocked_execute_model
+    mocked_execute_model = Mock(side_effect=RuntimeError("Mocked Critical Error"))
+    engine.engine_core.engine_core.model_executor.execute_model = mocked_execute_model

    with pytest.raises(RuntimeError) as exc_info:
        prompts = [
--- a/tests/basic_correctness/test_cpu_offload.py
+++ b/tests/basic_correctness/test_cpu_offload.py
@@ -5,5 +5,6 @@ from ..utils import compare_two_settings


 def test_cpu_offload():
-    compare_two_settings("meta-llama/Llama-3.2-1B-Instruct", [],
-                         ["--cpu-offload-gb", "1"])
+    compare_two_settings(
+        "meta-llama/Llama-3.2-1B-Instruct", [], ["--cpu-offload-gb", "1"]
+    )
--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
@@ -23,13 +23,13 @@ def test_python_error():
    tensors = []
    with allocator.use_memory_pool():
        # allocate 70% of the total memory
-        x = torch.empty(alloc_bytes, dtype=torch.uint8, device='cuda')
+        x = torch.empty(alloc_bytes, dtype=torch.uint8, device="cuda")
        tensors.append(x)
    # release the memory
    allocator.sleep()

    # allocate more memory than the total memory
-    y = torch.empty(alloc_bytes, dtype=torch.uint8, device='cuda')
+    y = torch.empty(alloc_bytes, dtype=torch.uint8, device="cuda")
    tensors.append(y)
    with pytest.raises(RuntimeError):
        # when the allocator is woken up, it should raise an error
@@ -41,17 +41,17 @@ def test_python_error():
 def test_basic_cumem():
    # some tensors from default memory pool
    shape = (1024, 1024)
-    x = torch.empty(shape, device='cuda')
+    x = torch.empty(shape, device="cuda")
    x.zero_()

    # some tensors from custom memory pool
    allocator = CuMemAllocator.get_instance()
    with allocator.use_memory_pool():
        # custom memory pool
-        y = torch.empty(shape, device='cuda')
+        y = torch.empty(shape, device="cuda")
        y.zero_()
        y += 1
-        z = torch.empty(shape, device='cuda')
+        z = torch.empty(shape, device="cuda")
        z.zero_()
        z += 2

@@ -74,16 +74,16 @@ def test_basic_cumem():
 def test_cumem_with_cudagraph():
    allocator = CuMemAllocator.get_instance()
    with allocator.use_memory_pool():
-        weight = torch.eye(1024, device='cuda')
+        weight = torch.eye(1024, device="cuda")
    with allocator.use_memory_pool(tag="discard"):
-        cache = torch.empty(1024, 1024, device='cuda')
+        cache = torch.empty(1024, 1024, device="cuda")

    def model(x):
        out = x @ weight
-        cache[:out.size(0)].copy_(out)
+        cache[: out.size(0)].copy_(out)
        return out + 1

-    x = torch.empty(128, 1024, device='cuda')
+    x = torch.empty(128, 1024, device="cuda")

    # warmup
    model(x)
@@ -109,7 +109,7 @@ def test_cumem_with_cudagraph():
    model_graph.replay()

    # cache content is as expected
-    assert torch.allclose(x, cache[:x.size(0)])
+    assert torch.allclose(x, cache[: x.size(0)])

    # output content is as expected
    assert torch.allclose(y, x + 1)
@@ -123,7 +123,8 @@ def test_cumem_with_cudagraph():
        ("meta-llama/Llama-3.2-1B", True),
        # sleep mode with pytorch checkpoint
        ("facebook/opt-125m", True),
-    ])
+    ],
+)
 def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
    with monkeypatch.context() as m:
        assert use_v1