Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
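For context on the tooling switch: ruff covers both jobs of the old toolchain. `ruff format` replaces yapf as the formatter (black-compatible style, which is why the hanging indents below become parenthesized blocks with trailing commas), and ruff's "I" lint rules replace isort for import sorting. The snippet below is an illustrative sketch of that kind of setup, not vLLM's actual configuration; in particular, the line-length value of 88 is inferred from the reformatted lines in this diff.

    # pyproject.toml -- illustrative sketch only, not vLLM's actual configuration
    [tool.ruff]
    line-length = 88

    [tool.ruff.lint]
    # "I" enables the isort-compatible import-sorting rules
    extend-select = ["I"]

With something like this in place, the old two-step pass (isort, then yapf) becomes roughly `ruff check --fix .` followed by `ruff format .`.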
@@ -8,20 +8,20 @@ import ray
 import torch
 import torch.distributed as dist
 
-from vllm.distributed.communication_op import (  # noqa
-    tensor_model_parallel_all_reduce)
+from vllm.distributed.communication_op import tensor_model_parallel_all_reduce  # noqa
 from vllm.distributed.parallel_state import get_tp_group, graph_capture
 from vllm.platforms import current_platform
 
-from ..utils import (ensure_model_parallel_initialized,
-                     init_test_distributed_environment, multi_process_parallel)
+from ..utils import (
+    ensure_model_parallel_initialized,
+    init_test_distributed_environment,
+    multi_process_parallel,
+)
 
 torch.manual_seed(42)
 random.seed(44)
 # Size over 8MB is sufficient for custom quick allreduce.
-test_sizes = [
-    random.randint(8 * 1024 * 1024, 10 * 1024 * 1024) for _ in range(8)
-]
+test_sizes = [random.randint(8 * 1024 * 1024, 10 * 1024 * 1024) for _ in range(8)]
 for i, v in enumerate(test_sizes):
     test_sizes[i] -= v % 8
 
@@ -38,8 +38,7 @@ def graph_quickreduce(
         m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
         device = torch.device(f"cuda:{rank}")
         torch.cuda.set_device(device)
-        init_test_distributed_environment(tp_size, pp_size, rank,
-                                          distributed_init_port)
+        init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
         ensure_model_parallel_initialized(tp_size, pp_size)
         group = get_tp_group().device_group
 
@@ -64,18 +63,15 @@ def graph_quickreduce(
         for sz in test_sizes:
             for dtype in [torch.float16, torch.bfloat16]:
                 with graph_capture(device=device) as graph_capture_context:
-                    inp1 = torch.randint(1,
-                                         23, (sz, ),
-                                         dtype=dtype,
-                                         device=torch.cuda.current_device())
-                    inp2 = torch.randint(-23,
-                                         1, (sz, ),
-                                         dtype=dtype,
-                                         device=torch.cuda.current_device())
+                    inp1 = torch.randint(
+                        1, 23, (sz,), dtype=dtype, device=torch.cuda.current_device()
+                    )
+                    inp2 = torch.randint(
+                        -23, 1, (sz,), dtype=dtype, device=torch.cuda.current_device()
+                    )
                     torch.cuda.synchronize()
                     graph = torch.cuda.CUDAGraph()
-                    with torch.cuda.graph(graph,
-                                          stream=graph_capture_context.stream):
+                    with torch.cuda.graph(graph, stream=graph_capture_context.stream):
                         for _ in range(num_communication):
                             out1 = tensor_model_parallel_all_reduce(inp1)
                             dist.all_reduce(inp1, group=group)
@@ -99,39 +95,42 @@ def eager_quickreduce(
         device = torch.device(f"cuda:{rank}")
         torch.cuda.set_device(device)
 
-        init_test_distributed_environment(tp_size, pp_size, rank,
-                                          distributed_init_port)
+        init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
 
         # Size over 8MB is sufficient for custom quick allreduce.
         sz = 16 * 1024 * 1024
         fa = get_tp_group().device_communicator.qr_comm
-        inp = torch.tensor([1.0 * ((i) % 23) for i in range(sz)],
-                           dtype=torch.float16,
-                           device=device)
+        inp = torch.tensor(
+            [1.0 * ((i) % 23) for i in range(sz)], dtype=torch.float16, device=device
+        )
         out = fa.quick_all_reduce(inp)
         torch.testing.assert_close(out, inp * tp_size, atol=2.5, rtol=0.1)
 
-        inp = torch.tensor([1.0 * ((i) % 23) for i in range(sz)],
-                           dtype=torch.bfloat16,
-                           device=device)
+        inp = torch.tensor(
+            [1.0 * ((i) % 23) for i in range(sz)], dtype=torch.bfloat16, device=device
+        )
         out = fa.quick_all_reduce(inp)
         torch.testing.assert_close(out, inp * tp_size, atol=2.5, rtol=0.1)
 
 
-@pytest.mark.skipif(not current_platform.is_rocm(),
-                    reason="only test quick allreduce for rocm")
+@pytest.mark.skipif(
+    not current_platform.is_rocm(), reason="only test quick allreduce for rocm"
+)
 @pytest.mark.parametrize("quant_mode", ["FP", "INT8", "INT6", "INT4"])
 @pytest.mark.parametrize("tp_size", [2])
 @pytest.mark.parametrize("pipeline_parallel_size", [1, 2])
 @pytest.mark.parametrize("test_target", [graph_quickreduce, eager_quickreduce])
-def test_custom_quick_allreduce(monkeypatch: pytest.MonkeyPatch, tp_size,
-                                pipeline_parallel_size, test_target,
-                                quant_mode):
+def test_custom_quick_allreduce(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size,
+    pipeline_parallel_size,
+    test_target,
+    quant_mode,
+):
     world_size = tp_size * pipeline_parallel_size
     if world_size > torch.cuda.device_count():
         pytest.skip("Not enough GPUs to run the test.")
 
     monkeypatch.setenv("VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", quant_mode)
 
-    multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size,
-                           test_target)
+    multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size, test_target)
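A note on the loose tolerances (atol=2.5, rtol=0.1) in the assert_close calls above: the quant_mode parametrization (FP/INT8/INT6/INT4) suggests the quick allreduce can run over quantized payloads, which is presumably why the equality check against inp * tp_size allows a generous margin. The following is a minimal, single-process sketch of that idea only, not vLLM's quick allreduce implementation; it simulates a symmetric INT8 quantize/dequantize round trip and shows the error stays well inside the test's tolerances.

    # Illustrative sketch only -- not vLLM's quick allreduce.
    import torch

    tp_size = 2
    sz = 1024
    inp = torch.tensor([float(i % 23) for i in range(sz)], dtype=torch.float16)

    # Symmetric int8 quantize/dequantize round trip for one hypothetical rank's payload.
    scale = inp.abs().max() / 127.0
    dequant = torch.clamp((inp / scale).round(), -128, 127) * scale

    # Every rank holds the same tensor, so the reduced result is tp_size copies of it.
    out = dequant * tp_size

    # The quantization error fits comfortably within the tolerances used by the test.
    torch.testing.assert_close(out, inp * tp_size, atol=2.5, rtol=0.1)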