Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-10-05 15:06:22 +01:00
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions
--- a/tests/distributed/conftest.py
+++ b/tests/distributed/conftest.py
@@ -111,8 +111,7 @@ class MockSubscriber:
        self.last_seq = -1
        self.decoder = msgspec.msgpack.Decoder(type=decode_type)

-    def receive_one(self,
-                    timeout=1000) -> Union[tuple[int, SampleBatch], None]:
+    def receive_one(self, timeout=1000) -> Union[tuple[int, SampleBatch], None]:
        """Receive a single message with timeout"""
        if not self.sub.poll(timeout):
            return None
@@ -135,8 +134,7 @@ class MockSubscriber:

        self.replay_sockets[socket_idx].send(start_seq.to_bytes(8, "big"))

-    def receive_replay(self,
-                       socket_idx: int = 0) -> list[tuple[int, SampleBatch]]:
+    def receive_replay(self, socket_idx: int = 0) -> list[tuple[int, SampleBatch]]:
        """Receive replayed messages from a specific replay socket"""
        if not self.replay_sockets:
            raise ValueError("Replay sockets not initialized")
--- a/tests/distributed/test_ca_buffer_sharing.py
+++ b/tests/distributed/test_ca_buffer_sharing.py
@@ -12,7 +12,8 @@ import torch.distributed as dist

 from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary
 from vllm.distributed.device_communicators.custom_all_reduce import (  # noqa
-    CustomAllreduce)
+    CustomAllreduce,
+)

 # create a cpu process group for communicating metadata (ipc handle)
 dist.init_process_group(backend="gloo")
@@ -52,7 +53,8 @@ for p in pointers:
        assert ord(host_data[i]) == byte_value, (
            f"Rank {rank} failed"
            f" to verify buffer {p}. Expected {byte_value}, "
-            f"got {ord(host_data[i])}")
+            f"got {ord(host_data[i])}"
+        )

 print(f"Rank {rank} verified all buffers")

--- a/tests/distributed/test_comm_ops.py
+++ b/tests/distributed/test_comm_ops.py
@@ -13,13 +13,19 @@ import pytest
 import ray
 import torch

-from vllm.distributed import (broadcast_tensor_dict, get_pp_group,
-                              tensor_model_parallel_all_gather,
-                              tensor_model_parallel_all_reduce,
-                              tensor_model_parallel_reduce_scatter)
+from vllm.distributed import (
+    broadcast_tensor_dict,
+    get_pp_group,
+    tensor_model_parallel_all_gather,
+    tensor_model_parallel_all_reduce,
+    tensor_model_parallel_reduce_scatter,
+)

-from ..utils import (init_test_distributed_environment, multi_gpu_test,
-                     multi_process_parallel)
+from ..utils import (
+    init_test_distributed_environment,
+    multi_gpu_test,
+    multi_process_parallel,
+)


@ray.remote(num_gpus=1, max_calls=1)
@@ -37,12 +43,11 @@ def all_reduce_test_worker(

    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
-    init_test_distributed_environment(tp_size, pp_size, rank,
-                                      distributed_init_port)
+    init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
    num_elements = 8
    all_tensors = [
-        torch.arange(num_elements, dtype=torch.float32, device="cuda") *
-        (r + 1) for r in range(tp_size)
+        torch.arange(num_elements, dtype=torch.float32, device="cuda") * (r + 1)
+        for r in range(tp_size)
    ]
    expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0)
    t = all_tensors[rank % tp_size]
@@ -51,28 +56,31 @@ def all_reduce_test_worker(


@ray.remote(num_gpus=1, max_calls=1)
-def reduce_scatter_test_worker(monkeypatch: pytest.MonkeyPatch, tp_size: int,
-                               pp_size: int, rank: int,
-                               distributed_init_port: str):
+def reduce_scatter_test_worker(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size: int,
+    pp_size: int,
+    rank: int,
+    distributed_init_port: str,
+):
    # it is important to delete the CUDA_VISIBLE_DEVICES environment variable
    # so that each worker can see all the GPUs
    # they will be able to set the device to the correct GPU
    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
-    init_test_distributed_environment(tp_size, pp_size, rank,
-                                      distributed_init_port)
+    init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)

    num_elements = 8
    all_tensors = [
-        torch.arange(num_elements, dtype=torch.float32, device="cuda") *
-        (r + 1) for r in range(tp_size)
+        torch.arange(num_elements, dtype=torch.float32, device="cuda") * (r + 1)
+        for r in range(tp_size)
    ]

    index = rank % tp_size
    partition_size = num_elements // tp_size
    all_reduce = torch.sum(torch.stack(all_tensors, dim=0), dim=0)
-    expected = all_reduce[index * partition_size:(index + 1) * partition_size]
+    expected = all_reduce[index * partition_size : (index + 1) * partition_size]
    t = all_tensors[index]
    t = tensor_model_parallel_reduce_scatter(t, 0)
    torch.testing.assert_close(t, expected)
@@ -92,8 +100,7 @@ def all_gather_test_worker(
    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
-    init_test_distributed_environment(tp_size, pp_size, rank,
-                                      distributed_init_port)
+    init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
    num_dimensions = 3
    tensor_size = list(range(2, num_dimensions + 2))
    total_size = 1
@@ -101,8 +108,10 @@ def all_gather_test_worker(
        total_size *= s
    for all_gather_dimension in range(num_dimensions):
        all_tensors = [
-            torch.arange(total_size, dtype=torch.float32,
-                         device="cuda").reshape(tensor_size) * (r + 1)
+            torch.arange(total_size, dtype=torch.float32, device="cuda").reshape(
+                tensor_size
+            )
+            * (r + 1)
            for r in range(tp_size)
        ]
        expected = torch.cat(all_tensors, dim=all_gather_dimension)
@@ -125,8 +134,7 @@ def broadcast_tensor_dict_test_worker(
    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
-    init_test_distributed_environment(tp_size, pp_size, rank,
-                                      distributed_init_port)
+    init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
    test_dict = {
        # device tensor
        "a": torch.arange(8, dtype=torch.float32, device="cuda"),
@@ -134,10 +142,7 @@ def broadcast_tensor_dict_test_worker(
        "b": torch.arange(16, dtype=torch.int8, device="cpu"),
        "c": "test",
        "d": [1, 2, 3],
-        "e": {
-            "a": 1,
-            "b": 2
-        },
+        "e": {"a": 1, "b": 2},
        # empty tensor
        "f": torch.tensor([], dtype=torch.float32, device="cuda"),
    }
@@ -166,8 +171,7 @@ def send_recv_tensor_dict_test_worker(
    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
-    init_test_distributed_environment(tp_size, pp_size, rank,
-                                      distributed_init_port)
+    init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)

    test_dict = {
        # device tensor
@@ -176,10 +180,7 @@ def send_recv_tensor_dict_test_worker(
        "b": torch.arange(16, dtype=torch.int8, device="cpu"),
        "c": "test",
        "d": [1, 2, 3],
-        "e": {
-            "a": 1,
-            "b": 2
-        },
+        "e": {"a": 1, "b": 2},
        # empty tensor
        "f": torch.tensor([], dtype=torch.float32, device="cuda"),
    }
@@ -211,8 +212,7 @@ def send_recv_test_worker(
    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
-    init_test_distributed_environment(tp_size, pp_size, rank,
-                                      distributed_init_port)
+    init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)

    size = 64
    test_tensor = torch.arange(64, dtype=torch.float32, device="cuda")
@@ -229,10 +229,10 @@ def send_recv_test_worker(

@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("tp_size", [2])
-@pytest.mark.parametrize("test_target", [
-    all_reduce_test_worker, all_gather_test_worker,
-    broadcast_tensor_dict_test_worker
-])
+@pytest.mark.parametrize(
+    "test_target",
+    [all_reduce_test_worker, all_gather_test_worker, broadcast_tensor_dict_test_worker],
+)
 def test_multi_process_tensor_parallel(
    monkeypatch: pytest.MonkeyPatch,
    tp_size: int,
@@ -244,7 +244,8 @@ def test_multi_process_tensor_parallel(
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("pp_size", [2])
@pytest.mark.parametrize(
-    "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker])
+    "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker]
+)
 def test_multi_process_pipeline_parallel(
    monkeypatch: pytest.MonkeyPatch,
    pp_size: int,
@@ -256,11 +257,16 @@ def test_multi_process_pipeline_parallel(
@multi_gpu_test(num_gpus=4)
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("pp_size", [2])
-@pytest.mark.parametrize("test_target", [
-    send_recv_test_worker, send_recv_tensor_dict_test_worker,
-    all_reduce_test_worker, all_gather_test_worker,
-    broadcast_tensor_dict_test_worker
-])
+@pytest.mark.parametrize(
+    "test_target",
+    [
+        send_recv_test_worker,
+        send_recv_tensor_dict_test_worker,
+        all_reduce_test_worker,
+        all_gather_test_worker,
+        broadcast_tensor_dict_test_worker,
+    ],
+)
 def test_multi_process_tensor_parallel_pipeline_parallel(
    tp_size: int,
    pp_size: int,
--- a/tests/distributed/test_context_parallel.py
+++ b/tests/distributed/test_context_parallel.py
@@ -7,6 +7,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
 all workers in a node other than the head node, which can cause the test
 to fail.
 """
+
 import json
 import os
 from dataclasses import dataclass
@@ -56,7 +57,8 @@ class CPTestSettings:
            raise ValueError(
                f"Length mismatch: distributed_backends "
                f"({len(self.distributed_backends)}) != "
-                f"vllm_major_versions ({len(self.vllm_major_versions)})")
+                f"vllm_major_versions ({len(self.vllm_major_versions)})"
+            )

    @staticmethod
    def detailed(
@@ -74,29 +76,39 @@ class CPTestSettings:
                for dcp_multiplier in [0.5, 1]:
                    for chunked_prefill_val in [True]:
                        parallel_setups.append(
-                            ParallelSetup(tp_size=tp_base,
-                                          pp_size=pp_multiplier * pp_base,
-                                          dcp_size=int(dcp_multiplier *
-                                                       tp_base),
-                                          eager_mode=eager_mode_val,
-                                          chunked_prefill=chunked_prefill_val))
+                            ParallelSetup(
+                                tp_size=tp_base,
+                                pp_size=pp_multiplier * pp_base,
+                                dcp_size=int(dcp_multiplier * tp_base),
+                                eager_mode=eager_mode_val,
+                                chunked_prefill=chunked_prefill_val,
+                            )
+                        )
        return CPTestSettings(
            parallel_setups=parallel_setups,
            distributed_backends=["mp"],
            vllm_major_versions=["1"],
            runner=runner,
-            test_options=CPTestOptions(multi_node_only=multi_node_only,
-                                       load_format=load_format),
+            test_options=CPTestOptions(
+                multi_node_only=multi_node_only, load_format=load_format
+            ),
        )

    def iter_params(self, model_id: str):
        opts = self.test_options

        for parallel_setup in self.parallel_setups:
-            for backend, vllm_major_version in zip(self.distributed_backends,
-                                                   self.vllm_major_versions):
-                yield (model_id, parallel_setup, backend, vllm_major_version,
-                       self.runner, opts)
+            for backend, vllm_major_version in zip(
+                self.distributed_backends, self.vllm_major_versions
+            ):
+                yield (
+                    model_id,
+                    parallel_setup,
+                    backend,
+                    vllm_major_version,
+                    self.runner,
+                    opts,
+                )


 def _compare_cp_with_tp(
@@ -148,8 +160,10 @@ def _compare_cp_with_tp(
    if num_gpus_available < tp_size * pp_size:
        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
    if VLLM_MULTI_NODE and distributed_backend == "mp":
-        pytest.skip("Skipping multi-node pipeline parallel test for "
-                    "multiprocessing distributed backend")
+        pytest.skip(
+            "Skipping multi-node pipeline parallel test for "
+            "multiprocessing distributed backend"
+        )
    if multi_node_only and not VLLM_MULTI_NODE:
        pytest.skip("Not in multi-node setting")

@@ -178,8 +192,7 @@ def _compare_cp_with_tp(
        common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])

    cp_env = tp_env = {
-        "VLLM_USE_V1":
-        vllm_major_version,  # Note(hc): DCP only support V1 engine only
+        "VLLM_USE_V1": vllm_major_version,  # Note(hc): DCP supports the V1 engine only
    }

    cp_args = [
@@ -205,13 +218,15 @@ def _compare_cp_with_tp(
    ]

    try:
-        compare_two_settings(model_id,
-                             cp_args,
-                             tp_args,
-                             cp_env,
-                             tp_env,
-                             method=method,
-                             max_wait_seconds=720)
+        compare_two_settings(
+            model_id,
+            cp_args,
+            tp_args,
+            cp_env,
+            tp_env,
+            method=method,
+            max_wait_seconds=720,
+        )
    except Exception:
        testing_ray_compiled_graph = cp_env is not None
        if testing_ray_compiled_graph and vllm_major_version == "0":
@@ -224,9 +239,10 @@ def _compare_cp_with_tp(

 CP_TEXT_GENERATION_MODELS = {
    # [MLA attention only]
-    "deepseek-ai/DeepSeek-V2-Lite-Chat":
-    [CPTestSettings.detailed(),
-     CPTestSettings.detailed(tp_base=2)],
+    "deepseek-ai/DeepSeek-V2-Lite-Chat": [
+        CPTestSettings.detailed(),
+        CPTestSettings.detailed(tp_base=2),
+    ],
 }

 CP_TEST_MODELS = [
@@ -237,11 +253,19 @@ CP_TEST_MODELS = [


@pytest.mark.parametrize(
-    ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
-     "runner", "test_options"),
+    (
+        "model_id",
+        "parallel_setup",
+        "distributed_backend",
+        "vllm_major_version",
+        "runner",
+        "test_options",
+    ),
    [
-        params for model_id, settings in CP_TEXT_GENERATION_MODELS.items()
-        for setting in settings for params in setting.iter_params(model_id)
+        params
+        for model_id, settings in CP_TEXT_GENERATION_MODELS.items()
+        for setting in settings
+        for params in setting.iter_params(model_id)
        if model_id in CP_TEST_MODELS
    ],
 )
@@ -255,12 +279,14 @@ def test_cp_generation(
    test_options: CPTestOptions,
    num_gpus_available,
 ):
-    _compare_cp_with_tp(model_id,
-                        parallel_setup,
-                        distributed_backend,
-                        vllm_major_version,
-                        runner,
-                        test_options,
-                        num_gpus_available,
-                        method="generate",
-                        is_multimodal=False)
+    _compare_cp_with_tp(
+        model_id,
+        parallel_setup,
+        distributed_backend,
+        vllm_major_version,
+        runner,
+        test_options,
+        num_gpus_available,
+        method="generate",
+        is_multimodal=False,
+    )
--- a/tests/distributed/test_custom_all_reduce.py
+++ b/tests/distributed/test_custom_all_reduce.py
@@ -8,12 +8,14 @@ import ray
 import torch
 import torch.distributed as dist

-from vllm.distributed.communication_op import (  # noqa
-    tensor_model_parallel_all_reduce)
+from vllm.distributed.communication_op import tensor_model_parallel_all_reduce  # noqa
 from vllm.distributed.parallel_state import get_tp_group, graph_capture

-from ..utils import (ensure_model_parallel_initialized,
-                     init_test_distributed_environment, multi_process_parallel)
+from ..utils import (
+    ensure_model_parallel_initialized,
+    init_test_distributed_environment,
+    multi_process_parallel,
+)

 random.seed(42)
 test_sizes = [random.randint(1024, 2048 * 1024) for _ in range(8)]
@@ -33,8 +35,7 @@ def graph_allreduce(
        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
        device = torch.device(f"cuda:{rank}")
        torch.cuda.set_device(device)
-        init_test_distributed_environment(tp_size, pp_size, rank,
-                                          distributed_init_port)
+        init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
        ensure_model_parallel_initialized(tp_size, pp_size)
        group = get_tp_group().device_group

@@ -60,18 +61,15 @@ def graph_allreduce(
            for dtype in [torch.float32, torch.float16, torch.bfloat16]:
                with graph_capture(device=device) as graph_capture_context:
                    # use integers so result matches NCCL exactly
-                    inp1 = torch.randint(1,
-                                         16, (sz, ),
-                                         dtype=dtype,
-                                         device=torch.cuda.current_device())
-                    inp2 = torch.randint(1,
-                                         16, (sz, ),
-                                         dtype=dtype,
-                                         device=torch.cuda.current_device())
+                    inp1 = torch.randint(
+                        1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device()
+                    )
+                    inp2 = torch.randint(
+                        1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device()
+                    )
                    torch.cuda.synchronize()
                    graph = torch.cuda.CUDAGraph()
-                    with torch.cuda.graph(graph,
-                                          stream=graph_capture_context.stream):
+                    with torch.cuda.graph(graph, stream=graph_capture_context.stream):
                        for i in range(num_communication):
                            out1 = tensor_model_parallel_all_reduce(inp1)
                            # the input buffer is immediately modified to test
@@ -96,8 +94,7 @@ def eager_allreduce(
        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
        device = torch.device(f"cuda:{rank}")
        torch.cuda.set_device(device)
-        init_test_distributed_environment(tp_size, pp_size, rank,
-                                          distributed_init_port)
+        init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)

        # we use the first group to communicate once
        # and the second group to communicate twice
@@ -132,5 +129,4 @@ def test_custom_allreduce(
    world_size = tp_size * pipeline_parallel_size
    if world_size > torch.cuda.device_count():
        pytest.skip("Not enough GPUs to run the test.")
-    multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size,
-                           test_target)
+    multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size, test_target)
--- a/tests/distributed/test_distributed_oot.py
+++ b/tests/distributed/test_distributed_oot.py
@@ -1,8 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from ..entrypoints.openai.test_oot_registration import (
-    run_and_test_dummy_opt_api_server)
+from ..entrypoints.openai.test_oot_registration import run_and_test_dummy_opt_api_server


 def test_distributed_oot(dummy_opt_path: str):
--- a/tests/distributed/test_eplb_algo.py
+++ b/tests/distributed/test_eplb_algo.py
@@ -10,10 +10,12 @@ from vllm.distributed.eplb.rebalance_algo import rebalance_experts
 def test_basic_rebalance():
    """Test basic rebalancing functionality"""
    # Example from https://github.com/deepseek-ai/eplb
-    weight = torch.tensor([
-        [90, 132, 40, 61, 104, 165, 39, 4, 73, 56, 183, 86],
-        [20, 107, 104, 64, 19, 197, 187, 157, 172, 86, 16, 27],
-    ])
+    weight = torch.tensor(
+        [
+            [90, 132, 40, 61, 104, 165, 39, 4, 73, 56, 183, 86],
+            [20, 107, 104, 64, 19, 197, 187, 157, 172, 86, 16, 27],
+        ]
+    )

    num_layers = weight.shape[0]
    num_replicas = 16
@@ -21,45 +23,49 @@ def test_basic_rebalance():
    num_nodes = 2
    num_gpus = 8

-    phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
-                                                 num_groups, num_nodes,
-                                                 num_gpus)
+    phy2log, log2phy, logcnt = rebalance_experts(
+        weight, num_replicas, num_groups, num_nodes, num_gpus
+    )

    # Verify output shapes
    assert phy2log.shape == (
        2,
        16,
    ), f"Expected `phy2log` shape (2, 16), got {phy2log.shape}"
-    assert (log2phy.shape[0] == 2
-            ), f"Expected `log2phy` first dimension 2, got {log2phy.shape[0]}"
-    assert (
-        log2phy.shape[1] == 12
-    ), f"Expected `log2phy` second dimension 12, got {log2phy.shape[1]}"
+    assert log2phy.shape[0] == 2, (
+        f"Expected `log2phy` first dimension 2, got {log2phy.shape[0]}"
+    )
+    assert log2phy.shape[1] == 12, (
+        f"Expected `log2phy` second dimension 12, got {log2phy.shape[1]}"
+    )
    assert logcnt.shape == (
        2,
        12,
    ), f"Expected `logcnt` shape (2, 12), got {logcnt.shape}"

    # Verify physical to logical expert mapping range is correct
-    assert torch.all(phy2log >= 0) and torch.all(
-        phy2log < 12), "Physical to logical mapping should be in range [0, 12)"
+    assert torch.all(phy2log >= 0) and torch.all(phy2log < 12), (
+        "Physical to logical mapping should be in range [0, 12)"
+    )

    # Verify expert count reasonableness
-    assert torch.all(
-        logcnt >= 1), "Each logical expert should have at least 1 replica"
-    assert (
-        torch.sum(logcnt, dim=1).sum() == num_replicas *
-        num_layers), f"Total replicas should be {num_replicas * num_layers}"
+    assert torch.all(logcnt >= 1), "Each logical expert should have at least 1 replica"
+    assert torch.sum(logcnt, dim=1).sum() == num_replicas * num_layers, (
+        f"Total replicas should be {num_replicas * num_layers}"
+    )

    # Verify expected output
-    expected_phy2log = torch.tensor([
-        [5, 6, 5, 7, 8, 4, 3, 4, 10, 9, 10, 2, 0, 1, 11, 1],
-        [7, 10, 6, 8, 6, 11, 8, 9, 2, 4, 5, 1, 5, 0, 3, 1],
-    ])
+    expected_phy2log = torch.tensor(
+        [
+            [5, 6, 5, 7, 8, 4, 3, 4, 10, 9, 10, 2, 0, 1, 11, 1],
+            [7, 10, 6, 8, 6, 11, 8, 9, 2, 4, 5, 1, 5, 0, 3, 1],
+        ]
+    )
    assert torch.all(phy2log == expected_phy2log)

-    expected_logcnt = torch.tensor([[1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1],
-                                    [1, 2, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1]])
+    expected_logcnt = torch.tensor(
+        [[1, 2, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1], [1, 2, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1]]
+    )
    assert torch.all(logcnt == expected_logcnt)


@@ -71,9 +77,9 @@ def test_single_gpu_case():
    num_nodes = 1
    num_gpus = 1

-    phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
-                                                 num_groups, num_nodes,
-                                                 num_gpus)
+    phy2log, log2phy, logcnt = rebalance_experts(
+        weight, num_replicas, num_groups, num_nodes, num_gpus
+    )

    # Verify shapes
    assert phy2log.shape == (1, 4)
@@ -93,19 +99,19 @@ def test_equal_weights():
    num_nodes = 2
    num_gpus = 4

-    phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
-                                                 num_groups, num_nodes,
-                                                 num_gpus)
+    phy2log, log2phy, logcnt = rebalance_experts(
+        weight, num_replicas, num_groups, num_nodes, num_gpus
+    )

    # Verify shapes
    assert phy2log.shape == (1, 8)
    assert logcnt.shape == (1, 8)

    # With equal weights, each expert should have exactly one replica
-    assert torch.all(
-        logcnt == 1
-    ), "With equal weights and no replication, " \
-       "each expert should have exactly 1 replica"
+    assert torch.all(logcnt == 1), (
+        "With equal weights and no replication, "
+        "each expert should have exactly 1 replica"
+    )


 def test_extreme_weight_imbalance():
@@ -116,35 +122,37 @@ def test_extreme_weight_imbalance():
    num_nodes = 2
    num_gpus = 4

-    phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
-                                                 num_groups, num_nodes,
-                                                 num_gpus)
+    phy2log, log2phy, logcnt = rebalance_experts(
+        weight, num_replicas, num_groups, num_nodes, num_gpus
+    )

    # Verify shapes
    assert phy2log.shape == (1, 12)
    assert logcnt.shape == (1, 8)

    # Expert with highest weight (index 0) should have more replicas
-    assert (
-        logcnt[0, 0]
-        > logcnt[0, 1]), "Expert with highest weight should have more replicas"
+    assert logcnt[0, 0] > logcnt[0, 1], (
+        "Expert with highest weight should have more replicas"
+    )


 def test_multiple_layers():
    """Test multiple layers case"""
-    weight = torch.tensor([
-        [10, 20, 30, 40, 50, 60],  # First layer
-        [60, 50, 40, 30, 20, 10],  # Second layer (opposite weight pattern)
-        [25, 25, 25, 25, 25, 25],  # Third layer (equal weights)
-    ])
+    weight = torch.tensor(
+        [
+            [10, 20, 30, 40, 50, 60],  # First layer
+            [60, 50, 40, 30, 20, 10],  # Second layer (opposite weight pattern)
+            [25, 25, 25, 25, 25, 25],  # Third layer (equal weights)
+        ]
+    )
    num_replicas = 8
    num_groups = 2
    num_nodes = 2
    num_gpus = 4

-    phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
-                                                 num_groups, num_nodes,
-                                                 num_gpus)
+    phy2log, log2phy, logcnt = rebalance_experts(
+        weight, num_replicas, num_groups, num_nodes, num_gpus
+    )

    # Verify shapes
    assert phy2log.shape == (3, 8)
@@ -152,12 +160,12 @@ def test_multiple_layers():

    # Verify expert allocation is reasonable for each layer
    for layer in range(3):
-        assert torch.all(phy2log[layer] >= 0) and torch.all(
-            phy2log[layer] < 6
-        ), f"Layer {layer} physical to logical mapping" \
-            "should be in range [0, 6)"
-        assert (torch.sum(logcnt[layer]) == num_replicas
-                ), f"Layer {layer} total replicas should be {num_replicas}"
+        assert torch.all(phy2log[layer] >= 0) and torch.all(phy2log[layer] < 6), (
+            f"Layer {layer} physical to logical mapping should be in range [0, 6)"
+        )
+        assert torch.sum(logcnt[layer]) == num_replicas, (
+            f"Layer {layer} total replicas should be {num_replicas}"
+        )


 def test_parameter_validation():
@@ -179,17 +187,19 @@ def test_parameter_validation():

 def test_small_scale_hierarchical():
    """Test small-scale hierarchical load balancing"""
-    weight = torch.tensor([
-        [100, 50, 200, 75, 150, 25, 300, 80],  # 8 experts
-    ])
+    weight = torch.tensor(
+        [
+            [100, 50, 200, 75, 150, 25, 300, 80],  # 8 experts
+        ]
+    )
    num_replicas = 12
    num_groups = 4  # 4 groups, 2 experts each
    num_nodes = 2  # 2 nodes
    num_gpus = 4  # 4 GPUs

-    phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
-                                                 num_groups, num_nodes,
-                                                 num_gpus)
+    phy2log, log2phy, logcnt = rebalance_experts(
+        weight, num_replicas, num_groups, num_nodes, num_gpus
+    )

    # Verify basic constraints
    assert phy2log.shape == (1, 12)
@@ -199,8 +209,9 @@ def test_small_scale_hierarchical():

    # Expert with highest weight should have more replicas
    max_weight_expert = torch.argmax(weight[0])
-    assert (logcnt[0, max_weight_expert]
-            >= 2), "Highest weight expert should have multiple replicas"
+    assert logcnt[0, max_weight_expert] >= 2, (
+        "Highest weight expert should have multiple replicas"
+    )


 def test_global_load_balance_fallback():
@@ -213,9 +224,9 @@ def test_global_load_balance_fallback():
    num_nodes = 2
    num_gpus = 4

-    phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
-                                                 num_groups, num_nodes,
-                                                 num_gpus)
+    phy2log, log2phy, logcnt = rebalance_experts(
+        weight, num_replicas, num_groups, num_nodes, num_gpus
+    )

    # Should work normally, just using global load balancing strategy
    assert phy2log.shape == (1, 8)
@@ -235,9 +246,9 @@ def test_device_compatibility(device):
    num_nodes = 1
    num_gpus = 2

-    phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
-                                                 num_groups, num_nodes,
-                                                 num_gpus)
+    phy2log, log2phy, logcnt = rebalance_experts(
+        weight, num_replicas, num_groups, num_nodes, num_gpus
+    )

    # Function will convert to CPU internally, but should handle different
    # device inputs normally
@@ -250,7 +261,8 @@ def test_additional_cases():

    # Test case 1: Large-scale distributed setup
    weight1 = torch.tensor(
-        [[50, 100, 75, 120, 90, 60, 80, 110, 40, 70, 95, 85, 65, 55, 45, 35]])
+        [[50, 100, 75, 120, 90, 60, 80, 110, 40, 70, 95, 85, 65, 55, 45, 35]]
+    )
    phy2log1, log2phy1, logcnt1 = rebalance_experts(weight1, 24, 8, 4, 8)

    assert phy2log1.shape == (1, 24)
@@ -258,10 +270,12 @@ def test_additional_cases():
    assert torch.sum(logcnt1) == 24

    # Test case 2: Different weight distributions
-    weight2 = torch.tensor([
-        [200, 150, 100, 50, 25, 12],  # Decreasing weights
-        [12, 25, 50, 100, 150, 200],  # Increasing weights
-    ])
+    weight2 = torch.tensor(
+        [
+            [200, 150, 100, 50, 25, 12],  # Decreasing weights
+            [12, 25, 50, 100, 150, 200],  # Increasing weights
+        ]
+    )
    phy2log2, log2phy2, logcnt2 = rebalance_experts(weight2, 10, 3, 1, 2)

    assert phy2log2.shape == (2, 10)
@@ -274,19 +288,21 @@ def test_additional_cases():


 if __name__ == "__main__":
-    weight = torch.tensor([
-        [90, 132, 40, 61, 104, 165, 39, 4, 73, 56, 183, 86],
-        [20, 107, 104, 64, 19, 197, 187, 157, 172, 86, 16, 27],
-    ])
+    weight = torch.tensor(
+        [
+            [90, 132, 40, 61, 104, 165, 39, 4, 73, 56, 183, 86],
+            [20, 107, 104, 64, 19, 197, 187, 157, 172, 86, 16, 27],
+        ]
+    )

    num_replicas = 16
    num_groups = 4
    num_nodes = 2
    num_gpus = 8

-    phy2log, log2phy, logcnt = rebalance_experts(weight, num_replicas,
-                                                 num_groups, num_nodes,
-                                                 num_gpus)
+    phy2log, log2phy, logcnt = rebalance_experts(
+        weight, num_replicas, num_groups, num_nodes, num_gpus
+    )
    print(phy2log)

    test_basic_rebalance()
--- a/tests/distributed/test_eplb_execute.py
+++ b/tests/distributed/test_eplb_execute.py
@@ -9,11 +9,12 @@ import pytest
 import torch
 import torch.distributed

-from vllm.distributed.eplb.rebalance_execute import (
-    rearrange_expert_weights_inplace)
-from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
-                                             get_tp_group,
-                                             init_distributed_environment)
+from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace
+from vllm.distributed.parallel_state import (
+    ensure_model_parallel_initialized,
+    get_tp_group,
+    init_distributed_environment,
+)
 from vllm.utils import update_environment_variables


@@ -22,13 +23,13 @@ def distributed_run(fn, world_size):
    processes: list[multiprocessing.Process] = []
    for i in range(number_of_processes):
        env: dict[str, str] = {}
-        env['RANK'] = str(i)
-        env['LOCAL_RANK'] = str(i)
-        env['WORLD_SIZE'] = str(number_of_processes)
-        env['LOCAL_WORLD_SIZE'] = str(number_of_processes)
-        env['MASTER_ADDR'] = 'localhost'
-        env['MASTER_PORT'] = '12345'
-        p = multiprocessing.Process(target=fn, args=(env, ))
+        env["RANK"] = str(i)
+        env["LOCAL_RANK"] = str(i)
+        env["WORLD_SIZE"] = str(number_of_processes)
+        env["LOCAL_WORLD_SIZE"] = str(number_of_processes)
+        env["MASTER_ADDR"] = "localhost"
+        env["MASTER_PORT"] = "12345"
+        p = multiprocessing.Process(target=fn, args=(env,))
        processes.append(p)
        p.start()

@@ -45,7 +46,7 @@ def worker_fn_wrapper(fn):
    # and update the environment variables in the function
    def wrapped_fn(env):
        update_environment_variables(env)
-        local_rank = os.environ['LOCAL_RANK']
+        local_rank = os.environ["LOCAL_RANK"]
        device = torch.device(f"cuda:{local_rank}")
        torch.cuda.set_device(device)
        init_distributed_environment()
@@ -60,20 +61,20 @@ def worker_fn_wrapper(fn):


 def create_expert_indices_with_redundancy(
-        num_layers: int,
-        num_logical_experts: int,
-        total_physical_experts: int,
-        redundancy_config: list[int],  # redundancy for each logical expert
+    num_layers: int,
+    num_logical_experts: int,
+    total_physical_experts: int,
+    redundancy_config: list[int],  # redundancy for each logical expert
 ) -> torch.Tensor:
    """
    Create expert indices with redundancy.
-    
+
    Args:
        num_layers: number of layers
        num_logical_experts: number of logical experts
        total_physical_experts: total number of physical experts
        redundancy_config: redundancy for each logical expert
-    
+
    Returns:
        indices: Shape (num_layers, total_physical_experts)
    """
@@ -106,11 +107,11 @@ def create_expert_weights(
 ) -> list[list[torch.Tensor]]:
    """
    Create fake expert weights tensor for testing.
-    
+
    Use `arange` to generate predictable weights values, based on logical
    expert ID.
    All replicas of the same logical expert should have the same weights.
-    
+
    Args:
        physical_to_logical_mapping: Shape (num_layers, num_local_experts)
            mapping[layer, physical_pos] = logical_expert_id
@@ -120,27 +121,27 @@ def create_expert_weights(
    for layer in range(num_layers):
        layer_weights = []
        for weight_idx, hidden_size in enumerate(hidden_sizes):
-            weight_tensor = torch.zeros(num_local_experts,
-                                        hidden_size,
-                                        device=device,
-                                        dtype=torch.float32)
+            weight_tensor = torch.zeros(
+                num_local_experts, hidden_size, device=device, dtype=torch.float32
+            )

            for local_expert in range(num_local_experts):
                # Get the logical expert ID for this physical expert
                global_pos = rank * num_local_experts + local_expert
                logical_expert_id = physical_to_logical_mapping[
-                    layer, global_pos].item()
+                    layer, global_pos
+                ].item()

                # Generate weights based on logical expert ID
                # (so that all replicas of the same logical expert have the
                # same weights)
-                base_value = (logical_expert_id * 1000 + layer * 100 +
-                              weight_idx * 10)
-                weight_tensor[local_expert] = torch.arange(base_value,
-                                                           base_value +
-                                                           hidden_size,
-                                                           device=device,
-                                                           dtype=torch.float32)
+                base_value = logical_expert_id * 1000 + layer * 100 + weight_idx * 10
+                weight_tensor[local_expert] = torch.arange(
+                    base_value,
+                    base_value + hidden_size,
+                    device=device,
+                    dtype=torch.float32,
+                )

            layer_weights.append(weight_tensor)
        expert_weights.append(layer_weights)
@@ -182,12 +183,15 @@ def verify_expert_weights_after_shuffle(

                # Check if the weights are correct
                actual_weights = weight_tensor[local_expert]
-                expected_base = (expected_logical_expert * 1000 + layer * 100 +
-                                 weight_idx * 10)
-                expected_weights = torch.arange(expected_base,
-                                                expected_base + hidden_size,
-                                                device=actual_weights.device,
-                                                dtype=actual_weights.dtype)
+                expected_base = (
+                    expected_logical_expert * 1000 + layer * 100 + weight_idx * 10
+                )
+                expected_weights = torch.arange(
+                    expected_base,
+                    expected_base + hidden_size,
+                    device=actual_weights.device,
+                    dtype=actual_weights.dtype,
+                )

                torch.testing.assert_close(
                    actual_weights,
@@ -195,7 +199,8 @@ def verify_expert_weights_after_shuffle(
                    msg=f"Layer {layer}, weight {weight_idx},"
                    f"local expert {local_expert}: "
                    f"weights do not match. "
-                    f"Expected logical expert {expected_logical_expert}")
+                    f"Expected logical expert {expected_logical_expert}",
+                )


 def verify_redundant_experts_have_same_weights(
@@ -222,23 +227,23 @@ def verify_redundant_experts_have_same_weights(
                total_physical_experts,
                hidden_size,
                device=expert_weights[layer][weight_idx].device,
-                dtype=expert_weights[layer][weight_idx].dtype)
+                dtype=expert_weights[layer][weight_idx].dtype,
+            )

            # Use all_gather to collect expert weights from current node
            # expert_weights[layer][weight_idx] shape:
            # [num_local_experts, hidden_size]
            local_weights = expert_weights[layer][
-                weight_idx]  # [num_local_experts, hidden_size]
+                weight_idx
+            ]  # [num_local_experts, hidden_size]

            # Split tensor along dim 0 into a list for all_gather
-            gathered_weights_list = torch.chunk(gathered_weights,
-                                                world_size,
-                                                dim=0)
+            gathered_weights_list = torch.chunk(gathered_weights, world_size, dim=0)

            torch.distributed.all_gather(
                # Output list: each element corresponds to one rank's weights
                list(gathered_weights_list),
-                local_weights  # Input: current rank's local weights
+                local_weights,  # Input: current rank's local weights
            )

            all_weights.append(gathered_weights)
@@ -266,7 +271,8 @@ def verify_redundant_experts_have_same_weights(
                        msg=f"Layer {layer}, weight {weight_idx},"
                        f"logical expert {logical_expert_id}: "
                        f"Physical expert {physical_pos} has different weights"
-                        f"than expected")
+                        f"than expected",
+                    )


@pytest.mark.parametrize(
@@ -290,10 +296,11 @@ def verify_redundant_experts_have_same_weights(
        # 4 GPU, 8 experts per GPU
        # 16 logical experts, 32 physical experts, 16 redundant experts
        (4, 8, 8, 16),
-    ])
-def test_rearrange_expert_weights_with_redundancy(world_size, num_layers,
-                                                  num_local_experts,
-                                                  num_logical_experts):
+    ],
+)
+def test_rearrange_expert_weights_with_redundancy(
+    world_size, num_layers, num_local_experts, num_logical_experts
+):
    """Test the functionality of rearranging expert weights with redundancy."""

    if torch.cuda.device_count() < world_size:
@@ -304,8 +311,8 @@ def test_rearrange_expert_weights_with_redundancy(world_size, num_layers,
        # Initialize model parallel (using tensor parallel as an entrypoint
        # to expert parallel)
        ensure_model_parallel_initialized(
-            tensor_model_parallel_size=world_size,
-            pipeline_model_parallel_size=1)
+            tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
+        )

        ep_group = get_tp_group().cpu_group
        ep_rank = torch.distributed.get_rank()
@@ -316,8 +323,9 @@ def test_rearrange_expert_weights_with_redundancy(world_size, num_layers,
        hidden_sizes = [32, 64]  # Two different weight matrices

        # Create old expert indices (with redundancy)
-        redundancy_config = create_redundancy_config(num_logical_experts,
-                                                     total_physical_experts)
+        redundancy_config = create_redundancy_config(
+            num_logical_experts, total_physical_experts
+        )

        old_indices = create_expert_indices_with_redundancy(
            num_layers,
@@ -328,7 +336,8 @@ def test_rearrange_expert_weights_with_redundancy(world_size, num_layers,

        # Create new expert indices (with redundancy)
        new_redundancy_config = create_redundancy_config(
-            num_logical_experts, total_physical_experts)
+            num_logical_experts, total_physical_experts
+        )
        new_indices = create_expert_indices_with_redundancy(
            num_layers,
            num_logical_experts,
@@ -337,9 +346,9 @@ def test_rearrange_expert_weights_with_redundancy(world_size, num_layers,
        )

        # Create expert weights
-        expert_weights = create_expert_weights(num_layers, num_local_experts,
-                                               hidden_sizes, ep_rank, device,
-                                               old_indices)
+        expert_weights = create_expert_weights(
+            num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices
+        )

        # Execute weight rearrangement
        rearrange_expert_weights_inplace(
@@ -383,8 +392,8 @@ def test_rearrange_expert_weights_no_change(world_size):
    @worker_fn_wrapper
    def worker_fn():
        ensure_model_parallel_initialized(
-            tensor_model_parallel_size=world_size,
-            pipeline_model_parallel_size=1)
+            tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
+        )

        ep_group = get_tp_group().cpu_group
        ep_rank = torch.distributed.get_rank()
@@ -401,12 +410,12 @@ def test_rearrange_expert_weights_no_change(world_size):

        # Same indices - no change
        indices = create_expert_indices_with_redundancy(
-            num_layers, num_logical_experts, total_physical_experts,
-            redundancy_config)
+            num_layers, num_logical_experts, total_physical_experts, redundancy_config
+        )

-        expert_weights = create_expert_weights(num_layers, num_local_experts,
-                                               hidden_sizes, ep_rank, device,
-                                               indices)
+        expert_weights = create_expert_weights(
+            num_layers, num_local_experts, hidden_sizes, ep_rank, device, indices
+        )

        # Save original weights
        original_weights = []
@@ -422,7 +431,8 @@ def test_rearrange_expert_weights_no_change(world_size):
            indices,  # Same indices
            expert_weights,
            ep_group,
-            is_profile=False)
+            is_profile=False,
+        )

        # Verify that the weights have not changed
        for layer in range(num_layers):
@@ -430,8 +440,8 @@ def test_rearrange_expert_weights_no_change(world_size):
                torch.testing.assert_close(
                    expert_weights[layer][weight_idx],
                    original_weights[layer][weight_idx],
-                    msg=f"Layer {layer}, weight {weight_idx} should remain "
-                    f"unchanged")
+                    msg=f"Layer {layer}, weight {weight_idx} should remain unchanged",
+                )

    distributed_run(worker_fn, world_size)

@@ -446,8 +456,8 @@ def test_rearrange_expert_weights_profile_mode(world_size):
    @worker_fn_wrapper
    def worker_fn():
        ensure_model_parallel_initialized(
-            tensor_model_parallel_size=world_size,
-            pipeline_model_parallel_size=1)
+            tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
+        )

        ep_group = get_tp_group().cpu_group
        ep_rank = torch.distributed.get_rank()
@@ -460,21 +470,23 @@ def test_rearrange_expert_weights_profile_mode(world_size):
        hidden_sizes = [32]

        # Create different index distributions
-        old_redundancy = create_redundancy_config(num_logical_experts,
-                                                  total_physical_experts)
-        new_redundancy = create_redundancy_config(num_logical_experts,
-                                                  total_physical_experts)
+        old_redundancy = create_redundancy_config(
+            num_logical_experts, total_physical_experts
+        )
+        new_redundancy = create_redundancy_config(
+            num_logical_experts, total_physical_experts
+        )

        old_indices = create_expert_indices_with_redundancy(
-            num_layers, num_logical_experts, total_physical_experts,
-            old_redundancy)
+            num_layers, num_logical_experts, total_physical_experts, old_redundancy
+        )
        new_indices = create_expert_indices_with_redundancy(
-            num_layers, num_logical_experts, total_physical_experts,
-            new_redundancy)
+            num_layers, num_logical_experts, total_physical_experts, new_redundancy
+        )

-        expert_weights = create_expert_weights(num_layers, num_local_experts,
-                                               hidden_sizes, ep_rank, device,
-                                               old_indices)
+        expert_weights = create_expert_weights(
+            num_layers, num_local_experts, hidden_sizes, ep_rank, device, old_indices
+        )

        # Save original weights
        original_weights = []
@@ -490,7 +502,7 @@ def test_rearrange_expert_weights_profile_mode(world_size):
            new_indices,
            expert_weights,
            ep_group,
-            is_profile=True  # Profile mode
+            is_profile=True,  # Profile mode
        )

        # In profile mode, the weights should remain unchanged
@@ -499,6 +511,7 @@ def test_rearrange_expert_weights_profile_mode(world_size):
                torch.testing.assert_close(
                    expert_weights[layer][weight_idx],
                    original_weights[layer][weight_idx],
-                    msg="In profile mode, the weights should remain unchanged")
+                    msg="In profile mode, the weights should remain unchanged",
+                )

    distributed_run(worker_fn, world_size)
--- a/tests/distributed/test_events.py
+++ b/tests/distributed/test_events.py
@@ -6,24 +6,29 @@ import time
 import msgspec
 import pytest

-from vllm.distributed.kv_events import (EventBatch, EventPublisherFactory,
-                                        NullEventPublisher)
+from vllm.distributed.kv_events import (
+    EventBatch,
+    EventPublisherFactory,
+    NullEventPublisher,
+)

 DP_RANK = 0


 class EventSample(
-        msgspec.Struct,
-        tag=True,  # type: ignore
-        array_like=True  # type: ignore
+    msgspec.Struct,
+    tag=True,  # type: ignore
+    array_like=True,  # type: ignore
 ):
    """Test event for publisher testing"""
+
    id: int
    value: str


 class SampleBatch(EventBatch):
    """Test event batch for publisher testing"""
+
    events: list[EventSample]


@@ -44,10 +49,8 @@ def test_basic_publishing(publisher, subscriber):

    seq, received = result
    assert seq == 0, "Sequence number mismatch"
-    assert received.ts == pytest.approx(test_batch.ts,
-                                        abs=0.1), ("Timestamp mismatch")
-    assert len(received.events) == len(
-        test_batch.events), ("Number of events mismatch")
+    assert received.ts == pytest.approx(test_batch.ts, abs=0.1), "Timestamp mismatch"
+    assert len(received.events) == len(test_batch.events), "Number of events mismatch"

    for i, event in enumerate(received.events):
        assert event.id == i, "Event id mismatch"
@@ -88,9 +91,9 @@ def test_replay_mechanism(publisher, subscriber):
    assert len(replayed) > 0, "No replayed messages received"
    seqs = [seq for seq, _ in replayed]
    assert all(seq >= 10 for seq in seqs), "Replayed messages not in order"
-    assert seqs == list(range(min(seqs),
-                              max(seqs) +
-                              1)), ("Replayed messages not consecutive")
+    assert seqs == list(range(min(seqs), max(seqs) + 1)), (
+        "Replayed messages not consecutive"
+    )


 def test_buffer_limit(publisher, subscriber, publisher_config):
@@ -126,6 +129,7 @@ def test_topic_filtering(publisher_config):
    pub = EventPublisherFactory.create(publisher_config, DP_RANK)

    from .conftest import MockSubscriber
+
    sub_foo = MockSubscriber(publisher_config.endpoint, None, "foo")
    sub_bar = MockSubscriber(publisher_config.endpoint, None, "bar")

@@ -137,11 +141,13 @@ def test_topic_filtering(publisher_config):

        foo_received = [sub_foo.receive_one(timeout=200) for _ in range(3)]
        assert all(msg is not None for msg in foo_received), (
-            "Subscriber with matching topic should receive messages")
+            "Subscriber with matching topic should receive messages"
+        )

        bar_received = [sub_bar.receive_one(timeout=200) for _ in range(3)]
        assert all(msg is None for msg in bar_received), (
-            "Subscriber with non-matching topic should receive no messages")
+            "Subscriber with non-matching topic should receive no messages"
+        )
    finally:
        pub.shutdown()
        sub_foo.close()
@@ -178,8 +184,7 @@ def test_high_volume(publisher, subscriber):

    publisher_thread.join()

-    assert len(received) >= num_batches * 0.9, (
-        "We should have received most messages")
+    assert len(received) >= num_batches * 0.9, "We should have received most messages"

    seqs = [seq for seq, _ in received]
    assert sorted(seqs) == seqs, "Sequence numbers should be in order"
@@ -209,13 +214,15 @@ def test_data_parallel_rank_tagging(publisher_config):
        # For TCP endpoints: tcp://localhost:5557 -> tcp://localhost:5557, tcp://localhost:5558
        expected_endpoint_0 = base_endpoint  # rank 0 gets port + 0 = same port
        expected_endpoint_1 = base_endpoint.replace(
-            ":5557", ":5558")  # rank 1 gets port + 1
+            ":5557", ":5558"
+        )  # rank 1 gets port + 1
    else:
        # For inproc endpoints: inproc://test -> inproc://test_dp0, inproc://test_dp1
        expected_endpoint_0 = base_endpoint  # rank 0 gets base
        expected_endpoint_1 = base_endpoint + "_dp1"  # rank 1 gets _dp1

    from .conftest import MockSubscriber
+
    sub_0 = MockSubscriber(expected_endpoint_0, None, publisher_config.topic)
    sub_1 = MockSubscriber(expected_endpoint_1, None, publisher_config.topic)

@@ -241,15 +248,15 @@ def test_data_parallel_rank_tagging(publisher_config):

        # Verify DP rank tagging
        assert received_0.data_parallel_rank == 0, (
-            f"Expected DP rank 0, got {received_0.data_parallel_rank}")
+            f"Expected DP rank 0, got {received_0.data_parallel_rank}"
+        )
        assert received_1.data_parallel_rank == 1, (
-            f"Expected DP rank 1, got {received_1.data_parallel_rank}")
+            f"Expected DP rank 1, got {received_1.data_parallel_rank}"
+        )

        # Verify event content is correct
-        assert len(
-            received_0.events) == 2, "Wrong number of events from rank 0"
-        assert len(
-            received_1.events) == 3, "Wrong number of events from rank 1"
+        assert len(received_0.events) == 2, "Wrong number of events from rank 0"
+        assert len(received_1.events) == 3, "Wrong number of events from rank 1"

    finally:
        pub_0.shutdown()
--- a/tests/distributed/test_expert_parallel.py
+++ b/tests/distributed/test_expert_parallel.py
@@ -46,28 +46,24 @@ class EPTestSettings:
    ):
        return EPTestSettings(
            parallel_setups=[
-                ParallelSetup(tp_size=tp_base,
-                              eager_mode=False,
-                              chunked_prefill=False),
-                ParallelSetup(tp_size=tp_base,
-                              eager_mode=False,
-                              chunked_prefill=True),
-                ParallelSetup(tp_size=tp_base,
-                              eager_mode=True,
-                              chunked_prefill=False),
-                ParallelSetup(tp_size=2 * tp_base,
-                              eager_mode=False,
-                              chunked_prefill=True),
-                ParallelSetup(tp_size=2 * tp_base,
-                              eager_mode=True,
-                              chunked_prefill=False),
+                ParallelSetup(tp_size=tp_base, eager_mode=False, chunked_prefill=False),
+                ParallelSetup(tp_size=tp_base, eager_mode=False, chunked_prefill=True),
+                ParallelSetup(tp_size=tp_base, eager_mode=True, chunked_prefill=False),
+                ParallelSetup(
+                    tp_size=2 * tp_base, eager_mode=False, chunked_prefill=True
+                ),
+                ParallelSetup(
+                    tp_size=2 * tp_base, eager_mode=True, chunked_prefill=False
+                ),
            ],
            distributed_backends=["mp", "ray"],
            runner=runner,
-            test_options=EPTestOptions(trust_remote_code=trust_remote_code,
-                                       tokenizer_mode=tokenizer_mode,
-                                       load_format=load_format,
-                                       hf_overrides=hf_overrides),
+            test_options=EPTestOptions(
+                trust_remote_code=trust_remote_code,
+                tokenizer_mode=tokenizer_mode,
+                load_format=load_format,
+                hf_overrides=hf_overrides,
+            ),
        )

    @staticmethod
@@ -82,16 +78,16 @@ class EPTestSettings:
    ):
        return EPTestSettings(
            parallel_setups=[
-                ParallelSetup(tp_size=tp_base,
-                              eager_mode=True,
-                              chunked_prefill=False),
+                ParallelSetup(tp_size=tp_base, eager_mode=True, chunked_prefill=False),
            ],
            distributed_backends=["mp"],
            runner=runner,
-            test_options=EPTestOptions(trust_remote_code=trust_remote_code,
-                                       tokenizer_mode=tokenizer_mode,
-                                       load_format=load_format,
-                                       hf_overrides=hf_overrides),
+            test_options=EPTestOptions(
+                trust_remote_code=trust_remote_code,
+                tokenizer_mode=tokenizer_mode,
+                load_format=load_format,
+                hf_overrides=hf_overrides,
+            ),
        )

    def iter_params(self, model_name: str):
@@ -99,8 +95,13 @@ class EPTestSettings:

        for parallel_setup in self.parallel_setups:
            for distributed_backend in self.distributed_backends:
-                yield (model_name, parallel_setup, distributed_backend,
-                       self.runner, opts)
+                yield (
+                    model_name,
+                    parallel_setup,
+                    distributed_backend,
+                    self.runner,
+                    opts,
+                )


 # NOTE: You can adjust tp_base locally to fit the model in GPU
--- a/tests/distributed/test_expert_placement.py
+++ b/tests/distributed/test_expert_placement.py
@@ -6,8 +6,7 @@ import pytest
 from vllm.model_executor.layers.fused_moe.layer import determine_expert_map


-def verify_round_robin_pattern(expert_map, ep_rank, ep_size,
-                               global_num_experts):
+def verify_round_robin_pattern(expert_map, ep_rank, ep_size, global_num_experts):
    """Verify that the expert map follows the round_robin pattern."""
    # Calculate expected local experts (supporting non-divisible cases)
    base_experts = global_num_experts // ep_size
@@ -30,24 +29,21 @@ def verify_round_robin_pattern(expert_map, ep_rank, ep_size,
        if global_expert_id in expected_expert_ids:
            local_expert_id = expert_map[global_expert_id]
            expected_local_id = expected_expert_ids.index(global_expert_id)
-            assert (
-                local_expert_id == expected_local_id
-            ), f"Global expert {global_expert_id} should map to local expert " \
+            assert local_expert_id == expected_local_id, (
+                f"Global expert {global_expert_id} should map to local expert "
                f"{expected_local_id}, got {local_expert_id}"
+            )
        else:
-            assert (
-                expert_map[global_expert_id] == -1
-            ), f"Global expert {global_expert_id} should not be mapped to " \
-                f"this rank"
+            assert expert_map[global_expert_id] == -1, (
+                f"Global expert {global_expert_id} should not be mapped to this rank"
+            )

    # Verify that all local expert IDs are consecutive starting from 0
-    local_expert_ids = [
-        expert_map[global_id] for global_id in expected_expert_ids
-    ]
+    local_expert_ids = [expert_map[global_id] for global_id in expected_expert_ids]
    expected_local_ids = list(range(local_num_experts))
-    assert (
-        local_expert_ids == expected_local_ids
-    ), f"Expected local expert IDs {expected_local_ids}, got {local_expert_ids}"
+    assert local_expert_ids == expected_local_ids, (
+        f"Expected local expert IDs {expected_local_ids}, got {local_expert_ids}"
+    )


@pytest.mark.parametrize("expert_placement_strategy", ["round_robin"])
@@ -78,8 +74,9 @@ def test_expert_placement_various_sizes(expert_placement_strategy, world_size):

    for test_global_experts, test_ep_size in test_cases:
        # Ensure ep_size matches world_size
-        assert (test_ep_size == world_size
-                ), f"ep_size {test_ep_size} must equal world_size {world_size}"
+        assert test_ep_size == world_size, (
+            f"ep_size {test_ep_size} must equal world_size {world_size}"
+        )

        # Test each rank
        for ep_rank in range(world_size):
@@ -98,21 +95,22 @@ def test_expert_placement_various_sizes(expert_placement_strategy, world_size):
                expert_placement_strategy=expert_placement_strategy,
            )

-            assert (
-                test_local_experts == expected_test_local
-            ), f"For {test_global_experts} experts on {test_ep_size} ranks, " \
-                f"rank {ep_rank}: expected {expected_test_local} local" \
+            assert test_local_experts == expected_test_local, (
+                f"For {test_global_experts} experts on {test_ep_size} ranks, "
+                f"rank {ep_rank}: expected {expected_test_local} local"
                f"experts, got {test_local_experts}"
+            )

            if test_expert_map is not None:
-                assert test_expert_map.shape == (
-                    test_global_experts,
-                ), f"Expected expert map shape ({test_global_experts},), " \
+                assert test_expert_map.shape == (test_global_experts,), (
+                    f"Expected expert map shape ({test_global_experts},), "
                    f"got {test_expert_map.shape}"
+                )

                # Verify round_robin pattern for this test case
-                verify_round_robin_pattern(test_expert_map, ep_rank,
-                                           test_ep_size, test_global_experts)
+                verify_round_robin_pattern(
+                    test_expert_map, ep_rank, test_ep_size, test_global_experts
+                )


@pytest.mark.parametrize("expert_placement_strategy", ["round_robin"])
@@ -147,28 +145,81 @@ def test_determine_expert_map_comprehensive():
    # expert_placement_strategy, expected_local, expected_map_pattern)
    test_cases = [
        # Round robin placement tests
-        (2, 0, 8, "round_robin", 4, [0, -1, 1, -1, 2, -1, 3,
-                                     -1]),  # rank 0 gets even experts
-        (2, 1, 8, "round_robin", 4, [-1, 0, -1, 1, -1, 2, -1,
-                                     3]),  # rank 1 gets odd experts
-        (2, 0, 9, "round_robin", 5, [0, -1, 1, -1, 2, -1, 3, -1, 4
-                                     ]),  # rank 0 gets 5 experts (even + last)
-        (2, 1, 9, "round_robin", 4, [-1, 0, -1, 1, -1, 2, -1, 3,
-                                     -1]),  # rank 1 gets 4 experts (odd)
-
+        (
+            2,
+            0,
+            8,
+            "round_robin",
+            4,
+            [0, -1, 1, -1, 2, -1, 3, -1],
+        ),  # rank 0 gets even experts
+        (
+            2,
+            1,
+            8,
+            "round_robin",
+            4,
+            [-1, 0, -1, 1, -1, 2, -1, 3],
+        ),  # rank 1 gets odd experts
+        (
+            2,
+            0,
+            9,
+            "round_robin",
+            5,
+            [0, -1, 1, -1, 2, -1, 3, -1, 4],
+        ),  # rank 0 gets 5 experts (even + last)
+        (
+            2,
+            1,
+            9,
+            "round_robin",
+            4,
+            [-1, 0, -1, 1, -1, 2, -1, 3, -1],
+        ),  # rank 1 gets 4 experts (odd)
        # 4-rank tests
-        (4, 0, 8, "round_robin", 2, [0, -1, -1, -1, 1, -1, -1,
-                                     -1]),  # rank 0 gets experts 0, 4
-        (4, 1, 8, "round_robin", 2, [-1, 0, -1, -1, -1, 1, -1,
-                                     -1]),  # rank 1 gets experts 1, 5
-        (4, 2, 8, "round_robin", 2, [-1, -1, 0, -1, -1, -1, 1,
-                                     -1]),  # rank 2 gets experts 2, 6
-        (4, 3, 8, "round_robin", 2, [-1, -1, -1, 0, -1, -1, -1,
-                                     1]),  # rank 3 gets experts 3, 7
+        (
+            4,
+            0,
+            8,
+            "round_robin",
+            2,
+            [0, -1, -1, -1, 1, -1, -1, -1],
+        ),  # rank 0 gets experts 0, 4
+        (
+            4,
+            1,
+            8,
+            "round_robin",
+            2,
+            [-1, 0, -1, -1, -1, 1, -1, -1],
+        ),  # rank 1 gets experts 1, 5
+        (
+            4,
+            2,
+            8,
+            "round_robin",
+            2,
+            [-1, -1, 0, -1, -1, -1, 1, -1],
+        ),  # rank 2 gets experts 2, 6
+        (
+            4,
+            3,
+            8,
+            "round_robin",
+            2,
+            [-1, -1, -1, 0, -1, -1, -1, 1],
+        ),  # rank 3 gets experts 3, 7
    ]

-    for ep_size, ep_rank, global_num_experts, expert_placement_strategy, \
-        expected_local, expected_map_pattern in test_cases:
+    for (
+        ep_size,
+        ep_rank,
+        global_num_experts,
+        expert_placement_strategy,
+        expected_local,
+        expected_map_pattern,
+    ) in test_cases:
        local_num_experts, expert_map = determine_expert_map(
            ep_size=ep_size,
            ep_rank=ep_rank,
@@ -176,19 +227,21 @@ def test_determine_expert_map_comprehensive():
            expert_placement_strategy=expert_placement_strategy,
        )

-        assert local_num_experts == expected_local, \
-            f"ep_size={ep_size}, ep_rank={ep_rank}, " \
-            f"global_num_experts={global_num_experts}, " \
-            f"expert_placement_strategy={expert_placement_strategy}: " \
+        assert local_num_experts == expected_local, (
+            f"ep_size={ep_size}, ep_rank={ep_rank}, "
+            f"global_num_experts={global_num_experts}, "
+            f"expert_placement_strategy={expert_placement_strategy}: "
            f"expected {expected_local} local experts, got {local_num_experts}"
+        )

        if expected_map_pattern is None:
            assert expert_map is None, "Expected expert_map to be None"
        else:
            assert expert_map is not None, "Expected expert_map to not be None"
            actual_map = expert_map.tolist()
-            assert actual_map == expected_map_pattern, \
-                f"ep_size={ep_size}, ep_rank={ep_rank}, " \
-                f"global_num_experts={global_num_experts}, " \
-                f"expert_placement_strategy={expert_placement_strategy}: " \
+            assert actual_map == expected_map_pattern, (
+                f"ep_size={ep_size}, ep_rank={ep_rank}, "
+                f"global_num_experts={global_num_experts}, "
+                f"expert_placement_strategy={expert_placement_strategy}: "
                f"expected map {expected_map_pattern}, got {actual_map}"
+            )
--- a/tests/distributed/test_kvlayout.py
+++ b/tests/distributed/test_kvlayout.py
@@ -1,10 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-from vllm.config import (DeviceConfig, KVTransferConfig, ModelConfig,
-                         VllmConfig, set_current_vllm_config)
+from vllm.config import (
+    DeviceConfig,
+    KVTransferConfig,
+    ModelConfig,
+    VllmConfig,
+    set_current_vllm_config,
+)
 from vllm.distributed.kv_transfer.kv_connector.utils import (
-    get_kv_connector_cache_layout)
+    get_kv_connector_cache_layout,
+)
 from vllm.logger import init_logger

 logger = init_logger("test_expert_parallel")
@@ -23,8 +29,9 @@ def test_get_kv_connector_cache_layout_with_lmcache_connector():
        kv_connector="LMCacheConnectorV1",
        kv_role="kv_both",
    )
-    vllm_config = VllmConfig(device_config=DeviceConfig("cpu"),
-                             kv_transfer_config=kv_transfer_config)
+    vllm_config = VllmConfig(
+        device_config=DeviceConfig("cpu"), kv_transfer_config=kv_transfer_config
+    )
    with set_current_vllm_config(vllm_config):
        # Test with default settings
        layout = get_kv_connector_cache_layout()
@@ -37,9 +44,11 @@ def test_get_kv_connector_cache_layout_with_nixl_connector():
        kv_role="kv_both",
    )
    model_config = ModelConfig()
-    vllm_config = VllmConfig(device_config=DeviceConfig("cpu"),
-                             model_config=model_config,
-                             kv_transfer_config=kv_transfer_config)
+    vllm_config = VllmConfig(
+        device_config=DeviceConfig("cpu"),
+        model_config=model_config,
+        kv_transfer_config=kv_transfer_config,
+    )
    with set_current_vllm_config(vllm_config):
        # Test with default settings
        layout = get_kv_connector_cache_layout()
@@ -47,25 +56,22 @@ def test_get_kv_connector_cache_layout_with_nixl_connector():


 def test_get_kv_connector_cache_layout_with_multi_connector():
-    kv_transfer_config = KVTransferConfig(kv_connector="MultiConnector",
-                                          kv_role="kv_both",
-                                          kv_connector_extra_config={
-                                              "connectors": [{
-                                                  "kv_connector":
-                                                  "SharedStorageConnector",
-                                                  "kv_role":
-                                                  "kv_both"
-                                              }, {
-                                                  "kv_connector":
-                                                  "NixlConnector",
-                                                  "kv_role":
-                                                  "kv_both"
-                                              }]
-                                          })
+    kv_transfer_config = KVTransferConfig(
+        kv_connector="MultiConnector",
+        kv_role="kv_both",
+        kv_connector_extra_config={
+            "connectors": [
+                {"kv_connector": "SharedStorageConnector", "kv_role": "kv_both"},
+                {"kv_connector": "NixlConnector", "kv_role": "kv_both"},
+            ]
+        },
+    )
    model_config = ModelConfig()
-    vllm_config = VllmConfig(device_config=DeviceConfig("cpu"),
-                             model_config=model_config,
-                             kv_transfer_config=kv_transfer_config)
+    vllm_config = VllmConfig(
+        device_config=DeviceConfig("cpu"),
+        model_config=model_config,
+        kv_transfer_config=kv_transfer_config,
+    )
    with set_current_vllm_config(vllm_config):
        # Test with default settings
        layout = get_kv_connector_cache_layout()
--- a/tests/distributed/test_multi_node_assignment.py
+++ b/tests/distributed/test_multi_node_assignment.py
@@ -24,14 +24,13 @@ from vllm.utils import get_ip
 VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"


-@pytest.mark.skipif(not VLLM_MULTI_NODE,
-                    reason="Need at least 2 nodes to run the test.")
+@pytest.mark.skipif(
+    not VLLM_MULTI_NODE, reason="Need at least 2 nodes to run the test."
+)
 def test_multi_node_assignment() -> None:
-
    # NOTE: important to keep this class definition here
    # to let ray use cloudpickle to serialize it.
    class Actor:
-
        def get_ip(self):
            return get_ip()

@@ -41,8 +40,7 @@ def test_multi_node_assignment() -> None:

        current_ip = get_ip()
        workers = []
-        for bundle_id, bundle in enumerate(
-                config.placement_group.bundle_specs):
+        for bundle_id, bundle in enumerate(config.placement_group.bundle_specs):
            if not bundle.get("GPU", 0):
                continue
            scheduling_strategy = PlacementGroupSchedulingStrategy(
--- a/tests/distributed/test_nccl_symm_mem_allreduce.py
+++ b/tests/distributed/test_nccl_symm_mem_allreduce.py
@@ -11,15 +11,17 @@ import torch.multiprocessing as mp

 import vllm.envs as envs
 from vllm.distributed import cleanup_dist_env_and_memory
-from vllm.distributed.device_communicators.cuda_communicator import (
-    CudaCommunicator)
-from vllm.distributed.device_communicators.pynccl import (
-    register_nccl_symmetric_ops)
+from vllm.distributed.device_communicators.cuda_communicator import CudaCommunicator
+from vllm.distributed.device_communicators.pynccl import register_nccl_symmetric_ops
 from vllm.distributed.device_communicators.pynccl_allocator import (
-    get_nccl_mem_pool, is_symmetric_memory_enabled)
-from vllm.distributed.parallel_state import (get_tp_group,
-                                             init_distributed_environment,
-                                             initialize_model_parallel)
+    get_nccl_mem_pool,
+    is_symmetric_memory_enabled,
+)
+from vllm.distributed.parallel_state import (
+    get_tp_group,
+    init_distributed_environment,
+    initialize_model_parallel,
+)
 from vllm.platforms import current_platform
 from vllm.utils import update_environment_variables

@@ -38,31 +40,32 @@ def nccl_symm_mem_allreduce_worker(local_rank: int, world_size: int):
        torch.cuda.set_device(device)
        torch.set_default_device(device)
        torch.set_default_dtype(dtype)
-        update_environment_variables({
-            "RANK": str(local_rank),
-            "LOCAL_RANK": str(local_rank),
-            "WORLD_SIZE": str(world_size),
-            "MASTER_ADDR": "localhost",
-            "MASTER_PORT": "12345",
-        })
+        update_environment_variables(
+            {
+                "RANK": str(local_rank),
+                "LOCAL_RANK": str(local_rank),
+                "WORLD_SIZE": str(world_size),
+                "MASTER_ADDR": "localhost",
+                "MASTER_PORT": "12345",
+            }
+        )

        init_distributed_environment()
        initialize_model_parallel(tensor_model_parallel_size=world_size)

-        cuda_communicator = typing.cast(CudaCommunicator,
-                                        get_tp_group().device_communicator)
+        cuda_communicator = typing.cast(
+            CudaCommunicator, get_tp_group().device_communicator
+        )
        pynccl_comm = cuda_communicator.pynccl_comm
        if get_nccl_mem_pool() is None:
-            pytest.skip("NCCL allocator compilation failed "
-                        "(probably missing NCCL headers).")
+            pytest.skip(
+                "NCCL allocator compilation failed (probably missing NCCL headers)."
+            )
        if not is_symmetric_memory_enabled():
            pytest.skip("NCCL symmetric memory allreduce is disabled.")

        register_nccl_symmetric_ops(pynccl_comm)
-        input = torch.randint(1,
-                              23, (test_size_elements, ),
-                              dtype=dtype,
-                              device=device)
+        input = torch.randint(1, 23, (test_size_elements,), dtype=dtype, device=device)
        input_clone = input.clone()
        output = torch.ops.vllm.all_reduce_symmetric_with_copy(input)
        assert output is not None
@@ -77,8 +80,7 @@ def nccl_symm_mem_allreduce_worker(local_rank: int, world_size: int):
    reason="NCCLSymmMemAllreduce is only available for CUDA platforms.",
 )
@pytest.mark.parametrize("world_size", [2])
-@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"],
-                    reason="Only test on CUDA")
+@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
 def test_nccl_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch, world_size):
    if world_size > torch.cuda.device_count():
        pytest.skip("Not enough GPUs to run the test.")
@@ -88,7 +90,5 @@ def test_nccl_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch, world_size):
    monkeypatch.setenv("NCCL_NVLS_ENABLE", "1")
    monkeypatch.setenv("NCCL_CUMEM_ENABLE", "1")

-    mp.spawn(nccl_symm_mem_allreduce_worker,
-             args=(world_size, ),
-             nprocs=world_size)
+    mp.spawn(nccl_symm_mem_allreduce_worker, args=(world_size,), nprocs=world_size)
    cleanup_dist_env_and_memory()
--- a/tests/distributed/test_node_count.py
+++ b/tests/distributed/test_node_count.py
@@ -32,12 +32,15 @@ if __name__ == "__main__":
        # Expected node count based on environment variable)
        expected = int(os.environ.get("NUM_NODES", "1"))

-        assert test_result == expected, \
-            f"Expected {expected} nodes, got {test_result}"
+        assert test_result == expected, f"Expected {expected} nodes, got {test_result}"

        if pg == dist.group.WORLD:
-            print(f"Node count test passed! Got {test_result} nodes "
-                  f"when using torch distributed!")
+            print(
+                f"Node count test passed! Got {test_result} nodes "
+                f"when using torch distributed!"
+            )
        else:
-            print(f"Node count test passed! Got {test_result} nodes "
-                  f"when using StatelessProcessGroup!")
+            print(
+                f"Node count test passed! Got {test_result} nodes "
+                f"when using StatelessProcessGroup!"
+            )
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -7,6 +7,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
 all workers in a node other than the head node, which can cause the test
 to fail.
 """
+
 import json
 import os
 from dataclasses import dataclass
@@ -55,26 +56,17 @@ class PPTestSettings:
    ):
        return PPTestSettings(
            parallel_setups=[
-                ParallelSetup(tp_size=tp_base,
-                              pp_size=pp_base,
-                              eager_mode=False),
-                ParallelSetup(tp_size=tp_base,
-                              pp_size=2 * pp_base,
-                              eager_mode=False),
-                ParallelSetup(tp_size=tp_base,
-                              pp_size=2 * pp_base,
-                              eager_mode=True),
-                ParallelSetup(tp_size=2 * tp_base,
-                              pp_size=pp_base,
-                              eager_mode=False),
-                ParallelSetup(tp_size=2 * tp_base,
-                              pp_size=pp_base,
-                              eager_mode=True),
+                ParallelSetup(tp_size=tp_base, pp_size=pp_base, eager_mode=False),
+                ParallelSetup(tp_size=tp_base, pp_size=2 * pp_base, eager_mode=False),
+                ParallelSetup(tp_size=tp_base, pp_size=2 * pp_base, eager_mode=True),
+                ParallelSetup(tp_size=2 * tp_base, pp_size=pp_base, eager_mode=False),
+                ParallelSetup(tp_size=2 * tp_base, pp_size=pp_base, eager_mode=True),
            ],
            distributed_backends=["mp", "ray"],
            runner=runner,
-            test_options=PPTestOptions(multi_node_only=multi_node_only,
-                                       load_format=load_format),
+            test_options=PPTestOptions(
+                multi_node_only=multi_node_only, load_format=load_format
+            ),
        )

    @staticmethod
@@ -86,17 +78,15 @@ class PPTestSettings:
        multi_node_only: bool = False,
        load_format: Optional[str] = None,
    ):
-
        return PPTestSettings(
            parallel_setups=[
-                ParallelSetup(tp_size=tp_base,
-                              pp_size=pp_base,
-                              eager_mode=True),
+                ParallelSetup(tp_size=tp_base, pp_size=pp_base, eager_mode=True),
            ],
            distributed_backends=["mp"],
            runner=runner,
-            test_options=PPTestOptions(multi_node_only=multi_node_only,
-                                       load_format=load_format),
+            test_options=PPTestOptions(
+                multi_node_only=multi_node_only, load_format=load_format
+            ),
        )

    def iter_params(self, model_id: str):
@@ -281,8 +271,10 @@ def _compare_tp(
    if num_gpus_available < tp_size * pp_size:
        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
    if VLLM_MULTI_NODE and distributed_backend == "mp":
-        pytest.skip("Skipping multi-node pipeline parallel test for "
-                    "multiprocessing distributed backend")
+        pytest.skip(
+            "Skipping multi-node pipeline parallel test for "
+            "multiprocessing distributed backend"
+        )
    if multi_node_only and not VLLM_MULTI_NODE:
        pytest.skip("Not in multi-node setting")

@@ -357,20 +349,16 @@ def _compare_tp(
        "mp",
    ]

-    compare_two_settings(model_id,
-                         pp_args,
-                         tp_args,
-                         pp_env,
-                         tp_env,
-                         method=method)
+    compare_two_settings(model_id, pp_args, tp_args, pp_env, tp_env, method=method)


@pytest.mark.parametrize(
-    ("model_id", "parallel_setup", "distributed_backend", "runner",
-     "test_options"),
+    ("model_id", "parallel_setup", "distributed_backend", "runner", "test_options"),
    [
-        params for model_id, settings in TEXT_GENERATION_MODELS.items()
-        for params in settings.iter_params(model_id) if model_id in TEST_MODELS
+        params
+        for model_id, settings in TEXT_GENERATION_MODELS.items()
+        for params in settings.iter_params(model_id)
+        if model_id in TEST_MODELS
    ],
 )
@create_new_process_for_each_test()
@@ -382,22 +370,25 @@ def test_tp_language_generation(
    test_options: PPTestOptions,
    num_gpus_available,
 ):
-    _compare_tp(model_id,
-                parallel_setup,
-                distributed_backend,
-                runner,
-                test_options,
-                num_gpus_available,
-                method="generate",
-                is_multimodal=False)
+    _compare_tp(
+        model_id,
+        parallel_setup,
+        distributed_backend,
+        runner,
+        test_options,
+        num_gpus_available,
+        method="generate",
+        is_multimodal=False,
+    )


@pytest.mark.parametrize(
-    ("model_id", "parallel_setup", "distributed_backend", "runner",
-     "test_options"),
+    ("model_id", "parallel_setup", "distributed_backend", "runner", "test_options"),
    [
-        params for model_id, settings in EMBEDDING_MODELS.items()
-        for params in settings.iter_params(model_id) if model_id in TEST_MODELS
+        params
+        for model_id, settings in EMBEDDING_MODELS.items()
+        for params in settings.iter_params(model_id)
+        if model_id in TEST_MODELS
    ],
 )
@create_new_process_for_each_test()
@@ -409,22 +400,25 @@ def test_tp_language_embedding(
    test_options: PPTestOptions,
    num_gpus_available,
 ):
-    _compare_tp(model_id,
-                parallel_setup,
-                distributed_backend,
-                runner,
-                test_options,
-                num_gpus_available,
-                method="encode",
-                is_multimodal=False)
+    _compare_tp(
+        model_id,
+        parallel_setup,
+        distributed_backend,
+        runner,
+        test_options,
+        num_gpus_available,
+        method="encode",
+        is_multimodal=False,
+    )


@pytest.mark.parametrize(
-    ("model_id", "parallel_setup", "distributed_backend", "runner",
-     "test_options"),
+    ("model_id", "parallel_setup", "distributed_backend", "runner", "test_options"),
    [
-        params for model_id, settings in MULTIMODAL_MODELS.items()
-        for params in settings.iter_params(model_id) if model_id in TEST_MODELS
+        params
+        for model_id, settings in MULTIMODAL_MODELS.items()
+        for params in settings.iter_params(model_id)
+        if model_id in TEST_MODELS
    ],
 )
@create_new_process_for_each_test()
@@ -436,11 +430,13 @@ def test_tp_multimodal_generation(
    test_options: PPTestOptions,
    num_gpus_available,
 ):
-    _compare_tp(model_id,
-                parallel_setup,
-                distributed_backend,
-                runner,
-                test_options,
-                num_gpus_available,
-                method="generate",
-                is_multimodal=True)
+    _compare_tp(
+        model_id,
+        parallel_setup,
+        distributed_backend,
+        runner,
+        test_options,
+        num_gpus_available,
+        method="generate",
+        is_multimodal=True,
+    )
--- a/tests/distributed/test_pipeline_partition.py
+++ b/tests/distributed/test_pipeline_partition.py
@@ -9,7 +9,6 @@ from vllm.distributed.utils import get_pp_indices


 def test_custom_layer_partition(monkeypatch: pytest.MonkeyPatch):
-
    with monkeypatch.context() as m:

        def _verify(partition_str, num_layers, pp_size, goldens):
@@ -57,7 +56,8 @@ def test_custom_layer_partition(monkeypatch: pytest.MonkeyPatch):
        (5, 3, 0, (0, 2)),
        (5, 3, 1, (2, 4)),
        (5, 3, 2, (4, 5)),
-    ])
+    ],
+)
 def test_uneven_auto_partition(
    num_hidden_layers: int,
    pp_size: int,
--- a/tests/distributed/test_pp_cudagraph.py
+++ b/tests/distributed/test_pp_cudagraph.py
@@ -12,12 +12,18 @@ if TYPE_CHECKING:
    from typing_extensions import LiteralString


-@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
-    (2, "JackFram/llama-160m"),
-])
-@pytest.mark.parametrize("ATTN_BACKEND", [
-    "FLASH_ATTN",
-])
+@pytest.mark.parametrize(
+    "PP_SIZE, MODEL_NAME",
+    [
+        (2, "JackFram/llama-160m"),
+    ],
+)
+@pytest.mark.parametrize(
+    "ATTN_BACKEND",
+    [
+        "FLASH_ATTN",
+    ],
+)
@create_new_process_for_each_test()
 def test_pp_cudagraph(
    monkeypatch: pytest.MonkeyPatch,
--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
@@ -9,13 +9,15 @@ import pytest
 import torch
 import torch.distributed

-from vllm.distributed.communication_op import (  # noqa
-    tensor_model_parallel_all_reduce)
+from vllm.distributed.communication_op import tensor_model_parallel_all_reduce  # noqa
 from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
 from vllm.distributed.device_communicators.pynccl_wrapper import NCCLLibrary
-from vllm.distributed.parallel_state import (ensure_model_parallel_initialized,
-                                             get_world_group, graph_capture,
-                                             init_distributed_environment)
+from vllm.distributed.parallel_state import (
+    ensure_model_parallel_initialized,
+    get_world_group,
+    graph_capture,
+    init_distributed_environment,
+)
 from vllm.utils import update_environment_variables


@@ -24,13 +26,13 @@ def distributed_run(fn, world_size):
    processes: list[multiprocessing.Process] = []
    for i in range(number_of_processes):
        env: dict[str, str] = {}
-        env['RANK'] = str(i)
-        env['LOCAL_RANK'] = str(i)
-        env['WORLD_SIZE'] = str(number_of_processes)
-        env['LOCAL_WORLD_SIZE'] = str(number_of_processes)
-        env['MASTER_ADDR'] = 'localhost'
-        env['MASTER_PORT'] = '12345'
-        p = multiprocessing.Process(target=fn, args=(env, ))
+        env["RANK"] = str(i)
+        env["LOCAL_RANK"] = str(i)
+        env["WORLD_SIZE"] = str(number_of_processes)
+        env["LOCAL_WORLD_SIZE"] = str(number_of_processes)
+        env["MASTER_ADDR"] = "localhost"
+        env["MASTER_PORT"] = "12345"
+        p = multiprocessing.Process(target=fn, args=(env,))
        processes.append(p)
        p.start()

@@ -47,7 +49,7 @@ def worker_fn_wrapper(fn):
    # and update the environment variables in the function
    def wrapped_fn(env):
        update_environment_variables(env)
-        local_rank = os.environ['LOCAL_RANK']
+        local_rank = os.environ["LOCAL_RANK"]
        device = torch.device(f"cuda:{local_rank}")
        torch.cuda.set_device(device)
        init_distributed_environment()
@@ -58,17 +60,18 @@ def worker_fn_wrapper(fn):

@worker_fn_wrapper
 def worker_fn():
-    pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
-                                     device=get_world_group().device)
-    tensor = torch.ones(16, 1024, 1024,
-                        dtype=torch.float32).cuda(pynccl_comm.rank)
+    pynccl_comm = PyNcclCommunicator(
+        get_world_group().cpu_group, device=get_world_group().device
+    )
+    tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank)
    tensor = pynccl_comm.all_reduce(tensor)
    torch.cuda.synchronize()
    assert torch.all(tensor == pynccl_comm.world_size).cpu().item()


-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.skipif(
+    torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
+)
 def test_pynccl():
    distributed_run(worker_fn, 2)

@@ -78,7 +81,7 @@ def multiple_allreduce_worker_fn():
    device = torch.device(f"cuda:{torch.distributed.get_rank()}")
    groups = [
        torch.distributed.new_group(ranks=[0, 1], backend="gloo"),
-        torch.distributed.new_group(ranks=[2, 3], backend="gloo")
+        torch.distributed.new_group(ranks=[2, 3], backend="gloo"),
    ]
    group = groups[0] if torch.distributed.get_rank() in [0, 1] else groups[1]
    pynccl_comm = PyNcclCommunicator(group=group, device=device)
@@ -95,8 +98,9 @@ def multiple_allreduce_worker_fn():
        assert torch.all(tensor == 2).cpu().item()


-@pytest.mark.skipif(torch.cuda.device_count() < 4,
-                    reason="Need at least 4 GPUs to run the test.")
+@pytest.mark.skipif(
+    torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test."
+)
 def test_pynccl_multiple_allreduce():
    # this tests pynccl for multiple tp groups, in a standalone way
    # i.e. call `pynccl_comm.all_reduce` directly
@@ -121,8 +125,9 @@ def multiple_allreduce_with_vllm_worker_fn():
            assert torch.all(tensor == 2).cpu().item()


-@pytest.mark.skipif(torch.cuda.device_count() < 4,
-                    reason="Need at least 4 GPUs to run the test.")
+@pytest.mark.skipif(
+    torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test."
+)
 def test_pynccl_multiple_allreduce_with_vllm():
    # this tests pynccl for multiple tp groups, together with vllm
    # i.e. call `tensor_model_parallel_all_reduce`
@@ -133,10 +138,11 @@ def test_pynccl_multiple_allreduce_with_vllm():
 def worker_fn_with_cudagraph():
    with torch.no_grad():
        graph = torch.cuda.CUDAGraph()
-        pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
-                                         device=get_world_group().device)
+        pynccl_comm = PyNcclCommunicator(
+            get_world_group().cpu_group, device=get_world_group().device
+        )
        # run something in the default stream to initialize torch engine
-        a = torch.ones((4, 4), device=f'cuda:{pynccl_comm.rank}')
+        a = torch.ones((4, 4), device=f"cuda:{pynccl_comm.rank}")
        torch.cuda.synchronize()
        with torch.cuda.graph(graph):
            a_out = pynccl_comm.all_reduce(a)
@@ -148,84 +154,90 @@ def worker_fn_with_cudagraph():

@worker_fn_wrapper
 def all_gather_worker_fn():
-    pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
-                                     device=get_world_group().device)
+    pynccl_comm = PyNcclCommunicator(
+        get_world_group().cpu_group, device=get_world_group().device
+    )

    rank = pynccl_comm.rank
    world_size = pynccl_comm.world_size
-    device = f'cuda:{pynccl_comm.rank}'
+    device = f"cuda:{pynccl_comm.rank}"

    num_elems = 1000
-    tensor = torch.arange(num_elems, dtype=torch.float32,
-                          device=device) + rank * num_elems
-    result = torch.zeros(num_elems * world_size,
-                         dtype=torch.float32,
-                         device=device)
+    tensor = (
+        torch.arange(num_elems, dtype=torch.float32, device=device) + rank * num_elems
+    )
+    result = torch.zeros(num_elems * world_size, dtype=torch.float32, device=device)

-    expected = torch.cat([
-        torch.arange(num_elems, dtype=torch.float32) + r * num_elems
-        for r in range(world_size)
-    ]).to(device)
+    expected = torch.cat(
+        [
+            torch.arange(num_elems, dtype=torch.float32) + r * num_elems
+            for r in range(world_size)
+        ]
+    ).to(device)

    pynccl_comm.all_gather(result, tensor)
    torch.cuda.synchronize()
    torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)


-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.skipif(
+    torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
+)
 def test_pynccl_all_gather():
    distributed_run(all_gather_worker_fn, 2)


@worker_fn_wrapper
 def all_gatherv_worker_fn():
-    pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
-                                     device=get_world_group().device)
+    pynccl_comm = PyNcclCommunicator(
+        get_world_group().cpu_group, device=get_world_group().device
+    )

    rank = pynccl_comm.rank
    world_size = pynccl_comm.world_size
-    device = f'cuda:{pynccl_comm.rank}'
+    device = f"cuda:{pynccl_comm.rank}"

    assert world_size <= 8
    sizes = [81, 20, 57, 52, 81, 5, 49, 49][:world_size]
    num_elems = sizes[rank]
-    tensor = torch.arange(num_elems, dtype=torch.float32,
-                          device=device) + rank * 100
+    tensor = torch.arange(num_elems, dtype=torch.float32, device=device) + rank * 100
    result = torch.zeros(sum(sizes), dtype=torch.float32, device=device)

-    expected = torch.cat([
-        torch.arange(sizes[r], dtype=torch.float32) + r * 100
-        for r in range(world_size)
-    ]).to(device)
+    expected = torch.cat(
+        [
+            torch.arange(sizes[r], dtype=torch.float32) + r * 100
+            for r in range(world_size)
+        ]
+    ).to(device)

    pynccl_comm.all_gatherv(result, tensor, sizes=sizes)
    torch.cuda.synchronize()
    torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)


-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.skipif(
+    torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
+)
 def test_pynccl_all_gatherv():
    distributed_run(all_gatherv_worker_fn, 2)


@worker_fn_wrapper
 def reduce_scatter_worker_fn():
-    pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
-                                     device=get_world_group().device)
+    pynccl_comm = PyNcclCommunicator(
+        get_world_group().cpu_group, device=get_world_group().device
+    )

    rank = pynccl_comm.rank
    world_size = pynccl_comm.world_size
-    device = f'cuda:{pynccl_comm.rank}'
+    device = f"cuda:{pynccl_comm.rank}"

    num_elems = 1000
-    tensor = torch.arange(num_elems, dtype=torch.float32,
-                          device=device) + rank * num_elems
-    assert (num_elems % world_size == 0)
-    result = torch.zeros(num_elems // world_size,
-                         dtype=torch.float32,
-                         device=device)
+    tensor = (
+        torch.arange(num_elems, dtype=torch.float32, device=device) + rank * num_elems
+    )
+    assert num_elems % world_size == 0
+    result = torch.zeros(num_elems // world_size, dtype=torch.float32, device=device)

    # Calculate expected result for this rank's chunk
    scattered_size = num_elems // world_size
@@ -233,34 +245,37 @@ def reduce_scatter_worker_fn():
        torch.arange(num_elems, dtype=torch.float32) + r * num_elems
        for r in range(world_size)
    ]
-    expected = sum(tensor[rank * scattered_size:(rank + 1) * scattered_size]
-                   for tensor in all_tensors).to(device)
+    expected = sum(
+        tensor[rank * scattered_size : (rank + 1) * scattered_size]
+        for tensor in all_tensors
+    ).to(device)

    pynccl_comm.reduce_scatter(result, tensor)
    torch.cuda.synchronize()
    torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)


-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.skipif(
+    torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
+)
 def test_pynccl_reduce_scatter():
    distributed_run(reduce_scatter_worker_fn, 2)


@worker_fn_wrapper
 def reduce_scatterv_worker_fn():
-    pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
-                                     device=get_world_group().device)
+    pynccl_comm = PyNcclCommunicator(
+        get_world_group().cpu_group, device=get_world_group().device
+    )

    rank = pynccl_comm.rank
    world_size = pynccl_comm.world_size
-    device = f'cuda:{pynccl_comm.rank}'
+    device = f"cuda:{pynccl_comm.rank}"

    assert world_size <= 8
    sizes = [81, 20, 57, 52, 81, 5, 49, 49][:world_size]
    num_elems = sum(sizes)
-    tensor = torch.arange(num_elems, dtype=torch.float32,
-                          device=device) + rank * 100
+    tensor = torch.arange(num_elems, dtype=torch.float32, device=device) + rank * 100
    result = torch.zeros(sizes[rank], dtype=torch.float32, device=device)

    # Calculate expected result for this rank's chunk
@@ -278,41 +293,41 @@ def reduce_scatterv_worker_fn():
    torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)


-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.skipif(
+    torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
+)
 def test_pynccl_reduce_scatterv():
    distributed_run(reduce_scatterv_worker_fn, 2)


-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.skipif(
+    torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
+)
 def test_pynccl_with_cudagraph():
    distributed_run(worker_fn_with_cudagraph, 2)


@worker_fn_wrapper
 def send_recv_worker_fn():
-    pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
-                                     device=get_world_group().device)
+    pynccl_comm = PyNcclCommunicator(
+        get_world_group().cpu_group, device=get_world_group().device
+    )
    if pynccl_comm.rank == 0:
-        tensor = torch.ones(16, 1024, 1024,
-                            dtype=torch.float32).cuda(pynccl_comm.rank)
+        tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank)
    else:
-        tensor = torch.empty(16, 1024, 1024,
-                             dtype=torch.float32).cuda(pynccl_comm.rank)
+        tensor = torch.empty(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank)

    if pynccl_comm.rank == 0:
-        pynccl_comm.send(tensor,
-                         dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size)
+        pynccl_comm.send(tensor, dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size)
    else:
-        pynccl_comm.recv(tensor,
-                         src=(pynccl_comm.rank - 1) % pynccl_comm.world_size)
+        pynccl_comm.recv(tensor, src=(pynccl_comm.rank - 1) % pynccl_comm.world_size)
    torch.cuda.synchronize()
    assert torch.all(tensor == 1).cpu().item()


-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.skipif(
+    torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test."
+)
 def test_pynccl_send_recv():
    distributed_run(send_recv_worker_fn, 2)

@@ -322,27 +337,20 @@ def multiple_send_recv_worker_fn():
    device = torch.device(f"cuda:{torch.distributed.get_rank()}")
    groups = [
        torch.distributed.new_group(ranks=[0, 2], backend="gloo"),
-        torch.distributed.new_group(ranks=[1, 3], backend="gloo")
+        torch.distributed.new_group(ranks=[1, 3], backend="gloo"),
    ]
    group = groups[0] if torch.distributed.get_rank() in [0, 2] else groups[1]
    pynccl_comm = PyNcclCommunicator(group=group, device=device)
    if torch.distributed.get_rank() == 0:
        tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
    elif torch.distributed.get_rank() == 1:
-        tensor = 2 * torch.ones(
-            16, 1024, 1024, dtype=torch.float32, device=device)
+        tensor = 2 * torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
    else:
-        tensor = torch.empty(16,
-                             1024,
-                             1024,
-                             dtype=torch.float32,
-                             device=device)
+        tensor = torch.empty(16, 1024, 1024, dtype=torch.float32, device=device)
    if torch.distributed.get_rank() in [0, 1]:
-        pynccl_comm.send(tensor,
-                         dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size)
+        pynccl_comm.send(tensor, dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size)
    else:
-        pynccl_comm.recv(tensor,
-                         src=(pynccl_comm.rank - 1) % pynccl_comm.world_size)
+        pynccl_comm.recv(tensor, src=(pynccl_comm.rank - 1) % pynccl_comm.world_size)
    torch.cuda.synchronize()
    if torch.distributed.get_rank() in [0, 2]:
        assert torch.all(tensor == 1).cpu().item()
@@ -350,14 +358,16 @@ def multiple_send_recv_worker_fn():
        assert torch.all(tensor == 2).cpu().item()


-@pytest.mark.skipif(torch.cuda.device_count() < 4,
-                    reason="Need at least 4 GPUs to run the test.")
+@pytest.mark.skipif(
+    torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test."
+)
 def test_pynccl_multiple_send_recv():
    distributed_run(multiple_send_recv_worker_fn, 4)


-@pytest.mark.skipif(torch.cuda.device_count() < 4,
-                    reason="Need at least 4 GPUs to run the test.")
+@pytest.mark.skipif(
+    torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test."
+)
 def test_pynccl_broadcast():
    distributed_run(broadcast_worker_fn, 4)

@@ -366,19 +376,17 @@ def test_pynccl_broadcast():
 def broadcast_worker_fn():
    # Test broadcast for every root rank.
    # Essentially this is an all-gather operation.
-    pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
-                                     device=get_world_group().device)
+    pynccl_comm = PyNcclCommunicator(
+        get_world_group().cpu_group, device=get_world_group().device
+    )
    recv_tensors = [
-        torch.empty(16,
-                    1024,
-                    1024,
-                    dtype=torch.float32,
-                    device=pynccl_comm.device)
+        torch.empty(16, 1024, 1024, dtype=torch.float32, device=pynccl_comm.device)
        for i in range(pynccl_comm.world_size)
    ]
-    recv_tensors[pynccl_comm.rank] = torch.ones(
-        16, 1024, 1024, dtype=torch.float32,
-        device=pynccl_comm.device) * pynccl_comm.rank
+    recv_tensors[pynccl_comm.rank] = (
+        torch.ones(16, 1024, 1024, dtype=torch.float32, device=pynccl_comm.device)
+        * pynccl_comm.rank
+    )

    for i in range(pynccl_comm.world_size):
        pynccl_comm.broadcast(recv_tensors[i], src=i)
--- a/tests/distributed/test_quick_all_reduce.py
+++ b/tests/distributed/test_quick_all_reduce.py
@@ -8,20 +8,20 @@ import ray
 import torch
 import torch.distributed as dist

-from vllm.distributed.communication_op import (  # noqa
-    tensor_model_parallel_all_reduce)
+from vllm.distributed.communication_op import tensor_model_parallel_all_reduce  # noqa
 from vllm.distributed.parallel_state import get_tp_group, graph_capture
 from vllm.platforms import current_platform

-from ..utils import (ensure_model_parallel_initialized,
-                     init_test_distributed_environment, multi_process_parallel)
+from ..utils import (
+    ensure_model_parallel_initialized,
+    init_test_distributed_environment,
+    multi_process_parallel,
+)

 torch.manual_seed(42)
 random.seed(44)
 # Size over 8MB is sufficient for custom quick allreduce.
-test_sizes = [
-    random.randint(8 * 1024 * 1024, 10 * 1024 * 1024) for _ in range(8)
-]
+test_sizes = [random.randint(8 * 1024 * 1024, 10 * 1024 * 1024) for _ in range(8)]
 for i, v in enumerate(test_sizes):
    test_sizes[i] -= v % 8

@@ -38,8 +38,7 @@ def graph_quickreduce(
        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
        device = torch.device(f"cuda:{rank}")
        torch.cuda.set_device(device)
-        init_test_distributed_environment(tp_size, pp_size, rank,
-                                          distributed_init_port)
+        init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
        ensure_model_parallel_initialized(tp_size, pp_size)
        group = get_tp_group().device_group

@@ -64,18 +63,15 @@ def graph_quickreduce(
        for sz in test_sizes:
            for dtype in [torch.float16, torch.bfloat16]:
                with graph_capture(device=device) as graph_capture_context:
-                    inp1 = torch.randint(1,
-                                         23, (sz, ),
-                                         dtype=dtype,
-                                         device=torch.cuda.current_device())
-                    inp2 = torch.randint(-23,
-                                         1, (sz, ),
-                                         dtype=dtype,
-                                         device=torch.cuda.current_device())
+                    inp1 = torch.randint(
+                        1, 23, (sz,), dtype=dtype, device=torch.cuda.current_device()
+                    )
+                    inp2 = torch.randint(
+                        -23, 1, (sz,), dtype=dtype, device=torch.cuda.current_device()
+                    )
                    torch.cuda.synchronize()
                    graph = torch.cuda.CUDAGraph()
-                    with torch.cuda.graph(graph,
-                                          stream=graph_capture_context.stream):
+                    with torch.cuda.graph(graph, stream=graph_capture_context.stream):
                        for _ in range(num_communication):
                            out1 = tensor_model_parallel_all_reduce(inp1)
                            dist.all_reduce(inp1, group=group)
@@ -99,39 +95,42 @@ def eager_quickreduce(
        device = torch.device(f"cuda:{rank}")
        torch.cuda.set_device(device)

-        init_test_distributed_environment(tp_size, pp_size, rank,
-                                          distributed_init_port)
+        init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)

        # Size over 8MB is sufficient for custom quick allreduce.
        sz = 16 * 1024 * 1024
        fa = get_tp_group().device_communicator.qr_comm
-        inp = torch.tensor([1.0 * ((i) % 23) for i in range(sz)],
-                           dtype=torch.float16,
-                           device=device)
+        inp = torch.tensor(
+            [1.0 * ((i) % 23) for i in range(sz)], dtype=torch.float16, device=device
+        )
        out = fa.quick_all_reduce(inp)
        torch.testing.assert_close(out, inp * tp_size, atol=2.5, rtol=0.1)

-        inp = torch.tensor([1.0 * ((i) % 23) for i in range(sz)],
-                           dtype=torch.bfloat16,
-                           device=device)
+        inp = torch.tensor(
+            [1.0 * ((i) % 23) for i in range(sz)], dtype=torch.bfloat16, device=device
+        )
        out = fa.quick_all_reduce(inp)
        torch.testing.assert_close(out, inp * tp_size, atol=2.5, rtol=0.1)


-@pytest.mark.skipif(not current_platform.is_rocm(),
-                    reason="only test quick allreduce for rocm")
+@pytest.mark.skipif(
+    not current_platform.is_rocm(), reason="only test quick allreduce for rocm"
+)
@pytest.mark.parametrize("quant_mode", ["FP", "INT8", "INT6", "INT4"])
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("pipeline_parallel_size", [1, 2])
@pytest.mark.parametrize("test_target", [graph_quickreduce, eager_quickreduce])
-def test_custom_quick_allreduce(monkeypatch: pytest.MonkeyPatch, tp_size,
-                                pipeline_parallel_size, test_target,
-                                quant_mode):
+def test_custom_quick_allreduce(
+    monkeypatch: pytest.MonkeyPatch,
+    tp_size,
+    pipeline_parallel_size,
+    test_target,
+    quant_mode,
+):
    world_size = tp_size * pipeline_parallel_size
    if world_size > torch.cuda.device_count():
        pytest.skip("Not enough GPUs to run the test.")

    monkeypatch.setenv("VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", quant_mode)

-    multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size,
-                           test_target)
+    multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size, test_target)
--- a/tests/distributed/test_same_node.py
+++ b/tests/distributed/test_same_node.py
@@ -22,15 +22,13 @@ if __name__ == "__main__":
        dist.broadcast_object_list(recv, src=0)
        ip, port = recv

-    stateless_pg = StatelessProcessGroup.create(ip, port, rank,
-                                                dist.get_world_size())
+    stateless_pg = StatelessProcessGroup.create(ip, port, rank, dist.get_world_size())

    for pg in [dist.group.WORLD, stateless_pg]:
        test_result = all(in_the_same_node_as(pg, source_rank=0))

        expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
-        assert test_result == expected, \
-            f"Expected {expected}, got {test_result}"
+        assert test_result == expected, f"Expected {expected}, got {test_result}"
        if pg == dist.group.WORLD:
            print("Same node test passed! when using torch distributed!")
        else:
--- a/tests/distributed/test_sequence_parallel.py
+++ b/tests/distributed/test_sequence_parallel.py
@@ -7,6 +7,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
 all workers in a node other than the head node, which can cause the test
 to fail.
 """
+
 import json
 import os
 from dataclasses import dataclass
@@ -56,7 +57,8 @@ class SPTestSettings:
            raise ValueError(
                f"Length mismatch: distributed_backends "
                f"({len(self.distributed_backends)}) != "
-                f"vllm_major_versions ({len(self.vllm_major_versions)})")
+                f"vllm_major_versions ({len(self.vllm_major_versions)})"
+            )

    @staticmethod
    def detailed(
@@ -72,18 +74,22 @@ class SPTestSettings:
            for pp_multiplier in [1, 2]:
                for chunked_prefill_val in [False, True]:
                    parallel_setups.append(
-                        ParallelSetup(tp_size=tp_base,
-                                      pp_size=pp_multiplier * pp_base,
-                                      enable_fusion=False,
-                                      eager_mode=eager_mode_val,
-                                      chunked_prefill=chunked_prefill_val))
+                        ParallelSetup(
+                            tp_size=tp_base,
+                            pp_size=pp_multiplier * pp_base,
+                            enable_fusion=False,
+                            eager_mode=eager_mode_val,
+                            chunked_prefill=chunked_prefill_val,
+                        )
+                    )
        return SPTestSettings(
            parallel_setups=parallel_setups,
            distributed_backends=["mp", "ray"],
            vllm_major_versions=["1", "1"],
            runner=runner,
-            test_options=SPTestOptions(multi_node_only=multi_node_only,
-                                       load_format=load_format),
+            test_options=SPTestOptions(
+                multi_node_only=multi_node_only, load_format=load_format
+            ),
        )

    @staticmethod
@@ -100,18 +106,22 @@ class SPTestSettings:
            for pp_multiplier in [1, 2]:
                for chunked_prefill_val in [False, True]:
                    parallel_setups.append(
-                        ParallelSetup(tp_size=tp_base,
-                                      pp_size=pp_multiplier * pp_base,
-                                      enable_fusion=False,
-                                      eager_mode=eager_mode_val,
-                                      chunked_prefill=chunked_prefill_val))
+                        ParallelSetup(
+                            tp_size=tp_base,
+                            pp_size=pp_multiplier * pp_base,
+                            enable_fusion=False,
+                            eager_mode=eager_mode_val,
+                            chunked_prefill=chunked_prefill_val,
+                        )
+                    )
        return SPTestSettings(
            parallel_setups=parallel_setups,
            distributed_backends=["mp", "ray"],
            vllm_major_versions=["1", "1"],
            runner=runner,
-            test_options=SPTestOptions(multi_node_only=multi_node_only,
-                                       load_format=load_format),
+            test_options=SPTestOptions(
+                multi_node_only=multi_node_only, load_format=load_format
+            ),
        )

    @staticmethod
@@ -126,28 +136,39 @@ class SPTestSettings:
        parallel_setups = []
        for fusion_val in [False, True]:
            parallel_setups.append(
-                ParallelSetup(tp_size=tp_base,
-                              pp_size=pp_base,
-                              enable_fusion=fusion_val,
-                              eager_mode=True,
-                              chunked_prefill=False))
+                ParallelSetup(
+                    tp_size=tp_base,
+                    pp_size=pp_base,
+                    enable_fusion=fusion_val,
+                    eager_mode=True,
+                    chunked_prefill=False,
+                )
+            )
        return SPTestSettings(
            parallel_setups=parallel_setups,
            distributed_backends=["mp", "ray"],
            vllm_major_versions=["1", "1"],
            runner=runner,
-            test_options=SPTestOptions(multi_node_only=multi_node_only,
-                                       load_format=load_format),
+            test_options=SPTestOptions(
+                multi_node_only=multi_node_only, load_format=load_format
+            ),
        )

    def iter_params(self, model_id: str):
        opts = self.test_options

        for parallel_setup in self.parallel_setups:
-            for backend, vllm_major_version in zip(self.distributed_backends,
-                                                   self.vllm_major_versions):
-                yield (model_id, parallel_setup, backend, vllm_major_version,
-                       self.runner, opts)
+            for backend, vllm_major_version in zip(
+                self.distributed_backends, self.vllm_major_versions
+            ):
+                yield (
+                    model_id,
+                    parallel_setup,
+                    backend,
+                    vllm_major_version,
+                    self.runner,
+                    opts,
+                )


 def _compare_sp(
@@ -200,8 +221,10 @@ def _compare_sp(
    if num_gpus_available < tp_size * pp_size:
        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
    if VLLM_MULTI_NODE and distributed_backend == "mp":
-        pytest.skip("Skipping multi-node pipeline parallel test for "
-                    "multiprocessing distributed backend")
+        pytest.skip(
+            "Skipping multi-node pipeline parallel test for "
+            "multiprocessing distributed backend"
+        )
    if multi_node_only and not VLLM_MULTI_NODE:
        pytest.skip("Not in multi-node setting")

@@ -232,13 +255,13 @@ def _compare_sp(
        common_args.append("--skip-tokenizer-init")

    compilation_config = {
-        'level': 3,
-        'custom_ops': ["+rms_norm"],
-        'compile_sizes': [4, 8],
-        'pass_config': {
-            'enable_sequence_parallelism': True,
-            'enable_fusion': enable_fusion,
-            'enable_noop': True,
+        "level": 3,
+        "custom_ops": ["+rms_norm"],
+        "compile_sizes": [4, 8],
+        "pass_config": {
+            "enable_sequence_parallelism": True,
+            "enable_fusion": enable_fusion,
+            "enable_noop": True,
        },
    }

@@ -270,12 +293,9 @@ def _compare_sp(
    ]

    try:
-        compare_two_settings(model_id,
-                             tp_sp_args,
-                             tp_args,
-                             tp_sp_env,
-                             tp_env,
-                             method=method)
+        compare_two_settings(
+            model_id, tp_sp_args, tp_args, tp_sp_env, tp_env, method=method
+        )
    except Exception:
        testing_ray_compiled_graph = tp_sp_env is not None
        if testing_ray_compiled_graph and vllm_major_version == "0":
@@ -301,10 +321,17 @@ SP_TEST_MODELS = [


@pytest.mark.parametrize(
-    ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
-     "runner", "test_options"),
+    (
+        "model_id",
+        "parallel_setup",
+        "distributed_backend",
+        "vllm_major_version",
+        "runner",
+        "test_options",
+    ),
    [
-        params for model_id, settings in SP_TEXT_GENERATION_MODELS.items()
+        params
+        for model_id, settings in SP_TEXT_GENERATION_MODELS.items()
        for params in settings.iter_params(model_id)
        if model_id in SP_TEST_MODELS
    ],
@@ -319,12 +346,14 @@ def test_tp_sp_generation(
    test_options: SPTestOptions,
    num_gpus_available,
 ):
-    _compare_sp(model_id,
-                parallel_setup,
-                distributed_backend,
-                vllm_major_version,
-                runner,
-                test_options,
-                num_gpus_available,
-                method="generate",
-                is_multimodal=False)
+    _compare_sp(
+        model_id,
+        parallel_setup,
+        distributed_backend,
+        vllm_major_version,
+        runner,
+        test_options,
+        num_gpus_available,
+        method="generate",
+        is_multimodal=False,
+    )
--- a/tests/distributed/test_shm_broadcast.py
+++ b/tests/distributed/test_shm_broadcast.py
@@ -26,13 +26,13 @@ def distributed_run(fn, world_size):
    processes = []
    for i in range(number_of_processes):
        env = {}
-        env['RANK'] = str(i)
-        env['LOCAL_RANK'] = str(i)
-        env['WORLD_SIZE'] = str(number_of_processes)
-        env['LOCAL_WORLD_SIZE'] = str(number_of_processes)
-        env['MASTER_ADDR'] = 'localhost'
-        env['MASTER_PORT'] = '12345'
-        p = multiprocessing.Process(target=fn, args=(env, ))
+        env["RANK"] = str(i)
+        env["LOCAL_RANK"] = str(i)
+        env["WORLD_SIZE"] = str(number_of_processes)
+        env["LOCAL_WORLD_SIZE"] = str(number_of_processes)
+        env["MASTER_ADDR"] = "localhost"
+        env["MASTER_PORT"] = "12345"
+        p = multiprocessing.Process(target=fn, args=(env,))
        processes.append(p)
        p.start()

@@ -57,25 +57,23 @@ def worker_fn_wrapper(fn):

@worker_fn_wrapper
 def worker_fn():
-
    rank = dist.get_rank()
    if rank == 0:
        port = get_open_port()
-        ip = '127.0.0.1'
+        ip = "127.0.0.1"
        dist.broadcast_object_list([ip, port], src=0)
    else:
        recv = [None, None]
        dist.broadcast_object_list(recv, src=0)
        ip, port = recv  # type: ignore

-    stateless_pg = StatelessProcessGroup.create(ip, port, rank,
-                                                dist.get_world_size())
+    stateless_pg = StatelessProcessGroup.create(ip, port, rank, dist.get_world_size())

    for pg in [dist.group.WORLD, stateless_pg]:
-
        writer_rank = 2
        broadcaster = MessageQueue.create_from_process_group(
-            pg, 40 * 1024, 2, writer_rank)
+            pg, 40 * 1024, 2, writer_rank
+        )
        if rank == writer_rank:
            seed = random.randint(0, 1000)
            dist.broadcast_object_list([seed], writer_rank)
--- a/tests/distributed/test_shm_buffer.py
+++ b/tests/distributed/test_shm_buffer.py
@@ -5,7 +5,8 @@ import traceback
 import unittest

 from vllm.distributed.device_communicators.shm_object_storage import (
-    SingleWriterShmRingBuffer)
+    SingleWriterShmRingBuffer,
+)


 class TestSingleWriterShmRingBuffer(unittest.TestCase):
@@ -25,18 +26,21 @@ class TestSingleWriterShmRingBuffer(unittest.TestCase):
        """Test opening an existing buffer"""
        # First create a buffer
        self.ring_buffer = SingleWriterShmRingBuffer(
-            data_buffer_size=self.buffer_size, create=True)
+            data_buffer_size=self.buffer_size, create=True
+        )

        # Then open it with another instance
        reader_buffer = SingleWriterShmRingBuffer(*self.ring_buffer.handle())
        self.assertFalse(reader_buffer.is_writer)
-        self.assertEqual(reader_buffer.shared_memory.name,
-                         self.ring_buffer.shared_memory.name)
+        self.assertEqual(
+            reader_buffer.shared_memory.name, self.ring_buffer.shared_memory.name
+        )

    def test_buffer_access(self):
        """Test accessing allocated buffers"""
        self.ring_buffer = SingleWriterShmRingBuffer(
-            data_buffer_size=self.buffer_size, create=True)
+            data_buffer_size=self.buffer_size, create=True
+        )

        size = 100
        address, monotonic_id = self.ring_buffer.allocate_buf(size)
@@ -44,11 +48,11 @@ class TestSingleWriterShmRingBuffer(unittest.TestCase):
        # Write some test data
        test_data = b"Hello, World!" * 7  # 91 bytes
        with self.ring_buffer.access_buf(address) as (data_buf, metadata):
-            data_buf[0:len(test_data)] = test_data
+            data_buf[0 : len(test_data)] = test_data

        # Read it back
        with self.ring_buffer.access_buf(address) as (data_buf2, metadata2):
-            read_data = bytes(data_buf2[0:len(test_data)])
+            read_data = bytes(data_buf2[0 : len(test_data)])
            read_id = metadata2[0]

        self.assertEqual(read_data, test_data)
@@ -58,7 +62,8 @@ class TestSingleWriterShmRingBuffer(unittest.TestCase):
        """Test that MemoryError is raised when buffer is full"""
        small_buffer_size = 200
        self.ring_buffer = SingleWriterShmRingBuffer(
-            data_buffer_size=small_buffer_size, create=True)
+            data_buffer_size=small_buffer_size, create=True
+        )

        # Fill up the buffer
        self.ring_buffer.allocate_buf(100)
@@ -72,7 +77,8 @@ class TestSingleWriterShmRingBuffer(unittest.TestCase):
        """Test allocation and freeing of buffers"""
        small_buffer_size = 200
        self.ring_buffer = SingleWriterShmRingBuffer(
-            data_buffer_size=small_buffer_size, create=True)
+            data_buffer_size=small_buffer_size, create=True
+        )

        size = 80
        # Write some data
@@ -81,7 +87,7 @@ class TestSingleWriterShmRingBuffer(unittest.TestCase):
            address, monotonic_id = self.ring_buffer.allocate_buf(size)
            with self.ring_buffer.access_buf(address) as (data_buf, metadata):
                data_buf[0:4] = (0).to_bytes(4, "little")  # 0 for not in-use
-                data_buf[4:len(test_data) + 4] = test_data
+                data_buf[4 : len(test_data) + 4] = test_data
            print(self.ring_buffer.metadata)
            freed_ids = self.ring_buffer.free_buf(lambda *args: True)
            print(f"  Freed IDs: {freed_ids}")
@@ -90,7 +96,8 @@ class TestSingleWriterShmRingBuffer(unittest.TestCase):
    def test_clear_buffer(self):
        """Test clearing the buffer"""
        self.ring_buffer = SingleWriterShmRingBuffer(
-            data_buffer_size=self.buffer_size, create=True)
+            data_buffer_size=self.buffer_size, create=True
+        )

        # Allocate some buffers
        for _ in range(3):
@@ -121,8 +128,7 @@ def main():
    # Manual demonstration
    try:
        print("Creating ring buffer...")
-        writer_buffer = SingleWriterShmRingBuffer(data_buffer_size=2048,
-                                                  create=True)
+        writer_buffer = SingleWriterShmRingBuffer(data_buffer_size=2048, create=True)
        reader_buffer = SingleWriterShmRingBuffer(*writer_buffer.handle())

        print(f"Buffer created with name: {writer_buffer.shared_memory.name}")
@@ -140,7 +146,7 @@ def main():
                # Write some test data
                with writer_buffer.access_buf(address) as (data_buf, metadata):
                    test_message = f"Test message {i}".encode()
-                    data_buf[0:len(test_message)] = test_message
+                    data_buf[0 : len(test_message)] = test_message

            except MemoryError as e:
                print(f"  Failed to allocate {size} bytes: {e}")
--- a/tests/distributed/test_shm_storage.py
+++ b/tests/distributed/test_shm_storage.py
@@ -12,28 +12,33 @@ import torch

 # Assuming these are imported from your module
 from vllm.distributed.device_communicators.shm_object_storage import (
-    MsgpackSerde, SingleWriterShmObjectStorage, SingleWriterShmRingBuffer)
-from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargsItem,
-                                    MultiModalSharedField)
+    MsgpackSerde,
+    SingleWriterShmObjectStorage,
+    SingleWriterShmRingBuffer,
+)
+from vllm.multimodal.inputs import (
+    MultiModalFieldElem,
+    MultiModalKwargsItem,
+    MultiModalSharedField,
+)


 def _dummy_elem(modality: str, key: str, size: int):
    return MultiModalFieldElem(
        modality=modality,
        key=key,
-        data=torch.empty((size, ), dtype=torch.int8),
+        data=torch.empty((size,), dtype=torch.int8),
        field=MultiModalSharedField(1),
    )


 def _dummy_item(modality: str, size_by_key: dict[str, int]):
-    return MultiModalKwargsItem.from_elems([
-        _dummy_elem(modality, key, size) for key, size in size_by_key.items()
-    ])
+    return MultiModalKwargsItem.from_elems(
+        [_dummy_elem(modality, key, size) for key, size in size_by_key.items()]
+    )


 class TestSingleWriterShmObjectStorage(unittest.TestCase):
-
    def setUp(self):
        """Set up test fixtures before each test method."""
        ring_buffer = SingleWriterShmRingBuffer(
@@ -208,8 +213,7 @@ class TestSingleWriterShmObjectStorage(unittest.TestCase):
        with self.assertRaises(ValueError) as context:
            self.storage.get(address, monotonic_id + 100)

-        self.assertIn("has been modified or is invalid", \
-            str(context.exception))
+        self.assertIn("has been modified or is invalid", str(context.exception))

    def test_clear_storage(self):
        """Test clearing the storage."""
@@ -234,8 +238,7 @@ class TestSingleWriterShmObjectStorage(unittest.TestCase):
 # Reader process function
 def reader_process(process_id, storage_handle, items_to_read):
    """Reader process that connects to existing shared memory and reads data."""
-    reader_storage = SingleWriterShmObjectStorage.create_from_handle(
-        storage_handle)
+    reader_storage = SingleWriterShmObjectStorage.create_from_handle(storage_handle)

    print(f"Reader {process_id} started")

@@ -276,11 +279,7 @@ def run_multiprocess_example():

        # Test basic data types
        test_data = [
-            ("user_data", {
-                "name": "Alice",
-                "age": 30,
-                "scores": [95, 87, 92]
-            }),
+            ("user_data", {"name": "Alice", "age": 30, "scores": [95, 87, 92]}),
            ("simple_string", "Hello, World!"),
            ("number", 42),
            ("list_data", [1, 2, 3, "four", 5.0]),
@@ -301,8 +300,9 @@ def run_multiprocess_example():
        # initialize lock for reader processes
        handle.reader_lock = Lock()
        for i in range(storage.n_readers):
-            p = multiprocessing.Process(target=reader_process,
-                                        args=(i, handle, stored_items))
+            p = multiprocessing.Process(
+                target=reader_process, args=(i, handle, stored_items)
+            )
            processes.append(p)
            p.start()

--- a/tests/distributed/test_symm_mem_allreduce.py
+++ b/tests/distributed/test_symm_mem_allreduce.py
@@ -14,11 +14,12 @@ import vllm.envs as envs
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.distributed.communication_op import tensor_model_parallel_all_reduce
-from vllm.distributed.device_communicators.cuda_communicator import (
-    CudaCommunicator)
-from vllm.distributed.parallel_state import (get_tp_group,
-                                             init_distributed_environment,
-                                             initialize_model_parallel)
+from vllm.distributed.device_communicators.cuda_communicator import CudaCommunicator
+from vllm.distributed.parallel_state import (
+    get_tp_group,
+    init_distributed_environment,
+    initialize_model_parallel,
+)
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.llm_engine import LLMEngine
 from vllm.platforms import current_platform
@@ -32,8 +33,7 @@ test_size_elements = 1024 * 1024

 def symm_mem_allreduce_worker(local_rank: int, world_size: int, q: mp.Queue):
    monkeypatch = pytest.MonkeyPatch()
-    config = VllmConfig(parallel_config=ParallelConfig(
-        tensor_parallel_size=world_size))
+    config = VllmConfig(parallel_config=ParallelConfig(tensor_parallel_size=world_size))

    with monkeypatch.context() as m, set_current_vllm_config(config):
        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
@@ -42,34 +42,34 @@ def symm_mem_allreduce_worker(local_rank: int, world_size: int, q: mp.Queue):
        torch.cuda.set_device(device)
        torch.set_default_device(device)
        torch.set_default_dtype(dtype)
-        update_environment_variables({
-            'RANK': str(local_rank),
-            'LOCAL_RANK': str(local_rank),
-            'WORLD_SIZE': str(world_size),
-            'MASTER_ADDR': 'localhost',
-            'MASTER_PORT': '12345',
-        })
+        update_environment_variables(
+            {
+                "RANK": str(local_rank),
+                "LOCAL_RANK": str(local_rank),
+                "WORLD_SIZE": str(world_size),
+                "MASTER_ADDR": "localhost",
+                "MASTER_PORT": "12345",
+            }
+        )

        init_distributed_environment()
        initialize_model_parallel(tensor_model_parallel_size=world_size)

-        cuda_communicator = typing.cast(CudaCommunicator,
-                                        get_tp_group().device_communicator)
+        cuda_communicator = typing.cast(
+            CudaCommunicator, get_tp_group().device_communicator
+        )
        symm_mem_comm = cuda_communicator.symm_mem_comm
        if symm_mem_comm is None or symm_mem_comm.disabled:
            # can't use skip under multiprocessing
            q.put("SymmMemCommunicator is not available or disabled.")
            return

-        inp_direct_symm_mem = torch.randint(1,
-                                            23, (test_size_elements, ),
-                                            dtype=dtype,
-                                            device=device)
+        inp_direct_symm_mem = torch.randint(
+            1, 23, (test_size_elements,), dtype=dtype, device=device
+        )
        if not symm_mem_comm.should_use_symm_mem(inp_direct_symm_mem):
            # can't use skip under multiprocessing
-            q.put(
-                "SymmMemCommunicator isn't used for this world and input size."
-            )
+            q.put("SymmMemCommunicator isn't used for this world and input size.")
            return

        original_inp_direct_symm_mem = inp_direct_symm_mem.clone()
@@ -78,42 +78,37 @@ def symm_mem_allreduce_worker(local_rank: int, world_size: int, q: mp.Queue):

        group = get_tp_group().device_group
        dist.all_reduce(original_inp_direct_symm_mem, group=group)
-        torch.testing.assert_close(out_direct_symm_mem,
-                                   original_inp_direct_symm_mem,
-                                   atol=2.5,
-                                   rtol=0.1)
+        torch.testing.assert_close(
+            out_direct_symm_mem, original_inp_direct_symm_mem, atol=2.5, rtol=0.1
+        )

        # Test tensor_model_parallel_all_reduce which should use symm_mem
-        inp_tensor_parallel = torch.randint(-23,
-                                            1, (test_size_elements, ),
-                                            dtype=dtype,
-                                            device=device)
+        inp_tensor_parallel = torch.randint(
+            -23, 1, (test_size_elements,), dtype=dtype, device=device
+        )
        original_inp_tensor_parallel = inp_tensor_parallel.clone()
-        out_tensor_parallel = tensor_model_parallel_all_reduce(
-            inp_tensor_parallel)
+        out_tensor_parallel = tensor_model_parallel_all_reduce(inp_tensor_parallel)
        dist.all_reduce(original_inp_tensor_parallel, group=group)
-        torch.testing.assert_close(out_tensor_parallel,
-                                   original_inp_tensor_parallel,
-                                   atol=2.5,
-                                   rtol=0.1)
+        torch.testing.assert_close(
+            out_tensor_parallel, original_inp_tensor_parallel, atol=2.5, rtol=0.1
+        )


@pytest.mark.skipif(
    not current_platform.is_cuda(),
-    reason="SymmMemAllreduce is only available for CUDA platforms.")
+    reason="SymmMemAllreduce is only available for CUDA platforms.",
+)
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("pipeline_parallel_size", [1])
-@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"],
-                    reason="Only test on CUDA")
-def test_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch, tp_size,
-                            pipeline_parallel_size):
+@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
+def test_symm_mem_allreduce(
+    monkeypatch: pytest.MonkeyPatch, tp_size, pipeline_parallel_size
+):
    world_size = tp_size * pipeline_parallel_size
    if world_size > torch.cuda.device_count():
        pytest.skip("Not enough GPUs to run the test.")
-    q = mp.get_context('spawn').Queue()
-    mp.spawn(symm_mem_allreduce_worker,
-             args=(world_size, q),
-             nprocs=world_size)
+    q = mp.get_context("spawn").Queue()
+    mp.spawn(symm_mem_allreduce_worker, args=(world_size, q), nprocs=world_size)
    try:
        val = q.get(timeout=1)
    except queue.Empty:
@@ -126,18 +121,20 @@ def test_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch, tp_size,

@pytest.mark.skipif(
    not current_platform.is_cuda(),
-    reason="SymmMemAllreduce is only available for CUDA platforms.")
-@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"],
-                    reason="Only test on CUDA")
+    reason="SymmMemAllreduce is only available for CUDA platforms.",
+)
+@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
 def test_dp_with_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch):
    world_size = 4
    if world_size > torch.cuda.device_count():
        pytest.skip("Not enough GPUs to run the test.")
    # Verify that the DataParallel runs without error
-    engine_args = EngineArgs(model="distilbert/distilgpt2",
-                             enforce_eager=True,
-                             enable_prefix_caching=True,
-                             data_parallel_size=2,
-                             tensor_parallel_size=2,
-                             data_parallel_backend="mp")
+    engine_args = EngineArgs(
+        model="distilbert/distilgpt2",
+        enforce_eager=True,
+        enable_prefix_caching=True,
+        data_parallel_size=2,
+        tensor_parallel_size=2,
+        data_parallel_backend="mp",
+    )
    LLMEngine.from_engine_args(engine_args)
--- a/tests/distributed/test_torchrun_example.py
+++ b/tests/distributed/test_torchrun_example.py
@@ -24,13 +24,15 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

 # set different `gpu_memory_utilization` and `swap_space` for different ranks,
 # to test if all ranks agree on the same kv cache configuration.
-llm = LLM(model="facebook/opt-125m",
-          tensor_parallel_size=2,
-          pipeline_parallel_size=int(os.getenv("PP_SIZE", 1)),
-          distributed_executor_backend="external_launcher",
-          gpu_memory_utilization=random.uniform(0.7, 0.9),
-          swap_space=random.randint(1, 4),
-          seed=0)
+llm = LLM(
+    model="facebook/opt-125m",
+    tensor_parallel_size=2,
+    pipeline_parallel_size=int(os.getenv("PP_SIZE", 1)),
+    distributed_executor_backend="external_launcher",
+    gpu_memory_utilization=random.uniform(0.7, 0.9),
+    swap_space=random.randint(1, 4),
+    seed=0,
+)

 outputs = llm.generate(prompts, sampling_params)

@@ -48,15 +50,14 @@ def test_consistent_across_ranks(obj):
        assert container[0] == obj


-test_consistent_across_ranks(
-    llm.llm_engine.vllm_config.cache_config.num_cpu_blocks)
-test_consistent_across_ranks(
-    llm.llm_engine.vllm_config.cache_config.num_gpu_blocks)
+test_consistent_across_ranks(llm.llm_engine.vllm_config.cache_config.num_cpu_blocks)
+test_consistent_across_ranks(llm.llm_engine.vllm_config.cache_config.num_gpu_blocks)

 # make sure we can access the model parameters from the calling process
 # of the `LLM` instance.
-params = list(llm.llm_engine.model_executor.driver_worker.worker.model_runner.
-              model.parameters())
+params = list(
+    llm.llm_engine.model_executor.driver_worker.worker.model_runner.model.parameters()
+)
 test_consistent_across_ranks(len(params))

 # all ranks should have the same outputs
@@ -65,5 +66,4 @@ for output in outputs:
    generated_text = output.outputs[0].text
    test_consistent_across_ranks(prompt)
    test_consistent_across_ranks(generated_text)
-    print(f"Rank {torch_rank}, Prompt: {prompt!r}, "
-          f"Generated text: {generated_text!r}")
+    print(f"Rank {torch_rank}, Prompt: {prompt!r}, Generated text: {generated_text!r}")
--- a/tests/distributed/test_torchrun_example_moe.py
+++ b/tests/distributed/test_torchrun_example_moe.py
@@ -24,23 +24,22 @@ dp_rank = int(os.getenv("DP_RANK", "0"))

 if dp_size > 1:
    # distribute the prompts across the data parallel ranks
-    prompts = [
-        prompt for idx, prompt in enumerate(prompts)
-        if idx % dp_size == dp_rank
-    ]
+    prompts = [prompt for idx, prompt in enumerate(prompts) if idx % dp_size == dp_rank]

 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

 # set different `gpu_memory_utilization` and `swap_space` for different ranks,
 # to test if all ranks agree on the same kv cache configuration.
-llm = LLM(model="microsoft/Phi-mini-MoE-instruct",
-          tensor_parallel_size=int(os.getenv("TP_SIZE", "1")),
-          pipeline_parallel_size=int(os.getenv("PP_SIZE", "1")),
-          enable_expert_parallel=int(os.getenv("ENABLE_EP", "0")) == 1,
-          distributed_executor_backend="external_launcher",
-          gpu_memory_utilization=random.uniform(0.7, 0.9),
-          swap_space=random.randint(1, 4),
-          seed=0)
+llm = LLM(
+    model="microsoft/Phi-mini-MoE-instruct",
+    tensor_parallel_size=int(os.getenv("TP_SIZE", "1")),
+    pipeline_parallel_size=int(os.getenv("PP_SIZE", "1")),
+    enable_expert_parallel=int(os.getenv("ENABLE_EP", "0")) == 1,
+    distributed_executor_backend="external_launcher",
+    gpu_memory_utilization=random.uniform(0.7, 0.9),
+    swap_space=random.randint(1, 4),
+    seed=0,
+)

 outputs = llm.generate(prompts, sampling_params)

@@ -54,21 +53,18 @@ def test_consistent_across_ranks(obj):
        dist.broadcast_object_list([obj], src=group.ranks[0], group=cpu_group)
    else:
        container = [None]
-        dist.broadcast_object_list(container,
-                                   src=group.ranks[0],
-                                   group=cpu_group)
+        dist.broadcast_object_list(container, src=group.ranks[0], group=cpu_group)
        assert container[0] == obj


-test_consistent_across_ranks(
-    llm.llm_engine.vllm_config.cache_config.num_cpu_blocks)
-test_consistent_across_ranks(
-    llm.llm_engine.vllm_config.cache_config.num_gpu_blocks)
+test_consistent_across_ranks(llm.llm_engine.vllm_config.cache_config.num_cpu_blocks)
+test_consistent_across_ranks(llm.llm_engine.vllm_config.cache_config.num_gpu_blocks)

 # make sure we can access the model parameters from the calling process
 # of the `LLM` instance.
-params = list(llm.llm_engine.model_executor.driver_worker.worker.model_runner.
-              model.parameters())
+params = list(
+    llm.llm_engine.model_executor.driver_worker.worker.model_runner.model.parameters()
+)
 test_consistent_across_ranks(len(params))

 # all ranks should have the same outputs
@@ -77,5 +73,4 @@ for output in outputs:
    generated_text = output.outputs[0].text
    test_consistent_across_ranks(prompt)
    test_consistent_across_ranks(generated_text)
-    print(f"Rank {group_rank}, Prompt: {prompt!r}, "
-          f"Generated text: {generated_text!r}")
+    print(f"Rank {group_rank}, Prompt: {prompt!r}, Generated text: {generated_text!r}")
--- a/tests/distributed/test_utils.py
+++ b/tests/distributed/test_utils.py
@@ -10,21 +10,22 @@ import torch
 import vllm.envs as envs
 from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
 from vllm.distributed.utils import StatelessProcessGroup
-from vllm.utils import (cuda_device_count_stateless, get_open_port,
-                        update_environment_variables)
+from vllm.utils import (
+    cuda_device_count_stateless,
+    get_open_port,
+    update_environment_variables,
+)

 from ..utils import multi_gpu_test


@ray.remote
 class _CUDADeviceCountStatelessTestActor:
-
    def get_count(self):
        return cuda_device_count_stateless()

    def set_cuda_visible_devices(self, cuda_visible_devices: str):
-        update_environment_variables(
-            {"CUDA_VISIBLE_DEVICES": cuda_visible_devices})
+        update_environment_variables({"CUDA_VISIBLE_DEVICES": cuda_visible_devices})

    def get_cuda_visible_devices(self):
        return envs.CUDA_VISIBLE_DEVICES
@@ -34,10 +35,9 @@ def test_cuda_device_count_stateless():
    """Test that cuda_device_count_stateless changes return value if
    CUDA_VISIBLE_DEVICES is changed."""
    actor = _CUDADeviceCountStatelessTestActor.options(  # type: ignore
-        num_gpus=2).remote()
-    assert len(
-        sorted(ray.get(
-            actor.get_cuda_visible_devices.remote()).split(","))) == 2
+        num_gpus=2
+    ).remote()
+    assert len(sorted(ray.get(actor.get_cuda_visible_devices.remote()).split(","))) == 2
    assert ray.get(actor.get_count.remote()) == 2
    ray.get(actor.set_cuda_visible_devices.remote("0"))
    assert ray.get(actor.get_count.remote()) == 1
@@ -46,15 +46,13 @@ def test_cuda_device_count_stateless():


 def cpu_worker(rank, WORLD_SIZE, port1, port2):
-    pg1 = StatelessProcessGroup.create(host="127.0.0.1",
-                                       port=port1,
-                                       rank=rank,
-                                       world_size=WORLD_SIZE)
+    pg1 = StatelessProcessGroup.create(
+        host="127.0.0.1", port=port1, rank=rank, world_size=WORLD_SIZE
+    )
    if rank <= 2:
-        pg2 = StatelessProcessGroup.create(host="127.0.0.1",
-                                           port=port2,
-                                           rank=rank,
-                                           world_size=3)
+        pg2 = StatelessProcessGroup.create(
+            host="127.0.0.1", port=port2, rank=rank, world_size=3
+        )
    data = torch.tensor([rank])
    data = pg1.broadcast_obj(data, src=2)
    assert data.item() == 2
@@ -68,16 +66,14 @@ def cpu_worker(rank, WORLD_SIZE, port1, port2):

 def gpu_worker(rank, WORLD_SIZE, port1, port2):
    torch.cuda.set_device(rank)
-    pg1 = StatelessProcessGroup.create(host="127.0.0.1",
-                                       port=port1,
-                                       rank=rank,
-                                       world_size=WORLD_SIZE)
+    pg1 = StatelessProcessGroup.create(
+        host="127.0.0.1", port=port1, rank=rank, world_size=WORLD_SIZE
+    )
    pynccl1 = PyNcclCommunicator(pg1, device=rank)
    if rank <= 2:
-        pg2 = StatelessProcessGroup.create(host="127.0.0.1",
-                                           port=port2,
-                                           rank=rank,
-                                           world_size=3)
+        pg2 = StatelessProcessGroup.create(
+            host="127.0.0.1", port=port2, rank=rank, world_size=3
+        )
        pynccl2 = PyNcclCommunicator(pg2, device=rank)
    data = torch.tensor([rank]).cuda()
    pynccl1.all_reduce(data)
@@ -96,10 +92,9 @@ def gpu_worker(rank, WORLD_SIZE, port1, port2):


 def broadcast_worker(rank, WORLD_SIZE, port1, port2):
-    pg1 = StatelessProcessGroup.create(host="127.0.0.1",
-                                       port=port1,
-                                       rank=rank,
-                                       world_size=WORLD_SIZE)
+    pg1 = StatelessProcessGroup.create(
+        host="127.0.0.1", port=port1, rank=rank, world_size=WORLD_SIZE
+    )
    if rank == 2:
        pg1.broadcast_obj("secret", src=2)
    else:
@@ -109,10 +104,9 @@ def broadcast_worker(rank, WORLD_SIZE, port1, port2):


 def allgather_worker(rank, WORLD_SIZE, port1, port2):
-    pg1 = StatelessProcessGroup.create(host="127.0.0.1",
-                                       port=port1,
-                                       rank=rank,
-                                       world_size=WORLD_SIZE)
+    pg1 = StatelessProcessGroup.create(
+        host="127.0.0.1", port=port1, rank=rank, world_size=WORLD_SIZE
+    )
    data = pg1.all_gather_obj(rank)
    assert data == list(range(WORLD_SIZE))
    pg1.barrier()
@@ -121,7 +115,8 @@ def allgather_worker(rank, WORLD_SIZE, port1, port2):
@pytest.mark.skip(reason="This test is flaky and prone to hang.")
@multi_gpu_test(num_gpus=4)
@pytest.mark.parametrize(
-    "worker", [cpu_worker, gpu_worker, broadcast_worker, allgather_worker])
+    "worker", [cpu_worker, gpu_worker, broadcast_worker, allgather_worker]
+)
 def test_stateless_process_group(worker):
    port1 = get_open_port()
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
@@ -129,12 +124,14 @@ def test_stateless_process_group(worker):
        port2 = get_open_port()
    WORLD_SIZE = 4
    from multiprocessing import get_context
+
    ctx = get_context("fork")
    processes = []
    for i in range(WORLD_SIZE):
        rank = i
        processes.append(
-            ctx.Process(target=worker, args=(rank, WORLD_SIZE, port1, port2)))
+            ctx.Process(target=worker, args=(rank, WORLD_SIZE, port1, port2))
+        )
    for p in processes:
        p.start()
    for p in processes: