Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Harry Mellor, 2025-10-05 15:06:22 +01:00, committed by GitHub
parent 17edd8a807, commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions

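Every hunk below follows the same pattern: code written in the project's previous yapf + isort style is rewritten by ruff's Black-compatible formatter, with ruff also taking over import sorting. A minimal, self-contained illustration of the two styles, using an invented function rather than code from the repository:

```python
# Illustration only: the function and values are made up for this example.
def describe(name: str, replicas: int, backend: str) -> str:
    return f"{name}: replicas={replicas}, backend={backend}"


# Before (yapf): continuation arguments aligned under the first argument,
# closing parenthesis attached to the last one.
summary = describe("parallel-config",
                   2,
                   "mp")

# After (ruff format): a call that fits within the line-length limit is
# collapsed onto one line...
summary = describe("parallel-config", 2, "mp")

# ...and a call that is too long, or that keeps a "magic" trailing comma, is
# exploded to one argument per line with the closing parenthesis dedented.
summary = describe(
    "parallel-config",
    2,
    "mp",
)
```

That is why most changes in this diff either collapse multi-line calls onto a single line or move the closing bracket onto its own line and add a trailing comma.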

@@ -161,9 +161,9 @@ class ParallelConfig:
     placement_group: Optional[PlacementGroup] = None
     """ray distributed model workers placement group."""

-    distributed_executor_backend: Optional[Union[str,
-                                                 DistributedExecutorBackend,
-                                                 type[ExecutorBase]]] = None
+    distributed_executor_backend: Optional[
+        Union[str, DistributedExecutorBackend, type[ExecutorBase]]
+    ] = None
     """Backend to use for distributed model
     workers, either "ray" or "mp" (multiprocessing). If the product
     of pipeline_parallel_size and tensor_parallel_size is less than
@@ -253,7 +253,8 @@ class ParallelConfig:
         from torch.distributed import DistNetworkError

         from vllm.distributed.utils import (
-            stateless_init_torch_distributed_process_group)
+            stateless_init_torch_distributed_process_group,
+        )

         max_retries = 5
         last_exc: Optional[Exception] = None
@@ -265,12 +266,12 @@ class ParallelConfig:
                     self.get_next_dp_init_port(),
                     self.data_parallel_rank,
                     self.data_parallel_size,
-                    backend="gloo")
+                    backend="gloo",
+                )
             except DistNetworkError as e:
                 # We only want to retry when the root cause is EADDRINUSE.
                 if "EADDRINUSE" in str(e):
-                    logger.warning(
-                        "Address already in use. Retrying with a new port.")
+                    logger.warning("Address already in use. Retrying with a new port.")
                     last_exc = e
                     continue  # try again with a new port
                 raise e
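The hunk above only reformats a retry loop: stateless data-parallel process-group initialization is retried when a port race surfaces as EADDRINUSE. A standalone sketch of that retry pattern, assuming a plain socket bind in place of vLLM's stateless_init_torch_distributed_process_group helper:

```python
# Hedged sketch of the retry-on-EADDRINUSE pattern; not vLLM code.
import errno
import socket
from typing import Optional


def bind_with_retry(candidate_ports: list[int], max_retries: int = 5) -> socket.socket:
    last_exc: Optional[OSError] = None
    for port in candidate_ports[:max_retries]:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            sock.bind(("127.0.0.1", port))
            return sock  # success: the caller owns the bound socket
        except OSError as e:
            sock.close()
            if e.errno == errno.EADDRINUSE:
                last_exc = e
                continue  # only the address-in-use case is worth retrying
            raise  # any other error is fatal, as in the code above
    raise RuntimeError("no free port found") from last_exc
```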
@@ -290,19 +291,22 @@ class ParallelConfig:
     # Not needed for pplx-kernels as it can handle duplicate input tokens.
     @property
     def use_sequence_parallel_moe(self) -> bool:
-        return (envs.VLLM_ALL2ALL_BACKEND
-                in ("allgather_reducescatter", "naive",
-                    "deepep_high_throughput", "deepep_low_latency")
-                and self.enable_expert_parallel
-                and self.tensor_parallel_size > 1
-                and self.data_parallel_size > 1)
+        return (
+            envs.VLLM_ALL2ALL_BACKEND
+            in (
+                "allgather_reducescatter",
+                "naive",
+                "deepep_high_throughput",
+                "deepep_low_latency",
+            )
+            and self.enable_expert_parallel
+            and self.tensor_parallel_size > 1
+            and self.data_parallel_size > 1
+        )

     @staticmethod
-    def has_unfinished_dp(dp_group: ProcessGroup,
-                          has_unfinished: bool) -> bool:
-        tensor = torch.tensor([has_unfinished],
-                              dtype=torch.int32,
-                              device="cpu")
+    def has_unfinished_dp(dp_group: ProcessGroup, has_unfinished: bool) -> bool:
+        tensor = torch.tensor([has_unfinished], dtype=torch.int32, device="cpu")
         # dp rank 0: has_unfinished_seqs=True
         # dp rank 1: has_unfinished_seqs=False
         # aggregated: has_unfinished_seqs=True
@@ -312,13 +316,10 @@ class ParallelConfig:
         return aggregated_has_unfinished

     @staticmethod
-    def sync_kv_cache_memory_size(dp_group: ProcessGroup,
-                                  kv_cache_memory: int) -> int:
+    def sync_kv_cache_memory_size(dp_group: ProcessGroup, kv_cache_memory: int) -> int:
         if kv_cache_memory == -1:
             kv_cache_memory = torch.iinfo(torch.int64).max
-        tensor = torch.tensor([kv_cache_memory],
-                              dtype=torch.int64,
-                              device="cpu")
+        tensor = torch.tensor([kv_cache_memory], dtype=torch.int64, device="cpu")
         # we cannot use broadcast for stateless dp group since it depends
         # on global rank
         torch.distributed.all_reduce(tensor, op=ReduceOp.MIN, group=dp_group)
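The two static methods reformatted above both reduce a one-element CPU tensor across the data-parallel gloo group: ReduceOp.MAX answers "does any rank still have unfinished requests?", while ReduceOp.MIN picks the smallest KV-cache budget seen on any rank. A runnable single-process sketch of those collectives (world_size=1 and an arbitrary local port, so the reductions are trivial, but the API calls are the same):

```python
# Single-process sketch of the MAX/MIN all-reduce patterns; assumes only torch.
import torch
import torch.distributed as dist

dist.init_process_group(
    backend="gloo", init_method="tcp://127.0.0.1:29501", rank=0, world_size=1
)

# has_unfinished_dp: any rank reporting work keeps the whole DP group stepping.
has_unfinished = torch.tensor([True], dtype=torch.int32, device="cpu")
dist.all_reduce(has_unfinished, op=dist.ReduceOp.MAX)
print(bool(has_unfinished.item()))

# sync_kv_cache_memory_size: the smallest budget across ranks wins.
kv_cache_memory = torch.tensor([8 * 2**30], dtype=torch.int64, device="cpu")
dist.all_reduce(kv_cache_memory, op=dist.ReduceOp.MIN)
print(int(kv_cache_memory.item()))

dist.destroy_process_group()
```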
@@ -343,38 +344,40 @@ class ParallelConfig:
     def __post_init__(self) -> None:
         # Forward deprecated fields to their new location
         if self.num_redundant_experts is not None:
-            self.eplb_config.num_redundant_experts = (
-                self.num_redundant_experts)
+            self.eplb_config.num_redundant_experts = self.num_redundant_experts
             logger.warning_once(
                 "num_redundant_experts is deprecated and has been replaced "
                 "with eplb_config.num_redundant_experts. This will be removed "
                 "in v0.12.0. Changing this field after initialization will "
-                "have no effect.")
+                "have no effect."
+            )
         if self.eplb_window_size is not None:
             self.eplb_config.window_size = self.eplb_window_size
             logger.warning_once(
                 "eplb_window_size is deprecated and has been replaced "
                 "with eplb_config.window_size. This will be removed "
                 "in v0.12.0. Changing this field after initialization will "
-                "have no effect.")
+                "have no effect."
+            )
         if self.eplb_step_interval is not None:
             self.eplb_config.step_interval = self.eplb_step_interval
             logger.warning_once(
                 "eplb_step_interval is deprecated and has been replaced "
                 "with eplb_config.step_interval. This will be removed "
                 "in v0.12.0. Changing this field after initialization will "
-                "have no effect.")
+                "have no effect."
+            )
         if self.eplb_log_balancedness is not None:
             self.eplb_config.log_balancedness = self.eplb_log_balancedness
             logger.warning_once(
                 "eplb_log_balancedness is deprecated and has been replaced "
                 "with eplb_config.log_balancedness. This will be removed "
                 "in v0.12.0. Changing this field after initialization will "
-                "have no effect.")
+                "have no effect."
+            )

         # Continue with the rest of the initialization
-        self.world_size = self.pipeline_parallel_size * \
-            self.tensor_parallel_size
+        self.world_size = self.pipeline_parallel_size * self.tensor_parallel_size

         if self.distributed_executor_backend == "external_launcher":
             logger.info("Using external launcher for distributed inference.")
@@ -383,26 +386,30 @@ class ParallelConfig:
         if self.data_parallel_size_local > self.data_parallel_size:
             raise ValueError(
                 f"data_parallel_size_local ({self.data_parallel_size_local}) "
-                f"must be <= data_parallel_size ({self.data_parallel_size})")
+                f"must be <= data_parallel_size ({self.data_parallel_size})"
+            )

         if self.data_parallel_size > 1 or self.data_parallel_size_local == 0:
             # Data parallel was specified in the engine args.
             if self.distributed_executor_backend == "external_launcher":
                 # For external launcher,
                 # we need to set the data parallel rank automatically
-                self.data_parallel_rank = int(os.environ["RANK"]) \
-                    // (self.world_size // self.data_parallel_size)
-                logger.info("Set data_parallel_rank to %d automatically.",
-                            self.data_parallel_rank)
+                self.data_parallel_rank = int(os.environ["RANK"]) // (
+                    self.world_size // self.data_parallel_size
+                )
+                logger.info(
+                    "Set data_parallel_rank to %d automatically.",
+                    self.data_parallel_rank,
+                )
             if not self._data_parallel_master_port_list:
                 self._data_parallel_master_port_list = get_open_ports_list(5)
-            self.data_parallel_master_port = \
-                self._data_parallel_master_port_list.pop()
+            self.data_parallel_master_port = self._data_parallel_master_port_list.pop()

             if not (0 <= self.data_parallel_rank < self.data_parallel_size):
                 raise ValueError(
                     f"data_parallel_rank ({self.data_parallel_rank})"
-                    f" must be in the range [0, {self.data_parallel_size})")
+                    f" must be in the range [0, {self.data_parallel_size})"
+                )
         else:
             # Otherwise fall back to env vars (e.g. for offline SPMD case).
             self.data_parallel_size = envs.VLLM_DP_SIZE
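In the rank mapping reformatted above, consecutive blocks of world_size // data_parallel_size global RANK values share one data-parallel rank. A small worked example with made-up sizes:

```python
# Worked example of rank // (world_size // data_parallel_size); sizes invented.
world_size = 8
data_parallel_size = 2
ranks_per_dp_rank = world_size // data_parallel_size  # 4

for rank in range(world_size):
    print(rank, "->", rank // ranks_per_dp_rank)
# ranks 0-3 map to data-parallel rank 0, ranks 4-7 to data-parallel rank 1
```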
@@ -412,8 +419,10 @@ class ParallelConfig:
             self.data_parallel_master_port = envs.VLLM_DP_MASTER_PORT

             if self.data_parallel_external_lb:
-                raise ValueError("data_parallel_external_lb can only "
-                                 "be set when data_parallel_size > 1")
+                raise ValueError(
+                    "data_parallel_external_lb can only "
+                    "be set when data_parallel_size > 1"
+                )

         if self.distributed_executor_backend == "external_launcher":
             os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
@@ -423,14 +432,15 @@ class ParallelConfig:
             if not current_platform.is_cuda():
                 raise ValueError(
                     "Expert parallelism load balancing is only supported on "
-                    "CUDA devices now.")
+                    "CUDA devices now."
+                )
             if self.eplb_config.num_redundant_experts < 0:
                 raise ValueError(
                     "num_redundant_experts must be non-negative, but got "
-                    f"{self.eplb_config.num_redundant_experts}.")
+                    f"{self.eplb_config.num_redundant_experts}."
+                )
             if not self.enable_expert_parallel:
-                raise ValueError(
-                    "enable_expert_parallel must be True to use EPLB.")
+                raise ValueError("enable_expert_parallel must be True to use EPLB.")
             if self.tensor_parallel_size * self.data_parallel_size <= 1:
                 raise ValueError(
                     "EPLB requires tensor_parallel_size or data_parallel_size "
@@ -443,41 +453,50 @@ class ParallelConfig:
                     "num_redundant_experts is set to "
                     f"{self.eplb_config.num_redundant_experts} but EPLB is not "
                     "enabled. Either enable EPLB or unset "
-                    "num_redundant_experts.")
+                    "num_redundant_experts."
+                )

         if self.distributed_executor_backend is None and self.world_size > 1:
             # We use multiprocessing by default if world_size fits on the
             # current node and we aren't in a ray placement group.
             from vllm.executor import ray_utils
             backend: DistributedExecutorBackend = "mp"
             ray_found = ray_utils.ray_is_available()
             if current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD:
                 backend = "uni"
-            elif (current_platform.is_cuda()
-                  and cuda_device_count_stateless() < self.world_size):
+            elif (
+                current_platform.is_cuda()
+                and cuda_device_count_stateless() < self.world_size
+            ):
                 if not ray_found:
-                    raise ValueError("Unable to load Ray: "
-                                     f"{ray_utils.ray_import_err}. Ray is "
-                                     "required for multi-node inference, "
-                                     "please install Ray with `pip install "
-                                     "ray`.")
+                    raise ValueError(
+                        "Unable to load Ray: "
+                        f"{ray_utils.ray_import_err}. Ray is "
+                        "required for multi-node inference, "
+                        "please install Ray with `pip install "
+                        "ray`."
+                    )
                 backend = "ray"
             elif self.data_parallel_backend == "ray":
-                logger.info("Using ray distributed inference because "
-                            "data_parallel_backend is ray")
+                logger.info(
+                    "Using ray distributed inference because "
+                    "data_parallel_backend is ray"
+                )
                 backend = "ray"
             elif ray_found:
                 if self.placement_group:
                     backend = "ray"
                 else:
                     from ray import is_initialized as ray_is_initialized
                     if ray_is_initialized():
                         from ray.util import get_current_placement_group
                         if get_current_placement_group():
                             backend = "ray"
             self.distributed_executor_backend = backend
-            logger.debug("Defaulting to use %s for distributed inference",
-                         backend)
+            logger.debug("Defaulting to use %s for distributed inference", backend)

         if self.distributed_executor_backend is None and self.world_size == 1:
             self.distributed_executor_backend = "uni"
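The backend-defaulting block above is easier to follow as a flat decision list. A hedged restatement as a standalone function (not part of vLLM; the platform and Ray probes are reduced to plain booleans):

```python
# Condensed restatement of the defaulting logic; parameter names are invented.
def default_backend(
    world_size: int,
    is_tpu_spmd: bool,
    cuda_undersubscribed: bool,  # CUDA platform with fewer visible GPUs than world_size
    ray_available: bool,
    data_parallel_backend: str,
    in_placement_group: bool,
) -> str:
    if world_size == 1:
        return "uni"
    if is_tpu_spmd:
        return "uni"
    if cuda_undersubscribed:
        if not ray_available:
            raise ValueError("Ray is required for multi-node inference")
        return "ray"
    if data_parallel_backend == "ray":
        return "ray"
    if ray_available and in_placement_group:
        return "ray"
    return "mp"
```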
@@ -486,39 +505,50 @@ class ParallelConfig:
             raise ValueError(
                 "Invalid value of `_api_process_rank`. "
                 f"Expected to be `-1` or `[0, {self._api_process_count})`, "
-                f"but found: {self._api_process_rank}")
+                f"but found: {self._api_process_rank}"
+            )

     @property
     def use_ray(self) -> bool:
         return self.distributed_executor_backend == "ray" or (
             isinstance(self.distributed_executor_backend, type)
-            and getattr(self.distributed_executor_backend, "uses_ray", False))
+            and getattr(self.distributed_executor_backend, "uses_ray", False)
+        )

-    @model_validator(mode='after')
+    @model_validator(mode="after")
     def _verify_args(self) -> Self:
         # Lazy import to avoid circular import
         from vllm.executor.executor_base import ExecutorBase
         from vllm.platforms import current_platform

-        if self.distributed_executor_backend is not None and not isinstance(
-                self.distributed_executor_backend, str) and not (isinstance(
-                    self.distributed_executor_backend, type) and issubclass(
-                        self.distributed_executor_backend, ExecutorBase)):
+        if (
+            self.distributed_executor_backend is not None
+            and not isinstance(self.distributed_executor_backend, str)
+            and not (
+                isinstance(self.distributed_executor_backend, type)
+                and issubclass(self.distributed_executor_backend, ExecutorBase)
+            )
+        ):
             raise ValueError(
                 "Unrecognized distributed executor backend "
                 f"{self.distributed_executor_backend}. Supported "
                 "values are 'ray', 'mp' 'uni', 'external_launcher', "
-                " custom ExecutorBase subclass or its import path.")
+                " custom ExecutorBase subclass or its import path."
+            )
         if self.use_ray:
             from vllm.executor import ray_utils
             ray_utils.assert_ray_available()
         if not current_platform.use_custom_allreduce():
             self.disable_custom_all_reduce = True
             logger.debug(
                 "Disabled the custom all-reduce kernel because it is not "
-                "supported on current platform.")
+                "supported on current platform."
+            )
         if self.ray_workers_use_nsight and not self.use_ray:
-            raise ValueError("Unable to use nsight profiling unless workers "
-                             "run with Ray.")
+            raise ValueError(
+                "Unable to use nsight profiling unless workers run with Ray."
+            )

         return self