Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
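For context (not part of the diff below): this change replaces yapf (code formatting) and isort (import sorting) with ruff, whose formatter and "I" lint rules cover both jobs. A minimal sketch of the kind of pyproject.toml configuration such a switch implies is shown here; the line length and rule selection are illustrative assumptions, not values taken from this commit:

    [tool.ruff]
    line-length = 88          # assumed for illustration; the project's real setting may differ

    [tool.ruff.lint]
    select = ["E", "F", "I"]  # "I" enables isort-style import sorting

    [tool.ruff.format]
    # ruff's built-in formatter takes over the role previously filled by yapf

With a configuration along these lines, running "ruff format ." and "ruff check --fix ." produces the kind of rewrapping seen in the hunks below: aligned continuation lines become 4-space indents, closing brackets move to their own line, and trailing commas are added to multi-line calls.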
@@ -161,9 +161,9 @@ class ParallelConfig:
     placement_group: Optional[PlacementGroup] = None
     """ray distributed model workers placement group."""

-    distributed_executor_backend: Optional[Union[str,
-                                                 DistributedExecutorBackend,
-                                                 type[ExecutorBase]]] = None
+    distributed_executor_backend: Optional[
+        Union[str, DistributedExecutorBackend, type[ExecutorBase]]
+    ] = None
     """Backend to use for distributed model
     workers, either "ray" or "mp" (multiprocessing). If the product
     of pipeline_parallel_size and tensor_parallel_size is less than
@@ -253,7 +253,8 @@ class ParallelConfig:
         from torch.distributed import DistNetworkError

         from vllm.distributed.utils import (
-            stateless_init_torch_distributed_process_group)
+            stateless_init_torch_distributed_process_group,
+        )

         max_retries = 5
         last_exc: Optional[Exception] = None
@@ -265,12 +266,12 @@ class ParallelConfig:
                     self.get_next_dp_init_port(),
                     self.data_parallel_rank,
                     self.data_parallel_size,
-                    backend="gloo")
+                    backend="gloo",
+                )
             except DistNetworkError as e:
                 # We only want to retry when the root cause is EADDRINUSE.
                 if "EADDRINUSE" in str(e):
-                    logger.warning(
-                        "Address already in use. Retrying with a new port.")
+                    logger.warning("Address already in use. Retrying with a new port.")
                     last_exc = e
                     continue  # try again with a new port
                 raise e
@@ -290,19 +291,22 @@ class ParallelConfig:
     # Not needed for pplx-kernels as it can handle duplicate input tokens.
     @property
     def use_sequence_parallel_moe(self) -> bool:
-        return (envs.VLLM_ALL2ALL_BACKEND
-                in ("allgather_reducescatter", "naive",
-                    "deepep_high_throughput", "deepep_low_latency")
-                and self.enable_expert_parallel
-                and self.tensor_parallel_size > 1
-                and self.data_parallel_size > 1)
+        return (
+            envs.VLLM_ALL2ALL_BACKEND
+            in (
+                "allgather_reducescatter",
+                "naive",
+                "deepep_high_throughput",
+                "deepep_low_latency",
+            )
+            and self.enable_expert_parallel
+            and self.tensor_parallel_size > 1
+            and self.data_parallel_size > 1
+        )

     @staticmethod
-    def has_unfinished_dp(dp_group: ProcessGroup,
-                          has_unfinished: bool) -> bool:
-        tensor = torch.tensor([has_unfinished],
-                              dtype=torch.int32,
-                              device="cpu")
+    def has_unfinished_dp(dp_group: ProcessGroup, has_unfinished: bool) -> bool:
+        tensor = torch.tensor([has_unfinished], dtype=torch.int32, device="cpu")
         # dp rank 0: has_unfinished_seqs=True
         # dp rank 1: has_unfinished_seqs=False
         # aggregated: has_unfinished_seqs=True
@@ -312,13 +316,10 @@ class ParallelConfig:
         return aggregated_has_unfinished

     @staticmethod
-    def sync_kv_cache_memory_size(dp_group: ProcessGroup,
-                                  kv_cache_memory: int) -> int:
+    def sync_kv_cache_memory_size(dp_group: ProcessGroup, kv_cache_memory: int) -> int:
         if kv_cache_memory == -1:
             kv_cache_memory = torch.iinfo(torch.int64).max
-        tensor = torch.tensor([kv_cache_memory],
-                              dtype=torch.int64,
-                              device="cpu")
+        tensor = torch.tensor([kv_cache_memory], dtype=torch.int64, device="cpu")
         # we cannot use broadcast for stateless dp group since it depends
         # on global rank
         torch.distributed.all_reduce(tensor, op=ReduceOp.MIN, group=dp_group)
@@ -343,38 +344,40 @@ class ParallelConfig:
     def __post_init__(self) -> None:
         # Forward deprecated fields to their new location
         if self.num_redundant_experts is not None:
-            self.eplb_config.num_redundant_experts = (
-                self.num_redundant_experts)
+            self.eplb_config.num_redundant_experts = self.num_redundant_experts
             logger.warning_once(
                 "num_redundant_experts is deprecated and has been replaced "
                 "with eplb_config.num_redundant_experts. This will be removed "
                 "in v0.12.0. Changing this field after initialization will "
-                "have no effect.")
+                "have no effect."
+            )
         if self.eplb_window_size is not None:
             self.eplb_config.window_size = self.eplb_window_size
             logger.warning_once(
                 "eplb_window_size is deprecated and has been replaced "
                 "with eplb_config.window_size. This will be removed "
                 "in v0.12.0. Changing this field after initialization will "
-                "have no effect.")
+                "have no effect."
+            )
         if self.eplb_step_interval is not None:
             self.eplb_config.step_interval = self.eplb_step_interval
             logger.warning_once(
                 "eplb_step_interval is deprecated and has been replaced "
                 "with eplb_config.step_interval. This will be removed "
                 "in v0.12.0. Changing this field after initialization will "
-                "have no effect.")
+                "have no effect."
+            )
         if self.eplb_log_balancedness is not None:
             self.eplb_config.log_balancedness = self.eplb_log_balancedness
             logger.warning_once(
                 "eplb_log_balancedness is deprecated and has been replaced "
                 "with eplb_config.log_balancedness. This will be removed "
                 "in v0.12.0. Changing this field after initialization will "
-                "have no effect.")
+                "have no effect."
+            )

         # Continue with the rest of the initialization
-        self.world_size = self.pipeline_parallel_size * \
-            self.tensor_parallel_size
+        self.world_size = self.pipeline_parallel_size * self.tensor_parallel_size

         if self.distributed_executor_backend == "external_launcher":
             logger.info("Using external launcher for distributed inference.")
@@ -383,26 +386,30 @@ class ParallelConfig:
         if self.data_parallel_size_local > self.data_parallel_size:
             raise ValueError(
                 f"data_parallel_size_local ({self.data_parallel_size_local}) "
-                f"must be <= data_parallel_size ({self.data_parallel_size})")
+                f"must be <= data_parallel_size ({self.data_parallel_size})"
+            )

         if self.data_parallel_size > 1 or self.data_parallel_size_local == 0:
             # Data parallel was specified in the engine args.
             if self.distributed_executor_backend == "external_launcher":
                 # For external launcher,
                 # we need to set the data parallel rank automatically
-                self.data_parallel_rank = int(os.environ["RANK"]) \
-                    // (self.world_size // self.data_parallel_size)
-                logger.info("Set data_parallel_rank to %d automatically.",
-                            self.data_parallel_rank)
+                self.data_parallel_rank = int(os.environ["RANK"]) // (
+                    self.world_size // self.data_parallel_size
+                )
+                logger.info(
+                    "Set data_parallel_rank to %d automatically.",
+                    self.data_parallel_rank,
+                )
             if not self._data_parallel_master_port_list:
                 self._data_parallel_master_port_list = get_open_ports_list(5)
-            self.data_parallel_master_port = \
-                self._data_parallel_master_port_list.pop()
+            self.data_parallel_master_port = self._data_parallel_master_port_list.pop()

             if not (0 <= self.data_parallel_rank < self.data_parallel_size):
                 raise ValueError(
                     f"data_parallel_rank ({self.data_parallel_rank})"
-                    f" must be in the range [0, {self.data_parallel_size})")
+                    f" must be in the range [0, {self.data_parallel_size})"
+                )
         else:
             # Otherwise fall back to env vars (e.g. for offline SPMD case).
             self.data_parallel_size = envs.VLLM_DP_SIZE
@@ -412,8 +419,10 @@ class ParallelConfig:
             self.data_parallel_master_port = envs.VLLM_DP_MASTER_PORT

             if self.data_parallel_external_lb:
-                raise ValueError("data_parallel_external_lb can only "
-                                 "be set when data_parallel_size > 1")
+                raise ValueError(
+                    "data_parallel_external_lb can only "
+                    "be set when data_parallel_size > 1"
+                )

         if self.distributed_executor_backend == "external_launcher":
             os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
@@ -423,14 +432,15 @@ class ParallelConfig:
             if not current_platform.is_cuda():
                 raise ValueError(
                     "Expert parallelism load balancing is only supported on "
-                    "CUDA devices now.")
+                    "CUDA devices now."
+                )
             if self.eplb_config.num_redundant_experts < 0:
                 raise ValueError(
                     "num_redundant_experts must be non-negative, but got "
-                    f"{self.eplb_config.num_redundant_experts}.")
+                    f"{self.eplb_config.num_redundant_experts}."
+                )
             if not self.enable_expert_parallel:
-                raise ValueError(
-                    "enable_expert_parallel must be True to use EPLB.")
+                raise ValueError("enable_expert_parallel must be True to use EPLB.")
             if self.tensor_parallel_size * self.data_parallel_size <= 1:
                 raise ValueError(
                     "EPLB requires tensor_parallel_size or data_parallel_size "
@@ -443,41 +453,50 @@ class ParallelConfig:
                     "num_redundant_experts is set to "
                     f"{self.eplb_config.num_redundant_experts} but EPLB is not "
                     "enabled. Either enable EPLB or unset "
-                    "num_redundant_experts.")
+                    "num_redundant_experts."
+                )
         if self.distributed_executor_backend is None and self.world_size > 1:
             # We use multiprocessing by default if world_size fits on the
             # current node and we aren't in a ray placement group.

             from vllm.executor import ray_utils
+
             backend: DistributedExecutorBackend = "mp"
             ray_found = ray_utils.ray_is_available()
             if current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD:
                 backend = "uni"
-            elif (current_platform.is_cuda()
-                  and cuda_device_count_stateless() < self.world_size):
+            elif (
+                current_platform.is_cuda()
+                and cuda_device_count_stateless() < self.world_size
+            ):
                 if not ray_found:
-                    raise ValueError("Unable to load Ray: "
-                                     f"{ray_utils.ray_import_err}. Ray is "
-                                     "required for multi-node inference, "
-                                     "please install Ray with `pip install "
-                                     "ray`.")
+                    raise ValueError(
+                        "Unable to load Ray: "
+                        f"{ray_utils.ray_import_err}. Ray is "
+                        "required for multi-node inference, "
+                        "please install Ray with `pip install "
+                        "ray`."
+                    )
                 backend = "ray"
             elif self.data_parallel_backend == "ray":
-                logger.info("Using ray distributed inference because "
-                            "data_parallel_backend is ray")
+                logger.info(
+                    "Using ray distributed inference because "
+                    "data_parallel_backend is ray"
+                )
                 backend = "ray"
             elif ray_found:
                 if self.placement_group:
                     backend = "ray"
                 else:
                     from ray import is_initialized as ray_is_initialized
+
                     if ray_is_initialized():
                         from ray.util import get_current_placement_group
+
                         if get_current_placement_group():
                             backend = "ray"
             self.distributed_executor_backend = backend
-            logger.debug("Defaulting to use %s for distributed inference",
-                         backend)
+            logger.debug("Defaulting to use %s for distributed inference", backend)

         if self.distributed_executor_backend is None and self.world_size == 1:
             self.distributed_executor_backend = "uni"
@@ -486,39 +505,50 @@ class ParallelConfig:
             raise ValueError(
                 "Invalid value of `_api_process_rank`. "
                 f"Expected to be `-1` or `[0, {self._api_process_count})`, "
-                f"but found: {self._api_process_rank}")
+                f"but found: {self._api_process_rank}"
+            )

     @property
     def use_ray(self) -> bool:
         return self.distributed_executor_backend == "ray" or (
             isinstance(self.distributed_executor_backend, type)
-            and getattr(self.distributed_executor_backend, "uses_ray", False))
+            and getattr(self.distributed_executor_backend, "uses_ray", False)
+        )

-    @model_validator(mode='after')
+    @model_validator(mode="after")
     def _verify_args(self) -> Self:
         # Lazy import to avoid circular import
         from vllm.executor.executor_base import ExecutorBase
         from vllm.platforms import current_platform
-        if self.distributed_executor_backend is not None and not isinstance(
-                self.distributed_executor_backend, str) and not (isinstance(
-                    self.distributed_executor_backend, type) and issubclass(
-                        self.distributed_executor_backend, ExecutorBase)):
+
+        if (
+            self.distributed_executor_backend is not None
+            and not isinstance(self.distributed_executor_backend, str)
+            and not (
+                isinstance(self.distributed_executor_backend, type)
+                and issubclass(self.distributed_executor_backend, ExecutorBase)
+            )
+        ):
             raise ValueError(
                 "Unrecognized distributed executor backend "
                 f"{self.distributed_executor_backend}. Supported "
                 "values are 'ray', 'mp' 'uni', 'external_launcher', "
-                " custom ExecutorBase subclass or its import path.")
+                " custom ExecutorBase subclass or its import path."
+            )
         if self.use_ray:
             from vllm.executor import ray_utils
+
             ray_utils.assert_ray_available()

         if not current_platform.use_custom_allreduce():
             self.disable_custom_all_reduce = True
             logger.debug(
                 "Disabled the custom all-reduce kernel because it is not "
-                "supported on current platform.")
+                "supported on current platform."
+            )
         if self.ray_workers_use_nsight and not self.use_ray:
-            raise ValueError("Unable to use nsight profiling unless workers "
-                             "run with Ray.")
+            raise ValueError(
+                "Unable to use nsight profiling unless workers run with Ray."
+            )

         return self