[Feature] Expert Parallelism Load Balancer (EPLB) (#18343)
Signed-off-by: Bowen Wang <abmfy@icloud.com>
@@ -1775,6 +1775,25 @@ class ParallelConfig:
    """Backend to use for data parallel, either "mp" or "ray"."""
    enable_expert_parallel: bool = False
    """Use expert parallelism instead of tensor parallelism for MoE layers."""
    enable_eplb: bool = False
    """Enable expert parallelism load balancing for MoE layers."""
    num_redundant_experts: int = 0
    """Number of redundant experts to use for expert parallelism."""
    eplb_window_size: int = 1000
    """Window size for expert load recording."""
    eplb_step_interval: int = 3000
    """
    Interval for rearranging experts in expert parallelism.

    Note that if this is greater than the EPLB window size, only the metrics
    of the last `eplb_window_size` steps will be used for rearranging experts.
    """
    eplb_log_balancedness: bool = False
    """
    Log the balancedness each step of expert parallelism.
    This is turned off by default since it will cause communication overhead.
    """

    max_parallel_loading_workers: Optional[int] = None
    """Maximum number of parallel loading workers when loading model
    sequentially in multiple batches. To avoid RAM OOM when using tensor
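For readers skimming the diff, here is a minimal sketch of how these fields might be set when building the config by hand. The field names come from the hunk above; the direct ParallelConfig construction and the surrounding values (tensor parallel size, the specific counts) are illustrative assumptions, since in practice these fields are normally populated from engine arguments:

    from vllm.config import ParallelConfig

    # Illustrative values only; EPLB requires CUDA (see the check in the next hunk).
    parallel_config = ParallelConfig(
        tensor_parallel_size=8,
        enable_expert_parallel=True,   # MoE layers use expert parallelism
        enable_eplb=True,              # turn on the load balancer
        num_redundant_experts=16,      # extra expert slots used for replication
        eplb_window_size=1000,         # keep load metrics for the last 1000 steps
        eplb_step_interval=3000,       # rearrange experts every 3000 steps
        eplb_log_balancedness=False,   # per-step balancedness logging stays off
    )

Note that with these values (eplb_step_interval=3000 greater than eplb_window_size=1000), each rearrangement only sees the metrics of the most recent 1000 steps, as the docstring above points out.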
@@ -1913,6 +1932,20 @@ class ParallelConfig:
            os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
            logger.info("Disabling V1 multiprocessing for external launcher.")

        if self.enable_eplb:
            if not current_platform.is_cuda():
                raise ValueError(
                    "Expert parallelism load balancing is only supported on "
                    "CUDA devices now.")
            if self.num_redundant_experts < 0:
                raise ValueError(
                    "num_redundant_experts must be non-negative, but got "
                    f"{self.num_redundant_experts}.")
        else:
            if self.num_redundant_experts != 0:
                raise ValueError(
                    "num_redundant_experts is set to "
                    f"{self.num_redundant_experts} but EPLB is not enabled. "
                    "Either enable EPLB or unset num_redundant_experts.")
        if self.distributed_executor_backend is None and self.world_size > 1:
            # We use multiprocessing by default if world_size fits on the
            # current node and we aren't in a ray placement group.
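A quick sketch of how the new checks behave, assuming they run when ParallelConfig is instantiated (an assumption here; the exact call site is outside this hunk):

    from vllm.config import ParallelConfig

    # Setting num_redundant_experts without enabling EPLB is rejected.
    try:
        ParallelConfig(enable_expert_parallel=True, num_redundant_experts=4)
    except ValueError as err:
        print(err)  # num_redundant_experts is set to 4 but EPLB is not enabled. ...

    # On a CUDA host, a negative count is rejected even when EPLB is enabled.
    try:
        ParallelConfig(enable_expert_parallel=True,
                       enable_eplb=True,
                       num_redundant_experts=-1)
    except ValueError as err:
        print(err)  # num_redundant_experts must be non-negative, but got -1.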