[Feat][EPLB] A novel static EPLB placement strategy for MoE models. (#23745)

Signed-off-by: bruceszchen <bruceszchen@tencent.com>
Signed-off-by: Chen Bruce <bruceszchen@tencent.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Chen Bruce <cszwwdz@vip.qq.com>
Co-authored-by: lemon412 <lemon412@foxmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Author: Chen Bruce
Date: 2025-09-16 18:55:16 +08:00
Committed by: GitHub
Parent: 27fcfe7bcf
Commit: 7ea5c73ad7
4 changed files with 265 additions and 12 deletions


@@ -34,6 +34,7 @@ from vllm.config import (BlockSize, CacheConfig, CacheDType, CompilationConfig,
                          SpeculativeConfig, TaskOption, TokenizerMode,
                          VllmConfig, get_attr_docs)
 from vllm.config.multimodal import MMCacheType, MultiModalConfig
+from vllm.config.parallel import ExpertPlacementStrategy
 from vllm.config.utils import get_field
 from vllm.logger import init_logger
 from vllm.platforms import CpuArchEnum, current_platform
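
The new import pulls `ExpertPlacementStrategy` from `vllm.config.parallel`. Its definition is not part of this hunk; judging by how it is used as a field annotation below, it is presumably a `Literal` alias along these lines (a sketch, not the file's actual contents):

```python
# Sketch of what vllm/config/parallel.py presumably defines (not shown
# in this diff): a Literal alias naming the supported placement modes.
from typing import Literal

ExpertPlacementStrategy = Literal["linear", "round_robin"]
```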
@@ -328,6 +329,8 @@ class EngineArgs:
     enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
     eplb_config: EPLBConfig = get_field(ParallelConfig, "eplb_config")
     enable_eplb: bool = ParallelConfig.enable_eplb
+    expert_placement_strategy: ExpertPlacementStrategy = \
+        ParallelConfig.expert_placement_strategy
     num_redundant_experts: int = EPLBConfig.num_redundant_experts
     eplb_window_size: int = EPLBConfig.window_size
     eplb_step_interval: int = EPLBConfig.step_interval
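
The new `EngineArgs` field takes its default from `ParallelConfig`, so the CLI surface and the programmatic API cannot drift apart. A minimal self-contained sketch of that mirroring pattern, with a simplified `ParallelConfig` standing in for vLLM's real one:

```python
# Minimal sketch of the default-mirroring pattern used above; the
# simplified ParallelConfig here is illustrative only.
from dataclasses import dataclass
from typing import Literal

ExpertPlacementStrategy = Literal["linear", "round_robin"]

@dataclass
class ParallelConfig:
    expert_placement_strategy: ExpertPlacementStrategy = "linear"

@dataclass
class EngineArgs:
    # Reuse the config dataclass's default rather than repeating it.
    expert_placement_strategy: ExpertPlacementStrategy = \
        ParallelConfig.expert_placement_strategy

assert EngineArgs().expert_placement_strategy == "linear"
```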
@@ -696,6 +699,9 @@ class EngineArgs:
                                     **parallel_kwargs["enable_eplb"])
         parallel_group.add_argument("--eplb-config",
                                     **parallel_kwargs["eplb_config"])
+        parallel_group.add_argument(
+            "--expert-placement-strategy",
+            **parallel_kwargs["expert_placement_strategy"])
         parallel_group.add_argument(
             "--num-redundant-experts",
             type=int,
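
`parallel_kwargs` (built earlier in `add_cli_args`, outside this hunk) derives the argparse keywords from the config field, so the flag picks up its choices, default, and help text from `ParallelConfig`. A hedged approximation of what the registration amounts to; the explicit kwargs below are illustrative, not vLLM's actual helper output:

```python
# Hedged sketch: a Literal-typed config field mapped to argparse kwargs,
# in the spirit of the parallel_kwargs lookup above.
import argparse
from typing import Literal, get_args

ExpertPlacementStrategy = Literal["linear", "round_robin"]

parser = argparse.ArgumentParser()
parser.add_argument(
    "--expert-placement-strategy",
    choices=get_args(ExpertPlacementStrategy),
    default="linear",
    help="How to statically place experts across expert-parallel ranks.")

args = parser.parse_args(["--expert-placement-strategy", "round_robin"])
assert args.expert_placement_strategy == "round_robin"
```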
@@ -1335,6 +1341,7 @@ class EngineArgs:
             enable_expert_parallel=self.enable_expert_parallel,
             enable_eplb=self.enable_eplb,
             eplb_config=self.eplb_config,
+            expert_placement_strategy=self.expert_placement_strategy,
             max_parallel_loading_workers=self.max_parallel_loading_workers,
             disable_custom_all_reduce=self.disable_custom_all_reduce,
             ray_workers_use_nsight=self.ray_workers_use_nsight,
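
Finally, the parsed value is threaded into `ParallelConfig`, where the expert-parallel setup can consult it when mapping experts to ranks. The placement logic itself lives in the other files of this commit; the sketch below shows the usual contrast between the two static strategies, assuming the expert count divides evenly by the EP world size (`place_experts` is a hypothetical helper, not vLLM's actual function):

```python
# Hypothetical helper contrasting the two static placement strategies.
# "linear" gives each EP rank a contiguous slice of global expert IDs;
# "round_robin" strides them, spreading consecutive (often co-activated)
# experts across ranks for better load balance.
def place_experts(num_experts: int, ep_size: int, ep_rank: int,
                  strategy: str) -> list[int]:
    experts_per_rank = num_experts // ep_size
    if strategy == "linear":
        start = ep_rank * experts_per_rank
        return list(range(start, start + experts_per_rank))
    if strategy == "round_robin":
        return list(range(ep_rank, num_experts, ep_size))
    raise ValueError(f"unknown strategy: {strategy}")

# With 8 experts over 4 ranks, rank 1 owns [2, 3] under "linear"
# but [1, 5] under "round_robin".
assert place_experts(8, 4, 1, "linear") == [2, 3]
assert place_experts(8, 4, 1, "round_robin") == [1, 5]
```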