[Performance] Add --enable-ep-weight-filter CLI option (#37351)
Signed-off-by: esmeetu <jasonailu87@gmail.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
(cherry picked from commit 761e0aa7a0)
This commit is contained in:
@@ -138,6 +138,13 @@ class ParallelConfig:
|
|||||||
"""Whether the deployed model is MoE (if known)."""
|
"""Whether the deployed model is MoE (if known)."""
|
||||||
enable_expert_parallel: bool = False
|
enable_expert_parallel: bool = False
|
||||||
"""Use expert parallelism instead of tensor parallelism for MoE layers."""
|
"""Use expert parallelism instead of tensor parallelism for MoE layers."""
|
||||||
enable_ep_weight_filter: bool = False
"""Skip non-local expert weights during model loading when expert
parallelism is active. Each rank only reads its own expert shard from
disk, which can drastically reduce storage I/O for MoE models with
per-expert weight tensors (e.g. DeepSeek, Mixtral, Kimi-K2.5). Has no
effect on 3D fused-expert checkpoints (e.g. GPT-OSS) or non-MoE
models."""
enable_eplb: bool = False
|
enable_eplb: bool = False
|
||||||
"""Enable expert parallelism load balancing for MoE layers."""
|
"""Enable expert parallelism load balancing for MoE layers."""
|
||||||
eplb_config: EPLBConfig = Field(default_factory=EPLBConfig)
|
eplb_config: EPLBConfig = Field(default_factory=EPLBConfig)
|
||||||
|
|||||||
@@ -419,6 +419,7 @@ class EngineArgs:
|
|||||||
data_parallel_external_lb: bool = False
|
data_parallel_external_lb: bool = False
|
||||||
data_parallel_backend: DataParallelBackend = ParallelConfig.data_parallel_backend
|
data_parallel_backend: DataParallelBackend = ParallelConfig.data_parallel_backend
|
||||||
enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
|
enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel
|
||||||
|
enable_ep_weight_filter: bool = ParallelConfig.enable_ep_weight_filter
|
||||||
moe_backend: MoEBackend = KernelConfig.moe_backend
|
moe_backend: MoEBackend = KernelConfig.moe_backend
|
||||||
all2all_backend: All2AllBackend = ParallelConfig.all2all_backend
|
all2all_backend: All2AllBackend = ParallelConfig.all2all_backend
|
||||||
enable_elastic_ep: bool = ParallelConfig.enable_elastic_ep
|
enable_elastic_ep: bool = ParallelConfig.enable_elastic_ep
|
||||||
@@ -901,6 +902,10 @@ class EngineArgs:
|
|||||||
"-ep",
|
"-ep",
|
||||||
**parallel_kwargs["enable_expert_parallel"],
|
**parallel_kwargs["enable_expert_parallel"],
|
||||||
)
|
)
|
||||||
|
parallel_group.add_argument(
|
||||||
|
"--enable-ep-weight-filter",
|
||||||
|
**parallel_kwargs["enable_ep_weight_filter"],
|
||||||
|
)
|
||||||
parallel_group.add_argument(
|
parallel_group.add_argument(
|
||||||
"--all2all-backend", **parallel_kwargs["all2all_backend"]
|
"--all2all-backend", **parallel_kwargs["all2all_backend"]
|
||||||
)
|
)
|
||||||
@@ -1727,6 +1732,7 @@ class EngineArgs:
|
|||||||
data_parallel_hybrid_lb=self.data_parallel_hybrid_lb,
|
data_parallel_hybrid_lb=self.data_parallel_hybrid_lb,
|
||||||
is_moe_model=model_config.is_moe,
|
is_moe_model=model_config.is_moe,
|
||||||
enable_expert_parallel=self.enable_expert_parallel,
|
enable_expert_parallel=self.enable_expert_parallel,
|
||||||
|
enable_ep_weight_filter=self.enable_ep_weight_filter,
|
||||||
all2all_backend=self.all2all_backend,
|
all2all_backend=self.all2all_backend,
|
||||||
enable_elastic_ep=self.enable_elastic_ep,
|
enable_elastic_ep=self.enable_elastic_ep,
|
||||||
enable_dbo=self.enable_dbo,
|
enable_dbo=self.enable_dbo,
|
||||||
|
|||||||
@@ -313,7 +313,11 @@ class DefaultModelLoader(BaseModelLoader):
|
|||||||
vllm_config = get_current_vllm_config()
|
vllm_config = get_current_vllm_config()
|
||||||
parallel_config = vllm_config.parallel_config
|
parallel_config = vllm_config.parallel_config
|
||||||
|
|
||||||
if not (model_config.is_moe and parallel_config.enable_expert_parallel):
|
if not (
|
||||||
|
model_config.is_moe
|
||||||
|
and parallel_config.enable_expert_parallel
|
||||||
|
and parallel_config.enable_ep_weight_filter
|
||||||
|
):
|
||||||
return
|
return
|
||||||
|
|
||||||
num_experts = model_config.get_num_experts()
|
num_experts = model_config.get_num_experts()
|
||||||
|
|||||||
Reference in New Issue
Block a user