[Performance] Add --enable-ep-weight-filter CLI option (#37351)
Signed-off-by: esmeetu <jasonailu87@gmail.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
(cherry picked from commit 761e0aa7a0)
@@ -138,6 +138,13 @@ class ParallelConfig:
     """Whether the deployed model is MoE (if known)."""
     enable_expert_parallel: bool = False
     """Use expert parallelism instead of tensor parallelism for MoE layers."""
+    enable_ep_weight_filter: bool = False
+    """Skip non-local expert weights during model loading when expert
+    parallelism is active. Each rank only reads its own expert shard from
+    disk, which can drastically reduce storage I/O for MoE models with
+    per-expert weight tensors (e.g. DeepSeek, Mixtral, Kimi-K2.5). Has no
+    effect on 3D fused-expert checkpoints (e.g. GPT-OSS) or non-MoE
+    models."""
     enable_eplb: bool = False
     """Enable expert parallelism load balancing for MoE layers."""
     eplb_config: EPLBConfig = Field(default_factory=EPLBConfig)
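The docstring describes a load-time filter: with expert parallelism active, each EP rank owns a slice of the routed experts, so weight tensors belonging to other ranks' experts never need to be read from disk. Below is a minimal sketch of that idea, not the vLLM implementation; the helper names (local_expert_range, is_local_weight), the contiguous expert partitioning, and the ".experts.<idx>." weight-name pattern are assumptions made for illustration. The new flag would presumably be passed on the command line as --enable-ep-weight-filter alongside --enable-expert-parallel.

# Illustrative sketch only -- not the vLLM loader code. It shows the idea
# behind the flag: each EP rank keeps non-expert weights plus the weights
# of the experts it owns, and skips everything else at load time.
import re

# Hypothetical pattern for per-expert weight names such as
# "model.layers.3.mlp.experts.17.down_proj.weight".
_EXPERT_RE = re.compile(r"\.experts\.(\d+)\.")


def local_expert_range(num_experts: int, ep_size: int, ep_rank: int) -> range:
    """Contiguous slice of expert indices owned by this EP rank (assumed layout)."""
    per_rank = num_experts // ep_size
    start = ep_rank * per_rank
    return range(start, start + per_rank)


def is_local_weight(name: str, local_experts: range) -> bool:
    """Keep non-expert weights and only this rank's expert weights."""
    match = _EXPERT_RE.search(name)
    if match is None:
        return True  # not a per-expert tensor (attention, norms, router, ...)
    return int(match.group(1)) in local_experts


# Example: rank 1 of 4 with 64 routed experts keeps experts 16..31.
local = local_expert_range(num_experts=64, ep_size=4, ep_rank=1)
names = [
    "model.layers.0.self_attn.q_proj.weight",
    "model.layers.0.mlp.experts.5.down_proj.weight",
    "model.layers.0.mlp.experts.20.down_proj.weight",
]
kept = [n for n in names if is_local_weight(n, local)]
# kept -> the attention weight and the expert-20 weight only

Filtering by name before reading tensor data is what makes the saving possible for checkpoints that store one tensor per expert; a fused 3D expert tensor (as noted for GPT-OSS) has no per-expert names to filter, which is why the docstring says the option has no effect there.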