[Performance] Add --enable-ep-weight-filter CLI option (#37351)

Signed-off-by: esmeetu <jasonailu87@gmail.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
(cherry picked from commit 761e0aa7a0)
2026-03-18 09:36:55 +08:00
parent eeabf740bb
commit faa80947f5
3 changed files with 18 additions and 1 deletion
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -138,6 +138,13 @@ class ParallelConfig:
    """Whether the deployed model is MoE (if known)."""
    enable_expert_parallel: bool = False
    """Use expert parallelism instead of tensor parallelism for MoE layers."""
+    enable_ep_weight_filter: bool = False
+    """Skip non-local expert weights during model loading when expert
+    parallelism is active.  Each rank only reads its own expert shard from
+    disk, which can drastically reduce storage I/O for MoE models with
+    per-expert weight tensors (e.g. DeepSeek, Mixtral, Kimi-K2.5).  Has no
+    effect on 3D fused-expert checkpoints (e.g. GPT-OSS) or non-MoE
+    models."""
    enable_eplb: bool = False
    """Enable expert parallelism load balancing for MoE layers."""
    eplb_config: EPLBConfig = Field(default_factory=EPLBConfig)