From 17c47fb8691f2efd7948659952c44ef167462534 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Elvir=20Crn=C4=8Devi=C4=87?=
Date: Wed, 18 Mar 2026 11:30:29 +0100
Subject: [PATCH] [Bugfix] Fix EP weight filter breaking EPLB and NVFP4 accuracy (#37322)

Signed-off-by: Elvir Crncevic
Co-authored-by: Claude Opus 4.6
Co-authored-by: Kevin H. Luu
---
 vllm/model_executor/model_loader/default_loader.py   | 7 +++++++
 vllm/model_executor/model_loader/ep_weight_filter.py | 5 +++++
 2 files changed, 12 insertions(+)

diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py
index a8d810244..5c9c97f4b 100644
--- a/vllm/model_executor/model_loader/default_loader.py
+++ b/vllm/model_executor/model_loader/default_loader.py
@@ -320,6 +320,13 @@ class DefaultModelLoader(BaseModelLoader):
         ):
             return
 
+        # When EPLB is enabled, redundant physical expert slots may map to
+        # logical experts that belong to other ranks in the default partition.
+        # The weight loader needs to see ALL logical expert weights so it can
+        # populate these redundant slots. Skip the filter entirely.
+        if parallel_config.enable_eplb:
+            return
+
         num_experts = model_config.get_num_experts()
         if num_experts <= 0:
             return
diff --git a/vllm/model_executor/model_loader/ep_weight_filter.py b/vllm/model_executor/model_loader/ep_weight_filter.py
index 1ef7f0174..190842379 100644
--- a/vllm/model_executor/model_loader/ep_weight_filter.py
+++ b/vllm/model_executor/model_loader/ep_weight_filter.py
@@ -73,4 +73,9 @@ def should_skip_weight(
     if eid is None:
         # Not an expert weight (dense / shared-expert / embedding) → keep.
         return False
+    # Only skip heavy weight tensors, never scale/metadata tensors.
+    # Scale tensors are tiny and some backends need them from ALL experts
+    # (e.g. FlashInfer NVFP4 computes a global max of activation scales).
+    if not weight_name.endswith(".weight"):
+        return False
     return eid not in local_expert_ids