From 17c47fb8691f2efd7948659952c44ef167462534 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Elvir=20Crn=C4=8Devi=C4=87?= <elvircrn@gmail.com>
Date: Wed, 18 Mar 2026 11:30:29 +0100
Subject: [PATCH] [Bugfix] Fix EP weight filter breaking EPLB and NVFP4
 accuracy (#37322)

Signed-off-by: Elvir Crncevic <elvircrn@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Kevin H. Luu <khluu000@gmail.com>
---
 vllm/model_executor/model_loader/default_loader.py   | 7 +++++++
 vllm/model_executor/model_loader/ep_weight_filter.py | 5 +++++
 2 files changed, 12 insertions(+)

diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py
index a8d810244..5c9c97f4b 100644
--- a/vllm/model_executor/model_loader/default_loader.py
+++ b/vllm/model_executor/model_loader/default_loader.py
@@ -320,6 +320,13 @@ class DefaultModelLoader(BaseModelLoader):
         ):
             return
 
+        # When EPLB is enabled, redundant physical expert slots may map to
+        # logical experts that belong to other ranks in the default partition.
+        # The weight loader needs to see ALL logical expert weights so it can
+        # populate these redundant slots.  Skip the filter entirely.
+        if parallel_config.enable_eplb:
+            return
+
         num_experts = model_config.get_num_experts()
         if num_experts <= 0:
             return
diff --git a/vllm/model_executor/model_loader/ep_weight_filter.py b/vllm/model_executor/model_loader/ep_weight_filter.py
index 1ef7f0174..190842379 100644
--- a/vllm/model_executor/model_loader/ep_weight_filter.py
+++ b/vllm/model_executor/model_loader/ep_weight_filter.py
@@ -73,4 +73,9 @@ def should_skip_weight(
     if eid is None:
         # Not an expert weight (dense / shared-expert / embedding) → keep.
         return False
+    # Only tensors whose name ends in ".weight" are ever skipped; scale/metadata
+    # tensors are always kept. They are tiny, and some backends need them from ALL
+    # experts (e.g. FlashInfer NVFP4 computes a global max of activation scales).
+    if not weight_name.endswith(".weight"):
+        return False
     return eid not in local_expert_ids