[Bugfix] Fix EP weight filter breaking EPLB and NVFP4 accuracy (#37322)
Signed-off-by: Elvir Crncevic <elvircrn@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Kevin H. Luu <khluu000@gmail.com>
This commit is contained in:
@@ -320,6 +320,13 @@ class DefaultModelLoader(BaseModelLoader):
|
||||
):
|
||||
return
|
||||
|
||||
# When EPLB is enabled, redundant physical expert slots may map to
|
||||
# logical experts that belong to other ranks in the default partition.
|
||||
# The weight loader needs to see ALL logical expert weights so it can
|
||||
# populate these redundant slots. Skip the filter entirely.
|
||||
if parallel_config.enable_eplb:
|
||||
return
|
||||
|
||||
num_experts = model_config.get_num_experts()
|
||||
if num_experts <= 0:
|
||||
return
|
||||
|
||||
@@ -73,4 +73,9 @@ def should_skip_weight(
|
||||
if eid is None:
|
||||
# Not an expert weight (dense / shared-expert / embedding) → keep.
|
||||
return False
|
||||
# Only skip heavy weight tensors, never scale/metadata tensors.
|
||||
# Scale tensors are tiny and some backends need them from ALL experts
|
||||
# (e.g. FlashInfer NVFP4 computes a global max of activation scales).
|
||||
if not weight_name.endswith(".weight"):
|
||||
return False
|
||||
return eid not in local_expert_ids
|
||||
|
||||
Reference in New Issue
Block a user