[Bugfix][MoE] Fix 6-8% decode regression: prefer multi-stream shared expert overlap (#38990)

Signed-off-by: Martin Vit <martin@voipmonitor.org> Signed-off-by: Robert Shaw <robshaw@redhat.com> Co-authored-by: Robert Shaw <robshaw@redhat.com> Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2026-04-05 16:28:31 +02:00
parent 9a528260ef
commit 228023b3a5
1 changed files with 9 additions and 9 deletions
--- a/vllm/model_executor/layers/fused_moe/runner/shared_experts.py
+++ b/vllm/model_executor/layers/fused_moe/runner/shared_experts.py
@@ -93,24 +93,24 @@ class SharedExperts:
                )

    @property
-    def _has_external_experts(self) -> bool:
+    def _use_external_experts(self) -> bool:
+        if self._use_dp_chunking:
+            return False
+
        # Disable shared expert overlap if:
        #   - we are using eplb with non-default backend, because of correctness issues
        #   - we are using flashinfer with DP, since there is nothing to gain
        backend = self._moe_config.moe_parallel_config.all2all_backend
-        return not (
-            (
-                self._moe_config.moe_parallel_config.enable_eplb
-                and backend != "allgather_reducescatter"
-            )
-            or self._moe_config.moe_parallel_config.use_fi_nvl_two_sided_kernels
-        )
+        return (
+            self._moe_config.moe_parallel_config.enable_eplb
+            and backend != "allgather_reducescatter"
+        ) or self._moe_config.moe_parallel_config.use_fi_nvl_two_sided_kernels

    def _determine_shared_experts_order(
        self,
        hidden_states: torch.Tensor,
    ) -> SharedExpertsOrder:
-        if self._has_external_experts and not self._use_dp_chunking:
+        if self._use_external_experts:
            return SharedExpertsOrder.EXTERNAL

        if self._quant_method.mk_owns_shared_expert: