[Bugfix][MoE] Fix 6-8% decode regression: prefer multi-stream shared expert overlap (#38990)
Signed-off-by: Martin Vit <martin@voipmonitor.org>
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
This commit is contained in:
@@ -93,24 +93,24 @@ class SharedExperts:
|
|||||||
)
|
)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def _has_external_experts(self) -> bool:
|
def _use_external_experts(self) -> bool:
|
||||||
|
if self._use_dp_chunking:
|
||||||
|
return False
|
||||||
|
|
||||||
# Disable shared expert overlap if:
|
# Disable shared expert overlap if:
|
||||||
# - we are using eplb with non-default backend, because of correctness issues
|
# - we are using eplb with non-default backend, because of correctness issues
|
||||||
# - we are using flashinfer with DP, since there is nothing to gain
|
# - we are using flashinfer with DP, since there is nothing to gain
|
||||||
backend = self._moe_config.moe_parallel_config.all2all_backend
|
backend = self._moe_config.moe_parallel_config.all2all_backend
|
||||||
return not (
|
return (
|
||||||
(
|
|
||||||
self._moe_config.moe_parallel_config.enable_eplb
|
self._moe_config.moe_parallel_config.enable_eplb
|
||||||
and backend != "allgather_reducescatter"
|
and backend != "allgather_reducescatter"
|
||||||
)
|
) or self._moe_config.moe_parallel_config.use_fi_nvl_two_sided_kernels
|
||||||
or self._moe_config.moe_parallel_config.use_fi_nvl_two_sided_kernels
|
|
||||||
)
|
|
||||||
|
|
||||||
def _determine_shared_experts_order(
|
def _determine_shared_experts_order(
|
||||||
self,
|
self,
|
||||||
hidden_states: torch.Tensor,
|
hidden_states: torch.Tensor,
|
||||||
) -> SharedExpertsOrder:
|
) -> SharedExpertsOrder:
|
||||||
if self._has_external_experts and not self._use_dp_chunking:
|
if self._use_external_experts:
|
||||||
return SharedExpertsOrder.EXTERNAL
|
return SharedExpertsOrder.EXTERNAL
|
||||||
|
|
||||||
if self._quant_method.mk_owns_shared_expert:
|
if self._quant_method.mk_owns_shared_expert:
|
||||||
|
|||||||
Reference in New Issue
Block a user