[MoE Refactor][5/N] Isolate zero expert to LongCatFlash (#28891)

Signed-off-by: baonudesifeizhai <85092850+baonudesifeizhai@users.noreply.github.com>
Signed-off-by: Dongjie Zou <85092850+baonudesifeizhai@users.noreply.github.com>
Signed-off-by: baonudesifeizhai <baonudesifeizhai@gmail.com>
Signed-off-by: Robert Shaw <robertgshaw2@gmail.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robertgshaw2@gmail.com>
Authored by baonudesifeizhai on 2025-12-20 13:22:04 -05:00; committed by GitHub
parent 560ae9638c
commit 54c8924384
19 changed files with 264 additions and 109 deletions

@@ -1292,13 +1292,11 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             apply_router_weight_on_input=layer.apply_router_weight_on_input,
         )
-        select_result = layer.select_experts(
+        topk_weights, topk_ids = layer.select_experts(
             hidden_states=x,
             router_logits=router_logits,
         )
-        topk_weights, topk_ids, zero_expert_result = select_result
-
         if self.rocm_aiter_moe_enabled:
             from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa: E501
                 rocm_aiter_fused_experts,
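
Net effect of this hunk: select_experts now returns only (topk_weights, topk_ids); the zero_expert_result third element is gone from the quantization-method path. A minimal before/after sketch of the calling convention, with a simplified signature assumed for illustration (run_fused_experts is a hypothetical placeholder, not the real vLLM API):

import torch

def apply_fp8_moe_sketch(layer, x: torch.Tensor, router_logits: torch.Tensor):
    # Old contract (sketch): every quantization backend had to unpack a
    # 3-tuple and thread the zero-expert output through its return value:
    #   topk_weights, topk_ids, zero_expert_result = layer.select_experts(...)
    #
    # New contract: the backend sees routing outputs only.
    topk_weights, topk_ids = layer.select_experts(
        hidden_states=x,
        router_logits=router_logits,
    )
    # run_fused_experts is a placeholder name for the fused-kernel dispatch.
    return layer.run_fused_experts(x, topk_weights, topk_ids)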
@@ -1353,13 +1351,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             apply_router_weight_on_input=layer.apply_router_weight_on_input,
         )
-        if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
-            assert not isinstance(result, tuple), (
-                "Shared + zero experts are mutually exclusive not yet supported"
-            )
-            return result, zero_expert_result
-        else:
-            return result
+        return result


 class Fp8OnlineMoEMethod(Fp8MoEMethod):
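
With the conditional return removed from the quantization backend, the zero-expert contribution has to live somewhere: per the PR title, it is isolated to the LongCatFlash model code. A hedged sketch of what that isolation can look like; the function name and the id convention (ids >= num_real_experts marking zero experts) are assumptions for illustration, based on LongCat-Flash's zero experts acting as identity functions:

import torch

def moe_forward_with_zero_experts(
    hidden_states: torch.Tensor,   # [num_tokens, hidden_size]
    topk_weights: torch.Tensor,    # [num_tokens, top_k]
    topk_ids: torch.Tensor,        # [num_tokens, top_k]
    num_real_experts: int,
    fused_moe_out: torch.Tensor,   # output of the quantized fused-MoE kernel
) -> torch.Tensor:
    # Hypothetical convention: ids >= num_real_experts denote zero experts.
    # An identity-type zero expert returns its input unchanged, so its
    # contribution reduces to routing_weight * hidden_states.
    zero_mask = (topk_ids >= num_real_experts).to(topk_weights.dtype)
    zero_weight = (topk_weights * zero_mask).sum(dim=-1, keepdim=True)
    return fused_moe_out + zero_weight * hidden_states

Handling this in the model rather than in Fp8MoEMethod and its sibling backends keeps each quantization method model-agnostic; only LongCatFlash pays for the extra branch.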