[MoE][Refactor] Make select_experts a non-static method (#29067)

Signed-off-by: Bill Nell <bnell@redhat.com>
bnellnm
2025-11-24 13:38:04 -05:00
committed by GitHub
parent cec418b5df
commit 8f066146c3
18 changed files with 163 additions and 472 deletions
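
This change turns FusedMoE.select_experts from a static helper, which forced every quantization backend's apply() to thread the full routing configuration through each call, into an instance method that reads that configuration from the layer itself. Below is a minimal, self-contained sketch of the idea; the class name FusedMoELike, the RoutingConfig container, and the toy top-k routing are illustrative stand-ins, not vLLM's actual implementation.

    from dataclasses import dataclass

    import torch


    @dataclass
    class RoutingConfig:
        # Illustrative subset of the routing parameters the real layer owns.
        top_k: int
        renormalize: bool


    class FusedMoELike(torch.nn.Module):
        """Toy stand-in for vLLM's FusedMoE layer (hypothetical)."""

        def __init__(self, routing: RoutingConfig):
            super().__init__()
            self.routing = routing

        def select_experts(
            self,
            hidden_states: torch.Tensor,
            router_logits: torch.Tensor,
        ) -> tuple[torch.Tensor, torch.Tensor]:
            # As an instance method, the routing configuration comes from the
            # layer, so callers only pass activations and router logits.
            # hidden_states is unused in this toy router; it is kept only to
            # mirror the call shape shown in the diff below.
            weights, ids = torch.topk(
                torch.softmax(router_logits, dim=-1), self.routing.top_k, dim=-1
            )
            if self.routing.renormalize:
                weights = weights / weights.sum(dim=-1, keepdim=True)
            return weights, ids

A call site such as the one in Fp8MoEMethod.apply then collapses to layer.select_experts(hidden_states=x, router_logits=router_logits), which is the simplification the second hunk below shows.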


@@ -1140,7 +1140,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
     def apply(
         self,
-        layer: torch.nn.Module,
+        layer: FusedMoE,
         x: torch.Tensor,
         router_logits: torch.Tensor,
         top_k: int,
@@ -1216,31 +1216,9 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             apply_router_weight_on_input=apply_router_weight_on_input,
         )
-        zero_expert_num = getattr(layer, "zero_expert_num", 0)
-        zero_expert_type = getattr(layer, "zero_expert_type", None)
-        select_result = FusedMoE.select_experts(
+        select_result = layer.select_experts(
             hidden_states=x,
             router_logits=router_logits,
-            use_grouped_topk=use_grouped_topk,
-            top_k=top_k,
-            renormalize=renormalize,
-            topk_group=topk_group,
-            num_expert_group=num_expert_group,
-            custom_routing_function=custom_routing_function,
-            scoring_func=scoring_func,
-            routed_scaling_factor=routed_scaling_factor,
-            e_score_correction_bias=e_score_correction_bias,
-            indices_type=self.topk_indices_dtype,
-            enable_eplb=enable_eplb,
-            expert_map=expert_map,
-            expert_load_view=expert_load_view,
-            logical_to_physical_map=logical_to_physical_map,
-            logical_replica_count=logical_replica_count,
-            global_num_experts=global_num_experts,
-            zero_expert_num=zero_expert_num,
-            zero_expert_type=zero_expert_type,
-            num_fused_shared_experts=layer.num_fused_shared_experts,
         )
         topk_weights, topk_ids, zero_expert_result = select_result
@@ -1322,7 +1300,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 self.allow_cutlass_block_scaled_grouped_gemm
             ),
         )
-        if zero_expert_num != 0 and zero_expert_type is not None:
+        if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
             assert not isinstance(result, tuple), (
                 "Shared + zero experts are mutually exclusive not yet supported"
             )
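
Because apply() is now typed against FusedMoE rather than a bare torch.nn.Module, the zero-expert attributes can be read directly off the layer instead of defensively via getattr with defaults, as the last hunk shows. A hedged sketch of that access pattern follows; the helper and class names are hypothetical, and only the attribute names come from the diff.

    import torch


    class ZeroExpertLayer:
        # Hypothetical minimal layer carrying the two attributes the diff reads.
        def __init__(self, zero_expert_num: int = 0, zero_expert_type: str | None = None):
            self.zero_expert_num = zero_expert_num
            self.zero_expert_type = zero_expert_type


    def finalize(layer: ZeroExpertLayer, result: torch.Tensor) -> torch.Tensor:
        # Direct attribute access replaces getattr(layer, "zero_expert_num", 0):
        # the attributes are assumed to always exist on a FusedMoE-typed layer.
        if layer.zero_expert_num != 0 and layer.zero_expert_type is not None:
            # Combining shared experts with zero experts is not yet supported.
            assert not isinstance(result, tuple)
        return result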