[MoE Refactor] Move select_experts from FusedMoEQuantMethod -> FusedMoE (#31996)

Signed-off-by: Bill Nell <bnell@redhat.com>
2026-01-22 18:21:35 -05:00
parent fc56f4a071
commit dc917cceb8
22 changed files with 498 additions and 533 deletions
--- a/vllm/model_executor/layers/quantization/bitsandbytes.py
+++ b/vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -6,7 +6,6 @@ from typing import Any, Union
 import torch
 from packaging import version

-from vllm.model_executor.layers.fused_moe import FusedMoERouter
 from vllm.model_executor.layers.fused_moe.config import (
    FusedMoEConfig,
    FusedMoEQuantConfig,
@@ -499,16 +498,12 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase):
    def apply(
        self,
        layer: FusedMoE,
-        router: FusedMoERouter,
        x: torch.Tensor,
-        router_logits: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        from vllm.model_executor.layers.fused_moe import fused_experts

-        topk_weights, topk_ids = router.select_experts(
-            hidden_states=x,
-            router_logits=router_logits,
-        )
        # TODO(bnell): Do these need to be called on the hot path?
        if self.quant_config.load_in_8bit:
            w13, w2 = self._apply_8bit_dequant(layer)