[Bugfix] Fix _CPU_MOE_ACT AssertionError when vLLM config not set (#32777)
Signed-off-by: Karan Bansal <karanb192@gmail.com>
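For context: per the docstrings in the diff below, constructing any vLLM CustomOp (SiluAndMul, SwigluOAIAndMul, ...) calls get_current_vllm_config() from CustomOp.__init__(), so building one at module import time, before set_current_vllm_config() has been entered, fails with the AssertionError in the title. A minimal sketch of the two patterns (the exact assert location inside vLLM's config plumbing is an assumption here):

    from vllm.model_executor.layers.activation import SiluAndMul

    act = SiluAndMul()                  # CustomOp.__init__() reads the current
                                        # vLLM config -> AssertionError if unset
    act_fn = SiluAndMul.forward_native  # plain function reference: no config
                                        # is touched until it is actually called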
@@ -8,33 +8,38 @@ from torch.nn import functional as F
 from vllm import _custom_ops as ops
 from vllm._custom_ops import cpu_fused_moe, cpu_prepack_moe_weight
-from vllm.model_executor.layers.activation import SiluAndMul, SwigluOAIAndMul
+from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.quantization.utils.layer_utils import replace_parameter
 from vllm.utils.torch_utils import direct_register_custom_op
 
 _CPU_MOE_LAYER_CACHE = {}
 
 
-class _LazyActivationDict(dict):
-    """Lazily instantiate activation functions on first access.
-
-    Avoids triggering CustomOp.__init__() at module import time,
-    which would call get_current_vllm_config() before config is set.
-    """
-
-    _factories: dict[str, type[SiluAndMul] | type[SwigluOAIAndMul]] = {
-        "silu": SiluAndMul,
-        "swigluoai": SwigluOAIAndMul,
-    }
-
-    def __missing__(self, key: str) -> SiluAndMul | SwigluOAIAndMul:
-        if key not in self._factories:
-            raise KeyError(f"{key} is not a supported activation")
-        self[key] = self._factories[key]()
-        return self[key]
-
-
-_CPU_MOE_ACT = _LazyActivationDict()
+def _swigluoai_forward_native(
+    x: torch.Tensor,
+    alpha: float = 1.702,
+    limit: float = 7.0,
+) -> torch.Tensor:
+    """PyTorch-native implementation of SwigluOAIAndMul.forward_native.
+
+    Standalone function to avoid instantiating SwigluOAIAndMul (a CustomOp)
+    which would trigger get_current_vllm_config() before config is set.
+    """
+    gate, up = x[..., ::2], x[..., 1::2]
+    gate = gate.clamp(min=None, max=limit)
+    up = up.clamp(min=-limit, max=limit)
+    glu = gate * torch.sigmoid(gate * alpha)
+    gated_output = (up + 1) * glu
+    return gated_output
+
+
+# Map activation names to their native forward functions.
+# Uses static methods or standalone functions to avoid instantiating CustomOp
+# classes, which would call get_current_vllm_config() before config is set.
+_CPU_MOE_ACT_FN: dict[str, Callable[[torch.Tensor], torch.Tensor]] = {
+    "silu": SiluAndMul.forward_native,
+    "swigluoai": _swigluoai_forward_native,
+}
 
 
 def grouped_topk(
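The replacement function is pure PyTorch, so it can be sanity-checked standalone. A quick sketch (the test tensor and shapes are illustrative, not from the commit): SwigluOAI expects gate/up channels interleaved along the last dimension, clamps gate from above and up symmetrically, and halves the feature dimension:

    import torch

    def _swigluoai_forward_native(x, alpha=1.702, limit=7.0):
        gate, up = x[..., ::2], x[..., 1::2]      # interleaved gate/up channels
        gate = gate.clamp(min=None, max=limit)    # clamp gate from above only
        up = up.clamp(min=-limit, max=limit)      # clamp up on both sides
        glu = gate * torch.sigmoid(gate * alpha)  # SiLU-style gating, scaled by alpha
        return (up + 1) * glu

    x = torch.randn(4, 16)
    out = _swigluoai_forward_native(x)
    assert out.shape == (4, 8)  # output has half the input's last dimension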
@@ -230,7 +235,7 @@ class CPUFusedMOE:
         apply_router_weight_on_input: bool = False,
         activation: str = "silu",
     ) -> torch.Tensor:
-        assert activation in _CPU_MOE_ACT._factories, f"{activation} is not supported."
+        assert activation in _CPU_MOE_ACT_FN, f"{activation} is not supported."
         assert not apply_router_weight_on_input
 
         topk_weights, topk_ids = select_experts(
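The rewritten assert checks the public dict directly instead of reaching into the private _factories attribute, and it still rejects unsupported activations before any expert work runs. A tiny illustration (keys from the diff; values elided):

    _CPU_MOE_ACT_FN = {"silu": ..., "swigluoai": ...}  # keys as in the commit

    for activation in ("silu", "swigluoai", "gelu"):
        # This is exactly the membership test the new assert evaluates:
        print(activation, activation in _CPU_MOE_ACT_FN)  # True, True, False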
@@ -418,7 +423,7 @@ def cpu_fused_moe_torch(
         tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
 
         gate_up = layer.gate_up_linear[i](tokens_for_this_expert) # type: ignore
-        gate_up = _CPU_MOE_ACT[activation].forward_native(gate_up)
+        gate_up = _CPU_MOE_ACT_FN[activation](gate_up)
         expert_out = layer.down_linear[i](gate_up) # type: ignore
         outputs.append(expert_out)
         start_idx = end_idx
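The call-site change shows the whole pattern in miniature: a table of plain callables replaces a table of lazily built CustomOp instances, so dispatch is a direct function call and nothing config-dependent ever runs. A self-contained sketch of that design, where silu_and_mul and ACT_FN are hypothetical stand-ins for SiluAndMul.forward_native and _CPU_MOE_ACT_FN:

    from typing import Callable

    import torch
    import torch.nn.functional as F

    def silu_and_mul(x: torch.Tensor) -> torch.Tensor:
        # SiLU(gate) * up over the two halves of the last dimension.
        d = x.shape[-1] // 2
        return F.silu(x[..., :d]) * x[..., d:]

    # Plain functions only: building this dict at import time touches no config.
    ACT_FN: dict[str, Callable[[torch.Tensor], torch.Tensor]] = {
        "silu": silu_and_mul,
    }

    gate_up = torch.randn(2, 8)
    out = ACT_FN["silu"](gate_up)  # direct call, no object construction anywhere
    assert out.shape == (2, 4)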