fix(rocm): Enable non-gated MoE (is_act_and_mul=False) support on ROCm (#32244)

Signed-off-by: rabi <ramishra@redhat.com>
2026-01-16 13:01:10 +05:30
parent 03da3b52ef
commit b66b0d6abb
2 changed files with 15 additions and 4 deletions
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -448,9 +448,13 @@ class FusedMoE(CustomOp):
        )

        # ROCm aiter shared experts fusion
-        self.rocm_aiter_fmoe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
+        # AITER only supports gated activations (silu/gelu), so disable it
+        # for non-gated MoE (is_act_and_mul=False)
+        self.rocm_aiter_fmoe_enabled = (
+            rocm_aiter_ops.is_fused_moe_enabled() and is_act_and_mul
+        )
        self.aiter_fmoe_shared_expert_enabled = (
-            rocm_aiter_ops.is_fusion_moe_shared_experts_enabled()
+            rocm_aiter_ops.is_fusion_moe_shared_experts_enabled() and is_act_and_mul
        )

        self.num_fused_shared_experts = (
@@ -619,9 +623,9 @@ class FusedMoE(CustomOp):
        # for heuristic purposes, so it must be initialized first.
        self.quant_method: FusedMoEMethodBase = _get_quant_method()

-        if not self.moe_config.is_act_and_mul and not current_platform.is_cuda():
+        if not self.moe_config.is_act_and_mul and not current_platform.is_cuda_alike():
            raise NotImplementedError(
-                "is_act_and_mul=False is supported only for CUDA for now"
+                "is_act_and_mul=False is supported only for CUDA and ROCm for now"
            )

        if self.enable_eplb and not self.quant_method.supports_eplb:
--- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
+++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
@@ -8,6 +8,7 @@ from torch.nn import Module

 import vllm.envs as envs
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.logger import init_logger
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.fused_moe.config import (
@@ -56,6 +57,12 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
            use_ep=self.moe.moe_parallel_config.use_ep,
            use_dp=self.moe.moe_parallel_config.dp_size > 1,
        )
+
+        # AITER only supports gated activations (silu/gelu), so disable it
+        # for non-gated MoE (is_act_and_mul=False)
+        self.rocm_aiter_moe_enabled = (
+            rocm_aiter_ops.is_fused_moe_enabled() and moe.is_act_and_mul
+        )
        self.kernel: mk.FusedMoEModularKernel | None = None

    @property