From b66b0d6abb955f9209a0d88b1dc245f4c1c9ff98 Mon Sep 17 00:00:00 2001
From: Rabi Mishra
Date: Fri, 16 Jan 2026 13:01:10 +0530
Subject: [PATCH] fix(rocm): Enable non-gated MoE (is_act_and_mul=False) support on ROCm (#32244)

Signed-off-by: rabi
---
 vllm/model_executor/layers/fused_moe/layer.py          | 12 ++++++++----
 .../layers/fused_moe/unquantized_fused_moe_method.py   |  7 +++++++
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index fd3f76cb2..702052c96 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -448,9 +448,13 @@ class FusedMoE(CustomOp):
         )
 
         # ROCm aiter shared experts fusion
-        self.rocm_aiter_fmoe_enabled = rocm_aiter_ops.is_fused_moe_enabled()
+        # AITER only supports gated activations (silu/gelu), so disable it
+        # for non-gated MoE (is_act_and_mul=False)
+        self.rocm_aiter_fmoe_enabled = (
+            rocm_aiter_ops.is_fused_moe_enabled() and is_act_and_mul
+        )
         self.aiter_fmoe_shared_expert_enabled = (
-            rocm_aiter_ops.is_fusion_moe_shared_experts_enabled()
+            rocm_aiter_ops.is_fusion_moe_shared_experts_enabled() and is_act_and_mul
         )
 
         self.num_fused_shared_experts = (
@@ -619,9 +623,9 @@ class FusedMoE(CustomOp):
         # for heuristic purposes, so it must be initialized first.
         self.quant_method: FusedMoEMethodBase = _get_quant_method()
 
-        if not self.moe_config.is_act_and_mul and not current_platform.is_cuda():
+        if not self.moe_config.is_act_and_mul and not current_platform.is_cuda_alike():
             raise NotImplementedError(
-                "is_act_and_mul=False is supported only for CUDA for now"
+                "is_act_and_mul=False is supported only for CUDA and ROCm for now"
             )
 
         if self.enable_eplb and not self.quant_method.supports_eplb:
diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
index 40a009e4b..351d63144 100644
--- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
+++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
@@ -8,6 +8,7 @@ from torch.nn import Module
 
 import vllm.envs as envs
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.logger import init_logger
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.fused_moe.config import (
@@ -56,6 +57,12 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
             use_ep=self.moe.moe_parallel_config.use_ep,
             use_dp=self.moe.moe_parallel_config.dp_size > 1,
         )
+
+        # AITER only supports gated activations (silu/gelu), so disable it
+        # for non-gated MoE (is_act_and_mul=False)
+        self.rocm_aiter_moe_enabled = (
+            rocm_aiter_ops.is_fused_moe_enabled() and moe.is_act_and_mul
+        )
         self.kernel: mk.FusedMoEModularKernel | None = None
 
     @property