From 6a9cceb219fcbd6b1eb540ddfdc77ec160f0e209 Mon Sep 17 00:00:00 2001 From: Duyi-Wang Date: Thu, 19 Mar 2026 17:49:27 +0800 Subject: [PATCH] [Bugfix][ROCm] Fix MoRI + AITER FP8 dispatch compatibility for defer_input_quant (#37418) Signed-off-by: Duyi-Wang --- .../layers/fused_moe/mori_prepare_finalize.py | 9 +++------ .../layers/fused_moe/rocm_aiter_fused_moe.py | 7 ++++++- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/mori_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/mori_prepare_finalize.py index 164605dde..fe3a53941 100644 --- a/vllm/model_executor/layers/fused_moe/mori_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/mori_prepare_finalize.py @@ -70,16 +70,13 @@ class MoriPrepareAndFinalize(mk.FusedMoEPrepareAndFinalizeModular): - Optional dispatched expert topk IDs - Optional dispatched expert topk weight """ - if defer_input_quant: - raise NotImplementedError( - f"{self.__class__.__name__} does not support defer_input_quant=True. " - "Please select an MoE kernel that accepts quantized inputs." - ) assert not apply_router_weight_on_input, ( "mori does not support apply_router_weight_on_input=True now." ) scale = None - if self.use_fp8_dispatch: + # When defer_input_quant is True, the expert kernel handles + # quantization internally, so skip FP8 dispatch quantization. + if self.use_fp8_dispatch and not defer_input_quant: from aiter import QuantType, get_hip_quant if quant_config.is_block_quantized: diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index b1a4b0d59..b9f161ae8 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -295,7 +295,12 @@ def rocm_aiter_fused_experts( class AiterExperts(mk.FusedMoEExpertsModular): @property def expects_unquantized_inputs(self) -> bool: - return True + # When paired with MoRI, the prepare/finalize handles FP8 + # quantization during dispatch to reduce network traffic, + # so we should not defer input quantization. + # Otherwise, AITER fused MoE kernels handle input quantization + # internally via a single fused kernel. + return not self.moe_config.use_mori_kernels @staticmethod def activation_format() -> mk.FusedMoEActivationFormat: