From c4041f37a4a4b70c8821c35df9c79298ee5fcb7b Mon Sep 17 00:00:00 2001 From: Andreas Karatzas Date: Wed, 7 Jan 2026 22:17:56 -0600 Subject: [PATCH] [ROCm][LoRA] Fix MoE accuracy regression by preserving float32 router weight scaling (#31931) Signed-off-by: Andreas Karatzas --- vllm/model_executor/layers/fused_moe/fused_moe.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 8b3bca52e..0ad3d10da 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -519,6 +519,12 @@ def fused_moe_kernel( a_ptrs += BLOCK_SIZE_K * stride_ak b_ptrs += BLOCK_SIZE_K * stride_bk + # Router weight multiplication MUST happen in float32 before precision + # conversion for numerical stability (especially critical on ROCm). + if MUL_ROUTED_WEIGHT: + moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0) + accumulator = accumulator * moe_weight[:, None] + if use_int8_w8a16: accumulator = (accumulator * b_scale).to(compute_type) elif use_fp8_w8a8 or use_int8_w8a8: @@ -529,12 +535,10 @@ def fused_moe_kernel( else: accumulator = accumulator.to(compute_type) - # Since bias is typically not quantized, it's added after dequantization. + # Bias is added AFTER dequantization since bias is typically stored in + # the output dtype and should not be scaled by quantization factors. if HAS_BIAS: accumulator = accumulator + bias[None, :] - if MUL_ROUTED_WEIGHT: - moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0) - accumulator = accumulator * moe_weight[:, None] # ----------------------------------------------------------- # Write back the block of the output