From 56f516254c5ff024e3986a4d625f698f3ec2a288 Mon Sep 17 00:00:00 2001
From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Date: Mon, 29 Dec 2025 16:27:55 -0500
Subject: [PATCH] [Bugfix][ROCm] Fix Static Quant Issue (#31502)

Signed-off-by: Robert Shaw
Co-authored-by: Robert Shaw
Co-authored-by: TJian
---
 .../layers/fused_moe/rocm_aiter_fused_moe.py  |  5 +++-
 .../model_executor/layers/quantization/fp8.py | 23 ++++++++-----------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
index ebd9e3a4a..ec1e41060 100644
--- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
@@ -325,8 +325,11 @@ class AiterExperts(mk.FusedMoEPermuteExpertsUnpermute):
         expert_tokens_meta: mk.ExpertTokensMetadata | None,
         apply_router_weight_on_input: bool,
     ):
+        # TODO(rob): rocm_aiter_fused_experts uses self.quant_config's
+        # a_scales for static quantization. Update this to fit better
+        # with the interface once all quant integrations are complete.
         assert a1q_scale is None
-        assert a2_scale is None
+        assert a2_scale == self.quant_config.a2_scale
         assert expert_tokens_meta is None
 
         result = rocm_aiter_fused_experts(
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index e77b94db8..08e1f4d44 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -1046,35 +1046,32 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         w2_weight = layer.w2_weight
         w13_weight_scale = getattr(layer, f"w13_{self.weight_scale_name}")
         w2_weight_scale = getattr(layer, f"w2_{self.weight_scale_name}")
+        w13_input_scale = layer.w13_input_scale
+        w2_input_scale = layer.w2_input_scale
 
         # MI300x and MI325x use FNUZ format for FP8. Convert if needed.
         if current_platform.is_fp8_fnuz():
-            w13_weight, w13_weight_scale, layer.w13_input_scale = (
+            w13_weight, w13_weight_scale, w13_input_scale = (
                 normalize_e4m3fn_to_e4m3fnuz(
-                    w13_weight, w13_weight_scale, layer.w13_input_scale
+                    w13_weight, w13_weight_scale, w13_input_scale
                 )
             )
-            w2_weight, w2_weight_scale, layer.w2_input_scale = (
-                normalize_e4m3fn_to_e4m3fnuz(
-                    w2_weight, w2_weight_scale, layer.w2_input_scale
-                )
+            w2_weight, w2_weight_scale, w2_input_scale = normalize_e4m3fn_to_e4m3fnuz(
+                w2_weight, w2_weight_scale, w2_input_scale
             )
 
         # Per tensor kernels require single activation scale. Use the max.
         if self.quant_config.activation_scheme == "static":
             assert not self.block_quant
-            assert layer.w13_input_scale is not None
-            assert layer.w2_input_scale is not None
-            if not all_close_1d(layer.w13_input_scale) or not all_close_1d(
-                layer.w2_input_scale
-            ):
+            assert w13_input_scale is not None and w2_input_scale is not None
+            if not all_close_1d(w13_input_scale) or not all_close_1d(w2_input_scale):
                 logger.warning_once(
                     "Found input_scales that are not equal for "
                     "fp8 MoE layer. Using the maximum across experts "
                     "for each layer."
                 )
-            replace_parameter(layer, "w13_input_scale", layer.w13_input_scale.max())
-            replace_parameter(layer, "w2_input_scale", layer.w2_input_scale.max())
+            replace_parameter(layer, "w13_input_scale", w13_input_scale.max())
+            replace_parameter(layer, "w2_input_scale", w2_input_scale.max())
 
         # Per tensor kernels require single weight scale for w13 per expert, but
         # on disk there is a scale for w1 and w3. Use the max to requantize.
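
For context on the fp8.py hunk, the sketch below isolates the per-tensor activation-scale handling it touches: per-expert static input scales are collapsed to a single per-layer scale by taking the max. This is a minimal standalone illustration under stated assumptions, not vLLM code; fold_static_input_scales and its inlined all_close_1d stand-in are hypothetical helpers invented for the example.

    import torch


    def all_close_1d(x: torch.Tensor) -> bool:
        # Stand-in for vLLM's helper: True when every element of the
        # 1-D tensor holds (numerically) the same value.
        assert x.dim() == 1
        return bool(torch.allclose(x, x[0].expand_as(x)))


    def fold_static_input_scales(
        w13_input_scale: torch.Tensor, w2_input_scale: torch.Tensor
    ) -> tuple[torch.Tensor, torch.Tensor]:
        # Per-tensor fp8 kernels take one activation scale per layer, but
        # a checkpoint may store one scale per expert. Collapse to the max
        # so no expert's activations overflow the chosen scale.
        if not all_close_1d(w13_input_scale) or not all_close_1d(w2_input_scale):
            print("input_scales differ across experts; using the per-layer max")
        return w13_input_scale.max(), w2_input_scale.max()


    # Example: four experts with slightly different calibrated scales.
    w13 = torch.tensor([0.010, 0.012, 0.011, 0.012])
    w2 = torch.tensor([0.020, 0.020, 0.020, 0.020])
    print(fold_static_input_scales(w13, w2))  # (tensor(0.0120), tensor(0.0200))

Taking the max is the conservative choice: activations from an expert calibrated with a smaller scale can never overflow, at the cost of slightly coarser quantization for those experts.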