diff --git a/vllm/model_executor/layers/quantization/input_quant_fp8.py b/vllm/model_executor/layers/quantization/input_quant_fp8.py index 2d6692459..c1a901c37 100644 --- a/vllm/model_executor/layers/quantization/input_quant_fp8.py +++ b/vllm/model_executor/layers/quantization/input_quant_fp8.py @@ -98,7 +98,9 @@ class QuantFP8(CustomOp): num_token_padding=self.num_token_padding, scale_ub=scale_ub, use_per_token_if_dynamic=self.use_per_token_if_dynamic, - group_shape=self.group_shape if self.static else None, + group_shape=(self.group_shape.row, self.group_shape.col) + if self.static + else None, ) def forward_hip(