diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 3d94626e5..49ff87df9 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -772,10 +772,10 @@ class Fp8MoEMethod(FusedMoEMethodBase): if self.allow_deep_gemm and not is_deep_gemm_e8m0_used(): if _is_col_major(layer.w13_weight_scale_inv): layer.w13_weight_scale_inv = \ - get_col_major_tma_aligned_tensor(layer.w13_weight_scale_inv).contiguous() + get_col_major_tma_aligned_tensor(layer.w13_weight_scale_inv) if _is_col_major(layer.w2_weight_scale_inv): layer.w2_weight_scale_inv = \ - get_col_major_tma_aligned_tensor(layer.w2_weight_scale_inv).contiguous() + get_col_major_tma_aligned_tensor(layer.w2_weight_scale_inv) # If checkpoint is fp16, quantize in place. elif not self.quant_config.is_checkpoint_fp8_serialized: @@ -923,10 +923,10 @@ class Fp8MoEMethod(FusedMoEMethodBase): # Ensure column-major TMA alignment expected by DeepGEMM. if _is_col_major(layer.w13_weight_scale_inv): layer.w13_weight_scale_inv = get_col_major_tma_aligned_tensor( - layer.w13_weight_scale_inv).contiguous() + layer.w13_weight_scale_inv) if _is_col_major(layer.w2_weight_scale_inv): layer.w2_weight_scale_inv = get_col_major_tma_aligned_tensor( - layer.w2_weight_scale_inv).contiguous() + layer.w2_weight_scale_inv) def select_gemm_impl( self,