diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index ab9690132..026520e3a 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -437,6 +437,14 @@ class Fp8LinearMethod(LinearMethodBase): else: layer.input_scale = None + if self.use_marlin: + prepare_fp8_layer_for_marlin( + layer, size_k_first, input_dtype=self.marlin_input_dtype + ) + # Activations not quantized for marlin. + del layer.input_scale + return + if self.block_quant and self.use_deep_gemm: maybe_post_process_fp8_weight_block(layer)