diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 983c076bd..0d6d0bac9 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -336,16 +336,6 @@ class BitsAndBytesLinearMethod(LinearMethodBase): current_index += output_size - # only update the matmul_states if it is not profile_run - if ( - generation > 0 - and not self.quant_config.llm_int8_has_fp16_weight - and matmul_states[i].CB is not None - and matmul_states[i].CxB is not None - ): - del matmul_states[i].CB - qweight[offsets[i] : offsets[i + 1]] = matmul_states[i].CxB - out = out.to(original_type) if reshape_after_matmul: