support bitsandbytes quantization with more models (#9148)

2024-10-08 18:52:19 -07:00
parent 9ba0bd6aa6
commit 2f4117c38e
10 changed files with 165 additions and 28 deletions
--- a/vllm/model_executor/layers/quantization/bitsandbytes.py
+++ b/vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -108,7 +108,7 @@ class BitsAndBytesConfig(QuantizationConfig):
        return None

    def get_scaled_act_names(self) -> List[str]:
-        return ["gelu", "gelu_fast", "gelu_new", "gelu_pytorch_tanh"]
+        return []


 class BitsAndBytesLinearMethod(LinearMethodBase):
@@ -236,7 +236,7 @@ class BitsAndBytesLinearMethod(LinearMethodBase):
            if generation == 0 or generation == 1:
                matmul_states[i] = MatmulLtState()
                matmul_states[i].CB = qweight[offsets[i]:offsets[i + 1]]
-                matmul_states[i].SCB = quant_states[i]
+                matmul_states[i].SCB = quant_states[i].to(x.device)
                matmul_states[i].threshold = (
                    self.quant_config.llm_int8_threshold)
                matmul_states[i].has_fp16_weights = (