[Feature][Kernel] Support bitsandbytes quantization and QLoRA (#4776)

2024-06-01 13:51:10 -07:00
parent 37464a0f74
commit b9c0605a8e
11 changed files with 752 additions and 8 deletions
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -4,6 +4,8 @@ from vllm.model_executor.layers.quantization.aqlm import AQLMConfig
 from vllm.model_executor.layers.quantization.awq import AWQConfig
 from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig)
+from vllm.model_executor.layers.quantization.bitsandbytes import (
+    BitsAndBytesConfig)
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
    CompressedTensorsConfig)
 from vllm.model_executor.layers.quantization.deepspeedfp import (
@@ -30,6 +32,7 @@ QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
    "gptq": GPTQConfig,
    "squeezellm": SqueezeLLMConfig,
    "sparseml": CompressedTensorsConfig,
+    "bitsandbytes": BitsAndBytesConfig,
 }