support bitsandbytes 8-bit and FP4 quantized models (#7445)

Author: chenqianfzh
Date: 2024-08-29 16:09:08 -07:00
Committed by: GitHub
Parent: 257afc37c5
Commit: 4664ceaad6
6 changed files with 435 additions and 189 deletions

vllm/config.py

@@ -405,6 +405,8 @@ class ModelConfig:
             raise ValueError(
                 "BitAndBytes quantization with TP or PP is not supported yet.")
 
+        # Remove the constraint after the bitsandbytes issue is fixed:
+        # https://github.com/bitsandbytes-foundation/bitsandbytes/issues/1308
         if self.quantization == "bitsandbytes" and self.enforce_eager is False:
             logger.warning("CUDA graph is not supported on BitAndBytes yet, "
                            "fallback to the eager mode.")