[Quantization][V1] BitsAndBytes support V1 (#15611)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Jee Jee Li
2025-03-28 10:12:47 +08:00
committed by GitHub
parent bd45912b99
commit 726efc6a32
7 changed files with 52 additions and 24 deletions


@@ -682,8 +682,9 @@ class ModelConfig:
     def _verify_bnb_config(self) -> None:
         """
-        The current version of bitsandbytes (0.44.0) with 8-bit models does not
+        The current version of bitsandbytes (0.45.3) with 8-bit models does not
         yet support CUDA graph.
+        # TODO Remove this when bitsandbytes supports.
         """
         is_bitsandbytes = self.quantization == "bitsandbytes"
         has_quantization_config = (getattr(self.hf_config,
@@ -698,8 +699,9 @@ class ModelConfig:
             not self.enforce_eager,
         ]):
             logger.warning(
-                "CUDA graph is not supported on BitAndBytes 8bit yet, "
+                "CUDA graph is not supported on BitsAndBytes 8bit yet, "
                 "fallback to the eager mode.")
             self.enforce_eager = True
 
     def _verify_with_expert_parallelism(self) -> None:
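
For reference, the patched check amounts to roughly the following standalone sketch. Only the lines visible in the hunks above come from the actual vLLM ModelConfig; the ModelConfigSketch wrapper, the load_in_8bit key lookup, and the toy _Cfg config object are assumptions made for illustration, not the real implementation.

import logging

logger = logging.getLogger(__name__)


class ModelConfigSketch:
    """Toy stand-in for vLLM's ModelConfig; illustration only."""

    def __init__(self, hf_config, quantization=None, enforce_eager=False):
        self.hf_config = hf_config          # e.g. a transformers PretrainedConfig
        self.quantization = quantization    # e.g. "bitsandbytes"
        self.enforce_eager = enforce_eager

    def _verify_bnb_config(self) -> None:
        """Fall back to eager mode for bitsandbytes 8-bit models,
        since CUDA graph is not supported there yet."""
        is_bitsandbytes = self.quantization == "bitsandbytes"
        has_quantization_config = getattr(
            self.hf_config, "quantization_config", None) is not None
        # Assumption: an 8-bit checkpoint advertises load_in_8bit in its
        # quantization_config dict (the exact key is not shown in the hunks).
        is_8bit = (self.hf_config.quantization_config.get("load_in_8bit", False)
                   if has_quantization_config else False)
        if all([
                is_bitsandbytes,
                has_quantization_config,
                is_8bit,
                not self.enforce_eager,
        ]):
            logger.warning(
                "CUDA graph is not supported on BitsAndBytes 8bit yet, "
                "fallback to the eager mode.")
            self.enforce_eager = True


# Hypothetical usage: an 8-bit bitsandbytes config triggers the eager fallback.
class _Cfg:
    quantization_config = {"load_in_8bit": True}


cfg = ModelConfigSketch(hf_config=_Cfg(), quantization="bitsandbytes")
cfg._verify_bnb_config()
assert cfg.enforce_eager

The design choice here is to degrade gracefully rather than fail: when an unsupported combination (bitsandbytes 8-bit plus CUDA graph) is detected, the config silently switches itself to eager mode and logs a warning instead of raising.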