diff --git a/docs/features/quantization/bnb.md b/docs/features/quantization/bnb.md
index 2348c7739..53419e067 100644
--- a/docs/features/quantization/bnb.md
+++ b/docs/features/quantization/bnb.md
@@ -7,7 +7,7 @@ Compared to other quantization methods, BitsAndBytes eliminates the need for cal
 Below are the steps to utilize BitsAndBytes with vLLM.
 
 ```bash
-pip install bitsandbytes>=0.46.1
+pip install bitsandbytes>=0.49.2
 ```
 
 vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint.
diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt
index c9211b913..9a0bc4b20 100644
--- a/requirements/nightly_torch_test.txt
+++ b/requirements/nightly_torch_test.txt
@@ -33,7 +33,7 @@ transformers==4.57.5
 tokenizers==0.22.0
 schemathesis>=3.39.15 # Required for openai schema test.
 
 # quantization
-bitsandbytes>=0.46.1
+bitsandbytes>=0.49.2
 
 buildkite-test-collector==0.1.9
diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt
index af7703916..1983392a1 100644
--- a/requirements/rocm-test.txt
+++ b/requirements/rocm-test.txt
@@ -102,3 +102,5 @@ terratorch==1.2.2
 segmentation-models-pytorch==0.5.0
 # Required for Prithvi tests
 imagehash==4.3.2
+# Required for bitsandbytes quantization test
+bitsandbytes==0.49.2
diff --git a/requirements/test.in b/requirements/test.in
index 5faf1c456..92d8fec4b 100644
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -41,7 +41,7 @@ transformers==4.57.5
 tokenizers==0.22.0
 schemathesis>=3.39.15 # Required for openai schema test.
 
 # quantization
-bitsandbytes==0.46.1
+bitsandbytes==0.49.2
 
 buildkite-test-collector==0.1.9
diff --git a/requirements/test.txt b/requirements/test.txt
index c18d21637..791bdc005 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -66,7 +66,7 @@ backoff==2.2.1
     # via
     #   -r requirements/test.in
     #   schemathesis
-bitsandbytes==0.46.1
+bitsandbytes==0.49.2
     # via
     #   -r requirements/test.in
     #   lightning
@@ -653,6 +653,7 @@ orjson==3.11.5
 packaging==24.2
     # via
     #   accelerate
+    #   bitsandbytes
     #   black
     #   datamodel-code-generator
     #   datasets
diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py
index 15ebb5f4a..eadc3534c 100644
--- a/tests/models/test_transformers.py
+++ b/tests/models/test_transformers.py
@@ -6,8 +6,6 @@ from typing import Any
 
 import pytest
 
-from vllm.platforms import current_platform
-
 from ..conftest import HfRunner, VllmRunner
 from ..utils import multi_gpu_test, prep_prompts
 from .registry import HF_EXAMPLE_MODELS
@@ -131,6 +129,7 @@ def test_distributed(
                 "quantization": "bitsandbytes",
             },
         ),
+        ("unsloth/tinyllama-bnb-4bit", {}),
     ],
 )
 @pytest.mark.parametrize("max_tokens", [32])
@@ -143,12 +142,6 @@ def test_quantization(
     max_tokens: int,
     num_logprobs: int,
 ) -> None:
-    if (
-        current_platform.is_rocm()
-        and quantization_kwargs.get("quantization", "") == "bitsandbytes"
-    ):
-        pytest.skip("bitsandbytes quantization is currently not supported in rocm.")
-
     with vllm_runner(
         model,
         model_impl="auto",
diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py
index 0d6d0bac9..716a20090 100644
--- a/vllm/model_executor/layers/quantization/bitsandbytes.py
+++ b/vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -28,6 +28,24 @@ from vllm.platforms import current_platform
 from vllm.utils.torch_utils import direct_register_custom_op
 
 
+def _check_bitsandbytes_version():
+    min_version = "0.49.2" if current_platform.is_rocm() else "0.48.1"
+    try:
+        import bitsandbytes
+
+        if version.parse(bitsandbytes.__version__) < version.parse(min_version):
+            raise ImportError(
+                "bitsandbytes version is wrong. Please "
+                f"install bitsandbytes>={min_version}."
+            )
+    except ImportError as err:
+        raise ImportError(
+            f"Please install bitsandbytes>={min_version} via "
+            f"`pip install bitsandbytes>={min_version}` to use "
+            "bitsandbytes quantizer."
+        ) from err
+
+
 class BitsAndBytesConfig(QuantizationConfig):
     """Config class for BitsAndBytes Quantization.
 
@@ -183,21 +201,7 @@ class BitsAndBytesLinearMethod(LinearMethodBase):
     """
 
     def __init__(self, quant_config: BitsAndBytesConfig):
-        try:
-            import bitsandbytes
-
-            if version.parse(bitsandbytes.__version__) < version.parse("0.46.1"):
-                raise ImportError(
-                    "bitsandbytes version is wrong. Please "
-                    "install bitsandbytes>=0.46.1."
-                )
-        except ImportError as err:
-            raise ImportError(
-                "Please install bitsandbytes>=0.46.1 via "
-                "`pip install bitsandbytes>=0.46.1` to use "
-                "bitsandbytes quantizer."
-            ) from err
-
+        _check_bitsandbytes_version()
         self.quant_config = quant_config
 
     def create_weights(
@@ -442,20 +446,7 @@
         moe: FusedMoEConfig,
     ):
         super().__init__(moe)
-        try:
-            import bitsandbytes
-
-            if version.parse(bitsandbytes.__version__) < version.parse("0.46.1"):
-                raise ImportError(
-                    "bitsandbytes version is wrong. Please "
-                    "install bitsandbytes>=0.46.1."
-                )
-        except ImportError as err:
-            raise ImportError(
-                "Please install bitsandbytes>=0.46.1 via "
-                "`pip install bitsandbytes>=0.46.1` to use "
-                "bitsandbytes quantizer."
-            ) from err
+        _check_bitsandbytes_version()
         self.quant_config = quant_config
 
     def create_weights(
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 2fedd7c67..a8a1d59f1 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -244,10 +244,8 @@ class RocmPlatform(Platform):
         "mxfp4",
         "petit_nvfp4",
         "torchao",
+        "bitsandbytes",
     ]
-    # bitsandbytes not supported on gfx9 (warp size 64 limitation)
-    if not on_gfx9():
-        supported_quantization += ["bitsandbytes"]
 
     @classmethod
     def import_kernels(cls) -> None:
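For reference, the usage paths this change unblocks on ROCm are sketched below. This is a minimal sketch, assuming bitsandbytes>=0.49.2 is installed; the unquantized model name is illustrative, while `unsloth/tinyllama-bnb-4bit` is the pre-quantized checkpoint added to the test matrix above.

```python
from vllm import LLM, SamplingParams

# In-flight quantization: start from an unquantized checkpoint and let vLLM
# quantize the weights with bitsandbytes at load time.
# (model name is illustrative; any supported unquantized checkpoint works)
llm = LLM(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    quantization="bitsandbytes",
)

# Pre-quantized checkpoint: vLLM reads the bitsandbytes quantization_config
# from the model's config file, so no explicit quantization argument is needed.
llm_prequantized = LLM(model="unsloth/tinyllama-bnb-4bit")

outputs = llm_prequantized.generate(
    ["The capital of France is"],
    SamplingParams(max_tokens=32),
)
print(outputs[0].outputs[0].text)
```

With the rocm.py change above, "bitsandbytes" is always listed in the platform's supported quantization methods, so both paths run on ROCm instead of being skipped on gfx9.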