[Quantization][Deprecation] Remove BitBlas (#32683)

Signed-off-by: Robert Shaw <robshaw@redhat.com>
Signed-off-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
2026-01-28 03:06:22 -08:00
parent ecb4f82209
commit 247d1a32ea
15 changed files with 2 additions and 2030 deletions
--- a/tests/models/quantization/test_bitblas.py
+++ b/tests/models/quantization/test_bitblas.py
@@ -1,68 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Compare the outputs of a GPTQ model to a bitblas model.
-
-Note: GPTQ and bitblas do not have bitwise correctness.
-As a result, in this test, we just confirm that the top selected tokens of the
-bitblas/GPTQ models are in the top 3 selections of each other.
-
-Note: bitblas internally uses locks to synchronize the threads. This can
-result in very slight nondeterminism for bitblas. As a result, we re-run the
-test up to 3 times to see if we pass.
-"""
-
-from dataclasses import dataclass
-
-import pytest
-
-from ..utils import check_logprobs_close
-
-
-@dataclass
-class ModelPair:
-    model_bitblas: str
-    model_gptq: str
-
-
-model_pairs = [
-    ModelPair(
-        model_bitblas="hxbgsyxh/opt-125m-4bit-128g-bitblas",
-        model_gptq="hxbgsyxh/opt-125m-4bit-128g",
-    ),
-]
-
-
-@pytest.mark.flaky(reruns=2)
-@pytest.mark.skipif(True, reason="BitBLAS takes too much time for tuning.")
-@pytest.mark.parametrize("model_pair", model_pairs)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [32])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_models(
-    vllm_runner,
-    example_prompts,
-    model_pair: ModelPair,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-) -> None:
-    with vllm_runner(
-        model_pair.model_bitblas, dtype=dtype, quantization="bitblas"
-    ) as bitblas_model:
-        bitblas_outputs = bitblas_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs
-        )
-
-    with vllm_runner(
-        model_pair.model_gptq, dtype=dtype, quantization="gptq"
-    ) as gptq_model:
-        gptq_outputs = gptq_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs
-        )
-
-    check_logprobs_close(
-        outputs_0_lst=gptq_outputs,
-        outputs_1_lst=bitblas_outputs,
-        name_0="gptq",
-        name_1="bitblas",
-    )
--- a/tests/models/quantization/test_gptq_bitblas.py
+++ b/tests/models/quantization/test_gptq_bitblas.py
@@ -1,64 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Compare the outputs of a GPTQ model to a bitblas model.
-
-Note: GPTQ and bitblas do not have bitwise correctness.
-As a result, in this test, we just confirm that the top selected tokens of the
-bitblas/GPTQ models are in the top 3 selections of each other.
-
-Note: bitblas internally uses locks to synchronize the threads. This can
-result in very slight nondeterminism for bitblas. As a result, we re-run the
-test up to 3 times to see if we pass.
-"""
-
-from dataclasses import dataclass
-
-import pytest
-
-from ..utils import check_logprobs_close
-
-
-@dataclass
-class ModelPair:
-    model_gptq: str
-
-
-model_pairs = [
-    ModelPair(model_gptq="hxbgsyxh/opt-125m-4bit-128g"),
-]
-
-
-@pytest.mark.flaky(reruns=2)
-@pytest.mark.skipif(True, reason="BitBLAS takes too much time for tuning.")
-@pytest.mark.parametrize("model_pair", model_pairs)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [32])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_models(
-    vllm_runner,
-    example_prompts,
-    model_pair: ModelPair,
-    dtype: str,
-    max_tokens: int,
-    num_logprobs: int,
-) -> None:
-    with vllm_runner(
-        model_pair.model_gptq, dtype=dtype, quantization="bitblas"
-    ) as bitblas_model:
-        bitblas_outputs = bitblas_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs
-        )
-
-    with vllm_runner(
-        model_pair.model_gptq, dtype=dtype, quantization="gptq"
-    ) as gptq_model:
-        gptq_outputs = gptq_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs
-        )
-
-    check_logprobs_close(
-        outputs_0_lst=gptq_outputs,
-        outputs_1_lst=bitblas_outputs,
-        name_0="gptq",
-        name_1="gptq_bitblas",
-    )