[Quantization][V1] BitsAndBytes support V1 (#15611)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-03-28 10:12:47 +08:00
parent bd45912b99
commit 726efc6a32
7 changed files with 52 additions and 24 deletions
--- a/tests/quantization/test_bitsandbytes.py
+++ b/tests/quantization/test_bitsandbytes.py
@@ -101,8 +101,6 @@ def test_load_pp_4bit_bnb_model(model_name, description) -> None:
        "--enable-prefix-caching",
        "--quantization",
        "bitsandbytes",
-        "--load-format",
-        "bitsandbytes",
        "--gpu-memory-utilization",
        "0.7",
    ]
@@ -137,7 +135,6 @@ def validate_generated_texts(hf_runner,
    # when using distributed inference
    with vllm_runner(model_name,
                     quantization='bitsandbytes',
-                     load_format='bitsandbytes',
                     tensor_parallel_size=vllm_tp_size,
                     enforce_eager=False) as llm:
        vllm_outputs = llm.generate_greedy(prompts, 8)