[Feature][kernel] tensor parallelism with bitsandbytes quantization (#8434)

chenqianfzh
2024-09-17 08:09:12 -07:00
committed by GitHub
parent 1009e93c5d
commit 9855b99502
4 changed files with 80 additions and 17 deletions
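
As context for the diff below: this change lets a bitsandbytes-quantized model be sharded across GPUs with tensor parallelism. A minimal usage sketch against the vLLM offline API, mirroring the engine arguments the new test passes to vllm_runner; the model name and prompt are illustrative placeholders, not taken from this commit:

from vllm import LLM, SamplingParams

# Load a 4-bit bitsandbytes checkpoint and shard it over two GPUs.
# quantization/load_format/tensor_parallel_size/enforce_eager/
# gpu_memory_utilization mirror the test below; the model is a placeholder.
llm = LLM(model="huggyllama/llama-7b",
          quantization="bitsandbytes",
          load_format="bitsandbytes",
          tensor_parallel_size=2,
          enforce_eager=True,
          gpu_memory_utilization=0.8)

outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.0, max_tokens=8))
print(outputs[0].outputs[0].text)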


@@ -64,6 +64,24 @@ def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                             model_name)


@pytest.mark.skipif(torch.cuda.device_count() < 2,
                    reason='Test requires at least 2 GPUs.')
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                    reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@fork_new_process_for_each_test
def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                                model_name, description) -> None:

    hf_model_kwargs = {"load_in_4bit": True}
    validate_generated_texts(hf_runner,
                             vllm_runner,
                             example_prompts[:1],
                             model_name,
                             hf_model_kwargs,
                             vllm_tp_size=2)


def log_generated_texts(prompts, outputs, runner_name):
    logged_texts = []
    for i, (_, generated_text) in enumerate(outputs):
@@ -80,22 +98,21 @@ def validate_generated_texts(hf_runner,
                             vllm_runner,
                             prompts,
                             model_name,
                             hf_model_kwargs=None):
                             hf_model_kwargs=None,
                             vllm_tp_size=1):

    # NOTE: run vLLM first, as it requires a clean process
    # when using distributed inference

    # Run with vLLM runner
    with vllm_runner(model_name,
                     quantization='bitsandbytes',
                     load_format='bitsandbytes',
                     tensor_parallel_size=vllm_tp_size,
                     enforce_eager=True,
                     gpu_memory_utilization=0.8) as llm:
        vllm_outputs = llm.generate_greedy(prompts, 8)
        vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")

    # Clean up the GPU memory for the next test
    torch.cuda.synchronize()
    gc.collect()
    torch.cuda.empty_cache()
@@ -108,7 +125,6 @@ def validate_generated_texts(hf_runner,
    hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")

    # Clean up the GPU memory for the next test
    torch.cuda.synchronize()
    gc.collect()
    torch.cuda.empty_cache()
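
The HuggingFace reference side of the test only passes hf_model_kwargs = {"load_in_4bit": True} through to hf_runner. A minimal stand-alone sketch of an equivalent transformers load, written with BitsAndBytesConfig rather than the bare load_in_4bit kwarg; the model name is again a placeholder and this code is not part of the diff:

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Equivalent of hf_model_kwargs = {"load_in_4bit": True} in the test,
# expressed via BitsAndBytesConfig; the model name is a placeholder.
model_name = "huggyllama/llama-7b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map="auto")

# Greedy decoding, analogous to generate_greedy with 8 new tokens.
inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=8, do_sample=False)
print(tokenizer.decode(out[0], skip_special_tokens=True))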