[Feature][kernel] tensor parallelism with bitsandbytes quantization (#8434)
This commit is contained in:
@@ -64,6 +64,24 @@ def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
model_name)
|
||||
|
||||
|
||||
@pytest.mark.skipif(torch.cuda.device_count() < 2,
|
||||
reason='Test requires at least 2 GPUs.')
|
||||
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
|
||||
reason='bitsandbytes is not supported on this GPU type.')
|
||||
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
|
||||
@fork_new_process_for_each_test
|
||||
def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
|
||||
model_name, description) -> None:
|
||||
|
||||
hf_model_kwargs = {"load_in_4bit": True}
|
||||
validate_generated_texts(hf_runner,
|
||||
vllm_runner,
|
||||
example_prompts[:1],
|
||||
model_name,
|
||||
hf_model_kwargs,
|
||||
vllm_tp_size=2)
|
||||
|
||||
|
||||
def log_generated_texts(prompts, outputs, runner_name):
|
||||
logged_texts = []
|
||||
for i, (_, generated_text) in enumerate(outputs):
|
||||
@@ -80,22 +98,21 @@ def validate_generated_texts(hf_runner,
|
||||
vllm_runner,
|
||||
prompts,
|
||||
model_name,
|
||||
hf_model_kwargs=None):
|
||||
hf_model_kwargs=None,
|
||||
vllm_tp_size=1):
|
||||
|
||||
# NOTE: run vLLM first, as it requires a clean process
|
||||
# when using distributed inference
|
||||
|
||||
#Run with vLLM runner
|
||||
with vllm_runner(model_name,
|
||||
quantization='bitsandbytes',
|
||||
load_format='bitsandbytes',
|
||||
tensor_parallel_size=vllm_tp_size,
|
||||
enforce_eager=True,
|
||||
gpu_memory_utilization=0.8) as llm:
|
||||
vllm_outputs = llm.generate_greedy(prompts, 8)
|
||||
vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")
|
||||
|
||||
# Clean up the GPU memory for the next test
|
||||
torch.cuda.synchronize()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
@@ -108,7 +125,6 @@ def validate_generated_texts(hf_runner,
|
||||
hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")
|
||||
|
||||
# Clean up the GPU memory for the next test
|
||||
torch.cuda.synchronize()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user