[Misc] LoRA + Chunked Prefill (#9057)

Author: Aurick Qiao
Date: 2024-12-10 21:09:20 -05:00 (committed by GitHub)
Parent: 9a93973708
Commit: d5c5154fcf
12 changed files with 49 additions and 20 deletions


@@ -84,7 +84,8 @@ def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
                    tensor_parallel_size=tp_size,
                    gpu_memory_utilization=0.2, #avoid OOM
                    quantization=model.quantization,
-                   trust_remote_code=True)
+                   trust_remote_code=True,
+                   enable_chunked_prefill=True)
 
     if model.quantization is None:
         expected_no_lora_output = [
@@ -176,7 +177,8 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
                        tensor_parallel_size=1,
                        gpu_memory_utilization=0.2, #avoid OOM
                        quantization=model.quantization,
-                       trust_remote_code=True)
+                       trust_remote_code=True,
+                       enable_chunked_prefill=True)
 
     output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1)
     del llm_tp1
@@ -189,7 +191,8 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
                        max_loras=4,
                        tensor_parallel_size=2,
                        gpu_memory_utilization=0.2, #avoid OOM
-                       quantization=model.quantization)
+                       quantization=model.quantization,
+                       enable_chunked_prefill=True)
 
     output_tp2 = do_sample(llm_tp2, tinyllama_lora_files, lora_id=1)
     del llm_tp2
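
The hunks above turn on chunked prefill in the quantized-LoRA tests, exercising the combination this commit enables. Below is a minimal offline-inference sketch of that pairing through vLLM's LLM entry point; the base model name and adapter path are placeholders, not values taken from this commit:

from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

# Engine with LoRA and chunked prefill enabled together (the pairing the
# tests above cover); the memory setting mirrors the test style.
llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # placeholder base model
          enable_lora=True,
          max_loras=4,
          enable_chunked_prefill=True,
          gpu_memory_utilization=0.2,  # avoid OOM, as in the tests
          trust_remote_code=True)

# LoRARequest(name, id, path); the adapter path is a placeholder.
lora = LoRARequest("my_adapter", 1, "/path/to/lora_adapter")

outputs = llm.generate(["What is the capital of France?"],
                       SamplingParams(temperature=0.0, max_tokens=32),
                       lora_request=lora)
print(outputs[0].outputs[0].text)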