[Kernel][RFC] Refactor the punica kernel based on Triton (#5036)

2024-08-01 08:12:24 +08:00
parent 7eb0cb4a14
commit 7ecee34321
47 changed files with 3177 additions and 4366 deletions
--- a/tests/lora/test_quant_model.py
+++ b/tests/lora/test_quant_model.py
@@ -64,14 +64,16 @@ def test_quant_model_lora(tinyllama_lora_files, model, tp_size):
    # if torch.cuda.device_count() < tp_size:
    #     pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")

-    llm = vllm.LLM(model=model.model_path,
-                   enable_lora=True,
-                   max_num_seqs=16,
-                   max_loras=4,
-                   max_model_len=400,
-                   tensor_parallel_size=tp_size,
-                   quantization=model.quantization,
-                   trust_remote_code=True)
+    llm = vllm.LLM(
+        model=model.model_path,
+        enable_lora=True,
+        max_num_seqs=16,
+        max_loras=4,
+        max_model_len=400,
+        tensor_parallel_size=tp_size,
+        gpu_memory_utilization=0.2,  #avoid OOM
+        quantization=model.quantization,
+        trust_remote_code=True)

    if model.quantization is None:
        expected_no_lora_output = [
@@ -156,24 +158,28 @@ def test_quant_model_tp_equality(tinyllama_lora_files, model):
    # if torch.cuda.device_count() < 2:
    #     pytest.skip(f"Not enough GPUs for tensor parallelism {2}")

-    llm_tp1 = vllm.LLM(model=model.model_path,
-                       enable_lora=True,
-                       max_num_seqs=16,
-                       max_loras=4,
-                       tensor_parallel_size=1,
-                       quantization=model.quantization,
-                       trust_remote_code=True)
+    llm_tp1 = vllm.LLM(
+        model=model.model_path,
+        enable_lora=True,
+        max_num_seqs=16,
+        max_loras=4,
+        tensor_parallel_size=1,
+        gpu_memory_utilization=0.2,  #avoid OOM
+        quantization=model.quantization,
+        trust_remote_code=True)
    output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1)

    del llm_tp1
    cleanup()

-    llm_tp2 = vllm.LLM(model=model.model_path,
-                       enable_lora=True,
-                       max_num_seqs=16,
-                       max_loras=4,
-                       tensor_parallel_size=2,
-                       quantization=model.quantization)
+    llm_tp2 = vllm.LLM(
+        model=model.model_path,
+        enable_lora=True,
+        max_num_seqs=16,
+        max_loras=4,
+        tensor_parallel_size=2,
+        gpu_memory_utilization=0.2,  #avoid OOM
+        quantization=model.quantization)
    output_tp2 = do_sample(llm_tp2, tinyllama_lora_files, lora_id=1)

    del llm_tp2