[CI/Build] Fix CI LoRA failure (#16270)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
@@ -78,12 +78,7 @@ def do_sample(llm: vllm.LLM,
 
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("tp_size", [1])
-def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
-                          tp_size):
-    if num_gpus_available < tp_size and \
-            tp_size > 1 and current_platform.is_cuda_alike():
-        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
+def test_quant_model_lora(tinyllama_lora_files, model):
 
     llm = vllm.LLM(
         model=model.model_path,
@@ -91,7 +86,6 @@ def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
         max_num_seqs=16,
         max_loras=4,
         max_model_len=400,
-        tensor_parallel_size=tp_size,
         gpu_memory_utilization=0.2, #avoid OOM
         quantization=model.quantization,
         trust_remote_code=True,
@@ -185,7 +179,6 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
         enable_lora=True,
         max_num_seqs=16,
         max_loras=4,
-        tensor_parallel_size=1,
         gpu_memory_utilization=0.2, #avoid OOM
         quantization=model.quantization,
         trust_remote_code=True,
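Because the old parametrization only ever supplied tp_size=1, the tp_size > 1 branch of the skip guard was unreachable and tensor_parallel_size=tp_size always resolved to vLLM's default of 1, so the commit deletes the dead guard and pins the test to the single-GPU path. Below is a minimal sketch of the resulting test, assembled from the hunks above; MODELS, the tinyllama_lora_files fixture, and the rest of the test body are defined elsewhere in the test file and are only stubbed here.

# Sketch only: MODELS and the tinyllama_lora_files fixture come from the
# surrounding test file; the stub below stands in for the real model list.
import pytest
import vllm

MODELS = []  # placeholder for the real list of quantized model configs


@pytest.mark.parametrize("model", MODELS)
def test_quant_model_lora(tinyllama_lora_files, model):
    # No tp_size parametrization and no GPU-count skip guard any more;
    # omitting tensor_parallel_size leaves it at the default of 1.
    llm = vllm.LLM(
        model=model.model_path,
        enable_lora=True,
        max_num_seqs=16,
        max_loras=4,
        max_model_len=400,
        gpu_memory_utilization=0.2,  # avoid OOM
        quantization=model.quantization,
        trust_remote_code=True,
    )

Running just this test locally would look something like pytest tests/lora/test_quant_model.py -k test_quant_model_lora, where the exact file path is an assumption based on the function names above.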