[Hardware][CPU] Multi-LoRA implementation for the CPU backend (#11100)

Signed-off-by: Akshat Tripathi <akshat@krai.ai>
Signed-off-by: Oleg Mosalov <oleg@krai.ai>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Oleg Mosalov <oleg@krai.ai>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
Author: Akshat Tripathi
Date:   2025-01-12 13:01:52 +00:00
Parent: f967e51f38
Commit: 8bddb73512
25 changed files with 855 additions and 193 deletions

@@ -72,7 +72,8 @@ def do_sample(llm: vllm.LLM,
 @pytest.mark.parametrize("tp_size", [1])
 def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
                           tp_size):
-    if num_gpus_available < tp_size:
+    if num_gpus_available < tp_size and \
+            tp_size > 1 and current_platform.is_cuda_alike():
         pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
     llm = vllm.LLM(
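
The hunk above narrows the skip condition: the quantized-model LoRA test is now skipped only when tensor parallelism is actually requested (tp_size > 1) on a CUDA-alike platform, so with tp_size == 1 the test also runs against the CPU backend. For context, here is a minimal, hypothetical sketch of multi-LoRA offline inference with vLLM's LLM API; the model name, adapter name, and adapter path are placeholders and are not part of this commit. On a CPU-only build, the same request path exercises the CPU Multi-LoRA support added here.

```python
# Hypothetical usage sketch (not from this commit): serving a LoRA adapter
# through vLLM's offline API. Model and adapter paths are placeholders.
import vllm
from vllm.lora.request import LoRARequest

llm = vllm.LLM(
    model="meta-llama/Llama-2-7b-hf",  # placeholder base model
    enable_lora=True,
    max_loras=2,       # adapters resident per batch
    max_lora_rank=8,
)

prompts = ["Give me the SQL for listing all users."]
sampling_params = vllm.SamplingParams(temperature=0.0, max_tokens=64)

# Each request may name a different adapter; on a CPU build this path
# goes through the CPU backend instead of the CUDA punica kernels.
outputs = llm.generate(
    prompts,
    sampling_params,
    lora_request=LoRARequest("sql_adapter", 1, "/path/to/sql_lora"),
)
print(outputs[0].outputs[0].text)
```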