[Core] Pipeline Parallel Support (#4412)

Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
Author: Murali Andoorveedu
Date: 2024-07-02 10:58:08 -07:00
Committed by: GitHub
Parent: 15aba081f3
Commit: c5832d2ae9
82 changed files with 1096 additions and 400 deletions

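The hunks below all make the same mechanical change: with pipeline parallel support, the engine keeps one scheduler per virtual engine, so `llm_engine.scheduler` is now a list and the tests index its first entry. A minimal illustrative sketch of the pattern follows; it is not vLLM's actual engine code, and the stub classes and the `pipeline_parallel_size` constructor argument are assumptions for illustration only.

# Minimal illustrative sketch, not vLLM's engine implementation.
class Scheduler:
    def __init__(self) -> None:
        self.artificial_preempt_cnt = 0
        self.num_cumulative_preemption = 0


class LLMEngine:
    def __init__(self, pipeline_parallel_size: int = 1) -> None:
        # One scheduler per virtual engine; the real engine derives
        # this count from its parallel configuration.
        self.scheduler = [Scheduler() for _ in range(pipeline_parallel_size)]


engine = LLMEngine(pipeline_parallel_size=2)

# Before this commit the tests read a single scheduler object:
#     engine.scheduler.artificial_preempt_cnt
# After it they index the first virtual engine's scheduler:
assert engine.scheduler[0].artificial_preempt_cnt == 0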

@@ -56,8 +56,8 @@ def test_chunked_prefill_recompute(
             max_num_seqs=max_num_seqs,
     ) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-                ARTIFICIAL_PREEMPTION_MAX_CNT)
+        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+                < ARTIFICIAL_PREEMPTION_MAX_CNT)
 
     for i in range(len(example_prompts)):
         hf_output_ids, hf_output_str = hf_outputs[i]
@@ -91,10 +91,10 @@ def test_preemption(
             disable_log_stats=False,
     ) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-                ARTIFICIAL_PREEMPTION_MAX_CNT)
+        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+                < ARTIFICIAL_PREEMPTION_MAX_CNT)
         total_preemption = (
-            vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
+            vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption)
 
     check_outputs_equal(
         outputs_0_lst=hf_outputs,
@@ -147,10 +147,10 @@ def test_swap(
     ) as vllm_model:
         vllm_outputs = vllm_model.generate_beam_search(example_prompts,
                                                        beam_width, max_tokens)
-        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-                ARTIFICIAL_PREEMPTION_MAX_CNT)
+        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+                < ARTIFICIAL_PREEMPTION_MAX_CNT)
         total_preemption = (
-            vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
+            vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption)
 
     for i in range(len(example_prompts)):
         hf_output_ids, _ = hf_outputs[i]
@@ -214,8 +214,8 @@ def test_swap_infeasible(
             example_prompts,
             sampling_params=sampling_params,
         )
-        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-                ARTIFICIAL_PREEMPTION_MAX_CNT)
+        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+                < ARTIFICIAL_PREEMPTION_MAX_CNT)
 
     # Verify the request is ignored and not hang.
     assert req_outputs[0].outputs[0].finish_reason == "length"
@@ -252,8 +252,8 @@ def test_preemption_infeasible(
             sampling_params=sampling_params,
         )
-        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-            ARTIFICIAL_PREEMPTION_MAX_CNT)
+        assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+                < ARTIFICIAL_PREEMPTION_MAX_CNT)
 
     # Verify the request is ignored and not hang.
     for req_output in req_outputs:
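Note that every assertion above inspects only `scheduler[0]`. For a run with more than one virtual engine, an aggregate check would sum over the whole list; a hedged sketch, where the helper name is ours and not part of vLLM:

# Hedged sketch: aggregate the counter across all virtual engines'
# schedulers instead of reading only scheduler[0]. Assumes
# `engine.scheduler` is the list introduced by this commit.
def total_artificial_preemptions(engine) -> int:
    return sum(s.artificial_preempt_cnt for s in engine.scheduler)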