[Core] Optimize SPMD architecture with delta + serialization optimization (#7109)

SangBin Cho
2024-08-18 17:57:20 -07:00
committed by GitHub
parent 200a2ffa6b
commit ff7ec82c4d
36 changed files with 722 additions and 346 deletions
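
Context for the title: "delta" here means that, once a request has been shipped to the Ray SPMD workers in full, later scheduler steps only send the fields that changed, and "serialization optimization" refers to making that driver-to-worker payload cheaper to encode. The sketch below illustrates the delta idea only; every name in it (CachedSeqGroupMetadata, SeqGroupMetadataDelta, apply_delta) is hypothetical and does not reflect vLLM's actual API.

from dataclasses import dataclass
from typing import Dict, List, Optional


@dataclass
class CachedSeqGroupMetadata:
    """Worker-side copy of a request's metadata (hypothetical type)."""
    request_id: str
    prompt_token_ids: List[int]          # large payload, sent once in full
    block_tables: Dict[int, List[int]]   # grows as new KV blocks are allocated
    do_sample: bool = True


@dataclass
class SeqGroupMetadataDelta:
    """Only the fields that changed since the previous step (hypothetical)."""
    request_id: str
    new_block_tables: Optional[Dict[int, List[int]]] = None
    do_sample: Optional[bool] = None


def apply_delta(cache: Dict[str, CachedSeqGroupMetadata],
                delta: SeqGroupMetadataDelta) -> CachedSeqGroupMetadata:
    """Update the cached entry from a delta instead of resending everything."""
    entry = cache[delta.request_id]
    if delta.new_block_tables is not None:
        entry.block_tables = delta.new_block_tables
    if delta.do_sample is not None:
        entry.do_sample = delta.do_sample
    return entry


if __name__ == "__main__":
    cache = {"req-0": CachedSeqGroupMetadata(
        request_id="req-0",
        prompt_token_ids=list(range(512)),   # shipped only on the first step
        block_tables={0: [0, 1]})}
    # A later step serializes only the changed block table, not the prompt.
    apply_delta(cache, SeqGroupMetadataDelta("req-0",
                                             new_block_tables={0: [0, 1, 2]}))
    assert cache["req-0"].block_tables == {0: [0, 1, 2]}

Preemption is the interesting case for such incremental state, since recompute and swap reset or move a request's cached data; that is exactly what the test changes below exercise with the SPMD worker enabled.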

tests/basic_correctness/test_preemption.py

@@ -8,6 +8,7 @@ pytest tests/basic_correctness/test_preemption.py`.
import pytest
from prometheus_client import REGISTRY
import vllm.envs as envs
from vllm import SamplingParams
from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
ENABLE_ARTIFICIAL_PREEMPT)
@@ -24,6 +25,13 @@ assert ENABLE_ARTIFICIAL_PREEMPT is True, (
"tests/basic_correctness/test_preemption.py`")
@pytest.fixture
def worker_use_ray() -> bool:
    # When the SPMD worker is used, set worker_use_ray=True
    # to test that the delta input optimization works with preemption.
    return envs.VLLM_USE_RAY_SPMD_WORKER
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [96])
@@ -36,6 +44,7 @@ def test_chunked_prefill_recompute(
dtype: str,
max_tokens: int,
chunked_prefill_token_size: int,
worker_use_ray: bool,
) -> None:
"""Ensure that chunked prefill works with preemption."""
max_num_seqs = min(chunked_prefill_token_size, 256)
@@ -54,6 +63,7 @@ def test_chunked_prefill_recompute(
max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=enable_chunked_prefill,
max_num_seqs=max_num_seqs,
worker_use_ray=worker_use_ray,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
@@ -79,6 +89,7 @@ def test_preemption(
model: str,
dtype: str,
max_tokens: int,
worker_use_ray: bool,
) -> None:
"""By default, recompute preemption is enabled"""
@@ -89,6 +100,7 @@ def test_preemption(
model,
dtype=dtype,
disable_log_stats=False,
worker_use_ray=worker_use_ray,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
@@ -132,6 +144,7 @@ def test_swap(
dtype: str,
max_tokens: int,
beam_width: int,
worker_use_ray: bool,
) -> None:
"""Use beam search enables swapping."""
example_prompts = example_prompts[:1]
@@ -144,6 +157,7 @@ def test_swap(
dtype=dtype,
swap_space=10,
disable_log_stats=False,
worker_use_ray=worker_use_ray,
) as vllm_model:
vllm_outputs = vllm_model.generate_beam_search(example_prompts,
beam_width, max_tokens)
@@ -188,6 +202,7 @@ def test_swap_infeasible(
dtype: str,
max_tokens: int,
beam_width: int,
worker_use_ray: bool,
) -> None:
"""Verify infeasible swap request will be ignored."""
BLOCK_SIZE = 16
@@ -204,6 +219,7 @@ def test_swap_infeasible(
# decode blocks are not enough to finish.
num_gpu_blocks_override=prefill_blocks + decode_blocks,
max_model_len=(prefill_blocks + decode_blocks) * BLOCK_SIZE,
worker_use_ray=worker_use_ray,
) as vllm_model:
sampling_params = SamplingParams(n=beam_width,
use_beam_search=True,
@@ -230,6 +246,7 @@ def test_preemption_infeasible(
model: str,
dtype: str,
max_tokens: int,
worker_use_ray: bool,
) -> None:
"""Verify infeasible preemption request will be ignored."""
BLOCK_SIZE = 16
@@ -244,6 +261,7 @@ def test_preemption_infeasible(
# ignored instead of hanging forever.
num_gpu_blocks_override=prefill_blocks + decode_blocks // 2,
max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE),
worker_use_ray=worker_use_ray,
) as vllm_model:
sampling_params = SamplingParams(max_tokens=max_tokens,
ignore_eos=True)
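
Both infeasible tests above size the KV cache by hand through num_gpu_blocks_override and cap max_model_len to match. The worked example below only spells out that arithmetic; prefill_blocks and decode_blocks are defined outside the hunks shown, so the values used here are assumptions for illustration.

# Block arithmetic behind the infeasible tests above (assumed values).
BLOCK_SIZE = 16
prefill_blocks = 2   # assumption for illustration
decode_blocks = 2    # assumption for illustration

# test_swap_infeasible: enough blocks for the prompt, but, per the test's own
# comment, the decode blocks are not enough to finish the beam-search decode.
swap_pool = prefill_blocks + decode_blocks                     # 4 blocks
swap_max_len = (prefill_blocks + decode_blocks) * BLOCK_SIZE   # 64 tokens

# test_preemption_infeasible: only half the decode blocks, so the infeasible
# request should be ignored instead of hanging forever (per the test comment).
preempt_pool = prefill_blocks + decode_blocks // 2                     # 3 blocks
preempt_max_len = (prefill_blocks + decode_blocks // 2) * BLOCK_SIZE   # 48 tokens

print(swap_pool, swap_max_len, preempt_pool, preempt_max_len)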