[Core] Optimize SPMD architecture with delta + serialization optimization (#7109)

2024-08-18 17:57:20 -07:00
parent 200a2ffa6b
commit ff7ec82c4d
36 changed files with 722 additions and 346 deletions
--- a/tests/distributed/test_chunked_prefill_distributed.py
+++ b/tests/distributed/test_chunked_prefill_distributed.py
@@ -6,6 +6,8 @@ pytest test_chunked_prefill_distributed.py
 ```
 """

+import os
+
 import pytest

 from vllm.utils import cuda_device_count_stateless
@@ -30,6 +32,11 @@ def test_models(
    model: str,
    distributed_executor_backend: str,
 ) -> None:
+    if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray":  # noqa
+        assert distributed_executor_backend == "ray"
+        # test ray adag
+        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
+        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"

    dtype = "half"
    max_tokens = 5