[Core] Introduce SPMD worker execution using Ray accelerated DAG (#6032)

Signed-off-by: Rui Qiao <ruisearch42@gmail.com> Co-authored-by: Stephanie Wang <swang@cs.berkeley.edu>
2024-07-17 22:27:09 -07:00
parent d25877dd9b
commit 61e592747c
8 changed files with 216 additions and 119 deletions
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -34,6 +34,7 @@ if TYPE_CHECKING:
    VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS: bool = False
    VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache")
    VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024
+    VLLM_USE_RAY_SPMD_WORKER: bool = False
    VLLM_USE_RAY_COMPILED_DAG: bool = False
    VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
    VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets")
@@ -261,6 +262,13 @@ environment_variables: Dict[str, Callable[[], Any]] = {
    "VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS":
    lambda: bool(os.getenv("VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", False)),

+    # If the env var is set, all workers execute as separate processes from
+    # the engine, and the same mechanism triggers execution on every worker.
+    # Any non-empty value enables it (even "0"); leave it unset to disable.
+    # Run vLLM with VLLM_USE_RAY_SPMD_WORKER=1 to enable it.
+    "VLLM_USE_RAY_SPMD_WORKER":
+    lambda: bool(os.getenv("VLLM_USE_RAY_SPMD_WORKER", 0)),
+
    # If the env var is set, it uses Ray's compiled DAG API,
    # which reduces control-plane overhead.
    # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.