[Core] Optimize SPMD architecture with delta + serialization optimization (#7109)
This commit is contained in:
@@ -6,6 +6,8 @@ pytest test_chunked_prefill_distributed.py
|
||||
```
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.utils import cuda_device_count_stateless
|
||||
@@ -30,6 +32,11 @@ def test_models(
|
||||
model: str,
|
||||
distributed_executor_backend: str,
|
||||
) -> None:
|
||||
if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray": # noqa
|
||||
assert distributed_executor_backend == "ray"
|
||||
# Exercise the Ray compiled DAG (aDAG) path with SPMD workers enabled.
|
||||
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
|
||||
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
|
||||
|
||||
dtype = "half"
|
||||
max_tokens = 5
|
||||
|
||||
Reference in New Issue
Block a user