[Feat][Executor] Introduce RayExecutorV2 (#36836)
Signed-off-by: Jeffrey Wang <jeffreywang@anyscale.com>
This commit is contained in:
209
tests/distributed/test_ray_v2_executor_e2e.py
Normal file
209
tests/distributed/test_ray_v2_executor_e2e.py
Normal file
@@ -0,0 +1,209 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Orchestration-level integration tests for RayExecutorV2.
|
||||
"""
|
||||
|
||||
import gc
|
||||
import os
|
||||
import pathlib
|
||||
|
||||
import pytest
|
||||
import ray
|
||||
|
||||
# Apply the ``enable_ray_v2_backend`` fixture to every test in this module.
# NOTE(review): the fixture is not defined in this file — presumably it lives
# in a conftest; confirm.
pytestmark = pytest.mark.usefixtures("enable_ray_v2_backend")
|
||||
|
||||
# Model name passed as ``AsyncEngineArgs(model=...)`` in ``_AsyncLLMActor.start``.
MODEL = "facebook/opt-125m"
|
||||
|
||||
|
||||
def _get_env_var(worker, name: str) -> "str | None":
    """Return the value of environment variable ``name`` in this process.

    This function is shipped through ``collective_rpc`` (see
    ``_AsyncLLMActor.generate_and_get_worker_envs``), so it reports the
    environment of whichever process executes it.

    Args:
        worker: Unused. Present because the RPC caller supplies only
            ``name`` via ``args``, so this first positional slot is
            presumably filled by the executing worker — TODO confirm.
        name: Name of the environment variable to look up.

    Returns:
        The variable's value, or ``None`` when it is not set.
    """
    return os.environ.get(name)
|
||||
|
||||
|
||||
def _ray_init():
    """Start Ray with the project root on workers' PYTHONPATH.

    Without this, workers cannot unpickle actor classes defined in the
    ``tests`` package, causing FunctionActorManager to fall back to
    TemporaryActor which drops async method signatures.

    Any ``PYTHONPATH`` already present in this process's environment is
    preserved: the project root is placed in front of it rather than
    replacing it.
    """
    project_root = str(pathlib.Path(__file__).resolve().parents[2])

    # A runtime_env ``env_vars`` entry overrides the same-named variable on
    # the workers, so carry the existing search path along instead of
    # dropping it.
    inherited = os.environ.get("PYTHONPATH")
    python_path = (
        os.pathsep.join((project_root, inherited)) if inherited else project_root
    )

    ray.init(
        # Tolerate an already-running session instead of raising.
        ignore_reinit_error=True,
        runtime_env={"env_vars": {"PYTHONPATH": python_path}},
    )
|
||||
|
||||
|
||||
@pytest.fixture
def ray_init():
    """Start Ray for a single test and shut it down afterwards.

    Yields nothing; tests request this fixture only for its side effect.
    The teardown releases actors and placement groups that the test left
    behind, so their resource reservations cannot starve the next test.
    """
    _ray_init()
    yield
    # Safe to call even if the session is already gone.
    ray.shutdown()
|
||||
|
||||
|
||||
class _AsyncLLMActor:
    """Host one ``AsyncLLM`` engine that uses the Ray executor backend.

    The class is plain Python; the module wraps it with ``ray.remote`` to
    obtain the actor class used by the tests. vLLM imports are done lazily
    inside the methods so they happen in the process that runs the method.
    """

    def start(self, pg, bundle_indices=None, ray_runtime_env=None):
        """Create the engine in the current process.

        Args:
            pg: Placement group stored on
                ``parallel_config.placement_group``.
            bundle_indices: Optional string exported as
                ``VLLM_RAY_BUNDLE_INDICES``; when ``None`` the variable is
                removed from the environment instead.
            ray_runtime_env: Optional value stored on
                ``parallel_config.ray_runtime_env``; left untouched when
                ``None``.
        """
        os.environ["VLLM_USE_RAY_V2_EXECUTOR_BACKEND"] = "1"
        # Needed so collective_rpc can pickle _get_env_var over the
        # AsyncLLM -> EngineCore ZMQ boundary.
        os.environ["VLLM_ALLOW_INSECURE_SERIALIZATION"] = "1"
        if bundle_indices is not None:
            os.environ["VLLM_RAY_BUNDLE_INDICES"] = bundle_indices
        else:
            os.environ.pop("VLLM_RAY_BUNDLE_INDICES", None)

        from vllm.engine.arg_utils import AsyncEngineArgs
        from vllm.v1.engine.async_llm import AsyncLLM
        from vllm.v1.executor.abstract import Executor

        engine_args = AsyncEngineArgs(
            model=MODEL,
            tensor_parallel_size=2,
            distributed_executor_backend="ray",
            enforce_eager=True,
            max_model_len=256,
            gpu_memory_utilization=0.8,
        )
        vllm_config = engine_args.create_engine_config()
        vllm_config.parallel_config.placement_group = pg
        if ray_runtime_env is not None:
            vllm_config.parallel_config.ray_runtime_env = ray_runtime_env

        executor_class = Executor.get_class(vllm_config)
        self.engine = AsyncLLM(
            vllm_config=vllm_config,
            executor_class=executor_class,
            log_stats=False,
            log_requests=False,
        )

    async def generate(self, prompt):
        """Generate up to 16 tokens for ``prompt`` and return the text.

        Returns:
            The text of the first completion in the last output yielded by
            the engine.
        """
        from vllm.sampling_params import SamplingParams

        params = SamplingParams(max_tokens=16)
        result = None
        # Keep only the last yielded output.
        async for output in self.engine.generate(
            prompt, params, request_id="test_request_id"
        ):
            result = output
        assert result is not None
        return result.outputs[0].text

    async def generate_and_get_worker_envs(self, prompt, env_names):
        """Generate text, then read environment variables via collective RPC.

        Args:
            prompt: Prompt forwarded to ``generate``.
            env_names: Iterable of environment variable names to query.

        Returns:
            ``(text, env_results)`` where ``env_results`` maps each name to
            whatever ``collective_rpc`` returned for it.
        """
        # Reuse the single-request path instead of duplicating its loop.
        text = await self.generate(prompt)

        env_results = {}
        for name in env_names:
            env_results[name] = await self.engine.collective_rpc(
                _get_env_var, timeout=10, args=(name,)
            )
        return text, env_results

    def shutdown(self):
        """Shut the engine down and release it; a no-op when none exists."""
        engine = getattr(self, "engine", None)
        if engine is None:
            return
        engine.shutdown()
        del self.engine
        # Also drop the local reference; otherwise the engine object is
        # still reachable while the collection below runs.
        del engine
        gc.collect()
|
||||
|
||||
|
||||
# Ray actor class built from ``_AsyncLLMActor``; it requests zero CPUs and is
# declared with ``max_concurrency=1``.
AsyncLLMActor = ray.remote(num_cpus=0, max_concurrency=1)(_AsyncLLMActor)
|
||||
|
||||
|
||||
def test_multi_replicas(ray_init):
    """Run two engine replicas, each on its own two-bundle placement group.

    Both replicas must return non-empty text for the same prompt. Engines,
    actors and placement groups are released in a ``finally`` block so a
    failure here does not leave GPU reservations behind.
    """
    pg1 = ray.util.placement_group([{"GPU": 1, "CPU": 1}] * 2, strategy="PACK")
    pg2 = ray.util.placement_group([{"GPU": 1, "CPU": 1}] * 2, strategy="PACK")
    actors = []
    try:
        ray.get([pg1.ready(), pg2.ready()])

        actor1 = AsyncLLMActor.remote()
        actor2 = AsyncLLMActor.remote()
        actors += [actor1, actor2]

        ray.get(actor1.start.remote(pg1))
        ray.get(actor2.start.remote(pg2))

        out1, out2 = ray.get(
            [
                actor1.generate.remote("Hello world"),
                actor2.generate.remote("Hello world"),
            ]
        )
        assert len(out1) > 0
        assert len(out2) > 0
    finally:
        try:
            # Let each engine shut down cleanly before the actor is killed.
            ray.get([actor.shutdown.remote() for actor in actors])
        finally:
            for actor in actors:
                ray.kill(actor)
            for pg in (pg1, pg2):
                ray.util.remove_placement_group(pg)
|
||||
|
||||
|
||||
def test_multi_replicas_with_bundle_indices(ray_init):
    """Run two engine replicas that share one four-bundle placement group.

    Each replica is given a disjoint pair of bundle indices ("2,1" and
    "0,3") and must return non-empty text. Engines, actors and the
    placement group are released in a ``finally`` block so a failure here
    does not leave GPU reservations behind.
    """
    pg = ray.util.placement_group([{"GPU": 1, "CPU": 1}] * 4, strategy="PACK")
    actors = []
    try:
        ray.get(pg.ready())

        actor1 = AsyncLLMActor.remote()
        actor2 = AsyncLLMActor.remote()
        actors += [actor1, actor2]

        ray.get(actor1.start.remote(pg, bundle_indices="2,1"))
        ray.get(actor2.start.remote(pg, bundle_indices="0,3"))

        out1, out2 = ray.get(
            [
                actor1.generate.remote("Hello world"),
                actor2.generate.remote("Hello world"),
            ]
        )
        assert len(out1) > 0
        assert len(out2) > 0
    finally:
        try:
            # Let each engine shut down cleanly before the actor is killed.
            ray.get([actor.shutdown.remote() for actor in actors])
        finally:
            for actor in actors:
                ray.kill(actor)
            ray.util.remove_placement_group(pg)
|
||||
|
||||
|
||||
def test_env_var_and_runtime_env_propagation():
    """
    Verify env vars (NCCL_, HF_) and parallel_config.ray_runtime_env
    propagate to RayWorkerProc actors.

    The sentinel variables are restored to their previous values (or
    removed if they were unset) when the test finishes, and the Ray session
    started here is shut down so it does not carry the sentinels into
    later tests.
    """
    sentinel_vars = {
        "NCCL_DEBUG": "INFO",
        "HF_TOKEN": "test_sentinel_token",
    }
    # Remember what was there before so the caller's environment survives.
    previous_env = {name: os.environ.get(name) for name in sentinel_vars}
    os.environ.update(sentinel_vars)

    actor = None
    try:
        # Called directly (not via the ray_init fixture) because sentinel
        # env vars must be in os.environ before ray.init() so that Ray
        # worker processes inherit them. _ray_init() tolerates an existing
        # session, which would make it a silent no-op, so make sure none
        # is running first.
        if ray.is_initialized():
            ray.shutdown()
        _ray_init()

        pg = ray.util.placement_group([{"GPU": 1, "CPU": 1}] * 2, strategy="PACK")
        ray.get(pg.ready())

        # Include the project root so that RayWorkerProc actors can
        # unpickle _get_env_var.
        project_root = str(pathlib.Path(__file__).resolve().parents[2])
        ray_runtime_env = {
            "env_vars": {
                "RAY_RUNTIME_ENV_TEST": "ray_runtime_env",
                "PYTHONPATH": project_root,
            },
        }

        actor = AsyncLLMActor.remote()
        ray.get(actor.start.remote(pg, ray_runtime_env=ray_runtime_env))

        all_env_names = list(sentinel_vars) + ["RAY_RUNTIME_ENV_TEST"]
        text, env_results = ray.get(
            actor.generate_and_get_worker_envs.remote("Hello world", all_env_names)
        )
        assert len(text) > 0

        expected_by_name = {**sentinel_vars, "RAY_RUNTIME_ENV_TEST": "ray_runtime_env"}
        for name, expected in expected_by_name.items():
            vals = env_results[name]
            # An empty reply must fail rather than pass vacuously.
            assert len(vals) > 0, f"no worker reported a value for {name}"
            for val in vals:
                assert val == expected

    finally:
        try:
            if actor is not None:
                # Let the engine shut down cleanly before Ray goes away.
                ray.get(actor.shutdown.remote())
        finally:
            try:
                # Also releases the actor and the placement group.
                ray.shutdown()
            finally:
                for name, old_value in previous_env.items():
                    if old_value is None:
                        os.environ.pop(name, None)
                    else:
                        os.environ[name] = old_value
|
||||
Reference in New Issue
Block a user