210 lines
6.3 KiB
Python
210 lines
6.3 KiB
Python
|
|
# SPDX-License-Identifier: Apache-2.0
|
||
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||
|
|
"""
|
||
|
|
Orchestration-level integration tests for RayExecutorV2.
|
||
|
|
"""
|
||
|
|
|
||
|
|
import gc
|
||
|
|
import os
|
||
|
|
import pathlib
|
||
|
|
|
||
|
|
import pytest
|
||
|
|
import ray
|
||
|
|
|
||
|
|
# Apply the ``enable_ray_v2_backend`` fixture to every test in this module.
pytestmark = pytest.mark.usefixtures("enable_ray_v2_backend")
|
||
|
|
|
||
|
|
# Model name passed to ``AsyncEngineArgs(model=...)`` by the test actor.
MODEL = "facebook/opt-125m"
|
||
|
|
|
||
|
|
|
||
|
|
def _get_env_var(worker, name):
    """Return the value of environment variable ``name`` in this process.

    Args:
        worker: Ignored by the body; presumably the worker object that
            ``collective_rpc`` passes as the first argument (confirm against
            the executor).
        name: Name of the environment variable to look up.

    Returns:
        The variable's value, or ``None`` when it is not set.
    """
    value = os.getenv(name)
    return value
|
||
|
|
|
||
|
|
|
||
|
|
def _ray_init():
    """Initialise Ray with the project root exported as workers' PYTHONPATH.

    Workers must be able to import the ``tests`` package in order to unpickle
    actor classes defined there. If they cannot, FunctionActorManager falls
    back to TemporaryActor, which drops async method signatures.
    """
    # Two directory levels above the directory containing this file.
    repo_root = pathlib.Path(__file__).resolve().parents[2]
    worker_env = {"env_vars": {"PYTHONPATH": str(repo_root)}}
    # ignore_reinit_error lets several tests call this in one session.
    ray.init(ignore_reinit_error=True, runtime_env=worker_env)
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.fixture
def ray_init():
    """Pytest fixture that initialises Ray via ``_ray_init`` before the test."""
    _ray_init()
    return None
|
||
|
|
|
||
|
|
|
||
|
|
class _AsyncLLMActor:
    """Test helper that owns an ``AsyncLLM`` engine.

    Intended to be wrapped with ``ray.remote`` so each instance lives in its
    own actor process; the environment variables set in ``start`` therefore
    affect only that process.
    """

    def start(self, pg, bundle_indices=None, ray_runtime_env=None):
        """Configure the environment and create the ``AsyncLLM`` engine.

        Args:
            pg: Placement group stored on ``parallel_config.placement_group``.
            bundle_indices: Optional value for ``VLLM_RAY_BUNDLE_INDICES``.
                When ``None`` the variable is removed from this process's
                environment instead.
            ray_runtime_env: Optional runtime env stored on
                ``parallel_config.ray_runtime_env`` when provided.
        """
        os.environ["VLLM_USE_RAY_V2_EXECUTOR_BACKEND"] = "1"
        # Needed so collective_rpc can pickle _get_env_var over the
        # AsyncLLM -> EngineCore ZMQ boundary.
        os.environ["VLLM_ALLOW_INSECURE_SERIALIZATION"] = "1"
        if bundle_indices is not None:
            os.environ["VLLM_RAY_BUNDLE_INDICES"] = bundle_indices
        else:
            os.environ.pop("VLLM_RAY_BUNDLE_INDICES", None)

        # Imported lazily, after the environment variables above are set.
        from vllm.engine.arg_utils import AsyncEngineArgs
        from vllm.v1.engine.async_llm import AsyncLLM
        from vllm.v1.executor.abstract import Executor

        engine_args = AsyncEngineArgs(
            model=MODEL,
            tensor_parallel_size=2,
            distributed_executor_backend="ray",
            enforce_eager=True,
            max_model_len=256,
            gpu_memory_utilization=0.8,
        )
        vllm_config = engine_args.create_engine_config()
        vllm_config.parallel_config.placement_group = pg
        if ray_runtime_env is not None:
            vllm_config.parallel_config.ray_runtime_env = ray_runtime_env

        executor_class = Executor.get_class(vllm_config)
        self.engine = AsyncLLM(
            vllm_config=vllm_config,
            executor_class=executor_class,
            log_stats=False,
            log_requests=False,
        )

    async def generate(self, prompt):
        """Run one generation request and return the text of its first output.

        Args:
            prompt: Prompt forwarded to ``self.engine.generate``.

        Returns:
            ``outputs[0].text`` of the last item yielded by the engine.

        Raises:
            AssertionError: If the engine yields nothing.
        """
        from vllm.sampling_params import SamplingParams

        params = SamplingParams(max_tokens=16)
        result = None
        # Keep only the last yielded item.
        async for output in self.engine.generate(
            prompt, params, request_id="test_request_id"
        ):
            result = output
        assert result is not None
        return result.outputs[0].text

    async def generate_and_get_worker_envs(self, prompt, env_names):
        """Generate text, then query environment variables via collective_rpc.

        Args:
            prompt: Prompt forwarded to ``generate``.
            env_names: Iterable of environment variable names to look up.

        Returns:
            A ``(text, env_results)`` tuple where ``env_results`` maps each
            name to whatever ``collective_rpc(_get_env_var, ...)`` returned
            for it (presumably one value per worker; confirm against the
            engine).
        """
        # Reuse the single generation path instead of duplicating its loop.
        text = await self.generate(prompt)

        env_results = {}
        for name in env_names:
            env_results[name] = await self.engine.collective_rpc(
                _get_env_var, timeout=10, args=(name,)
            )
        return text, env_results

    def shutdown(self):
        """Shut down and drop the engine if one exists; otherwise do nothing."""
        if engine := getattr(self, "engine", None):
            engine.shutdown()
            del self.engine
            gc.collect()
|
||
|
|
|
||
|
|
|
||
|
|
# Ray actor class wrapping ``_AsyncLLMActor``; requests no CPUs and limits the
# actor to one concurrent call.
AsyncLLMActor = ray.remote(num_cpus=0, max_concurrency=1)(_AsyncLLMActor)
|
||
|
|
|
||
|
|
|
||
|
|
def test_multi_replicas(ray_init):
    """Two engine replicas, each on its own placement group, both generate."""
    # One two-bundle placement group per replica.
    groups = [
        ray.util.placement_group([{"GPU": 1, "CPU": 1}] * 2, strategy="PACK")
        for _ in range(2)
    ]
    ray.get([group.ready() for group in groups])

    replicas = [AsyncLLMActor.remote() for _ in groups]

    # Start the replicas one at a time, waiting for each to finish.
    for replica, group in zip(replicas, groups):
        ray.get(replica.start.remote(group))

    # Submit both generation requests before waiting on either.
    pending = [replica.generate.remote("Hello world") for replica in replicas]
    first_text, second_text = ray.get(pending)
    assert len(first_text) > 0
    assert len(second_text) > 0
|
||
|
|
|
||
|
|
|
||
|
|
def test_multi_replicas_with_bundle_indices(ray_init):
    """Two replicas share one four-bundle placement group via bundle indices."""
    shared_pg = ray.util.placement_group(
        [{"GPU": 1, "CPU": 1}] * 4, strategy="PACK"
    )
    ray.get(shared_pg.ready())

    # Bundle-index string handed to each replica, in start order.
    assignments = ("2,1", "0,3")
    replicas = [AsyncLLMActor.remote() for _ in assignments]

    # Start the replicas one at a time, waiting for each to finish.
    for replica, indices in zip(replicas, assignments):
        ray.get(replica.start.remote(shared_pg, bundle_indices=indices))

    # Submit both generation requests before waiting on either.
    pending = [replica.generate.remote("Hello world") for replica in replicas]
    first_text, second_text = ray.get(pending)
    assert len(first_text) > 0
    assert len(second_text) > 0
|
||
|
|
|
||
|
|
|
||
|
|
def test_env_var_and_runtime_env_propagation():
    """
    Verify env vars (NCCL_, HF_) and parallel_config.ray_runtime_env
    propagate to RayWorkerProc actors.

    Sentinel values are written to ``os.environ`` for the duration of the
    test; whatever values those variables held beforehand are restored on
    exit.
    """
    sentinel_vars = {
        "NCCL_DEBUG": "INFO",
        "HF_TOKEN": "test_sentinel_token",
    }
    # Snapshot prior values so a pre-existing HF_TOKEN / NCCL_DEBUG is put
    # back afterwards instead of being deleted for the rest of the session.
    previous_values = {name: os.environ.get(name) for name in sentinel_vars}
    os.environ.update(sentinel_vars)

    try:
        # Called directly (not via the ray_init fixture) because sentinel
        # env vars must be in os.environ before ray.init() so that Ray
        # worker processes inherit them.
        _ray_init()

        pg = ray.util.placement_group([{"GPU": 1, "CPU": 1}] * 2, strategy="PACK")
        ray.get(pg.ready())

        # Include the project root so that RayWorkerProc actors can
        # unpickle _get_env_var.
        project_root = str(pathlib.Path(__file__).resolve().parents[2])
        ray_runtime_env = {
            "env_vars": {
                "RAY_RUNTIME_ENV_TEST": "ray_runtime_env",
                "PYTHONPATH": project_root,
            },
        }

        actor = AsyncLLMActor.remote()
        ray.get(actor.start.remote(pg, ray_runtime_env=ray_runtime_env))

        all_env_names = list(sentinel_vars) + ["RAY_RUNTIME_ENV_TEST"]
        text, env_results = ray.get(
            actor.generate_and_get_worker_envs.remote("Hello world", all_env_names)
        )
        assert len(text) > 0

        for name, expected in sentinel_vars.items():
            worker_values = env_results[name]
            # Guard against a vacuous pass when no worker reported a value.
            assert len(worker_values) > 0, f"no worker results for {name}"
            for val in worker_values:
                assert val == expected

        runtime_env_values = env_results["RAY_RUNTIME_ENV_TEST"]
        assert len(runtime_env_values) > 0, "no worker results for RAY_RUNTIME_ENV_TEST"
        for val in runtime_env_values:
            assert val == "ray_runtime_env"

    finally:
        for name, old_value in previous_values.items():
            if old_value is None:
                os.environ.pop(name, None)
            else:
                os.environ[name] = old_value
|