[Feat][Executor] Introduce RayExecutorV2 (#36836)
Signed-off-by: Jeffrey Wang <jeffreywang@anyscale.com>
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import os
|
||||
import random
|
||||
|
||||
import msgspec
|
||||
@@ -166,3 +167,31 @@ class MockSubscriber:
|
||||
self.sub.close()
|
||||
for replay in self.replay_sockets:
|
||||
replay.close()
|
||||
|
||||
|
||||
@pytest.fixture
def enable_ray_v2_backend():
    """Set env vars for the Ray V2 executor backend and shut down Ray
    between tests."""
    import ray

    # Remember the caller's values so they can be restored afterwards;
    # a value of None means "key was unset".
    keys = ("VLLM_USE_RAY_V2_EXECUTOR_BACKEND", "VLLM_ENABLE_V1_MULTIPROCESSING")
    saved = {key: os.environ.get(key) for key in keys}

    os.environ["VLLM_USE_RAY_V2_EXECUTOR_BACKEND"] = "1"
    os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
    if ray.is_initialized():
        ray.shutdown()
    try:
        yield
    finally:
        if ray.is_initialized():
            ray.shutdown()
        # Restore the original environment: put saved values back and
        # drop keys that were previously unset.
        for key, value in saved.items():
            if value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = value
|
||||
119
tests/distributed/test_mq_tcp_multinode.py
Normal file
119
tests/distributed/test_mq_tcp_multinode.py
Normal file
@@ -0,0 +1,119 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Multi-node integration test for MessageQueue TCP fallback.
|
||||
|
||||
Verifies that when writer and readers span separate nodes (Docker containers
|
||||
with isolated /dev/shm), `create_from_process_group` correctly detects
|
||||
cross-node ranks via `in_the_same_node_as()` and falls back to ZMQ TCP
|
||||
transport — and that data actually arrives.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import torch.distributed as dist
|
||||
|
||||
from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
|
||||
from vllm.distributed.parallel_state import in_the_same_node_as
|
||||
|
||||
|
||||
def main():
    """Run the MessageQueue TCP-fallback checks on every rank.

    Launched once per node/container via a torch.distributed launcher; all
    ranks execute the same code, branching on whether they are the writer
    (rank 0) or a reader.
    """
    # gloo works on CPU-only containers; no GPUs required for this test.
    dist.init_process_group(backend="gloo")

    rank = dist.get_rank()
    world_size = dist.get_world_size()
    assert world_size >= 2, (
        f"Need at least 2 ranks across nodes, got world_size={world_size}"
    )

    # Verify that in_the_same_node_as detects cross-node correctly
    status = in_the_same_node_as(dist.group.WORLD, source_rank=0)
    local_count = sum(status)
    print(
        f"[Rank {rank}] in_the_same_node_as(source=0): {status} "
        f"(local={local_count}/{world_size})"
    )
    # With 2 Docker containers (1 proc each), rank 0 and rank 1
    # should be on different nodes.
    assert local_count < world_size, (
        f"Expected cross-node ranks but all {world_size} ranks appear local."
    )

    # Create MessageQueue
    writer_rank = 0
    mq = MessageQueue.create_from_process_group(
        dist.group.WORLD,
        max_chunk_bytes=1024 * 1024,  # 1 MiB
        max_chunks=10,
        writer_rank=writer_rank,
    )

    # Verify the transport path selection
    if rank == writer_rank:
        print(
            f"[Rank {rank}] Writer: n_local_reader={mq.n_local_reader}, "
            f"n_remote_reader={mq.n_remote_reader}"
        )
        assert mq.n_remote_reader > 0, (
            "Writer should have at least 1 remote (TCP) reader in a multi-node setup."
        )
    else:
        # status[rank] tells this reader whether it shares a node with rank 0.
        if status[rank]:
            assert mq._is_local_reader, (
                f"Rank {rank} is on the same node as writer but is not a local reader."
            )
            print(f"[Rank {rank}] Reader: local (shared memory)")
        else:
            assert mq._is_remote_reader, (
                f"Rank {rank} is on a different node but is not a remote (TCP) reader."
            )
            print(f"[Rank {rank}] Reader: remote (TCP)")

    # Test data transfer: simple objects
    dist.barrier()
    if rank == writer_rank:
        mq.enqueue("hello_from_node0")
    else:
        msg = mq.dequeue(timeout=10)
        assert msg == "hello_from_node0"
    dist.barrier()
    print(f"[Rank {rank}] Simple object test passed")

    # Test data transfer: numpy arrays
    # Same seed on every rank, so readers regenerate the writer's exact
    # arrays locally and can compare element-for-element.
    np.random.seed(42)
    arrays = [
        np.random.randint(0, 100, size=np.random.randint(100, 5000)) for _ in range(100)
    ]

    dist.barrier()
    if rank == writer_rank:
        for arr in arrays:
            mq.enqueue(arr)
    else:
        for i, expected in enumerate(arrays):
            received = mq.dequeue(timeout=10)
            assert np.array_equal(expected, received), (
                f"Array mismatch at index {i}: "
                f"expected shape {expected.shape}, got shape {received.shape}"
            )
    dist.barrier()
    print(f"[Rank {rank}] Numpy array test passed")

    # Test data transfer: large payload (> max_chunk_bytes)
    dist.barrier()
    big_array = np.zeros(200_000, dtype=np.int64)  # ~1.6 MiB > 1 MiB chunk
    if rank == writer_rank:
        mq.enqueue(big_array)
    else:
        received = mq.dequeue(timeout=10)
        assert np.array_equal(big_array, received)
    dist.barrier()
    print(f"[Rank {rank}] Large payload test passed")

    # Done -- cleanup
    dist.barrier()
    print(f"[Rank {rank}] All MessageQueue TCP multi-node tests passed!")
    dist.destroy_process_group()
|
||||
|
||||
# Run as a script (one process per rank) by the multi-node test harness.
if __name__ == "__main__":
    main()
|
||||
345
tests/distributed/test_ray_v2_executor.py
Normal file
345
tests/distributed/test_ray_v2_executor.py
Normal file
@@ -0,0 +1,345 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
"""
|
||||
Integration tests for RayExecutorV2 at the executor level.
|
||||
Validates executor initialization, placement group support, RPC calls,
|
||||
and distributed execution with various TP/PP configurations.
|
||||
"""
|
||||
|
||||
import gc
|
||||
import threading
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
import ray
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.v1.executor.ray_executor_v2 import RayExecutorV2
|
||||
|
||||
pytestmark = pytest.mark.usefixtures("enable_ray_v2_backend")
|
||||
|
||||
MODEL = "facebook/opt-125m"
|
||||
|
||||
|
||||
def create_vllm_config(
    tensor_parallel_size: int = 1,
    pipeline_parallel_size: int = 1,
    max_model_len: int = 256,
    gpu_memory_utilization: float = 0.3,
    placement_group=None,
) -> VllmConfig:
    """Build a small VllmConfig for MODEL using the Ray executor backend.

    If *placement_group* is provided, it is attached to the parallel config
    so the executor schedules workers inside that placement group.
    """
    args = EngineArgs(
        model=MODEL,
        tensor_parallel_size=tensor_parallel_size,
        pipeline_parallel_size=pipeline_parallel_size,
        max_model_len=max_model_len,
        gpu_memory_utilization=gpu_memory_utilization,
        distributed_executor_backend="ray",
        enforce_eager=True,
    )
    config = args.create_engine_config()

    if placement_group is not None:
        config.parallel_config.placement_group = placement_group

    return config
||||
|
||||
|
||||
def ensure_ray_initialized():
    """Start Ray unless it is already running (safe to call repeatedly)."""
    if ray.is_initialized():
        return
    ray.init(ignore_reinit_error=True)
||||
|
||||
|
||||
@pytest.fixture
def create_placement_group(request):
    """Create a PACK placement group with ``request.param`` 1-GPU bundles,
    wait until it is ready, and remove it after the test."""
    ensure_ray_initialized()
    num_bundles = request.param
    bundles = [{"GPU": 1, "CPU": 1} for _ in range(num_bundles)]
    pg = ray.util.placement_group(bundles, strategy="PACK")
    ray.get(pg.ready())
    yield pg
    ray.util.remove_placement_group(pg)
||||
|
||||
|
||||
@pytest.fixture
def executor(request):
    """Create a RayExecutorV2 and shut it down after the test."""
    # ``request.param`` carries the VllmConfig from indirect parametrization.
    instance = RayExecutorV2(vllm_config=request.param)
    yield instance
    instance.shutdown()
||||
|
||||
|
||||
def assert_executor(executor, tp_size, pp_size):
    """Common assertions for executor initialization tests."""
    world_size = tp_size * pp_size
    # The rank producing final output is the first rank of the last PP stage.
    expected_output_rank = (pp_size - 1) * tp_size

    assert executor.world_size == world_size
    assert len(executor.ray_worker_handles) == world_size
    assert len(executor.response_mqs) == world_size
    assert executor._get_output_rank() == expected_output_rank

    # Pipeline parallelism allows one in-flight batch per stage.
    if pp_size > 1:
        assert executor.max_concurrent_batches == pp_size

    executor.check_health()
    assert not executor.is_failed

    # Worker ranks must form a contiguous 0..world_size-1 range.
    observed_ranks = sorted(handle.rank for handle in executor.ray_worker_handles)
    assert observed_ranks == list(range(world_size))

    # Every worker must have been placed on some node.
    for handle in executor.ray_worker_handles:
        assert handle.node_id is not None
||||
|
||||
|
||||
@pytest.mark.parametrize("tp_size, pp_size", [(1, 1), (2, 1), (4, 1), (2, 2)])
def test_ray_v2_executor(tp_size, pp_size):
    """Validate RayExecutorV2 with various TP/PP configs."""
    config = create_vllm_config(
        tensor_parallel_size=tp_size,
        pipeline_parallel_size=pp_size,
    )
    instance = RayExecutorV2(vllm_config=config)
    try:
        # All structural invariants are checked by the shared helper.
        assert_executor(instance, tp_size, pp_size)
    finally:
        instance.shutdown()
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "tp_size, pp_size, create_placement_group",
    [(2, 1, 2), (4, 1, 4), (2, 2, 4)],
    indirect=["create_placement_group"],
)
def test_ray_v2_executor_pg(tp_size, pp_size, create_placement_group):
    """Validate RayExecutorV2 with various TP/PP configs using external PG."""
    config = create_vllm_config(
        tensor_parallel_size=tp_size,
        pipeline_parallel_size=pp_size,
        placement_group=create_placement_group,
    )
    instance = RayExecutorV2(vllm_config=config)
    try:
        assert_executor(instance, tp_size, pp_size)
    finally:
        instance.shutdown()
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "executor",
    [create_vllm_config(tensor_parallel_size=2)],
    indirect=True,
)
def test_ray_v2_executor_failure_callback(executor):
    """Validate failure callback registration."""
    invoked = False

    def record_failure():
        nonlocal invoked
        invoked = True

    # On a healthy executor the callback is stored, not fired.
    executor.register_failure_callback(record_failure)
    assert not invoked

    # Once the executor is marked failed, registration fires immediately.
    executor.is_failed = True
    executor.register_failure_callback(record_failure)
    assert invoked
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "executor",
    [create_vllm_config(tensor_parallel_size=2)],
    indirect=True,
)
def test_ray_v2_executor_collective_rpc(executor):
    """Validate collective RPC calls through MessageQueue."""
    # check_health() exercises the RPC path; a healthy executor is not
    # failed and exposes the broadcast queue used for RPC fan-out.
    executor.check_health()
    assert not executor.is_failed
    assert executor.rpc_broadcast_mq is not None
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "executor",
    [create_vllm_config(tensor_parallel_size=2)],
    indirect=True,
)
def test_ray_v2_executor_driver_node_rank_0(executor):
    """Validate that driver node workers get the lowest ranks."""
    driver_node = ray.get_runtime_context().get_node_id()

    # Single-node run: every worker should land on the driver's node...
    for worker in executor.ray_worker_handles:
        assert worker.node_id == driver_node

    # ...and, in particular, rank 0 must be co-located with the driver.
    rank0 = next(w for w in executor.ray_worker_handles if w.rank == 0)
    assert rank0.node_id == driver_node
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "executor",
    [create_vllm_config(tensor_parallel_size=2)],
    indirect=True,
)
def test_ray_v2_executor_worker_death(executor):
    """Validate executor detects worker death via ray.wait()."""
    failure_seen = threading.Event()

    # Event.set is itself a zero-arg callable, so it can serve as the callback.
    executor.register_failure_callback(failure_seen.set)
    assert not executor.is_failed

    # Kill one worker actor externally
    victim = executor.ray_worker_handles[1].actor
    ray.kill(victim, no_restart=True)

    # Monitor thread should detect the death and invoke callback
    assert failure_seen.wait(timeout=30)
    assert executor.is_failed
    assert executor.shutting_down
||||
|
||||
|
||||
def test_ray_v2_executor_shutdown():
    """Validate graceful shutdown: ray.kill() terminates all worker actors."""
    instance = RayExecutorV2(vllm_config=create_vllm_config(tensor_parallel_size=2))
    assert instance.rpc_broadcast_mq is not None
    assert len(instance.response_mqs) == instance.world_size

    # Capture the actor handles before shutdown discards them.
    workers = [handle.actor for handle in instance.ray_worker_handles]
    instance.shutdown()

    # Every worker actor should now be dead: any remote call must fail.
    for worker in workers:
        with pytest.raises(ray.exceptions.RayActorError):
            ray.get(worker.wait_for_init.remote(), timeout=5)

    # Shutdown also clears the executor's message queues.
    assert instance.rpc_broadcast_mq is None
    assert len(instance.response_mqs) == 0
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "executor",
    [create_vllm_config(tensor_parallel_size=2)],
    indirect=True,
)
def test_ray_v2_run_refs_stored_for_monitoring(executor):
    """Validate worker handles store run_ref for monitoring."""
    for worker in executor.ray_worker_handles:
        assert worker.run_ref is not None
        # timeout=0 polls without blocking; a live worker's run task is
        # still in flight, so nothing should be ready.
        done, _ = ray.wait([worker.run_ref], timeout=0)
        assert len(done) == 0, "run_ref should be pending"
||||
|
||||
|
||||
@pytest.mark.parametrize("tp_size, pp_size", [(2, 1), (2, 2)])
def test_ray_v2_single_node_generation(tp_size, pp_size):
    """End-to-end LLM generation with RayExecutorV2."""
    llm = LLM(
        model=MODEL,
        tensor_parallel_size=tp_size,
        pipeline_parallel_size=pp_size,
        distributed_executor_backend="ray",
        enforce_eager=True,
        max_model_len=256,
        gpu_memory_utilization=0.3,
    )
    try:
        prompts = [
            "Hello, my name is",
            "The capital of France is",
            "The future of AI is",
        ]
        completions = llm.generate(prompts)

        # One completion per prompt, each with non-empty generated text.
        assert len(completions) == len(prompts)
        for completion in completions:
            assert len(completion.outputs) > 0
            assert len(completion.outputs[0].text) > 0
    finally:
        # Tear down workers and free GPU memory for subsequent tests.
        llm.llm_engine.model_executor.shutdown()
        del llm
        gc.collect()
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "bundle_indices, expected_bundle_ids, create_placement_group",
    [("2,3", [2, 3], 4), ("3,2", [3, 2], 4)],
    indirect=["create_placement_group"],
)
def test_ray_v2_bundle_indices_env(
    bundle_indices, expected_bundle_ids, create_placement_group, monkeypatch
):
    """Validate explicit VLLM_RAY_BUNDLE_INDICES bundle placement."""
    monkeypatch.setenv("VLLM_RAY_BUNDLE_INDICES", bundle_indices)
    config = create_vllm_config(
        tensor_parallel_size=2,
        placement_group=create_placement_group,
    )
    instance = RayExecutorV2(vllm_config=config)
    try:
        # Workers, ordered by rank, must occupy exactly the requested bundles.
        by_rank = sorted(instance.ray_worker_handles, key=lambda h: h.rank)
        assert [h.bundle_id_idx for h in by_rank] == expected_bundle_ids
        assert_executor(instance, tp_size=2, pp_size=1)
    finally:
        instance.shutdown()
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "bundle_indices, expected_error, create_placement_group",
    [
        ("1,1", "cannot have duplicate values,", 4),
        ("0,1,2", "must have the same size", 4),
    ],
    indirect=["create_placement_group"],
)
def test_ray_v2_invalid_bundle_indices(
    bundle_indices, expected_error, create_placement_group, monkeypatch
):
    """Validate invalid bundle indices are rejected."""
    monkeypatch.setenv("VLLM_RAY_BUNDLE_INDICES", bundle_indices)
    config = create_vllm_config(
        tensor_parallel_size=2, placement_group=create_placement_group
    )
    # Construction itself must fail with a descriptive assertion message.
    with pytest.raises(AssertionError, match=expected_error):
        RayExecutorV2(vllm_config=config)
||||
|
||||
|
||||
@pytest.mark.parametrize("tp_size, pp_size", [(2, 1), (2, 2)])
def test_ray_v2_single_node_generation_with_pg(tp_size, pp_size):
    """E2E LLM generation with a user-provided placement group."""
    ensure_ray_initialized()
    bundles = [{"GPU": 1, "CPU": 1} for _ in range(tp_size * pp_size)]
    pg = ray.util.placement_group(bundles, strategy="PACK")
    ray.get(pg.ready())

    # Bug fix: `llm` used to be assigned only inside the try block, so when
    # LLM(...) construction raised, the finally block hit UnboundLocalError
    # and masked the real failure. Initialize it first and guard the cleanup.
    llm = None
    try:
        # Patch get_current_placement_group so vLLM adopts `pg` as if the
        # test were running inside that placement group.
        with patch.object(ray.util, "get_current_placement_group", return_value=pg):
            llm = LLM(
                model=MODEL,
                tensor_parallel_size=tp_size,
                pipeline_parallel_size=pp_size,
                distributed_executor_backend="ray",
                enforce_eager=True,
                max_model_len=256,
                gpu_memory_utilization=0.3,
            )
            prompts = [
                "Hello, my name is",
                "The capital of France is",
                "The future of AI is",
            ]
            outputs = llm.generate(prompts)

            # One completion per prompt, each with non-empty generated text.
            assert len(outputs) == len(prompts)
            for output in outputs:
                assert len(output.outputs) > 0
                assert len(output.outputs[0].text) > 0
    finally:
        if llm is not None:
            llm.llm_engine.model_executor.shutdown()
            del llm
        gc.collect()
        # Release the externally created placement group so later tests can
        # reserve the GPUs again (it was leaked before).
        ray.util.remove_placement_group(pg)
||||
209
tests/distributed/test_ray_v2_executor_e2e.py
Normal file
209
tests/distributed/test_ray_v2_executor_e2e.py
Normal file
@@ -0,0 +1,209 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Orchestration-level integration tests for RayExecutorV2.
|
||||
"""
|
||||
|
||||
import gc
|
||||
import os
|
||||
import pathlib
|
||||
|
||||
import pytest
|
||||
import ray
|
||||
|
||||
pytestmark = pytest.mark.usefixtures("enable_ray_v2_backend")
|
||||
|
||||
MODEL = "facebook/opt-125m"
|
||||
|
||||
|
||||
def _get_env_var(worker, name):
    """Return env var *name* from the worker's process.

    Used as a collective_rpc target; the *worker* argument is supplied by
    the RPC machinery and intentionally unused.
    """
    value = os.environ.get(name)
    return value
||||
|
||||
|
||||
def _ray_init():
    """Start Ray with the project root on workers' PYTHONPATH.

    Without this, workers cannot unpickle actor classes defined in the
    ``tests`` package, causing FunctionActorManager to fall back to
    TemporaryActor which drops async method signatures."""
    # tests/distributed/<this file> -> two parents up is the project root.
    root = pathlib.Path(__file__).resolve().parents[2]
    ray.init(
        ignore_reinit_error=True,
        runtime_env={"env_vars": {"PYTHONPATH": str(root)}},
    )
||||
|
||||
|
||||
@pytest.fixture
def ray_init():
    # Fixture wrapper around _ray_init(); tests that must export sentinel
    # env vars *before* Ray starts call _ray_init() directly instead.
    _ray_init()
||||
|
||||
|
||||
class _AsyncLLMActor:
    """Hosts one AsyncLLM replica; wrapped into a Ray actor as
    ``AsyncLLMActor`` below."""

    def start(self, pg, bundle_indices=None, ray_runtime_env=None):
        # Configure the V2 executor backend before vLLM reads the env.
        os.environ["VLLM_USE_RAY_V2_EXECUTOR_BACKEND"] = "1"
        # Needed so collective_rpc can pickle _get_env_var over the
        # AsyncLLM -> EngineCore ZMQ boundary.
        os.environ["VLLM_ALLOW_INSECURE_SERIALIZATION"] = "1"
        if bundle_indices is not None:
            os.environ["VLLM_RAY_BUNDLE_INDICES"] = bundle_indices
        else:
            # Make sure a value left over from a previous start() can't leak in.
            os.environ.pop("VLLM_RAY_BUNDLE_INDICES", None)

        # Imported lazily so the env vars above take effect first.
        from vllm.engine.arg_utils import AsyncEngineArgs
        from vllm.v1.engine.async_llm import AsyncLLM
        from vllm.v1.executor.abstract import Executor

        engine_args = AsyncEngineArgs(
            model=MODEL,
            tensor_parallel_size=2,
            distributed_executor_backend="ray",
            enforce_eager=True,
            max_model_len=256,
            gpu_memory_utilization=0.8,
        )
        vllm_config = engine_args.create_engine_config()
        vllm_config.parallel_config.placement_group = pg
        if ray_runtime_env is not None:
            vllm_config.parallel_config.ray_runtime_env = ray_runtime_env

        executor_class = Executor.get_class(vllm_config)
        self.engine = AsyncLLM(
            vllm_config=vllm_config,
            executor_class=executor_class,
            log_stats=False,
            log_requests=False,
        )

    async def generate(self, prompt):
        """Generate up to 16 tokens for *prompt* and return the final text."""
        from vllm.sampling_params import SamplingParams

        params = SamplingParams(max_tokens=16)
        result = None
        # Drain the stream; the last yielded output carries the full text.
        async for output in self.engine.generate(
            prompt, params, request_id="test_request_id"
        ):
            result = output
        assert result is not None
        return result.outputs[0].text

    async def generate_and_get_worker_envs(self, prompt, env_names):
        """Generate text, then read *env_names* from every worker via RPC.

        Returns (generated_text, {name: [per-worker values]}).
        """
        from vllm.sampling_params import SamplingParams

        params = SamplingParams(max_tokens=16)
        result = None
        async for output in self.engine.generate(
            prompt, params, request_id="test_request_id"
        ):
            result = output
        assert result is not None
        text = result.outputs[0].text

        env_results = {}
        for name in env_names:
            # collective_rpc returns one value per worker rank.
            vals = await self.engine.collective_rpc(
                _get_env_var, timeout=10, args=(name,)
            )
            env_results[name] = vals
        return text, env_results

    def shutdown(self):
        # Safe even if start() never ran: only tears down an existing engine.
        if engine := getattr(self, "engine", None):
            engine.shutdown()
            del self.engine
        gc.collect()


# NOTE(review): wrapped rather than decorated — presumably to keep the plain
# class importable/picklable by name; confirm before changing.
AsyncLLMActor = ray.remote(num_cpus=0, max_concurrency=1)(_AsyncLLMActor)
|
||||
|
||||
|
||||
def test_multi_replicas(ray_init):
    """Two independent AsyncLLM replicas, each in its own placement group."""
    pg1 = ray.util.placement_group([{"GPU": 1, "CPU": 1}] * 2, strategy="PACK")
    pg2 = ray.util.placement_group([{"GPU": 1, "CPU": 1}] * 2, strategy="PACK")
    ray.get([pg1.ready(), pg2.ready()])

    actor1 = AsyncLLMActor.remote()
    actor2 = AsyncLLMActor.remote()

    # Engines are started sequentially; each binds to its own PG.
    ray.get(actor1.start.remote(pg1))
    ray.get(actor2.start.remote(pg2))

    # Both replicas generate concurrently and must produce non-empty text.
    texts = ray.get(
        [
            actor1.generate.remote("Hello world"),
            actor2.generate.remote("Hello world"),
        ]
    )
    out1, out2 = texts
    assert len(out1) > 0
    assert len(out2) > 0
||||
|
||||
|
||||
def test_multi_replicas_with_bundle_indices(ray_init):
    """Two AsyncLLM replicas sharing one PG via explicit bundle indices."""
    pg = ray.util.placement_group([{"GPU": 1, "CPU": 1}] * 4, strategy="PACK")
    ray.get(pg.ready())

    actor1 = AsyncLLMActor.remote()
    actor2 = AsyncLLMActor.remote()

    # Non-contiguous, out-of-order indices exercise the placement logic.
    ray.get(actor1.start.remote(pg, bundle_indices="2,1"))
    ray.get(actor2.start.remote(pg, bundle_indices="0,3"))

    texts = ray.get(
        [
            actor1.generate.remote("Hello world"),
            actor2.generate.remote("Hello world"),
        ]
    )
    out1, out2 = texts
    assert len(out1) > 0
    assert len(out2) > 0
||||
|
||||
|
||||
def test_env_var_and_runtime_env_propagation():
    """
    Verify env vars (NCCL_, HF_) and parallel_config.ray_runtime_env
    propagate to RayWorkerProc actors.
    """
    sentinel_vars = {
        "NCCL_DEBUG": "INFO",
        "HF_TOKEN": "test_sentinel_token",
    }
    os.environ.update(sentinel_vars)

    try:
        # Called directly (not via the ray_init fixture) because sentinel
        # env vars must be in os.environ before ray.init() so that Ray
        # worker processes inherit them.
        _ray_init()

        pg = ray.util.placement_group([{"GPU": 1, "CPU": 1}] * 2, strategy="PACK")
        ray.get(pg.ready())

        # Include the project root so that RayWorkerProc actors can
        # unpickle _get_env_var.
        project_root = str(pathlib.Path(__file__).resolve().parents[2])
        ray_runtime_env = {
            "env_vars": {
                "RAY_RUNTIME_ENV_TEST": "ray_runtime_env",
                "PYTHONPATH": project_root,
            },
        }

        actor = AsyncLLMActor.remote()
        ray.get(actor.start.remote(pg, ray_runtime_env=ray_runtime_env))

        names = list(sentinel_vars) + ["RAY_RUNTIME_ENV_TEST"]
        text, env_results = ray.get(
            actor.generate_and_get_worker_envs.remote("Hello world", names)
        )
        assert len(text) > 0

        # Process-inherited env vars must reach every worker...
        for name, expected in sentinel_vars.items():
            for val in env_results[name]:
                assert val == expected

        # ...and so must the runtime_env-injected variable.
        for val in env_results["RAY_RUNTIME_ENV_TEST"]:
            assert val == "ray_runtime_env"

    finally:
        # Always scrub the sentinels from this process's environment.
        for key in sentinel_vars:
            os.environ.pop(key, None)
||||
Reference in New Issue
Block a user