[Feat][Executor] Introduce RayExecutorV2 (#36836)

Signed-off-by: Jeffrey Wang <jeffreywang@anyscale.com>
This commit is contained in:
Jeffrey Wang
2026-04-01 14:34:29 -07:00
committed by GitHub
parent cb268e4e55
commit de5e6c44c6
14 changed files with 1603 additions and 30 deletions

View File

@@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import random
import msgspec
@@ -166,3 +167,31 @@ class MockSubscriber:
self.sub.close()
for replay in self.replay_sockets:
replay.close()
@pytest.fixture
def enable_ray_v2_backend():
    """Enable the Ray V2 executor backend for the duration of one test.

    Saves the previous values of the two env vars, switches the V2 backend
    on (and V1 multiprocessing off), guarantees Ray is stopped both before
    and after the test body, then restores the original environment.
    """
    import ray

    touched = ("VLLM_USE_RAY_V2_EXECUTOR_BACKEND", "VLLM_ENABLE_V1_MULTIPROCESSING")
    previous = {key: os.environ.get(key) for key in touched}
    os.environ["VLLM_USE_RAY_V2_EXECUTOR_BACKEND"] = "1"
    os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
    if ray.is_initialized():
        ray.shutdown()
    try:
        yield
    finally:
        if ray.is_initialized():
            ray.shutdown()
        # Restore the environment exactly: put back saved values, drop
        # keys that were unset before the test.
        for key, value in previous.items():
            if value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = value

View File

@@ -0,0 +1,119 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Multi-node integration test for MessageQueue TCP fallback.
Verifies that when writer and readers span separate nodes (Docker containers
with isolated /dev/shm), `create_from_process_group` correctly detects
cross-node ranks via `in_the_same_node_as()` and falls back to ZMQ TCP
transport — and that data actually arrives.
"""
import numpy as np
import torch.distributed as dist
from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
from vllm.distributed.parallel_state import in_the_same_node_as
def _verify_cross_node_detection(rank, world_size):
    """Check that ``in_the_same_node_as`` reports at least one remote rank.

    Returns the per-rank locality list so the caller can check transport
    selection against it.
    """
    status = in_the_same_node_as(dist.group.WORLD, source_rank=0)
    local_count = sum(status)
    print(
        f"[Rank {rank}] in_the_same_node_as(source=0): {status} "
        f"(local={local_count}/{world_size})"
    )
    # With 2 Docker containers (1 proc each), rank 0 and rank 1
    # should be on different nodes.
    assert local_count < world_size, (
        f"Expected cross-node ranks but all {world_size} ranks appear local."
    )
    return status


def _verify_transport_selection(mq, rank, writer_rank, status):
    """Check each rank got the expected transport (shm local vs TCP remote)."""
    if rank == writer_rank:
        print(
            f"[Rank {rank}] Writer: n_local_reader={mq.n_local_reader}, "
            f"n_remote_reader={mq.n_remote_reader}"
        )
        assert mq.n_remote_reader > 0, (
            "Writer should have at least 1 remote (TCP) reader in a multi-node setup."
        )
    elif status[rank]:
        assert mq._is_local_reader, (
            f"Rank {rank} is on the same node as writer but is not a local reader."
        )
        print(f"[Rank {rank}] Reader: local (shared memory)")
    else:
        assert mq._is_remote_reader, (
            f"Rank {rank} is on a different node but is not a remote (TCP) reader."
        )
        print(f"[Rank {rank}] Reader: remote (TCP)")


def _test_simple_object(mq, rank, writer_rank):
    """Round-trip a small Python object through the queue."""
    dist.barrier()
    if rank == writer_rank:
        mq.enqueue("hello_from_node0")
    else:
        msg = mq.dequeue(timeout=10)
        assert msg == "hello_from_node0"
    dist.barrier()
    print(f"[Rank {rank}] Simple object test passed")


def _test_numpy_arrays(mq, rank, writer_rank):
    """Round-trip 100 random-sized numpy arrays through the queue."""
    # Fixed seed: every rank computes the identical expected arrays.
    np.random.seed(42)
    arrays = [
        np.random.randint(0, 100, size=np.random.randint(100, 5000)) for _ in range(100)
    ]
    dist.barrier()
    if rank == writer_rank:
        for arr in arrays:
            mq.enqueue(arr)
    else:
        for i, expected in enumerate(arrays):
            received = mq.dequeue(timeout=10)
            assert np.array_equal(expected, received), (
                f"Array mismatch at index {i}: "
                f"expected shape {expected.shape}, got shape {received.shape}"
            )
    dist.barrier()
    print(f"[Rank {rank}] Numpy array test passed")


def _test_large_payload(mq, rank, writer_rank):
    """Round-trip a payload larger than max_chunk_bytes (forces chunking)."""
    dist.barrier()
    big_array = np.zeros(200_000, dtype=np.int64)  # ~1.6 MiB > 1 MiB chunk
    if rank == writer_rank:
        mq.enqueue(big_array)
    else:
        received = mq.dequeue(timeout=10)
        assert np.array_equal(big_array, received)
    dist.barrier()
    print(f"[Rank {rank}] Large payload test passed")


def main():
    """Run all multi-node MessageQueue TCP-fallback checks under gloo."""
    dist.init_process_group(backend="gloo")
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    assert world_size >= 2, (
        f"Need at least 2 ranks across nodes, got world_size={world_size}"
    )
    # Verify that in_the_same_node_as detects cross-node correctly
    status = _verify_cross_node_detection(rank, world_size)
    # Create MessageQueue
    writer_rank = 0
    mq = MessageQueue.create_from_process_group(
        dist.group.WORLD,
        max_chunk_bytes=1024 * 1024,  # 1 MiB
        max_chunks=10,
        writer_rank=writer_rank,
    )
    _verify_transport_selection(mq, rank, writer_rank, status)
    _test_simple_object(mq, rank, writer_rank)
    _test_numpy_arrays(mq, rank, writer_rank)
    _test_large_payload(mq, rank, writer_rank)
    # Done -- cleanup
    dist.barrier()
    print(f"[Rank {rank}] All MessageQueue TCP multi-node tests passed!")
    dist.destroy_process_group()


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,345 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Integration tests for RayExecutorV2 at the executor level.
Validates executor initialization, placement group support, RPC calls,
and distributed execution with various TP/PP configurations.
"""
import gc
import threading
from unittest.mock import patch
import pytest
import ray
from vllm import LLM
from vllm.config import VllmConfig
from vllm.engine.arg_utils import EngineArgs
from vllm.v1.executor.ray_executor_v2 import RayExecutorV2
# Every test in this module runs with the Ray V2 executor backend enabled.
pytestmark = pytest.mark.usefixtures("enable_ray_v2_backend")
# Small model so the tests load quickly.
MODEL = "facebook/opt-125m"
def create_vllm_config(
    tensor_parallel_size: int = 1,
    pipeline_parallel_size: int = 1,
    max_model_len: int = 256,
    gpu_memory_utilization: float = 0.3,
    placement_group=None,
) -> VllmConfig:
    """Build a VllmConfig for the test model with the given parallel layout.

    When *placement_group* is provided it is attached to the parallel
    config so the executor uses the externally created placement group.
    """
    args = EngineArgs(
        model=MODEL,
        tensor_parallel_size=tensor_parallel_size,
        pipeline_parallel_size=pipeline_parallel_size,
        max_model_len=max_model_len,
        gpu_memory_utilization=gpu_memory_utilization,
        distributed_executor_backend="ray",
        enforce_eager=True,
    )
    config = args.create_engine_config()
    if placement_group is not None:
        config.parallel_config.placement_group = placement_group
    return config
def ensure_ray_initialized():
    """Start Ray if it is not already running (idempotent)."""
    if ray.is_initialized():
        return
    ray.init(ignore_reinit_error=True)
@pytest.fixture
def create_placement_group(request):
    """Create a PACK placement group with ``request.param`` single-GPU
    bundles, and remove it after the test."""
    ensure_ray_initialized()
    bundle_count = request.param
    pg = ray.util.placement_group(
        [{"GPU": 1, "CPU": 1}] * bundle_count, strategy="PACK"
    )
    ray.get(pg.ready())
    yield pg
    ray.util.remove_placement_group(pg)
@pytest.fixture
def executor(request):
    """Build a RayExecutorV2 from the parametrized VllmConfig and make sure
    it is shut down after the test."""
    instance = RayExecutorV2(vllm_config=request.param)
    yield instance
    instance.shutdown()
def assert_executor(executor, tp_size, pp_size):
    """Shared sanity checks for a freshly initialized RayExecutorV2."""
    expected_world = tp_size * pp_size
    assert executor.world_size == expected_world
    assert len(executor.ray_worker_handles) == expected_world
    assert len(executor.response_mqs) == expected_world
    # Output rank is the first worker of the last pipeline stage.
    assert executor._get_output_rank() == (pp_size - 1) * tp_size
    if pp_size > 1:
        assert executor.max_concurrent_batches == pp_size
    executor.check_health()
    assert not executor.is_failed
    # Ranks must be exactly 0..world_size-1 with no gaps or duplicates.
    assert sorted(h.rank for h in executor.ray_worker_handles) == list(
        range(expected_world)
    )
    assert all(h.node_id is not None for h in executor.ray_worker_handles)
@pytest.mark.parametrize("tp_size, pp_size", [(1, 1), (2, 1), (4, 1), (2, 2)])
def test_ray_v2_executor(tp_size, pp_size):
    """Validate RayExecutorV2 with various TP/PP configs."""
    config = create_vllm_config(
        tensor_parallel_size=tp_size,
        pipeline_parallel_size=pp_size,
    )
    ex = RayExecutorV2(vllm_config=config)
    try:
        assert_executor(ex, tp_size, pp_size)
    finally:
        ex.shutdown()
@pytest.mark.parametrize(
    "tp_size, pp_size, create_placement_group",
    [(2, 1, 2), (4, 1, 4), (2, 2, 4)],
    indirect=["create_placement_group"],
)
def test_ray_v2_executor_pg(tp_size, pp_size, create_placement_group):
    """Validate RayExecutorV2 with various TP/PP configs using external PG."""
    config = create_vllm_config(
        tensor_parallel_size=tp_size,
        pipeline_parallel_size=pp_size,
        placement_group=create_placement_group,
    )
    ex = RayExecutorV2(vllm_config=config)
    try:
        assert_executor(ex, tp_size, pp_size)
    finally:
        ex.shutdown()
@pytest.mark.parametrize(
    "executor",
    [create_vllm_config(tensor_parallel_size=2)],
    indirect=True,
)
def test_ray_v2_executor_failure_callback(executor):
    """Validate failure callback registration."""
    invocations = []
    callback = lambda: invocations.append(True)  # noqa: E731
    # Healthy executor: the callback is stored but not fired.
    executor.register_failure_callback(callback)
    assert not invocations
    # Registering on an already-failed executor fires immediately.
    executor.is_failed = True
    executor.register_failure_callback(callback)
    assert invocations
@pytest.mark.parametrize(
    "executor",
    [create_vllm_config(tensor_parallel_size=2)],
    indirect=True,
)
def test_ray_v2_executor_collective_rpc(executor):
    """Validate collective RPC calls through MessageQueue."""
    executor.check_health()
    assert not executor.is_failed
    # The broadcast queue is the RPC transport; it must exist post-init.
    assert executor.rpc_broadcast_mq is not None
@pytest.mark.parametrize(
    "executor",
    [create_vllm_config(tensor_parallel_size=2)],
    indirect=True,
)
def test_ray_v2_executor_driver_node_rank_0(executor):
    """Validate that driver node workers get the lowest ranks."""
    driver_node = ray.get_runtime_context().get_node_id()
    # Single-node run: every worker lives on the driver's node.
    assert all(h.node_id == driver_node for h in executor.ray_worker_handles)
    rank0 = next(h for h in executor.ray_worker_handles if h.rank == 0)
    assert rank0.node_id == driver_node
@pytest.mark.parametrize(
    "executor",
    [create_vllm_config(tensor_parallel_size=2)],
    indirect=True,
)
def test_ray_v2_executor_worker_death(executor):
    """Validate executor detects worker death via ray.wait()."""
    failure_seen = threading.Event()
    executor.register_failure_callback(failure_seen.set)
    assert not executor.is_failed
    # Kill one worker actor externally
    ray.kill(executor.ray_worker_handles[1].actor, no_restart=True)
    # Monitor thread should detect the death and invoke callback
    assert failure_seen.wait(timeout=30)
    assert executor.is_failed
    assert executor.shutting_down
def test_ray_v2_executor_shutdown():
    """Validate graceful shutdown: ray.kill() terminates all worker actors."""
    ex = RayExecutorV2(vllm_config=create_vllm_config(tensor_parallel_size=2))
    assert ex.rpc_broadcast_mq is not None
    assert len(ex.response_mqs) == ex.world_size
    worker_actors = [handle.actor for handle in ex.ray_worker_handles]
    ex.shutdown()
    # Every worker actor must now be dead: any remote call raises.
    for worker_actor in worker_actors:
        with pytest.raises(ray.exceptions.RayActorError):
            ray.get(worker_actor.wait_for_init.remote(), timeout=5)
    # Shutdown must also release the message queues.
    assert ex.rpc_broadcast_mq is None
    assert len(ex.response_mqs) == 0
@pytest.mark.parametrize(
    "executor",
    [create_vllm_config(tensor_parallel_size=2)],
    indirect=True,
)
def test_ray_v2_run_refs_stored_for_monitoring(executor):
    """Validate worker handles store run_ref for monitoring."""
    for handle in executor.ray_worker_handles:
        assert handle.run_ref is not None
        # A live worker's run_ref must not have resolved yet.
        done, _ = ray.wait([handle.run_ref], timeout=0)
        assert not done, "run_ref should be pending"
@pytest.mark.parametrize("tp_size, pp_size", [(2, 1), (2, 2)])
def test_ray_v2_single_node_generation(tp_size, pp_size):
    """End-to-end LLM generation with RayExecutorV2."""
    llm = LLM(
        model=MODEL,
        tensor_parallel_size=tp_size,
        pipeline_parallel_size=pp_size,
        distributed_executor_backend="ray",
        enforce_eager=True,
        max_model_len=256,
        gpu_memory_utilization=0.3,
    )
    try:
        prompts = [
            "Hello, my name is",
            "The capital of France is",
            "The future of AI is",
        ]
        results = llm.generate(prompts)
        assert len(results) == len(prompts)
        # Every prompt must produce at least one non-empty completion.
        for result in results:
            assert len(result.outputs) > 0
            assert len(result.outputs[0].text) > 0
    finally:
        llm.llm_engine.model_executor.shutdown()
        del llm
        gc.collect()
@pytest.mark.parametrize(
    "bundle_indices, expected_bundle_ids, create_placement_group",
    [("2,3", [2, 3], 4), ("3,2", [3, 2], 4)],
    indirect=["create_placement_group"],
)
def test_ray_v2_bundle_indices_env(
    bundle_indices, expected_bundle_ids, create_placement_group, monkeypatch
):
    """Validate explicit VLLM_RAY_BUNDLE_INDICES bundle placement."""
    monkeypatch.setenv("VLLM_RAY_BUNDLE_INDICES", bundle_indices)
    config = create_vllm_config(
        tensor_parallel_size=2,
        placement_group=create_placement_group,
    )
    ex = RayExecutorV2(vllm_config=config)
    try:
        # Workers ordered by rank must sit in exactly the requested bundles.
        by_rank = sorted(ex.ray_worker_handles, key=lambda h: h.rank)
        assert [h.bundle_id_idx for h in by_rank] == expected_bundle_ids
        assert_executor(ex, tp_size=2, pp_size=1)
    finally:
        ex.shutdown()
@pytest.mark.parametrize(
    "bundle_indices, expected_error, create_placement_group",
    [
        ("1,1", "cannot have duplicate values,", 4),
        ("0,1,2", "must have the same size", 4),
    ],
    indirect=["create_placement_group"],
)
def test_ray_v2_invalid_bundle_indices(
    bundle_indices, expected_error, create_placement_group, monkeypatch
):
    """Validate invalid bundle indices are rejected."""
    monkeypatch.setenv("VLLM_RAY_BUNDLE_INDICES", bundle_indices)
    config = create_vllm_config(
        tensor_parallel_size=2, placement_group=create_placement_group
    )
    # Duplicate or wrong-length index lists must fail fast at construction.
    with pytest.raises(AssertionError, match=expected_error):
        RayExecutorV2(vllm_config=config)
@pytest.mark.parametrize("tp_size, pp_size", [(2, 1), (2, 2)])
def test_ray_v2_single_node_generation_with_pg(tp_size, pp_size):
    """E2E LLM generation with a user-provided placement group."""
    ensure_ray_initialized()
    bundles = [{"GPU": 1, "CPU": 1} for _ in range(tp_size * pp_size)]
    pg = ray.util.placement_group(bundles, strategy="PACK")
    ray.get(pg.ready())
    # Bind before the try: if LLM() raises, the finally below would
    # otherwise hit a NameError on `llm` and mask the real failure.
    llm = None
    try:
        with patch.object(ray.util, "get_current_placement_group", return_value=pg):
            llm = LLM(
                model=MODEL,
                tensor_parallel_size=tp_size,
                pipeline_parallel_size=pp_size,
                distributed_executor_backend="ray",
                enforce_eager=True,
                max_model_len=256,
                gpu_memory_utilization=0.3,
            )
            prompts = [
                "Hello, my name is",
                "The capital of France is",
                "The future of AI is",
            ]
            outputs = llm.generate(prompts)
            assert len(outputs) == len(prompts)
            for output in outputs:
                assert len(output.outputs) > 0
                assert len(output.outputs[0].text) > 0
    finally:
        # Only clean up the engine if construction actually succeeded.
        if llm is not None:
            llm.llm_engine.model_executor.shutdown()
            del llm
        gc.collect()

View File

@@ -0,0 +1,209 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Orchestration-level integration tests for RayExecutorV2.
"""
import gc
import os
import pathlib
import pytest
import ray
# Every test in this module runs with the Ray V2 executor backend enabled.
pytestmark = pytest.mark.usefixtures("enable_ray_v2_backend")
# Small model so the tests load quickly.
MODEL = "facebook/opt-125m"
def _get_env_var(worker, name):
    """Return env var *name* in the calling process, or None if unset.

    The unused *worker* parameter matches the signature collective_rpc
    passes when invoking this on each worker.
    """
    return os.environ.get(name)
def _ray_init():
    """Start Ray with the project root on workers' PYTHONPATH.

    Without this, workers cannot unpickle actor classes defined in the
    ``tests`` package, causing FunctionActorManager to fall back to
    TemporaryActor which drops async method signatures.
    """
    repo_root = pathlib.Path(__file__).resolve().parents[2]
    ray.init(
        ignore_reinit_error=True,
        runtime_env={"env_vars": {"PYTHONPATH": str(repo_root)}},
    )
@pytest.fixture
def ray_init():
    """Fixture wrapper around _ray_init() for tests that request it."""
    _ray_init()
class _AsyncLLMActor:
    """Ray actor that hosts an AsyncLLM engine configured for RayExecutorV2.

    Used by the orchestration tests to run one engine replica per actor,
    generate text, and inspect env vars inside the engine's workers.
    """

    def start(self, pg, bundle_indices=None, ray_runtime_env=None):
        """Build the AsyncLLM engine on placement group *pg*.

        Env vars are set BEFORE the vllm imports so the V2 backend and
        bundle selection take effect during engine construction.
        """
        os.environ["VLLM_USE_RAY_V2_EXECUTOR_BACKEND"] = "1"
        # Needed so collective_rpc can pickle _get_env_var over the
        # AsyncLLM -> EngineCore ZMQ boundary.
        os.environ["VLLM_ALLOW_INSECURE_SERIALIZATION"] = "1"
        if bundle_indices is not None:
            os.environ["VLLM_RAY_BUNDLE_INDICES"] = bundle_indices
        else:
            # Make sure a value leaked from a previous start() is cleared.
            os.environ.pop("VLLM_RAY_BUNDLE_INDICES", None)
        from vllm.engine.arg_utils import AsyncEngineArgs
        from vllm.v1.engine.async_llm import AsyncLLM
        from vllm.v1.executor.abstract import Executor
        engine_args = AsyncEngineArgs(
            model=MODEL,
            tensor_parallel_size=2,
            distributed_executor_backend="ray",
            enforce_eager=True,
            max_model_len=256,
            gpu_memory_utilization=0.8,
        )
        vllm_config = engine_args.create_engine_config()
        vllm_config.parallel_config.placement_group = pg
        if ray_runtime_env is not None:
            vllm_config.parallel_config.ray_runtime_env = ray_runtime_env
        executor_class = Executor.get_class(vllm_config)
        self.engine = AsyncLLM(
            vllm_config=vllm_config,
            executor_class=executor_class,
            log_stats=False,
            log_requests=False,
        )

    async def generate(self, prompt):
        """Generate up to 16 tokens for *prompt*; return the completion text."""
        from vllm.sampling_params import SamplingParams
        params = SamplingParams(max_tokens=16)
        result = None
        # Drain the stream; the last yielded output holds the final text.
        async for output in self.engine.generate(
            prompt, params, request_id="test_request_id"
        ):
            result = output
        assert result is not None
        return result.outputs[0].text

    async def generate_and_get_worker_envs(self, prompt, env_names):
        """Generate for *prompt*, then read *env_names* on every worker.

        Returns (completion_text, {name: [value per worker]}).
        """
        from vllm.sampling_params import SamplingParams
        params = SamplingParams(max_tokens=16)
        result = None
        async for output in self.engine.generate(
            prompt, params, request_id="test_request_id"
        ):
            result = output
        assert result is not None
        text = result.outputs[0].text
        env_results = {}
        for name in env_names:
            # One RPC per env var; each returns a list with one value
            # per worker process.
            vals = await self.engine.collective_rpc(
                _get_env_var, timeout=10, args=(name,)
            )
            env_results[name] = vals
        return text, env_results

    def shutdown(self):
        """Tear down the engine if start() ever succeeded."""
        if engine := getattr(self, "engine", None):
            engine.shutdown()
            del self.engine
            gc.collect()
AsyncLLMActor = ray.remote(num_cpus=0, max_concurrency=1)(_AsyncLLMActor)
def test_multi_replicas(ray_init):
    """Two independent AsyncLLM replicas, each on its own placement group."""
    groups = [
        ray.util.placement_group([{"GPU": 1, "CPU": 1}] * 2, strategy="PACK")
        for _ in range(2)
    ]
    ray.get([pg.ready() for pg in groups])
    actors = [AsyncLLMActor.remote() for _ in range(2)]
    for actor, pg in zip(actors, groups):
        ray.get(actor.start.remote(pg))
    texts = ray.get([actor.generate.remote("Hello world") for actor in actors])
    # Both replicas must produce a non-empty completion.
    for text in texts:
        assert len(text) > 0
def test_multi_replicas_with_bundle_indices(ray_init):
    """Two replicas sharing one 4-GPU PG via explicit bundle indices."""
    pg = ray.util.placement_group([{"GPU": 1, "CPU": 1}] * 4, strategy="PACK")
    ray.get(pg.ready())
    first = AsyncLLMActor.remote()
    second = AsyncLLMActor.remote()
    # Disjoint (and deliberately unordered) bundle assignments.
    ray.get(first.start.remote(pg, bundle_indices="2,1"))
    ray.get(second.start.remote(pg, bundle_indices="0,3"))
    texts = ray.get(
        [
            first.generate.remote("Hello world"),
            second.generate.remote("Hello world"),
        ]
    )
    for text in texts:
        assert len(text) > 0
def test_env_var_and_runtime_env_propagation():
    """
    Verify env vars (NCCL_, HF_) and parallel_config.ray_runtime_env
    propagate to RayWorkerProc actors.
    """
    sentinels = {
        "NCCL_DEBUG": "INFO",
        "HF_TOKEN": "test_sentinel_token",
    }
    os.environ.update(sentinels)
    try:
        # Called directly (not via the ray_init fixture) because sentinel
        # env vars must be in os.environ before ray.init() so that Ray
        # worker processes inherit them.
        _ray_init()
        pg = ray.util.placement_group([{"GPU": 1, "CPU": 1}] * 2, strategy="PACK")
        ray.get(pg.ready())
        # Include the project root so that RayWorkerProc actors can
        # unpickle _get_env_var.
        repo_root = pathlib.Path(__file__).resolve().parents[2]
        runtime_env = {
            "env_vars": {
                "RAY_RUNTIME_ENV_TEST": "ray_runtime_env",
                "PYTHONPATH": str(repo_root),
            },
        }
        actor = AsyncLLMActor.remote()
        ray.get(actor.start.remote(pg, ray_runtime_env=runtime_env))
        query_names = list(sentinels) + ["RAY_RUNTIME_ENV_TEST"]
        text, worker_envs = ray.get(
            actor.generate_and_get_worker_envs.remote("Hello world", query_names)
        )
        assert len(text) > 0
        # Every worker must see both the inherited sentinels and the
        # runtime_env-injected variable.
        expected = dict(sentinels)
        expected["RAY_RUNTIME_ENV_TEST"] = "ray_runtime_env"
        for name, want in expected.items():
            assert all(value == want for value in worker_envs[name])
    finally:
        for key in sentinels:
            os.environ.pop(key, None)