[Feat][Executor] Introduce RayExecutorV2 (#36836)

Signed-off-by: Jeffrey Wang <jeffreywang@anyscale.com>
This commit is contained in:
Jeffrey Wang
2026-04-01 14:34:29 -07:00
committed by GitHub
parent cb268e4e55
commit de5e6c44c6
14 changed files with 1603 additions and 30 deletions

View File

@@ -224,6 +224,20 @@ steps:
commands:
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 $IMAGE_TAG "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code"
- label: MessageQueue TCP Multi-Node (2 GPUs)
timeout_in_minutes: 10
working_dir: "/vllm-workspace/tests"
num_devices: 1
num_nodes: 2
no_plugin: true
optional: true
source_file_dependencies:
- vllm/distributed/device_communicators/shm_broadcast.py
- vllm/distributed/parallel_state.py
- tests/distributed/test_mq_tcp_multinode.py
commands:
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 1 $IMAGE_TAG "torchrun --nnodes 2 --nproc-per-node=1 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_mq_tcp_multinode.py" "torchrun --nnodes 2 --nproc-per-node=1 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_mq_tcp_multinode.py"
- label: Distributed NixlConnector PD accuracy (4 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
@@ -294,3 +308,23 @@ steps:
commands:
- pytest -v -s distributed/test_pp_cudagraph.py
- pytest -v -s distributed/test_pipeline_parallel.py
- label: RayExecutorV2 (4 GPUs)
timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests"
num_devices: 4
source_file_dependencies:
- vllm/v1/executor/ray_executor_v2.py
- vllm/v1/executor/abstract.py
- vllm/v1/executor/multiproc_executor.py
- tests/distributed/test_ray_v2_executor.py
- tests/distributed/test_ray_v2_executor_e2e.py
- tests/distributed/test_pipeline_parallel.py
- tests/basic_correctness/test_basic_correctness.py
commands:
- export VLLM_USE_RAY_V2_EXECUTOR_BACKEND=1
- export NCCL_CUMEM_HOST_ENABLE=0
- pytest -v -s distributed/test_ray_v2_executor.py
- pytest -v -s distributed/test_ray_v2_executor_e2e.py
- pytest -v -s distributed/test_pipeline_parallel.py -k "ray"
- TARGET_TEST_SUITE=L4 pytest -v -s basic_correctness/test_basic_correctness.py -k "ray"

View File

@@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import random
import msgspec
@@ -166,3 +167,31 @@ class MockSubscriber:
self.sub.close()
for replay in self.replay_sockets:
replay.close()
@pytest.fixture
def enable_ray_v2_backend():
    """Configure the Ray V2 executor backend via env vars and guarantee a
    clean Ray session both before and after each test."""
    import ray

    tracked_keys = (
        "VLLM_USE_RAY_V2_EXECUTOR_BACKEND",
        "VLLM_ENABLE_V1_MULTIPROCESSING",
    )
    previous = {key: os.environ.get(key) for key in tracked_keys}
    os.environ["VLLM_USE_RAY_V2_EXECUTOR_BACKEND"] = "1"
    os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
    if ray.is_initialized():
        ray.shutdown()
    try:
        yield
    finally:
        if ray.is_initialized():
            ray.shutdown()
        # Restore the pre-test environment: put back saved values and
        # drop keys that did not exist before.
        for key, value in previous.items():
            if value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = value

View File

@@ -0,0 +1,119 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Multi-node integration test for MessageQueue TCP fallback.
Verifies that when writer and readers span separate nodes (Docker containers
with isolated /dev/shm), `create_from_process_group` correctly detects
cross-node ranks via `in_the_same_node_as()` and falls back to ZMQ TCP
transport — and that data actually arrives.
"""
import numpy as np
import torch.distributed as dist
from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
from vllm.distributed.parallel_state import in_the_same_node_as
def main():
    """Multi-node MessageQueue TCP fallback test driver.

    Rank 0 is the writer; all other ranks are readers. Readers that share
    the writer's node use shared memory, remote readers fall back to ZMQ
    TCP. Runs three data-transfer phases and asserts correct delivery.
    """
    dist.init_process_group(backend="gloo")
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    assert world_size >= 2, (
        f"Need at least 2 ranks across nodes, got world_size={world_size}"
    )
    # Verify that in_the_same_node_as detects cross-node correctly
    status = in_the_same_node_as(dist.group.WORLD, source_rank=0)
    local_count = sum(status)
    print(
        f"[Rank {rank}] in_the_same_node_as(source=0): {status} "
        f"(local={local_count}/{world_size})"
    )
    # With 2 Docker containers (1 proc each), rank 0 and rank 1
    # should be on different nodes.
    assert local_count < world_size, (
        f"Expected cross-node ranks but all {world_size} ranks appear local."
    )
    # Create MessageQueue
    writer_rank = 0
    mq = MessageQueue.create_from_process_group(
        dist.group.WORLD,
        max_chunk_bytes=1024 * 1024,  # 1 MiB
        max_chunks=10,
        writer_rank=writer_rank,
    )
    _check_transport_selection(mq, status, rank, writer_rank)
    _run_simple_object_phase(mq, rank, writer_rank)
    _run_numpy_array_phase(mq, rank, writer_rank)
    _run_large_payload_phase(mq, rank, writer_rank)
    # Done -- cleanup
    dist.barrier()
    print(f"[Rank {rank}] All MessageQueue TCP multi-node tests passed!")
    dist.destroy_process_group()


def _check_transport_selection(mq, status, rank, writer_rank):
    """Assert each rank was given the expected transport (shm vs TCP)."""
    if rank == writer_rank:
        print(
            f"[Rank {rank}] Writer: n_local_reader={mq.n_local_reader}, "
            f"n_remote_reader={mq.n_remote_reader}"
        )
        assert mq.n_remote_reader > 0, (
            "Writer should have at least 1 remote (TCP) reader in a multi-node setup."
        )
    elif status[rank]:
        assert mq._is_local_reader, (
            f"Rank {rank} is on the same node as writer but is not a local reader."
        )
        print(f"[Rank {rank}] Reader: local (shared memory)")
    else:
        assert mq._is_remote_reader, (
            f"Rank {rank} is on a different node but is not a remote (TCP) reader."
        )
        print(f"[Rank {rank}] Reader: remote (TCP)")


def _run_simple_object_phase(mq, rank, writer_rank):
    """Round-trip a single small Python object through the queue."""
    dist.barrier()
    if rank == writer_rank:
        mq.enqueue("hello_from_node0")
    else:
        msg = mq.dequeue(timeout=10)
        assert msg == "hello_from_node0"
    dist.barrier()
    print(f"[Rank {rank}] Simple object test passed")


def _run_numpy_array_phase(mq, rank, writer_rank):
    """Round-trip 100 random-size numpy arrays.

    All ranks seed identically, so readers can regenerate the expected
    arrays locally and compare against what arrives over the queue.
    """
    np.random.seed(42)
    arrays = [
        np.random.randint(0, 100, size=np.random.randint(100, 5000)) for _ in range(100)
    ]
    dist.barrier()
    if rank == writer_rank:
        for arr in arrays:
            mq.enqueue(arr)
    else:
        for i, expected in enumerate(arrays):
            received = mq.dequeue(timeout=10)
            assert np.array_equal(expected, received), (
                f"Array mismatch at index {i}: "
                f"expected shape {expected.shape}, got shape {received.shape}"
            )
    dist.barrier()
    print(f"[Rank {rank}] Numpy array test passed")


def _run_large_payload_phase(mq, rank, writer_rank):
    """Round-trip a payload larger than max_chunk_bytes (forces chunking)."""
    dist.barrier()
    big_array = np.zeros(200_000, dtype=np.int64)  # ~1.6 MiB > 1 MiB chunk
    if rank == writer_rank:
        mq.enqueue(big_array)
    else:
        received = mq.dequeue(timeout=10)
        assert np.array_equal(big_array, received)
    dist.barrier()
    print(f"[Rank {rank}] Large payload test passed")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,345 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Integration tests for RayExecutorV2 at the executor level.
Validates executor initialization, placement group support, RPC calls,
and distributed execution with various TP/PP configurations.
"""
import gc
import threading
from unittest.mock import patch
import pytest
import ray
from vllm import LLM
from vllm.config import VllmConfig
from vllm.engine.arg_utils import EngineArgs
from vllm.v1.executor.ray_executor_v2 import RayExecutorV2
pytestmark = pytest.mark.usefixtures("enable_ray_v2_backend")
MODEL = "facebook/opt-125m"
def create_vllm_config(
    tensor_parallel_size: int = 1,
    pipeline_parallel_size: int = 1,
    max_model_len: int = 256,
    gpu_memory_utilization: float = 0.3,
    placement_group=None,
) -> VllmConfig:
    """Build a small VllmConfig for MODEL using the Ray backend.

    When ``placement_group`` is given, it is attached to the parallel
    config so the executor uses the externally managed group.
    """
    args = EngineArgs(
        model=MODEL,
        tensor_parallel_size=tensor_parallel_size,
        pipeline_parallel_size=pipeline_parallel_size,
        max_model_len=max_model_len,
        gpu_memory_utilization=gpu_memory_utilization,
        distributed_executor_backend="ray",
        enforce_eager=True,
    )
    config = args.create_engine_config()
    if placement_group is not None:
        config.parallel_config.placement_group = placement_group
    return config
def ensure_ray_initialized():
    """Start Ray unless a session is already running."""
    if ray.is_initialized():
        return
    ray.init(ignore_reinit_error=True)
@pytest.fixture
def create_placement_group(request):
    """Yield a PACK placement group with ``request.param`` GPU bundles,
    removing it once the test finishes."""
    ensure_ray_initialized()
    gpu_count = request.param
    pg = ray.util.placement_group(
        [{"GPU": 1, "CPU": 1} for _ in range(gpu_count)], strategy="PACK"
    )
    ray.get(pg.ready())
    yield pg
    ray.util.remove_placement_group(pg)
@pytest.fixture
def executor(request):
    """Build a RayExecutorV2 from ``request.param`` (a VllmConfig) and
    shut it down after the test."""
    instance = RayExecutorV2(vllm_config=request.param)
    yield instance
    instance.shutdown()
def assert_executor(executor, tp_size, pp_size):
    """Shared sanity checks run after executor construction."""
    world_size = tp_size * pp_size
    assert executor.world_size == world_size
    assert len(executor.ray_worker_handles) == world_size
    assert len(executor.response_mqs) == world_size
    # Output is produced by the first rank of the last PP stage.
    assert executor._get_output_rank() == (pp_size - 1) * tp_size
    if pp_size > 1:
        assert executor.max_concurrent_batches == pp_size
    executor.check_health()
    assert not executor.is_failed
    observed_ranks = sorted(handle.rank for handle in executor.ray_worker_handles)
    assert observed_ranks == list(range(world_size))
    assert all(
        handle.node_id is not None for handle in executor.ray_worker_handles
    )
@pytest.mark.parametrize("tp_size, pp_size", [(1, 1), (2, 1), (4, 1), (2, 2)])
def test_ray_v2_executor(tp_size, pp_size):
    """Validate RayExecutorV2 with various TP/PP configs."""
    config = create_vllm_config(
        tensor_parallel_size=tp_size, pipeline_parallel_size=pp_size
    )
    ex = RayExecutorV2(vllm_config=config)
    try:
        assert_executor(ex, tp_size, pp_size)
    finally:
        ex.shutdown()
@pytest.mark.parametrize(
    "tp_size, pp_size, create_placement_group",
    [(2, 1, 2), (4, 1, 4), (2, 2, 4)],
    indirect=["create_placement_group"],
)
def test_ray_v2_executor_pg(tp_size, pp_size, create_placement_group):
    """Validate RayExecutorV2 with various TP/PP configs using external PG."""
    config = create_vllm_config(
        tensor_parallel_size=tp_size,
        pipeline_parallel_size=pp_size,
        placement_group=create_placement_group,
    )
    ex = RayExecutorV2(vllm_config=config)
    try:
        assert_executor(ex, tp_size, pp_size)
    finally:
        ex.shutdown()
@pytest.mark.parametrize(
    "executor",
    [create_vllm_config(tensor_parallel_size=2)],
    indirect=True,
)
def test_ray_v2_executor_failure_callback(executor):
    """A callback registered before failure stays pending; one registered
    after the executor is marked failed fires immediately."""
    invocations = []

    def record_invocation():
        invocations.append(True)

    executor.register_failure_callback(record_invocation)
    assert not invocations
    executor.is_failed = True
    executor.register_failure_callback(record_invocation)
    assert invocations
@pytest.mark.parametrize(
    "executor",
    [create_vllm_config(tensor_parallel_size=2)],
    indirect=True,
)
def test_ray_v2_executor_collective_rpc(executor):
    """The executor stays healthy and exposes the RPC broadcast queue."""
    executor.check_health()
    assert not executor.is_failed
    assert executor.rpc_broadcast_mq is not None
@pytest.mark.parametrize(
    "executor",
    [create_vllm_config(tensor_parallel_size=2)],
    indirect=True,
)
def test_ray_v2_executor_driver_node_rank_0(executor):
    """Validate that driver node workers get the lowest ranks."""
    driver_node = ray.get_runtime_context().get_node_id()
    assert all(
        handle.node_id == driver_node for handle in executor.ray_worker_handles
    )
    rank0 = next(h for h in executor.ray_worker_handles if h.rank == 0)
    assert rank0.node_id == driver_node
@pytest.mark.parametrize(
    "executor",
    [create_vllm_config(tensor_parallel_size=2)],
    indirect=True,
)
def test_ray_v2_executor_worker_death(executor):
    """Validate executor detects worker death via ray.wait()."""
    death_detected = threading.Event()
    executor.register_failure_callback(death_detected.set)
    assert not executor.is_failed
    # Kill one worker actor externally.
    ray.kill(executor.ray_worker_handles[1].actor, no_restart=True)
    # The monitor thread should notice the death and fire the callback.
    assert death_detected.wait(timeout=30)
    assert executor.is_failed
    assert executor.shutting_down
def test_ray_v2_executor_shutdown():
    """Validate graceful shutdown: ray.kill() terminates all worker actors."""
    ex = RayExecutorV2(vllm_config=create_vllm_config(tensor_parallel_size=2))
    assert ex.rpc_broadcast_mq is not None
    assert len(ex.response_mqs) == ex.world_size
    actors = [handle.actor for handle in ex.ray_worker_handles]
    ex.shutdown()
    # Every actor must be dead: any remote call should raise.
    for actor in actors:
        with pytest.raises(ray.exceptions.RayActorError):
            ray.get(actor.wait_for_init.remote(), timeout=5)
    assert ex.rpc_broadcast_mq is None
    assert not ex.response_mqs
@pytest.mark.parametrize(
    "executor",
    [create_vllm_config(tensor_parallel_size=2)],
    indirect=True,
)
def test_ray_v2_run_refs_stored_for_monitoring(executor):
    """Every worker handle stores a still-pending run_ref for monitoring."""
    for handle in executor.ray_worker_handles:
        assert handle.run_ref is not None
        ready, _pending = ray.wait([handle.run_ref], timeout=0)
        assert not ready, "run_ref should be pending"
@pytest.mark.parametrize("tp_size, pp_size", [(2, 1), (2, 2)])
def test_ray_v2_single_node_generation(tp_size, pp_size):
    """End-to-end LLM generation with RayExecutorV2."""
    llm = LLM(
        model=MODEL,
        tensor_parallel_size=tp_size,
        pipeline_parallel_size=pp_size,
        distributed_executor_backend="ray",
        enforce_eager=True,
        max_model_len=256,
        gpu_memory_utilization=0.3,
    )
    try:
        prompts = [
            "Hello, my name is",
            "The capital of France is",
            "The future of AI is",
        ]
        results = llm.generate(prompts)
        assert len(results) == len(prompts)
        for request_output in results:
            completions = request_output.outputs
            assert len(completions) > 0
            assert len(completions[0].text) > 0
    finally:
        llm.llm_engine.model_executor.shutdown()
        del llm
        gc.collect()
@pytest.mark.parametrize(
    "bundle_indices, expected_bundle_ids, create_placement_group",
    [("2,3", [2, 3], 4), ("3,2", [3, 2], 4)],
    indirect=["create_placement_group"],
)
def test_ray_v2_bundle_indices_env(
    bundle_indices, expected_bundle_ids, create_placement_group, monkeypatch
):
    """Validate explicit VLLM_RAY_BUNDLE_INDICES bundle placement."""
    monkeypatch.setenv("VLLM_RAY_BUNDLE_INDICES", bundle_indices)
    config = create_vllm_config(
        tensor_parallel_size=2,
        placement_group=create_placement_group,
    )
    ex = RayExecutorV2(vllm_config=config)
    try:
        by_rank = sorted(ex.ray_worker_handles, key=lambda h: h.rank)
        assert [h.bundle_id_idx for h in by_rank] == expected_bundle_ids
        assert_executor(ex, tp_size=2, pp_size=1)
    finally:
        ex.shutdown()
@pytest.mark.parametrize(
    "bundle_indices, expected_error, create_placement_group",
    [
        ("1,1", "cannot have duplicate values,", 4),
        ("0,1,2", "must have the same size", 4),
    ],
    indirect=["create_placement_group"],
)
def test_ray_v2_invalid_bundle_indices(
    bundle_indices, expected_error, create_placement_group, monkeypatch
):
    """Malformed VLLM_RAY_BUNDLE_INDICES values must be rejected."""
    monkeypatch.setenv("VLLM_RAY_BUNDLE_INDICES", bundle_indices)
    config = create_vllm_config(
        tensor_parallel_size=2, placement_group=create_placement_group
    )
    with pytest.raises(AssertionError, match=expected_error):
        RayExecutorV2(vllm_config=config)
@pytest.mark.parametrize("tp_size, pp_size", [(2, 1), (2, 2)])
def test_ray_v2_single_node_generation_with_pg(tp_size, pp_size):
    """E2E LLM generation with a user-provided placement group.

    Fixes over the naive form: ``llm`` is pre-bound to None so the
    ``finally`` block cannot raise NameError (masking the real error)
    when LLM construction fails, and the placement group is removed on
    exit instead of being leaked into later tests.
    """
    ensure_ray_initialized()
    bundles = [{"GPU": 1, "CPU": 1} for _ in range(tp_size * pp_size)]
    pg = ray.util.placement_group(bundles, strategy="PACK")
    ray.get(pg.ready())
    llm = None
    try:
        with patch.object(ray.util, "get_current_placement_group", return_value=pg):
            llm = LLM(
                model=MODEL,
                tensor_parallel_size=tp_size,
                pipeline_parallel_size=pp_size,
                distributed_executor_backend="ray",
                enforce_eager=True,
                max_model_len=256,
                gpu_memory_utilization=0.3,
            )
            prompts = [
                "Hello, my name is",
                "The capital of France is",
                "The future of AI is",
            ]
            outputs = llm.generate(prompts)
            assert len(outputs) == len(prompts)
            for output in outputs:
                assert len(output.outputs) > 0
                assert len(output.outputs[0].text) > 0
    finally:
        if llm is not None:
            llm.llm_engine.model_executor.shutdown()
            del llm
        gc.collect()
        ray.util.remove_placement_group(pg)

View File

@@ -0,0 +1,209 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Orchestration-level integration tests for RayExecutorV2.
"""
import gc
import os
import pathlib
import pytest
import ray
pytestmark = pytest.mark.usefixtures("enable_ray_v2_backend")
MODEL = "facebook/opt-125m"
def _get_env_var(worker, name):
return os.environ.get(name)
def _ray_init():
    """Start Ray with the project root on workers' PYTHONPATH.

    Without this, workers cannot unpickle actor classes defined in the
    ``tests`` package, causing FunctionActorManager to fall back to
    TemporaryActor which drops async method signatures.
    """
    repo_root = str(pathlib.Path(__file__).resolve().parents[2])
    runtime_env = {"env_vars": {"PYTHONPATH": repo_root}}
    ray.init(ignore_reinit_error=True, runtime_env=runtime_env)
@pytest.fixture
def ray_init():
    """Fixture wrapper around _ray_init for tests that need Ray running."""
    _ray_init()
class _AsyncLLMActor:
    """Ray actor hosting an AsyncLLM engine backed by RayExecutorV2.

    Improvement: the generation loop duplicated between ``generate`` and
    ``generate_and_get_worker_envs`` is factored into ``_generate_text``.
    """

    def start(self, pg, bundle_indices=None, ray_runtime_env=None):
        """Build an AsyncLLM bound to placement group ``pg``."""
        os.environ["VLLM_USE_RAY_V2_EXECUTOR_BACKEND"] = "1"
        # Needed so collective_rpc can pickle _get_env_var over the
        # AsyncLLM -> EngineCore ZMQ boundary.
        os.environ["VLLM_ALLOW_INSECURE_SERIALIZATION"] = "1"
        if bundle_indices is not None:
            os.environ["VLLM_RAY_BUNDLE_INDICES"] = bundle_indices
        else:
            os.environ.pop("VLLM_RAY_BUNDLE_INDICES", None)
        from vllm.engine.arg_utils import AsyncEngineArgs
        from vllm.v1.engine.async_llm import AsyncLLM
        from vllm.v1.executor.abstract import Executor

        engine_args = AsyncEngineArgs(
            model=MODEL,
            tensor_parallel_size=2,
            distributed_executor_backend="ray",
            enforce_eager=True,
            max_model_len=256,
            gpu_memory_utilization=0.8,
        )
        vllm_config = engine_args.create_engine_config()
        vllm_config.parallel_config.placement_group = pg
        if ray_runtime_env is not None:
            vllm_config.parallel_config.ray_runtime_env = ray_runtime_env
        executor_class = Executor.get_class(vllm_config)
        self.engine = AsyncLLM(
            vllm_config=vllm_config,
            executor_class=executor_class,
            log_stats=False,
            log_requests=False,
        )

    async def _generate_text(self, prompt):
        """Run one generation to completion and return the output text."""
        from vllm.sampling_params import SamplingParams

        params = SamplingParams(max_tokens=16)
        result = None
        async for output in self.engine.generate(
            prompt, params, request_id="test_request_id"
        ):
            result = output
        assert result is not None
        return result.outputs[0].text

    async def generate(self, prompt):
        """Generate and return text for ``prompt``."""
        return await self._generate_text(prompt)

    async def generate_and_get_worker_envs(self, prompt, env_names):
        """Generate text, then query each env var on all workers via RPC."""
        text = await self._generate_text(prompt)
        env_results = {}
        for name in env_names:
            env_results[name] = await self.engine.collective_rpc(
                _get_env_var, timeout=10, args=(name,)
            )
        return text, env_results

    def shutdown(self):
        # Tolerates being called before start() ever ran.
        if engine := getattr(self, "engine", None):
            engine.shutdown()
            del self.engine
        gc.collect()
AsyncLLMActor = ray.remote(num_cpus=0, max_concurrency=1)(_AsyncLLMActor)
def test_multi_replicas(ray_init):
    """Two independent engines, each in its own placement group,
    generate concurrently."""
    pg1 = ray.util.placement_group([{"GPU": 1, "CPU": 1}] * 2, strategy="PACK")
    pg2 = ray.util.placement_group([{"GPU": 1, "CPU": 1}] * 2, strategy="PACK")
    ray.get([pg1.ready(), pg2.ready()])
    replicas = [AsyncLLMActor.remote(), AsyncLLMActor.remote()]
    ray.get(replicas[0].start.remote(pg1))
    ray.get(replicas[1].start.remote(pg2))
    texts = ray.get(
        [replica.generate.remote("Hello world") for replica in replicas]
    )
    assert all(len(text) > 0 for text in texts)
def test_multi_replicas_with_bundle_indices(ray_init):
    """Two engines share one placement group via disjoint bundle indices."""
    pg = ray.util.placement_group([{"GPU": 1, "CPU": 1}] * 4, strategy="PACK")
    ray.get(pg.ready())
    replicas = [AsyncLLMActor.remote(), AsyncLLMActor.remote()]
    ray.get(replicas[0].start.remote(pg, bundle_indices="2,1"))
    ray.get(replicas[1].start.remote(pg, bundle_indices="0,3"))
    texts = ray.get(
        [replica.generate.remote("Hello world") for replica in replicas]
    )
    assert all(len(text) > 0 for text in texts)
def test_env_var_and_runtime_env_propagation():
    """
    Verify env vars (NCCL_, HF_) and parallel_config.ray_runtime_env
    propagate to RayWorkerProc actors.
    """
    sentinel_vars = {
        "NCCL_DEBUG": "INFO",
        "HF_TOKEN": "test_sentinel_token",
    }
    os.environ.update(sentinel_vars)
    try:
        # Called directly (not via the ray_init fixture) because sentinel
        # env vars must be in os.environ before ray.init() so that Ray
        # worker processes inherit them.
        _ray_init()
        pg = ray.util.placement_group([{"GPU": 1, "CPU": 1}] * 2, strategy="PACK")
        ray.get(pg.ready())
        # Include the project root so that RayWorkerProc actors can
        # unpickle _get_env_var.
        project_root = str(pathlib.Path(__file__).resolve().parents[2])
        runtime_env = {
            "env_vars": {
                "RAY_RUNTIME_ENV_TEST": "ray_runtime_env",
                "PYTHONPATH": project_root,
            },
        }
        actor = AsyncLLMActor.remote()
        ray.get(actor.start.remote(pg, ray_runtime_env=runtime_env))
        queried_names = list(sentinel_vars) + ["RAY_RUNTIME_ENV_TEST"]
        text, env_results = ray.get(
            actor.generate_and_get_worker_envs.remote("Hello world", queried_names)
        )
        assert len(text) > 0
        for name, expected in sentinel_vars.items():
            assert all(val == expected for val in env_results[name])
        assert all(
            val == "ray_runtime_env" for val in env_results["RAY_RUNTIME_ENV_TEST"]
        )
    finally:
        for key in sentinel_vars:
            os.environ.pop(key, None)

View File

@@ -0,0 +1,51 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for vllm.v1.executor.ray_env_utils."""
import os
from unittest.mock import patch
from vllm.v1.executor.ray_env_utils import get_driver_env_vars
# Worker-specific vars the tests pass as the exclusion set for
# get_driver_env_vars (a small stand-in for the production set).
WORKER_VARS: set[str] = {
    "CUDA_VISIBLE_DEVICES",
    "LOCAL_RANK",
}
class TestDefaultPropagation:
    """All env vars are propagated unless explicitly excluded."""

    @patch.dict(os.environ, {"NCCL_DEBUG": "INFO"}, clear=False)
    def test_nccl_prefix(self):
        propagated = get_driver_env_vars(WORKER_VARS)
        assert propagated["NCCL_DEBUG"] == "INFO"

    @patch.dict(os.environ, {"HF_TOKEN": "secret"}, clear=False)
    def test_hf_token(self):
        propagated = get_driver_env_vars(WORKER_VARS)
        assert "HF_TOKEN" in propagated

    @patch.dict(os.environ, {"LMCACHE_LOCAL_CPU": "True"}, clear=False)
    def test_lmcache_prefix(self):
        propagated = get_driver_env_vars(WORKER_VARS)
        assert "LMCACHE_LOCAL_CPU" in propagated

    @patch.dict(os.environ, {"PYTHONHASHSEED": "42"}, clear=False)
    def test_pythonhashseed(self):
        propagated = get_driver_env_vars(WORKER_VARS)
        assert propagated["PYTHONHASHSEED"] == "42"

    @patch.dict(os.environ, {"MYLIB_FOO": "bar"}, clear=False)
    def test_arbitrary_var_propagated(self):
        propagated = get_driver_env_vars(WORKER_VARS)
        assert propagated["MYLIB_FOO"] == "bar"
class TestExclusion:
    """Worker-specific and blacklisted vars are never propagated."""

    @patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}, clear=False)
    def test_worker_specific_excluded(self):
        propagated = get_driver_env_vars(WORKER_VARS)
        assert "CUDA_VISIBLE_DEVICES" not in propagated

    @patch.dict(os.environ, {"LMCACHE_LOCAL_CPU": "True"}, clear=False)
    @patch(
        "vllm.v1.executor.ray_env_utils.RAY_NON_CARRY_OVER_ENV_VARS",
        {"LMCACHE_LOCAL_CPU"},
    )
    def test_non_carry_over_blacklist(self):
        propagated = get_driver_env_vars(WORKER_VARS)
        assert "LMCACHE_LOCAL_CPU" not in propagated

View File

@@ -0,0 +1,100 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import MagicMock, patch
import pytest
from vllm.v1.executor.ray_utils import get_bundles_sorted_by_node
# Synthetic node IDs / IPs used by the bundle-sorting tests below.
NODE_A = "node_a"
NODE_B = "node_b"
NODE_C = "node_c"
IP_A = "10.0.0.1"
IP_B = "10.0.0.2"
IP_C = "10.0.0.3"
NODE_ID_TO_IP = {NODE_A: IP_A, NODE_B: IP_B, NODE_C: IP_C}
# Shape mirrors ray.nodes() entries consumed by get_bundles_sorted_by_node.
MOCK_RAY_NODES = [
    {"NodeID": NODE_A, "NodeManagerAddress": IP_A, "Alive": True},
    {"NodeID": NODE_B, "NodeManagerAddress": IP_B, "Alive": True},
    {"NodeID": NODE_C, "NodeManagerAddress": IP_C, "Alive": True},
]
@pytest.mark.parametrize(
    "bundles_to_node_id,bundle_specs,expected",
    [
        pytest.param(
            {0: NODE_C, 1: NODE_A, 2: NODE_B, 3: NODE_C, 4: NODE_A, 5: NODE_B},
            [{"GPU": 1}] * 6,
            [
                (1, NODE_A, IP_A),
                (4, NODE_A, IP_A),
                (2, NODE_B, IP_B),
                (5, NODE_B, IP_B),
                (0, NODE_C, IP_C),
                (3, NODE_C, IP_C),
            ],
        ),
        pytest.param(
            {0: NODE_B, 1: NODE_B, 2: NODE_A, 3: NODE_A},
            [{"GPU": 1}] * 4,
            [
                (2, NODE_A, IP_A),
                (3, NODE_A, IP_A),
                (0, NODE_B, IP_B),
                (1, NODE_B, IP_B),
            ],
        ),
        pytest.param(
            {0: NODE_C, 1: NODE_B, 2: NODE_C, 3: NODE_B},
            [{"GPU": 1}] * 4,
            [
                (1, NODE_B, IP_B),
                (3, NODE_B, IP_B),
                (0, NODE_C, IP_C),
                (2, NODE_C, IP_C),
            ],
        ),
        pytest.param(
            {0: NODE_A, 1: NODE_A, 2: NODE_A},
            [{"GPU": 1}] * 3,
            [(0, NODE_A, IP_A), (1, NODE_A, IP_A), (2, NODE_A, IP_A)],
        ),
        pytest.param(
            {},
            [],
            [],
        ),
        pytest.param(
            {0: NODE_A, 1: NODE_B, 2: NODE_A},
            [{"CPU": 1}, {"GPU": 1}, {"GPU": 1}],
            [(2, NODE_A, IP_A), (1, NODE_B, IP_B)],
        ),
    ],
)
def test_get_bundles_sorted_by_node(bundles_to_node_id, bundle_specs, expected):
    """Bundles come back grouped by node with only GPU bundles kept
    (CPU-only bundles are dropped, per the last case)."""
    fake_pg = MagicMock()
    fake_pg.bundle_specs = bundle_specs
    fake_ctx = MagicMock()
    fake_ctx.get_node_id.return_value = NODE_A
    with (
        patch(
            "vllm.v1.executor.ray_utils.placement_group_table",
            return_value={"bundles_to_node_id": bundles_to_node_id},
        ),
        patch("vllm.v1.executor.ray_utils.ray") as fake_ray,
        patch("vllm.v1.executor.ray_utils.current_platform") as fake_platform,
    ):
        fake_ray.get_runtime_context.return_value = fake_ctx
        fake_ray.nodes.return_value = MOCK_RAY_NODES
        fake_platform.ray_device_key = "GPU"
        assert get_bundles_sorted_by_node(fake_pg) == expected

View File

@@ -59,6 +59,7 @@ if TYPE_CHECKING:
VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE: Literal["auto", "nccl", "shm"] = "auto"
VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM: bool = False
VLLM_USE_RAY_WRAPPED_PP_COMM: bool = True
VLLM_USE_RAY_V2_EXECUTOR_BACKEND: bool = False
VLLM_XLA_USE_SPMD: bool = False
VLLM_WORKER_MULTIPROC_METHOD: Literal["fork", "spawn"] = "fork"
VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets")
@@ -753,6 +754,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_USE_RAY_WRAPPED_PP_COMM": lambda: bool(
int(os.getenv("VLLM_USE_RAY_WRAPPED_PP_COMM", "1"))
),
# When True and distributed_executor_backend="ray", use RayExecutorV2
# (MQ-based) instead of RayDistributedExecutor (compiled-graph backend).
# TODO (jeffreywang): Enable by default in vLLM 0.20.0.
"VLLM_USE_RAY_V2_EXECUTOR_BACKEND": lambda: bool(
int(os.getenv("VLLM_USE_RAY_V2_EXECUTOR_BACKEND", "0"))
),
# Use dedicated multiprocess context for workers.
# Both spawn and fork work
"VLLM_WORKER_MULTIPROC_METHOD": env_with_choices(

View File

@@ -7,6 +7,7 @@ from concurrent.futures import Future
from functools import cached_property
from typing import TYPE_CHECKING, Literal, TypeVar, overload
import vllm.envs as envs
from vllm.config import VllmConfig
from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator
from vllm.distributed.kv_transfer.kv_connector.v1.base import (
@@ -57,6 +58,11 @@ class Executor(ABC):
)
executor_class = distributed_executor_backend
elif distributed_executor_backend == "ray":
if envs.VLLM_USE_RAY_V2_EXECUTOR_BACKEND:
from vllm.v1.executor.ray_executor_v2 import RayExecutorV2
executor_class = RayExecutorV2
else:
from vllm.v1.executor.ray_executor import RayDistributedExecutor
executor_class = RayDistributedExecutor

View File

@@ -0,0 +1,18 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from vllm.ray.ray_env import RAY_NON_CARRY_OVER_ENV_VARS
def get_driver_env_vars(
    worker_specific_vars: set[str],
) -> dict[str, str]:
    """Return driver env vars to propagate to Ray workers.

    Returns everything from ``os.environ`` except ``worker_specific_vars``
    and user-configured exclusions (``RAY_NON_CARRY_OVER_ENV_VARS``).
    """
    excluded = worker_specific_vars | RAY_NON_CARRY_OVER_ENV_VARS
    return {
        name: os.environ[name] for name in os.environ if name not in excluded
    }

View File

@@ -23,6 +23,7 @@ from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
from vllm.v1.executor.abstract import Executor
from vllm.v1.executor.ray_utils import (
WORKER_SPECIFIC_ENV_VARS,
FutureWrapper,
RayWorkerWrapper,
initialize_ray_cluster,
@@ -62,17 +63,6 @@ class RayWorkerMetaData:
class RayDistributedExecutor(Executor):
"""Ray-based distributed executor"""
# These env vars are worker-specific, therefore are NOT copied
# from the driver to the workers
WORKER_SPECIFIC_ENV_VARS = {
"VLLM_HOST_IP",
"VLLM_HOST_PORT",
"LOCAL_RANK",
"CUDA_VISIBLE_DEVICES",
"HIP_VISIBLE_DEVICES",
"ROCR_VISIBLE_DEVICES",
}
uses_ray: bool = True
supports_pp: bool = True
@@ -335,7 +325,7 @@ class RayDistributedExecutor(Executor):
# Environment variables to copy from driver to workers
env_vars_to_copy = get_env_vars_to_copy(
exclude_vars=self.WORKER_SPECIFIC_ENV_VARS,
exclude_vars=WORKER_SPECIFIC_ENV_VARS,
additional_vars=set(current_platform.additional_env_vars),
destination="workers",
)

View File

@@ -0,0 +1,524 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import copy
import os
import threading
import weakref
from collections import defaultdict, deque
from dataclasses import dataclass
from typing import Any
import vllm.envs as envs
from vllm.config import VllmConfig
from vllm.distributed.device_communicators.shm_broadcast import (
Handle,
MessageQueue,
)
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils.network_utils import (
get_distributed_init_method,
get_open_port,
)
from vllm.v1.executor.multiproc_executor import (
FutureWrapper,
MultiprocExecutor,
WorkerProc,
)
from vllm.v1.executor.ray_env_utils import get_driver_env_vars
from vllm.v1.executor.ray_utils import (
WORKER_SPECIFIC_ENV_VARS,
build_actor_name,
get_bundles_for_indices,
get_bundles_sorted_by_node,
initialize_ray_cluster,
ray,
)
if ray is not None:
    from ray.actor import ActorHandle
    from ray.types import ObjectRef
    from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
else:
    # Stub BOTH annotation names. RayWorkerHandle's dataclass field
    # annotations (ActorHandle, ObjectRef) are evaluated at class-creation
    # time, so leaving ObjectRef undefined would raise NameError on import
    # when Ray is not installed.
    ActorHandle = None
    ObjectRef = None
logger = init_logger(__name__)
@dataclass
class RayWorkerHandle:
    """Handle for a Ray worker actor, compatible with MultiprocExecutor."""

    actor: ActorHandle
    """Ray worker actor"""
    rank: int
    """Rank of the worker"""
    local_rank: int
    """Local rank of the worker"""
    node_id: str
    """Node ID of the worker"""
    bundle_id_idx: int = -1
    """Placement group bundle index for the worker"""
    run_ref: ObjectRef | None = None
    """run() ObjectRef used as a sentinel for health monitoring"""

    def run(self):
        """Start the worker's busy loop"""
        # Keep the ObjectRef: the executor's monitor ray.wait()s on it
        # to detect worker death (see run_ref docstring above).
        self.run_ref = self.actor.run.remote()
class RayWorkerProc(WorkerProc):
    """Worker process that runs inside a Ray actor.

    Initialization is split into two phases:

    1. __init__: lightweight setup, stores init args (no device/model init)
    2. initialize_worker: called after GPU IDs are discovered, completes
       the full WorkerProc initialization with the correct local_rank and
       CUDA_VISIBLE_DEVICES.

    CUDA_VISIBLE_DEVICES setup flow:

    1. RayExecutorV2 enables RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES so Ray does
       not set CUDA_VISIBLE_DEVICES on RayWorkerProc actors at creation time.
    2. Each actor is scheduled with a placement group and bundle index; Ray resolves
       the physical GPU ID for that bundle at placement time.
    3. After placement, the worker discovers that GPU ID and sets
       CUDA_VISIBLE_DEVICES before finishing WorkerProc initialization.

    There is no workaround for this unset-and-reset sequence when the placement group
    is externally managed: scheduling must complete before CUDA_VISIBLE_DEVICES can
    match the GPU tied to the worker's bundle.

    This sequence allows multiple vLLM instances to coexist on the same node:
    each instance is unaware which physical devices others hold, and the
    externally managed placement group avoids CUDA_VISIBLE_DEVICES conflicts
    by binding workers to specific placement group bundles.
    """

    def __init__(
        self,
        vllm_config: VllmConfig,
        rank: int,
        distributed_init_method: str,
        input_shm_handle: Handle,
        is_driver_worker: bool,
        is_driver_node: bool = False,
    ):
        # Defer WorkerProc.__init__ until GPU IDs are known.
        # Phase 1 deliberately does NOT call super().__init__(); phase 2
        # (initialize_worker) performs it once local_rank is resolved.
        self._is_driver_node = is_driver_node
        self._init_kwargs = dict(
            vllm_config=vllm_config,
            rank=rank,
            distributed_init_method=distributed_init_method,
            input_shm_handle=input_shm_handle,
            # Each Ray actor is its own process, so no in-process lock is
            # shared with sibling workers. NOTE(review): assumes WorkerProc
            # accepts shared_worker_lock=None — confirm against WorkerProc.
            shared_worker_lock=None,
            is_driver_worker=is_driver_worker,
        )

    def get_node_and_gpu_ids(self) -> tuple[str, list[int]]:
        """Return (node_id, gpu_ids) assigned to this actor by Ray."""
        node_id = ray.get_runtime_context().get_node_id()
        device_key = current_platform.ray_device_key
        if not device_key:
            raise RuntimeError(
                f"current platform {current_platform.device_name} does not support ray."
            )
        # Ray reports accelerator IDs as strings; normalize them to ints.
        gpu_ids = ray.get_runtime_context().get_accelerator_ids()[device_key]
        return node_id, [int(x) for x in gpu_ids]

    def initialize_worker(
        self,
        local_rank: int,
        env_vars: dict[str, str],
        driver_env_vars: dict[str, str] | None = None,
    ) -> None:
        """Complete initialization after GPU assignment is known.

        *driver_env_vars* are applied with ``setdefault`` — they fill
        in missing vars but never overwrite node-local values.
        *env_vars* (e.g. CUDA_VISIBLE_DEVICES) always overwrite.
        """
        if driver_env_vars:
            for key, value in driver_env_vars.items():
                os.environ.setdefault(key, value)
        # Applied after driver vars so per-worker values always win.
        for key, value in env_vars.items():
            os.environ[key] = value
        self.local_rank = local_rank
        # Phase 2: run the full WorkerProc initialization that was
        # deferred in __init__.
        super().__init__(
            local_rank=local_rank,
            **self._init_kwargs,
        )

    def _init_message_queues(
        self, input_shm_handle: Handle, vllm_config: VllmConfig
    ) -> None:
        """
        Workers on the same node as the executor use shared memory for
        both the broadcast (input) MQ and the response MQ. Workers on
        different nodes use TCP (n_local_reader=0).
        """
        # Attach to the executor's broadcast queue as reader `rank`.
        self.rpc_broadcast_mq = MessageQueue.create_from_handle(
            input_shm_handle, self.worker.rank
        )
        n_local = 1 if self._is_driver_node else 0
        # Use ray.util.get_node_ip_address() to get Ray's internal IP.
        # get_ip() returns host's external IP which is typically not
        # routable between nodes within the cluster.
        self.worker_response_mq = MessageQueue(
            n_reader=1,
            n_local_reader=n_local,
            connect_ip=ray.util.get_node_ip_address(),
        )
        # This executor exchanges no peer-to-peer response handles.
        self.peer_response_handles: list[dict] = []

    def wait_for_init(self) -> dict:
        """Respond to the driver's wait_until_ready() barrier."""
        assert self.worker_response_mq is not None
        # The driver checks "status" and rebuilds the response MQ from
        # the exported "handle".
        return {
            "status": self.READY_STR,
            "handle": self.worker_response_mq.export_handle(),
        }

    def run(self) -> None:
        """Main entry point called via actor.run.remote()."""
        try:
            assert self.rpc_broadcast_mq is not None
            # Both queues must complete their subscription handshake
            # before entering the busy loop.
            self.rpc_broadcast_mq.wait_until_ready()
            assert self.worker_response_mq is not None
            self.worker_response_mq.wait_until_ready()
            self.worker_busy_loop()
        except Exception as e:
            logger.exception("RayWorkerProc failed: %s", e)
            raise
        finally:
            # Always release worker resources, even on a clean exit.
            self.shutdown()
class RayExecutorV2(MultiprocExecutor):
    """Ray-based distributed executor using MessageQueue communication.

    Inherits from MultiprocExecutor to reuse the MQ-based control plane
    and NCCL data plane. Workers are Ray actors.

    Async scheduling is enabled, inherited from MultiprocExecutor.
    This is critical for RayExecutorV2 to be performant.
    """

    # Executor capability flags consumed by the engine.
    uses_ray: bool = True
    supports_pp: bool = True

    def __init__(self, vllm_config: VllmConfig):
        # No extra constructor state; all Ray-specific setup happens in
        # _init_executor().
        super().__init__(vllm_config)

    def _build_runtime_env(self) -> dict:
        """Build a runtime_env dict for RayWorkerProc actors.

        Driver env vars are applied separately via initialize_worker
        with setdefault semantics.
        """
        base = self.parallel_config.ray_runtime_env
        # Deep-copy so the mutations below never leak into the
        # user-supplied runtime env stored on parallel_config.
        runtime_env: dict = copy.deepcopy(dict(base)) if base else {}
        env_vars = runtime_env.setdefault("env_vars", {})
        # Tell Ray NOT to set device-visibility env vars on the actors;
        # the executor sets them itself after GPU discovery (see
        # RayWorkerProc's CUDA_VISIBLE_DEVICES setup flow).
        env_vars.update({v: "1" for v in current_platform.ray_noset_device_env_vars})
        if self.parallel_config.ray_workers_use_nsight:
            # Profile workers with Nsight Systems when requested.
            runtime_env["nsight"] = {
                "t": "cuda,cudnn,cublas",
                "o": "'worker_process_%p'",
                "cuda-graph-trace": "node",
            }
        return runtime_env

    @staticmethod
    def _get_actor_resource_kwargs() -> dict[str, Any]:
        """Return Ray actor resource kwargs for the current platform."""
        num_devices = envs.VLLM_RAY_PER_WORKER_GPUS
        device_key = current_platform.ray_device_key
        if device_key == "GPU":
            return {"num_gpus": num_devices}
        # Non-GPU accelerators are requested via Ray custom resources.
        return {"num_gpus": 0, "resources": {device_key: num_devices}}

    def _init_executor(self) -> None:
        """Initialize the RayExecutorV2 executor."""
        # Ensure workers are torn down even if the executor is GC'd
        # without an explicit shutdown() call.
        self._finalizer = weakref.finalize(self, self.shutdown)
        self.is_failed = False
        self.failure_callback = None
        self.shutting_down = False
        self.shutdown_lock = threading.Lock()

        # Step 1: Initialize Ray cluster and retrieve placement group
        if ray is None:
            raise ImportError("Using Ray backend requires installation of ray.")
        # require_gpu_on_driver=False: all GPU work runs in remote actors,
        # so the driver node itself needs no accelerator.
        initialize_ray_cluster(self.parallel_config, require_gpu_on_driver=False)
        placement_group = self.parallel_config.placement_group

        tp_size, pp_size, pcp_size = self._get_parallel_sizes()
        assert self.world_size == tp_size * pp_size * pcp_size, (
            f"world_size ({self.world_size}) must be equal to the "
            f"tensor_parallel_size ({tp_size}) x pipeline"
            f"_parallel_size ({pp_size}) x prefill_context"
            f"_parallel_size ({pcp_size}). "
        )

        # Step 2: Build bundle assignments for worker rank placement
        # while respecting VLLM_RAY_BUNDLE_INDICES.
        if envs.VLLM_RAY_BUNDLE_INDICES:
            bundle_to_node_id = get_bundles_for_indices(
                placement_group,
                list(map(int, envs.VLLM_RAY_BUNDLE_INDICES.split(","))),
                self.world_size,
            )
        else:
            bundle_to_node_id = get_bundles_sorted_by_node(placement_group)
        driver_node = ray.get_runtime_context().get_node_id()
        bundle_assignments: list[dict[str, Any]] = []
        # Rank i maps to the i-th bundle of the (driver-first) ordering
        # produced above.
        for rank, (bundle_id_idx, node_id, node_ip) in enumerate(bundle_to_node_id):
            bundle_assignments.append(
                {
                    "rank": rank,
                    "bundle_id_idx": bundle_id_idx,
                    "node_id": node_id,
                    "node_ip": node_ip,
                }
            )

        # Step 3: Resolve the IP for torch.distributed TCPStore.
        # The TCPStore server runs on rank 0's node, so all workers
        # must be able to reach this address.
        dist_ip = bundle_assignments[0]["node_ip"]
        distributed_init_method = get_distributed_init_method(dist_ip, get_open_port())

        # Step 4: Create broadcast MessageQueue.
        # Workers on the driver node use shared memory; the rest use TCP.
        max_chunk_bytes = envs.VLLM_MQ_MAX_CHUNK_BYTES_MB * 1024 * 1024
        n_local = sum(1 for a in bundle_assignments if a["node_id"] == driver_node)
        self.rpc_broadcast_mq = MessageQueue(
            self.world_size,
            n_local,
            max_chunk_bytes=max_chunk_bytes,
            connect_ip=ray.util.get_node_ip_address(),
        )
        scheduler_output_handle = self.rpc_broadcast_mq.export_handle()

        # Step 5: Spawn RayWorkerProc actors into PG bundles (deferred init).
        # Workers are created lightweight here; full initialization happens
        # in Step 7 after GPU IDs are discovered.
        self.ray_worker_handles: list[RayWorkerHandle] = []
        instance_id = self.vllm_config.instance_id
        # Collect driver env vars and apply but don't overwrite node-local values.
        self.driver_env_vars = get_driver_env_vars(
            worker_specific_vars=WORKER_SPECIFIC_ENV_VARS,
        )
        runtime_env = self._build_runtime_env()
        resource_kwargs = self._get_actor_resource_kwargs()
        for bundle_idx in range(self.world_size):
            bundle = bundle_assignments[bundle_idx]
            is_driver_worker = self._is_driver_worker(bundle["rank"])
            is_driver_node = bundle["node_id"] == driver_node
            # Pin the actor to its PG bundle; Ray resolves the physical
            # device at placement time.
            scheduling_strategy = PlacementGroupSchedulingStrategy(
                placement_group=placement_group,
                placement_group_bundle_index=bundle["bundle_id_idx"],
            )
            actor_name = build_actor_name(
                instance_id, bundle["rank"], tp_size, pp_size, pcp_size
            )
            actor = (
                ray.remote(RayWorkerProc)
                .options(
                    name=actor_name,
                    num_cpus=0,
                    **resource_kwargs,
                    scheduling_strategy=scheduling_strategy,
                    runtime_env=runtime_env,
                )
                .remote(
                    vllm_config=self.vllm_config,
                    rank=bundle["rank"],
                    distributed_init_method=distributed_init_method,
                    input_shm_handle=scheduler_output_handle,
                    is_driver_worker=is_driver_worker,
                    is_driver_node=is_driver_node,
                )
            )
            handle = RayWorkerHandle(
                actor=actor,
                rank=bundle["rank"],
                local_rank=-1,  # Set in Step 7 after GPU ID discovery
                node_id=bundle["node_id"],
                bundle_id_idx=bundle["bundle_id_idx"],
            )
            self.ray_worker_handles.append(handle)

        # Step 6: Discover GPU IDs assigned to each worker via Ray runtime context.
        worker_node_and_gpu_ids = ray.get(
            [h.actor.get_node_and_gpu_ids.remote() for h in self.ray_worker_handles]
        )
        node_workers: dict[str, list[int]] = defaultdict(list)
        node_gpus: dict[str, list[int]] = defaultdict(list)
        for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids):
            node_workers[node_id].append(i)
            node_gpus[node_id].extend(gpu_ids)
        for node_id, gpu_ids in node_gpus.items():
            # Sort so local_rank indexes devices deterministically.
            node_gpus[node_id] = sorted(gpu_ids)

        # Step 7: Initialize workers with correct local_rank and
        # CUDA_VISIBLE_DEVICES. Each worker sees all GPUs assigned to
        # this executor on its node; local_rank indexes into that set.
        init_worker_refs = []
        for i, (node_id, _) in enumerate(worker_node_and_gpu_ids):
            local_rank = node_workers[node_id].index(i)
            worker_env_vars = {
                current_platform.device_control_env_var: ",".join(
                    map(str, node_gpus[node_id])
                ),
            }
            self.ray_worker_handles[i].local_rank = local_rank
            init_worker_refs.append(
                self.ray_worker_handles[i].actor.initialize_worker.remote(
                    local_rank, worker_env_vars, self.driver_env_vars
                )
            )
        ray.get(init_worker_refs)

        # Step 8: Collect response MQ handles
        init_results = ray.get(
            [h.actor.wait_for_init.remote() for h in self.ray_worker_handles]
        )
        self.response_mqs: list[MessageQueue] = []
        for i, result in enumerate(init_results):
            if result["status"] != RayWorkerProc.READY_STR:
                raise RuntimeError(f"Worker {i} failed to initialize: {result}")
            # The driver is the single (remote) reader of each worker's
            # response queue, hence reader rank 0.
            self.response_mqs.append(
                MessageQueue.create_from_handle(result["handle"], 0)
            )

        # Step 9: Start run() before wait_until_ready() to avoid
        # deadlock — workers send subscriptions inside run().
        for handle in self.ray_worker_handles:
            handle.run()

        # Step 10: wait_until_ready() barrier
        self.rpc_broadcast_mq.wait_until_ready()
        for response_mq in self.response_mqs:
            response_mq.wait_until_ready()

        self.futures_queue = deque[tuple[FutureWrapper, Any]]()
        self._post_init_executor()
        self.start_worker_monitor()
        self.output_rank = self._get_output_rank()

    def start_worker_monitor(self, inline=False) -> None:
        """Monitor worker liveness via ray.wait() on run() ObjectRefs."""
        # NOTE(review): `inline` is unused here; presumably kept for
        # signature compatibility with MultiprocExecutor — confirm.
        run_refs = [h.run_ref for h in self.ray_worker_handles if h.run_ref is not None]
        if not run_refs:
            raise RuntimeError("Ray workers have not started successfully.")
        # Hold only a weakref so the monitor thread never keeps the
        # executor alive.
        self_ref = weakref.ref(self)
        ref_to_rank = {
            h.run_ref: h.rank for h in self.ray_worker_handles if h.run_ref is not None
        }

        def _should_stop() -> bool:
            # Stop when the executor was GC'd or is shutting down.
            executor = self_ref()
            return not executor or executor.shutting_down

        def monitor_workers():
            # Poll with a timeout rather than blocking on ray.wait()
            # because a blocking call would segfault if Ray is torn down
            # while this thread is inside it.
            while not _should_stop() and ray.is_initialized():
                try:
                    done, _ = ray.wait(run_refs, num_returns=1, timeout=5.0)
                except Exception:
                    logger.exception(
                        "RayWorkerMonitor: unexpected error, exiting monitor thread"
                    )
                    return
                if not done or _should_stop():
                    continue
                # A resolved run() ref means that worker's busy loop
                # exited; treat it as a fatal worker death.
                dead_ranks = [ref_to_rank[r] for r in done]
                executor = self_ref()
                if not executor:
                    return
                executor.is_failed = True
                logger.error(
                    "RayWorkerProc rank=%s died unexpectedly, shutting down executor.",
                    dead_ranks,
                )
                executor.shutdown()
                if executor.failure_callback is not None:
                    # Consume the callback exactly once.
                    callback = executor.failure_callback
                    executor.failure_callback = None
                    callback()
                return

        t = threading.Thread(
            target=monitor_workers, daemon=True, name="RayWorkerMonitor"
        )
        t.start()
        self._monitor_thread = t

    def _join_monitor_thread(self) -> None:
        """Wait for the monitor thread to exit.

        Must be called before tearing down Ray resources — the monitor
        may be inside ray.wait() which would segfault if Ray is shut
        down underneath it. When the monitor itself calls shutdown()
        on worker death, we skip the join because the thread is about
        to return anyway.
        """
        monitor = getattr(self, "_monitor_thread", None)
        if (
            monitor is not None
            and monitor.is_alive()
            and threading.current_thread() is not monitor
        ):
            monitor.join(timeout=10)

    def shutdown(self) -> None:
        """Properly shut down the executor and its workers."""
        # getattr guards throughout: shutdown() may run from the weakref
        # finalizer before _init_executor() finished setting attributes.
        lock = getattr(self, "shutdown_lock", None)
        if lock is None:
            return
        with lock:
            if getattr(self, "shutting_down", False):
                return
            self.shutting_down = True
            # Join the monitor before killing actors (see
            # _join_monitor_thread for why).
            self._join_monitor_thread()
            for handle in getattr(self, "ray_worker_handles", []):
                try:
                    ray.kill(handle.actor)
                    logger.debug("Killed actor rank=%d", handle.rank)
                except Exception:
                    # Best-effort: keep tearing down the remaining actors.
                    logger.exception("Failed to kill actor rank=%d", handle.rank)
            if rpc_broadcast_mq := getattr(self, "rpc_broadcast_mq", None):
                rpc_broadcast_mq.shutdown()
                self.rpc_broadcast_mq = None
            for mq in getattr(self, "response_mqs", []):
                mq.shutdown()
            self.response_mqs = []

View File

@@ -26,6 +26,17 @@ if TYPE_CHECKING:
logger = init_logger(__name__)
PG_WAIT_TIMEOUT = 1800
# Env vars that are worker-specific and must NOT be copied from the
# driver to Ray workers — they are set per-worker after GPU discovery.
# The three *_VISIBLE_DEVICES entries cover the CUDA, ROCm/HIP, and
# ROCR device-visibility mechanisms respectively.
WORKER_SPECIFIC_ENV_VARS: set[str] = {
    "VLLM_HOST_IP",
    "VLLM_HOST_PORT",
    "LOCAL_RANK",
    "CUDA_VISIBLE_DEVICES",
    "HIP_VISIBLE_DEVICES",
    "ROCR_VISIBLE_DEVICES",
}
try:
import ray
from ray.util import placement_group_table
@@ -51,6 +62,8 @@ try:
# that thread.
self.compiled_dag_cuda_device_set = False
rpc_rank: int
def adjust_rank(self, rank_mapping: dict[int, int]) -> None:
"""
Adjust the rpc_rank based on the given mapping.
@@ -214,13 +227,17 @@ def assert_ray_available():
def _verify_bundles(
placement_group: "PlacementGroup", parallel_config: ParallelConfig, device_str: str
placement_group: "PlacementGroup",
parallel_config: ParallelConfig,
device_str: str,
require_gpu_on_driver: bool = True,
):
"""Verify a given placement group has bundles located in the right place.
There are 2 rules.
- Warn if all tensor parallel workers cannot fit in a single node.
- Fail if driver node is not included in a placement group.
- Fail if driver node is not included in a placement group
(only when require_gpu_on_driver is True).
"""
assert ray.is_initialized(), (
"Ray is not initialized although distributed-executor-backend is ray."
@@ -237,7 +254,7 @@ def _verify_bundles(
node_id_to_bundle[node_id].append(bundles[bundle_idx])
driver_node_id = ray.get_runtime_context().get_node_id()
if driver_node_id not in node_id_to_bundle:
if require_gpu_on_driver and driver_node_id not in node_id_to_bundle:
raise RuntimeError(
f"driver node id {driver_node_id} is not included in a placement "
f"group {placement_group.id}. Node id -> bundles "
@@ -266,6 +283,115 @@ def _verify_bundles(
)
def build_actor_name(
    instance_id: str,
    rank: int,
    tp_size: int,
    pp_size: int,
    pcp_size: int,
) -> str:
    """Compose a descriptive Ray actor name for dashboard visibility.

    The TP/PP/PCP coordinates are derived from the flat rank and are
    appended only for parallel dimensions whose size exceeds 1.
    """
    parts = [f"vllm_Worker_{instance_id}"]
    if tp_size > 1:
        parts.append(f"TP{rank % tp_size}")
    if pp_size > 1:
        parts.append(f"PP{(rank // tp_size) % pp_size}")
    if pcp_size > 1:
        parts.append(f"PCP{rank // (tp_size * pp_size)}")
    return "_".join(parts)
def get_bundles_for_indices(
    placement_group: "PlacementGroup",
    bundle_indices: list[int],
    world_size: int,
) -> list[tuple[int, str, str]]:
    """Resolve explicit bundle indices (from VLLM_RAY_BUNDLE_INDICES)
    into (bundle_index, node_id, node_ip) triples, one per worker rank.
    """
    assert len(bundle_indices) == world_size, (
        "VLLM_RAY_BUNDLE_INDICES must have the same size"
        f" as the world size, but got {bundle_indices=} "
        f"and {world_size=}"
    )
    assert len(set(bundle_indices)) == len(bundle_indices), (
        "VLLM_RAY_BUNDLE_INDICES cannot have duplicate values,"
        f" but got {bundle_indices=}"
    )
    bundle_node_map = placement_group_table(placement_group)["bundles_to_node_id"]
    # Only alive nodes are resolvable to an IP.
    ip_by_node = {
        node["NodeID"]: node["NodeManagerAddress"]
        for node in ray.nodes()
        if node["Alive"]
    }
    resolved: list[tuple[int, str, str]] = []
    for bid in bundle_indices:
        node = bundle_node_map[bid]
        resolved.append((bid, node, ip_by_node[node]))
    return resolved
def get_bundles_sorted_by_node(
    placement_group: "PlacementGroup",
) -> list[tuple[int, str, str]]:
    """Return (bundle_index, node_id, node_ip) triples for every
    device-carrying bundle, ordered with the driver node's bundles
    first and the remaining bundles grouped by node ID.

    This utility has to be invoked from the driver node (the driver's
    node ID anchors the sort order).

    Example: 3-node cluster, driver on node-A, PG bundles spread
    across nodes:

        Input:  [(0, node-C), (1, node-A), (2, node-B),
                 (3, node-C), (4, node-A), (5, node-B)]
        Output: [(1, node-A), (4, node-A), (2, node-B),
                 (5, node-B), (0, node-C), (3, node-C)]
    """
    bundle_to_node = placement_group_table(placement_group)["bundles_to_node_id"]
    ray_device_key = current_platform.ray_device_key
    if not ray_device_key:
        raise ValueError(
            f"current platform {current_platform.device_name} does not support ray."
        )
    # Only alive nodes are resolvable to an IP.
    ip_by_node = {
        node["NodeID"]: node["NodeManagerAddress"]
        for node in ray.nodes()
        if node["Alive"]
    }
    bundle_specs = placement_group.bundle_specs
    assert bundle_specs is not None
    entries: list[tuple[int, str, str]] = []
    for idx, spec in enumerate(bundle_specs):
        # Skip bundles that reserve no accelerator of the platform's kind.
        if not spec.get(ray_device_key):
            continue
        node_id = bundle_to_node.get(idx)
        entries.append((idx, node_id, ip_by_node[node_id]))
    driver_node = ray.get_runtime_context().get_node_id()
    # Driver-node bundles sort before all others (False < True); ties
    # are broken by node ID so bundles stay grouped per node.
    entries.sort(key=lambda item: (item[1] != driver_node, item[1]))
    return entries
def _wait_until_pg_ready(current_placement_group: "PlacementGroup"):
"""Wait until a placement group is ready.
@@ -352,6 +478,7 @@ def _wait_until_pg_removed(current_placement_group: "PlacementGroup"):
def initialize_ray_cluster(
parallel_config: ParallelConfig,
ray_address: str | None = None,
require_gpu_on_driver: bool = True,
):
"""Initialize the distributed cluster with Ray.
@@ -363,10 +490,18 @@ def initialize_ray_cluster(
parallel_config: The configurations for parallel execution.
ray_address: The address of the Ray cluster. If None, uses
the default Ray cluster address.
require_gpu_on_driver: If True (default), require at least one GPU
on the current (driver) node and pin the first PG bundle to it.
Set to False for executors like RayExecutorV2 where all GPU work
is delegated to remote Ray actors.
"""
assert_ray_available()
from vllm.platforms import current_platform
# Disable Ray usage stats collection
if os.environ.get("RAY_USAGE_STATS_ENABLED", "0") != "1":
os.environ["RAY_USAGE_STATS_ENABLED"] = "0"
# Prevalidate GPU requirements before Ray processing
if current_platform.is_cuda() and parallel_config.world_size > 1:
available_gpus = current_platform.device_count()
@@ -459,15 +594,19 @@ def initialize_ray_cluster(
current_ip = get_ip()
current_node_id = ray.get_runtime_context().get_node_id()
current_node_resource = available_resources_per_node()[current_node_id]
# TODO (jeffreywang): require_gpu_on_driver should be always False
# after deprecating RayDistributedExecutor.
if require_gpu_on_driver:
if current_node_resource.get(device_str, 0) < 1:
raise ValueError(
f"Current node has no {device_str} available. "
f"{current_node_resource=}. vLLM engine cannot start without "
f"{device_str}. Make sure you have at least 1 {device_str} "
f"available in a node {current_node_id=} {current_ip=}."
f"{current_node_resource=}. vLLM engine cannot start "
f"without {device_str}. Make sure you have at least 1 "
f"{device_str} available in a node "
f"{current_node_id=} {current_ip=}."
)
# This way, at least bundle is required to be created in a current
# node.
# This way, at least bundle is required to be created in a
# current node.
placement_group_specs[0][f"node:{current_ip}"] = 0.001
# By default, Ray packs resources as much as possible.
@@ -477,7 +616,9 @@ def initialize_ray_cluster(
_wait_until_pg_ready(current_placement_group)
assert current_placement_group is not None
_verify_bundles(current_placement_group, parallel_config, device_str)
_verify_bundles(
current_placement_group, parallel_config, device_str, require_gpu_on_driver
)
# Set the placement group in the parallel config
parallel_config.placement_group = current_placement_group

View File

@@ -195,8 +195,8 @@ class WorkerWrapperBase:
All workers have rpc_rank=0, but they have different ranks in the TP
group.
"""
self.rpc_rank = rpc_rank
self.global_rank = self.rpc_rank if global_rank is None else global_rank
self.rpc_rank: int = rpc_rank
self.global_rank: int = self.rpc_rank if global_rank is None else global_rank
# Initialized after init_worker is called
self.worker: WorkerBase