346 lines
10 KiB
Python
346 lines
10 KiB
Python
|
|
# SPDX-License-Identifier: Apache-2.0
|
||
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||
|
|
|
||
|
|
"""
|
||
|
|
Integration tests for RayExecutorV2 at the executor level.
|
||
|
|
Validates executor initialization, placement group support, RPC calls,
|
||
|
|
and distributed execution with various TP/PP configurations.
|
||
|
|
"""
|
||
|
|
|
||
|
|
import gc
|
||
|
|
import threading
|
||
|
|
from unittest.mock import patch
|
||
|
|
|
||
|
|
import pytest
|
||
|
|
import ray
|
||
|
|
|
||
|
|
from vllm import LLM
|
||
|
|
from vllm.config import VllmConfig
|
||
|
|
from vllm.engine.arg_utils import EngineArgs
|
||
|
|
from vllm.v1.executor.ray_executor_v2 import RayExecutorV2
|
||
|
|
|
||
|
|
pytestmark = pytest.mark.usefixtures("enable_ray_v2_backend")
|
||
|
|
|
||
|
|
MODEL = "facebook/opt-125m"
|
||
|
|
|
||
|
|
|
||
|
|
def create_vllm_config(
    tensor_parallel_size: int = 1,
    pipeline_parallel_size: int = 1,
    max_model_len: int = 256,
    gpu_memory_utilization: float = 0.3,
    placement_group=None,
) -> VllmConfig:
    """Build a VllmConfig for MODEL using the Ray distributed backend.

    If ``placement_group`` is given, it is attached to the parallel config so
    the executor schedules workers into that externally created group.
    """
    args = EngineArgs(
        model=MODEL,
        tensor_parallel_size=tensor_parallel_size,
        pipeline_parallel_size=pipeline_parallel_size,
        max_model_len=max_model_len,
        gpu_memory_utilization=gpu_memory_utilization,
        distributed_executor_backend="ray",
        enforce_eager=True,
    )
    config = args.create_engine_config()
    if placement_group is not None:
        config.parallel_config.placement_group = placement_group
    return config
|
||
|
|
|
||
|
|
|
||
|
|
def ensure_ray_initialized():
    """Initialize Ray for this process if it is not already running."""
    if ray.is_initialized():
        return
    ray.init(ignore_reinit_error=True)
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.fixture
def create_placement_group(request):
    """Yield a PACK placement group with ``request.param`` GPU+CPU bundles.

    The group is removed again during fixture teardown.
    """
    ensure_ray_initialized()
    gpu_count = request.param
    bundle_specs = [{"GPU": 1, "CPU": 1} for _ in range(gpu_count)]
    pg = ray.util.placement_group(bundle_specs, strategy="PACK")
    # Block until all bundles are actually reserved on the cluster.
    ray.get(pg.ready())
    yield pg
    ray.util.remove_placement_group(pg)
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.fixture
def executor(request):
    """Create a RayExecutorV2 and shut it down after the test."""
    # The VllmConfig arrives via indirect parametrization in request.param.
    ray_executor = RayExecutorV2(vllm_config=request.param)
    yield ray_executor
    ray_executor.shutdown()
|
||
|
|
|
||
|
|
|
||
|
|
def assert_executor(executor, tp_size, pp_size):
    """Common assertions for executor initialization tests."""
    world_size = tp_size * pp_size

    # Sizing invariants: one worker handle and one response queue per rank.
    assert executor.world_size == world_size
    assert len(executor.ray_worker_handles) == world_size
    assert len(executor.response_mqs) == world_size
    # Output rank is the first worker of the final pipeline stage.
    assert executor._get_output_rank() == (pp_size - 1) * tp_size

    if pp_size > 1:
        assert executor.max_concurrent_batches == pp_size

    executor.check_health()
    assert not executor.is_failed

    # Ranks must form a contiguous 0..world_size-1 assignment.
    observed_ranks = sorted(h.rank for h in executor.ray_worker_handles)
    assert observed_ranks == list(range(world_size))

    # Every worker must have been placed on a concrete node.
    assert all(h.node_id is not None for h in executor.ray_worker_handles)
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.mark.parametrize("tp_size, pp_size", [(1, 1), (2, 1), (4, 1), (2, 2)])
def test_ray_v2_executor(tp_size, pp_size):
    """Validate RayExecutorV2 with various TP/PP configs."""
    executor = RayExecutorV2(
        vllm_config=create_vllm_config(
            tensor_parallel_size=tp_size,
            pipeline_parallel_size=pp_size,
        )
    )
    try:
        assert_executor(executor, tp_size, pp_size)
    finally:
        executor.shutdown()
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.mark.parametrize(
    "tp_size, pp_size, create_placement_group",
    [(2, 1, 2), (4, 1, 4), (2, 2, 4)],
    indirect=["create_placement_group"],
)
def test_ray_v2_executor_pg(tp_size, pp_size, create_placement_group):
    """Validate RayExecutorV2 with various TP/PP configs using external PG."""
    executor = RayExecutorV2(
        vllm_config=create_vllm_config(
            tensor_parallel_size=tp_size,
            pipeline_parallel_size=pp_size,
            placement_group=create_placement_group,
        )
    )
    try:
        assert_executor(executor, tp_size, pp_size)
    finally:
        executor.shutdown()
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.mark.parametrize(
    "executor",
    [create_vllm_config(tensor_parallel_size=2)],
    indirect=True,
)
def test_ray_v2_executor_failure_callback(executor):
    """Validate failure callback registration."""
    invocations = []

    def test_callback():
        invocations.append(True)

    # A healthy executor only registers the callback without firing it.
    executor.register_failure_callback(test_callback)
    assert not invocations

    # An already-failed executor fires the callback immediately.
    executor.is_failed = True
    executor.register_failure_callback(test_callback)
    assert invocations
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.mark.parametrize(
    "executor",
    [create_vllm_config(tensor_parallel_size=2)],
    indirect=True,
)
def test_ray_v2_executor_collective_rpc(executor):
    """Validate collective RPC calls through MessageQueue."""
    # check_health performs an RPC round-trip through the broadcast queue.
    executor.check_health()
    assert not executor.is_failed
    assert executor.rpc_broadcast_mq is not None
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.mark.parametrize(
    "executor",
    [create_vllm_config(tensor_parallel_size=2)],
    indirect=True,
)
def test_ray_v2_executor_driver_node_rank_0(executor):
    """Validate that driver node workers get the lowest ranks."""
    driver_node = ray.get_runtime_context().get_node_id()

    # Single-node run: every worker is co-located with the driver.
    assert all(h.node_id == driver_node for h in executor.ray_worker_handles)

    rank0 = next(h for h in executor.ray_worker_handles if h.rank == 0)
    assert rank0.node_id == driver_node
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.mark.parametrize(
    "executor",
    [create_vllm_config(tensor_parallel_size=2)],
    indirect=True,
)
def test_ray_v2_executor_worker_death(executor):
    """Validate executor detects worker death via ray.wait()."""
    failure_seen = threading.Event()

    # Event.set is a zero-arg callable, so it can serve as the callback.
    executor.register_failure_callback(failure_seen.set)
    assert not executor.is_failed

    # Kill one worker actor externally
    ray.kill(executor.ray_worker_handles[1].actor, no_restart=True)

    # Monitor thread should detect the death and invoke callback
    assert failure_seen.wait(timeout=30)
    assert executor.is_failed
    assert executor.shutting_down
|
||
|
|
|
||
|
|
|
||
|
|
def test_ray_v2_executor_shutdown():
    """Validate graceful shutdown: ray.kill() terminates all worker actors."""
    executor = RayExecutorV2(vllm_config=create_vllm_config(tensor_parallel_size=2))
    assert executor.rpc_broadcast_mq is not None
    assert len(executor.response_mqs) == executor.world_size

    # Grab the actor handles before shutdown clears the executor's state.
    worker_actors = [handle.actor for handle in executor.ray_worker_handles]
    executor.shutdown()

    # Every actor must be dead: any remote call should now raise.
    for worker_actor in worker_actors:
        with pytest.raises(ray.exceptions.RayActorError):
            ray.get(worker_actor.wait_for_init.remote(), timeout=5)

    assert executor.rpc_broadcast_mq is None
    assert len(executor.response_mqs) == 0
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.mark.parametrize(
    "executor",
    [create_vllm_config(tensor_parallel_size=2)],
    indirect=True,
)
def test_ray_v2_run_refs_stored_for_monitoring(executor):
    """Validate worker handles store run_ref for monitoring."""
    for handle in executor.ray_worker_handles:
        assert handle.run_ref is not None
        # A live worker's run loop never resolves, so a zero-timeout wait
        # must report the ref as still pending.
        done, _ = ray.wait([handle.run_ref], timeout=0)
        assert not done, "run_ref should be pending"
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.mark.parametrize("tp_size, pp_size", [(2, 1), (2, 2)])
def test_ray_v2_single_node_generation(tp_size, pp_size):
    """End-to-end LLM generation with RayExecutorV2."""

    llm = LLM(
        model=MODEL,
        tensor_parallel_size=tp_size,
        pipeline_parallel_size=pp_size,
        distributed_executor_backend="ray",
        enforce_eager=True,
        max_model_len=256,
        gpu_memory_utilization=0.3,
    )
    try:
        prompts = [
            "Hello, my name is",
            "The capital of France is",
            "The future of AI is",
        ]
        outputs = llm.generate(prompts)

        # One result per prompt, each with non-empty generated text.
        assert len(outputs) == len(prompts)
        for result in outputs:
            assert len(result.outputs) > 0
            assert len(result.outputs[0].text) > 0
    finally:
        llm.llm_engine.model_executor.shutdown()
        del llm
        gc.collect()
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.mark.parametrize(
    "bundle_indices, expected_bundle_ids, create_placement_group",
    [("2,3", [2, 3], 4), ("3,2", [3, 2], 4)],
    indirect=["create_placement_group"],
)
def test_ray_v2_bundle_indices_env(
    bundle_indices, expected_bundle_ids, create_placement_group, monkeypatch
):
    """Validate explicit VLLM_RAY_BUNDLE_INDICES bundle placement."""
    monkeypatch.setenv("VLLM_RAY_BUNDLE_INDICES", bundle_indices)
    executor = RayExecutorV2(
        vllm_config=create_vllm_config(
            tensor_parallel_size=2,
            placement_group=create_placement_group,
        )
    )
    try:
        # Workers ordered by rank must land on exactly the requested bundles.
        by_rank = sorted(executor.ray_worker_handles, key=lambda h: h.rank)
        assert [h.bundle_id_idx for h in by_rank] == expected_bundle_ids
        assert_executor(executor, tp_size=2, pp_size=1)
    finally:
        executor.shutdown()
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.mark.parametrize(
    "bundle_indices, expected_error, create_placement_group",
    [
        ("1,1", "cannot have duplicate values,", 4),
        ("0,1,2", "must have the same size", 4),
    ],
    indirect=["create_placement_group"],
)
def test_ray_v2_invalid_bundle_indices(
    bundle_indices, expected_error, create_placement_group, monkeypatch
):
    """Validate invalid bundle indices are rejected."""
    monkeypatch.setenv("VLLM_RAY_BUNDLE_INDICES", bundle_indices)
    config = create_vllm_config(
        tensor_parallel_size=2, placement_group=create_placement_group
    )
    # Executor construction must fail fast with the expected message.
    with pytest.raises(AssertionError, match=expected_error):
        RayExecutorV2(vllm_config=config)
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.mark.parametrize("tp_size, pp_size", [(2, 1), (2, 2)])
def test_ray_v2_single_node_generation_with_pg(tp_size, pp_size):
    """E2E LLM generation with a user-provided placement group.

    Patches ray.util.get_current_placement_group so the executor adopts the
    externally created PG instead of creating its own.
    """
    ensure_ray_initialized()
    bundles = [{"GPU": 1, "CPU": 1} for _ in range(tp_size * pp_size)]
    pg = ray.util.placement_group(bundles, strategy="PACK")
    ray.get(pg.ready())

    # Bug fix: `llm` was previously assigned only inside the `try`, so a
    # failure in LLM(...) raised UnboundLocalError in the `finally` block
    # and masked the real error. Initialize it up front and guard cleanup.
    llm = None
    try:
        with patch.object(ray.util, "get_current_placement_group", return_value=pg):
            llm = LLM(
                model=MODEL,
                tensor_parallel_size=tp_size,
                pipeline_parallel_size=pp_size,
                distributed_executor_backend="ray",
                enforce_eager=True,
                max_model_len=256,
                gpu_memory_utilization=0.3,
            )
            prompts = [
                "Hello, my name is",
                "The capital of France is",
                "The future of AI is",
            ]
            outputs = llm.generate(prompts)

            # One result per prompt, each with non-empty generated text.
            assert len(outputs) == len(prompts)
            for output in outputs:
                assert len(output.outputs) > 0
                assert len(output.outputs[0].text) > 0
    finally:
        if llm is not None:
            llm.llm_engine.model_executor.shutdown()
            del llm
        gc.collect()
        # Bug fix: the user-provided placement group was never removed,
        # leaking its GPU/CPU reservations across tests (the
        # create_placement_group fixture does clean up; mirror that here).
        ray.util.remove_placement_group(pg)
|