# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Integration tests for RayExecutorV2 at the executor level.
Validates executor initialization, placement group support, RPC calls,
and distributed execution with various TP/PP configurations.
"""

import gc
import threading
from unittest.mock import patch

import pytest
import ray

from vllm import LLM
from vllm.config import VllmConfig
from vllm.engine.arg_utils import EngineArgs
from vllm.v1.executor.ray_executor_v2 import RayExecutorV2

# Every test in this module requests the ``enable_ray_v2_backend`` fixture.
# NOTE(review): the fixture is defined outside this file (presumably in a
# conftest) -- confirm what it toggles.
pytestmark = pytest.mark.usefixtures("enable_ray_v2_backend")

# Model used by all executor and end-to-end generation tests below.
MODEL = "facebook/opt-125m"


def create_vllm_config(
    tensor_parallel_size: int = 1,
    pipeline_parallel_size: int = 1,
    max_model_len: int = 256,
    gpu_memory_utilization: float = 0.3,
    placement_group=None,
) -> VllmConfig:
    """Build a ``VllmConfig`` for ``MODEL`` using the Ray executor backend.

    Args:
        tensor_parallel_size: Forwarded to ``EngineArgs``.
        pipeline_parallel_size: Forwarded to ``EngineArgs``.
        max_model_len: Forwarded to ``EngineArgs``.
        gpu_memory_utilization: Forwarded to ``EngineArgs``.
        placement_group: Optional placement group; when given it is stored
            on ``vllm_config.parallel_config.placement_group``.

    Returns:
        The config produced by ``EngineArgs.create_engine_config()``.
    """
    engine_args = EngineArgs(
        model=MODEL,
        tensor_parallel_size=tensor_parallel_size,
        pipeline_parallel_size=pipeline_parallel_size,
        max_model_len=max_model_len,
        gpu_memory_utilization=gpu_memory_utilization,
        distributed_executor_backend="ray",
        enforce_eager=True,
    )
    vllm_config = engine_args.create_engine_config()

    if placement_group is not None:
        vllm_config.parallel_config.placement_group = placement_group

    return vllm_config


def ensure_ray_initialized():
    """Call ``ray.init`` unless Ray reports it is already initialized."""
    if not ray.is_initialized():
        ray.init(ignore_reinit_error=True)


@pytest.fixture
def create_placement_group(request):
    """Yield a ready PACK placement group with ``request.param`` bundles.

    Each bundle reserves one GPU and one CPU. The group is removed on
    teardown, including when waiting for readiness raises.
    """
    ensure_ray_initialized()
    num_gpus = request.param
    bundles = [{"GPU": 1, "CPU": 1} for _ in range(num_gpus)]
    pg = ray.util.placement_group(bundles, strategy="PACK")
    try:
        ray.get(pg.ready())
        yield pg
    finally:
        # Release the reserved bundles even if the group never became ready.
        ray.util.remove_placement_group(pg)


@pytest.fixture
def executor(request):
    """Create a RayExecutorV2 and shut it down after the test."""
    executor = RayExecutorV2(vllm_config=request.param)
    yield executor
    executor.shutdown()


def assert_executor(executor, tp_size, pp_size):
    """Common assertions for executor initialization tests.

    Checks world size, the number of worker handles and response queues,
    the output rank, health, and that worker ranks are exactly
    ``0..world_size-1`` with a node id recorded on every handle.
    """
    world_size = tp_size * pp_size
    # Expected output rank is the first rank of the last pipeline stage.
    expected_output_rank = (pp_size - 1) * tp_size

    assert executor.world_size == world_size
    assert len(executor.ray_worker_handles) == world_size
    assert len(executor.response_mqs) == world_size
    assert executor._get_output_rank() == expected_output_rank

    if pp_size > 1:
        assert executor.max_concurrent_batches == pp_size

    executor.check_health()
    assert not executor.is_failed

    ranks = sorted(h.rank for h in executor.ray_worker_handles)
    assert ranks == list(range(world_size))

    for handle in executor.ray_worker_handles:
        assert handle.node_id is not None


@pytest.mark.parametrize("tp_size, pp_size", [(1, 1), (2, 1), (4, 1), (2, 2)])
def test_ray_v2_executor(tp_size, pp_size):
    """Validate RayExecutorV2 with various TP/PP configs."""
    vllm_config = create_vllm_config(
        tensor_parallel_size=tp_size,
        pipeline_parallel_size=pp_size,
    )
    executor = RayExecutorV2(vllm_config=vllm_config)
    try:
        assert_executor(executor, tp_size, pp_size)
    finally:
        executor.shutdown()


@pytest.mark.parametrize(
    "tp_size, pp_size, create_placement_group",
    [(2, 1, 2), (4, 1, 4), (2, 2, 4)],
    indirect=["create_placement_group"],
)
def test_ray_v2_executor_pg(tp_size, pp_size, create_placement_group):
    """Validate RayExecutorV2 with various TP/PP configs using external PG."""
    vllm_config = create_vllm_config(
        tensor_parallel_size=tp_size,
        pipeline_parallel_size=pp_size,
        placement_group=create_placement_group,
    )
    executor = RayExecutorV2(vllm_config=vllm_config)
    try:
        assert_executor(executor, tp_size, pp_size)
    finally:
        executor.shutdown()


# NOTE: the ``create_vllm_config(...)`` calls inside the ``parametrize``
# decorators below are evaluated when this module is imported (collection
# time), not when the individual test runs.
@pytest.mark.parametrize(
    "executor",
    [create_vllm_config(tensor_parallel_size=2)],
    indirect=True,
)
def test_ray_v2_executor_failure_callback(executor):
    """Validate failure callback registration."""
    callback_invoked = False

    def test_callback():
        nonlocal callback_invoked
        callback_invoked = True

    # Registering on a healthy executor must not fire the callback.
    executor.register_failure_callback(test_callback)
    assert not callback_invoked

    # Registering once the executor is marked failed must fire it.
    executor.is_failed = True
    executor.register_failure_callback(test_callback)
    assert callback_invoked


@pytest.mark.parametrize(
    "executor",
    [create_vllm_config(tensor_parallel_size=2)],
    indirect=True,
)
def test_ray_v2_executor_collective_rpc(executor):
    """Validate collective RPC calls through MessageQueue."""
    executor.check_health()
    assert not executor.is_failed
    assert executor.rpc_broadcast_mq is not None


@pytest.mark.parametrize(
    "executor",
    [create_vllm_config(tensor_parallel_size=2)],
    indirect=True,
)
def test_ray_v2_executor_driver_node_rank_0(executor):
    """Validate that driver node workers get the lowest ranks."""
    driver_node = ray.get_runtime_context().get_node_id()

    for handle in executor.ray_worker_handles:
        assert handle.node_id == driver_node

    rank0_handle = next(h for h in executor.ray_worker_handles if h.rank == 0)
    assert rank0_handle.node_id == driver_node


@pytest.mark.parametrize(
    "executor",
    [create_vllm_config(tensor_parallel_size=2)],
    indirect=True,
)
def test_ray_v2_executor_worker_death(executor):
    """Validate executor detects worker death via ray.wait()."""
    callback_event = threading.Event()

    def on_failure():
        callback_event.set()

    executor.register_failure_callback(on_failure)
    assert not executor.is_failed

    # Kill one worker actor externally
    victim = executor.ray_worker_handles[1].actor
    ray.kill(victim, no_restart=True)

    # Monitor thread should detect the death and invoke callback
    assert callback_event.wait(timeout=30)
    assert executor.is_failed
    assert executor.shutting_down


def test_ray_v2_executor_shutdown():
    """Validate graceful shutdown: ray.kill() terminates all worker actors."""
    executor = RayExecutorV2(vllm_config=create_vllm_config(tensor_parallel_size=2))
    assert executor.rpc_broadcast_mq is not None
    assert len(executor.response_mqs) == executor.world_size

    # Keep the actor handles so they can still be probed after shutdown.
    actors = [h.actor for h in executor.ray_worker_handles]
    executor.shutdown()

    for actor in actors:
        with pytest.raises(ray.exceptions.RayActorError):
            ray.get(actor.wait_for_init.remote(), timeout=5)

    assert executor.rpc_broadcast_mq is None
    assert len(executor.response_mqs) == 0


@pytest.mark.parametrize(
    "executor",
    [create_vllm_config(tensor_parallel_size=2)],
    indirect=True,
)
def test_ray_v2_run_refs_stored_for_monitoring(executor):
    """Validate worker handles store run_ref for monitoring."""
    for handle in executor.ray_worker_handles:
        assert handle.run_ref is not None
        # timeout=0 makes this a non-blocking poll of the reference.
        ready, _ = ray.wait([handle.run_ref], timeout=0)
        assert len(ready) == 0, "run_ref should be pending"


@pytest.mark.parametrize("tp_size, pp_size", [(2, 1), (2, 2)])
def test_ray_v2_single_node_generation(tp_size, pp_size):
    """End-to-end LLM generation with RayExecutorV2."""
    llm = LLM(
        model=MODEL,
        tensor_parallel_size=tp_size,
        pipeline_parallel_size=pp_size,
        distributed_executor_backend="ray",
        enforce_eager=True,
        max_model_len=256,
        gpu_memory_utilization=0.3,
    )
    try:
        prompts = [
            "Hello, my name is",
            "The capital of France is",
            "The future of AI is",
        ]
        outputs = llm.generate(prompts)

        assert len(outputs) == len(prompts)
        for output in outputs:
            assert len(output.outputs) > 0
            assert len(output.outputs[0].text) > 0
    finally:
        llm.llm_engine.model_executor.shutdown()
        del llm
        gc.collect()


@pytest.mark.parametrize(
    "bundle_indices, expected_bundle_ids, create_placement_group",
    [("2,3", [2, 3], 4), ("3,2", [3, 2], 4)],
    indirect=["create_placement_group"],
)
def test_ray_v2_bundle_indices_env(
    bundle_indices, expected_bundle_ids, create_placement_group, monkeypatch
):
    """Validate explicit VLLM_RAY_BUNDLE_INDICES bundle placement."""
    monkeypatch.setenv("VLLM_RAY_BUNDLE_INDICES", bundle_indices)
    vllm_config = create_vllm_config(
        tensor_parallel_size=2,
        placement_group=create_placement_group,
    )
    executor = RayExecutorV2(vllm_config=vllm_config)
    try:
        # Bundle ids listed in rank order must match the env var order.
        actual = [
            h.bundle_id_idx
            for h in sorted(executor.ray_worker_handles, key=lambda h: h.rank)
        ]
        assert actual == expected_bundle_ids
        assert_executor(executor, tp_size=2, pp_size=1)
    finally:
        executor.shutdown()


@pytest.mark.parametrize(
    "bundle_indices, expected_error, create_placement_group",
    [
        ("1,1", "cannot have duplicate values,", 4),
        ("0,1,2", "must have the same size", 4),
    ],
    indirect=["create_placement_group"],
)
def test_ray_v2_invalid_bundle_indices(
    bundle_indices, expected_error, create_placement_group, monkeypatch
):
    """Validate invalid bundle indices are rejected."""
    monkeypatch.setenv("VLLM_RAY_BUNDLE_INDICES", bundle_indices)
    vllm_config = create_vllm_config(
        tensor_parallel_size=2, placement_group=create_placement_group
    )
    with pytest.raises(AssertionError, match=expected_error):
        RayExecutorV2(vllm_config=vllm_config)


@pytest.mark.parametrize("tp_size, pp_size", [(2, 1), (2, 2)])
def test_ray_v2_single_node_generation_with_pg(tp_size, pp_size):
    """E2E LLM generation with a user-provided placement group.

    The placement group is created inline and handed to the engine by
    patching ``ray.util.get_current_placement_group``. Cleanup always
    shuts down the executor (if one was built) and removes the group.
    """
    ensure_ray_initialized()
    bundles = [{"GPU": 1, "CPU": 1} for _ in range(tp_size * pp_size)]
    pg = ray.util.placement_group(bundles, strategy="PACK")

    # Bound before the ``try`` so the cleanup below never hits an unbound
    # name when engine construction fails and hides the real error.
    llm = None
    try:
        ray.get(pg.ready())

        # NOTE(review): the patch is scoped to engine construction only;
        # presumably that is where the current placement group is looked
        # up -- confirm against the executor implementation.
        with patch.object(ray.util, "get_current_placement_group", return_value=pg):
            llm = LLM(
                model=MODEL,
                tensor_parallel_size=tp_size,
                pipeline_parallel_size=pp_size,
                distributed_executor_backend="ray",
                enforce_eager=True,
                max_model_len=256,
                gpu_memory_utilization=0.3,
            )
        prompts = [
            "Hello, my name is",
            "The capital of France is",
            "The future of AI is",
        ]
        outputs = llm.generate(prompts)

        assert len(outputs) == len(prompts)
        for output in outputs:
            assert len(output.outputs) > 0
            assert len(output.outputs[0].text) > 0
    finally:
        try:
            if llm is not None:
                llm.llm_engine.model_executor.shutdown()
        finally:
            del llm
            gc.collect()
            # Remove the group after the executor is gone, mirroring the
            # teardown order of the ``create_placement_group`` fixture.
            ray.util.remove_placement_group(pg)