# tests/distributed/test_ray_v2_executor_e2e.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Orchestration-level integration tests for RayExecutorV2.
"""

import gc
import os
import pathlib

import pytest
import ray

# Apply the "enable_ray_v2_backend" fixture to every test in this module.
# NOTE(review): the fixture is not defined in this file -- presumably it is
# provided by a conftest.py; confirm.
pytestmark = pytest.mark.usefixtures("enable_ray_v2_backend")

# Model identifier passed to AsyncEngineArgs in _AsyncLLMActor.start().
MODEL = "facebook/opt-125m"


def _get_env_var(worker, name):
    return os.environ.get(name)


def _ray_init():
    """Initialise Ray so that workers get the project root on PYTHONPATH.

    Workers that lack this path entry cannot unpickle actor classes defined
    in the ``tests`` package; FunctionActorManager then falls back to
    TemporaryActor, which drops async method signatures.
    """
    this_file = pathlib.Path(__file__).resolve()
    # Two levels above this file's directory is used as the project root.
    worker_env_vars = {"PYTHONPATH": str(this_file.parents[2])}
    ray.init(
        runtime_env={"env_vars": worker_env_vars},
        ignore_reinit_error=True,
    )


@pytest.fixture
def ray_init():
    """Pytest fixture that initialises Ray via ``_ray_init`` before a test.

    The fixture has no teardown step and provides no value to the test.
    """
    _ray_init()


class _AsyncLLMActor:
    def start(self, pg, bundle_indices=None, ray_runtime_env=None):
        os.environ["VLLM_USE_RAY_V2_EXECUTOR_BACKEND"] = "1"
        # Needed so collective_rpc can pickle _get_env_var over the
        # AsyncLLM -> EngineCore ZMQ boundary.
        os.environ["VLLM_ALLOW_INSECURE_SERIALIZATION"] = "1"
        if bundle_indices is not None:
            os.environ["VLLM_RAY_BUNDLE_INDICES"] = bundle_indices
        else:
            os.environ.pop("VLLM_RAY_BUNDLE_INDICES", None)

        from vllm.engine.arg_utils import AsyncEngineArgs
        from vllm.v1.engine.async_llm import AsyncLLM
        from vllm.v1.executor.abstract import Executor

        engine_args = AsyncEngineArgs(
            model=MODEL,
            tensor_parallel_size=2,
            distributed_executor_backend="ray",
            enforce_eager=True,
            max_model_len=256,
            gpu_memory_utilization=0.8,
        )
        vllm_config = engine_args.create_engine_config()
        vllm_config.parallel_config.placement_group = pg
        if ray_runtime_env is not None:
            vllm_config.parallel_config.ray_runtime_env = ray_runtime_env

        executor_class = Executor.get_class(vllm_config)
        self.engine = AsyncLLM(
            vllm_config=vllm_config,
            executor_class=executor_class,
            log_stats=False,
            log_requests=False,
        )

    async def generate(self, prompt):
        from vllm.sampling_params import SamplingParams

        params = SamplingParams(max_tokens=16)
        result = None
        async for output in self.engine.generate(
            prompt, params, request_id="test_request_id"
        ):
            result = output
        assert result is not None
        return result.outputs[0].text

    async def generate_and_get_worker_envs(self, prompt, env_names):
        from vllm.sampling_params import SamplingParams

        params = SamplingParams(max_tokens=16)
        result = None
        async for output in self.engine.generate(
            prompt, params, request_id="test_request_id"
        ):
            result = output
        assert result is not None
        text = result.outputs[0].text

        env_results = {}
        for name in env_names:
            vals = await self.engine.collective_rpc(
                _get_env_var, timeout=10, args=(name,)
            )
            env_results[name] = vals
        return text, env_results

    def shutdown(self):
        if engine := getattr(self, "engine", None):
            engine.shutdown()
            del self.engine
            gc.collect()


# Ray actor class built from _AsyncLLMActor.  The actor is declared with
# num_cpus=0 and max_concurrency=1.
AsyncLLMActor = ray.remote(num_cpus=0, max_concurrency=1)(_AsyncLLMActor)


def test_multi_replicas(ray_init):
    """Run two engines side by side, each in its own 2-bundle placement group.

    Both replicas must return non-empty text for the same prompt.  The
    engines are shut down and the placement groups removed on exit, pass or
    fail, so the reserved GPU bundles do not stay allocated after the test.
    """
    from contextlib import ExitStack

    # ExitStack runs every registered callback (in reverse order) even if an
    # earlier one raises, so a failed engine shutdown cannot leak a group.
    with ExitStack() as cleanup:
        pg1 = ray.util.placement_group([{"GPU": 1, "CPU": 1}] * 2, strategy="PACK")
        cleanup.callback(ray.util.remove_placement_group, pg1)
        pg2 = ray.util.placement_group([{"GPU": 1, "CPU": 1}] * 2, strategy="PACK")
        cleanup.callback(ray.util.remove_placement_group, pg2)
        ray.get([pg1.ready(), pg2.ready()])

        actors = [AsyncLLMActor.remote(), AsyncLLMActor.remote()]
        # shutdown() is a no-op on an actor whose start() never completed.
        cleanup.callback(
            lambda: ray.get([a.shutdown.remote() for a in actors], timeout=60)
        )
        actor1, actor2 = actors

        ray.get(actor1.start.remote(pg1))
        ray.get(actor2.start.remote(pg2))

        out1, out2 = ray.get(
            [
                actor1.generate.remote("Hello world"),
                actor2.generate.remote("Hello world"),
            ]
        )
        assert len(out1) > 0
        assert len(out2) > 0


def test_multi_replicas_with_bundle_indices(ray_init):
    """Run two engines inside one shared 4-bundle placement group.

    Each replica is started with its own ``bundle_indices`` string ("2,1"
    and "0,3").  Both must return non-empty text.  The engines are shut down
    and the placement group removed on exit, pass or fail, so the reserved
    GPU bundles do not stay allocated after the test.
    """
    from contextlib import ExitStack

    # ExitStack runs every registered callback (in reverse order) even if an
    # earlier one raises, so a failed engine shutdown cannot leak the group.
    with ExitStack() as cleanup:
        pg = ray.util.placement_group([{"GPU": 1, "CPU": 1}] * 4, strategy="PACK")
        cleanup.callback(ray.util.remove_placement_group, pg)
        ray.get(pg.ready())

        actors = [AsyncLLMActor.remote(), AsyncLLMActor.remote()]
        # shutdown() is a no-op on an actor whose start() never completed.
        cleanup.callback(
            lambda: ray.get([a.shutdown.remote() for a in actors], timeout=60)
        )
        actor1, actor2 = actors

        ray.get(actor1.start.remote(pg, bundle_indices="2,1"))
        ray.get(actor2.start.remote(pg, bundle_indices="0,3"))

        out1, out2 = ray.get(
            [
                actor1.generate.remote("Hello world"),
                actor2.generate.remote("Hello world"),
            ]
        )
        assert len(out1) > 0
        assert len(out2) > 0


def test_env_var_and_runtime_env_propagation():
    """
    Verify env vars (NCCL_, HF_) and parallel_config.ray_runtime_env
    propagate to RayWorkerProc actors.

    Sentinel values are placed in ``os.environ`` for the duration of the
    test.  On exit the previous values are restored (not just removed), the
    engine is shut down and the placement group is released.
    """
    from contextlib import ExitStack

    sentinel_vars = {
        "NCCL_DEBUG": "INFO",
        "HF_TOKEN": "test_sentinel_token",
    }
    # Remember what was set before so cleanup does not wipe a real
    # NCCL_DEBUG / HF_TOKEN that the surrounding session relies on.
    previous_env = {k: os.environ.get(k) for k in sentinel_vars}

    def _restore_env():
        for key, old_value in previous_env.items():
            if old_value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = old_value

    # ExitStack runs every registered callback (in reverse order) even if an
    # earlier one raises.
    with ExitStack() as cleanup:
        cleanup.callback(_restore_env)
        os.environ.update(sentinel_vars)

        # Called directly (not via the ray_init fixture) because sentinel
        # env vars must be in os.environ before ray.init() so that Ray
        # worker processes inherit them.
        _ray_init()

        pg = ray.util.placement_group([{"GPU": 1, "CPU": 1}] * 2, strategy="PACK")
        cleanup.callback(ray.util.remove_placement_group, pg)
        ray.get(pg.ready())

        # Include the project root so that RayWorkerProc actors can
        # unpickle _get_env_var.
        project_root = str(pathlib.Path(__file__).resolve().parents[2])
        ray_runtime_env = {
            "env_vars": {
                "RAY_RUNTIME_ENV_TEST": "ray_runtime_env",
                "PYTHONPATH": project_root,
            },
        }

        actor = AsyncLLMActor.remote()
        # shutdown() is a no-op if start() never completed.
        cleanup.callback(lambda: ray.get(actor.shutdown.remote(), timeout=60))
        ray.get(actor.start.remote(pg, ray_runtime_env=ray_runtime_env))

        expected_by_name = {**sentinel_vars, "RAY_RUNTIME_ENV_TEST": "ray_runtime_env"}
        text, env_results = ray.get(
            actor.generate_and_get_worker_envs.remote(
                "Hello world", list(expected_by_name)
            )
        )
        assert len(text) > 0

        for name, expected in expected_by_name.items():
            vals = env_results[name]
            # Guard against a vacuous pass when no worker reported a value.
            assert vals, f"no worker results returned for {name}"
            for val in vals:
                assert val == expected
[Feat][Executor] Introduce RayExecutorV2 (#36836) Signed-off-by: Jeffrey Wang <jeffreywang@anyscale.com> 2026-04-01 14:34:29 -07:00			`# SPDX-License-Identifier: Apache-2.0`
			`# SPDX-FileCopyrightText: Copyright contributors to the vLLM project`
			`"""`
			`Orchestration-level integration tests for RayExecutorV2.`
			`"""`

			`import gc`
			`import os`
			`import pathlib`

			`import pytest`
			`import ray`

			`pytestmark = pytest.mark.usefixtures("enable_ray_v2_backend")`

			`MODEL = "facebook/opt-125m"`


			`def _get_env_var(worker, name):`
			`return os.environ.get(name)`


			`def _ray_init():`
			`"""Start Ray with the project root on workers' PYTHONPATH.`

			`Without this, workers cannot unpickle actor classes defined in the`
			``tests`` package, causing FunctionActorManager to fall back to
			`TemporaryActor which drops async method signatures."""`
			`project_root = str(pathlib.Path(__file__).resolve().parents[2])`
			`ray.init(`
			`ignore_reinit_error=True,`
			`runtime_env={"env_vars": {"PYTHONPATH": project_root}},`
			`)`


			`@pytest.fixture`
			`def ray_init():`
			`_ray_init()`


			`class _AsyncLLMActor:`
			`def start(self, pg, bundle_indices=None, ray_runtime_env=None):`
			`os.environ["VLLM_USE_RAY_V2_EXECUTOR_BACKEND"] = "1"`
			`# Needed so collective_rpc can pickle _get_env_var over the`
			`# AsyncLLM -> EngineCore ZMQ boundary.`
			`os.environ["VLLM_ALLOW_INSECURE_SERIALIZATION"] = "1"`
			`if bundle_indices is not None:`
			`os.environ["VLLM_RAY_BUNDLE_INDICES"] = bundle_indices`
			`else:`
			`os.environ.pop("VLLM_RAY_BUNDLE_INDICES", None)`

			`from vllm.engine.arg_utils import AsyncEngineArgs`
			`from vllm.v1.engine.async_llm import AsyncLLM`
			`from vllm.v1.executor.abstract import Executor`

			`engine_args = AsyncEngineArgs(`
			`model=MODEL,`
			`tensor_parallel_size=2,`
			`distributed_executor_backend="ray",`
			`enforce_eager=True,`
			`max_model_len=256,`
			`gpu_memory_utilization=0.8,`
			`)`
			`vllm_config = engine_args.create_engine_config()`
			`vllm_config.parallel_config.placement_group = pg`
			`if ray_runtime_env is not None:`
			`vllm_config.parallel_config.ray_runtime_env = ray_runtime_env`

			`executor_class = Executor.get_class(vllm_config)`
			`self.engine = AsyncLLM(`
			`vllm_config=vllm_config,`
			`executor_class=executor_class,`
			`log_stats=False,`
			`log_requests=False,`
			`)`

			`async def generate(self, prompt):`
			`from vllm.sampling_params import SamplingParams`

			`params = SamplingParams(max_tokens=16)`
			`result = None`
			`async for output in self.engine.generate(`
			`prompt, params, request_id="test_request_id"`
			`):`
			`result = output`
			`assert result is not None`
			`return result.outputs[0].text`

			`async def generate_and_get_worker_envs(self, prompt, env_names):`
			`from vllm.sampling_params import SamplingParams`

			`params = SamplingParams(max_tokens=16)`
			`result = None`
			`async for output in self.engine.generate(`
			`prompt, params, request_id="test_request_id"`
			`):`
			`result = output`
			`assert result is not None`
			`text = result.outputs[0].text`

			`env_results = {}`
			`for name in env_names:`
			`vals = await self.engine.collective_rpc(`
			`_get_env_var, timeout=10, args=(name,)`
			`)`
			`env_results[name] = vals`
			`return text, env_results`

			`def shutdown(self):`
			`if engine := getattr(self, "engine", None):`
			`engine.shutdown()`
			`del self.engine`
			`gc.collect()`


			`AsyncLLMActor = ray.remote(num_cpus=0, max_concurrency=1)(_AsyncLLMActor)`


			`def test_multi_replicas(ray_init):`
			`pg1 = ray.util.placement_group([{"GPU": 1, "CPU": 1}] * 2, strategy="PACK")`
			`pg2 = ray.util.placement_group([{"GPU": 1, "CPU": 1}] * 2, strategy="PACK")`
			`ray.get([pg1.ready(), pg2.ready()])`

			`actor1 = AsyncLLMActor.remote()`
			`actor2 = AsyncLLMActor.remote()`

			`ray.get(actor1.start.remote(pg1))`
			`ray.get(actor2.start.remote(pg2))`

			`out1, out2 = ray.get(`
			`[`
			`actor1.generate.remote("Hello world"),`
			`actor2.generate.remote("Hello world"),`
			`]`
			`)`
			`assert len(out1) > 0`
			`assert len(out2) > 0`


			`def test_multi_replicas_with_bundle_indices(ray_init):`
			`pg = ray.util.placement_group([{"GPU": 1, "CPU": 1}] * 4, strategy="PACK")`
			`ray.get(pg.ready())`

			`actor1 = AsyncLLMActor.remote()`
			`actor2 = AsyncLLMActor.remote()`

			`ray.get(actor1.start.remote(pg, bundle_indices="2,1"))`
			`ray.get(actor2.start.remote(pg, bundle_indices="0,3"))`

			`out1, out2 = ray.get(`
			`[`
			`actor1.generate.remote("Hello world"),`
			`actor2.generate.remote("Hello world"),`
			`]`
			`)`
			`assert len(out1) > 0`
			`assert len(out2) > 0`


			`def test_env_var_and_runtime_env_propagation():`
			`"""`
			`Verify env vars (NCCL_, HF_) and parallel_config.ray_runtime_env`
			`propagate to RayWorkerProc actors.`
			`"""`
			`sentinel_vars = {`
			`"NCCL_DEBUG": "INFO",`
			`"HF_TOKEN": "test_sentinel_token",`
			`}`
			`for k, v in sentinel_vars.items():`
			`os.environ[k] = v`

			`try:`
			`# Called directly (not via the ray_init fixture) because sentinel`
			`# env vars must be in os.environ before ray.init() so that Ray`
			`# worker processes inherit them.`
			`_ray_init()`

			`pg = ray.util.placement_group([{"GPU": 1, "CPU": 1}] * 2, strategy="PACK")`
			`ray.get(pg.ready())`

			`# Include the project root so that RayWorkerProc actors can`
			`# unpickle _get_env_var.`
			`project_root = str(pathlib.Path(__file__).resolve().parents[2])`
			`ray_runtime_env = {`
			`"env_vars": {`
			`"RAY_RUNTIME_ENV_TEST": "ray_runtime_env",`
			`"PYTHONPATH": project_root,`
			`},`
			`}`

			`actor = AsyncLLMActor.remote()`
			`ray.get(actor.start.remote(pg, ray_runtime_env=ray_runtime_env))`

			`all_env_names = list(sentinel_vars) + ["RAY_RUNTIME_ENV_TEST"]`
			`text, env_results = ray.get(`
			`actor.generate_and_get_worker_envs.remote("Hello world", all_env_names)`
			`)`
			`assert len(text) > 0`

			`for name, expected in sentinel_vars.items():`
			`for val in env_results[name]:`
			`assert val == expected`

			`for val in env_results["RAY_RUNTIME_ENV_TEST"]:`
			`assert val == "ray_runtime_env"`

			`finally:`
			`for k in sentinel_vars:`
			`os.environ.pop(k, None)`