346 lines
10 KiB
Python
346 lines
10 KiB
Python
|
|
# SPDX-License-Identifier: Apache-2.0
|
||
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||
|
|
|
||
|
|
"""
|
||
|
|
Integration tests for RayExecutorV2 at the executor level.
|
||
|
|
Validates executor initialization, placement group support, RPC calls,
|
||
|
|
and distributed execution with various TP/PP configurations.
|
||
|
|
"""
|
||
|
|
|
||
|
|
import gc
|
||
|
|
import threading
|
||
|
|
from unittest.mock import patch
|
||
|
|
|
||
|
|
import pytest
|
||
|
|
import ray
|
||
|
|
|
||
|
|
from vllm import LLM
|
||
|
|
from vllm.config import VllmConfig
|
||
|
|
from vllm.engine.arg_utils import EngineArgs
|
||
|
|
from vllm.v1.executor.ray_executor_v2 import RayExecutorV2
|
||
|
|
|
||
|
|
pytestmark = pytest.mark.usefixtures("enable_ray_v2_backend")
|
||
|
|
|
||
|
|
MODEL = "facebook/opt-125m"
|
||
|
|
|
||
|
|
|
||
|
|
def create_vllm_config(
    tensor_parallel_size: int = 1,
    pipeline_parallel_size: int = 1,
    max_model_len: int = 256,
    gpu_memory_utilization: float = 0.3,
    placement_group=None,
) -> VllmConfig:
    """Build a VllmConfig for MODEL using the Ray distributed backend.

    If ``placement_group`` is given, it is attached to the parallel config so
    the executor schedules workers into that externally created group.
    """
    args = EngineArgs(
        model=MODEL,
        tensor_parallel_size=tensor_parallel_size,
        pipeline_parallel_size=pipeline_parallel_size,
        max_model_len=max_model_len,
        gpu_memory_utilization=gpu_memory_utilization,
        distributed_executor_backend="ray",
        enforce_eager=True,
    )
    config = args.create_engine_config()
    if placement_group is not None:
        config.parallel_config.placement_group = placement_group
    return config
|
||
|
|
|
||
|
|
|
||
|
|
def ensure_ray_initialized():
    """Initialize Ray for this process if it is not already running."""
    if ray.is_initialized():
        return
    ray.init(ignore_reinit_error=True)
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.fixture
def create_placement_group(request):
    """Yield a PACK placement group with ``request.param`` GPU+CPU bundles.

    The group is removed again during fixture teardown.
    """
    ensure_ray_initialized()
    gpu_count = request.param
    bundle_specs = [{"GPU": 1, "CPU": 1} for _ in range(gpu_count)]
    pg = ray.util.placement_group(bundle_specs, strategy="PACK")
    # Block until all bundles are actually reserved on the cluster.
    ray.get(pg.ready())
    yield pg
    ray.util.remove_placement_group(pg)
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.fixture
def executor(request):
    """Create a RayExecutorV2 and shut it down after the test."""
    # The VllmConfig arrives via indirect parametrization in request.param.
    ray_executor = RayExecutorV2(vllm_config=request.param)
    yield ray_executor
    ray_executor.shutdown()
|
||
|
|
|
||
|
|
|
||
|
|
def assert_executor(executor, tp_size, pp_size):
    """Common assertions for executor initialization tests."""
    world_size = tp_size * pp_size

    # Sizing invariants: one worker handle and one response queue per rank.
    assert executor.world_size == world_size
    assert len(executor.ray_worker_handles) == world_size
    assert len(executor.response_mqs) == world_size
    # Output rank is the first worker of the final pipeline stage.
    assert executor._get_output_rank() == (pp_size - 1) * tp_size

    if pp_size > 1:
        assert executor.max_concurrent_batches == pp_size

    executor.check_health()
    assert not executor.is_failed

    # Ranks must form a contiguous 0..world_size-1 assignment.
    observed_ranks = sorted(h.rank for h in executor.ray_worker_handles)
    assert observed_ranks == list(range(world_size))

    # Every worker must have been placed on a concrete node.
    assert all(h.node_id is not None for h in executor.ray_worker_handles)
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.mark.parametrize("tp_size, pp_size", [(1, 1), (2, 1), (4, 1), (2, 2)])
def test_ray_v2_executor(tp_size, pp_size):
    """Validate RayExecutorV2 with various TP/PP configs."""
    executor = RayExecutorV2(
        vllm_config=create_vllm_config(
            tensor_parallel_size=tp_size,
            pipeline_parallel_size=pp_size,
        )
    )
    try:
        assert_executor(executor, tp_size, pp_size)
    finally:
        executor.shutdown()
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.mark.parametrize(
    "tp_size, pp_size, create_placement_group",
    [(2, 1, 2), (4, 1, 4), (2, 2, 4)],
    indirect=["create_placement_group"],
)
def test_ray_v2_executor_pg(tp_size, pp_size, create_placement_group):
    """Validate RayExecutorV2 with various TP/PP configs using external PG."""
    executor = RayExecutorV2(
        vllm_config=create_vllm_config(
            tensor_parallel_size=tp_size,
            pipeline_parallel_size=pp_size,
            placement_group=create_placement_group,
        )
    )
    try:
        assert_executor(executor, tp_size, pp_size)
    finally:
        executor.shutdown()
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.mark.parametrize(
    "executor",
    [create_vllm_config(tensor_parallel_size=2)],
    indirect=True,
)
def test_ray_v2_executor_failure_callback(executor):
    """Validate failure callback registration."""
    invocations = []

    def test_callback():
        invocations.append(True)

    # A healthy executor only registers the callback without firing it.
    executor.register_failure_callback(test_callback)
    assert not invocations

    # An already-failed executor fires the callback immediately.
    executor.is_failed = True
    executor.register_failure_callback(test_callback)
    assert invocations
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.mark.parametrize(
    "executor",
    [create_vllm_config(tensor_parallel_size=2)],
    indirect=True,
)
def test_ray_v2_executor_collective_rpc(executor):
    """Validate collective RPC calls through MessageQueue."""
    # check_health performs an RPC round-trip through the broadcast queue.
    executor.check_health()
    assert not executor.is_failed
    assert executor.rpc_broadcast_mq is not None
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.mark.parametrize(
    "executor",
    [create_vllm_config(tensor_parallel_size=2)],
    indirect=True,
)
def test_ray_v2_executor_driver_node_rank_0(executor):
    """Validate that driver node workers get the lowest ranks."""
    driver_node = ray.get_runtime_context().get_node_id()

    # Single-node run: every worker is co-located with the driver.
    assert all(h.node_id == driver_node for h in executor.ray_worker_handles)

    rank0 = next(h for h in executor.ray_worker_handles if h.rank == 0)
    assert rank0.node_id == driver_node
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.mark.parametrize(
    "executor",
    [create_vllm_config(tensor_parallel_size=2)],
    indirect=True,
)
def test_ray_v2_executor_worker_death(executor):
    """Validate executor detects worker death via ray.wait()."""
    failure_seen = threading.Event()

    # Event.set is a zero-arg callable, so it can serve as the callback.
    executor.register_failure_callback(failure_seen.set)
    assert not executor.is_failed

    # Kill one worker actor externally
    ray.kill(executor.ray_worker_handles[1].actor, no_restart=True)

    # Monitor thread should detect the death and invoke callback
    assert failure_seen.wait(timeout=30)
    assert executor.is_failed
    assert executor.shutting_down
|
||
|
|
|
||
|
|
|
||
|
|
def test_ray_v2_executor_shutdown():
    """Validate graceful shutdown: ray.kill() terminates all worker actors."""
    executor = RayExecutorV2(vllm_config=create_vllm_config(tensor_parallel_size=2))
    assert executor.rpc_broadcast_mq is not None
    assert len(executor.response_mqs) == executor.world_size

    # Grab the actor handles before shutdown clears the executor's state.
    worker_actors = [handle.actor for handle in executor.ray_worker_handles]
    executor.shutdown()

    # Every actor must be dead: any remote call should now raise.
    for worker_actor in worker_actors:
        with pytest.raises(ray.exceptions.RayActorError):
            ray.get(worker_actor.wait_for_init.remote(), timeout=5)

    assert executor.rpc_broadcast_mq is None
    assert len(executor.response_mqs) == 0
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.mark.parametrize(
    "executor",
    [create_vllm_config(tensor_parallel_size=2)],
    indirect=True,
)
def test_ray_v2_run_refs_stored_for_monitoring(executor):
    """Validate worker handles store run_ref for monitoring."""
    for handle in executor.ray_worker_handles:
        assert handle.run_ref is not None
        # A live worker's run loop never resolves, so a zero-timeout wait
        # must report the ref as still pending.
        done, _ = ray.wait([handle.run_ref], timeout=0)
        assert not done, "run_ref should be pending"
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.mark.parametrize("tp_size, pp_size", [(2, 1), (2, 2)])
def test_ray_v2_single_node_generation(tp_size, pp_size):
    """End-to-end LLM generation with RayExecutorV2."""

    llm = LLM(
        model=MODEL,
        tensor_parallel_size=tp_size,
        pipeline_parallel_size=pp_size,
        distributed_executor_backend="ray",
        enforce_eager=True,
        max_model_len=256,
        gpu_memory_utilization=0.3,
    )
    try:
        prompts = [
            "Hello, my name is",
            "The capital of France is",
            "The future of AI is",
        ]
        outputs = llm.generate(prompts)

        # One result per prompt, each with non-empty generated text.
        assert len(outputs) == len(prompts)
        for result in outputs:
            assert len(result.outputs) > 0
            assert len(result.outputs[0].text) > 0
    finally:
        llm.llm_engine.model_executor.shutdown()
        del llm
        gc.collect()
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.mark.parametrize(
    "bundle_indices, expected_bundle_ids, create_placement_group",
    [("2,3", [2, 3], 4), ("3,2", [3, 2], 4)],
    indirect=["create_placement_group"],
)
def test_ray_v2_bundle_indices_env(
    bundle_indices, expected_bundle_ids, create_placement_group, monkeypatch
):
    """Validate explicit VLLM_RAY_BUNDLE_INDICES bundle placement."""
    monkeypatch.setenv("VLLM_RAY_BUNDLE_INDICES", bundle_indices)
    executor = RayExecutorV2(
        vllm_config=create_vllm_config(
            tensor_parallel_size=2,
            placement_group=create_placement_group,
        )
    )
    try:
        # Workers ordered by rank must land on exactly the requested bundles.
        by_rank = sorted(executor.ray_worker_handles, key=lambda h: h.rank)
        assert [h.bundle_id_idx for h in by_rank] == expected_bundle_ids
        assert_executor(executor, tp_size=2, pp_size=1)
    finally:
        executor.shutdown()
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.mark.parametrize(
    "bundle_indices, expected_error, create_placement_group",
    [
        ("1,1", "cannot have duplicate values,", 4),
        ("0,1,2", "must have the same size", 4),
    ],
    indirect=["create_placement_group"],
)
def test_ray_v2_invalid_bundle_indices(
    bundle_indices, expected_error, create_placement_group, monkeypatch
):
    """Validate invalid bundle indices are rejected."""
    monkeypatch.setenv("VLLM_RAY_BUNDLE_INDICES", bundle_indices)
    config = create_vllm_config(
        tensor_parallel_size=2, placement_group=create_placement_group
    )
    # Executor construction must fail fast with the expected message.
    with pytest.raises(AssertionError, match=expected_error):
        RayExecutorV2(vllm_config=config)
|
||
|
|
|
||
|
|
|
||
|
|
@pytest.mark.parametrize("tp_size, pp_size", [(2, 1), (2, 2)])
def test_ray_v2_single_node_generation_with_pg(tp_size, pp_size):
    """E2E LLM generation with a user-provided placement group.

    Patches ray.util.get_current_placement_group so the executor adopts the
    externally created PG instead of creating its own.
    """
    ensure_ray_initialized()
    bundles = [{"GPU": 1, "CPU": 1} for _ in range(tp_size * pp_size)]
    pg = ray.util.placement_group(bundles, strategy="PACK")
    ray.get(pg.ready())

    # Bug fix: `llm` was previously assigned only inside the `try`, so a
    # failure in LLM(...) raised UnboundLocalError in the `finally` block
    # and masked the real error. Initialize it up front and guard cleanup.
    llm = None
    try:
        with patch.object(ray.util, "get_current_placement_group", return_value=pg):
            llm = LLM(
                model=MODEL,
                tensor_parallel_size=tp_size,
                pipeline_parallel_size=pp_size,
                distributed_executor_backend="ray",
                enforce_eager=True,
                max_model_len=256,
                gpu_memory_utilization=0.3,
            )
            prompts = [
                "Hello, my name is",
                "The capital of France is",
                "The future of AI is",
            ]
            outputs = llm.generate(prompts)

            # One result per prompt, each with non-empty generated text.
            assert len(outputs) == len(prompts)
            for output in outputs:
                assert len(output.outputs) > 0
                assert len(output.outputs[0].text) > 0
    finally:
        if llm is not None:
            llm.llm_engine.model_executor.shutdown()
            del llm
        gc.collect()
        # Bug fix: the user-provided placement group was never removed,
        # leaking its GPU/CPU reservations across tests (the
        # create_placement_group fixture does clean up; mirror that here).
        ray.util.remove_placement_group(pg)
|