[Refactor] Unify engine process monitoring in engine manager and add Ray backend support (#35862)
Signed-off-by: fangyuchu <fangyuchu@qq.com> Signed-off-by: Nick Hill <nickhill123@gmail.com> Co-authored-by: Nick Hill <nickhill123@gmail.com>
This commit is contained in:
@@ -11,7 +11,7 @@ from enum import Enum, auto
|
||||
from multiprocessing import Process, connection
|
||||
from multiprocessing.process import BaseProcess
|
||||
from multiprocessing.queues import Queue
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import TYPE_CHECKING, cast
|
||||
from unittest.mock import patch
|
||||
|
||||
import msgspec
|
||||
@@ -133,6 +133,8 @@ class CoreEngineProcManager:
|
||||
)
|
||||
|
||||
self._finalizer = weakref.finalize(self, shutdown, self.processes)
|
||||
self.manager_stopped = threading.Event()
|
||||
self.failed_proc_name: str | None = None
|
||||
|
||||
try:
|
||||
for proc, local_dp_rank in zip(self.processes, local_dp_ranks):
|
||||
@@ -154,12 +156,31 @@ class CoreEngineProcManager:
|
||||
|
||||
def shutdown(self, timeout: float | None = None) -> None:
|
||||
"""Shutdown engine core processes with configurable timeout."""
|
||||
self.manager_stopped.set()
|
||||
if self._finalizer.detach() is not None:
|
||||
shutdown(self.processes, timeout=timeout)
|
||||
|
||||
def join_first(self):
|
||||
"""Wait for any process to exit."""
|
||||
connection.wait(proc.sentinel for proc in self.processes)
|
||||
def monitor_engine_liveness(self) -> None:
|
||||
"""Monitor engine core process liveness."""
|
||||
|
||||
sentinel_to_proc = {proc.sentinel: proc for proc in self.processes}
|
||||
sentinels = set(sentinel_to_proc.keys())
|
||||
|
||||
while sentinels and not self.manager_stopped.is_set():
|
||||
died_sentinels = connection.wait(sentinels, timeout=1)
|
||||
|
||||
for sentinel in died_sentinels:
|
||||
proc = sentinel_to_proc.pop(cast(int, sentinel))
|
||||
exitcode = proc.exitcode
|
||||
if exitcode != 0 and not self.manager_stopped.is_set():
|
||||
self.failed_proc_name = proc.name
|
||||
if died_sentinels:
|
||||
# Any engine exit currently triggers a shutdown. Future
|
||||
# work (e.g., Elastic and fault-tolerant EP) will add finer-grained
|
||||
# handling for different exit scenarios.
|
||||
break
|
||||
|
||||
self.shutdown()
|
||||
|
||||
def sentinels(self) -> list:
|
||||
return [proc.sentinel for proc in self.processes]
|
||||
@@ -298,6 +319,8 @@ class CoreEngineActorManager:
|
||||
self.log_stats = log_stats
|
||||
local_engine_count = vllm_config.parallel_config.data_parallel_size_local
|
||||
world_size = vllm_config.parallel_config.world_size
|
||||
self.manager_stopped = threading.Event()
|
||||
self.failed_proc_name: str | None = None
|
||||
|
||||
if ray.is_initialized():
|
||||
logger.info("Ray is already initialized. Skipping Ray initialization.")
|
||||
@@ -395,8 +418,11 @@ class CoreEngineActorManager:
|
||||
|
||||
ray.get(refs)
|
||||
self.run_refs = []
|
||||
self.actor_run_ref_dict = dict()
|
||||
for actor in self.local_engine_actors + self.remote_engine_actors:
|
||||
self.run_refs.append(actor.run.remote())
|
||||
ref = actor.run.remote()
|
||||
self.run_refs.append(ref)
|
||||
self.actor_run_ref_dict[actor] = ref
|
||||
|
||||
@staticmethod
|
||||
def create_dp_placement_groups(
|
||||
@@ -776,7 +802,9 @@ class CoreEngineActorManager:
|
||||
) + self.remote_engine_actors[-(len(placement_groups) - new_local_engines) :]
|
||||
|
||||
for actor in actors:
|
||||
self.run_refs.append(actor.run.remote())
|
||||
ref = actor.run.remote()
|
||||
self.run_refs.append(ref)
|
||||
self.actor_run_ref_dict[actor] = ref
|
||||
|
||||
cur_vllm_config.parallel_config.data_parallel_size = new_data_parallel_size
|
||||
# Update old_vllm_config with new data_parallel_size_local if any new
|
||||
@@ -805,12 +833,59 @@ class CoreEngineActorManager:
|
||||
self.remote_engine_actors.pop()
|
||||
ray.util.remove_placement_group(pg)
|
||||
|
||||
def remove_run_refs_for_scale_down(self, removed_dp_size: int) -> None:
|
||||
if removed_dp_size <= 0:
|
||||
return
|
||||
flags = self.placement_group_is_local[-removed_dp_size:]
|
||||
li = len(self.local_engine_actors) - 1
|
||||
ri = len(self.remote_engine_actors) - 1
|
||||
for is_local in reversed(flags):
|
||||
if is_local:
|
||||
actor = self.local_engine_actors[li]
|
||||
li -= 1
|
||||
else:
|
||||
actor = self.remote_engine_actors[ri]
|
||||
ri -= 1
|
||||
ref = self.actor_run_ref_dict.pop(actor)
|
||||
self.run_refs.remove(ref)
|
||||
|
||||
def get_run_refs(self):
|
||||
return self.run_refs
|
||||
|
||||
def monitor_engine_liveness(self) -> None:
|
||||
import ray
|
||||
|
||||
while not self.manager_stopped.is_set():
|
||||
actor_run_refs = list(self.get_run_refs())
|
||||
if not actor_run_refs:
|
||||
logger.info(
|
||||
"There are no actors to monitor currently. "
|
||||
"The monitoring function is about to terminate."
|
||||
)
|
||||
break
|
||||
actor_done_refs, _ = ray.wait(actor_run_refs, timeout=5)
|
||||
unexpected_failure = False
|
||||
for actor_ref in actor_done_refs:
|
||||
if self.manager_stopped.is_set():
|
||||
break
|
||||
if actor_ref not in self.get_run_refs():
|
||||
# The run refs may have been updated by elastic scale-down.
|
||||
continue
|
||||
try:
|
||||
ray.get(actor_ref)
|
||||
except ray.exceptions.RayActorError:
|
||||
self.failed_proc_name = f"Actor {actor_ref}"
|
||||
unexpected_failure = True
|
||||
|
||||
if unexpected_failure:
|
||||
break
|
||||
|
||||
self.shutdown()
|
||||
|
||||
def shutdown(self, timeout: float | None = None) -> None:
|
||||
import ray
|
||||
|
||||
self.manager_stopped.set()
|
||||
for actor in self.local_engine_actors + self.remote_engine_actors:
|
||||
ray.kill(actor)
|
||||
for pg in self.created_placement_groups:
|
||||
|
||||
Reference in New Issue
Block a user