[Bugfix] Check health for engine core process exiting unexpectedly (#21728)
Signed-off-by: wuhang <wuhang6@huawei.com>
This commit is contained in:
@@ -104,8 +104,12 @@ class MultiprocExecutor(Executor):
|
||||
finally:
|
||||
if not success:
|
||||
# Clean up the worker procs if there was a failure.
|
||||
# Close death_writers first to signal workers to exit
|
||||
for uw in unready_workers:
|
||||
if uw.death_writer is not None:
|
||||
uw.death_writer.close()
|
||||
self._ensure_worker_termination(
|
||||
[w.proc for w in unready_workers])
|
||||
[uw.proc for uw in unready_workers])
|
||||
|
||||
# For pipeline parallel, we use a thread pool for asynchronous
|
||||
# execute_model.
|
||||
@@ -282,6 +286,10 @@ class MultiprocExecutor(Executor):
|
||||
|
||||
if workers := getattr(self, 'workers', None):
|
||||
for w in workers:
|
||||
# Close death_writer to signal child processes to exit
|
||||
if w.death_writer is not None:
|
||||
w.death_writer.close()
|
||||
w.death_writer = None
|
||||
w.worker_response_mq = None
|
||||
self._ensure_worker_termination([w.proc for w in workers])
|
||||
|
||||
@@ -316,6 +324,7 @@ class UnreadyWorkerProcHandle:
|
||||
proc: BaseProcess
|
||||
rank: int
|
||||
ready_pipe: Connection
|
||||
death_writer: Optional[Connection] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -323,6 +332,7 @@ class WorkerProcHandle:
|
||||
proc: BaseProcess
|
||||
rank: int
|
||||
worker_response_mq: MessageQueue # The worker process writes to this MQ
|
||||
death_writer: Optional[Connection] = None
|
||||
|
||||
@classmethod
|
||||
def from_unready_handle(
|
||||
@@ -332,6 +342,7 @@ class WorkerProcHandle:
|
||||
proc=unready_handle.proc,
|
||||
rank=unready_handle.rank,
|
||||
worker_response_mq=worker_response_mq,
|
||||
death_writer=unready_handle.death_writer,
|
||||
)
|
||||
|
||||
|
||||
@@ -396,6 +407,9 @@ class WorkerProc:
|
||||
# (reader, writer)
|
||||
reader, writer = context.Pipe(duplex=False)
|
||||
|
||||
# Create death pipe to detect parent process exit
|
||||
death_reader, death_writer = context.Pipe(duplex=False)
|
||||
|
||||
process_kwargs = {
|
||||
"vllm_config": vllm_config,
|
||||
"local_rank": local_rank,
|
||||
@@ -403,6 +417,7 @@ class WorkerProc:
|
||||
"distributed_init_method": distributed_init_method,
|
||||
"input_shm_handle": input_shm_handle,
|
||||
"ready_pipe": (reader, writer),
|
||||
"death_pipe": death_reader,
|
||||
}
|
||||
# Run EngineCore busy loop in background process.
|
||||
proc = context.Process(target=WorkerProc.worker_main,
|
||||
@@ -412,7 +427,9 @@ class WorkerProc:
|
||||
|
||||
proc.start()
|
||||
writer.close()
|
||||
return UnreadyWorkerProcHandle(proc, rank, reader)
|
||||
# Keep death_writer open in parent - when parent exits,
|
||||
# death_reader in child will get EOFError
|
||||
return UnreadyWorkerProcHandle(proc, rank, reader, death_writer)
|
||||
|
||||
@staticmethod
|
||||
def wait_for_ready(
|
||||
@@ -483,6 +500,28 @@ class WorkerProc:
|
||||
worker = None
|
||||
# tuple[Connection, Connection]
|
||||
reader, ready_writer = kwargs.pop("ready_pipe")
|
||||
death_pipe = kwargs.pop("death_pipe", None)
|
||||
|
||||
# Start death monitoring thread if death_pipe is provided
|
||||
if death_pipe is not None:
|
||||
|
||||
def monitor_parent_death():
    """Watch the death pipe and terminate this worker once it hits EOF.

    Blocks in ``death_pipe.recv()``. An ``EOFError`` from that call is
    treated as "the parent process has exited": the event is logged and
    SIGTERM is sent to the current process so it can shut down cleanly.
    Any other ``Exception`` raised by the receive is only logged as a
    warning, and the monitor then returns without signalling.
    """
    parent_gone = False
    try:
        # Blocks until the parent process exits (pipe closes).
        death_pipe.recv()
    except EOFError:
        parent_gone = True
    except Exception as err:
        logger.warning("Death monitoring error: %s", err)

    if parent_gone:
        # Parent process has exited, so terminate this worker.
        logger.info("Parent process exited, terminating worker")
        # Signal ourselves to trigger a clean shutdown.
        os.kill(os.getpid(), signal.SIGTERM)
|
||||
|
||||
death_monitor = Thread(target=monitor_parent_death,
|
||||
daemon=True,
|
||||
name="WorkerDeathMonitor")
|
||||
death_monitor.start()
|
||||
|
||||
try:
|
||||
reader.close()
|
||||
worker = WorkerProc(*args, **kwargs)
|
||||
@@ -523,6 +562,8 @@ class WorkerProc:
|
||||
finally:
|
||||
if ready_writer is not None:
|
||||
ready_writer.close()
|
||||
if death_pipe is not None:
|
||||
death_pipe.close()
|
||||
# Clean up once worker exits busy loop
|
||||
if worker is not None:
|
||||
worker.shutdown()
|
||||
|
||||
Reference in New Issue
Block a user