[V1] Improve TP>1 Error Handling + Stack Trace (#11721)
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
This commit is contained in:
@@ -9,6 +9,7 @@ from enum import Enum, auto
|
||||
from multiprocessing.process import BaseProcess
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import psutil
|
||||
import zmq
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
@@ -38,6 +39,19 @@ class MultiprocExecutor(Executor):
|
||||
# and ensure workers will be terminated.
|
||||
self._finalizer = weakref.finalize(self, self.shutdown)
|
||||
|
||||
# The child processes will send SIGUSR1 when unrecoverable
|
||||
# errors happen.
|
||||
def sigusr1_handler(signum, frame):
|
||||
logger.fatal(
|
||||
"MulitprocExecutor got fatal signal from worker processes, "
|
||||
"shutting down. See stack trace above for root cause issue.")
|
||||
# Propagate error up to parent process.
|
||||
parent_process = psutil.Process().parent()
|
||||
parent_process.send_signal(signal.SIGUSR1)
|
||||
self.shutdown()
|
||||
|
||||
signal.signal(signal.SIGUSR1, sigusr1_handler)
|
||||
|
||||
self.vllm_config = vllm_config
|
||||
self.parallel_config = vllm_config.parallel_config
|
||||
|
||||
@@ -335,8 +349,11 @@ class WorkerProc:
|
||||
except SystemExit:
|
||||
logger.debug("Worker interrupted.")
|
||||
|
||||
except BaseException as e:
|
||||
logger.exception(e)
|
||||
except Exception:
|
||||
# worker_busy_loop sends exceptions to Executor
|
||||
# for shutdown, but if there is an error in startup or an
|
||||
# error with IPC itself, we need to alert the parent.
|
||||
psutil.Process().parent().send_signal(signal.SIGUSR1)
|
||||
raise
|
||||
|
||||
finally:
|
||||
@@ -377,9 +394,10 @@ class WorkerProc:
|
||||
|
||||
try:
|
||||
output = getattr(self.worker, method)(*args, **kwargs)
|
||||
except BaseException as e:
|
||||
except Exception as e:
|
||||
self.worker_response_mq.enqueue(
|
||||
(WorkerProc.ResponseStatus.FAILURE, e))
|
||||
logger.exception("WorkerProc hit an exception: %s", exc_info=e)
|
||||
continue
|
||||
|
||||
self.worker_response_mq.enqueue(
|
||||
|
||||
Reference in New Issue
Block a user