[V1] Improve TP>1 Error Handling + Stack Trace (#11721)

Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
This commit is contained in:
Robert Shaw
2025-01-03 16:29:11 -05:00
committed by GitHub
parent 61fed92c7e
commit 1543914c04
4 changed files with 40 additions and 21 deletions

View File

@@ -9,6 +9,7 @@ from enum import Enum, auto
from multiprocessing.process import BaseProcess
from typing import Any, Dict, List, Optional, Tuple
import psutil
import zmq
from vllm.config import VllmConfig
@@ -38,6 +39,19 @@ class MultiprocExecutor(Executor):
# and ensure workers will be terminated.
self._finalizer = weakref.finalize(self, self.shutdown)
# The child processes will send SIGUSR1 when unrecoverable
# errors happen.
def sigusr1_handler(signum, frame):
    """Handle SIGUSR1 sent by a worker process after a fatal error.

    Logs the failure, forwards SIGUSR1 to this process's parent so the
    error keeps propagating upward, then shuts this executor down.

    Args:
        signum: Signal number delivered to the handler (unused).
        frame: Stack frame interrupted by the signal (unused).
    """
    logger.fatal(
        "MultiprocExecutor got fatal signal from worker processes, "
        "shutting down. See stack trace above for root cause issue.")
    # Propagate error up to parent process.
    parent_process = psutil.Process().parent()
    # psutil returns None when no parent is known; skip forwarding in
    # that case so the shutdown below still runs instead of the handler
    # dying on an AttributeError.
    if parent_process is not None:
        parent_process.send_signal(signal.SIGUSR1)
    self.shutdown()
signal.signal(signal.SIGUSR1, sigusr1_handler)
self.vllm_config = vllm_config
self.parallel_config = vllm_config.parallel_config
@@ -335,8 +349,11 @@ class WorkerProc:
except SystemExit:
logger.debug("Worker interrupted.")
except BaseException as e:
logger.exception(e)
except Exception:
# worker_busy_loop sends exceptions to Executor
# for shutdown, but if there is an error in startup or an
# error with IPC itself, we need to alert the parent.
psutil.Process().parent().send_signal(signal.SIGUSR1)
raise
finally:
@@ -377,9 +394,10 @@ class WorkerProc:
try:
output = getattr(self.worker, method)(*args, **kwargs)
except BaseException as e:
except Exception as e:
self.worker_response_mq.enqueue(
(WorkerProc.ResponseStatus.FAILURE, e))
logger.exception("WorkerProc hit an exception: %s", exc_info=e)
continue
self.worker_response_mq.enqueue(