[V1] Improve TP>1 Error Handling + Stack Trace (#11721)

Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
This commit is contained in:
Robert Shaw
2025-01-03 16:29:11 -05:00
committed by GitHub
parent 61fed92c7e
commit 1543914c04
4 changed files with 40 additions and 21 deletions

View File

@@ -1,6 +1,5 @@
import asyncio
import os
import signal
from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union
from vllm.config import ModelConfig, VllmConfig
@@ -42,21 +41,6 @@ class AsyncLLM(EngineClient):
start_engine_loop: bool = True,
) -> None:
# The child processes will send SIGQUIT when unrecoverable
# errors happen. We kill the process tree here so that the
# stack trace is very evident.
# TODO: rather than killing the main process, we should
# figure out how to raise an AsyncEngineDeadError and
# handle at the API server level so we can return a better
# error code to the clients calling vLLM.
def sigquit_handler(signum, frame):
    """Handle SIGQUIT by logging a fatal message and killing this process tree.

    Both ``signum`` and ``frame`` are the standard signal-handler arguments;
    neither is inspected here.
    """
    shutdown_notice = (
        "AsyncLLM got SIGQUIT from worker processes, shutting "
        "down. See stack trace above for root cause issue.")
    logger.fatal(shutdown_notice)

    # Tear down starting from the current process so nothing is left running.
    own_pid = os.getpid()
    kill_process_tree(own_pid)
signal.signal(signal.SIGQUIT, sigquit_handler)
assert start_engine_loop
self.log_requests = log_requests