[V1] Improve TP>1 Error Handling + Stack Trace (#11721)

Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-01-03 16:29:11 -05:00
parent 61fed92c7e
commit 1543914c04
4 changed files with 40 additions and 21 deletions
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -1,6 +1,5 @@
 import asyncio
 import os
-import signal
 from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union

 from vllm.config import ModelConfig, VllmConfig
@@ -42,21 +41,6 @@ class AsyncLLM(EngineClient):
        start_engine_loop: bool = True,
    ) -> None:

-        # The child processes will send SIGQUIT when unrecoverable
-        # errors happen. We kill the process tree here so that the
-        # stack trace is very evident.
-        # TODO: rather than killing the main process, we should
-        # figure out how to raise an AsyncEngineDeadError and
-        # handle at the API server level so we can return a better
-        # error code to the clients calling VLLM.
-        def sigquit_handler(signum, frame):
-            logger.fatal(
-                "AsyncLLM got SIGQUIT from worker processes, shutting "
-                "down. See stack trace above for root cause issue.")
-            kill_process_tree(os.getpid())
-
-        signal.signal(signal.SIGQUIT, sigquit_handler)
-
        assert start_engine_loop

        self.log_requests = log_requests