[V1][Frontend] Improve Shutdown And Logs (#11737)
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com> Signed-off-by: Andrew Feldman <afeldman@neuralmagic.com> Signed-off-by: Nick Hill <nhill@redhat.com> Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: Russell Bryant <rbryant@redhat.com> Co-authored-by: Andrew Feldman <afeldman@neuralmagic.com> Co-authored-by: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com> Co-authored-by: Nick Hill <nhill@redhat.com>
This commit is contained in:
@@ -12,9 +12,11 @@ from fastapi import FastAPI, Request, Response
|
||||
from vllm import envs
|
||||
from vllm.engine.async_llm_engine import AsyncEngineDeadError
|
||||
from vllm.engine.multiprocessing import MQEngineDeadError
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.ssl import SSLCertRefresher
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils import find_process_using_port
|
||||
from vllm.v1.engine.exceptions import EngineDeadError, EngineGenerateError
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@@ -40,6 +42,8 @@ async def serve_http(app: FastAPI,
|
||||
|
||||
loop = asyncio.get_running_loop()
|
||||
|
||||
watchdog_task = loop.create_task(
|
||||
watchdog_loop(server, app.state.engine_client))
|
||||
server_task = loop.create_task(
|
||||
server.serve(sockets=[sock] if sock else None))
|
||||
|
||||
@@ -52,6 +56,7 @@ async def serve_http(app: FastAPI,
|
||||
def signal_handler() -> None:
|
||||
# prevents the uvicorn signal handler to exit early
|
||||
server_task.cancel()
|
||||
watchdog_task.cancel()
|
||||
if ssl_cert_refresher:
|
||||
ssl_cert_refresher.stop()
|
||||
|
||||
@@ -73,48 +78,69 @@ async def serve_http(app: FastAPI,
|
||||
port, process, " ".join(process.cmdline()))
|
||||
logger.info("Shutting down FastAPI HTTP server.")
|
||||
return server.shutdown()
|
||||
finally:
|
||||
watchdog_task.cancel()
|
||||
|
||||
|
||||
async def watchdog_loop(server: uvicorn.Server, engine: EngineClient):
|
||||
"""
|
||||
# Watchdog task that runs in the background, checking
|
||||
# for error state in the engine. Needed to trigger shutdown
|
||||
# if an exception arises is StreamingResponse() generator.
|
||||
"""
|
||||
VLLM_WATCHDOG_TIME_S = 5.0
|
||||
while True:
|
||||
await asyncio.sleep(VLLM_WATCHDOG_TIME_S)
|
||||
terminate_if_errored(server, engine)
|
||||
|
||||
|
||||
def terminate_if_errored(server: uvicorn.Server, engine: EngineClient):
|
||||
"""
|
||||
See discussions here on shutting down a uvicorn server
|
||||
https://github.com/encode/uvicorn/discussions/1103
|
||||
In this case we cannot await the server shutdown here
|
||||
because handler must first return to close the connection
|
||||
for this request.
|
||||
"""
|
||||
engine_errored = engine.errored and not engine.is_running
|
||||
if not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH and engine_errored:
|
||||
server.should_exit = True
|
||||
|
||||
|
||||
def _add_shutdown_handlers(app: FastAPI, server: uvicorn.Server) -> None:
|
||||
"""Adds handlers for fatal errors that should crash the server"""
|
||||
"""
|
||||
VLLM V1 AsyncLLM catches exceptions and returns
|
||||
only two types: EngineGenerateError and EngineDeadError.
|
||||
|
||||
EngineGenerateError is raised by the per request generate()
|
||||
method. This error could be request specific (and therefore
|
||||
recoverable - e.g. if there is an error in input processing).
|
||||
|
||||
EngineDeadError is raised by the background output_handler
|
||||
method. This error is global and therefore not recoverable.
|
||||
|
||||
We register these @app.exception_handlers to return nice
|
||||
responses to the end user if they occur and shut down if needed.
|
||||
See https://fastapi.tiangolo.com/tutorial/handling-errors/
|
||||
for more details on how exception handlers work.
|
||||
|
||||
If an exception is encountered in a StreamingResponse
|
||||
generator, the exception is not raised, since we already sent
|
||||
a 200 status. Rather, we send an error message as the next chunk.
|
||||
Since the exception is not raised, this means that the server
|
||||
will not automatically shut down. Instead, we use the watchdog
|
||||
background task for check for errored state.
|
||||
"""
|
||||
|
||||
@app.exception_handler(RuntimeError)
|
||||
async def runtime_error_handler(request: Request, __):
|
||||
"""On generic runtime error, check to see if the engine has died.
|
||||
It probably has, in which case the server will no longer be able to
|
||||
handle requests. Trigger a graceful shutdown with a SIGTERM."""
|
||||
engine = request.app.state.engine_client
|
||||
if (not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH and engine.errored
|
||||
and not engine.is_running):
|
||||
logger.fatal("AsyncLLMEngine has failed, terminating server "
|
||||
"process")
|
||||
# See discussions here on shutting down a uvicorn server
|
||||
# https://github.com/encode/uvicorn/discussions/1103
|
||||
# In this case we cannot await the server shutdown here because
|
||||
# this handler must first return to close the connection for
|
||||
# this request.
|
||||
server.should_exit = True
|
||||
|
||||
return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)
|
||||
|
||||
@app.exception_handler(AsyncEngineDeadError)
|
||||
async def async_engine_dead_handler(_, __):
|
||||
"""Kill the server if the async engine is already dead. It will
|
||||
not handle any further requests."""
|
||||
if not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH:
|
||||
logger.fatal("AsyncLLMEngine is already dead, terminating server "
|
||||
"process")
|
||||
server.should_exit = True
|
||||
|
||||
return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)
|
||||
|
||||
@app.exception_handler(MQEngineDeadError)
|
||||
async def mq_engine_dead_handler(_, __):
|
||||
"""Kill the server if the mq engine is already dead. It will
|
||||
not handle any further requests."""
|
||||
if not envs.VLLM_KEEP_ALIVE_ON_ENGINE_DEATH:
|
||||
logger.fatal("MQLLMEngine is already dead, terminating server "
|
||||
"process")
|
||||
server.should_exit = True
|
||||
@app.exception_handler(EngineDeadError)
|
||||
@app.exception_handler(EngineGenerateError)
|
||||
async def runtime_exception_handler(request: Request, __):
|
||||
terminate_if_errored(
|
||||
server=server,
|
||||
engine=request.app.state.engine_client,
|
||||
)
|
||||
|
||||
return Response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)
|
||||
|
||||
Reference in New Issue
Block a user