[Frontend] Kill the server on engine death (#6594)

Signed-off-by: Joe Runde <joe@joerun.de> Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
2024-08-08 10:47:48 -06:00
parent 5fb4a3f678
commit 21b9c49aa3
8 changed files with 136 additions and 14 deletions
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -49,6 +49,7 @@ if TYPE_CHECKING:
    NVCC_THREADS: Optional[str] = None
    VLLM_USE_PRECOMPILED: bool = False
    VLLM_NO_DEPRECATION_WARNING: bool = False
+    VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
    CMAKE_BUILD_TYPE: Optional[str] = None
    VERBOSE: bool = False
    VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
@@ -335,6 +336,11 @@ environment_variables: Dict[str, Callable[[], Any]] = {
    "VLLM_NO_DEPRECATION_WARNING":
    lambda: bool(int(os.getenv("VLLM_NO_DEPRECATION_WARNING", "0"))),

+    # If set, the OpenAI API server will stay alive even after the underlying
+    # AsyncLLMEngine errors and stops serving requests
+    "VLLM_KEEP_ALIVE_ON_ENGINE_DEATH":
+    lambda: bool(os.getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", 0)),
+
    # If the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN is set, it allows
    # the user to specify a max sequence length greater than
    # the max length derived from the model's config.json.