[V1][Frontend] Improve Shutdown And Logs (#11737)

Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com> Signed-off-by: Andrew Feldman <afeldman@neuralmagic.com> Signed-off-by: Nick Hill <nhill@redhat.com> Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: Russell Bryant <rbryant@redhat.com> Co-authored-by: Andrew Feldman <afeldman@neuralmagic.com> Co-authored-by: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com> Co-authored-by: Nick Hill <nhill@redhat.com>
2025-04-16 22:48:34 -04:00
parent 3c776dcefb
commit 2b05b8ce69
16 changed files with 1031 additions and 347 deletions
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -11,9 +11,7 @@ from logging import DEBUG
 from typing import Any, Callable, Optional, TypeVar, Union

 import msgspec
-import psutil
 import zmq
-import zmq.asyncio

 from vllm.config import ParallelConfig, VllmConfig
 from vllm.distributed import stateless_destroy_torch_distributed_process_group
@@ -22,8 +20,7 @@ from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.transformers_utils.config import (
    maybe_register_config_serialize_by_value)
-from vllm.utils import (get_exception_traceback, resolve_obj_by_qualname,
-                        zmq_socket_ctx)
+from vllm.utils import resolve_obj_by_qualname, zmq_socket_ctx
 from vllm.v1.core.kv_cache_utils import (get_kv_cache_config,
                                         unify_kv_cache_configs)
 from vllm.v1.core.sched.interface import SchedulerInterface
@@ -50,12 +47,11 @@ _R = TypeVar('_R')  # Return type for collective_rpc
 class EngineCore:
    """Inner loop of vLLM's Engine."""

-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        executor_class: type[Executor],
-        log_stats: bool,
-    ):
+    def __init__(self,
+                 vllm_config: VllmConfig,
+                 executor_class: type[Executor],
+                 log_stats: bool,
+                 executor_fail_callback: Optional[Callable] = None):
        assert vllm_config.model_config.runner_type != "pooling"

        logger.info("Initializing a V1 LLM engine (v%s) with config: %s",
@@ -65,6 +61,9 @@ class EngineCore:

        # Setup Model.
        self.model_executor = executor_class(vllm_config)
+        if executor_fail_callback is not None:
+            self.model_executor.register_failure_callback(
+                executor_fail_callback)

        # Setup KV Caches and update CacheConfig after profiling.
        num_gpu_blocks, num_cpu_blocks, kv_cache_config = \
@@ -254,7 +253,8 @@ class EngineCore:
        return engine_core_outputs

    def shutdown(self):
-        self.model_executor.shutdown()
+        if self.model_executor:
+            self.model_executor.shutdown()

    def profile(self, is_start: bool = True):
        self.model_executor.profile(is_start)
@@ -308,6 +308,8 @@ class EngineCore:
 class EngineCoreProc(EngineCore):
    """ZMQ-wrapper for running EngineCore in background process."""

+    ENGINE_CORE_DEAD = b'ENGINE_CORE_DEAD'
+
    def __init__(
        self,
        input_path: str,
@@ -317,11 +319,16 @@ class EngineCoreProc(EngineCore):
        log_stats: bool,
        engine_index: int = 0,
    ):
-        super().__init__(vllm_config, executor_class, log_stats)
+        input_queue = queue.Queue[tuple[EngineCoreRequestType, Any]]()
+
+        executor_fail_callback = lambda: input_queue.put_nowait(
+            (EngineCoreRequestType.EXECUTOR_FAILED, b''))
+
+        super().__init__(vllm_config, executor_class, log_stats,
+                         executor_fail_callback)

        self.step_fn = (self.step if self.batch_queue is None else
                        self.step_with_batch_queue)
-
        self.global_unfinished_reqs = False

        # Background Threads and Queues for IO. These enable us to
@@ -329,15 +336,16 @@ class EngineCoreProc(EngineCore):
        # and to overlap some serialization/deserialization with the
        # model forward pass.
        # Threads handle Socket <-> Queues and core_busy_loop uses Queue.
-        self.input_queue: queue.Queue[tuple[EngineCoreRequestType,
-                                            Any]] = queue.Queue()
-        self.output_queue: queue.Queue[EngineCoreOutputs] = queue.Queue()
+        self.input_queue = input_queue
+        self.output_queue = queue.Queue[Union[EngineCoreOutputs, bytes]]()
        threading.Thread(target=self.process_input_socket,
                         args=(input_path, engine_index),
                         daemon=True).start()
-        threading.Thread(target=self.process_output_socket,
-                         args=(output_path, engine_index),
-                         daemon=True).start()
+        self.output_thread = threading.Thread(
+            target=self.process_output_socket,
+            args=(output_path, engine_index),
+            daemon=True)
+        self.output_thread.start()

    @staticmethod
    def run_engine_core(*args,
@@ -364,7 +372,6 @@ class EngineCoreProc(EngineCore):
        signal.signal(signal.SIGTERM, signal_handler)
        signal.signal(signal.SIGINT, signal_handler)

-        parent_process = psutil.Process().parent()
        engine_core: Optional[EngineCoreProc] = None
        try:
            parallel_config: ParallelConfig = kwargs[
@@ -380,13 +387,15 @@ class EngineCoreProc(EngineCore):
            engine_core.run_busy_loop()

        except SystemExit:
-            logger.debug("EngineCore interrupted.")
-
-        except Exception:
-            traceback = get_exception_traceback()
-            logger.error("EngineCore hit an exception: %s", traceback)
-            parent_process.send_signal(signal.SIGUSR1)
+            logger.debug("EngineCore exiting.")

+        except Exception as e:
+            if engine_core is None:
+                logger.exception("EngineCore failed to start.")
+            else:
+                logger.exception("EngineCore encountered a fatal error.")
+                engine_core._send_engine_dead()
+            raise e
        finally:
            if engine_core is not None:
                engine_core.shutdown()
@@ -458,6 +467,11 @@ class EngineCoreProc(EngineCore):
                                          f" failed: {str(e)}")
            self.output_queue.put_nowait(
                EngineCoreOutputs(utility_output=output))
+        elif request_type == EngineCoreRequestType.EXECUTOR_FAILED:
+            raise RuntimeError("Executor failed.")
+        else:
+            logger.error("Unrecognized input request type encountered: %s",
+                         request_type)

    @staticmethod
    def _convert_msgspec_args(method, args):
@@ -473,6 +487,18 @@ class EngineCoreProc(EngineCore):
            and not isinstance(v, p.annotation) else v
            for v, p in zip(args, arg_types))

+    def _send_engine_dead(self):
+        """Send EngineDead status to the EngineCoreClient."""
+
+        # Put ENGINE_CORE_DEAD in the queue.
+        self.output_queue.put_nowait(EngineCoreProc.ENGINE_CORE_DEAD)
+
+        # Wait until msg sent by the daemon before shutdown.
+        self.output_thread.join(timeout=5.0)
+        if self.output_thread.is_alive():
+            logger.fatal("vLLM shutdown signal from EngineCore failed "
+                         "to send. Please report this issue.")
+
    def process_input_socket(self, input_path: str, engine_index: int):
        """Input socket IO thread."""

@@ -511,9 +537,16 @@ class EngineCoreProc(EngineCore):
        # Reuse send buffer.
        buffer = bytearray()

-        with zmq_socket_ctx(output_path, zmq.constants.PUSH) as socket:
+        # We must set linger to ensure the ENGINE_CORE_DEAD
+        # message is sent prior to closing the socket.
+        with zmq_socket_ctx(output_path, zmq.constants.PUSH,
+                            linger=4000) as socket:
            while True:
                outputs = self.output_queue.get()
+                if outputs == EngineCoreProc.ENGINE_CORE_DEAD:
+                    socket.send(outputs, copy=False)
+                    break
+                assert not isinstance(outputs, bytes)
                outputs.engine_index = engine_index
                buffers = encoder.encode_into(outputs, buffer)
                socket.send_multipart(buffers, copy=False)