[V1][Frontend] Improve Shutdown And Logs (#11737)

Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Signed-off-by: Andrew Feldman <afeldman@neuralmagic.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Andrew Feldman <afeldman@neuralmagic.com>
Co-authored-by: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
This commit is contained in:
Robert Shaw
2025-04-16 22:48:34 -04:00
committed by GitHub
parent 3c776dcefb
commit 2b05b8ce69
16 changed files with 1031 additions and 347 deletions

View File

@@ -11,9 +11,7 @@ from logging import DEBUG
from typing import Any, Callable, Optional, TypeVar, Union
import msgspec
import psutil
import zmq
import zmq.asyncio
from vllm.config import ParallelConfig, VllmConfig
from vllm.distributed import stateless_destroy_torch_distributed_process_group
@@ -22,8 +20,7 @@ from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.transformers_utils.config import (
maybe_register_config_serialize_by_value)
from vllm.utils import (get_exception_traceback, resolve_obj_by_qualname,
zmq_socket_ctx)
from vllm.utils import resolve_obj_by_qualname, zmq_socket_ctx
from vllm.v1.core.kv_cache_utils import (get_kv_cache_config,
unify_kv_cache_configs)
from vllm.v1.core.sched.interface import SchedulerInterface
@@ -50,12 +47,11 @@ _R = TypeVar('_R') # Return type for collective_rpc
class EngineCore:
"""Inner loop of vLLM's Engine."""
def __init__(
self,
vllm_config: VllmConfig,
executor_class: type[Executor],
log_stats: bool,
):
def __init__(self,
vllm_config: VllmConfig,
executor_class: type[Executor],
log_stats: bool,
executor_fail_callback: Optional[Callable] = None):
assert vllm_config.model_config.runner_type != "pooling"
logger.info("Initializing a V1 LLM engine (v%s) with config: %s",
@@ -65,6 +61,9 @@ class EngineCore:
# Setup Model.
self.model_executor = executor_class(vllm_config)
if executor_fail_callback is not None:
self.model_executor.register_failure_callback(
executor_fail_callback)
# Setup KV Caches and update CacheConfig after profiling.
num_gpu_blocks, num_cpu_blocks, kv_cache_config = \
@@ -254,7 +253,8 @@ class EngineCore:
return engine_core_outputs
def shutdown(self):
self.model_executor.shutdown()
if self.model_executor:
self.model_executor.shutdown()
def profile(self, is_start: bool = True):
self.model_executor.profile(is_start)
@@ -308,6 +308,8 @@ class EngineCore:
class EngineCoreProc(EngineCore):
"""ZMQ-wrapper for running EngineCore in background process."""
ENGINE_CORE_DEAD = b'ENGINE_CORE_DEAD'
def __init__(
self,
input_path: str,
@@ -317,11 +319,16 @@ class EngineCoreProc(EngineCore):
log_stats: bool,
engine_index: int = 0,
):
super().__init__(vllm_config, executor_class, log_stats)
input_queue = queue.Queue[tuple[EngineCoreRequestType, Any]]()
executor_fail_callback = lambda: input_queue.put_nowait(
(EngineCoreRequestType.EXECUTOR_FAILED, b''))
super().__init__(vllm_config, executor_class, log_stats,
executor_fail_callback)
self.step_fn = (self.step if self.batch_queue is None else
self.step_with_batch_queue)
self.global_unfinished_reqs = False
# Background Threads and Queues for IO. These enable us to
@@ -329,15 +336,16 @@ class EngineCoreProc(EngineCore):
# and to overlap some serialization/deserialization with the
# model forward pass.
# Threads handle Socket <-> Queues and core_busy_loop uses Queue.
self.input_queue: queue.Queue[tuple[EngineCoreRequestType,
Any]] = queue.Queue()
self.output_queue: queue.Queue[EngineCoreOutputs] = queue.Queue()
self.input_queue = input_queue
self.output_queue = queue.Queue[Union[EngineCoreOutputs, bytes]]()
threading.Thread(target=self.process_input_socket,
args=(input_path, engine_index),
daemon=True).start()
threading.Thread(target=self.process_output_socket,
args=(output_path, engine_index),
daemon=True).start()
self.output_thread = threading.Thread(
target=self.process_output_socket,
args=(output_path, engine_index),
daemon=True)
self.output_thread.start()
@staticmethod
def run_engine_core(*args,
@@ -364,7 +372,6 @@ class EngineCoreProc(EngineCore):
signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)
parent_process = psutil.Process().parent()
engine_core: Optional[EngineCoreProc] = None
try:
parallel_config: ParallelConfig = kwargs[
@@ -380,13 +387,15 @@ class EngineCoreProc(EngineCore):
engine_core.run_busy_loop()
except SystemExit:
logger.debug("EngineCore interrupted.")
except Exception:
traceback = get_exception_traceback()
logger.error("EngineCore hit an exception: %s", traceback)
parent_process.send_signal(signal.SIGUSR1)
logger.debug("EngineCore exiting.")
except Exception as e:
if engine_core is None:
logger.exception("EngineCore failed to start.")
else:
logger.exception("EngineCore encountered a fatal error.")
engine_core._send_engine_dead()
raise e
finally:
if engine_core is not None:
engine_core.shutdown()
@@ -458,6 +467,11 @@ class EngineCoreProc(EngineCore):
f" failed: {str(e)}")
self.output_queue.put_nowait(
EngineCoreOutputs(utility_output=output))
elif request_type == EngineCoreRequestType.EXECUTOR_FAILED:
raise RuntimeError("Executor failed.")
else:
logger.error("Unrecognized input request type encountered: %s",
request_type)
@staticmethod
def _convert_msgspec_args(method, args):
@@ -473,6 +487,18 @@ class EngineCoreProc(EngineCore):
and not isinstance(v, p.annotation) else v
for v, p in zip(args, arg_types))
def _send_engine_dead(self):
"""Send EngineDead status to the EngineCoreClient."""
# Put ENGINE_CORE_DEAD in the queue.
self.output_queue.put_nowait(EngineCoreProc.ENGINE_CORE_DEAD)
# Wait until msg sent by the daemon before shutdown.
self.output_thread.join(timeout=5.0)
if self.output_thread.is_alive():
logger.fatal("vLLM shutdown signal from EngineCore failed "
"to send. Please report this issue.")
def process_input_socket(self, input_path: str, engine_index: int):
"""Input socket IO thread."""
@@ -511,9 +537,16 @@ class EngineCoreProc(EngineCore):
# Reuse send buffer.
buffer = bytearray()
with zmq_socket_ctx(output_path, zmq.constants.PUSH) as socket:
# We must set linger to ensure the ENGINE_CORE_DEAD
# message is sent prior to closing the socket.
with zmq_socket_ctx(output_path, zmq.constants.PUSH,
linger=4000) as socket:
while True:
outputs = self.output_queue.get()
if outputs == EngineCoreProc.ENGINE_CORE_DEAD:
socket.send(outputs, copy=False)
break
assert not isinstance(outputs, bytes)
outputs.engine_index = engine_index
buffers = encoder.encode_into(outputs, buffer)
socket.send_multipart(buffers, copy=False)