[Feature] Add basic metrics for /realtime endpoint (#35500)
Signed-off-by: Thomas Pouget-Abadie <thomaspou@microsoft.com> Signed-off-by: pougetat <thomas.pougetabadie@gmail.com> Co-authored-by: Thomas Pouget-Abadie <thomaspou@microsoft.com> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
This commit is contained in:
@@ -264,6 +264,14 @@ def build_app(
|
|||||||
# Add scaling middleware to check for scaling state
|
# Add scaling middleware to check for scaling state
|
||||||
app.add_middleware(ScalingMiddleware)
|
app.add_middleware(ScalingMiddleware)
|
||||||
|
|
||||||
|
if "realtime" in supported_tasks:
|
||||||
|
# Add WebSocket metrics middleware
|
||||||
|
from vllm.entrypoints.openai.realtime.metrics import (
|
||||||
|
WebSocketMetricsMiddleware,
|
||||||
|
)
|
||||||
|
|
||||||
|
app.add_middleware(WebSocketMetricsMiddleware)
|
||||||
|
|
||||||
if envs.VLLM_DEBUG_LOG_API_SERVER_RESPONSE:
|
if envs.VLLM_DEBUG_LOG_API_SERVER_RESPONSE:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"CAUTION: Enabling log response in the API Server. "
|
"CAUTION: Enabling log response in the API Server. "
|
||||||
|
|||||||
78
vllm/entrypoints/openai/realtime/metrics.py
Normal file
78
vllm/entrypoints/openai/realtime/metrics.py
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
"""ASGI middleware for WebSocket Prometheus metrics.
|
||||||
|
|
||||||
|
Modeled after prometheus-fastapi-instrumentator, this middleware
|
||||||
|
transparently instruments WebSocket endpoints with standard metrics
|
||||||
|
without requiring changes to handler code.
|
||||||
|
|
||||||
|
NOTE: This module intentionally has zero vllm imports so that it can
|
||||||
|
be extracted into a standalone package (similar to
|
||||||
|
prometheus-fastapi-instrumentator) in the future. Please keep it that way.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import time
|
||||||
|
from collections.abc import Awaitable
|
||||||
|
|
||||||
|
from prometheus_client import Counter, Gauge, Histogram
|
||||||
|
from starlette.types import ASGIApp, Message, Receive, Scope, Send
|
||||||
|
|
||||||
|
# Standard WebSocket metric names (not vllm-specific, following
|
||||||
|
# the same convention as prometheus-fastapi-instrumentator).
|
||||||
|
_active_sessions = Gauge(
|
||||||
|
name="vllm:websocket_connections_active",
|
||||||
|
documentation="Number of currently active WebSocket connections.",
|
||||||
|
multiprocess_mode="livesum",
|
||||||
|
)
|
||||||
|
|
||||||
|
_total_sessions = Counter(
|
||||||
|
name="vllm:websocket_connections_total",
|
||||||
|
documentation="Total number of WebSocket connections.",
|
||||||
|
)
|
||||||
|
|
||||||
|
_session_duration = Histogram(
|
||||||
|
name="vllm:websocket_connection_duration_seconds",
|
||||||
|
documentation="Duration of WebSocket connections in seconds.",
|
||||||
|
buckets=[0.5, 1, 2.5, 5, 10, 30, 60, 120, 300, 600, 1800],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class WebSocketMetricsMiddleware:
|
||||||
|
"""Pure ASGI middleware that instruments WebSocket connections.
|
||||||
|
|
||||||
|
Tracks active connections (gauge), total connections (counter),
|
||||||
|
and connection duration (histogram) for all WebSocket endpoints.
|
||||||
|
|
||||||
|
Usage::
|
||||||
|
|
||||||
|
app.add_middleware(WebSocketMetricsMiddleware)
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, app: ASGIApp) -> None:
|
||||||
|
self.app = app
|
||||||
|
|
||||||
|
def __call__(self, scope: Scope, receive: Receive, send: Send) -> Awaitable[None]:
|
||||||
|
if scope["type"] != "websocket":
|
||||||
|
return self.app(scope, receive, send)
|
||||||
|
|
||||||
|
return self._handle_websocket(scope, receive, send)
|
||||||
|
|
||||||
|
async def _handle_websocket(
|
||||||
|
self, scope: Scope, receive: Receive, send: Send
|
||||||
|
) -> None:
|
||||||
|
start_time: float | None = None
|
||||||
|
|
||||||
|
async def send_wrapper(message: Message) -> None:
|
||||||
|
nonlocal start_time
|
||||||
|
if message["type"] == "websocket.accept":
|
||||||
|
start_time = time.monotonic()
|
||||||
|
_active_sessions.inc()
|
||||||
|
_total_sessions.inc()
|
||||||
|
await send(message)
|
||||||
|
|
||||||
|
try:
|
||||||
|
await self.app(scope, receive, send_wrapper)
|
||||||
|
finally:
|
||||||
|
if start_time is not None:
|
||||||
|
_active_sessions.dec()
|
||||||
|
_session_duration.observe(time.monotonic() - start_time)
|
||||||
Reference in New Issue
Block a user