From 00e6402d56fb258e6958381b1f3ceb34217ba830 Mon Sep 17 00:00:00 2001 From: Chauncey Date: Wed, 14 Jan 2026 20:00:37 +0800 Subject: [PATCH] [Frontend] track responsesAPI server_load (#32323) Signed-off-by: chaunceyjiang --- vllm/entrypoints/openai/api_server.py | 4 ++++ vllm/entrypoints/openai/responses/api_router.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index af6a24cee..39d07f6d2 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -259,6 +259,10 @@ def engine_client(request: Request) -> EngineClient: async def get_server_load_metrics(request: Request): # This endpoint returns the current server load metrics. # It tracks requests utilizing the GPU from the following routes: + # - /v1/responses + # - /v1/responses/{response_id} + # - /v1/responses/{response_id}/cancel + # - /v1/messages # - /v1/chat/completions # - /v1/completions # - /v1/audio/transcriptions diff --git a/vllm/entrypoints/openai/responses/api_router.py b/vllm/entrypoints/openai/responses/api_router.py index 5eca91179..2be69999e 100644 --- a/vllm/entrypoints/openai/responses/api_router.py +++ b/vllm/entrypoints/openai/responses/api_router.py @@ -17,6 +17,7 @@ from vllm.entrypoints.openai.responses.protocol import ( from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.utils import ( + load_aware_call, with_cancellation, ) from vllm.logger import init_logger @@ -54,6 +55,7 @@ async def _convert_stream_to_sse_events( }, ) @with_cancellation +@load_aware_call async def create_responses(request: ResponsesRequest, raw_request: Request): handler = responses(raw_request) if handler is None: @@ -79,6 +81,7 @@ async def create_responses(request: ResponsesRequest, raw_request: Request): @router.get("/v1/responses/{response_id}") +@load_aware_call async def retrieve_responses( response_id: str, raw_request: Request, @@ -113,6 +116,7 @@ async def retrieve_responses( @router.post("/v1/responses/{response_id}/cancel") +@load_aware_call async def cancel_responses(response_id: str, raw_request: Request): handler = responses(raw_request) if handler is None: