feat(api): Eager chat template warmup to eliminate first-request latency (#30700)

Signed-off-by: Nathan Price <nathan@abridge.com>
2025-12-17 18:01:29 -06:00
parent e3fc374a9a
commit 05a83dc6ee
2 changed files with 52 additions and 0 deletions
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -1082,6 +1082,9 @@ async def init_app_state(
        if "generate" in supported_tasks
        else None
    )
+    # Warm up chat template processing at startup to avoid extra first-request latency
+    if state.openai_serving_chat is not None:
+        await state.openai_serving_chat.warmup()
    state.openai_serving_completion = (
        OpenAIServingCompletion(
            engine_client,