From 1c0aabdeb0cf77019a1f89b5bed5b8eebdd5c211 Mon Sep 17 00:00:00 2001
From: Sage <80211083+sagearc@users.noreply.github.com>
Date: Tue, 24 Mar 2026 14:36:18 +0200
Subject: [PATCH] [Bugfix] Suppress spurious CPU KV cache warning in `launch
 render` (#37911)

Signed-off-by: Sage Ahrac <sagiahrak@gmail.com>
---
 vllm/entrypoints/cli/launch.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/vllm/entrypoints/cli/launch.py b/vllm/entrypoints/cli/launch.py
index cc9e467c4..9871a27da 100644
--- a/vllm/entrypoints/cli/launch.py
+++ b/vllm/entrypoints/cli/launch.py
@@ -5,6 +5,8 @@ import argparse
 
 import uvloop
 
+from vllm import envs
+from vllm.config import VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.entrypoints.cli.types import CLISubcommand
 from vllm.entrypoints.openai.api_server import (
@@ -108,8 +110,6 @@ def cmd_init() -> list[CLISubcommand]:
 
 async def run_launch_fastapi(args: argparse.Namespace) -> None:
     """Run the online serving layer with FastAPI (no GPU inference)."""
-    from vllm.config import VllmConfig
-
     # 1. Socket binding
     listen_address, sock = setup_server(args)
 
@@ -121,6 +121,10 @@ async def run_launch_fastapi(args: argparse.Namespace) -> None:
     # Clear quantization so VllmConfig skips quant dtype/capability validation.
     model_config.quantization = None
 
+    # Render servers never allocate KV cache; suppress the spurious CPU KV
+    # cache space warning from CpuPlatform.check_and_update_config.
+    envs.VLLM_CPU_KVCACHE_SPACE = 0
+
     vllm_config = VllmConfig(model_config=model_config)
     shutdown_task = await build_and_serve_renderer(
         vllm_config, listen_address, sock, args