diff --git a/vllm/entrypoints/cli/launch.py b/vllm/entrypoints/cli/launch.py index cc9e467c4..9871a27da 100644 --- a/vllm/entrypoints/cli/launch.py +++ b/vllm/entrypoints/cli/launch.py @@ -5,6 +5,8 @@ import argparse import uvloop +from vllm import envs +from vllm.config import VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.entrypoints.cli.types import CLISubcommand from vllm.entrypoints.openai.api_server import ( @@ -108,8 +110,6 @@ def cmd_init() -> list[CLISubcommand]: async def run_launch_fastapi(args: argparse.Namespace) -> None: """Run the online serving layer with FastAPI (no GPU inference).""" - from vllm.config import VllmConfig - # 1. Socket binding listen_address, sock = setup_server(args) @@ -121,6 +121,10 @@ async def run_launch_fastapi(args: argparse.Namespace) -> None: # Clear quantization so VllmConfig skips quant dtype/capability validation. model_config.quantization = None + # Render servers never allocate KV cache; suppress the spurious CPU KV + # cache space warning from CpuPlatform.check_and_update_config. + envs.VLLM_CPU_KVCACHE_SPACE = 0 + vllm_config = VllmConfig(model_config=model_config) shutdown_task = await build_and_serve_renderer( vllm_config, listen_address, sock, args