From 0fefd00e6ccf6670686eb2cc0a5eda57f56e625a Mon Sep 17 00:00:00 2001
From: Sage <80211083+sagearc@users.noreply.github.com>
Date: Mon, 16 Mar 2026 20:59:01 +0200
Subject: [PATCH] [Bugfix] Fix render server crash for quantized models on CPU-only hosts (#37215)

Signed-off-by: Sage Ahrac
---
 vllm/entrypoints/cli/launch.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/vllm/entrypoints/cli/launch.py b/vllm/entrypoints/cli/launch.py
index 6afa24353..cc9e467c4 100644
--- a/vllm/entrypoints/cli/launch.py
+++ b/vllm/entrypoints/cli/launch.py
@@ -116,6 +116,11 @@ async def run_launch_fastapi(args: argparse.Namespace) -> None:
     # 2. Build and serve the API server
     engine_args = AsyncEngineArgs.from_cli_args(args)
     model_config = engine_args.create_model_config()
+
+    # Render servers preprocess data only — no inference, no quantized kernels.
+    # Clear quantization so VllmConfig skips quant dtype/capability validation.
+    model_config.quantization = None
+
     vllm_config = VllmConfig(model_config=model_config)
     shutdown_task = await build_and_serve_renderer(
         vllm_config, listen_address, sock, args