[Bugfix] Fix render server crash for quantized models on CPU-only hosts (#37215)

Signed-off-by: Sage Ahrac <sagiahrak@gmail.com>
2026-03-16 20:59:01 +02:00
parent f5c081d432
commit 0fefd00e6c
1 changed file with 5 additions and 0 deletions
--- a/vllm/entrypoints/cli/launch.py
+++ b/vllm/entrypoints/cli/launch.py
@@ -116,6 +116,11 @@ async def run_launch_fastapi(args: argparse.Namespace) -> None:
    # 2. Build and serve the API server
    engine_args = AsyncEngineArgs.from_cli_args(args)
    model_config = engine_args.create_model_config()
+
+    # Render servers preprocess data only — no inference, no quantized kernels.
+    # Clear quantization so VllmConfig skips quant dtype/capability validation.
+    model_config.quantization = None
+
    vllm_config = VllmConfig(model_config=model_config)
    shutdown_task = await build_and_serve_renderer(
        vllm_config, listen_address, sock, args