diff --git a/vllm/entrypoints/cli/launch.py b/vllm/entrypoints/cli/launch.py index 6afa24353..cc9e467c4 100644 --- a/vllm/entrypoints/cli/launch.py +++ b/vllm/entrypoints/cli/launch.py @@ -116,6 +116,11 @@ async def run_launch_fastapi(args: argparse.Namespace) -> None: # 2. Build and serve the API server engine_args = AsyncEngineArgs.from_cli_args(args) model_config = engine_args.create_model_config() + + # Render servers preprocess data only — no inference, no quantized kernels. + # Clear quantization so VllmConfig skips quant dtype/capability validation. + model_config.quantization = None + vllm_config = VllmConfig(model_config=model_config) shutdown_task = await build_and_serve_renderer( vllm_config, listen_address, sock, args