[Bugfix] Fix render server crash for quantized models on CPU-only hosts (#37215)
Signed-off-by: Sage Ahrac <sagiahrak@gmail.com>
This commit is contained in:
@@ -116,6 +116,11 @@ async def run_launch_fastapi(args: argparse.Namespace) -> None:
|
||||
# 2. Build and serve the API server
|
||||
engine_args = AsyncEngineArgs.from_cli_args(args)
|
||||
model_config = engine_args.create_model_config()
|
||||
|
||||
# Render servers preprocess data only — no inference, no quantized kernels.
|
||||
# Clear quantization so VllmConfig skips quantization dtype/capability validation.
|
||||
model_config.quantization = None
|
||||
|
||||
vllm_config = VllmConfig(model_config=model_config)
|
||||
shutdown_task = await build_and_serve_renderer(
|
||||
vllm_config, listen_address, sock, args
|
||||
|
||||
Reference in New Issue
Block a user