From 2e4ff6b8d4f432ea3cdf81d46d5e7a7a34e4ef47 Mon Sep 17 00:00:00 2001
From: biondizzle
Date: Sat, 16 May 2026 06:02:11 +0000
Subject: [PATCH] fix: increase vLLM execute_model RPC timeout to 10 min for
 first-request JIT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

First inference triggers Triton/TileLang kernel JIT compilation (2-3 min).
The default 5-min RPC timeout kills the engine. Bumped to 10 min via
VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS so the first request survives
compilation.

Note: the 5-min default belongs to VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS
(seconds, default 300). VLLM_RPC_TIMEOUT_MS is not a variable vLLM reads;
the similarly named VLLM_RPC_TIMEOUT is in milliseconds and governs a
different, short-lived client RPC.

Not ideal — would prefer to warm up the kernels during startup. But CUDA
graphs don't work well with grouped GEMMs and variable expert counts.
Will investigate vLLM warmup shape config later.
---
 docker-compose.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docker-compose.yml b/docker-compose.yml
index 2017c8e7..11cf955f 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -9,6 +9,7 @@ services:
       - OMP_NUM_THREADS=128
       - CUDA_LAUNCH_BLOCKING=0
       - PYTHONUNBUFFERED=1
+      - VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS=600
     command:
       - /model
       - --trust-remote-code