From b55156eae9aa586b8fbeb752ecb369179442c521 Mon Sep 17 00:00:00 2001
From: Artem Perevedentsev
Date: Thu, 19 Mar 2026 23:36:28 +0200
Subject: [PATCH] [Performance] Enable Triton autotuning disk cache by default (#37188)

Signed-off-by: Artem Perevedentsev
---
 vllm/env_override.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/vllm/env_override.py b/vllm/env_override.py
index 181d000a6..5358568fc 100644
--- a/vllm/env_override.py
+++ b/vllm/env_override.py
@@ -105,6 +105,14 @@ os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
 # see https://github.com/vllm-project/vllm/issues/10619
 torch._inductor.config.compile_threads = 1
 
+# Enable Triton autotuning result caching to disk by default.
+# Without this, Triton re-runs autotuning on every process restart,
+# adding significant latency to the first inference request.
+# This writes autotuning results to TRITON_CACHE_DIR.
+# It can still be overridden by setting TRITON_CACHE_AUTOTUNING=0
+# in the environment.
+os.environ.setdefault("TRITON_CACHE_AUTOTUNING", "1")
+
 # ===================================================
 # torch 2.9 Inductor PythonWrapperCodegen monkeypatch
 # ===================================================