[Performance] Enable Triton autotuning disk cache by default (#37188)
Signed-off-by: Artem Perevedentsev <aperevedents@nvidia.com>
This commit is contained in:
committed by
GitHub
parent
112944fab9
commit
b55156eae9
@@ -105,6 +105,14 @@ os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
# see https://github.com/vllm-project/vllm/issues/10619
torch._inductor.config.compile_threads = 1

# Enable Triton autotuning result caching to disk by default.
# Without this, Triton re-runs autotuning on every process restart,
# adding significant latency to the first inference request.
# This writes autotuning results to TRITON_CACHE_DIR.
# It can still be overridden by setting TRITON_CACHE_AUTOTUNING=0
# in the environment.
os.environ.setdefault("TRITON_CACHE_AUTOTUNING", "1")
# ===================================================
# torch 2.9 Inductor PythonWrapperCodegen monkeypatch
# ===================================================
Reference in New Issue
Block a user