[torch.compile] limit inductor threads and lazy import quant (#10482)

Signed-off-by: youkaichao <youkaichao@gmail.com>
This commit is contained in:
youkaichao
2024-11-20 18:36:33 -08:00
committed by GitHub
parent 2f77b6cfec
commit 388ee3de66
11 changed files with 178 additions and 64 deletions

View File

@@ -1,4 +1,5 @@
import logging
import os
from contextlib import contextmanager
from typing import TYPE_CHECKING, Optional
@@ -18,6 +19,14 @@ def load_general_plugins():
processes. They should be designed in a way that they can be loaded
multiple times without causing issues.
"""
# All processes created by vLLM load plugins, so this is the place
# to inject environment variables common to every process.
# Here TorchInductor is limited to a single compile thread.
# See https://github.com/vllm-project/vllm/issues/10480
os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1'
global plugins_loaded
if plugins_loaded:
return