[torch.compile] limit inductor threads and lazy import quant (#10482)

Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-20 18:36:33 -08:00
parent 2f77b6cfec
commit 388ee3de66
11 changed files with 178 additions and 64 deletions
--- a/vllm/plugins/init.py
+++ b/vllm/plugins/init.py
@@ -1,4 +1,5 @@
 import logging
+import os
 from contextlib import contextmanager
 from typing import TYPE_CHECKING, Optional

@@ -18,6 +19,14 @@ def load_general_plugins():
    processes. They should be designed in a way that they can be loaded
    multiple times without causing issues.
    """
+
+    # Every process created by vLLM loads plugins, so this is a
+    # convenient place to set environment variables that should be
+    # common to all of those processes.
+
+    # see https://github.com/vllm-project/vllm/issues/10480
+    os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1'
+
    global plugins_loaded
    if plugins_loaded:
        return