[torch.compile] limit inductor threads and lazy import quant (#10482)

Signed-off-by: youkaichao <youkaichao@gmail.com>
2024-11-20 18:36:33 -08:00
parent 2f77b6cfec
commit 388ee3de66
11 changed files with 178 additions and 64 deletions
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -10,6 +10,8 @@ import pynvml
 import torch
 from typing_extensions import ParamSpec

+# import custom ops, trigger op registration
+import vllm._C  # noqa
 from vllm.logger import init_logger

 from .interface import DeviceCapability, Platform, PlatformEnum
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -9,6 +9,17 @@ from .interface import DeviceCapability, Platform, PlatformEnum, _Backend

 logger = init_logger(__name__)

+try:
+    import vllm._C  # noqa: F401
+except ImportError as e:
+    logger.warning("Failed to import from vllm._C with %r", e)
+
+# import ROCm-specific custom ops, trigger op registration
+try:
+    import vllm._rocm_C  # noqa: F401
+except ImportError as e:
+    logger.warning("Failed to import from vllm._rocm_C with %r", e)
+
 if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", None) in ["fork", None]:
    logger.warning("`fork` method is not supported by ROCm. "
                   "VLLM_WORKER_MULTIPROC_METHOD is overridden to"