[torch.compile] limit inductor threads and lazy import quant (#10482)

Signed-off-by: youkaichao <youkaichao@gmail.com>
This commit is contained in:
youkaichao
2024-11-20 18:36:33 -08:00
committed by GitHub
parent 2f77b6cfec
commit 388ee3de66
11 changed files with 178 additions and 64 deletions

View File

@@ -9,6 +9,17 @@ from .interface import DeviceCapability, Platform, PlatformEnum, _Backend
logger = init_logger(__name__)

# Imported only for its side effects: the bound name is never used, which is
# why the unused-import check (F401) is suppressed. If the import fails, the
# error is logged as a warning and module loading continues.
try:
    import vllm._C  # noqa: F401
except ImportError as err:
    logger.warning("Failed to import from vllm._C with %r", err)

# import custom ops, trigger op registration
# Same best-effort handling as above: a failed import is logged as a warning
# and not re-raised.
try:
    import vllm._rocm_C  # noqa: F401
except ImportError as err:
    logger.warning("Failed to import from vllm._rocm_C with %r", err)
if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", None) in ["fork", None]:
logger.warning("`fork` method is not supported by ROCm. "
"VLLM_WORKER_MULTIPROC_METHOD is overridden to"