From 1cab43c2d2172459987652a7fbf922c6611a71fa Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 3 Apr 2025 01:02:58 +0800
Subject: [PATCH] [misc] instruct pytorch to use nvml-based cuda check (#15951)

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 vllm/__init__.py     | 20 ++++----------------
 vllm/env_override.py | 21 +++++++++++++++++++++
 2 files changed, 25 insertions(+), 16 deletions(-)
 create mode 100644 vllm/env_override.py

diff --git a/vllm/__init__.py b/vllm/__init__.py
index 457780824..52022fb8f 100644
--- a/vllm/__init__.py
+++ b/vllm/__init__.py
@@ -4,9 +4,10 @@
 # version library first.  Such assumption is critical for some customization.
 from .version import __version__, __version_tuple__  # isort:skip
 
-import os
-
-import torch
+# Import the environment-variable overrides before any other vllm module so
+# that the variables are already set by the time those modules (and anything
+# they import) are loaded.
+import vllm.env_override  # isort:skip  # noqa: F401
 
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
@@ -23,19 +24,6 @@ from vllm.outputs import (ClassificationOutput, ClassificationRequestOutput,
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
 
-# set some common config/environment variables that should be set
-# for all processes created by vllm and all processes
-# that interact with vllm workers.
-# they are executed whenever `import vllm` is called.
-
-# see https://github.com/NVIDIA/nccl/issues/1234
-os.environ['NCCL_CUMEM_ENABLE'] = '0'
-
-# see https://github.com/vllm-project/vllm/issues/10480
-os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1'
-# see https://github.com/vllm-project/vllm/issues/10619
-torch._inductor.config.compile_threads = 1
-
 __all__ = [
     "__version__",
     "__version_tuple__",
diff --git a/vllm/env_override.py b/vllm/env_override.py
new file mode 100644
index 000000000..0fa5b70c2
--- /dev/null
+++ b/vllm/env_override.py
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: Apache-2.0
+import os
+
+import torch
+
+# Set common config/environment variables that must hold for every process
+# created by vllm and for every process that interacts with vllm workers.
+# This module is imported from `vllm/__init__.py`, so these assignments run
+# whenever `vllm` is imported.
+
+# see https://github.com/NVIDIA/nccl/issues/1234
+os.environ['NCCL_CUMEM_ENABLE'] = '0'
+
+# use the NVML-based check so torch.cuda.is_available() does not accidentally
+# initialize CUDA, see https://github.com/vllm-project/vllm/pull/15951
+os.environ['PYTORCH_NVML_BASED_CUDA_CHECK'] = '1'
+
+# see https://github.com/vllm-project/vllm/issues/10480
+os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1'
+# see https://github.com/vllm-project/vllm/issues/10619
+torch._inductor.config.compile_threads = 1