[distributed][rl] remove nccl cumem env var override (#24141)

Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-09-03 14:45:25 +08:00
parent 426cc8629f
commit f38035c123
2 changed files with 1 additions and 19 deletions
--- a/vllm/env_override.py
+++ b/vllm/env_override.py
@@ -13,24 +13,6 @@ logger = init_logger(__name__)
 # that interact with vllm workers.
 # they are executed whenever `import vllm` is called.

-if os.environ.get('NCCL_CUMEM_ENABLE', '0') != '0':
-    logger.warning(
-        "NCCL_CUMEM_ENABLE is set to %s, skipping override. "
-        "This may increase memory overhead with cudagraph+allreduce: "
-        "https://github.com/NVIDIA/nccl/issues/1234",
-        os.environ['NCCL_CUMEM_ENABLE'])
-elif not os.path.exists('/dev/nvidia-caps-imex-channels'):
-    # NCCL requires NCCL_CUMEM_ENABLE to work with
-    # multi-node NVLink, typically on GB200-NVL72 systems.
-    # The ultimate way to detect multi-node NVLink is to use
-    # NVML APIs, which are too expensive to call here.
-    # As an approximation, we check the existence of
-    # /dev/nvidia-caps-imex-channels, used by
-    # multi-node NVLink to communicate across nodes.
-    # This will still cost some GPU memory, but it is worthwhile
-    # because we can get very fast cross-node bandwidth with NVLink.
-    os.environ['NCCL_CUMEM_ENABLE'] = '0'
-
 # see https://github.com/vllm-project/vllm/pull/15951
 # it avoids unintentional cuda initialization from torch.cuda.is_available()
 os.environ['PYTORCH_NVML_BASED_CUDA_CHECK'] = '1'