Move the NCCL lookup helpers out of vllm/utils/__init__.py into a new
vllm/utils/nccl.py module.

* find_nccl_library and find_nccl_include_paths are re-created in
  vllm/utils/nccl.py; pynccl_allocator.py and pynccl_wrapper.py now import
  them from the new location.
* find_library and the `import subprocess` it relied on are removed from
  vllm/utils/__init__.py and are not re-added anywhere in this patch.
* find_nccl_include_paths now debug-logs a failed nvidia.nccl lookup instead
  of silently passing.
* vllm/utils/nccl.py imports `importlib.util` explicitly. A bare
  `import importlib` does not guarantee that the `util` submodule is loaded,
  and the resulting AttributeError would be swallowed by the `except
  Exception` around find_spec, silently dropping the nvidia.nccl include
  directory.

NOTE(review): line breaks and indentation were reconstructed from a
whitespace-collapsed copy of this patch and checked against the hunk line
counts; the `index` blob id of vllm/utils/nccl.py is carried over unchanged.

diff --git a/vllm/distributed/device_communicators/pynccl_allocator.py b/vllm/distributed/device_communicators/pynccl_allocator.py
index a2ed3628f..401b80046 100644
--- a/vllm/distributed/device_communicators/pynccl_allocator.py
+++ b/vllm/distributed/device_communicators/pynccl_allocator.py
@@ -14,7 +14,7 @@ from vllm import envs
 from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import find_nccl_include_paths
+from vllm.utils.nccl import find_nccl_include_paths
 
 logger = init_logger(__name__)
 
diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py
index 28d4afde1..b2433d58d 100644
--- a/vllm/distributed/device_communicators/pynccl_wrapper.py
+++ b/vllm/distributed/device_communicators/pynccl_wrapper.py
@@ -33,7 +33,7 @@ from torch.distributed import ReduceOp
 from vllm import envs
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import find_nccl_library
+from vllm.utils.nccl import find_nccl_library
 
 logger = init_logger(__name__)
 
diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py
index 9a52e9999..6704668c9 100644
--- a/vllm/utils/__init__.py
+++ b/vllm/utils/__init__.py
@@ -11,7 +11,6 @@ import json
 import multiprocessing
 import os
 import signal
-import subprocess
 import sys
 import tempfile
 import textwrap
@@ -211,90 +210,6 @@ def init_cached_hf_modules() -> None:
     init_hf_modules()
 
 
-@cache
-def find_library(lib_name: str) -> str:
-    """
-    Find the library file in the system.
-    `lib_name` is full filename, with both prefix and suffix.
-    This function resolves `lib_name` to the full path of the library.
-    """
-    # Adapted from https://github.com/openai/triton/blob/main/third_party/nvidia/backend/driver.py#L19 # noqa
-    # According to https://en.wikipedia.org/wiki/Filesystem_Hierarchy_Standard
-    # `/sbin/ldconfig` should exist in all Linux systems.
-    # `/sbin/ldconfig` searches the library in the system
-    libs = subprocess.check_output(["/sbin/ldconfig", "-p"]).decode()
-    # each line looks like the following:
-    # libcuda.so.1 (libc6,x86-64) => /lib/x86_64-linux-gnu/libcuda.so.1
-    locs = [line.split()[-1] for line in libs.splitlines() if lib_name in line]
-    # `LD_LIBRARY_PATH` searches the library in the user-defined paths
-    env_ld_library_path = envs.LD_LIBRARY_PATH
-    if not locs and env_ld_library_path:
-        locs = [
-            os.path.join(dir, lib_name)
-            for dir in env_ld_library_path.split(":")
-            if os.path.exists(os.path.join(dir, lib_name))
-        ]
-    if not locs:
-        raise ValueError(f"Cannot find {lib_name} in the system.")
-    return locs[0]
-
-
-def find_nccl_library() -> str:
-    """
-    We either use the library file specified by the `VLLM_NCCL_SO_PATH`
-    environment variable, or we find the library file brought by PyTorch.
-    After importing `torch`, `libnccl.so.2` or `librccl.so.1` can be
-    found by `ctypes` automatically.
-    """
-    so_file = envs.VLLM_NCCL_SO_PATH
-
-    # manually load the nccl library
-    if so_file:
-        logger.info(
-            "Found nccl from environment variable VLLM_NCCL_SO_PATH=%s", so_file
-        )
-    else:
-        if torch.version.cuda is not None:
-            so_file = "libnccl.so.2"
-        elif torch.version.hip is not None:
-            so_file = "librccl.so.1"
-        else:
-            raise ValueError("NCCL only supports CUDA and ROCm backends.")
-        logger.debug_once("Found nccl from library %s", so_file)
-    return so_file
-
-
-def find_nccl_include_paths() -> list[str] | None:
-    """
-    We either use the nccl.h specified by the `VLLM_NCCL_INCLUDE_PATH`
-    environment variable, or we find the library file brought by
-    nvidia-nccl-cuXX. load_inline by default uses
-    torch.utils.cpp_extension.include_paths
-    """
-    paths: list[str] = []
-    inc = envs.VLLM_NCCL_INCLUDE_PATH
-    if inc and os.path.isdir(inc):
-        paths.append(inc)
-
-    try:
-        spec = importlib.util.find_spec("nvidia.nccl")
-        if spec and getattr(spec, "submodule_search_locations", None):
-            for loc in spec.submodule_search_locations:
-                inc_dir = os.path.join(loc, "include")
-                if os.path.exists(os.path.join(inc_dir, "nccl.h")):
-                    paths.append(inc_dir)
-    except Exception:
-        pass
-
-    seen = set()
-    out: list[str] = []
-    for p in paths:
-        if p and p not in seen:
-            out.append(p)
-            seen.add(p)
-    return out or None
-
-
 def enable_trace_function_call_for_thread(vllm_config: VllmConfig) -> None:
     """Set up function tracing for the current thread,
     if enabled via the VLLM_TRACE_FUNCTION environment variable
diff --git a/vllm/utils/nccl.py b/vllm/utils/nccl.py
new file mode 100644
index 000000000..b1459fcbd
--- /dev/null
+++ b/vllm/utils/nccl.py
@@ -0,0 +1,64 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from __future__ import annotations
+
+import importlib.util
+import os
+
+import torch
+
+import vllm.envs as envs
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+def find_nccl_library() -> str:
+    """Return NCCL/RCCL shared library name to load.
+
+    Uses `VLLM_NCCL_SO_PATH` if set; otherwise chooses by torch backend.
+    """
+    so_file = envs.VLLM_NCCL_SO_PATH
+    if so_file:
+        logger.info(
+            "Found nccl from environment variable VLLM_NCCL_SO_PATH=%s", so_file
+        )
+    else:
+        if torch.version.cuda is not None:
+            so_file = "libnccl.so.2"
+        elif torch.version.hip is not None:
+            so_file = "librccl.so.1"
+        else:
+            raise ValueError("NCCL only supports CUDA and ROCm backends.")
+        logger.debug_once("Found nccl from library %s", so_file)
+    return so_file
+
+
+def find_nccl_include_paths() -> list[str] | None:
+    """Return possible include paths containing `nccl.h`.
+
+    Considers `VLLM_NCCL_INCLUDE_PATH` and the `nvidia-nccl-cuXX` package.
+    """
+    paths: list[str] = []
+    inc = envs.VLLM_NCCL_INCLUDE_PATH
+    if inc and os.path.isdir(inc):
+        paths.append(inc)
+
+    try:
+        spec = importlib.util.find_spec("nvidia.nccl")
+        if spec and getattr(spec, "submodule_search_locations", None):
+            for loc in spec.submodule_search_locations:
+                inc_dir = os.path.join(loc, "include")
+                if os.path.exists(os.path.join(inc_dir, "nccl.h")):
+                    paths.append(inc_dir)
+    except Exception as e:
+        logger.debug("Failed to find nccl include path from nvidia.nccl package: %s", e)
+
+    seen: set[str] = set()
+    out: list[str] = []
+    for p in paths:
+        if p and p not in seen:
+            out.append(p)
+            seen.add(p)
+    return out or None