[Chore] Separate out NCCL utilities from vllm.utils (#27197)

Signed-off-by: dongbo910220 <1275604947@qq.com>
This commit is contained in:
dongbo910220
2025-10-21 21:18:23 +08:00
committed by GitHub
parent 80e9452984
commit 6c728f7771
4 changed files with 66 additions and 87 deletions

View File

@@ -14,7 +14,7 @@ from vllm import envs
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import find_nccl_include_paths
from vllm.utils.nccl import find_nccl_include_paths
logger = init_logger(__name__)

View File

@@ -33,7 +33,7 @@ from torch.distributed import ReduceOp
from vllm import envs
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import find_nccl_library
from vllm.utils.nccl import find_nccl_library
logger = init_logger(__name__)

View File

@@ -11,7 +11,6 @@ import json
import multiprocessing
import os
import signal
import subprocess
import sys
import tempfile
import textwrap
@@ -211,90 +210,6 @@ def init_cached_hf_modules() -> None:
init_hf_modules()
@cache
def find_library(lib_name: str) -> str:
    """Resolve a shared-library filename to a full path on this system.

    The dynamic-linker cache (`/sbin/ldconfig -p`) is consulted first. When
    it yields no match, or when `ldconfig` cannot be run at all, the
    directories listed in `LD_LIBRARY_PATH` are searched instead.

    Args:
        lib_name: Full library filename, with both prefix and suffix
            (for example `libcuda.so.1`).

    Returns:
        The first matching location.

    Raises:
        ValueError: If the library is found in neither place.
    """
    # Adapted from https://github.com/openai/triton/blob/main/third_party/nvidia/backend/driver.py#L19 # noqa
    # According to https://en.wikipedia.org/wiki/Filesystem_Hierarchy_Standard
    # `/sbin/ldconfig` should exist in all Linux systems.
    try:
        ldconfig_output = subprocess.check_output(["/sbin/ldconfig", "-p"]).decode()
    except (OSError, subprocess.CalledProcessError):
        # A missing or failing ldconfig must not prevent the
        # LD_LIBRARY_PATH fallback below from being tried.
        ldconfig_output = ""

    # each line looks like the following:
    #   libcuda.so.1 (libc6,x86-64) => /lib/x86_64-linux-gnu/libcuda.so.1
    locs = [
        line.split()[-1]
        for line in ldconfig_output.splitlines()
        if lib_name in line
    ]

    # `LD_LIBRARY_PATH` searches the library in the user-defined paths
    env_ld_library_path = envs.LD_LIBRARY_PATH
    if not locs and env_ld_library_path:
        candidates = (
            os.path.join(lib_dir, lib_name)
            for lib_dir in env_ld_library_path.split(":")
        )
        locs = [path for path in candidates if os.path.exists(path)]

    if not locs:
        raise ValueError(f"Cannot find {lib_name} in the system.")
    return locs[0]
def find_nccl_library() -> str:
    """Return the NCCL/RCCL shared library to load.

    The value of `VLLM_NCCL_SO_PATH` wins when it is set. Otherwise the
    default library name for the torch build is returned: `libnccl.so.2`
    for CUDA, `librccl.so.1` for ROCm.

    Raises:
        ValueError: If torch reports neither a CUDA nor a HIP version.
    """
    override = envs.VLLM_NCCL_SO_PATH
    if override:
        # An explicit user-supplied library takes precedence.
        logger.info(
            "Found nccl from environment variable VLLM_NCCL_SO_PATH=%s", override
        )
        return override

    if torch.version.cuda is not None:
        default_name = "libnccl.so.2"
    elif torch.version.hip is not None:
        default_name = "librccl.so.1"
    else:
        raise ValueError("NCCL only supports CUDA and ROCm backends.")

    logger.debug_once("Found nccl from library %s", default_name)
    return default_name
def find_nccl_include_paths() -> list[str] | None:
    """Return candidate include directories for `nccl.h`.

    Two sources are considered, in this order:

    1. `VLLM_NCCL_INCLUDE_PATH`, when it names an existing directory.
    2. The `include` directory of each search location of the
       `nvidia.nccl` package (shipped by nvidia-nccl-cuXX), when that
       directory contains `nccl.h`.

    load_inline by default uses torch.utils.cpp_extension.include_paths.

    Returns:
        The de-duplicated directories in discovery order, or `None` when
        nothing was found.
    """
    paths: list[str] = []

    inc = envs.VLLM_NCCL_INCLUDE_PATH
    if inc and os.path.isdir(inc):
        paths.append(inc)

    try:
        spec = importlib.util.find_spec("nvidia.nccl")
        if spec and getattr(spec, "submodule_search_locations", None):
            for loc in spec.submodule_search_locations:
                inc_dir = os.path.join(loc, "include")
                if os.path.exists(os.path.join(inc_dir, "nccl.h")):
                    paths.append(inc_dir)
    except Exception as e:
        # Best-effort lookup: a missing or broken package must not be
        # fatal, but the reason should be visible when debugging.
        logger.debug(
            "Failed to find nccl include path from nvidia.nccl package: %s", e
        )

    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(paths)) or None
def enable_trace_function_call_for_thread(vllm_config: VllmConfig) -> None:
"""Set up function tracing for the current thread,
if enabled via the VLLM_TRACE_FUNCTION environment variable

64
vllm/utils/nccl.py Normal file
View File

@@ -0,0 +1,64 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import importlib
import os
import torch
import vllm.envs as envs
from vllm.logger import init_logger
logger = init_logger(__name__)
def find_nccl_library() -> str:
    """Pick the NCCL/RCCL shared library that should be loaded.

    `VLLM_NCCL_SO_PATH` is returned as-is when set. Otherwise the choice
    follows the torch backend: `libnccl.so.2` for CUDA builds and
    `librccl.so.1` for ROCm builds.

    Raises:
        ValueError: If the torch build is neither CUDA nor ROCm.
    """
    path_from_env = envs.VLLM_NCCL_SO_PATH
    if path_from_env:
        logger.info(
            "Found nccl from environment variable VLLM_NCCL_SO_PATH=%s",
            path_from_env,
        )
        return path_from_env

    # The HIP version is only inspected when this is not a CUDA build.
    on_cuda = torch.version.cuda is not None
    if not on_cuda and torch.version.hip is None:
        raise ValueError("NCCL only supports CUDA and ROCm backends.")

    lib_name = "libnccl.so.2" if on_cuda else "librccl.so.1"
    logger.debug_once("Found nccl from library %s", lib_name)
    return lib_name
def find_nccl_include_paths() -> list[str] | None:
    """Return possible include paths containing `nccl.h`.

    Considers, in order, `VLLM_NCCL_INCLUDE_PATH` (when it names an
    existing directory) and the `include` directory of each search
    location of the `nvidia.nccl` package from `nvidia-nccl-cuXX` (when
    that directory contains `nccl.h`).

    Returns:
        The de-duplicated directories in discovery order, or `None` when
        no candidate was found.
    """
    # `import importlib` alone does not guarantee that the `util`
    # submodule is bound; import it explicitly so the lookup below cannot
    # fail with an AttributeError that the broad `except` would hide.
    import importlib.util

    paths: list[str] = []

    inc = envs.VLLM_NCCL_INCLUDE_PATH
    if inc and os.path.isdir(inc):
        paths.append(inc)

    try:
        spec = importlib.util.find_spec("nvidia.nccl")
        if spec and getattr(spec, "submodule_search_locations", None):
            for loc in spec.submodule_search_locations:
                inc_dir = os.path.join(loc, "include")
                if os.path.exists(os.path.join(inc_dir, "nccl.h")):
                    paths.append(inc_dir)
    except Exception as e:
        # Best-effort lookup: a missing package must not be fatal.
        logger.debug(
            "Failed to find nccl include path from nvidia.nccl package: %s", e
        )

    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(paths)) or None