[Chore] Separate out NCCL utilities from vllm.utils (#27197)
Signed-off-by: dongbo910220 <1275604947@qq.com>
This commit is contained in:
@@ -14,7 +14,7 @@ from vllm import envs
|
||||
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import find_nccl_include_paths
|
||||
from vllm.utils.nccl import find_nccl_include_paths
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
@@ -33,7 +33,7 @@ from torch.distributed import ReduceOp
|
||||
from vllm import envs
|
||||
from vllm.logger import init_logger
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import find_nccl_library
|
||||
from vllm.utils.nccl import find_nccl_library
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
@@ -11,7 +11,6 @@ import json
|
||||
import multiprocessing
|
||||
import os
|
||||
import signal
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import textwrap
|
||||
@@ -211,90 +210,6 @@ def init_cached_hf_modules() -> None:
|
||||
init_hf_modules()
|
||||
|
||||
|
||||
@cache
def find_library(lib_name: str) -> str:
    """
    Find the library file in the system.

    `lib_name` is full filename, with both prefix and suffix.
    This function resolves `lib_name` to the full path of the library.

    Args:
        lib_name: Full shared-library filename (e.g. ``libcuda.so.1``).

    Returns:
        The first matching absolute path found.

    Raises:
        ValueError: If the library cannot be located anywhere.
    """
    # Adapted from https://github.com/openai/triton/blob/main/third_party/nvidia/backend/driver.py#L19 # noqa
    # According to https://en.wikipedia.org/wiki/Filesystem_Hierarchy_Standard
    # `/sbin/ldconfig` should exist in all Linux systems.
    # `/sbin/ldconfig` searches the library in the system
    libs = subprocess.check_output(["/sbin/ldconfig", "-p"]).decode()
    # each line looks like the following:
    # libcuda.so.1 (libc6,x86-64) => /lib/x86_64-linux-gnu/libcuda.so.1
    locs = [line.split()[-1] for line in libs.splitlines() if lib_name in line]
    # `LD_LIBRARY_PATH` searches the library in the user-defined paths
    env_ld_library_path = envs.LD_LIBRARY_PATH
    if not locs and env_ld_library_path:
        # NOTE: use `directory`, not `dir`, to avoid shadowing the builtin.
        locs = [
            os.path.join(directory, lib_name)
            for directory in env_ld_library_path.split(":")
            if os.path.exists(os.path.join(directory, lib_name))
        ]
    if not locs:
        raise ValueError(f"Cannot find {lib_name} in the system.")
    return locs[0]
|
||||
|
||||
|
||||
def find_nccl_library() -> str:
    """Resolve the name of the NCCL/RCCL shared library to load.

    The `VLLM_NCCL_SO_PATH` environment variable takes precedence when
    set; otherwise the library shipped with the PyTorch build is chosen.
    After importing `torch`, `libnccl.so.2` or `librccl.so.1` can be
    found by `ctypes` automatically.
    """
    so_file = envs.VLLM_NCCL_SO_PATH

    # manually load the nccl library
    if so_file:
        logger.info(
            "Found nccl from environment variable VLLM_NCCL_SO_PATH=%s", so_file
        )
    elif torch.version.cuda is not None:
        so_file = "libnccl.so.2"
    elif torch.version.hip is not None:
        so_file = "librccl.so.1"
    else:
        raise ValueError("NCCL only supports CUDA and ROCm backends.")
    logger.debug_once("Found nccl from library %s", so_file)
    return so_file
|
||||
|
||||
|
||||
def find_nccl_include_paths() -> list[str] | None:
    """
    We either use the nccl.h specified by the `VLLM_NCCL_INCLUDE_PATH`
    environment variable, or we find the library file brought by
    nvidia-nccl-cuXX. load_inline by default uses
    torch.utils.cpp_extension.include_paths

    Returns:
        Ordered, de-duplicated list of candidate include directories, or
        ``None`` when no candidate containing ``nccl.h`` was found.
    """
    paths: list[str] = []
    inc = envs.VLLM_NCCL_INCLUDE_PATH
    if inc and os.path.isdir(inc):
        paths.append(inc)

    # Probe the pip-installed nvidia-nccl package for bundled headers.
    try:
        spec = importlib.util.find_spec("nvidia.nccl")
        if spec and getattr(spec, "submodule_search_locations", None):
            for loc in spec.submodule_search_locations:
                inc_dir = os.path.join(loc, "include")
                if os.path.exists(os.path.join(inc_dir, "nccl.h")):
                    paths.append(inc_dir)
    except Exception as e:
        # Best-effort probe: the package may be absent or broken. Log the
        # reason instead of swallowing it silently.
        logger.debug("Failed to find nccl include path from nvidia.nccl package: %s", e)

    # Deduplicate while preserving insertion order; drop empty entries.
    seen: set[str] = set()
    out: list[str] = []
    for p in paths:
        if p and p not in seen:
            out.append(p)
            seen.add(p)
    return out or None
|
||||
|
||||
|
||||
def enable_trace_function_call_for_thread(vllm_config: VllmConfig) -> None:
|
||||
"""Set up function tracing for the current thread,
|
||||
if enabled via the VLLM_TRACE_FUNCTION environment variable
|
||||
|
||||
64
vllm/utils/nccl.py
Normal file
64
vllm/utils/nccl.py
Normal file
@@ -0,0 +1,64 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
import os
|
||||
|
||||
import torch
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def find_nccl_library() -> str:
    """Return NCCL/RCCL shared library name to load.

    Uses `VLLM_NCCL_SO_PATH` if set; otherwise chooses by torch backend.
    """
    so_file = envs.VLLM_NCCL_SO_PATH
    if so_file:
        logger.info(
            "Found nccl from environment variable VLLM_NCCL_SO_PATH=%s", so_file
        )
    else:
        # Pick the communication library matching the active torch build.
        is_cuda = torch.version.cuda is not None
        is_rocm = torch.version.hip is not None
        if not (is_cuda or is_rocm):
            raise ValueError("NCCL only supports CUDA and ROCm backends.")
        so_file = "libnccl.so.2" if is_cuda else "librccl.so.1"
    logger.debug_once("Found nccl from library %s", so_file)
    return so_file
|
||||
|
||||
|
||||
def find_nccl_include_paths() -> list[str] | None:
    """Return possible include paths containing `nccl.h`.

    Considers `VLLM_NCCL_INCLUDE_PATH` and the `nvidia-nccl-cuXX` package.
    """
    candidates: list[str] = []

    # User-provided override directory takes precedence.
    override = envs.VLLM_NCCL_INCLUDE_PATH
    if override and os.path.isdir(override):
        candidates.append(override)

    # Probe the pip-installed nvidia-nccl package for bundled headers.
    try:
        spec = importlib.util.find_spec("nvidia.nccl")
        locations = getattr(spec, "submodule_search_locations", None) if spec else None
        if locations:
            for location in locations:
                include_dir = os.path.join(location, "include")
                if os.path.exists(os.path.join(include_dir, "nccl.h")):
                    candidates.append(include_dir)
    except Exception as e:
        logger.debug("Failed to find nccl include path from nvidia.nccl package: %s", e)

    # Deduplicate while preserving insertion order; drop empty entries.
    deduped = list(dict.fromkeys(c for c in candidates if c))
    return deduped or None
|
||||
Reference in New Issue
Block a user