[Bugfix] Fix CUDA compatibility path setting for both datacenter and consumer NVIDIA GPUs (#33992)

Signed-off-by: Seungmin Kim <8457324+ehfd@users.noreply.github.com>
Signed-off-by: Andrew Mello <19512127+88plug@users.noreply.github.com>
Co-authored-by: 88plug <19512127+88plug@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
This commit is contained in:
Seungmin Kim
2026-02-26 11:15:51 +09:00
committed by GitHub
parent 9511a3f8ee
commit 160424a937
6 changed files with 334 additions and 5 deletions

View File

@@ -1,7 +1,89 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E402
import importlib.util
import os
def _get_torch_cuda_version():
"""Peripheral function to _maybe_set_cuda_compatibility_path().
PyTorch version must not be determined by importing directly
because it will trigger the CUDA initialization, losing the
chance to set the LD_LIBRARY_PATH beforehand.
"""
try:
spec = importlib.util.find_spec("torch")
if not spec:
return None
if spec.origin:
torch_root = os.path.dirname(spec.origin)
elif spec.submodule_search_locations:
torch_root = spec.submodule_search_locations[0]
else:
return None
version_path = os.path.join(torch_root, "version.py")
if not os.path.exists(version_path):
return None
# Load the version module without importing torch
ver_spec = importlib.util.spec_from_file_location("torch.version", version_path)
if not ver_spec or not ver_spec.loader:
return None
module = importlib.util.module_from_spec(ver_spec)
# Avoid registering in sys.modules to not confuse future imports
ver_spec.loader.exec_module(module)
return getattr(module, "cuda", None)
except Exception:
return None
def _maybe_set_cuda_compatibility_path():
"""Set LD_LIBRARY_PATH for CUDA forward compatibility if enabled.
Must run before 'import torch' since torch loads CUDA shared libraries
at import time and the dynamic linker only consults LD_LIBRARY_PATH when
a library is first loaded.
CUDA forward compatibility is only supported on select professional and
datacenter NVIDIA GPUs. Consumer GPUs (GeForce, RTX) do not support it
and will get Error 803 if compat libs are loaded.
"""
enable = os.environ.get("VLLM_ENABLE_CUDA_COMPATIBILITY", "0").strip().lower() in (
"1",
"true",
)
if not enable:
return
cuda_compat_path = os.environ.get("VLLM_CUDA_COMPATIBILITY_PATH", "")
if not cuda_compat_path or not os.path.isdir(cuda_compat_path):
conda_prefix = os.environ.get("CONDA_PREFIX", "")
conda_compat = os.path.join(conda_prefix, "cuda-compat")
if conda_prefix and os.path.isdir(conda_compat):
cuda_compat_path = conda_compat
if not cuda_compat_path or not os.path.isdir(cuda_compat_path):
torch_cuda_version = _get_torch_cuda_version()
if torch_cuda_version:
default_path = f"/usr/local/cuda-{torch_cuda_version}/compat"
if os.path.isdir(default_path):
cuda_compat_path = default_path
if not cuda_compat_path or not os.path.isdir(cuda_compat_path):
return
norm_path = os.path.normpath(cuda_compat_path)
existing = os.environ.get("LD_LIBRARY_PATH", "")
ld_paths = existing.split(os.pathsep) if existing else []
if ld_paths and ld_paths[0] and os.path.normpath(ld_paths[0]) == norm_path:
return # Already at the front
new_paths = [norm_path] + [
p for p in ld_paths if not p or os.path.normpath(p) != norm_path
]
os.environ["LD_LIBRARY_PATH"] = os.pathsep.join(new_paths)
# Must run BEFORE the torch import below: torch loads CUDA shared libraries
# at import time, and LD_LIBRARY_PATH is only consulted on first load.
_maybe_set_cuda_compatibility_path()
import torch
from vllm.logger import init_logger