[Bugfix] Fix CUDA compatibility path setting for both datacenter and consumer NVIDIA GPUs (#33992)

Signed-off-by: Seungmin Kim <8457324+ehfd@users.noreply.github.com>
Signed-off-by: Andrew Mello <19512127+88plug@users.noreply.github.com>
Co-authored-by: 88plug <19512127+88plug@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
This commit is contained in:
Seungmin Kim
2026-02-26 11:15:51 +09:00
committed by GitHub
parent 9511a3f8ee
commit 160424a937
6 changed files with 334 additions and 5 deletions

View File

@@ -1,7 +1,89 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E402
import importlib.util
import os
def _get_torch_cuda_version():
"""Peripheral function to _maybe_set_cuda_compatibility_path().
PyTorch version must not be determined by importing directly
because it will trigger the CUDA initialization, losing the
chance to set the LD_LIBRARY_PATH beforehand.
"""
try:
spec = importlib.util.find_spec("torch")
if not spec:
return None
if spec.origin:
torch_root = os.path.dirname(spec.origin)
elif spec.submodule_search_locations:
torch_root = spec.submodule_search_locations[0]
else:
return None
version_path = os.path.join(torch_root, "version.py")
if not os.path.exists(version_path):
return None
# Load the version module without importing torch
ver_spec = importlib.util.spec_from_file_location("torch.version", version_path)
if not ver_spec or not ver_spec.loader:
return None
module = importlib.util.module_from_spec(ver_spec)
# Avoid registering in sys.modules to not confuse future imports
ver_spec.loader.exec_module(module)
return getattr(module, "cuda", None)
except Exception:
return None
def _maybe_set_cuda_compatibility_path():
"""Set LD_LIBRARY_PATH for CUDA forward compatibility if enabled.
Must run before 'import torch' since torch loads CUDA shared libraries
at import time and the dynamic linker only consults LD_LIBRARY_PATH when
a library is first loaded.
CUDA forward compatibility is only supported on select professional and
datacenter NVIDIA GPUs. Consumer GPUs (GeForce, RTX) do not support it
and will get Error 803 if compat libs are loaded.
"""
enable = os.environ.get("VLLM_ENABLE_CUDA_COMPATIBILITY", "0").strip().lower() in (
"1",
"true",
)
if not enable:
return
cuda_compat_path = os.environ.get("VLLM_CUDA_COMPATIBILITY_PATH", "")
if not cuda_compat_path or not os.path.isdir(cuda_compat_path):
conda_prefix = os.environ.get("CONDA_PREFIX", "")
conda_compat = os.path.join(conda_prefix, "cuda-compat")
if conda_prefix and os.path.isdir(conda_compat):
cuda_compat_path = conda_compat
if not cuda_compat_path or not os.path.isdir(cuda_compat_path):
torch_cuda_version = _get_torch_cuda_version()
if torch_cuda_version:
default_path = f"/usr/local/cuda-{torch_cuda_version}/compat"
if os.path.isdir(default_path):
cuda_compat_path = default_path
if not cuda_compat_path or not os.path.isdir(cuda_compat_path):
return
norm_path = os.path.normpath(cuda_compat_path)
existing = os.environ.get("LD_LIBRARY_PATH", "")
ld_paths = existing.split(os.pathsep) if existing else []
if ld_paths and ld_paths[0] and os.path.normpath(ld_paths[0]) == norm_path:
return # Already at the front
new_paths = [norm_path] + [
p for p in ld_paths if not p or os.path.normpath(p) != norm_path
]
os.environ["LD_LIBRARY_PATH"] = os.pathsep.join(new_paths)
# Must run BEFORE the torch import below: torch loads CUDA shared libraries
# at import time, and LD_LIBRARY_PATH is only consulted on first load.
_maybe_set_cuda_compatibility_path()
import torch
from vllm.logger import init_logger