Revert #29787 and #29690 (#29815)

2025-12-01 13:42:03 -08:00
parent eaf81485ed
commit 1336a1ea24
9 changed files with 185 additions and 580 deletions
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -74,7 +74,7 @@ if TYPE_CHECKING:
    VLLM_MEDIA_CONNECTOR: str = "http"
    VLLM_MM_INPUT_CACHE_GIB: int = 4
    VLLM_TARGET_DEVICE: str = "cuda"
-    VLLM_MAIN_CUDA_VERSION: str = "12.9"
+    VLLM_MAIN_CUDA_VERSION: str = "12.8"
    MAX_JOBS: str | None = None
    NVCC_THREADS: str | None = None
    VLLM_USE_PRECOMPILED: bool = False
@@ -445,9 +445,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # Target device of vLLM, supporting [cuda (by default),
    # rocm, cpu]
    "VLLM_TARGET_DEVICE": lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda").lower(),
-    # Main CUDA version of vLLM. This follows PyTorch but can be overridden.
+    # Main CUDA version of vLLM, supporting [12.6, 12.8, 12.9];
+    # 12.8 is the default. This follows PyTorch but can be overridden.
    "VLLM_MAIN_CUDA_VERSION": lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower()
-    or "12.9",
+    or "12.8",
    # Maximum number of compilation jobs to run in parallel.
    # By default this is the number of CPUs
    "MAX_JOBS": lambda: os.getenv("MAX_JOBS", None),