diff --git a/tests/compile/fullgraph/test_basic_correctness.py b/tests/compile/fullgraph/test_basic_correctness.py index 336456995..bec003287 100644 --- a/tests/compile/fullgraph/test_basic_correctness.py +++ b/tests/compile/fullgraph/test_basic_correctness.py @@ -6,7 +6,6 @@ import pytest from vllm.config import CompilationMode from vllm.platforms import current_platform -from vllm.utils.torch_utils import cuda_device_count_stateless from ...utils import compare_all_settings @@ -109,10 +108,10 @@ def test_compile_correctness( tp_size = test_setting.tp_size attn_backend = test_setting.attn_backend method = test_setting.method - if cuda_device_count_stateless() < pp_size * tp_size: + if current_platform.device_count() < pp_size * tp_size: pytest.skip( f"Need at least {pp_size}*{tp_size} CUDA gpus but got " - f"{cuda_device_count_stateless()}" + f"{current_platform.device_count()}" ) final_args = [ diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py index 43fe49037..d5808955b 100644 --- a/tests/compile/test_config.py +++ b/tests/compile/test_config.py @@ -412,7 +412,7 @@ def test_cudagraph_sizes_post_init( with ( ctx, - patch("vllm.config.parallel.cuda_device_count_stateless", return_value=tp_size), + patch.object(current_platform, "device_count", return_value=tp_size), ): kwargs = {} if cudagraph_capture_sizes is not None: diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py index 784918642..031937d7f 100644 --- a/tests/distributed/test_utils.py +++ b/tests/distributed/test_utils.py @@ -13,7 +13,6 @@ from vllm.distributed.utils import StatelessProcessGroup from vllm.platforms import current_platform from vllm.utils.network_utils import get_open_port from vllm.utils.system_utils import update_environment_variables -from vllm.utils.torch_utils import cuda_device_count_stateless from ..utils import multi_gpu_test @@ -21,7 +20,7 @@ from ..utils import multi_gpu_test @ray.remote class _CUDADeviceCountStatelessTestActor: def get_count(self): - return cuda_device_count_stateless() + return current_platform.device_count() def set_cuda_visible_devices(self, cuda_visible_devices: str): update_environment_variables({"CUDA_VISIBLE_DEVICES": cuda_visible_devices}) diff --git a/tests/kernels/moe/test_modular_kernel_combinations.py b/tests/kernels/moe/test_modular_kernel_combinations.py index 19367e7d1..c7295f3ed 100644 --- a/tests/kernels/moe/test_modular_kernel_combinations.py +++ b/tests/kernels/moe/test_modular_kernel_combinations.py @@ -15,7 +15,7 @@ from vllm.config import VllmConfig, set_current_vllm_config from vllm.platforms import current_platform from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe from vllm.utils.import_utils import has_deep_ep, has_deep_gemm -from vllm.utils.torch_utils import cuda_device_count_stateless, set_random_seed +from vllm.utils.torch_utils import set_random_seed from vllm.v1.worker.workspace import init_workspace_manager from .modular_kernel_tools.common import ( @@ -310,10 +310,10 @@ def test_modular_kernel_combinations_multigpu( world_size: int, pytestconfig, ): - if cuda_device_count_stateless() < world_size: + if current_platform.device_count() < world_size: pytest.skip( f"Not enough GPUs available to run, got " - f"{cuda_device_count_stateless()} expected " + f"{current_platform.device_count()} expected " f"{world_size}." ) diff --git a/tests/model_executor/model_loader/test_reload.py b/tests/model_executor/model_loader/test_reload.py index 6a04e7eb3..fc79f6a02 100644 --- a/tests/model_executor/model_loader/test_reload.py +++ b/tests/model_executor/model_loader/test_reload.py @@ -19,7 +19,6 @@ from vllm.model_executor.model_loader.reload.meta import ( from vllm.model_executor.model_loader.reload.types import LayerReloadingInfo from vllm.model_executor.model_loader.reload.utils import get_layer_tensors from vllm.platforms import current_platform -from vllm.utils.torch_utils import cuda_device_count_stateless def test_move_metatensors(): @@ -140,7 +139,7 @@ def test_get_numel_loaded(): ], ) def test_reload_weights(base_model, mul_model, add_model, tp_size, vllm_runner): - if cuda_device_count_stateless() < tp_size: + if current_platform.device_count() < tp_size: pytest.skip(reason="Not enough CUDA devices") if "FP8" in base_model and not current_platform.supports_fp8(): @@ -206,8 +205,8 @@ def test_reload_weights(base_model, mul_model, add_model, tp_size, vllm_runner): def test_online_quantize_reload( base_model, mul_model, add_model, quantization, tp_size, vllm_runner ): - if cuda_device_count_stateless() < tp_size: - pytest.skip(reason="Not enough CUDA devices") + if current_platform.device_count() < tp_size: + pytest.skip(reason="Not enough GPU devices") if quantization == "fp8" and not current_platform.supports_fp8(): pytest.skip(reason="Requires FP8 support") diff --git a/tests/models/quantization/test_gpt_oss.py b/tests/models/quantization/test_gpt_oss.py index 21cc9555b..fe9ddd2f6 100644 --- a/tests/models/quantization/test_gpt_oss.py +++ b/tests/models/quantization/test_gpt_oss.py @@ -21,8 +21,8 @@ import lm_eval import pytest from packaging import version +from vllm.platforms import current_platform from vllm.platforms.rocm import on_gfx950 -from vllm.utils.torch_utils import cuda_device_count_stateless MODEL_ACCURACIES = { # Full quantization: attention linears and MoE linears @@ -89,7 +89,7 @@ def test_gpt_oss_attention_quantization( expected_accuracy: float, monkeypatch: pytest.MonkeyPatch, ): - if tp_size > cuda_device_count_stateless(): + if tp_size > current_platform.device_count(): pytest.skip("Not enough GPUs to run this test case") if "amd/gpt-oss-20b-MoE-Quant-W-MXFP4-A-FP8-KV-FP8" in model_name and on_gfx950(): diff --git a/tests/utils.py b/tests/utils.py index 1264fe81c..84a061f29 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -58,7 +58,6 @@ from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.mem_constants import GB_bytes from vllm.utils.network_utils import get_open_port from vllm.utils.torch_utils import ( - cuda_device_count_stateless, set_random_seed, # noqa: F401 - re-exported for use in test files ) @@ -384,7 +383,7 @@ class RemoteVLLMServer: elif current_platform.is_cuda(): with _nvml(): total_used = 0 - device_count = cuda_device_count_stateless() + device_count = current_platform.device_count() for i in range(device_count): handle = nvmlDeviceGetHandleByIndex(i) mem_info = nvmlDeviceGetMemoryInfo(handle) @@ -1497,7 +1496,7 @@ def multi_gpu_marks(*, num_gpus: int): """Get a collection of pytest marks to apply for `@multi_gpu_test`.""" test_selector = pytest.mark.distributed(num_gpus=num_gpus) test_skipif = pytest.mark.skipif( - cuda_device_count_stateless() < num_gpus, + current_platform.device_count() < num_gpus, reason=f"Need at least {num_gpus} GPUs to run the test.", ) @@ -1529,7 +1528,7 @@ def gpu_tier_mark(*, min_gpus: int = 1, max_gpus: int | None = None): @gpu_tier_mark(max_gpus=1) # only on single-GPU @gpu_tier_mark(min_gpus=2, max_gpus=4) # 2-4 GPUs only """ - gpu_count = cuda_device_count_stateless() + gpu_count = current_platform.device_count() marks = [] if min_gpus > 1: diff --git a/tests/v1/shutdown/test_delete.py b/tests/v1/shutdown/test_delete.py index ee04dfad3..adf99fb92 100644 --- a/tests/v1/shutdown/test_delete.py +++ b/tests/v1/shutdown/test_delete.py @@ -11,8 +11,8 @@ from tests.v1.shutdown.utils import ( ) from vllm import LLM, SamplingParams from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.platforms import current_platform from vllm.sampling_params import RequestOutputKind -from vllm.utils.torch_utils import cuda_device_count_stateless from vllm.v1.engine.async_llm import AsyncLLM MODELS = ["hmellor/tiny-random-LlamaForCausalLM"] @@ -34,7 +34,7 @@ async def test_async_llm_delete( tensor_parallel_size: degree of tensor parallelism send_one_request: send one request to engine before deleting """ - if cuda_device_count_stateless() < tensor_parallel_size: + if current_platform.device_count() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") engine_args = AsyncEngineArgs( @@ -83,7 +83,7 @@ def test_llm_delete( enable_multiprocessing: enable workers in separate process(es) send_one_request: send one request to engine before deleting """ - if cuda_device_count_stateless() < tensor_parallel_size: + if current_platform.device_count() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") with monkeypatch.context() as m: diff --git a/tests/v1/shutdown/test_forward_error.py b/tests/v1/shutdown/test_forward_error.py index 4625bc174..eadb1abb6 100644 --- a/tests/v1/shutdown/test_forward_error.py +++ b/tests/v1/shutdown/test_forward_error.py @@ -15,7 +15,7 @@ from tests.v1.shutdown.utils import ( from vllm import LLM, AsyncEngineArgs, SamplingParams from vllm.distributed import get_tensor_model_parallel_rank from vllm.model_executor.models.llama import LlamaForCausalLM -from vllm.utils.torch_utils import cuda_device_count_stateless +from vllm.platforms import current_platform from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.exceptions import EngineDeadError @@ -60,7 +60,7 @@ async def test_async_llm_model_error( AsyncLLM always uses an MP client. """ - if cuda_device_count_stateless() < tensor_parallel_size: + if current_platform.device_count() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") # Monkeypatch an error in the model. @@ -126,7 +126,7 @@ def test_llm_model_error( TODO(andy) - LLM without multiprocessing; LLM with multiprocessing and >1 rank """ - if cuda_device_count_stateless() < tensor_parallel_size: + if current_platform.device_count() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") with monkeypatch.context() as m: diff --git a/tests/v1/shutdown/test_startup_error.py b/tests/v1/shutdown/test_startup_error.py index 4b5661a52..03982aa48 100644 --- a/tests/v1/shutdown/test_startup_error.py +++ b/tests/v1/shutdown/test_startup_error.py @@ -15,7 +15,7 @@ from vllm import LLM from vllm.distributed import get_tensor_model_parallel_rank from vllm.engine.arg_utils import AsyncEngineArgs from vllm.model_executor.models.llama import LlamaForCausalLM -from vllm.utils.torch_utils import cuda_device_count_stateless +from vllm.platforms import current_platform from vllm.v1.engine.async_llm import AsyncLLM MODELS = ["hmellor/tiny-random-LlamaForCausalLM"] @@ -57,7 +57,7 @@ def test_async_llm_startup_error( Test profiling (forward()) and load weights failures. AsyncLLM always uses an MP client. """ - if cuda_device_count_stateless() < tensor_parallel_size: + if current_platform.device_count() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") # Monkeypatch an error in the model. @@ -99,7 +99,7 @@ def test_llm_startup_error( # If MODELS list grows, each architecture needs its own test variant. if model != "JackFram/llama-68m": pytest.skip(reason="Only test JackFram/llama-68m") - if cuda_device_count_stateless() < tensor_parallel_size: + if current_platform.device_count() < tensor_parallel_size: pytest.skip(reason="Not enough CUDA devices") with monkeypatch.context() as m: diff --git a/tools/pre_commit/check_torch_cuda.py b/tools/pre_commit/check_torch_cuda.py index ea84618a0..9bc4ab56d 100644 --- a/tools/pre_commit/check_torch_cuda.py +++ b/tools/pre_commit/check_torch_cuda.py @@ -10,6 +10,8 @@ import regex as re _TORCH_CUDA_PATTERNS = [ r"\btorch\.cuda\.(empty_cache|synchronize|device_count|current_device|memory_reserved|memory_allocated|max_memory_allocated|max_memory_reserved|reset_peak_memory_stats|memory_stats|set_device|device\()\b", r"\bwith\storch\.cuda\.device\b", + # Calls torch.cuda.{_is_compiled/_device_count_amdsmi/_device_count_nvml} internally + r"\bcuda_device_count_stateless\(\)\b", ] ALLOWED_FILES = {"vllm/platforms/", "vllm/device_allocator/"} diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index ef51cbdf9..107dcfa27 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -16,7 +16,6 @@ from vllm.config.utils import config from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.utils.network_utils import get_open_ports_list -from vllm.utils.torch_utils import cuda_device_count_stateless if TYPE_CHECKING: from ray.runtime_env import RuntimeEnv @@ -726,9 +725,9 @@ class ParallelConfig: backend = "mp" elif ( current_platform.is_cuda() - and cuda_device_count_stateless() < self.world_size + and current_platform.device_count() < self.world_size ): - gpu_count = cuda_device_count_stateless() + gpu_count = current_platform.device_count() raise ValueError( f"World size ({self.world_size}) is larger than the number of " f"available GPUs ({gpu_count}) in this node. If this is " diff --git a/vllm/distributed/device_communicators/all_reduce_utils.py b/vllm/distributed/device_communicators/all_reduce_utils.py index 108afa195..cebf2c49b 100644 --- a/vllm/distributed/device_communicators/all_reduce_utils.py +++ b/vllm/distributed/device_communicators/all_reduce_utils.py @@ -19,8 +19,8 @@ import torch.multiprocessing as mp import vllm.envs as envs from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary from vllm.logger import init_logger +from vllm.platforms import current_platform from vllm.utils.system_utils import update_environment_variables -from vllm.utils.torch_utils import cuda_device_count_stateless logger = init_logger(__name__) @@ -320,7 +320,7 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool: is_distributed = dist.is_initialized() - num_dev = cuda_device_count_stateless() + num_dev = current_platform.device_count() cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES if cuda_visible_devices is None: cuda_visible_devices = ",".join(str(i) for i in range(num_dev)) diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index 02591805a..65a196264 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -17,7 +17,6 @@ from vllm.distributed.device_communicators.all_reduce_utils import ( from vllm.distributed.parallel_state import in_the_same_node_as from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils.torch_utils import cuda_device_count_stateless try: ops.meta_size() @@ -135,7 +134,7 @@ class CustomAllreduce: if cuda_visible_devices: device_ids = list(map(int, cuda_visible_devices.split(","))) else: - device_ids = list(range(cuda_device_count_stateless())) + device_ids = list(range(current_platform.device_count())) physical_device_id = device_ids[device.index] tensor = torch.tensor([physical_device_id], dtype=torch.int, device="cpu") diff --git a/vllm/distributed/device_communicators/quick_all_reduce.py b/vllm/distributed/device_communicators/quick_all_reduce.py index 7670ec134..ca62eac1c 100644 --- a/vllm/distributed/device_communicators/quick_all_reduce.py +++ b/vllm/distributed/device_communicators/quick_all_reduce.py @@ -13,7 +13,6 @@ from vllm.config import get_current_vllm_config_or_none from vllm.distributed.parallel_state import in_the_same_node_as from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils.torch_utils import cuda_device_count_stateless logger = init_logger(__name__) @@ -137,7 +136,7 @@ class QuickAllReduce: if cuda_visible_devices: device_ids = list(map(int, cuda_visible_devices.split(","))) else: - device_ids = list(range(cuda_device_count_stateless())) + device_ids = list(range(current_platform.device_count())) physical_device_id = device_ids[device.index] tensor = torch.tensor([physical_device_id], dtype=torch.int, device="cpu") gather_list = [ diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 50a79cbb0..71c8e3a35 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -9,7 +9,7 @@ from __future__ import annotations import os from collections.abc import Callable from datetime import timedelta -from functools import cache, wraps +from functools import cache, lru_cache, wraps from typing import TYPE_CHECKING, TypeVar import torch @@ -20,9 +20,9 @@ from typing_extensions import ParamSpec # import custom ops, trigger op registration import vllm._C # noqa import vllm._C_stable_libtorch # noqa +import vllm.envs as envs from vllm.logger import init_logger from vllm.utils.import_utils import import_pynvml -from vllm.utils.torch_utils import cuda_device_count_stateless from vllm.v1.attention.backends.registry import AttentionBackendEnum from .interface import DeviceCapability, Platform, PlatformEnum @@ -47,6 +47,32 @@ pynvml = import_pynvml() torch.backends.cuda.enable_cudnn_sdp(False) +@lru_cache(maxsize=8) +def _cuda_device_count_stateless(cuda_visible_devices: str | None = None) -> int: + """Get number of CUDA devices, caching based on the value of CUDA_VISIBLE_DEVICES + at the time of call. + + This should be used instead of torch.accelerator.device_count() unless + CUDA_VISIBLE_DEVICES has already been set to the desired value. + + # This can be removed and simply replaced with torch.cuda.get_device_count + # after https://github.com/pytorch/pytorch/pull/122815 is released.""" + # Note: cuda_visible_devices is not used, but we keep it as an argument for + # LRU Cache purposes. + + # Code below is based on + # https://github.com/pytorch/pytorch/blob/ + # c1cd946818442aca8c7f812b16d187ce1586c3bc/ + # torch/cuda/__init__.py#L831C1-L831C17 + import torch.cuda + + if not torch.cuda._is_compiled(): + return 0 + raw_count = torch.cuda._device_count_nvml() + r = torch._C._cuda_getDeviceCount() if raw_count < 0 else raw_count + return r + + @cache def _get_backend_priorities( use_mla: bool, @@ -456,7 +482,7 @@ class CudaPlatformBase(Platform): @classmethod def device_count(cls) -> int: - return cuda_device_count_stateless() + return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES) @classmethod def check_if_supports_dtype(cls, dtype: torch.dtype): diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index d3836a467..13ff8821f 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -13,7 +13,6 @@ from torch.distributed.distributed_c10d import is_nccl_available import vllm.envs as envs from vllm.logger import init_logger -from vllm.utils.torch_utils import cuda_device_count_stateless from vllm.v1.attention.backends.registry import AttentionBackendEnum from .interface import DeviceCapability, Platform, PlatformEnum @@ -67,6 +66,38 @@ _ROCM_DEVICE_ID_NAME_MAP: dict[str, str] = { } +@lru_cache(maxsize=8) +def _rocm_device_count_stateless(cuda_visible_devices: str | None = None) -> int: + """Get number of ROCm devices, caching based on the value of CUDA_VISIBLE_DEVICES + at the time of call. + + This should be used instead of torch.accelerator.device_count() unless + CUDA_VISIBLE_DEVICES has already been set to the desired value. + + # This can be removed and simply replaced with torch.cuda.get_device_count + # after https://github.com/pytorch/pytorch/pull/122815 is released.""" + # Note: cuda_visible_devices is not used, but we keep it as an argument for + # LRU Cache purposes. + + # Code below is based on + # https://github.com/pytorch/pytorch/blob/ + # c1cd946818442aca8c7f812b16d187ce1586c3bc/ + # torch/cuda/__init__.py#L831C1-L831C17 + import torch.cuda + + if not torch.cuda._is_compiled(): + return 0 + # ROCm uses amdsmi instead of nvml for stateless device count + # This requires a sufficiently modern version of Torch 2.4.0 + raw_count = ( + torch.cuda._device_count_amdsmi() + if (hasattr(torch.cuda, "_device_count_amdsmi")) + else -1 + ) + r = torch._C._cuda_getDeviceCount() if raw_count < 0 else raw_count + return r + + def _sync_hip_cuda_env_vars(): """Ensure HIP_VISIBLE_DEVICES and CUDA_VISIBLE_DEVICES are consistent. Treats empty string as unset. Raises on genuine conflicts.""" @@ -810,7 +841,7 @@ class RocmPlatform(Platform): @classmethod def device_count(cls) -> int: - return cuda_device_count_stateless() + return _rocm_device_count_stateless(getattr(envs, cls.device_control_env_var)) @classmethod def check_if_supports_dtype(cls, dtype: torch.dtype): diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index fdae7d9b7..b2ce46c7b 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -22,7 +22,6 @@ import vllm.envs as envs from vllm.connections import global_http_connection from vllm.logger import init_logger from vllm.utils.platform_utils import cuda_get_device_properties -from vllm.utils.torch_utils import cuda_device_count_stateless from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) @@ -196,7 +195,7 @@ class UsageMessage: from vllm.platforms import current_platform if current_platform.is_cuda_alike(): - self.gpu_count = cuda_device_count_stateless() + self.gpu_count = current_platform.device_count() self.gpu_type, self.gpu_memory_per_device = cuda_get_device_properties( 0, ("name", "total_memory") ) diff --git a/vllm/utils/torch_utils.py b/vllm/utils/torch_utils.py index b4c77e548..bd9741024 100644 --- a/vllm/utils/torch_utils.py +++ b/vllm/utils/torch_utils.py @@ -6,7 +6,6 @@ import os import random import threading from collections.abc import Callable, Collection -from functools import lru_cache from typing import TYPE_CHECKING, Any, TypeVar import numpy as np @@ -16,7 +15,6 @@ from packaging import version from packaging.version import Version from torch.library import Library, infer_schema -import vllm.envs as envs from vllm.logger import init_logger if TYPE_CHECKING: @@ -590,49 +588,6 @@ def aux_stream() -> torch.cuda.Stream | None: return _aux_stream -@lru_cache(maxsize=8) -def _cuda_device_count_stateless(cuda_visible_devices: str | None = None) -> int: - # Note: cuda_visible_devices is not used, but we keep it as an argument for - # LRU Cache purposes. - - # Code below is based on - # https://github.com/pytorch/pytorch/blob/ - # c1cd946818442aca8c7f812b16d187ce1586c3bc/ - # torch/cuda/__init__.py#L831C1-L831C17 - import torch.cuda - import torch.version - - from vllm.platforms import current_platform - - if not torch.cuda._is_compiled(): - return 0 - if current_platform.is_rocm(): - # ROCm uses amdsmi instead of nvml for stateless device count - # This requires a sufficiently modern version of Torch 2.4.0 - raw_count = ( - torch.cuda._device_count_amdsmi() - if (hasattr(torch.cuda, "_device_count_amdsmi")) - else -1 - ) - else: - raw_count = torch.cuda._device_count_nvml() - r = torch._C._cuda_getDeviceCount() if raw_count < 0 else raw_count - return r - - -def cuda_device_count_stateless() -> int: - """Get number of CUDA devices, caching based on the value of - CUDA_VISIBLE_DEVICES at the time of call. - - This should be used instead of torch.accelerator.device_count() - unless CUDA_VISIBLE_DEVICES has already been set to the desired - value.""" - - # This can be removed and simply replaced with torch.cuda.get_device_count - # after https://github.com/pytorch/pytorch/pull/122815 is released. - return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES) - - def weak_ref_tensor(tensor: Any) -> Any: """ Create a weak reference to a tensor. diff --git a/vllm/v1/executor/ray_utils.py b/vllm/v1/executor/ray_utils.py index dd82cfb99..c363a837b 100644 --- a/vllm/v1/executor/ray_utils.py +++ b/vllm/v1/executor/ray_utils.py @@ -369,9 +369,7 @@ def initialize_ray_cluster( # Prevalidate GPU requirements before Ray processing if current_platform.is_cuda() and parallel_config.world_size > 1: - from vllm.utils.torch_utils import cuda_device_count_stateless - - available_gpus = cuda_device_count_stateless() + available_gpus = current_platform.device_count() if parallel_config.world_size > available_gpus: logger.warning( "Tensor parallel size (%d) exceeds available GPUs (%d). "