Add cuda_device_count_stateless (#5473)

2024-06-13 16:06:49 -07:00
parent e38042d4af
commit 50eed24d25
8 changed files with 79 additions and 23 deletions
--- a/vllm/executor/multiproc_gpu_executor.py
+++ b/vllm/executor/multiproc_gpu_executor.py
@@ -9,7 +9,8 @@ from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
                                                  ResultHandler, WorkerMonitor)
 from vllm.logger import init_logger
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
-from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+from vllm.utils import (cuda_device_count_stateless,
+                        get_distributed_init_method, get_ip, get_open_port,
                        get_vllm_instance_id, make_async)

 logger = init_logger(__name__)
@@ -33,8 +34,7 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
        # Disable torch async compiling which won't work with daemonic processes
        os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"

-        from torch.cuda import device_count
-        assert world_size <= device_count(), (
+        assert world_size <= cuda_device_count_stateless(), (
            "please set tensor_parallel_size to less than max local gpu count")

        distributed_init_method = get_distributed_init_method(