diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 0178d23b7..3409f04a1 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -15,8 +15,6 @@ FROM ${BASE_IMAGE} AS base
 
 ARG ARG_PYTORCH_ROCM_ARCH
 ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
-ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
-ENV RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1
 
 # Install some basic utilities
 RUN apt-get update -q -y && apt-get install -q -y \
diff --git a/tests/config/test_config_generation.py b/tests/config/test_config_generation.py
index 61c3df0a2..23ceb920c 100644
--- a/tests/config/test_config_generation.py
+++ b/tests/config/test_config_generation.py
@@ -60,6 +60,11 @@ def test_ray_runtime_env(monkeypatch: pytest.MonkeyPatch):
     runtime_env = {
         "env_vars": {
             "TEST_ENV_VAR": "test_value",
+            # In future Ray versions this will be the default: when a task or
+            # actor is created with num_gpus=None/0, Ray will not override the
+            # visible-devices env var. Overriding it would result in no GPUs
+            # being visible on a GPU machine.
+            "RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO": "0",
         },
     }
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index fbe791f8a..0c0bd7db3 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -102,6 +102,9 @@ class CudaPlatformBase(Platform):
     ray_device_key: str = "GPU"
     dist_backend: str = "nccl"
     device_control_env_var: str = "CUDA_VISIBLE_DEVICES"
+    ray_noset_device_env_vars: list[str] = [
+        "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES",
+    ]
 
     @property
     def supported_dtypes(self) -> list[torch.dtype]:
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index f0e7ee0da..c3b189e01 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -116,6 +116,11 @@ class Platform:
     # https://github.com/ray-project/ray/tree/master/python/ray/_private/accelerators  # noqa
     device_control_env_var: str = "VLLM_DEVICE_CONTROL_ENV_VAR_PLACEHOLDER"
 
+    # Environment variables that need to be set to "1" to prevent Ray from
+    # setting the visible devices, e.g.
+    # RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES.
+    ray_noset_device_env_vars: list[str] = []
+
     # The torch.compile backend for compiling simple and
     # standalone functions. The default value is "inductor" to keep
     # the same behavior as PyTorch.
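For reference, the effect of these NOSET variables can be demonstrated with a minimal Ray program. This is a hedged sketch assuming Ray is installed on a CUDA node; the task below is illustrative and not part of this change:

import os

import ray

# With the NOSET flag in the runtime_env, Ray leaves CUDA_VISIBLE_DEVICES
# untouched instead of narrowing it to the task's allocated GPUs.
ray.init(runtime_env={
    "env_vars": {"RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES": "1"},
})

@ray.remote(num_gpus=1)
def visible_devices() -> str:
    # Reports whatever the node environment already had,
    # not Ray's per-task override.
    return os.environ.get("CUDA_VISIBLE_DEVICES", "<unset>")

print(ray.get(visible_devices.remote()))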
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 6f4c235bb..2a9bd53e4 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -194,6 +194,11 @@ class RocmPlatform(Platform):
     dist_backend: str = "nccl"
     # ROCm shares the same device control env var as CUDA
     device_control_env_var: str = "CUDA_VISIBLE_DEVICES"
+    ray_noset_device_env_vars: list[str] = [
+        "RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES",
+        "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES",
+        "RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES",
+    ]
 
     supported_quantization: list[str] = [
         "awq",
diff --git a/vllm/v1/executor/ray_executor.py b/vllm/v1/executor/ray_executor.py
index c8c6185b6..a1f69c478 100644
--- a/vllm/v1/executor/ray_executor.py
+++ b/vllm/v1/executor/ray_executor.py
@@ -69,6 +69,8 @@ class RayDistributedExecutor(Executor):
         "VLLM_HOST_PORT",
         "LOCAL_RANK",
         "CUDA_VISIBLE_DEVICES",
+        "HIP_VISIBLE_DEVICES",
+        "ROCR_VISIBLE_DEVICES",
     }
 
     # These non-vLLM env vars are copied from the driver to workers
@@ -146,6 +148,14 @@
 
         return ray_remote_kwargs
 
+    def _update_noset_device_env_vars(self, ray_remote_kwargs):
+        runtime_env = ray_remote_kwargs.setdefault("runtime_env", {})
+        env_vars = runtime_env.setdefault("env_vars", {})
+        env_vars.update(
+            {env_var: "1" for env_var in current_platform.ray_noset_device_env_vars}
+        )
+        return ray_remote_kwargs
+
     # child class could overwrite this to return actual env vars.
     def _get_env_vars_to_be_updated(self):
         return self._env_vars_for_all_workers
@@ -169,6 +179,11 @@
             ray_remote_kwargs
         )
 
+        # The way Ray actors are set up in vLLM, the visible devices are not
+        # set for the actors; they are left unset by Ray. Internally we index
+        # the right GPU with local_rank, similar to how mp mode works.
+        self._update_noset_device_env_vars(ray_remote_kwargs)
+
         # Create the workers.
         bundle_indices: list[int]
         if envs.VLLM_RAY_BUNDLE_INDICES:
@@ -303,6 +318,15 @@
         )
 
         # Set environment variables for the driver and workers.
+        # We set CUDA_VISIBLE_DEVICES to ALL GPUs on the node for each worker.
+        # This is needed because:
+        # 1. Ray's compiled DAG needs to find the allocated GPU in
+        #    CUDA_VISIBLE_DEVICES.
+        # 2. vLLM's communication layer (NCCL, CustomAllreduce) needs to see
+        #    all GPUs for P2P checks and communication setup. (If this were
+        #    the only reason, we could also have simply left the visible
+        #    devices unset.)
+        # Each worker then uses local_rank to index into the visible devices.
         all_args_to_update_environment_variables = [
             {
                 current_platform.device_control_env_var: ",".join(
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index f6e59526e..b451db382 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -209,6 +209,7 @@ class Worker(WorkerBase):
                 f"be less than or equal to the number of visible devices "
                 f"({visible_device_count})."
             )
+            self.device = torch.device(f"cuda:{self.local_rank}")
             current_platform.set_device(self.device)
diff --git a/vllm/v1/worker/worker_base.py b/vllm/v1/worker/worker_base.py
index d34eb5253..eed371e98 100644
--- a/vllm/v1/worker/worker_base.py
+++ b/vllm/v1/worker/worker_base.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import os
 from collections.abc import Callable
 from typing import TYPE_CHECKING, Any, TypeVar
 
@@ -221,11 +220,6 @@ class WorkerWrapperBase:
         envs_list: list[dict[str, str]],
     ) -> None:
         envs = envs_list[self.rpc_rank]
-        key = "CUDA_VISIBLE_DEVICES"
-        if key in envs and key in os.environ:
-            # overwriting CUDA_VISIBLE_DEVICES is desired behavior
-            # suppress the warning in `update_environment_variables`
-            del os.environ[key]
         update_environment_variables(envs)
 
     def init_worker(self, all_kwargs: list[dict[str, Any]]) -> None:
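To make the executor change concrete, here is a hypothetical illustration of what _update_noset_device_env_vars does to the actor kwargs on a ROCm platform. The input kwargs below are made up for the example:

kwargs = {"num_cpus": 1, "runtime_env": {"env_vars": {"FOO": "bar"}}}
noset_vars = [
    "RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES",
    "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES",
    "RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES",
]
# Same setdefault chain as the method above: create runtime_env/env_vars
# if absent, then merge in the NOSET flags without clobbering other vars.
env_vars = kwargs.setdefault("runtime_env", {}).setdefault("env_vars", {})
env_vars.update({v: "1" for v in noset_vars})
assert env_vars["FOO"] == "bar"  # pre-existing vars survive the merge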
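The worker side of the scheme, as a simplified sketch combining the driver and worker steps in one function. The function name and arguments are illustrative, not vLLM's actual code, which lives in ray_executor.py and gpu_worker.py:

import os

import torch

def bind_worker_device(local_rank: int, node_gpu_ids: list[int]) -> torch.device:
    # Driver side: expose ALL GPUs on the node to every worker.
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(i) for i in node_gpu_ids)
    # Worker side: pick this worker's GPU by local_rank, mirroring
    # torch.device(f"cuda:{self.local_rank}") in the diff above.
    if local_rank >= len(node_gpu_ids):
        raise ValueError(
            f"local_rank ({local_rank}) must be less than the number of "
            f"visible devices ({len(node_gpu_ids)})."
        )
    device = torch.device(f"cuda:{local_rank}")
    torch.cuda.set_device(device)
    return device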