diff --git a/benchmarks/attention_benchmarks/mla_runner.py b/benchmarks/attention_benchmarks/mla_runner.py index 110f580fb..3c1ca4b3d 100644 --- a/benchmarks/attention_benchmarks/mla_runner.py +++ b/benchmarks/attention_benchmarks/mla_runner.py @@ -757,7 +757,7 @@ def _run_mla_benchmark_batched( backend_cfg = _get_backend_config(backend) device = torch.device(configs_with_params[0][0].device) - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) # Determine block size config_block_size = configs_with_params[0][0].block_size diff --git a/benchmarks/attention_benchmarks/runner.py b/benchmarks/attention_benchmarks/runner.py index 7f968cfec..52286186d 100644 --- a/benchmarks/attention_benchmarks/runner.py +++ b/benchmarks/attention_benchmarks/runner.py @@ -443,7 +443,7 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult: BenchmarkResult with timing and memory statistics """ device = torch.device(config.device) - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) backend_cfg = _get_backend_config(config.backend) diff --git a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py index 58ccfcc45..3f80b024e 100644 --- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py +++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py @@ -64,7 +64,7 @@ def bench_run( per_out_ch: bool, mkn: tuple[int, int, int], ): - init_workspace_manager(torch.cuda.current_device()) + init_workspace_manager(torch.accelerator.current_device_index()) (m, k, n) = mkn dtype = torch.half diff --git a/benchmarks/kernels/benchmark_device_communicators.py b/benchmarks/kernels/benchmark_device_communicators.py index 9b5ccac4e..24e22023b 100644 --- a/benchmarks/kernels/benchmark_device_communicators.py +++ b/benchmarks/kernels/benchmark_device_communicators.py @@ -495,7 +495,7 @@ def main(): # Set device device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) + 
torch.accelerator.set_device_index(device) # Get CPU process group cpu_group = dist.new_group(backend="gloo") diff --git a/benchmarks/kernels/benchmark_fused_collective.py b/benchmarks/kernels/benchmark_fused_collective.py index 2547f553f..05b842d7e 100644 --- a/benchmarks/kernels/benchmark_fused_collective.py +++ b/benchmarks/kernels/benchmark_fused_collective.py @@ -392,7 +392,7 @@ def benchmark_operation( num_op_per_cudagraph = 10 # Use vLLM's graph_capture to make tensor_model_parallel_all_reduce graph-safe - device = torch.device(f"cuda:{torch.cuda.current_device()}") + device = torch.device(f"cuda:{torch.accelerator.current_device_index()}") with graph_capture(device=device), torch.cuda.graph(graph): for _ in range(num_op_per_cudagraph): operation_func(*args, **kwargs) @@ -984,7 +984,7 @@ def main(): world_size = int(os.environ["WORLD_SIZE"]) device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) init_distributed_environment() diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py index 039eb2f29..dd4060bbd 100644 --- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py +++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -50,7 +50,7 @@ def bench_run( per_out_ch: bool, mkn: tuple[int, int, int], ): - init_workspace_manager(torch.cuda.current_device()) + init_workspace_manager(torch.accelerator.current_device_index()) label = "Quant Matmul" sub_label = ( diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py index ceae12e98..36dce1b63 100644 --- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py +++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py @@ -285,7 +285,7 @@ def tune_on_gpu(args_dict): weight_shapes = args_dict["weight_shapes"] args = args_dict["args"] - torch.cuda.set_device(gpu_id) + torch.accelerator.set_device_index(gpu_id) 
print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}") block_n = args.block_n @@ -334,7 +334,7 @@ def distribute_batch_sizes(batch_sizes, num_gpus): def main(args): print(args) - num_gpus = torch.cuda.device_count() + num_gpus = torch.accelerator.device_count() if num_gpus == 0: raise RuntimeError("No GPU available for tuning") print(f"Found {num_gpus} GPUs for parallel tuning") diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md index 0aa89a89e..8ea241c58 100644 --- a/docs/configuration/conserving_memory.md +++ b/docs/configuration/conserving_memory.md @@ -15,7 +15,7 @@ llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2) ``` !!! warning - To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. [torch.cuda.set_device][]) + To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. [torch.accelerator.set_device_index][]) before initializing vLLM. Otherwise, you may run into an error like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`. To control which devices are used, please instead set the `CUDA_VISIBLE_DEVICES` environment variable. 
diff --git a/docs/usage/troubleshooting.md b/docs/usage/troubleshooting.md index bced53936..dc1cd89f8 100644 --- a/docs/usage/troubleshooting.md +++ b/docs/usage/troubleshooting.md @@ -91,8 +91,8 @@ If GPU/CPU communication cannot be established, you can use the following Python import torch import torch.distributed as dist dist.init_process_group(backend="nccl") - local_rank = dist.get_rank() % torch.cuda.device_count() - torch.cuda.set_device(local_rank) + local_rank = dist.get_rank() % torch.accelerator.device_count() + torch.accelerator.set_device_index(local_rank) data = torch.FloatTensor([1,] * 128).to("cuda") dist.all_reduce(data, op=dist.ReduceOp.SUM) torch.accelerator.synchronize() @@ -337,7 +337,7 @@ import vllm import torch print(f"CUDA available: {torch.cuda.is_available()}") -print(f"CUDA device count: {torch.cuda.device_count()}") +print(f"CUDA device count: {torch.accelerator.device_count()}") EOF ``` diff --git a/examples/online_serving/new_weight_syncing/rlhf_http_ipc.py b/examples/online_serving/new_weight_syncing/rlhf_http_ipc.py index d73eba64c..1a6a96d9c 100644 --- a/examples/online_serving/new_weight_syncing/rlhf_http_ipc.py +++ b/examples/online_serving/new_weight_syncing/rlhf_http_ipc.py @@ -106,7 +106,7 @@ def main(): # IPC requires the training model to be on the same GPU as the vLLM server # The server should be started on GPU 0 with reduced memory utilization device = "cuda:0" - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) # Load the training model on the same GPU as the server # Use bfloat16 to reduce memory footprint diff --git a/examples/online_serving/new_weight_syncing/rlhf_http_nccl.py b/examples/online_serving/new_weight_syncing/rlhf_http_nccl.py index b8a6b180a..afc4cda2e 100644 --- a/examples/online_serving/new_weight_syncing/rlhf_http_nccl.py +++ b/examples/online_serving/new_weight_syncing/rlhf_http_nccl.py @@ -131,7 +131,7 @@ def main(): inference_world_size = get_world_size(BASE_URL) world_size 
= inference_world_size + 1 # +1 for the trainer device = f"cuda:{inference_world_size}" - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) # Load the training model print(f"Loading training model: {MODEL_NAME}") diff --git a/tests/compile/passes/distributed/test_async_tp.py b/tests/compile/passes/distributed/test_async_tp.py index abc71768c..7edceee98 100644 --- a/tests/compile/passes/distributed/test_async_tp.py +++ b/tests/compile/passes/distributed/test_async_tp.py @@ -300,7 +300,7 @@ def async_tp_pass_on_test_model( set_random_seed(0) device = torch.device(f"cuda:{local_rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) torch.set_default_dtype(dtype) diff --git a/tests/compile/passes/distributed/test_fusion_all_reduce.py b/tests/compile/passes/distributed/test_fusion_all_reduce.py index 4beac8c4f..fe50081e5 100644 --- a/tests/compile/passes/distributed/test_fusion_all_reduce.py +++ b/tests/compile/passes/distributed/test_fusion_all_reduce.py @@ -262,7 +262,7 @@ def all_reduce_fusion_pass_on_test_model( set_random_seed(0) device = torch.device(f"cuda:{local_rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) torch.set_default_dtype(dtype) diff --git a/tests/compile/passes/distributed/test_sequence_parallelism.py b/tests/compile/passes/distributed/test_sequence_parallelism.py index a0fe717ba..e7bf330cc 100644 --- a/tests/compile/passes/distributed/test_sequence_parallelism.py +++ b/tests/compile/passes/distributed/test_sequence_parallelism.py @@ -228,7 +228,7 @@ def sequence_parallelism_pass_on_test_model( set_random_seed(0) device = torch.device(f"cuda:{local_rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) torch.set_default_dtype(dtype) diff --git a/tests/conftest.py b/tests/conftest.py index 4b907b7dd..719bfa5ed 100644 --- 
a/tests/conftest.py +++ b/tests/conftest.py @@ -428,7 +428,7 @@ class HfRunner: ) # don't put this import at the top level - # it will call torch.cuda.device_count() + # it will call torch.accelerator.device_count() from transformers import AutoProcessor self.processor = AutoProcessor.from_pretrained( @@ -1535,7 +1535,7 @@ def clean_gpu_memory_between_tests(): from tests.utils import wait_for_gpu_memory_to_clear - num_gpus = torch.cuda.device_count() + num_gpus = torch.accelerator.device_count() if num_gpus > 0: try: wait_for_gpu_memory_to_clear( diff --git a/tests/cuda/scripts/check_device_count_respects_env.py b/tests/cuda/scripts/check_device_count_respects_env.py index 1d218e483..e43c13aa4 100644 --- a/tests/cuda/scripts/check_device_count_respects_env.py +++ b/tests/cuda/scripts/check_device_count_respects_env.py @@ -14,7 +14,7 @@ import torch # noqa: E402 from vllm.platforms import current_platform # noqa: F401, E402 os.environ["CUDA_VISIBLE_DEVICES"] = "0" -count = torch.cuda.device_count() +count = torch.accelerator.device_count() if count == 0: sys.exit(0) # Skip: no GPUs available diff --git a/tests/distributed/eplb_utils.py b/tests/distributed/eplb_utils.py index 7c27347fd..215aff32d 100644 --- a/tests/distributed/eplb_utils.py +++ b/tests/distributed/eplb_utils.py @@ -42,7 +42,7 @@ def set_env_vars_and_device(env: dict[str, str]) -> None: update_environment_variables(env) local_rank = os.environ["LOCAL_RANK"] device = torch.device(f"cuda:{local_rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) # Create a minimal vllm config for init_distributed_environment vllm_config = VllmConfig() diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py index ce4c9c24e..2804c95d3 100644 --- a/tests/distributed/test_comm_ops.py +++ b/tests/distributed/test_comm_ops.py @@ -43,7 +43,7 @@ def all_reduce_test_worker( monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") 
- torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) num_elements = 8 all_tensors = [ @@ -69,7 +69,7 @@ def reduce_scatter_test_worker( # they will be able to set the device to the correct GPU monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) num_elements = 8 @@ -100,7 +100,7 @@ def all_gather_test_worker( # they will be able to set the device to the correct GPU monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) num_dimensions = 3 tensor_size = list(range(2, num_dimensions + 2)) @@ -134,7 +134,7 @@ def broadcast_tensor_dict_test_worker( # they will be able to set the device to the correct GPU monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) test_dict = { # device tensor @@ -171,7 +171,7 @@ def send_recv_tensor_dict_test_worker( ): monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) test_dict = { @@ -317,7 +317,7 @@ def send_recv_test_worker( ): monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) init_test_distributed_environment(tp_size, 
pp_size, rank, distributed_init_port) size = 64 diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index 5008c4de0..edddb6ec8 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -35,7 +35,7 @@ def graph_allreduce( m.delenv("CUDA_VISIBLE_DEVICES", raising=False) m.delenv("HIP_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) ensure_model_parallel_initialized(tp_size, pp_size) group = get_tp_group().device_group @@ -62,12 +62,10 @@ def graph_allreduce( for dtype in [torch.float32, torch.float16, torch.bfloat16]: with graph_capture(device=device) as graph_capture_context: # use integers so result matches NCCL exactly - inp1 = torch.randint( - 1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device() - ) - inp2 = torch.randint( - 1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device() - ) + device_idx = torch.accelerator.current_device_index() + inp1 = torch.randint(1, 16, (sz,), dtype=dtype, device=device_idx) + inp2 = torch.randint(1, 16, (sz,), dtype=dtype, device=device_idx) + torch.accelerator.synchronize() graph = torch.cuda.CUDAGraph() with torch.cuda.graph(graph, stream=graph_capture_context.stream): @@ -95,7 +93,7 @@ def eager_allreduce( m.delenv("CUDA_VISIBLE_DEVICES", raising=False) m.delenv("HIP_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) # we use the first group to communicate once @@ -129,6 +127,6 @@ def test_custom_allreduce( test_target, ): world_size = tp_size * pipeline_parallel_size - if world_size > torch.cuda.device_count(): + if world_size > 
torch.accelerator.device_count(): pytest.skip("Not enough GPUs to run the test.") multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size, test_target) diff --git a/tests/distributed/test_eplb_execute.py b/tests/distributed/test_eplb_execute.py index 674a665b0..50c7e6538 100644 --- a/tests/distributed/test_eplb_execute.py +++ b/tests/distributed/test_eplb_execute.py @@ -442,7 +442,7 @@ def test_rearrange_expert_weights_with_redundancy( ): """Test the functionality of rearranging expert weights with redundancy.""" - if torch.cuda.device_count() < world_size: + if torch.accelerator.device_count() < world_size: pytest.skip(f"Need at least {world_size} GPUs to run the test") distributed_run( _test_rearrange_expert_weights_with_redundancy, @@ -528,7 +528,7 @@ def test_async_transfer_layer_without_mtp( ): """Exercise async EPLB transfer path without MTP/spec decode.""" - if torch.cuda.device_count() < world_size: + if torch.accelerator.device_count() < world_size: pytest.skip(f"Need at least {world_size} GPUs to run the test") distributed_run( @@ -547,7 +547,7 @@ def test_rearrange_expert_weights_no_change(world_size): unchanged. 
""" - if torch.cuda.device_count() < world_size: + if torch.accelerator.device_count() < world_size: pytest.skip(f"Need at least {world_size} GPUs to run the test") distributed_run(_test_rearrange_expert_weights_no_change, world_size) @@ -623,6 +623,6 @@ def _test_rearrange_expert_weights_profile_mode(env, world_size) -> None: def test_rearrange_expert_weights_profile_mode(world_size): """Test profile mode (should not copy actual weights)""" - if torch.cuda.device_count() < world_size: + if torch.accelerator.device_count() < world_size: pytest.skip(f"Need at least {world_size} GPUs to run the test") distributed_run(_test_rearrange_expert_weights_profile_mode, world_size) diff --git a/tests/distributed/test_eplb_fused_moe_layer.py b/tests/distributed/test_eplb_fused_moe_layer.py index 55f265198..eacdb3abc 100644 --- a/tests/distributed/test_eplb_fused_moe_layer.py +++ b/tests/distributed/test_eplb_fused_moe_layer.py @@ -257,7 +257,7 @@ def test_eplb_fml( intermediate_size: int, column_major_scales: bool, ): - if torch.cuda.device_count() < world_size: + if torch.accelerator.device_count() < world_size: pytest.skip(f"Need at least {world_size} GPUs to run the test") num_local_experts = num_experts // world_size diff --git a/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py b/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py index 951b692e1..68b2407c2 100644 --- a/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py +++ b/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py @@ -253,7 +253,7 @@ def test_eplb_fml( monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1") monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", backend) - if torch.cuda.device_count() < world_size: + if torch.accelerator.device_count() < world_size: pytest.skip(f"Need at least {world_size} GPUs to run the test") num_local_experts = num_experts // world_size diff --git a/tests/distributed/test_nccl_symm_mem_allreduce.py b/tests/distributed/test_nccl_symm_mem_allreduce.py index 
b81624fe1..420bf631d 100644 --- a/tests/distributed/test_nccl_symm_mem_allreduce.py +++ b/tests/distributed/test_nccl_symm_mem_allreduce.py @@ -38,7 +38,7 @@ def nccl_symm_mem_allreduce_worker(local_rank: int, world_size: int): m.delenv("CUDA_VISIBLE_DEVICES", raising=False) dtype = torch.bfloat16 device = torch.device(f"cuda:{local_rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) torch.set_default_dtype(dtype) update_environment_variables( @@ -84,7 +84,7 @@ def nccl_symm_mem_allreduce_worker(local_rank: int, world_size: int): @pytest.mark.parametrize("world_size", [2]) @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA") def test_nccl_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch, world_size): - if world_size > torch.cuda.device_count(): + if world_size > torch.accelerator.device_count(): pytest.skip("Not enough GPUs to run the test.") # Enable SymmMemCommunicator diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index 3b5b45aa0..a1d5355d4 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -54,7 +54,7 @@ def worker_fn_wrapper(fn): update_environment_variables(env) local_rank = os.environ["LOCAL_RANK"] device = torch.device(f"cuda:{local_rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) init_distributed_environment() fn() @@ -73,7 +73,7 @@ def worker_fn(): @pytest.mark.skipif( - torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test." + torch.accelerator.device_count() < 2, reason="Need at least 2 GPUs to run the test." ) def test_pynccl(): distributed_run(worker_fn, 2) @@ -102,7 +102,7 @@ def multiple_allreduce_worker_fn(): @pytest.mark.skipif( - torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test." + torch.accelerator.device_count() < 4, reason="Need at least 4 GPUs to run the test." 
) def test_pynccl_multiple_allreduce(): # this tests pynccl for multiple tp groups, in a standalone way @@ -130,7 +130,7 @@ def multiple_allreduce_with_vllm_worker_fn(): @pytest.mark.skipif( - torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test." + torch.accelerator.device_count() < 4, reason="Need at least 4 GPUs to run the test." ) def test_pynccl_multiple_allreduce_with_vllm(): # this tests pynccl for multiple tp groups, together with vllm @@ -185,7 +185,7 @@ def all_gather_worker_fn(): @pytest.mark.skipif( - torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test." + torch.accelerator.device_count() < 2, reason="Need at least 2 GPUs to run the test." ) def test_pynccl_all_gather(): distributed_run(all_gather_worker_fn, 2) @@ -220,7 +220,7 @@ def all_gatherv_worker_fn(): @pytest.mark.skipif( - torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test." + torch.accelerator.device_count() < 2, reason="Need at least 2 GPUs to run the test." ) def test_pynccl_all_gatherv(): distributed_run(all_gatherv_worker_fn, 2) @@ -260,7 +260,7 @@ def reduce_scatter_worker_fn(): @pytest.mark.skipif( - torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test." + torch.accelerator.device_count() < 2, reason="Need at least 2 GPUs to run the test." ) def test_pynccl_reduce_scatter(): distributed_run(reduce_scatter_worker_fn, 2) @@ -298,14 +298,14 @@ def reduce_scatterv_worker_fn(): @pytest.mark.skipif( - torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test." + torch.accelerator.device_count() < 2, reason="Need at least 2 GPUs to run the test." ) def test_pynccl_reduce_scatterv(): distributed_run(reduce_scatterv_worker_fn, 2) @pytest.mark.skipif( - torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test." + torch.accelerator.device_count() < 2, reason="Need at least 2 GPUs to run the test." 
) def test_pynccl_with_cudagraph(): distributed_run(worker_fn_with_cudagraph, 2) @@ -330,7 +330,7 @@ def send_recv_worker_fn(): @pytest.mark.skipif( - torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test." + torch.accelerator.device_count() < 2, reason="Need at least 2 GPUs to run the test." ) def test_pynccl_send_recv(): distributed_run(send_recv_worker_fn, 2) @@ -363,14 +363,14 @@ def multiple_send_recv_worker_fn(): @pytest.mark.skipif( - torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test." + torch.accelerator.device_count() < 4, reason="Need at least 4 GPUs to run the test." ) def test_pynccl_multiple_send_recv(): distributed_run(multiple_send_recv_worker_fn, 4) @pytest.mark.skipif( - torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test." + torch.accelerator.device_count() < 4, reason="Need at least 4 GPUs to run the test." ) def test_pynccl_broadcast(): distributed_run(broadcast_worker_fn, 4) diff --git a/tests/distributed/test_quick_all_reduce.py b/tests/distributed/test_quick_all_reduce.py index 5af3101a9..9fbc4e0e9 100644 --- a/tests/distributed/test_quick_all_reduce.py +++ b/tests/distributed/test_quick_all_reduce.py @@ -39,7 +39,7 @@ def graph_quickreduce( with monkeypatch.context() as m: m.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) ensure_model_parallel_initialized(tp_size, pp_size) group = get_tp_group().device_group @@ -65,12 +65,10 @@ def graph_quickreduce( for sz in test_sizes: for dtype in [torch.float16, torch.bfloat16]: with graph_capture(device=device) as graph_capture_context: - inp1 = torch.randint( - 1, 23, (sz,), dtype=dtype, device=torch.cuda.current_device() - ) - inp2 = torch.randint( - -23, 1, (sz,), dtype=dtype, device=torch.cuda.current_device() - ) + device_idx = 
torch.accelerator.current_device_index() + inp1 = torch.randint(1, 23, (sz,), dtype=dtype, device=device_idx) + inp2 = torch.randint(-23, 1, (sz,), dtype=dtype, device=device_idx) + torch.accelerator.synchronize() graph = torch.cuda.CUDAGraph() with torch.cuda.graph(graph, stream=graph_capture_context.stream): @@ -95,7 +93,7 @@ def eager_quickreduce( with monkeypatch.context() as m: m.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) @@ -130,7 +128,7 @@ def test_custom_quick_allreduce( quant_mode, ): world_size = tp_size * pipeline_parallel_size - if world_size > torch.cuda.device_count(): + if world_size > torch.accelerator.device_count(): pytest.skip("Not enough GPUs to run the test.") monkeypatch.setenv("VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", quant_mode) @@ -145,7 +143,7 @@ def qr_variable_input(rank, world_size): has been observed with the gpt_oss model). """ device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) qr_max_size = None # MB _ptr = ops.init_custom_qr(rank, world_size, qr_max_size) ranks = [] @@ -169,14 +167,13 @@ def qr_variable_input(rank, world_size): s1 = 1024 while num < 50000: # 50000 is sufficient to identify issues. 
dtype = torch.float16 + device_idx = torch.accelerator.current_device_index() if num % 2 == 0: s2 = 1024 - inp1 = torch.zeros( - (s1, s2), dtype=dtype, device=torch.cuda.current_device() - ) + inp1 = torch.zeros((s1, s2), dtype=dtype, device=device_idx) else: s2 = 2048 - inp1 = torch.ones((s1, s2), dtype=dtype, device=torch.cuda.current_device()) + inp1 = torch.ones((s1, s2), dtype=dtype, device=device_idx) result = torch.empty_like(inp1) # FP = 0 INT8 = 1 INT6 = 2 INT4 = 3 NONE = 4 ops.qr_all_reduce(_ptr, inp1, result, 3, cast_bf2half=True) @@ -198,7 +195,7 @@ def qr_variable_input(rank, world_size): @pytest.mark.parametrize("pipeline_parallel_size", [1]) def test_custom_quick_allreduce_variable_input(tp_size, pipeline_parallel_size): world_size = tp_size * pipeline_parallel_size - if world_size > torch.cuda.device_count(): + if world_size > torch.accelerator.device_count(): pytest.skip("Not enough GPUs to run the test.") multiprocessing.set_start_method("spawn", force=True) diff --git a/tests/distributed/test_symm_mem_allreduce.py b/tests/distributed/test_symm_mem_allreduce.py index b8f04cf8e..6750aa788 100644 --- a/tests/distributed/test_symm_mem_allreduce.py +++ b/tests/distributed/test_symm_mem_allreduce.py @@ -39,7 +39,7 @@ def symm_mem_allreduce_worker(local_rank: int, world_size: int, q: mp.Queue): m.delenv("CUDA_VISIBLE_DEVICES", raising=False) dtype = torch.bfloat16 device = torch.device(f"cuda:{local_rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) torch.set_default_dtype(dtype) update_environment_variables( @@ -105,7 +105,7 @@ def test_symm_mem_allreduce( monkeypatch: pytest.MonkeyPatch, tp_size, pipeline_parallel_size ): world_size = tp_size * pipeline_parallel_size - if world_size > torch.cuda.device_count(): + if world_size > torch.accelerator.device_count(): pytest.skip("Not enough GPUs to run the test.") q = mp.get_context("spawn").Queue() mp.spawn(symm_mem_allreduce_worker, 
args=(world_size, q), nprocs=world_size) @@ -126,7 +126,7 @@ def test_symm_mem_allreduce( @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA") def test_dp_with_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch): world_size = 4 - if world_size > torch.cuda.device_count(): + if world_size > torch.accelerator.device_count(): pytest.skip("Not enough GPUs to run the test.") # Verify that the DataParallel runs without error engine_args = EngineArgs( diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py index c2fea7c1d..784918642 100644 --- a/tests/distributed/test_utils.py +++ b/tests/distributed/test_utils.py @@ -66,7 +66,7 @@ def cpu_worker(rank, WORLD_SIZE, port1, port2): def gpu_worker(rank, WORLD_SIZE, port1, port2): - torch.cuda.set_device(rank) + torch.accelerator.set_device_index(rank) pg1 = StatelessProcessGroup.create( host="127.0.0.1", port=port1, rank=rank, world_size=WORLD_SIZE ) diff --git a/tests/distributed/test_weight_transfer.py b/tests/distributed/test_weight_transfer.py index def1e1dfd..1309edf5a 100644 --- a/tests/distributed/test_weight_transfer.py +++ b/tests/distributed/test_weight_transfer.py @@ -203,7 +203,7 @@ class TestEngineRegistry: def test_nccl_receive_weights_without_init_raises(): """Test that receive_weights raises if init_transfer_engine wasn't called.""" - if torch.cuda.device_count() < 1: + if torch.accelerator.device_count() < 1: pytest.skip("Need at least 1 GPU for this test") config = WeightTransferConfig(backend="nccl") @@ -336,7 +336,7 @@ def inference_receive_tensor( @pytest.mark.skipif( - torch.cuda.device_count() < 2, + torch.accelerator.device_count() < 2, reason="Need at least 2 GPUs to run NCCL weight transfer test.", ) def test_nccl_weight_transfer_between_processes(): @@ -382,7 +382,7 @@ class TestIPCWeightTransferUpdateInfoValidation: def test_valid_update_info(self): """Test creating valid IPCWeightTransferUpdateInfo.""" - if torch.cuda.device_count() < 1: + 
if torch.accelerator.device_count() < 1: pytest.skip("Need at least 1 GPU for this test") # Create a dummy tensor and IPC handle @@ -404,7 +404,7 @@ class TestIPCWeightTransferUpdateInfoValidation: def test_mismatched_dtype_names_raises(self): """Test that mismatched dtype_names length raises ValueError.""" - if torch.cuda.device_count() < 1: + if torch.accelerator.device_count() < 1: pytest.skip("Need at least 1 GPU for this test") dummy_tensor = torch.ones(10, 10, device="cuda:0") @@ -422,7 +422,7 @@ class TestIPCWeightTransferUpdateInfoValidation: def test_mismatched_shapes_raises(self): """Test that mismatched shapes length raises ValueError.""" - if torch.cuda.device_count() < 1: + if torch.accelerator.device_count() < 1: pytest.skip("Need at least 1 GPU for this test") dummy_tensor = torch.ones(10, 10, device="cuda:0") @@ -440,7 +440,7 @@ class TestIPCWeightTransferUpdateInfoValidation: def test_mismatched_ipc_handles_raises(self): """Test that mismatched ipc_handles length raises ValueError.""" - if torch.cuda.device_count() < 1: + if torch.accelerator.device_count() < 1: pytest.skip("Need at least 1 GPU for this test") dummy_tensor = torch.ones(10, 10, device="cuda:0") @@ -458,7 +458,7 @@ class TestIPCWeightTransferUpdateInfoValidation: def test_valid_update_info_from_pickled(self, monkeypatch): """Test creating IPCWeightTransferUpdateInfo from pickled handles.""" - if torch.cuda.device_count() < 1: + if torch.accelerator.device_count() < 1: pytest.skip("Need at least 1 GPU for this test") monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") @@ -493,7 +493,7 @@ class TestIPCWeightTransferUpdateInfoValidation: def test_both_handles_and_pickled_raises(self): """Test that providing both ipc_handles and ipc_handles_pickled raises.""" - if torch.cuda.device_count() < 1: + if torch.accelerator.device_count() < 1: pytest.skip("Need at least 1 GPU for this test") dummy_tensor = torch.ones(10, 10, device="cuda:0") @@ -540,7 +540,7 @@ class 
TestIPCEngineParsing: def test_parse_update_info_valid(self): """Test parsing valid update info dict.""" - if torch.cuda.device_count() < 1: + if torch.accelerator.device_count() < 1: pytest.skip("Need at least 1 GPU for this test") config = WeightTransferConfig(backend="ipc") @@ -572,7 +572,7 @@ class TestIPCEngineParsing: def test_parse_update_info_pickled(self, monkeypatch): """Test parsing update info with pickled IPC handles (HTTP path).""" - if torch.cuda.device_count() < 1: + if torch.accelerator.device_count() < 1: pytest.skip("Need at least 1 GPU for this test") monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") @@ -731,7 +731,7 @@ def inference_receive_ipc_tensor( @pytest.mark.skipif( - torch.cuda.device_count() < 1, + torch.accelerator.device_count() < 1, reason="Need at least 1 GPU to run IPC weight transfer test.", ) @pytest.mark.parametrize("mode", ["ray", "http"]) @@ -789,7 +789,7 @@ def test_ipc_weight_transfer_between_processes(mode: str): def test_ipc_receive_weights_missing_gpu_uuid_raises(): """Test that receive_weights raises if GPU UUID not found in IPC handles.""" - if torch.cuda.device_count() < 1: + if torch.accelerator.device_count() < 1: pytest.skip("Need at least 1 GPU for this test") config = WeightTransferConfig(backend="ipc") diff --git a/tests/entrypoints/llm/test_collective_rpc.py b/tests/entrypoints/llm/test_collective_rpc.py index 747676ac9..d66455889 100644 --- a/tests/entrypoints/llm/test_collective_rpc.py +++ b/tests/entrypoints/llm/test_collective_rpc.py @@ -13,7 +13,7 @@ from ...utils import create_new_process_for_each_test @pytest.mark.parametrize("backend", ["mp", "ray"]) @create_new_process_for_each_test() def test_collective_rpc(tp_size, backend, monkeypatch): - if torch.cuda.device_count() < tp_size: + if torch.accelerator.device_count() < tp_size: pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") if tp_size == 1 and backend == "ray": pytest.skip("Skip duplicate test case") diff --git 
a/tests/entrypoints/weight_transfer/test_weight_transfer_llm.py b/tests/entrypoints/weight_transfer/test_weight_transfer_llm.py index 255bca444..7d6d330aa 100644 --- a/tests/entrypoints/weight_transfer/test_weight_transfer_llm.py +++ b/tests/entrypoints/weight_transfer/test_weight_transfer_llm.py @@ -106,7 +106,7 @@ def mock_create_engine(config, parallel_config): @create_new_process_for_each_test() def test_get_world_size_tp1(): """Test world_size is correctly configured for TP=1.""" - if torch.cuda.device_count() < 1: + if torch.accelerator.device_count() < 1: pytest.skip("Need at least 1 GPU for this test") llm = LLM( @@ -125,7 +125,7 @@ def test_get_world_size_tp1(): def test_init_weight_transfer_engine_calls_engine(): """Test that init_weight_transfer_engine calls the engine's init_transfer_engine method.""" - if torch.cuda.device_count() < 1: + if torch.accelerator.device_count() < 1: pytest.skip("Need at least 1 GPU for this test") # Run in-process so mock.patch works (spawn won't inherit the mock) @@ -174,7 +174,7 @@ def test_init_weight_transfer_engine_calls_engine(): @create_new_process_for_each_test() def test_update_weights_calls_engine(): """Test that update_weights calls the engine's receive_weights method.""" - if torch.cuda.device_count() < 1: + if torch.accelerator.device_count() < 1: pytest.skip("Need at least 1 GPU for this test") # Run in-process so mock.patch works (spawn won't inherit the mock) @@ -233,7 +233,7 @@ def test_update_weights_calls_engine(): @create_new_process_for_each_test() def test_full_weight_transfer_flow(): """Test the complete weight transfer flow: init -> update.""" - if torch.cuda.device_count() < 1: + if torch.accelerator.device_count() < 1: pytest.skip("Need at least 1 GPU for this test") # Run in-process so mock.patch works (spawn won't inherit the mock) @@ -294,7 +294,7 @@ def test_full_weight_transfer_flow(): @create_new_process_for_each_test() def test_weight_transfer_config_backend(): """Test that 
WeightTransferConfig backend is properly configured.""" - if torch.cuda.device_count() < 1: + if torch.accelerator.device_count() < 1: pytest.skip("Need at least 1 GPU for this test") # Test with nccl backend diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py index a14b80b32..9ddceef8f 100644 --- a/tests/kernels/attention/test_attention.py +++ b/tests/kernels/attention/test_attention.py @@ -36,7 +36,9 @@ BLOCK_SIZES = [16, 32] USE_ALIBI = [False, True] KV_CACHE_DTYPE = ["auto", "fp8"] SEEDS = [0] -CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2) +] def ref_masked_attention( diff --git a/tests/kernels/attention/test_cache.py b/tests/kernels/attention/test_cache.py index 7c60a8a14..0249461dd 100644 --- a/tests/kernels/attention/test_cache.py +++ b/tests/kernels/attention/test_cache.py @@ -35,7 +35,9 @@ NUM_BLOCKS = [1024, 10000] NUM_MAPPINGS = [256] # Arbitrary values for testing SEEDS = [0] -CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2) +] # We assume fp8 is always enabled for testing. KV_CACHE_DTYPE = ["auto", "fp8"] @@ -69,7 +71,7 @@ def test_reshape_and_cache( pytest.skip() set_random_seed(seed) torch.set_default_device(device) - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) # Create a random slot mapping. 
num_slots = block_size * num_blocks slot_mapping_lst = random.sample(range(num_slots), num_tokens) @@ -192,7 +194,7 @@ def test_reshape_and_cache_flash( ) -> None: set_random_seed(seed) torch.set_default_device(device) - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) assert implementation in ["cuda", "triton"] if implementation == "triton" and kv_cache_layout == "HND": pytest.skip("Triton implementation only supports NHD layout.") @@ -553,7 +555,7 @@ def test_concat_and_cache_mla( ) -> None: set_random_seed(seed) torch.set_default_device(device) - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) total_slots = num_blocks * block_size slot_mapping_lst = random.sample(range(total_slots), num_tokens) @@ -632,7 +634,7 @@ def test_concat_and_cache_ds_mla( kv_cache_dtype = "fp8_ds_mla" set_random_seed(seed) torch.set_default_device(device) - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) total_slots = num_blocks * block_size slot_mapping_lst = random.sample(range(total_slots), num_tokens) @@ -744,7 +746,7 @@ def test_swap_blocks_mla( ) -> None: set_random_seed(seed) torch.set_default_device(device) - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) entry_size = kv_lora_rank + qk_rope_head_dim diff --git a/tests/kernels/attention/test_cutlass_mla_decode.py b/tests/kernels/attention/test_cutlass_mla_decode.py index 1f2fb66b3..33bd36058 100644 --- a/tests/kernels/attention/test_cutlass_mla_decode.py +++ b/tests/kernels/attention/test_cutlass_mla_decode.py @@ -69,7 +69,7 @@ def test_cutlass_mla_decode( init_dtype = torch.bfloat16 if torch_dtype == torch.float8_e4m3fn else torch_dtype torch.set_default_dtype(init_dtype) torch.set_default_device(device) - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) torch.manual_seed(42) random.seed(42) diff --git a/tests/kernels/attention/test_flashmla.py b/tests/kernels/attention/test_flashmla.py index 
6b3d3485d..657b256f4 100644 --- a/tests/kernels/attention/test_flashmla.py +++ b/tests/kernels/attention/test_flashmla.py @@ -57,7 +57,7 @@ def test_flash_mla( init_dtype = torch.bfloat16 if torch_dtype == torch.float8_e4m3fn else torch_dtype torch.set_default_dtype(init_dtype) torch.set_default_device(device) - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) torch.manual_seed(0) random.seed(0) diff --git a/tests/kernels/attention/test_prefix_prefill.py b/tests/kernels/attention/test_prefix_prefill.py index 7aeeaf8b4..de63b4548 100644 --- a/tests/kernels/attention/test_prefix_prefill.py +++ b/tests/kernels/attention/test_prefix_prefill.py @@ -21,7 +21,9 @@ NUM_HEADS = [64] NUM_QUERIES_PER_KV = [1, 64] HEAD_SIZES = [24, 128] DTYPES = [torch.float16] -CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2) +] SLIDING_WINDOW = [0, 16, 2048] KV_CACHE_DTYPES = ["auto", "fp8", "fp8_e5m2"] @@ -135,7 +137,7 @@ def test_contexted_kv_attention( # for GPU 1 would run on both GPU0 and GPU1 and things would hang # # see also similar issue: https://github.com/Dao-AILab/flash-attention/issues/523 - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) MAX_SEQ_LEN = 1024 MAX_CTX_LEN = 1024 @@ -356,7 +358,7 @@ def test_contexted_kv_attention_alibi( # for GPU 1 would run on both GPU0 and GPU1 and things would hang # # see also similar issue: https://github.com/Dao-AILab/flash-attention/issues/523 - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: # Fork from: vllm/vllm/model_executor/models/bloom.py#L44 diff --git a/tests/kernels/core/test_activation.py b/tests/kernels/core/test_activation.py index 66727a309..e7de77312 100644 --- a/tests/kernels/core/test_activation.py +++ b/tests/kernels/core/test_activation.py @@ -26,7 
+26,9 @@ DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing D = [512, 13824] # Arbitrary values for testing SEEDS = [0] -CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2) +] @pytest.mark.parametrize( diff --git a/tests/kernels/core/test_fused_quant_layernorm.py b/tests/kernels/core/test_fused_quant_layernorm.py index b7e6ce386..fe06605af 100644 --- a/tests/kernels/core/test_fused_quant_layernorm.py +++ b/tests/kernels/core/test_fused_quant_layernorm.py @@ -33,7 +33,9 @@ SCALE_UBS = [True, False] GROUP_SIZES = [None, [1, 64], [1, 128]] TMA_ALIGNMENTS = [0, 4] SEEDS = [0] -CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2) +] EPS = 1e-6 @@ -182,7 +184,7 @@ def test_rms_norm( if torch.cuda.is_available(): torch.cuda.manual_seed(seed) torch.set_default_device(device) - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) if group_size is not None and hidden_size % group_size[1] != 0: # skip diff --git a/tests/kernels/core/test_layernorm.py b/tests/kernels/core/test_layernorm.py index 2dca0da07..f8f966094 100644 --- a/tests/kernels/core/test_layernorm.py +++ b/tests/kernels/core/test_layernorm.py @@ -14,7 +14,9 @@ NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing HIDDEN_SIZES = [8, 768, 769, 5120, 5125, 8192] # Arbitrary values for testing ADD_RESIDUAL = [False, True] SEEDS = [0] -CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2) +] @pytest.mark.parametrize("num_tokens", NUM_TOKENS) diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py index 
5094a29c5..3a750b743 100644 --- a/tests/kernels/core/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -19,7 +19,9 @@ NUM_HEADS = [17] # Arbitrary values for testing BATCH_SIZES = [5] # Arbitrary values for testing SEQ_LENS = [11, 8192] # Arbitrary values for testing SEEDS = [0] -CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2) +] USE_KEY = [True, False] diff --git a/tests/kernels/core/test_rotary_embedding_mla_cache_fused.py b/tests/kernels/core/test_rotary_embedding_mla_cache_fused.py index a8781afd8..181f10f31 100644 --- a/tests/kernels/core/test_rotary_embedding_mla_cache_fused.py +++ b/tests/kernels/core/test_rotary_embedding_mla_cache_fused.py @@ -28,7 +28,8 @@ from vllm.utils.torch_utils import set_random_seed @pytest.mark.parametrize("block_size", [16, 64, 256]) @pytest.mark.parametrize("seed", [0]) @pytest.mark.parametrize( - "device", [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] + "device", + [f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2)], ) @torch.inference_mode() def test_concat_and_cache_mla_rope_fused( diff --git a/tests/kernels/core/test_uva.py b/tests/kernels/core/test_uva.py index f4a0296d8..7c2561250 100644 --- a/tests/kernels/core/test_uva.py +++ b/tests/kernels/core/test_uva.py @@ -6,7 +6,9 @@ import torch from vllm.utils.platform_utils import is_uva_available from vllm.utils.torch_utils import get_accelerator_view_from_cpu_tensor -CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2) +] @pytest.mark.skipif(not is_uva_available(), reason="UVA is not available.") diff --git a/tests/kernels/mamba/test_mamba_mixer2.py b/tests/kernels/mamba/test_mamba_mixer2.py index 322e717e9..973e7885c 100644 --- 
a/tests/kernels/mamba/test_mamba_mixer2.py +++ b/tests/kernels/mamba/test_mamba_mixer2.py @@ -71,7 +71,7 @@ def mixer2_gated_norm_tensor_parallel( set_random_seed(0) device = torch.device(f"cuda:{local_rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) torch.set_default_dtype(dtype) diff --git a/tests/kernels/moe/modular_kernel_tools/common.py b/tests/kernels/moe/modular_kernel_tools/common.py index 4b2b1653b..6f9abc607 100644 --- a/tests/kernels/moe/modular_kernel_tools/common.py +++ b/tests/kernels/moe/modular_kernel_tools/common.py @@ -322,7 +322,7 @@ class WeightTensors: ) def to_current_device(self): - device = torch.cuda.current_device() + device = torch.accelerator.current_device_index() self.w1 = self.w1.to(device=device) self.w2 = self.w2.to(device=device) @@ -392,7 +392,8 @@ class RankTensors: Return hidden_states """ m, k, dtype = (config.M, config.K, config.dtype) - a = torch.randn((m, k), device=torch.cuda.current_device(), dtype=dtype) / 15.0 + device = torch.accelerator.current_device_index() + a = torch.randn((m, k), device=device, dtype=dtype) / 15.0 if config.quant_dtype is None: return a, None @@ -428,9 +429,10 @@ class RankTensors: topk_weights, topk_ids, _ = fused_topk(hidden_states, score, topk, False) # distribute topk_ids evenly + device = torch.accelerator.current_device_index() for mi in range(m): topk_ids[mi] = torch.randperm(config.E)[:topk] - topk_ids = topk_ids.to(device=torch.cuda.current_device()) + topk_ids = topk_ids.to(device=device) expert_map = None if config.world_size > 1 and config.supports_expert_map(): @@ -440,9 +442,7 @@ class RankTensors: s = pgi.rank * num_local_experts e = s + num_local_experts expert_map[s:e] = torch.tensor(list(range(num_local_experts))) - expert_map = expert_map.to( - device=torch.cuda.current_device(), dtype=torch.int32 - ) + expert_map = expert_map.to(device=device, dtype=torch.int32) return RankTensors( hidden_states=hidden_states, 
@@ -558,7 +558,9 @@ def reference_moe_impl( def _make_gscale(num_experts: int) -> torch.Tensor: return torch.ones( - (num_experts,), device=torch.cuda.current_device(), dtype=torch.float32 + (num_experts,), + device=torch.accelerator.current_device_index(), + dtype=torch.float32, ) diff --git a/tests/kernels/moe/modular_kernel_tools/parallel_utils.py b/tests/kernels/moe/modular_kernel_tools/parallel_utils.py index 8528ee0cd..3ff2ce3b3 100644 --- a/tests/kernels/moe/modular_kernel_tools/parallel_utils.py +++ b/tests/kernels/moe/modular_kernel_tools/parallel_utils.py @@ -66,7 +66,7 @@ def _worker_parallel_launch( **kwargs: P.kwargs, ) -> None: rank = node_rank * world_local_size + local_rank - torch.cuda.set_device(local_rank) + torch.accelerator.set_device_index(local_rank) device = torch.device("cuda", local_rank) torch.distributed.init_process_group( backend="cpu:gloo,cuda:nccl", diff --git a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py index 9f0f9f2ea..95442103b 100644 --- a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py +++ b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py @@ -34,7 +34,8 @@ def do_profile( record_shapes=True, ) as tprof: fn(**fn_kwargs) - torch.accelerator.synchronize(torch.cuda.current_device()) + device = torch.accelerator.current_device_index() + torch.accelerator.synchronize(device=device) # TODO (varun): Add a descriptive trace file name tprof.export_chrome_trace( diff --git a/tests/kernels/moe/parallel_utils.py b/tests/kernels/moe/parallel_utils.py index 90728c1e3..525e3e67b 100644 --- a/tests/kernels/moe/parallel_utils.py +++ b/tests/kernels/moe/parallel_utils.py @@ -52,7 +52,7 @@ def _worker_parallel_launch( **kwargs: P.kwargs, ) -> None: rank = node_rank * world_local_size + local_rank - torch.cuda.set_device(local_rank) + torch.accelerator.set_device_index(local_rank) device = torch.device("cuda", local_rank) 
torch.distributed.init_process_group( backend="cpu:gloo,cuda:nccl", diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index a01fb1a45..b9404975e 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -134,10 +134,8 @@ class TestTensors: fp8_info = torch.finfo(torch.float8_e4m3fn) fp8_max, fp8_min = fp8_info.max, fp8_info.min - - rank_tokens = ( - torch.randn((m, k), device=torch.cuda.current_device(), dtype=dtype) / 10.0 - ) + device = torch.accelerator.current_device_index() + rank_tokens = torch.randn((m, k), device=device, dtype=dtype) / 10.0 rank_tokens = rank_tokens.clamp(min=fp8_min, max=fp8_max) rank_token_scales = None @@ -145,11 +143,13 @@ class TestTensors: low=0, high=config.num_experts, size=(m, topk), - device=torch.cuda.current_device(), + device=device, ).to(dtype=torch.int64) topk_weights = torch.randn( - topk_ids.shape, dtype=torch.float32, device=torch.cuda.current_device() + topk_ids.shape, + dtype=torch.float32, + device=device, ) return TestTensors( @@ -296,7 +296,8 @@ def deepep_deepgemm_moe_impl( s = pgi.rank * num_local_experts e = s + num_local_experts expert_map[s:e] = torch.tensor(list(range(num_local_experts))) - return expert_map.to(device=torch.cuda.current_device(), dtype=torch.int32) + device = torch.accelerator.current_device_index() + return expert_map.to(device=device, dtype=torch.int32) quant_config = fp8_w8a8_moe_quant_config( w1_scale=w1_scale, @@ -376,10 +377,11 @@ def _test_deepep_deepgemm_moe( set_random_seed(pgi.rank) - w1 = w1.to(device=torch.cuda.current_device()) - w2 = w2.to(device=torch.cuda.current_device()) - w1_scale = w1_scale.to(device=torch.cuda.current_device()) - w2_scale = w2_scale.to(device=torch.cuda.current_device()) + device = torch.accelerator.current_device_index() + w1 = w1.to(device=device) + w2 = w2.to(device=device) + w1_scale = w1_scale.to(device=device) + w2_scale = 
w2_scale.to(device=device) pg = torch.distributed.new_group(list(range(pgi.world_size))) test_tensors = TestTensors.make(config, pgi.rank) diff --git a/tests/kernels/moe/test_deepep_moe.py b/tests/kernels/moe/test_deepep_moe.py index 362b71a40..28bb83107 100644 --- a/tests/kernels/moe/test_deepep_moe.py +++ b/tests/kernels/moe/test_deepep_moe.py @@ -210,7 +210,8 @@ def deep_ep_moe_impl( s = pgi.rank * num_local_experts e = s + num_local_experts expert_map[s:e] = torch.tensor(list(range(num_local_experts))) - return expert_map.to(device=torch.cuda.current_device(), dtype=torch.int32) + device = torch.accelerator.current_device_index() + return expert_map.to(device=device, dtype=torch.int32) hidden_size = test_tensors.rank_tokens.size(1) is_quantized = w1.dtype == torch.float8_e4m3fn @@ -365,15 +366,13 @@ def _deep_ep_moe( ) is_quantized = w1.dtype == torch.float8_e4m3fn - w1 = w1.to(device=torch.cuda.current_device()) - w2 = w2.to(device=torch.cuda.current_device()) + device_idx = torch.accelerator.current_device_index() + w1 = w1.to(device=device_idx) + w2 = w2.to(device=device_idx) if is_quantized: - w1_scale = w1_scale.to( # type: ignore - device=torch.cuda.current_device() - ) - w2_scale = w2_scale.to( # type: ignore - device=torch.cuda.current_device() - ) + assert w1_scale is not None and w2_scale is not None + w1_scale = w1_scale.to(device=device_idx) + w2_scale = w2_scale.to(device=device_idx) pg = torch.distributed.new_group(list(range(pgi.world_size))) test_tensors = TestTensors.make(config, low_latency_mode) diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index 43bdd03cf..84483fea8 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -716,7 +716,7 @@ def test_mixtral_moe( monkeypatch.setenv("MASTER_ADDR", "localhost") monkeypatch.setenv("MASTER_PORT", "12345") init_distributed_environment() - init_workspace_manager(torch.cuda.current_device()) + 
init_workspace_manager(torch.accelerator.current_device_index()) # Instantiate our and huggingface's MoE blocks vllm_config.compilation_config.static_forward_context = dict() diff --git a/tests/kernels/moe/test_ocp_mx_moe.py b/tests/kernels/moe/test_ocp_mx_moe.py index 73502932d..cf9021663 100644 --- a/tests/kernels/moe/test_ocp_mx_moe.py +++ b/tests/kernels/moe/test_ocp_mx_moe.py @@ -71,10 +71,10 @@ def enable_pickle(monkeypatch): ) @pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE, reason="amd-quark>=0.9 is not available") def test_mxfp4_loading_and_execution_moe(vllm_runner, model_case: ModelCase): - if torch.cuda.device_count() < model_case.tp: + if torch.accelerator.device_count() < model_case.tp: pytest.skip( f"This test requires >={model_case.tp} gpus, got only " - f"{torch.cuda.device_count()}" + f"{torch.accelerator.device_count()}" ) # `cudagraph_capture_sizes=[16]` to reduce load time. diff --git a/tests/kernels/quantization/test_cutlass_2of4_sparse.py b/tests/kernels/quantization/test_cutlass_2of4_sparse.py index cfdb36580..ccccc79cb 100644 --- a/tests/kernels/quantization/test_cutlass_2of4_sparse.py +++ b/tests/kernels/quantization/test_cutlass_2of4_sparse.py @@ -15,7 +15,9 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( ) from vllm.platforms import current_platform -CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2) +] capability = current_platform.get_device_capability() capability = capability[0] * 10 + capability[1] diff --git a/tests/kernels/quantization/test_cutlass_scaled_mm.py b/tests/kernels/quantization/test_cutlass_scaled_mm.py index bc4744df7..a8adec49a 100644 --- a/tests/kernels/quantization/test_cutlass_scaled_mm.py +++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py @@ -40,7 +40,9 @@ MNK_FACTORS = [ (512, 24576, 128), ] -CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if 
torch.cuda.device_count() == 1 else 2)] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2) +] # -1 means full extent in that dimension TENSORWISE_GROUP_SHAPE = (-1, -1) diff --git a/tests/kernels/quantization/test_machete_mm.py b/tests/kernels/quantization/test_machete_mm.py index 7f4ce2a08..62d0ba4f1 100644 --- a/tests/kernels/quantization/test_machete_mm.py +++ b/tests/kernels/quantization/test_machete_mm.py @@ -29,7 +29,9 @@ if current_platform.is_rocm(): allow_module_level=True, ) -CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2) +] # TODO: in future PR refactor this and `is_quant_method_supported` in the kernel # unit tests to a common utility function. Currently the use of diff --git a/tests/kernels/test_cache_kernels.py b/tests/kernels/test_cache_kernels.py index 4cc8e3b14..25402fe03 100644 --- a/tests/kernels/test_cache_kernels.py +++ b/tests/kernels/test_cache_kernels.py @@ -13,7 +13,7 @@ except ImportError: ) -@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Need CUDA device") +@pytest.mark.skipif(torch.accelerator.device_count() < 1, reason="Need CUDA device") def test_gather_cache_oob(): """ Tests for OOB read in gather_and_maybe_dequant_cache (Issue #27909). 
diff --git a/tests/kernels/test_fused_quant_activation.py b/tests/kernels/test_fused_quant_activation.py index 2170b0200..2670f224d 100644 --- a/tests/kernels/test_fused_quant_activation.py +++ b/tests/kernels/test_fused_quant_activation.py @@ -13,7 +13,9 @@ QUANT_DTYPES = [current_platform.fp8_dtype()] NUM_TOKENS = [1, 17, 86, 1234, 3045] # Arbitrary values for testing HIDDEN_SIZES = [16, 48, 128, 1562, 4096] # Arbitrary values for testing SEEDS = [0] -CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2) +] def ref_impl( diff --git a/tests/lora/test_fused_moe_lora_kernel.py b/tests/lora/test_fused_moe_lora_kernel.py index f3c3cb8cf..66a985a06 100644 --- a/tests/lora/test_fused_moe_lora_kernel.py +++ b/tests/lora/test_fused_moe_lora_kernel.py @@ -638,7 +638,7 @@ def use_fused_moe_lora_kernel_tensor_parallel( set_random_seed(seed) device = torch.device(f"cuda:{local_rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) torch.set_default_dtype(dtype) diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index d3c1f3deb..08fd03724 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -61,7 +61,7 @@ pytestmark = pytest.mark.skipif( ) DEVICES = ( - [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] + [f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2)] if current_platform.is_cuda_alike() else ["cpu"] ) @@ -260,7 +260,7 @@ def test_embeddings( # device, see: https://github.com/triton-lang/triton/issues/2925 # Same below. 
if current_platform.is_cuda_alike(): - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) max_loras = 8 @@ -359,7 +359,7 @@ def test_lm_head_logits_processor( default_vllm_config, dist_init, num_loras, device, vocab_size, stage ) -> None: if current_platform.is_cuda_alike(): - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) max_loras = 8 @@ -476,7 +476,7 @@ def test_lm_head_logits_processor_invalid_vocab_size( ) -> None: """Test that LogitsProcessorWithLoRA raises ValueError for invalid vocab sizes.""" if current_platform.is_cuda_alike(): - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) max_loras = 8 @@ -505,7 +505,7 @@ def test_linear_replicated( stage, ) -> None: if current_platform.is_cuda_alike(): - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) max_loras = 8 torch.set_default_device(device) @@ -612,7 +612,7 @@ def test_linear_parallel( default_vllm_config, dist_init, num_loras, orientation, fully_shard, device, stage ) -> None: if current_platform.is_cuda_alike(): - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) max_loras = 8 torch.set_default_device(device) @@ -737,7 +737,7 @@ def test_column_parallel_packed( default_vllm_config, dist_init, num_loras, repeats, fully_shard, device, stage ) -> None: if current_platform.is_cuda_alike(): - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) max_loras = 8 torch.set_default_device(device) @@ -885,7 +885,7 @@ def test_merged_column_parallel_variable_slice( default_vllm_config, dist_init, num_loras, num_slices, device, stage ) -> None: if current_platform.is_cuda_alike(): - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) max_loras = 8 torch.set_default_device(device) diff --git a/tests/lora/test_lora_manager.py 
b/tests/lora/test_lora_manager.py index c37780ec6..d2a7cd155 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -37,7 +37,7 @@ EMBEDDING_MODULES = { DEVICES = ( - [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] + [f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2)] if current_platform.is_cuda_alike() else ["cpu"] ) diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index 12c73f2d7..3868bff79 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -34,7 +34,7 @@ def do_sample( def test_mixtral_lora(mixtral_lora_files, tp_size): """Original test, the LoRA model has the common target modules, not all""" if ( - torch.cuda.device_count() < tp_size + torch.accelerator.device_count() < tp_size and tp_size > 1 and current_platform.is_cuda_alike() ): diff --git a/tests/lora/test_punica_ops.py b/tests/lora/test_punica_ops.py index 82db7fece..8a2634e82 100644 --- a/tests/lora/test_punica_ops.py +++ b/tests/lora/test_punica_ops.py @@ -395,7 +395,7 @@ def test_kernels( Tests LoRA kernels. """ torch.set_default_device(device) - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) set_random_seed(seed) if op_type == "shrink": @@ -448,7 +448,7 @@ def test_kernels_hidden_size( Tests SGMV and LoRA kernels. 
""" torch.set_default_device(device) - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) set_random_seed(seed) if op_type == "shrink": diff --git a/tests/model_executor/model_loader/tensorizer_loader/test_tensorizer.py b/tests/model_executor/model_loader/tensorizer_loader/test_tensorizer.py index 610f69c8d..3b950c843 100644 --- a/tests/model_executor/model_loader/tensorizer_loader/test_tensorizer.py +++ b/tests/model_executor/model_loader/tensorizer_loader/test_tensorizer.py @@ -203,7 +203,7 @@ def test_raise_value_error_on_invalid_load_format(vllm_runner, capfd, model_ref) torch.accelerator.empty_cache() -@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs") +@pytest.mark.skipif(torch.accelerator.device_count() < 2, reason="Requires 2 GPUs") def test_tensorizer_with_tp_path_without_template(vllm_runner, capfd): try: model_ref = "EleutherAI/pythia-1.4b" @@ -231,7 +231,7 @@ def test_tensorizer_with_tp_path_without_template(vllm_runner, capfd): ) in combined_output -@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs") +@pytest.mark.skipif(torch.accelerator.device_count() < 2, reason="Requires 2 GPUs") def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs( vllm_runner, tmp_path ): diff --git a/tests/model_executor/test_eagle_quantization.py b/tests/model_executor/test_eagle_quantization.py index 6f0dc55a5..1203aef6a 100644 --- a/tests/model_executor/test_eagle_quantization.py +++ b/tests/model_executor/test_eagle_quantization.py @@ -11,7 +11,7 @@ from vllm.model_executor.models.utils import get_draft_quant_config from vllm.platforms import current_platform DEVICES = ( - [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] + [f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2)] if current_platform.is_cuda_alike() else ["cpu"] ) @@ -61,7 +61,7 @@ def test_fc_layer_quant_config_usage(default_vllm_config, dist_init, device) -> from 
vllm.model_executor.layers.linear import ReplicatedLinear if current_platform.is_cuda_alike(): - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) diff --git a/tests/models/test_vision.py b/tests/models/test_vision.py index 17d82b125..7d03de1ab 100644 --- a/tests/models/test_vision.py +++ b/tests/models/test_vision.py @@ -102,7 +102,7 @@ def run_dp_sharded_vision_model_vs_direct( set_random_seed(0) device = f"{current_platform.device_name}:{local_rank}" - current_platform.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) update_environment_variables( @@ -288,7 +288,7 @@ def run_dp_sharded_mrope_vision_model_vs_direct( # Set random seed for reproducibility set_random_seed(0) device = f"{current_platform.device_name}:{local_rank}" - current_platform.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) update_environment_variables( @@ -365,7 +365,7 @@ def run_dp_sharded_mrope_vision_model_empty_input_worker( """Test run_dp_sharded_mrope_vision_model with empty input.""" # Set up distributed environment device = f"{current_platform.device_name}:{local_rank}" - current_platform.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) update_environment_variables( @@ -414,7 +414,7 @@ def run_dp_sharded_mrope_vision_model_uneven_load_worker( # Set up distributed environment set_random_seed(123) device = f"{current_platform.device_name}:{local_rank}" - current_platform.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) update_environment_variables( diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py index a560494a4..afb0437f5 100644 --- a/tests/quantization/test_quark.py +++ b/tests/quantization/test_quark.py @@ -210,10 +210,9 @@ WIKITEXT_ACCURACY_CONFIGS = [ @pytest.mark.parametrize("config", 
WIKITEXT_ACCURACY_CONFIGS) @pytest.mark.parametrize("tp_size", [1, 2]) def test_ocp_mx_wikitext_correctness(config: AccuracyTestConfig, tp_size: int): - if torch.cuda.device_count() < tp_size: - pytest.skip( - f"This test requires >={tp_size} gpus, got only {torch.cuda.device_count()}" - ) + device_count = torch.accelerator.device_count() + if device_count < tp_size: + pytest.skip(f"This test requires >={tp_size} gpus, got only {device_count}") task = "wikitext" rtol = 0.1 @@ -246,10 +245,9 @@ def test_ocp_mx_wikitext_correctness(config: AccuracyTestConfig, tp_size: int): reason="Read access to huggingface.co/amd is required for this test.", ) def test_mxfp4_gsm8k_correctness(config: AccuracyTestConfig): - if torch.cuda.device_count() < 8: - pytest.skip( - f"This test requires >=8 gpus, got only {torch.cuda.device_count()}" - ) + device_count = torch.accelerator.device_count() + if device_count < 8: + pytest.skip(f"This test requires >=8 gpus, got only {device_count}") task = "gsm8k" rtol = 0.03 diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 8fdca83a2..4695f6f19 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -32,7 +32,7 @@ MTP_SIMILARITY_RATE = 0.8 def _skip_if_insufficient_gpus_for_tp(tp_size: int): """Skip test if available GPUs < tp_size on ROCm.""" - available_gpus = torch.cuda.device_count() + available_gpus = torch.accelerator.device_count() if available_gpus < tp_size: pytest.skip( f"Test requires {tp_size} GPUs, but only {available_gpus} available" diff --git a/tests/v1/kv_connector/unit/test_example_connector.py b/tests/v1/kv_connector/unit/test_example_connector.py index e42f691ea..7e05a0d93 100644 --- a/tests/v1/kv_connector/unit/test_example_connector.py +++ b/tests/v1/kv_connector/unit/test_example_connector.py @@ -148,7 +148,7 @@ def test_shared_storage_connector_hashes(tmp_path, attn_backend): ) # don't put this import at the top level - # it will call 
torch.cuda.device_count() + # it will call torch.accelerator.device_count() from transformers import AutoProcessor # Create processor to handle the chat prompt diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index 10fa4f14f..5dd90eb50 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -1570,7 +1570,7 @@ def test_register_kv_caches( ] ], cache_dtype=torch.bfloat16, - device=torch.cuda.current_device(), + device=torch.accelerator.current_device_index(), kernel_block_sizes=[block_size], ) ) diff --git a/tests/v1/spec_decode/test_acceptance_length.py b/tests/v1/spec_decode/test_acceptance_length.py index 8a6a72781..aa8e40a2d 100644 --- a/tests/v1/spec_decode/test_acceptance_length.py +++ b/tests/v1/spec_decode/test_acceptance_length.py @@ -141,7 +141,7 @@ def get_attention_backend_params() -> list[str]: def get_tp_size_params() -> list[pytest.param]: - num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 1 + num_gpus = torch.accelerator.device_count() if torch.cuda.is_available() else 1 return [pytest.param(tp, id=f"tp{tp}") for tp in TP_SIZES if tp <= num_gpus] diff --git a/tests/v1/worker/test_worker_memory_snapshot.py b/tests/v1/worker/test_worker_memory_snapshot.py index 27a9b4a75..fe8a5a21f 100644 --- a/tests/v1/worker/test_worker_memory_snapshot.py +++ b/tests/v1/worker/test_worker_memory_snapshot.py @@ -117,7 +117,8 @@ def worker_process( @pytest.mark.skipif( - torch.cuda.device_count() < 2, reason="Need at least 2 GPUs for tensor parallelism" + torch.accelerator.device_count() < 2, + reason="Need at least 2 GPUs for tensor parallelism", ) def test_init_distributed_is_called_before_memory_snapshot(): """Test that distributed env is setup before memory snapshot. 
diff --git a/tools/pre_commit/check_torch_cuda.py b/tools/pre_commit/check_torch_cuda.py index 42cb0945b..4099c315e 100644 --- a/tools/pre_commit/check_torch_cuda.py +++ b/tools/pre_commit/check_torch_cuda.py @@ -8,8 +8,8 @@ import regex as re # Regex: match `torch.cuda.xxx` but allow `torch.accelerator.xxx` # --------------------------------------------------------------------------- # _TORCH_CUDA_PATTERNS = [ - r"\btorch\.cuda\.(empty_cache|synchronize|device\()\b", - r"\bwith\btorch\.cuda\.device\b", + r"\btorch\.cuda\.(empty_cache|synchronize|device_count|current_device|set_device|device\()\b", + r"\bwith\s+torch\.cuda\.device\b", ] ALLOWED_FILES = {"vllm/platforms/", "vllm/device_allocator/"} @@ -25,7 +25,9 @@ def scan_file(path: str) -> int: print( f"{path}:{line_num}: " "\033[91merror:\033[0m " # red color - "Found torch.cuda API call" + "Found torch.cuda API call. Please refer to RFC " + "https://github.com/vllm-project/vllm/issues/30679, use " + "torch.accelerator API instead." ) return 1 return 0 diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py index 3efcebd54..97c5faad6 100644 --- a/vllm/distributed/device_communicators/all2all.py +++ b/vllm/distributed/device_communicators/all2all.py @@ -491,7 +491,7 @@ class FlashInferAllToAllManager(All2AllManagerBase): self.initialize( world_size=self.world_size, rank=self.rank, - gpus_per_node=torch.cuda.device_count, + gpus_per_node=torch.accelerator.device_count, ) return self.initialized diff --git a/vllm/distributed/device_communicators/pynccl_allocator.py b/vllm/distributed/device_communicators/pynccl_allocator.py index 0ce307bc5..27445b814 100644 --- a/vllm/distributed/device_communicators/pynccl_allocator.py +++ b/vllm/distributed/device_communicators/pynccl_allocator.py @@ -151,7 +151,7 @@ class nccl_symm_mem_context: self.pynccl_comm = pynccl_comm self._mem_pool_ctx = torch.cuda.use_mem_pool(get_nccl_mem_pool()) self.is_graph_capture =
torch.cuda.is_current_stream_capturing() - self.device = torch.cuda.current_device() + self.device = torch.accelerator.current_device_index() def __enter__(self): if self.disabled: diff --git a/vllm/distributed/device_communicators/symm_mem.py b/vllm/distributed/device_communicators/symm_mem.py index eb1f173b1..98c7ac20a 100644 --- a/vllm/distributed/device_communicators/symm_mem.py +++ b/vllm/distributed/device_communicators/symm_mem.py @@ -50,7 +50,7 @@ class SymmMemCommunicator: device = torch.device(f"cuda:{device}") elif isinstance(device, str): device = torch.device(device) - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) self.dtype = torch.bfloat16 self.device = device self.group = group diff --git a/vllm/distributed/eplb/async_worker.py b/vllm/distributed/eplb/async_worker.py index 5dd862f36..7e753fdbf 100644 --- a/vllm/distributed/eplb/async_worker.py +++ b/vllm/distributed/eplb/async_worker.py @@ -33,7 +33,7 @@ def start_async_worker( def thread_target() -> None: assert device_index is not None - torch.cuda.set_device(device_index) + torch.accelerator.set_device_index(device_index) cuda_stream = torch.cuda.Stream(device=device_index) loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index b417c2b32..863b29f6f 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -314,7 +314,7 @@ class EplbState: if self.device.type == "cuda": self.cuda_device_index = self.device.index if self.cuda_device_index is None and torch.cuda.is_available(): - self.cuda_device_index = torch.cuda.current_device() + self.cuda_device_index = torch.accelerator.current_device_index() @staticmethod def build_initial_global_physical_to_logical_map( diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py 
b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py index 51af1958b..4aacbddb8 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py @@ -483,9 +483,9 @@ def _init_lmcache_engine( ) # Change current device. - num_gpus = torch.cuda.device_count() + num_gpus = torch.accelerator.device_count() local_rank = parallel_config.rank % num_gpus - torch.cuda.set_device(local_rank) + torch.accelerator.set_device_index(local_rank) device = torch.device(f"cuda:{local_rank}") metadata = LMCacheEngineMetadata( model_config.model, diff --git a/vllm/distributed/weight_transfer/ipc_engine.py b/vllm/distributed/weight_transfer/ipc_engine.py index 85dd34553..9b72cfe71 100644 --- a/vllm/distributed/weight_transfer/ipc_engine.py +++ b/vllm/distributed/weight_transfer/ipc_engine.py @@ -169,7 +169,7 @@ class IPCWeightTransferEngine( update_info.shapes, update_info.ipc_handles, ): - device_index = torch.cuda.current_device() + device_index = torch.accelerator.current_device_index() props = torch.cuda.get_device_properties(device_index) physical_gpu_id = str(props.uuid) @@ -242,7 +242,7 @@ class IPCWeightTransferEngine( args = trainer_args # Get physical GPU UUID - device_index = torch.cuda.current_device() + device_index = torch.accelerator.current_device_index() props = torch.cuda.get_device_properties(device_index) gpu_uuid = str(props.uuid) diff --git a/vllm/distributed/weight_transfer/nccl_engine.py b/vllm/distributed/weight_transfer/nccl_engine.py index e8a1091b9..3d97fafb2 100644 --- a/vllm/distributed/weight_transfer/nccl_engine.py +++ b/vllm/distributed/weight_transfer/nccl_engine.py @@ -140,13 +140,14 @@ class NCCLWeightTransferEngine( worker_rank = dp_rank * world_size_per_dp + rank_within_dp rank = worker_rank + init_info.rank_offset # Create stateless process group + device = torch.accelerator.current_device_index() 
self.model_update_group = ( NCCLWeightTransferEngine._stateless_init_process_group( init_info.master_address, init_info.master_port, rank, init_info.world_size, - torch.cuda.current_device(), + device=device, ) ) @@ -275,7 +276,7 @@ class NCCLWeightTransferEngine( Initialize NCCL process group for trainer-side weight transfer. The trainer is always rank 0 in the process group. Uses the current - CUDA device (torch.cuda.current_device()). + CUDA device (torch.accelerator.current_device_index()). Args: init_info: Either an NCCLWeightTransferInitInfo object or a dict with keys: @@ -309,8 +310,13 @@ class NCCLWeightTransferEngine( world_size = init_info.world_size # Trainer is always rank 0 + device = torch.accelerator.current_device_index() return NCCLWeightTransferEngine._stateless_init_process_group( - master_address, master_port, 0, world_size, torch.cuda.current_device() + master_address, + master_port, + 0, + world_size, + device, ) @staticmethod diff --git a/vllm/model_executor/layers/attention/static_sink_attention.py b/vllm/model_executor/layers/attention/static_sink_attention.py index fe8dc7e34..60419f967 100644 --- a/vllm/model_executor/layers/attention/static_sink_attention.py +++ b/vllm/model_executor/layers/attention/static_sink_attention.py @@ -190,7 +190,7 @@ class StaticSinkAttention(Attention, CustomOp): sink_kv_slot_mapping = torch.arange( self.block_size, self.sink_len + self.block_size, - device=torch.cuda.current_device(), + device=torch.accelerator.current_device_index(), dtype=torch.long, ) triton_reshape_and_cache_flash_diffkv( diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py index 512b71284..db97a5374 100644 --- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -295,14 +295,17 @@ class DefaultMoERunner(MoERunner): states_shape = (moe.max_num_tokens, 
self.moe_config.hidden_dim) logits_shape = (moe.max_num_tokens, self.moe_config.num_logical_experts) + device = torch.accelerator.current_device_index() self.batched_hidden_states = torch.zeros( - states_shape, dtype=moe.in_dtype, device=torch.cuda.current_device() + states_shape, + dtype=moe.in_dtype, + device=device, ) self.batched_router_logits = torch.zeros( logits_shape, dtype=moe.router_logits_dtype, - device=torch.cuda.current_device(), + device=device, ) def must_reduce_shared_expert_outputs(self) -> bool: diff --git a/vllm/model_executor/layers/fused_moe/trtllm_moe.py b/vllm/model_executor/layers/fused_moe/trtllm_moe.py index 5160840a2..3f256ca21 100644 --- a/vllm/model_executor/layers/fused_moe/trtllm_moe.py +++ b/vllm/model_executor/layers/fused_moe/trtllm_moe.py @@ -28,7 +28,7 @@ class TrtLlmGenExperts(mk.FusedMoEExpertsModular): max_capture_size, ): super().__init__(moe_config, quant_config) - self.device = torch.cuda.current_device() + self.device = torch.accelerator.current_device_index() self.num_experts = moe_config.num_local_experts self.gemm1_alpha = torch.tensor( [1.702] * self.num_experts, dtype=torch.float32, device=self.device diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 2a1180dd6..ecc36556c 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -202,7 +202,7 @@ class RMSNorm(CustomOp): # external Oink initialization work in this case. 
else: try: - device_index = torch.cuda.current_device() + device_index = torch.accelerator.current_device_index() if _oink_ops.is_oink_available_for_device(device_index): self._use_oink_rmsnorm = True self._use_oink_fused_add_rmsnorm = ( diff --git a/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py b/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py index e5dabe035..ec03fc653 100644 --- a/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py +++ b/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py @@ -36,7 +36,8 @@ class DualChunkRotaryEmbedding(CustomOp): self.chunk_size = chunk_size self.local_size = local_size self.dtype = dtype - self.device = torch.device(f"cuda:{torch.cuda.current_device()}") + device_idx = torch.accelerator.current_device_index() + self.device = torch.device(f"cuda:{device_idx}") (q_cache, qc_cache, k_cache, qc_no_clamp_cache, q_inter_cache) = ( self._compute_cos_sin_cache() ) diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 6e8aee8bc..1ff1a448a 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -539,6 +539,8 @@ def deserialize_tensorizer_model( ) before_mem = get_mem_usage() start = time.perf_counter() + device_index = torch.accelerator.current_device_index() + device_type = current_platform.device_type with ( open_stream( tensorizer_config.tensorizer_uri, mode="rb", **tensorizer_args.stream_kwargs @@ -546,9 +548,7 @@ def deserialize_tensorizer_model( TensorDeserializer( stream, dtype=tensorizer_config.dtype, - device=f"xpu:{torch.xpu.current_device()}" - if current_platform.is_xpu() - else f"cuda:{torch.cuda.current_device()}", + device=f"{device_type}:{device_index}", **tensorizer_args.deserialization_kwargs, ) as deserializer, ): diff --git a/vllm/utils/torch_utils.py b/vllm/utils/torch_utils.py index e4aa4fe61..61f863f1d 100644 --- a/vllm/utils/torch_utils.py 
+++ b/vllm/utils/torch_utils.py @@ -624,7 +624,7 @@ def cuda_device_count_stateless() -> int: """Get number of CUDA devices, caching based on the value of CUDA_VISIBLE_DEVICES at the time of call. - This should be used instead of torch.cuda.device_count() + This should be used instead of torch.accelerator.device_count() unless CUDA_VISIBLE_DEVICES has already been set to the desired value.""" diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py index 0150d8863..9a72bc5d3 100644 --- a/vllm/v1/engine/utils.py +++ b/vllm/v1/engine/utils.py @@ -134,7 +134,7 @@ class CoreEngineProcManager: for proc, local_dp_rank in zip(self.processes, local_dp_ranks): # Adjust device control in DP for non-CUDA platforms # as well as external and ray launchers - # For CUDA platforms, we use torch.cuda.set_device() + # For CUDA platforms, we use torch.accelerator.set_device_index() if is_dp and ( not current_platform.is_cuda_alike() or vllm_config.parallel_config.use_ray diff --git a/vllm/v1/worker/gpu_ubatch_wrapper.py b/vllm/v1/worker/gpu_ubatch_wrapper.py index c4cbfff5a..64856052f 100644 --- a/vllm/v1/worker/gpu_ubatch_wrapper.py +++ b/vllm/v1/worker/gpu_ubatch_wrapper.py @@ -73,8 +73,8 @@ class SMControlContextManager: assert current_platform.is_cuda(), ( "SM control is currently only supported on CUDA" ) - - total_sms = num_compute_units(torch.cuda.current_device()) + device = torch.accelerator.current_device_index() + total_sms = num_compute_units(device) assert comm_sms < total_sms self.total_sms = total_sms @@ -204,7 +204,7 @@ class UBatchWrapper: @torch.inference_mode() def _capture_ubatch_thread(results, ubatch_metadata): - torch.cuda.set_device(self.device) + torch.accelerator.set_device_index(self.device) ubatch_context = ubatch_metadata.context with torch.cuda.stream(ubatch_context.compute_stream): _ = torch.cuda.current_blas_handle() diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 842e76549..58e28e694 100644 ---
a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -239,11 +239,11 @@ class Worker(WorkerBase): # DP_LOCAL_RANK * TP_PP_WORLD_SIZE + TP_LOCAL_RANK self.local_rank += dp_local_rank * tp_pp_world_size - assert self.local_rank < torch.cuda.device_count(), ( + assert self.local_rank < torch.accelerator.device_count(), ( f"DP adjusted local rank {self.local_rank} is out of bounds. " ) visible_device_count = ( - torch.cuda.device_count() if torch.cuda.is_available() else 0 + torch.accelerator.device_count() if torch.cuda.is_available() else 0 ) assert self.parallel_config.local_world_size <= visible_device_count, ( f"local_world_size ({self.parallel_config.local_world_size}) must " @@ -252,7 +252,7 @@ class Worker(WorkerBase): ) self.device = torch.device(f"cuda:{self.local_rank}") - current_platform.set_device(self.device) + torch.accelerator.set_device_index(self.device) current_platform.check_if_supports_dtype(self.model_config.dtype) diff --git a/vllm/v1/worker/xpu_worker.py b/vllm/v1/worker/xpu_worker.py index 112a71b37..421105923 100644 --- a/vllm/v1/worker/xpu_worker.py +++ b/vllm/v1/worker/xpu_worker.py @@ -60,7 +60,7 @@ class XPUWorker(Worker): and current_platform.is_xpu() ): self.device = torch.device(f"xpu:{self.local_rank}") - current_platform.set_device(self.device) + torch.accelerator.set_device_index(self.device) current_platform.check_if_supports_dtype(self.model_config.dtype) torch.accelerator.empty_cache() self.init_gpu_memory = torch.xpu.get_device_properties(