diff --git a/benchmarks/attention_benchmarks/mla_runner.py b/benchmarks/attention_benchmarks/mla_runner.py index 110f580fb..3c1ca4b3d 100644 --- a/benchmarks/attention_benchmarks/mla_runner.py +++ b/benchmarks/attention_benchmarks/mla_runner.py @@ -757,7 +757,7 @@ def _run_mla_benchmark_batched( backend_cfg = _get_backend_config(backend) device = torch.device(configs_with_params[0][0].device) - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) # Determine block size config_block_size = configs_with_params[0][0].block_size diff --git a/benchmarks/attention_benchmarks/runner.py b/benchmarks/attention_benchmarks/runner.py index 7f968cfec..52286186d 100644 --- a/benchmarks/attention_benchmarks/runner.py +++ b/benchmarks/attention_benchmarks/runner.py @@ -443,7 +443,7 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult: BenchmarkResult with timing and memory statistics """ device = torch.device(config.device) - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) backend_cfg = _get_backend_config(config.backend) diff --git a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py index 58ccfcc45..3f80b024e 100644 --- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py +++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py @@ -64,7 +64,7 @@ def bench_run( per_out_ch: bool, mkn: tuple[int, int, int], ): - init_workspace_manager(torch.cuda.current_device()) + init_workspace_manager(torch.accelerator.current_device_index()) (m, k, n) = mkn dtype = torch.half diff --git a/benchmarks/kernels/benchmark_device_communicators.py b/benchmarks/kernels/benchmark_device_communicators.py index 9b5ccac4e..24e22023b 100644 --- a/benchmarks/kernels/benchmark_device_communicators.py +++ b/benchmarks/kernels/benchmark_device_communicators.py @@ -495,7 +495,7 @@ def main(): # Set device device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) + 
torch.accelerator.set_device_index(device) # Get CPU process group cpu_group = dist.new_group(backend="gloo") diff --git a/benchmarks/kernels/benchmark_fused_collective.py b/benchmarks/kernels/benchmark_fused_collective.py index 2547f553f..05b842d7e 100644 --- a/benchmarks/kernels/benchmark_fused_collective.py +++ b/benchmarks/kernels/benchmark_fused_collective.py @@ -392,7 +392,7 @@ def benchmark_operation( num_op_per_cudagraph = 10 # Use vLLM's graph_capture to make tensor_model_parallel_all_reduce graph-safe - device = torch.device(f"cuda:{torch.cuda.current_device()}") + device = torch.device(f"cuda:{torch.accelerator.current_device_index()}") with graph_capture(device=device), torch.cuda.graph(graph): for _ in range(num_op_per_cudagraph): operation_func(*args, **kwargs) @@ -984,7 +984,7 @@ def main(): world_size = int(os.environ["WORLD_SIZE"]) device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) init_distributed_environment() diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py index 039eb2f29..dd4060bbd 100644 --- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py +++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -50,7 +50,7 @@ def bench_run( per_out_ch: bool, mkn: tuple[int, int, int], ): - init_workspace_manager(torch.cuda.current_device()) + init_workspace_manager(torch.accelerator.current_device_index()) label = "Quant Matmul" sub_label = ( diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py index ceae12e98..36dce1b63 100644 --- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py +++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py @@ -285,7 +285,7 @@ def tune_on_gpu(args_dict): weight_shapes = args_dict["weight_shapes"] args = args_dict["args"] - torch.cuda.set_device(gpu_id) + torch.accelerator.set_device_index(gpu_id) 
print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}") block_n = args.block_n @@ -334,7 +334,7 @@ def distribute_batch_sizes(batch_sizes, num_gpus): def main(args): print(args) - num_gpus = torch.cuda.device_count() + num_gpus = torch.accelerator.device_count() if num_gpus == 0: raise RuntimeError("No GPU available for tuning") print(f"Found {num_gpus} GPUs for parallel tuning") diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md index 0aa89a89e..8ea241c58 100644 --- a/docs/configuration/conserving_memory.md +++ b/docs/configuration/conserving_memory.md @@ -15,7 +15,7 @@ llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2) ``` !!! warning - To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. [torch.cuda.set_device][]) + To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. [torch.accelerator.set_device_index][]) before initializing vLLM. Otherwise, you may run into an error like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`. To control which devices are used, please instead set the `CUDA_VISIBLE_DEVICES` environment variable. 
diff --git a/docs/usage/troubleshooting.md b/docs/usage/troubleshooting.md index bced53936..dc1cd89f8 100644 --- a/docs/usage/troubleshooting.md +++ b/docs/usage/troubleshooting.md @@ -91,8 +91,8 @@ If GPU/CPU communication cannot be established, you can use the following Python import torch import torch.distributed as dist dist.init_process_group(backend="nccl") - local_rank = dist.get_rank() % torch.cuda.device_count() - torch.cuda.set_device(local_rank) + local_rank = dist.get_rank() % torch.accelerator.device_count() + torch.accelerator.set_device_index(local_rank) data = torch.FloatTensor([1,] * 128).to("cuda") dist.all_reduce(data, op=dist.ReduceOp.SUM) torch.accelerator.synchronize() @@ -337,7 +337,7 @@ import vllm import torch print(f"CUDA available: {torch.cuda.is_available()}") -print(f"CUDA device count: {torch.cuda.device_count()}") +print(f"CUDA device count: {torch.accelerator.device_count()}") EOF ``` diff --git a/examples/online_serving/new_weight_syncing/rlhf_http_ipc.py b/examples/online_serving/new_weight_syncing/rlhf_http_ipc.py index d73eba64c..1a6a96d9c 100644 --- a/examples/online_serving/new_weight_syncing/rlhf_http_ipc.py +++ b/examples/online_serving/new_weight_syncing/rlhf_http_ipc.py @@ -106,7 +106,7 @@ def main(): # IPC requires the training model to be on the same GPU as the vLLM server # The server should be started on GPU 0 with reduced memory utilization device = "cuda:0" - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) # Load the training model on the same GPU as the server # Use bfloat16 to reduce memory footprint diff --git a/examples/online_serving/new_weight_syncing/rlhf_http_nccl.py b/examples/online_serving/new_weight_syncing/rlhf_http_nccl.py index b8a6b180a..afc4cda2e 100644 --- a/examples/online_serving/new_weight_syncing/rlhf_http_nccl.py +++ b/examples/online_serving/new_weight_syncing/rlhf_http_nccl.py @@ -131,7 +131,7 @@ def main(): inference_world_size = get_world_size(BASE_URL) world_size 
= inference_world_size + 1 # +1 for the trainer device = f"cuda:{inference_world_size}" - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) # Load the training model print(f"Loading training model: {MODEL_NAME}") diff --git a/tests/compile/passes/distributed/test_async_tp.py b/tests/compile/passes/distributed/test_async_tp.py index abc71768c..7edceee98 100644 --- a/tests/compile/passes/distributed/test_async_tp.py +++ b/tests/compile/passes/distributed/test_async_tp.py @@ -300,7 +300,7 @@ def async_tp_pass_on_test_model( set_random_seed(0) device = torch.device(f"cuda:{local_rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) torch.set_default_dtype(dtype) diff --git a/tests/compile/passes/distributed/test_fusion_all_reduce.py b/tests/compile/passes/distributed/test_fusion_all_reduce.py index 4beac8c4f..fe50081e5 100644 --- a/tests/compile/passes/distributed/test_fusion_all_reduce.py +++ b/tests/compile/passes/distributed/test_fusion_all_reduce.py @@ -262,7 +262,7 @@ def all_reduce_fusion_pass_on_test_model( set_random_seed(0) device = torch.device(f"cuda:{local_rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) torch.set_default_dtype(dtype) diff --git a/tests/compile/passes/distributed/test_sequence_parallelism.py b/tests/compile/passes/distributed/test_sequence_parallelism.py index a0fe717ba..e7bf330cc 100644 --- a/tests/compile/passes/distributed/test_sequence_parallelism.py +++ b/tests/compile/passes/distributed/test_sequence_parallelism.py @@ -228,7 +228,7 @@ def sequence_parallelism_pass_on_test_model( set_random_seed(0) device = torch.device(f"cuda:{local_rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) torch.set_default_dtype(dtype) diff --git a/tests/conftest.py b/tests/conftest.py index 4b907b7dd..719bfa5ed 100644 --- 
a/tests/conftest.py +++ b/tests/conftest.py @@ -428,7 +428,7 @@ class HfRunner: ) # don't put this import at the top level - # it will call torch.cuda.device_count() + # it will call torch.accelerator.device_count() from transformers import AutoProcessor self.processor = AutoProcessor.from_pretrained( @@ -1535,7 +1535,7 @@ def clean_gpu_memory_between_tests(): from tests.utils import wait_for_gpu_memory_to_clear - num_gpus = torch.cuda.device_count() + num_gpus = torch.accelerator.device_count() if num_gpus > 0: try: wait_for_gpu_memory_to_clear( diff --git a/tests/cuda/scripts/check_device_count_respects_env.py b/tests/cuda/scripts/check_device_count_respects_env.py index 1d218e483..e43c13aa4 100644 --- a/tests/cuda/scripts/check_device_count_respects_env.py +++ b/tests/cuda/scripts/check_device_count_respects_env.py @@ -14,7 +14,7 @@ import torch # noqa: E402 from vllm.platforms import current_platform # noqa: F401, E402 os.environ["CUDA_VISIBLE_DEVICES"] = "0" -count = torch.cuda.device_count() +count = torch.accelerator.device_count() if count == 0: sys.exit(0) # Skip: no GPUs available diff --git a/tests/distributed/eplb_utils.py b/tests/distributed/eplb_utils.py index 7c27347fd..215aff32d 100644 --- a/tests/distributed/eplb_utils.py +++ b/tests/distributed/eplb_utils.py @@ -42,7 +42,7 @@ def set_env_vars_and_device(env: dict[str, str]) -> None: update_environment_variables(env) local_rank = os.environ["LOCAL_RANK"] device = torch.device(f"cuda:{local_rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) # Create a minimal vllm config for init_distributed_environment vllm_config = VllmConfig() diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py index ce4c9c24e..2804c95d3 100644 --- a/tests/distributed/test_comm_ops.py +++ b/tests/distributed/test_comm_ops.py @@ -43,7 +43,7 @@ def all_reduce_test_worker( monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") 
- torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) num_elements = 8 all_tensors = [ @@ -69,7 +69,7 @@ def reduce_scatter_test_worker( # they will be able to set the device to the correct GPU monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) num_elements = 8 @@ -100,7 +100,7 @@ def all_gather_test_worker( # they will be able to set the device to the correct GPU monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) num_dimensions = 3 tensor_size = list(range(2, num_dimensions + 2)) @@ -134,7 +134,7 @@ def broadcast_tensor_dict_test_worker( # they will be able to set the device to the correct GPU monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) test_dict = { # device tensor @@ -171,7 +171,7 @@ def send_recv_tensor_dict_test_worker( ): monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) test_dict = { @@ -317,7 +317,7 @@ def send_recv_test_worker( ): monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) init_test_distributed_environment(tp_size, 
pp_size, rank, distributed_init_port) size = 64 diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index 5008c4de0..edddb6ec8 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -35,7 +35,7 @@ def graph_allreduce( m.delenv("CUDA_VISIBLE_DEVICES", raising=False) m.delenv("HIP_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) ensure_model_parallel_initialized(tp_size, pp_size) group = get_tp_group().device_group @@ -62,12 +62,10 @@ def graph_allreduce( for dtype in [torch.float32, torch.float16, torch.bfloat16]: with graph_capture(device=device) as graph_capture_context: # use integers so result matches NCCL exactly - inp1 = torch.randint( - 1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device() - ) - inp2 = torch.randint( - 1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device() - ) + device_idx = torch.accelerator.current_device_index() + inp1 = torch.randint(1, 16, (sz,), dtype=dtype, device=device_idx) + inp2 = torch.randint(1, 16, (sz,), dtype=dtype, device=device_idx) + torch.accelerator.synchronize() graph = torch.cuda.CUDAGraph() with torch.cuda.graph(graph, stream=graph_capture_context.stream): @@ -95,7 +93,7 @@ def eager_allreduce( m.delenv("CUDA_VISIBLE_DEVICES", raising=False) m.delenv("HIP_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) # we use the first group to communicate once @@ -129,6 +127,6 @@ def test_custom_allreduce( test_target, ): world_size = tp_size * pipeline_parallel_size - if world_size > torch.cuda.device_count(): + if world_size > 
torch.accelerator.device_count(): pytest.skip("Not enough GPUs to run the test.") multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size, test_target) diff --git a/tests/distributed/test_eplb_execute.py b/tests/distributed/test_eplb_execute.py index 674a665b0..50c7e6538 100644 --- a/tests/distributed/test_eplb_execute.py +++ b/tests/distributed/test_eplb_execute.py @@ -442,7 +442,7 @@ def test_rearrange_expert_weights_with_redundancy( ): """Test the functionality of rearranging expert weights with redundancy.""" - if torch.cuda.device_count() < world_size: + if torch.accelerator.device_count() < world_size: pytest.skip(f"Need at least {world_size} GPUs to run the test") distributed_run( _test_rearrange_expert_weights_with_redundancy, @@ -528,7 +528,7 @@ def test_async_transfer_layer_without_mtp( ): """Exercise async EPLB transfer path without MTP/spec decode.""" - if torch.cuda.device_count() < world_size: + if torch.accelerator.device_count() < world_size: pytest.skip(f"Need at least {world_size} GPUs to run the test") distributed_run( @@ -547,7 +547,7 @@ def test_rearrange_expert_weights_no_change(world_size): unchanged. 
""" - if torch.cuda.device_count() < world_size: + if torch.accelerator.device_count() < world_size: pytest.skip(f"Need at least {world_size} GPUs to run the test") distributed_run(_test_rearrange_expert_weights_no_change, world_size) @@ -623,6 +623,6 @@ def _test_rearrange_expert_weights_profile_mode(env, world_size) -> None: def test_rearrange_expert_weights_profile_mode(world_size): """Test profile mode (should not copy actual weights)""" - if torch.cuda.device_count() < world_size: + if torch.accelerator.device_count() < world_size: pytest.skip(f"Need at least {world_size} GPUs to run the test") distributed_run(_test_rearrange_expert_weights_profile_mode, world_size) diff --git a/tests/distributed/test_eplb_fused_moe_layer.py b/tests/distributed/test_eplb_fused_moe_layer.py index 55f265198..eacdb3abc 100644 --- a/tests/distributed/test_eplb_fused_moe_layer.py +++ b/tests/distributed/test_eplb_fused_moe_layer.py @@ -257,7 +257,7 @@ def test_eplb_fml( intermediate_size: int, column_major_scales: bool, ): - if torch.cuda.device_count() < world_size: + if torch.accelerator.device_count() < world_size: pytest.skip(f"Need at least {world_size} GPUs to run the test") num_local_experts = num_experts // world_size diff --git a/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py b/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py index 951b692e1..68b2407c2 100644 --- a/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py +++ b/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py @@ -253,7 +253,7 @@ def test_eplb_fml( monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1") monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", backend) - if torch.cuda.device_count() < world_size: + if torch.accelerator.device_count() < world_size: pytest.skip(f"Need at least {world_size} GPUs to run the test") num_local_experts = num_experts // world_size diff --git a/tests/distributed/test_nccl_symm_mem_allreduce.py b/tests/distributed/test_nccl_symm_mem_allreduce.py index 
b81624fe1..420bf631d 100644 --- a/tests/distributed/test_nccl_symm_mem_allreduce.py +++ b/tests/distributed/test_nccl_symm_mem_allreduce.py @@ -38,7 +38,7 @@ def nccl_symm_mem_allreduce_worker(local_rank: int, world_size: int): m.delenv("CUDA_VISIBLE_DEVICES", raising=False) dtype = torch.bfloat16 device = torch.device(f"cuda:{local_rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) torch.set_default_dtype(dtype) update_environment_variables( @@ -84,7 +84,7 @@ def nccl_symm_mem_allreduce_worker(local_rank: int, world_size: int): @pytest.mark.parametrize("world_size", [2]) @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA") def test_nccl_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch, world_size): - if world_size > torch.cuda.device_count(): + if world_size > torch.accelerator.device_count(): pytest.skip("Not enough GPUs to run the test.") # Enable SymmMemCommunicator diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index 3b5b45aa0..a1d5355d4 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -54,7 +54,7 @@ def worker_fn_wrapper(fn): update_environment_variables(env) local_rank = os.environ["LOCAL_RANK"] device = torch.device(f"cuda:{local_rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) init_distributed_environment() fn() @@ -73,7 +73,7 @@ def worker_fn(): @pytest.mark.skipif( - torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test." + torch.accelerator.device_count() < 2, reason="Need at least 2 GPUs to run the test." ) def test_pynccl(): distributed_run(worker_fn, 2) @@ -102,7 +102,7 @@ def multiple_allreduce_worker_fn(): @pytest.mark.skipif( - torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test." + torch.accelerator.device_count() < 4, reason="Need at least 4 GPUs to run the test." 
) def test_pynccl_multiple_allreduce(): # this tests pynccl for multiple tp groups, in a standalone way @@ -130,7 +130,7 @@ def multiple_allreduce_with_vllm_worker_fn(): @pytest.mark.skipif( - torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test." + torch.accelerator.device_count() < 4, reason="Need at least 4 GPUs to run the test." ) def test_pynccl_multiple_allreduce_with_vllm(): # this tests pynccl for multiple tp groups, together with vllm @@ -185,7 +185,7 @@ def all_gather_worker_fn(): @pytest.mark.skipif( - torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test." + torch.accelerator.device_count() < 2, reason="Need at least 2 GPUs to run the test." ) def test_pynccl_all_gather(): distributed_run(all_gather_worker_fn, 2) @@ -220,7 +220,7 @@ def all_gatherv_worker_fn(): @pytest.mark.skipif( - torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test." + torch.accelerator.device_count() < 2, reason="Need at least 2 GPUs to run the test." ) def test_pynccl_all_gatherv(): distributed_run(all_gatherv_worker_fn, 2) @@ -260,7 +260,7 @@ def reduce_scatter_worker_fn(): @pytest.mark.skipif( - torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test." + torch.accelerator.device_count() < 2, reason="Need at least 2 GPUs to run the test." ) def test_pynccl_reduce_scatter(): distributed_run(reduce_scatter_worker_fn, 2) @@ -298,14 +298,14 @@ def reduce_scatterv_worker_fn(): @pytest.mark.skipif( - torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test." + torch.accelerator.device_count() < 2, reason="Need at least 2 GPUs to run the test." ) def test_pynccl_reduce_scatterv(): distributed_run(reduce_scatterv_worker_fn, 2) @pytest.mark.skipif( - torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test." + torch.accelerator.device_count() < 2, reason="Need at least 2 GPUs to run the test." 
) def test_pynccl_with_cudagraph(): distributed_run(worker_fn_with_cudagraph, 2) @@ -330,7 +330,7 @@ def send_recv_worker_fn(): @pytest.mark.skipif( - torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test." + torch.accelerator.device_count() < 2, reason="Need at least 2 GPUs to run the test." ) def test_pynccl_send_recv(): distributed_run(send_recv_worker_fn, 2) @@ -363,14 +363,14 @@ def multiple_send_recv_worker_fn(): @pytest.mark.skipif( - torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test." + torch.accelerator.device_count() < 4, reason="Need at least 4 GPUs to run the test." ) def test_pynccl_multiple_send_recv(): distributed_run(multiple_send_recv_worker_fn, 4) @pytest.mark.skipif( - torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test." + torch.accelerator.device_count() < 4, reason="Need at least 4 GPUs to run the test." ) def test_pynccl_broadcast(): distributed_run(broadcast_worker_fn, 4) diff --git a/tests/distributed/test_quick_all_reduce.py b/tests/distributed/test_quick_all_reduce.py index 5af3101a9..9fbc4e0e9 100644 --- a/tests/distributed/test_quick_all_reduce.py +++ b/tests/distributed/test_quick_all_reduce.py @@ -39,7 +39,7 @@ def graph_quickreduce( with monkeypatch.context() as m: m.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) ensure_model_parallel_initialized(tp_size, pp_size) group = get_tp_group().device_group @@ -65,12 +65,10 @@ def graph_quickreduce( for sz in test_sizes: for dtype in [torch.float16, torch.bfloat16]: with graph_capture(device=device) as graph_capture_context: - inp1 = torch.randint( - 1, 23, (sz,), dtype=dtype, device=torch.cuda.current_device() - ) - inp2 = torch.randint( - -23, 1, (sz,), dtype=dtype, device=torch.cuda.current_device() - ) + device_idx = 
torch.accelerator.current_device_index() + inp1 = torch.randint(1, 23, (sz,), dtype=dtype, device=device_idx) + inp2 = torch.randint(-23, 1, (sz,), dtype=dtype, device=device_idx) + torch.accelerator.synchronize() graph = torch.cuda.CUDAGraph() with torch.cuda.graph(graph, stream=graph_capture_context.stream): @@ -95,7 +93,7 @@ def eager_quickreduce( with monkeypatch.context() as m: m.delenv("CUDA_VISIBLE_DEVICES", raising=False) device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) @@ -130,7 +128,7 @@ def test_custom_quick_allreduce( quant_mode, ): world_size = tp_size * pipeline_parallel_size - if world_size > torch.cuda.device_count(): + if world_size > torch.accelerator.device_count(): pytest.skip("Not enough GPUs to run the test.") monkeypatch.setenv("VLLM_ROCM_QUICK_REDUCE_QUANTIZATION", quant_mode) @@ -145,7 +143,7 @@ def qr_variable_input(rank, world_size): has been observed with the gpt_oss model). """ device = torch.device(f"cuda:{rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) qr_max_size = None # MB _ptr = ops.init_custom_qr(rank, world_size, qr_max_size) ranks = [] @@ -169,14 +167,13 @@ def qr_variable_input(rank, world_size): s1 = 1024 while num < 50000: # 50000 is sufficient to identify issues. 
dtype = torch.float16 + device_idx = torch.accelerator.current_device_index() if num % 2 == 0: s2 = 1024 - inp1 = torch.zeros( - (s1, s2), dtype=dtype, device=torch.cuda.current_device() - ) + inp1 = torch.zeros((s1, s2), dtype=dtype, device=device_idx) else: s2 = 2048 - inp1 = torch.ones((s1, s2), dtype=dtype, device=torch.cuda.current_device()) + inp1 = torch.ones((s1, s2), dtype=dtype, device=device_idx) result = torch.empty_like(inp1) # FP = 0 INT8 = 1 INT6 = 2 INT4 = 3 NONE = 4 ops.qr_all_reduce(_ptr, inp1, result, 3, cast_bf2half=True) @@ -198,7 +195,7 @@ def qr_variable_input(rank, world_size): @pytest.mark.parametrize("pipeline_parallel_size", [1]) def test_custom_quick_allreduce_variable_input(tp_size, pipeline_parallel_size): world_size = tp_size * pipeline_parallel_size - if world_size > torch.cuda.device_count(): + if world_size > torch.accelerator.device_count(): pytest.skip("Not enough GPUs to run the test.") multiprocessing.set_start_method("spawn", force=True) diff --git a/tests/distributed/test_symm_mem_allreduce.py b/tests/distributed/test_symm_mem_allreduce.py index b8f04cf8e..6750aa788 100644 --- a/tests/distributed/test_symm_mem_allreduce.py +++ b/tests/distributed/test_symm_mem_allreduce.py @@ -39,7 +39,7 @@ def symm_mem_allreduce_worker(local_rank: int, world_size: int, q: mp.Queue): m.delenv("CUDA_VISIBLE_DEVICES", raising=False) dtype = torch.bfloat16 device = torch.device(f"cuda:{local_rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) torch.set_default_dtype(dtype) update_environment_variables( @@ -105,7 +105,7 @@ def test_symm_mem_allreduce( monkeypatch: pytest.MonkeyPatch, tp_size, pipeline_parallel_size ): world_size = tp_size * pipeline_parallel_size - if world_size > torch.cuda.device_count(): + if world_size > torch.accelerator.device_count(): pytest.skip("Not enough GPUs to run the test.") q = mp.get_context("spawn").Queue() mp.spawn(symm_mem_allreduce_worker, 
args=(world_size, q), nprocs=world_size) @@ -126,7 +126,7 @@ def test_symm_mem_allreduce( @pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA") def test_dp_with_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch): world_size = 4 - if world_size > torch.cuda.device_count(): + if world_size > torch.accelerator.device_count(): pytest.skip("Not enough GPUs to run the test.") # Verify that the DataParallel runs without error engine_args = EngineArgs( diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py index c2fea7c1d..784918642 100644 --- a/tests/distributed/test_utils.py +++ b/tests/distributed/test_utils.py @@ -66,7 +66,7 @@ def cpu_worker(rank, WORLD_SIZE, port1, port2): def gpu_worker(rank, WORLD_SIZE, port1, port2): - torch.cuda.set_device(rank) + torch.accelerator.set_device_index(rank) pg1 = StatelessProcessGroup.create( host="127.0.0.1", port=port1, rank=rank, world_size=WORLD_SIZE ) diff --git a/tests/distributed/test_weight_transfer.py b/tests/distributed/test_weight_transfer.py index def1e1dfd..1309edf5a 100644 --- a/tests/distributed/test_weight_transfer.py +++ b/tests/distributed/test_weight_transfer.py @@ -203,7 +203,7 @@ class TestEngineRegistry: def test_nccl_receive_weights_without_init_raises(): """Test that receive_weights raises if init_transfer_engine wasn't called.""" - if torch.cuda.device_count() < 1: + if torch.accelerator.device_count() < 1: pytest.skip("Need at least 1 GPU for this test") config = WeightTransferConfig(backend="nccl") @@ -336,7 +336,7 @@ def inference_receive_tensor( @pytest.mark.skipif( - torch.cuda.device_count() < 2, + torch.accelerator.device_count() < 2, reason="Need at least 2 GPUs to run NCCL weight transfer test.", ) def test_nccl_weight_transfer_between_processes(): @@ -382,7 +382,7 @@ class TestIPCWeightTransferUpdateInfoValidation: def test_valid_update_info(self): """Test creating valid IPCWeightTransferUpdateInfo.""" - if torch.cuda.device_count() < 1: + 
if torch.accelerator.device_count() < 1: pytest.skip("Need at least 1 GPU for this test") # Create a dummy tensor and IPC handle @@ -404,7 +404,7 @@ class TestIPCWeightTransferUpdateInfoValidation: def test_mismatched_dtype_names_raises(self): """Test that mismatched dtype_names length raises ValueError.""" - if torch.cuda.device_count() < 1: + if torch.accelerator.device_count() < 1: pytest.skip("Need at least 1 GPU for this test") dummy_tensor = torch.ones(10, 10, device="cuda:0") @@ -422,7 +422,7 @@ class TestIPCWeightTransferUpdateInfoValidation: def test_mismatched_shapes_raises(self): """Test that mismatched shapes length raises ValueError.""" - if torch.cuda.device_count() < 1: + if torch.accelerator.device_count() < 1: pytest.skip("Need at least 1 GPU for this test") dummy_tensor = torch.ones(10, 10, device="cuda:0") @@ -440,7 +440,7 @@ class TestIPCWeightTransferUpdateInfoValidation: def test_mismatched_ipc_handles_raises(self): """Test that mismatched ipc_handles length raises ValueError.""" - if torch.cuda.device_count() < 1: + if torch.accelerator.device_count() < 1: pytest.skip("Need at least 1 GPU for this test") dummy_tensor = torch.ones(10, 10, device="cuda:0") @@ -458,7 +458,7 @@ class TestIPCWeightTransferUpdateInfoValidation: def test_valid_update_info_from_pickled(self, monkeypatch): """Test creating IPCWeightTransferUpdateInfo from pickled handles.""" - if torch.cuda.device_count() < 1: + if torch.accelerator.device_count() < 1: pytest.skip("Need at least 1 GPU for this test") monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") @@ -493,7 +493,7 @@ class TestIPCWeightTransferUpdateInfoValidation: def test_both_handles_and_pickled_raises(self): """Test that providing both ipc_handles and ipc_handles_pickled raises.""" - if torch.cuda.device_count() < 1: + if torch.accelerator.device_count() < 1: pytest.skip("Need at least 1 GPU for this test") dummy_tensor = torch.ones(10, 10, device="cuda:0") @@ -540,7 +540,7 @@ class 
TestIPCEngineParsing: def test_parse_update_info_valid(self): """Test parsing valid update info dict.""" - if torch.cuda.device_count() < 1: + if torch.accelerator.device_count() < 1: pytest.skip("Need at least 1 GPU for this test") config = WeightTransferConfig(backend="ipc") @@ -572,7 +572,7 @@ class TestIPCEngineParsing: def test_parse_update_info_pickled(self, monkeypatch): """Test parsing update info with pickled IPC handles (HTTP path).""" - if torch.cuda.device_count() < 1: + if torch.accelerator.device_count() < 1: pytest.skip("Need at least 1 GPU for this test") monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") @@ -731,7 +731,7 @@ def inference_receive_ipc_tensor( @pytest.mark.skipif( - torch.cuda.device_count() < 1, + torch.accelerator.device_count() < 1, reason="Need at least 1 GPU to run IPC weight transfer test.", ) @pytest.mark.parametrize("mode", ["ray", "http"]) @@ -789,7 +789,7 @@ def test_ipc_weight_transfer_between_processes(mode: str): def test_ipc_receive_weights_missing_gpu_uuid_raises(): """Test that receive_weights raises if GPU UUID not found in IPC handles.""" - if torch.cuda.device_count() < 1: + if torch.accelerator.device_count() < 1: pytest.skip("Need at least 1 GPU for this test") config = WeightTransferConfig(backend="ipc") diff --git a/tests/entrypoints/llm/test_collective_rpc.py b/tests/entrypoints/llm/test_collective_rpc.py index 747676ac9..d66455889 100644 --- a/tests/entrypoints/llm/test_collective_rpc.py +++ b/tests/entrypoints/llm/test_collective_rpc.py @@ -13,7 +13,7 @@ from ...utils import create_new_process_for_each_test @pytest.mark.parametrize("backend", ["mp", "ray"]) @create_new_process_for_each_test() def test_collective_rpc(tp_size, backend, monkeypatch): - if torch.cuda.device_count() < tp_size: + if torch.accelerator.device_count() < tp_size: pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") if tp_size == 1 and backend == "ray": pytest.skip("Skip duplicate test case") diff --git 
a/tests/entrypoints/weight_transfer/test_weight_transfer_llm.py b/tests/entrypoints/weight_transfer/test_weight_transfer_llm.py index 255bca444..7d6d330aa 100644 --- a/tests/entrypoints/weight_transfer/test_weight_transfer_llm.py +++ b/tests/entrypoints/weight_transfer/test_weight_transfer_llm.py @@ -106,7 +106,7 @@ def mock_create_engine(config, parallel_config): @create_new_process_for_each_test() def test_get_world_size_tp1(): """Test world_size is correctly configured for TP=1.""" - if torch.cuda.device_count() < 1: + if torch.accelerator.device_count() < 1: pytest.skip("Need at least 1 GPU for this test") llm = LLM( @@ -125,7 +125,7 @@ def test_get_world_size_tp1(): def test_init_weight_transfer_engine_calls_engine(): """Test that init_weight_transfer_engine calls the engine's init_transfer_engine method.""" - if torch.cuda.device_count() < 1: + if torch.accelerator.device_count() < 1: pytest.skip("Need at least 1 GPU for this test") # Run in-process so mock.patch works (spawn won't inherit the mock) @@ -174,7 +174,7 @@ def test_init_weight_transfer_engine_calls_engine(): @create_new_process_for_each_test() def test_update_weights_calls_engine(): """Test that update_weights calls the engine's receive_weights method.""" - if torch.cuda.device_count() < 1: + if torch.accelerator.device_count() < 1: pytest.skip("Need at least 1 GPU for this test") # Run in-process so mock.patch works (spawn won't inherit the mock) @@ -233,7 +233,7 @@ def test_update_weights_calls_engine(): @create_new_process_for_each_test() def test_full_weight_transfer_flow(): """Test the complete weight transfer flow: init -> update.""" - if torch.cuda.device_count() < 1: + if torch.accelerator.device_count() < 1: pytest.skip("Need at least 1 GPU for this test") # Run in-process so mock.patch works (spawn won't inherit the mock) @@ -294,7 +294,7 @@ def test_full_weight_transfer_flow(): @create_new_process_for_each_test() def test_weight_transfer_config_backend(): """Test that 
WeightTransferConfig backend is properly configured.""" - if torch.cuda.device_count() < 1: + if torch.accelerator.device_count() < 1: pytest.skip("Need at least 1 GPU for this test") # Test with nccl backend diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py index a14b80b32..9ddceef8f 100644 --- a/tests/kernels/attention/test_attention.py +++ b/tests/kernels/attention/test_attention.py @@ -36,7 +36,9 @@ BLOCK_SIZES = [16, 32] USE_ALIBI = [False, True] KV_CACHE_DTYPE = ["auto", "fp8"] SEEDS = [0] -CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2) +] def ref_masked_attention( diff --git a/tests/kernels/attention/test_cache.py b/tests/kernels/attention/test_cache.py index 7c60a8a14..0249461dd 100644 --- a/tests/kernels/attention/test_cache.py +++ b/tests/kernels/attention/test_cache.py @@ -35,7 +35,9 @@ NUM_BLOCKS = [1024, 10000] NUM_MAPPINGS = [256] # Arbitrary values for testing SEEDS = [0] -CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2) +] # We assume fp8 is always enabled for testing. KV_CACHE_DTYPE = ["auto", "fp8"] @@ -69,7 +71,7 @@ def test_reshape_and_cache( pytest.skip() set_random_seed(seed) torch.set_default_device(device) - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) # Create a random slot mapping. 
num_slots = block_size * num_blocks slot_mapping_lst = random.sample(range(num_slots), num_tokens) @@ -192,7 +194,7 @@ def test_reshape_and_cache_flash( ) -> None: set_random_seed(seed) torch.set_default_device(device) - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) assert implementation in ["cuda", "triton"] if implementation == "triton" and kv_cache_layout == "HND": pytest.skip("Triton implementation only supports NHD layout.") @@ -553,7 +555,7 @@ def test_concat_and_cache_mla( ) -> None: set_random_seed(seed) torch.set_default_device(device) - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) total_slots = num_blocks * block_size slot_mapping_lst = random.sample(range(total_slots), num_tokens) @@ -632,7 +634,7 @@ def test_concat_and_cache_ds_mla( kv_cache_dtype = "fp8_ds_mla" set_random_seed(seed) torch.set_default_device(device) - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) total_slots = num_blocks * block_size slot_mapping_lst = random.sample(range(total_slots), num_tokens) @@ -744,7 +746,7 @@ def test_swap_blocks_mla( ) -> None: set_random_seed(seed) torch.set_default_device(device) - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) entry_size = kv_lora_rank + qk_rope_head_dim diff --git a/tests/kernels/attention/test_cutlass_mla_decode.py b/tests/kernels/attention/test_cutlass_mla_decode.py index 1f2fb66b3..33bd36058 100644 --- a/tests/kernels/attention/test_cutlass_mla_decode.py +++ b/tests/kernels/attention/test_cutlass_mla_decode.py @@ -69,7 +69,7 @@ def test_cutlass_mla_decode( init_dtype = torch.bfloat16 if torch_dtype == torch.float8_e4m3fn else torch_dtype torch.set_default_dtype(init_dtype) torch.set_default_device(device) - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) torch.manual_seed(42) random.seed(42) diff --git a/tests/kernels/attention/test_flashmla.py b/tests/kernels/attention/test_flashmla.py index 
6b3d3485d..657b256f4 100644 --- a/tests/kernels/attention/test_flashmla.py +++ b/tests/kernels/attention/test_flashmla.py @@ -57,7 +57,7 @@ def test_flash_mla( init_dtype = torch.bfloat16 if torch_dtype == torch.float8_e4m3fn else torch_dtype torch.set_default_dtype(init_dtype) torch.set_default_device(device) - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) torch.manual_seed(0) random.seed(0) diff --git a/tests/kernels/attention/test_prefix_prefill.py b/tests/kernels/attention/test_prefix_prefill.py index 7aeeaf8b4..de63b4548 100644 --- a/tests/kernels/attention/test_prefix_prefill.py +++ b/tests/kernels/attention/test_prefix_prefill.py @@ -21,7 +21,9 @@ NUM_HEADS = [64] NUM_QUERIES_PER_KV = [1, 64] HEAD_SIZES = [24, 128] DTYPES = [torch.float16] -CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2) +] SLIDING_WINDOW = [0, 16, 2048] KV_CACHE_DTYPES = ["auto", "fp8", "fp8_e5m2"] @@ -135,7 +137,7 @@ def test_contexted_kv_attention( # for GPU 1 would run on both GPU0 and GPU1 and things would hang # # see also similar issue: https://github.com/Dao-AILab/flash-attention/issues/523 - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) MAX_SEQ_LEN = 1024 MAX_CTX_LEN = 1024 @@ -356,7 +358,7 @@ def test_contexted_kv_attention_alibi( # for GPU 1 would run on both GPU0 and GPU1 and things would hang # # see also similar issue: https://github.com/Dao-AILab/flash-attention/issues/523 - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: # Fork from: vllm/vllm/model_executor/models/bloom.py#L44 diff --git a/tests/kernels/core/test_activation.py b/tests/kernels/core/test_activation.py index 66727a309..e7de77312 100644 --- a/tests/kernels/core/test_activation.py +++ b/tests/kernels/core/test_activation.py @@ -26,7 
+26,9 @@ DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing D = [512, 13824] # Arbitrary values for testing SEEDS = [0] -CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2) +] @pytest.mark.parametrize( diff --git a/tests/kernels/core/test_fused_quant_layernorm.py b/tests/kernels/core/test_fused_quant_layernorm.py index b7e6ce386..fe06605af 100644 --- a/tests/kernels/core/test_fused_quant_layernorm.py +++ b/tests/kernels/core/test_fused_quant_layernorm.py @@ -33,7 +33,9 @@ SCALE_UBS = [True, False] GROUP_SIZES = [None, [1, 64], [1, 128]] TMA_ALIGNMENTS = [0, 4] SEEDS = [0] -CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2) +] EPS = 1e-6 @@ -182,7 +184,7 @@ def test_rms_norm( if torch.cuda.is_available(): torch.cuda.manual_seed(seed) torch.set_default_device(device) - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) if group_size is not None and hidden_size % group_size[1] != 0: # skip diff --git a/tests/kernels/core/test_layernorm.py b/tests/kernels/core/test_layernorm.py index 2dca0da07..f8f966094 100644 --- a/tests/kernels/core/test_layernorm.py +++ b/tests/kernels/core/test_layernorm.py @@ -14,7 +14,9 @@ NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing HIDDEN_SIZES = [8, 768, 769, 5120, 5125, 8192] # Arbitrary values for testing ADD_RESIDUAL = [False, True] SEEDS = [0] -CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2) +] @pytest.mark.parametrize("num_tokens", NUM_TOKENS) diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py index 
5094a29c5..3a750b743 100644 --- a/tests/kernels/core/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -19,7 +19,9 @@ NUM_HEADS = [17] # Arbitrary values for testing BATCH_SIZES = [5] # Arbitrary values for testing SEQ_LENS = [11, 8192] # Arbitrary values for testing SEEDS = [0] -CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2) +] USE_KEY = [True, False] diff --git a/tests/kernels/core/test_rotary_embedding_mla_cache_fused.py b/tests/kernels/core/test_rotary_embedding_mla_cache_fused.py index a8781afd8..181f10f31 100644 --- a/tests/kernels/core/test_rotary_embedding_mla_cache_fused.py +++ b/tests/kernels/core/test_rotary_embedding_mla_cache_fused.py @@ -28,7 +28,8 @@ from vllm.utils.torch_utils import set_random_seed @pytest.mark.parametrize("block_size", [16, 64, 256]) @pytest.mark.parametrize("seed", [0]) @pytest.mark.parametrize( - "device", [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] + "device", + [f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2)], ) @torch.inference_mode() def test_concat_and_cache_mla_rope_fused( diff --git a/tests/kernels/core/test_uva.py b/tests/kernels/core/test_uva.py index f4a0296d8..7c2561250 100644 --- a/tests/kernels/core/test_uva.py +++ b/tests/kernels/core/test_uva.py @@ -6,7 +6,9 @@ import torch from vllm.utils.platform_utils import is_uva_available from vllm.utils.torch_utils import get_accelerator_view_from_cpu_tensor -CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2) +] @pytest.mark.skipif(not is_uva_available(), reason="UVA is not available.") diff --git a/tests/kernels/mamba/test_mamba_mixer2.py b/tests/kernels/mamba/test_mamba_mixer2.py index 322e717e9..973e7885c 100644 --- 
a/tests/kernels/mamba/test_mamba_mixer2.py +++ b/tests/kernels/mamba/test_mamba_mixer2.py @@ -71,7 +71,7 @@ def mixer2_gated_norm_tensor_parallel( set_random_seed(0) device = torch.device(f"cuda:{local_rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) torch.set_default_dtype(dtype) diff --git a/tests/kernels/moe/modular_kernel_tools/common.py b/tests/kernels/moe/modular_kernel_tools/common.py index 4b2b1653b..6f9abc607 100644 --- a/tests/kernels/moe/modular_kernel_tools/common.py +++ b/tests/kernels/moe/modular_kernel_tools/common.py @@ -322,7 +322,7 @@ class WeightTensors: ) def to_current_device(self): - device = torch.cuda.current_device() + device = torch.accelerator.current_device_index() self.w1 = self.w1.to(device=device) self.w2 = self.w2.to(device=device) @@ -392,7 +392,8 @@ class RankTensors: Return hidden_states """ m, k, dtype = (config.M, config.K, config.dtype) - a = torch.randn((m, k), device=torch.cuda.current_device(), dtype=dtype) / 15.0 + device = torch.accelerator.current_device_index() + a = torch.randn((m, k), device=device, dtype=dtype) / 15.0 if config.quant_dtype is None: return a, None @@ -428,9 +429,10 @@ class RankTensors: topk_weights, topk_ids, _ = fused_topk(hidden_states, score, topk, False) # distribute topk_ids evenly + device = torch.accelerator.current_device_index() for mi in range(m): topk_ids[mi] = torch.randperm(config.E)[:topk] - topk_ids = topk_ids.to(device=torch.cuda.current_device()) + topk_ids = topk_ids.to(device=device) expert_map = None if config.world_size > 1 and config.supports_expert_map(): @@ -440,9 +442,7 @@ class RankTensors: s = pgi.rank * num_local_experts e = s + num_local_experts expert_map[s:e] = torch.tensor(list(range(num_local_experts))) - expert_map = expert_map.to( - device=torch.cuda.current_device(), dtype=torch.int32 - ) + expert_map = expert_map.to(device=device, dtype=torch.int32) return RankTensors( hidden_states=hidden_states, 
@@ -558,7 +558,9 @@ def reference_moe_impl( def _make_gscale(num_experts: int) -> torch.Tensor: return torch.ones( - (num_experts,), device=torch.cuda.current_device(), dtype=torch.float32 + (num_experts,), + device=torch.accelerator.current_device_index(), + dtype=torch.float32, ) diff --git a/tests/kernels/moe/modular_kernel_tools/parallel_utils.py b/tests/kernels/moe/modular_kernel_tools/parallel_utils.py index 8528ee0cd..3ff2ce3b3 100644 --- a/tests/kernels/moe/modular_kernel_tools/parallel_utils.py +++ b/tests/kernels/moe/modular_kernel_tools/parallel_utils.py @@ -66,7 +66,7 @@ def _worker_parallel_launch( **kwargs: P.kwargs, ) -> None: rank = node_rank * world_local_size + local_rank - torch.cuda.set_device(local_rank) + torch.accelerator.set_device_index(local_rank) device = torch.device("cuda", local_rank) torch.distributed.init_process_group( backend="cpu:gloo,cuda:nccl", diff --git a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py index 9f0f9f2ea..95442103b 100644 --- a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py +++ b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py @@ -34,7 +34,8 @@ def do_profile( record_shapes=True, ) as tprof: fn(**fn_kwargs) - torch.accelerator.synchronize(torch.cuda.current_device()) + device = torch.accelerator.current_device_index() + torch.accelerator.synchronize(device=device) # TODO (varun): Add a descriptive trace file name tprof.export_chrome_trace( diff --git a/tests/kernels/moe/parallel_utils.py b/tests/kernels/moe/parallel_utils.py index 90728c1e3..525e3e67b 100644 --- a/tests/kernels/moe/parallel_utils.py +++ b/tests/kernels/moe/parallel_utils.py @@ -52,7 +52,7 @@ def _worker_parallel_launch( **kwargs: P.kwargs, ) -> None: rank = node_rank * world_local_size + local_rank - torch.cuda.set_device(local_rank) + torch.accelerator.set_device_index(local_rank) device = torch.device("cuda", local_rank) 
torch.distributed.init_process_group( backend="cpu:gloo,cuda:nccl", diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index a01fb1a45..b9404975e 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -134,10 +134,8 @@ class TestTensors: fp8_info = torch.finfo(torch.float8_e4m3fn) fp8_max, fp8_min = fp8_info.max, fp8_info.min - - rank_tokens = ( - torch.randn((m, k), device=torch.cuda.current_device(), dtype=dtype) / 10.0 - ) + device = torch.accelerator.current_device_index() + rank_tokens = torch.randn((m, k), device=device, dtype=dtype) / 10.0 rank_tokens = rank_tokens.clamp(min=fp8_min, max=fp8_max) rank_token_scales = None @@ -145,11 +143,13 @@ class TestTensors: low=0, high=config.num_experts, size=(m, topk), - device=torch.cuda.current_device(), + device=device, ).to(dtype=torch.int64) topk_weights = torch.randn( - topk_ids.shape, dtype=torch.float32, device=torch.cuda.current_device() + topk_ids.shape, + dtype=torch.float32, + device=device, ) return TestTensors( @@ -296,7 +296,8 @@ def deepep_deepgemm_moe_impl( s = pgi.rank * num_local_experts e = s + num_local_experts expert_map[s:e] = torch.tensor(list(range(num_local_experts))) - return expert_map.to(device=torch.cuda.current_device(), dtype=torch.int32) + device = torch.accelerator.current_device_index() + return expert_map.to(device=device, dtype=torch.int32) quant_config = fp8_w8a8_moe_quant_config( w1_scale=w1_scale, @@ -376,10 +377,11 @@ def _test_deepep_deepgemm_moe( set_random_seed(pgi.rank) - w1 = w1.to(device=torch.cuda.current_device()) - w2 = w2.to(device=torch.cuda.current_device()) - w1_scale = w1_scale.to(device=torch.cuda.current_device()) - w2_scale = w2_scale.to(device=torch.cuda.current_device()) + device = torch.accelerator.current_device_index() + w1 = w1.to(device=device) + w2 = w2.to(device=device) + w1_scale = w1_scale.to(device=device) + w2_scale = 
w2_scale.to(device=device) pg = torch.distributed.new_group(list(range(pgi.world_size))) test_tensors = TestTensors.make(config, pgi.rank) diff --git a/tests/kernels/moe/test_deepep_moe.py b/tests/kernels/moe/test_deepep_moe.py index 362b71a40..28bb83107 100644 --- a/tests/kernels/moe/test_deepep_moe.py +++ b/tests/kernels/moe/test_deepep_moe.py @@ -210,7 +210,8 @@ def deep_ep_moe_impl( s = pgi.rank * num_local_experts e = s + num_local_experts expert_map[s:e] = torch.tensor(list(range(num_local_experts))) - return expert_map.to(device=torch.cuda.current_device(), dtype=torch.int32) + device = torch.accelerator.current_device_index() + return expert_map.to(device=device, dtype=torch.int32) hidden_size = test_tensors.rank_tokens.size(1) is_quantized = w1.dtype == torch.float8_e4m3fn @@ -365,15 +366,13 @@ def _deep_ep_moe( ) is_quantized = w1.dtype == torch.float8_e4m3fn - w1 = w1.to(device=torch.cuda.current_device()) - w2 = w2.to(device=torch.cuda.current_device()) + device_idx = torch.accelerator.current_device_index() + w1 = w1.to(device=device_idx) + w2 = w2.to(device=device_idx) if is_quantized: - w1_scale = w1_scale.to( # type: ignore - device=torch.cuda.current_device() - ) - w2_scale = w2_scale.to( # type: ignore - device=torch.cuda.current_device() - ) + assert w1_scale is not None and w2_scale is not None + w1_scale = w1_scale.to(device=device_idx) + w2_scale = w2_scale.to(device=device_idx) pg = torch.distributed.new_group(list(range(pgi.world_size))) test_tensors = TestTensors.make(config, low_latency_mode) diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index 43bdd03cf..84483fea8 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -716,7 +716,7 @@ def test_mixtral_moe( monkeypatch.setenv("MASTER_ADDR", "localhost") monkeypatch.setenv("MASTER_PORT", "12345") init_distributed_environment() - init_workspace_manager(torch.cuda.current_device()) + 
init_workspace_manager(torch.accelerator.current_device_index()) # Instantiate our and huggingface's MoE blocks vllm_config.compilation_config.static_forward_context = dict() diff --git a/tests/kernels/moe/test_ocp_mx_moe.py b/tests/kernels/moe/test_ocp_mx_moe.py index 73502932d..cf9021663 100644 --- a/tests/kernels/moe/test_ocp_mx_moe.py +++ b/tests/kernels/moe/test_ocp_mx_moe.py @@ -71,10 +71,10 @@ def enable_pickle(monkeypatch): ) @pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE, reason="amd-quark>=0.9 is not available") def test_mxfp4_loading_and_execution_moe(vllm_runner, model_case: ModelCase): - if torch.cuda.device_count() < model_case.tp: + if torch.accelerator.device_count() < model_case.tp: pytest.skip( f"This test requires >={model_case.tp} gpus, got only " - f"{torch.cuda.device_count()}" + f"{torch.accelerator.device_count()}" ) # `cudagraph_capture_sizes=[16]` to reduce load time. diff --git a/tests/kernels/quantization/test_cutlass_2of4_sparse.py b/tests/kernels/quantization/test_cutlass_2of4_sparse.py index cfdb36580..ccccc79cb 100644 --- a/tests/kernels/quantization/test_cutlass_2of4_sparse.py +++ b/tests/kernels/quantization/test_cutlass_2of4_sparse.py @@ -15,7 +15,9 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( ) from vllm.platforms import current_platform -CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2) +] capability = current_platform.get_device_capability() capability = capability[0] * 10 + capability[1] diff --git a/tests/kernels/quantization/test_cutlass_scaled_mm.py b/tests/kernels/quantization/test_cutlass_scaled_mm.py index bc4744df7..a8adec49a 100644 --- a/tests/kernels/quantization/test_cutlass_scaled_mm.py +++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py @@ -40,7 +40,9 @@ MNK_FACTORS = [ (512, 24576, 128), ] -CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if 
torch.cuda.device_count() == 1 else 2)] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2) +] # -1 means full extent in that dimension TENSORWISE_GROUP_SHAPE = (-1, -1) diff --git a/tests/kernels/quantization/test_machete_mm.py b/tests/kernels/quantization/test_machete_mm.py index 7f4ce2a08..62d0ba4f1 100644 --- a/tests/kernels/quantization/test_machete_mm.py +++ b/tests/kernels/quantization/test_machete_mm.py @@ -29,7 +29,9 @@ if current_platform.is_rocm(): allow_module_level=True, ) -CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2) +] # TODO: in future PR refactor this and `is_quant_method_supported` in the kernel # unit tests to a common utility function. Currently the use of diff --git a/tests/kernels/test_cache_kernels.py b/tests/kernels/test_cache_kernels.py index 4cc8e3b14..25402fe03 100644 --- a/tests/kernels/test_cache_kernels.py +++ b/tests/kernels/test_cache_kernels.py @@ -13,7 +13,7 @@ except ImportError: ) -@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Need CUDA device") +@pytest.mark.skipif(torch.accelerator.device_count() < 1, reason="Need CUDA device") def test_gather_cache_oob(): """ Tests for OOB read in gather_and_maybe_dequant_cache (Issue #27909). 
diff --git a/tests/kernels/test_fused_quant_activation.py b/tests/kernels/test_fused_quant_activation.py index 2170b0200..2670f224d 100644 --- a/tests/kernels/test_fused_quant_activation.py +++ b/tests/kernels/test_fused_quant_activation.py @@ -13,7 +13,9 @@ QUANT_DTYPES = [current_platform.fp8_dtype()] NUM_TOKENS = [1, 17, 86, 1234, 3045] # Arbitrary values for testing HIDDEN_SIZES = [16, 48, 128, 1562, 4096] # Arbitrary values for testing SEEDS = [0] -CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2) +] def ref_impl( diff --git a/tests/lora/test_fused_moe_lora_kernel.py b/tests/lora/test_fused_moe_lora_kernel.py index f3c3cb8cf..66a985a06 100644 --- a/tests/lora/test_fused_moe_lora_kernel.py +++ b/tests/lora/test_fused_moe_lora_kernel.py @@ -638,7 +638,7 @@ def use_fused_moe_lora_kernel_tensor_parallel( set_random_seed(seed) device = torch.device(f"cuda:{local_rank}") - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) torch.set_default_dtype(dtype) diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index d3c1f3deb..08fd03724 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -61,7 +61,7 @@ pytestmark = pytest.mark.skipif( ) DEVICES = ( - [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] + [f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2)] if current_platform.is_cuda_alike() else ["cpu"] ) @@ -260,7 +260,7 @@ def test_embeddings( # device, see: https://github.com/triton-lang/triton/issues/2925 # Same below. 
if current_platform.is_cuda_alike(): - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) max_loras = 8 @@ -359,7 +359,7 @@ def test_lm_head_logits_processor( default_vllm_config, dist_init, num_loras, device, vocab_size, stage ) -> None: if current_platform.is_cuda_alike(): - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) max_loras = 8 @@ -476,7 +476,7 @@ def test_lm_head_logits_processor_invalid_vocab_size( ) -> None: """Test that LogitsProcessorWithLoRA raises ValueError for invalid vocab sizes.""" if current_platform.is_cuda_alike(): - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) max_loras = 8 @@ -505,7 +505,7 @@ def test_linear_replicated( stage, ) -> None: if current_platform.is_cuda_alike(): - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) max_loras = 8 torch.set_default_device(device) @@ -612,7 +612,7 @@ def test_linear_parallel( default_vllm_config, dist_init, num_loras, orientation, fully_shard, device, stage ) -> None: if current_platform.is_cuda_alike(): - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) max_loras = 8 torch.set_default_device(device) @@ -737,7 +737,7 @@ def test_column_parallel_packed( default_vllm_config, dist_init, num_loras, repeats, fully_shard, device, stage ) -> None: if current_platform.is_cuda_alike(): - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) max_loras = 8 torch.set_default_device(device) @@ -885,7 +885,7 @@ def test_merged_column_parallel_variable_slice( default_vllm_config, dist_init, num_loras, num_slices, device, stage ) -> None: if current_platform.is_cuda_alike(): - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) max_loras = 8 torch.set_default_device(device) diff --git a/tests/lora/test_lora_manager.py 
b/tests/lora/test_lora_manager.py index c37780ec6..d2a7cd155 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -37,7 +37,7 @@ EMBEDDING_MODULES = { DEVICES = ( - [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] + [f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2)] if current_platform.is_cuda_alike() else ["cpu"] ) diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index 12c73f2d7..3868bff79 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -34,7 +34,7 @@ def do_sample( def test_mixtral_lora(mixtral_lora_files, tp_size): """Original test, the LoRA model has the common target modules, not all""" if ( - torch.cuda.device_count() < tp_size + torch.accelerator.device_count() < tp_size and tp_size > 1 and current_platform.is_cuda_alike() ): diff --git a/tests/lora/test_punica_ops.py b/tests/lora/test_punica_ops.py index 82db7fece..8a2634e82 100644 --- a/tests/lora/test_punica_ops.py +++ b/tests/lora/test_punica_ops.py @@ -395,7 +395,7 @@ def test_kernels( Tests LoRA kernels. """ torch.set_default_device(device) - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) set_random_seed(seed) if op_type == "shrink": @@ -448,7 +448,7 @@ def test_kernels_hidden_size( Tests SGMV and LoRA kernels. 
""" torch.set_default_device(device) - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) set_random_seed(seed) if op_type == "shrink": diff --git a/tests/model_executor/model_loader/tensorizer_loader/test_tensorizer.py b/tests/model_executor/model_loader/tensorizer_loader/test_tensorizer.py index 610f69c8d..3b950c843 100644 --- a/tests/model_executor/model_loader/tensorizer_loader/test_tensorizer.py +++ b/tests/model_executor/model_loader/tensorizer_loader/test_tensorizer.py @@ -203,7 +203,7 @@ def test_raise_value_error_on_invalid_load_format(vllm_runner, capfd, model_ref) torch.accelerator.empty_cache() -@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs") +@pytest.mark.skipif(torch.accelerator.device_count() < 2, reason="Requires 2 GPUs") def test_tensorizer_with_tp_path_without_template(vllm_runner, capfd): try: model_ref = "EleutherAI/pythia-1.4b" @@ -231,7 +231,7 @@ def test_tensorizer_with_tp_path_without_template(vllm_runner, capfd): ) in combined_output -@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs") +@pytest.mark.skipif(torch.accelerator.device_count() < 2, reason="Requires 2 GPUs") def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs( vllm_runner, tmp_path ): diff --git a/tests/model_executor/test_eagle_quantization.py b/tests/model_executor/test_eagle_quantization.py index 6f0dc55a5..1203aef6a 100644 --- a/tests/model_executor/test_eagle_quantization.py +++ b/tests/model_executor/test_eagle_quantization.py @@ -11,7 +11,7 @@ from vllm.model_executor.models.utils import get_draft_quant_config from vllm.platforms import current_platform DEVICES = ( - [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)] + [f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2)] if current_platform.is_cuda_alike() else ["cpu"] ) @@ -61,7 +61,7 @@ def test_fc_layer_quant_config_usage(default_vllm_config, dist_init, device) -> from 
vllm.model_executor.layers.linear import ReplicatedLinear if current_platform.is_cuda_alike(): - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) diff --git a/tests/models/test_vision.py b/tests/models/test_vision.py index 17d82b125..7d03de1ab 100644 --- a/tests/models/test_vision.py +++ b/tests/models/test_vision.py @@ -102,7 +102,7 @@ def run_dp_sharded_vision_model_vs_direct( set_random_seed(0) device = f"{current_platform.device_name}:{local_rank}" - current_platform.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) update_environment_variables( @@ -288,7 +288,7 @@ def run_dp_sharded_mrope_vision_model_vs_direct( # Set random seed for reproducibility set_random_seed(0) device = f"{current_platform.device_name}:{local_rank}" - current_platform.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) update_environment_variables( @@ -365,7 +365,7 @@ def run_dp_sharded_mrope_vision_model_empty_input_worker( """Test run_dp_sharded_mrope_vision_model with empty input.""" # Set up distributed environment device = f"{current_platform.device_name}:{local_rank}" - current_platform.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) update_environment_variables( @@ -414,7 +414,7 @@ def run_dp_sharded_mrope_vision_model_uneven_load_worker( # Set up distributed environment set_random_seed(123) device = f"{current_platform.device_name}:{local_rank}" - current_platform.set_device(device) + torch.accelerator.set_device_index(device) torch.set_default_device(device) update_environment_variables( diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py index a560494a4..afb0437f5 100644 --- a/tests/quantization/test_quark.py +++ b/tests/quantization/test_quark.py @@ -210,10 +210,9 @@ WIKITEXT_ACCURACY_CONFIGS = [ @pytest.mark.parametrize("config", 
WIKITEXT_ACCURACY_CONFIGS) @pytest.mark.parametrize("tp_size", [1, 2]) def test_ocp_mx_wikitext_correctness(config: AccuracyTestConfig, tp_size: int): - if torch.cuda.device_count() < tp_size: - pytest.skip( - f"This test requires >={tp_size} gpus, got only {torch.cuda.device_count()}" - ) + device_count = torch.accelerator.device_count() + if device_count < tp_size: + pytest.skip(f"This test requires >={tp_size} gpus, got only {device_count}") task = "wikitext" rtol = 0.1 @@ -246,10 +245,9 @@ def test_ocp_mx_wikitext_correctness(config: AccuracyTestConfig, tp_size: int): reason="Read access to huggingface.co/amd is required for this test.", ) def test_mxfp4_gsm8k_correctness(config: AccuracyTestConfig): - if torch.cuda.device_count() < 8: - pytest.skip( - f"This test requires >=8 gpus, got only {torch.cuda.device_count()}" - ) + device_count = torch.accelerator.device_count() + if device_count < 8: + pytest.skip(f"This test requires >=8 gpus, got only {device_count}") task = "gsm8k" rtol = 0.03 diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 8fdca83a2..4695f6f19 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -32,7 +32,7 @@ MTP_SIMILARITY_RATE = 0.8 def _skip_if_insufficient_gpus_for_tp(tp_size: int): """Skip test if available GPUs < tp_size on ROCm.""" - available_gpus = torch.cuda.device_count() + available_gpus = torch.accelerator.device_count() if available_gpus < tp_size: pytest.skip( f"Test requires {tp_size} GPUs, but only {available_gpus} available" diff --git a/tests/v1/kv_connector/unit/test_example_connector.py b/tests/v1/kv_connector/unit/test_example_connector.py index e42f691ea..7e05a0d93 100644 --- a/tests/v1/kv_connector/unit/test_example_connector.py +++ b/tests/v1/kv_connector/unit/test_example_connector.py @@ -148,7 +148,7 @@ def test_shared_storage_connector_hashes(tmp_path, attn_backend): ) # don't put this import at the top level - # it will call 
torch.cuda.device_count() + # it will call torch.accelerator.device_count() from transformers import AutoProcessor # Create processor to handle the chat prompt diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index 10fa4f14f..5dd90eb50 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -1570,7 +1570,7 @@ def test_register_kv_caches( ] ], cache_dtype=torch.bfloat16, - device=torch.cuda.current_device(), + device=torch.accelerator.current_device_index(), kernel_block_sizes=[block_size], ) ) diff --git a/tests/v1/spec_decode/test_acceptance_length.py b/tests/v1/spec_decode/test_acceptance_length.py index 8a6a72781..aa8e40a2d 100644 --- a/tests/v1/spec_decode/test_acceptance_length.py +++ b/tests/v1/spec_decode/test_acceptance_length.py @@ -141,7 +141,7 @@ def get_attention_backend_params() -> list[str]: def get_tp_size_params() -> list[pytest.param]: - num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 1 + num_gpus = torch.accelerator.device_count() if torch.cuda.is_available() else 1 return [pytest.param(tp, id=f"tp{tp}") for tp in TP_SIZES if tp <= num_gpus] diff --git a/tests/v1/worker/test_worker_memory_snapshot.py b/tests/v1/worker/test_worker_memory_snapshot.py index 27a9b4a75..fe8a5a21f 100644 --- a/tests/v1/worker/test_worker_memory_snapshot.py +++ b/tests/v1/worker/test_worker_memory_snapshot.py @@ -117,7 +117,8 @@ def worker_process( @pytest.mark.skipif( - torch.cuda.device_count() < 2, reason="Need at least 2 GPUs for tensor parallelism" + torch.accelerator.device_count() < 2, + reason="Need at least 2 GPUs for tensor parallelism", ) def test_init_distributed_is_called_before_memory_snapshot(): """Test that distributed env is setup before memory snapshot. 
diff --git a/tools/pre_commit/check_torch_cuda.py b/tools/pre_commit/check_torch_cuda.py index 42cb0945b..4099c315e 100644 --- a/tools/pre_commit/check_torch_cuda.py +++ b/tools/pre_commit/check_torch_cuda.py @@ -8,8 +8,8 @@ import regex as re # Regex: match `torch.cuda.xxx` but allow `torch.accelerator.xxx` # --------------------------------------------------------------------------- # _TORCH_CUDA_PATTERNS = [ - r"\btorch\.cuda\.(empty_cache|synchronize|device\()\b", - r"\bwith\btorch\.cuda\.device\b", + r"\btorch\.cuda\.(empty_cache|synchronize|device_count|current_device|set_device|device\()\b", + r"\bwith\s+torch\.cuda\.device\b", ] ALLOWED_FILES = {"vllm/platforms/", "vllm/device_allocator/"} @@ -25,7 +25,9 @@ def scan_file(path: str) -> int: print( f"{path}:{line_num}: " "\033[91merror:\033[0m " # red color - "Found torch.cuda API call" + "Found torch.cuda API call. Please refer to RFC " + "https://github.com/vllm-project/vllm/issues/30679, use " + "torch.accelerator API instead." ) return 1 return 0 diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py index 3efcebd54..97c5faad6 100644 --- a/vllm/distributed/device_communicators/all2all.py +++ b/vllm/distributed/device_communicators/all2all.py @@ -491,7 +491,7 @@ class FlashInferAllToAllManager(All2AllManagerBase): self.initialize( world_size=self.world_size, rank=self.rank, - gpus_per_node=torch.cuda.device_count, + gpus_per_node=torch.accelerator.device_count, ) return self.initialized diff --git a/vllm/distributed/device_communicators/pynccl_allocator.py b/vllm/distributed/device_communicators/pynccl_allocator.py index 0ce307bc5..27445b814 100644 --- a/vllm/distributed/device_communicators/pynccl_allocator.py +++ b/vllm/distributed/device_communicators/pynccl_allocator.py @@ -151,7 +151,7 @@ class nccl_symm_mem_context: self.pynccl_comm = pynccl_comm self._mem_pool_ctx = torch.cuda.use_mem_pool(get_nccl_mem_pool()) self.is_graph_capture =
torch.cuda.is_current_stream_capturing() - self.device = torch.cuda.current_device() + self.device = torch.accelerator.current_device_index() def __enter__(self): if self.disabled: diff --git a/vllm/distributed/device_communicators/symm_mem.py b/vllm/distributed/device_communicators/symm_mem.py index eb1f173b1..98c7ac20a 100644 --- a/vllm/distributed/device_communicators/symm_mem.py +++ b/vllm/distributed/device_communicators/symm_mem.py @@ -50,7 +50,7 @@ class SymmMemCommunicator: device = torch.device(f"cuda:{device}") elif isinstance(device, str): device = torch.device(device) - torch.cuda.set_device(device) + torch.accelerator.set_device_index(device) self.dtype = torch.bfloat16 self.device = device self.group = group diff --git a/vllm/distributed/eplb/async_worker.py b/vllm/distributed/eplb/async_worker.py index 5dd862f36..7e753fdbf 100644 --- a/vllm/distributed/eplb/async_worker.py +++ b/vllm/distributed/eplb/async_worker.py @@ -33,7 +33,7 @@ def start_async_worker( def thread_target() -> None: assert device_index is not None - torch.cuda.set_device(device_index) + torch.accelerator.set_device_index(device_index) cuda_stream = torch.cuda.Stream(device=device_index) loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index b417c2b32..863b29f6f 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -314,7 +314,7 @@ class EplbState: if self.device.type == "cuda": self.cuda_device_index = self.device.index if self.cuda_device_index is None and torch.cuda.is_available(): - self.cuda_device_index = torch.cuda.current_device() + self.cuda_device_index = torch.accelerator.current_device_index() @staticmethod def build_initial_global_physical_to_logical_map( diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py 
b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py index 51af1958b..4aacbddb8 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py @@ -483,9 +483,9 @@ def _init_lmcache_engine( ) # Change current device. - num_gpus = torch.cuda.device_count() + num_gpus = torch.accelerator.device_count() local_rank = parallel_config.rank % num_gpus - torch.cuda.set_device(local_rank) + torch.accelerator.set_device_index(local_rank) device = torch.device(f"cuda:{local_rank}") metadata = LMCacheEngineMetadata( model_config.model, diff --git a/vllm/distributed/weight_transfer/ipc_engine.py b/vllm/distributed/weight_transfer/ipc_engine.py index 85dd34553..9b72cfe71 100644 --- a/vllm/distributed/weight_transfer/ipc_engine.py +++ b/vllm/distributed/weight_transfer/ipc_engine.py @@ -169,7 +169,7 @@ class IPCWeightTransferEngine( update_info.shapes, update_info.ipc_handles, ): - device_index = torch.cuda.current_device() + device_index = torch.accelerator.current_device_index() props = torch.cuda.get_device_properties(device_index) physical_gpu_id = str(props.uuid) @@ -242,7 +242,7 @@ class IPCWeightTransferEngine( args = trainer_args # Get physical GPU UUID - device_index = torch.cuda.current_device() + device_index = torch.accelerator.current_device_index() props = torch.cuda.get_device_properties(device_index) gpu_uuid = str(props.uuid) diff --git a/vllm/distributed/weight_transfer/nccl_engine.py b/vllm/distributed/weight_transfer/nccl_engine.py index e8a1091b9..3d97fafb2 100644 --- a/vllm/distributed/weight_transfer/nccl_engine.py +++ b/vllm/distributed/weight_transfer/nccl_engine.py @@ -140,13 +140,14 @@ class NCCLWeightTransferEngine( worker_rank = dp_rank * world_size_per_dp + rank_within_dp rank = worker_rank + init_info.rank_offset # Create stateless process group + device = torch.accelerator.current_device_index() 
self.model_update_group = ( NCCLWeightTransferEngine._stateless_init_process_group( init_info.master_address, init_info.master_port, rank, init_info.world_size, - torch.cuda.current_device(), + device=device, ) ) @@ -275,7 +276,7 @@ class NCCLWeightTransferEngine( Initialize NCCL process group for trainer-side weight transfer. The trainer is always rank 0 in the process group. Uses the current - CUDA device (torch.cuda.current_device()). + CUDA device (torch.accelerator.current_device_index()). Args: init_info: Either an NCCLWeightTransferInitInfo object or a dict with keys: @@ -309,8 +310,13 @@ class NCCLWeightTransferEngine( world_size = init_info.world_size # Trainer is always rank 0 + device = torch.accelerator.current_device_index() return NCCLWeightTransferEngine._stateless_init_process_group( - master_address, master_port, 0, world_size, torch.cuda.current_device() + master_address, + master_port, + 0, + world_size, + device, ) @staticmethod diff --git a/vllm/model_executor/layers/attention/static_sink_attention.py b/vllm/model_executor/layers/attention/static_sink_attention.py index fe8dc7e34..60419f967 100644 --- a/vllm/model_executor/layers/attention/static_sink_attention.py +++ b/vllm/model_executor/layers/attention/static_sink_attention.py @@ -190,7 +190,7 @@ class StaticSinkAttention(Attention, CustomOp): sink_kv_slot_mapping = torch.arange( self.block_size, self.sink_len + self.block_size, - device=torch.cuda.current_device(), + device=torch.accelerator.current_device_index(), dtype=torch.long, ) triton_reshape_and_cache_flash_diffkv( diff --git a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py index 512b71284..db97a5374 100644 --- a/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +++ b/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py @@ -295,14 +295,17 @@ class DefaultMoERunner(MoERunner): states_shape = (moe.max_num_tokens, 
self.moe_config.hidden_dim) logits_shape = (moe.max_num_tokens, self.moe_config.num_logical_experts) + device = torch.accelerator.current_device_index() self.batched_hidden_states = torch.zeros( - states_shape, dtype=moe.in_dtype, device=torch.cuda.current_device() + states_shape, + dtype=moe.in_dtype, + device=device, ) self.batched_router_logits = torch.zeros( logits_shape, dtype=moe.router_logits_dtype, - device=torch.cuda.current_device(), + device=device, ) def must_reduce_shared_expert_outputs(self) -> bool: diff --git a/vllm/model_executor/layers/fused_moe/trtllm_moe.py b/vllm/model_executor/layers/fused_moe/trtllm_moe.py index 5160840a2..3f256ca21 100644 --- a/vllm/model_executor/layers/fused_moe/trtllm_moe.py +++ b/vllm/model_executor/layers/fused_moe/trtllm_moe.py @@ -28,7 +28,7 @@ class TrtLlmGenExperts(mk.FusedMoEExpertsModular): max_capture_size, ): super().__init__(moe_config, quant_config) - self.device = torch.cuda.current_device() + self.device = torch.accelerator.current_device_index() self.num_experts = moe_config.num_local_experts self.gemm1_alpha = torch.tensor( [1.702] * self.num_experts, dtype=torch.float32, device=self.device diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 2a1180dd6..ecc36556c 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -202,7 +202,7 @@ class RMSNorm(CustomOp): # external Oink initialization work in this case. 
else: try: - device_index = torch.cuda.current_device() + device_index = torch.accelerator.current_device_index() if _oink_ops.is_oink_available_for_device(device_index): self._use_oink_rmsnorm = True self._use_oink_fused_add_rmsnorm = ( diff --git a/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py b/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py index e5dabe035..ec03fc653 100644 --- a/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py +++ b/vllm/model_executor/layers/rotary_embedding/dual_chunk_rope.py @@ -36,7 +36,8 @@ class DualChunkRotaryEmbedding(CustomOp): self.chunk_size = chunk_size self.local_size = local_size self.dtype = dtype - self.device = torch.device(f"cuda:{torch.cuda.current_device()}") + device_idx = torch.accelerator.current_device_index() + self.device = torch.device(f"cuda:{device_idx}") (q_cache, qc_cache, k_cache, qc_no_clamp_cache, q_inter_cache) = ( self._compute_cos_sin_cache() ) diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 6e8aee8bc..1ff1a448a 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -539,6 +539,8 @@ def deserialize_tensorizer_model( ) before_mem = get_mem_usage() start = time.perf_counter() + device_index = torch.accelerator.current_device_index() + device_type = current_platform.device_type with ( open_stream( tensorizer_config.tensorizer_uri, mode="rb", **tensorizer_args.stream_kwargs @@ -546,9 +548,7 @@ def deserialize_tensorizer_model( TensorDeserializer( stream, dtype=tensorizer_config.dtype, - device=f"xpu:{torch.xpu.current_device()}" - if current_platform.is_xpu() - else f"cuda:{torch.cuda.current_device()}", + device=f"{device_type}:{device_index}", **tensorizer_args.deserialization_kwargs, ) as deserializer, ): diff --git a/vllm/utils/torch_utils.py b/vllm/utils/torch_utils.py index e4aa4fe61..61f863f1d 100644 --- a/vllm/utils/torch_utils.py 
+++ b/vllm/utils/torch_utils.py @@ -624,7 +624,7 @@ def cuda_device_count_stateless() -> int: """Get number of CUDA devices, caching based on the value of CUDA_VISIBLE_DEVICES at the time of call. - This should be used instead of torch.cuda.device_count() + This should be used instead of torch.accelerator.device_count() unless CUDA_VISIBLE_DEVICES has already been set to the desired value.""" diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py index 0150d8863..9a72bc5d3 100644 --- a/vllm/v1/engine/utils.py +++ b/vllm/v1/engine/utils.py @@ -134,7 +134,7 @@ class CoreEngineProcManager: for proc, local_dp_rank in zip(self.processes, local_dp_ranks): # Adjust device control in DP for non-CUDA platforms # as well as external and ray launchers - # For CUDA platforms, we use torch.cuda.set_device() + # For CUDA platforms, we use torch.accelerator.set_device_index() if is_dp and ( not current_platform.is_cuda_alike() or vllm_config.parallel_config.use_ray diff --git a/vllm/v1/worker/gpu_ubatch_wrapper.py b/vllm/v1/worker/gpu_ubatch_wrapper.py index c4cbfff5a..64856052f 100644 --- a/vllm/v1/worker/gpu_ubatch_wrapper.py +++ b/vllm/v1/worker/gpu_ubatch_wrapper.py @@ -73,8 +73,8 @@ class SMControlContextManager: assert current_platform.is_cuda(), ( "SM control is currently only supported on CUDA" ) - - total_sms = num_compute_units(torch.cuda.current_device()) + device = torch.accelerator.current_device_index() + total_sms = num_compute_units(device) assert comm_sms < total_sms self.total_sms = total_sms @@ -204,7 +204,7 @@ class UBatchWrapper: @torch.inference_mode() def _capture_ubatch_thread(results, ubatch_metadata): - torch.cuda.set_device(self.device) + torch.accelerator.set_device_index(self.device) ubatch_context = ubatch_metadata.context with torch.cuda.stream(ubatch_context.compute_stream): _ = torch.cuda.current_blas_handle() diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 842e76549..58e28e694 100644 ---
a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -239,11 +239,11 @@ class Worker(WorkerBase): # DP_LOCAL_RANK * TP_PP_WORLD_SIZE + TP_LOCAL_RANK self.local_rank += dp_local_rank * tp_pp_world_size - assert self.local_rank < torch.cuda.device_count(), ( + assert self.local_rank < torch.accelerator.device_count(), ( f"DP adjusted local rank {self.local_rank} is out of bounds. " ) visible_device_count = ( - torch.cuda.device_count() if torch.cuda.is_available() else 0 + torch.accelerator.device_count() if torch.cuda.is_available() else 0 ) assert self.parallel_config.local_world_size <= visible_device_count, ( f"local_world_size ({self.parallel_config.local_world_size}) must " @@ -252,7 +252,7 @@ class Worker(WorkerBase): ) self.device = torch.device(f"cuda:{self.local_rank}") - current_platform.set_device(self.device) + torch.accelerator.set_device_index(self.device) current_platform.check_if_supports_dtype(self.model_config.dtype) diff --git a/vllm/v1/worker/xpu_worker.py b/vllm/v1/worker/xpu_worker.py index 112a71b37..421105923 100644 --- a/vllm/v1/worker/xpu_worker.py +++ b/vllm/v1/worker/xpu_worker.py @@ -60,7 +60,7 @@ class XPUWorker(Worker): and current_platform.is_xpu() ): self.device = torch.device(f"xpu:{self.local_rank}") - current_platform.set_device(self.device) + torch.accelerator.set_device_index(self.device) current_platform.check_if_supports_dtype(self.model_config.dtype) torch.accelerator.empty_cache() self.init_gpu_memory = torch.xpu.get_device_properties(