diff --git a/tests/lora/test_fused_moe_lora_kernel.py b/tests/lora/test_fused_moe_lora_kernel.py index 66a985a06..8adc20865 100644 --- a/tests/lora/test_fused_moe_lora_kernel.py +++ b/tests/lora/test_fused_moe_lora_kernel.py @@ -637,7 +637,7 @@ def use_fused_moe_lora_kernel_tensor_parallel( set_random_seed(seed) - device = torch.device(f"cuda:{local_rank}") + device = torch.device(f"{DEVICE_TYPE}:{local_rank}") torch.accelerator.set_device_index(device) torch.set_default_device(device) torch.set_default_dtype(dtype) diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 08fd03724..2a37abac6 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -60,8 +60,12 @@ pytestmark = pytest.mark.skipif( reason="Backend not supported", ) +DEVICE_TYPE = current_platform.device_type DEVICES = ( - [f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2)] + [ + f"{DEVICE_TYPE}:{i}" + for i in range(1 if torch.accelerator.device_count() == 1 else 2) + ] if current_platform.is_cuda_alike() else ["cpu"] ) @@ -196,7 +200,7 @@ def create_random_inputs( input_size: tuple[int, ...], input_range: tuple[float, float], input_type: torch.dtype = torch.int, - device: torch.device = "cuda", + device: torch.device = DEVICE_TYPE, ) -> tuple[list[torch.Tensor], list[int], list[int]]: """Creates random inputs. diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index e7addab11..e80d96f00 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -35,9 +35,9 @@ EMBEDDING_MODULES = { "lm_head": "output_embeddings", } - +DEVICE_TYPE = current_platform.device_type DEVICES = ( - [f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2)] + [f"{DEVICE_TYPE}:{i}" for i in range(min(torch.accelerator.device_count(), 2))] if current_platform.is_cuda_alike() else ["cpu"] ) diff --git a/tests/lora/test_moe_lora_align_sum.py b/tests/lora/test_moe_lora_align_sum.py index bb46b4d86..1451ec162 100644 --- a/tests/lora/test_moe_lora_align_sum.py +++ b/tests/lora/test_moe_lora_align_sum.py @@ -6,6 +6,9 @@ import pytest import torch from vllm import _custom_ops as ops +from vllm.platforms import current_platform + +DEVICE_TYPE = current_platform.device_type def round_up(x, base): @@ -27,7 +30,7 @@ def sample_data(num_experts, max_loras, num_tokens, topk_num): topk_ids[i, j] = pool[j] token_lora_mapping[i] = random.randint(0, max_loras - 1) - return topk_ids.to("cuda"), token_lora_mapping.to("cuda") + return topk_ids.to(DEVICE_TYPE), token_lora_mapping.to(DEVICE_TYPE) @pytest.mark.parametrize("num_tokens", [100, 200, 1024, 4096]) # 81920 @@ -56,14 +59,21 @@ def test_moe_lora_align_block_size( (max_loras * max_num_tokens_padded,), topk_ids.numel(), dtype=torch.int32, - device="cuda", + device=DEVICE_TYPE, ) expert_ids = torch.full( - (max_loras * max_num_m_blocks,), num_experts, dtype=torch.int32, device="cuda" + (max_loras * max_num_m_blocks,), + num_experts, + dtype=torch.int32, + device=DEVICE_TYPE, ) - num_tokens_post_pad = torch.zeros((max_loras,), dtype=torch.int32, device="cuda") - adapter_enabled = torch.ones((max_loras + 1,), dtype=torch.int32, device="cuda") - lora_ids = torch.arange(max_loras + 2, dtype=torch.int32, device="cuda") + num_tokens_post_pad = torch.zeros( + (max_loras,), dtype=torch.int32, device=DEVICE_TYPE + ) + adapter_enabled = torch.ones( + (max_loras + 1,), dtype=torch.int32, device=DEVICE_TYPE + ) + lora_ids = torch.arange(max_loras + 2, dtype=torch.int32, device=DEVICE_TYPE) # call 
kernel ops.moe_lora_align_block_size( diff --git a/tests/lora/test_punica_ops.py b/tests/lora/test_punica_ops.py index 8a2634e82..7706d0e2a 100644 --- a/tests/lora/test_punica_ops.py +++ b/tests/lora/test_punica_ops.py @@ -9,10 +9,13 @@ import vllm.lora.ops.torch_ops as torch_ops import vllm.lora.ops.triton_ops as triton_ops from vllm.lora.ops.triton_ops import LoRAKernelMeta from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT +from vllm.platforms import current_platform from vllm.utils.torch_utils import set_random_seed from .utils import PunicaTensors, assert_close, generate_data_for_nslices +DEVICE_TYPE = current_platform.device_type + @pytest.fixture(autouse=True) def reset_device(reset_default_device): @@ -146,7 +149,9 @@ def check_lora_shrink_kernel( # Setup metadata information for the LoRA kernel. lora_meta = LoRAKernelMeta.make( - max_loras=num_loras, max_num_tokens=token_nums, device="cuda" + max_loras=num_loras, + max_num_tokens=token_nums, + device=DEVICE_TYPE, ) lora_meta.prepare_tensors(data.token_lora_mapping) @@ -219,7 +224,9 @@ def check_lora_expand_kernel( # Setup metadata information for the LoRA kernel. lora_meta = LoRAKernelMeta.make( - max_loras=num_loras, max_num_tokens=token_nums, device="cuda" + max_loras=num_loras, + max_num_tokens=token_nums, + device=DEVICE_TYPE, ) lora_meta.prepare_tensors(data.token_lora_mapping) @@ -367,7 +374,7 @@ test_params = { } DTYPES = [torch.float16, torch.bfloat16] -DEVICES = [f"cuda:{0}"] +DEVICES = [f"{DEVICE_TYPE}:{0}"] SEED = [0] diff --git a/tests/lora/test_punica_ops_fp8.py b/tests/lora/test_punica_ops_fp8.py index 042313336..3e7fe7b27 100644 --- a/tests/lora/test_punica_ops_fp8.py +++ b/tests/lora/test_punica_ops_fp8.py @@ -28,9 +28,11 @@ from vllm.lora.ops.triton_ops.lora_shrink_fp8_op import ( _SHRINK_LORA_SCALE_PTR_DICT, ) from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT +from vllm.platforms import current_platform from vllm.utils.torch_utils import set_random_seed -DEVICES = [f"cuda:{0}"] +DEVICE_TYPE = current_platform.device_type +DEVICES = [f"{DEVICE_TYPE}:{0}"] SEED = [0] _dict_lock = Lock() diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 4af3ccf89..88763551c 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -19,11 +19,14 @@ from vllm.config.load import LoadConfig from vllm.config.lora import LoRAConfig from vllm.lora.model_manager import LoRAMapping from vllm.lora.request import LoRARequest +from vllm.platforms import current_platform from vllm.v1.worker.gpu_worker import Worker MODEL_PATH = "Qwen/Qwen3-0.6B" NUM_LORAS = 16 +DEVICE_TYPE = current_platform.device_type + @patch.dict(os.environ, {"RANK": "0"}) def test_worker_apply_lora(qwen3_lora_files): @@ -61,7 +64,7 @@ def test_worker_apply_lora(qwen3_lora_files): max_num_seqs=32, max_num_partial_prefills=32, ), - device_config=DeviceConfig("cuda"), + device_config=DeviceConfig(DEVICE_TYPE), cache_config=CacheConfig( block_size=16, cache_dtype="auto", diff --git a/tests/lora/utils.py b/tests/lora/utils.py index 6aba5299b..e5ce7a884 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -9,10 +9,13 @@ import torch from safetensors.torch import save_file from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights +from vllm.platforms import current_platform + +DEVICE_TYPE = current_platform.device_type class DummyLoRAManager: - def __init__(self, device: torch.device = "cuda:0"): + def __init__(self, device: torch.device = f"{DEVICE_TYPE}:0"): 
super().__init__() self._loras: dict[str, LoRALayerWeights] = {} self._device = device @@ -57,8 +60,8 @@ class DummyLoRAManager: module_name, rank=rank, lora_alpha=1, - lora_a=torch.rand([rank, input_dim], device="cuda"), - lora_b=torch.rand([output_dim, input_dim], device="cuda"), + lora_a=torch.rand([rank, input_dim], device=DEVICE_TYPE), + lora_b=torch.rand([output_dim, input_dim], device=DEVICE_TYPE), embeddings_tensor=embeddings_tensor, ) self.set_module_lora(module_name, lora) diff --git a/tests/v1/attention/test_attention_backends.py b/tests/v1/attention/test_attention_backends.py index 8c3a62b6e..06095b87e 100644 --- a/tests/v1/attention/test_attention_backends.py +++ b/tests/v1/attention/test_attention_backends.py @@ -40,6 +40,8 @@ BACKENDS_TO_TEST = [ "FLEX_ATTENTION_SLOW", ] +DEVICE_TYPE = current_platform.device_type + # Remove flashinfer from the list if it's not available try: import flashinfer # noqa: F401 @@ -366,7 +368,7 @@ def _test_backend_correctness( num_gpu_blocks=8192, hf_config_override=hf_config_override, ) - device = torch.device("cuda:0") + device = torch.device(f"{DEVICE_TYPE}:0") kv_cache_spec = create_standard_kv_cache_spec(vllm_config) diff --git a/tests/v1/attention/test_chunked_local_attention.py b/tests/v1/attention/test_chunked_local_attention.py index 4529c2cfc..c2798c8f2 100644 --- a/tests/v1/attention/test_chunked_local_attention.py +++ b/tests/v1/attention/test_chunked_local_attention.py @@ -7,6 +7,7 @@ import pytest import torch from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata +from vllm.platforms import current_platform from vllm.v1.attention.backends.utils import make_local_attention_virtual_batches @@ -22,6 +23,8 @@ class LocalAttentionTestData: expected_local_block_table: list[list[int]] +DEVICE_TYPE = current_platform.device_type + test_data_list = [ # Same as example in docstring of make_local_attention_virtual_batches # except block table has 9 columns instead of 10 @@ -151,7 +154,7 @@ test_data_list = [ @pytest.mark.parametrize("test_data", test_data_list) def test_local_attention_virtual_batches(test_data: LocalAttentionTestData): - device = torch.device("cuda:0") + device = torch.device(f"{DEVICE_TYPE}:0") batch_spec = test_data.batch_spec attn_chunk_size = test_data.attn_chunk_size block_size = test_data.block_size diff --git a/tests/v1/attention/test_mla_backends.py b/tests/v1/attention/test_mla_backends.py index 796912a68..e65d1d604 100644 --- a/tests/v1/attention/test_mla_backends.py +++ b/tests/v1/attention/test_mla_backends.py @@ -42,6 +42,8 @@ BACKENDS_TO_TEST = [ AttentionBackendEnum.TRITON_MLA, ] +DEVICE_TYPE = current_platform.device_type + # Remove sm100 backends from the list if not using sm100 if not torch.cuda.is_available() or torch.cuda.get_device_properties(0).major < 10: BACKENDS_TO_TEST.remove(AttentionBackendEnum.CUTLASS_MLA) @@ -763,7 +765,7 @@ def test_backend_correctness( method="ngram", num_speculative_tokens=query_len - 1 ) - device = torch.device("cuda:0") + device = torch.device(f"{DEVICE_TYPE}:0") # 1. 
Setup batch_size = batch_spec.batch_size diff --git a/tests/v1/attention/test_sparse_mla_backends.py b/tests/v1/attention/test_sparse_mla_backends.py index c49ccd24e..22acc748d 100644 --- a/tests/v1/attention/test_sparse_mla_backends.py +++ b/tests/v1/attention/test_sparse_mla_backends.py @@ -64,6 +64,8 @@ SPARSE_BACKEND_BATCH_SPECS["large_q_pure_prefill"] = BatchSpec( seq_lens=[256] * 2, query_lens=[256] * 2 ) +DEVICE_TYPE = current_platform.device_type + def _float_to_e8m0_truncate(f: float) -> float: """Simulate SM100's float -> e8m0 -> bf16 scale conversion. @@ -222,7 +224,7 @@ def test_sparse_backend_decode_correctness( batch_spec = SPARSE_BACKEND_BATCH_SPECS[batch_name] use_fp8_ds_mla_quantization = kv_cache_dtype == "fp8_ds_mla" - device = torch.device("cuda") + device = torch.device(DEVICE_TYPE) dtype = torch.bfloat16 # Model hyper-parameters (kept intentionally small for the unit test) @@ -586,7 +588,7 @@ def _triton_convert_reference_impl( def test_triton_convert_req_index_to_global_index_decode_only( block_size, num_topk_tokens ): - device = torch.device("cuda") + device = torch.device(DEVICE_TYPE) num_tokens = 8 num_requests = 4 max_blocks_per_req = 10 @@ -639,7 +641,7 @@ def test_triton_convert_req_index_to_global_index_decode_only( reason="FlashMLASparseBackend requires CUDA 9.0 or higher", ) def test_triton_convert_req_index_to_global_index_with_prefill_workspace(block_size): - device = torch.device("cuda") + device = torch.device(DEVICE_TYPE) num_requests = 4 max_blocks_per_req = 8 num_topk_tokens = 128 @@ -794,7 +796,7 @@ def test_split_indexer_prefill_chunks_single_request_overflow(): def test_triton_convert_returns_valid_counts(): """Test that return_valid_counts correctly counts non-negative indices.""" - device = torch.device("cuda") + device = torch.device(DEVICE_TYPE) num_tokens = 8 num_requests = 2 max_blocks_per_req = 10 diff --git a/tests/v1/attention/test_trtllm_attention_integration.py b/tests/v1/attention/test_trtllm_attention_integration.py index 113442bf6..12af0773c 100644 --- a/tests/v1/attention/test_trtllm_attention_integration.py +++ b/tests/v1/attention/test_trtllm_attention_integration.py @@ -55,6 +55,7 @@ class MockAttentionLayer: MODEL = "Qwen/Qwen2.5-0.5B" BLOCK_SIZE = 16 NUM_GPU_BLOCKS = 8192 +DEVICE_TYPE = current_platform.device_type BATCH_SPECS = { "decode_only": BatchSpec( @@ -172,7 +173,7 @@ def _run_trtllm_integration(batch_spec): """Run TRTLLM attention through the full FlashInfer pipeline and compare against an SDPA reference.""" set_random_seed(42) - device = torch.device("cuda:0") + device = torch.device(f"{DEVICE_TYPE}:0") vllm_config = create_vllm_config( model_name=MODEL, diff --git a/tests/v1/cudagraph/test_cudagraph_dispatch.py b/tests/v1/cudagraph/test_cudagraph_dispatch.py index 52e927cee..66e6d7dd4 100644 --- a/tests/v1/cudagraph/test_cudagraph_dispatch.py +++ b/tests/v1/cudagraph/test_cudagraph_dispatch.py @@ -23,6 +23,8 @@ from vllm.forward_context import BatchDescriptor, set_forward_context from vllm.platforms import current_platform from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher +DEVICE_TYPE = current_platform.device_type + # Helper MLP for testing class SimpleMLP(nn.Module): @@ -269,9 +271,9 @@ class TestCudagraphDispatcher: class TestCUDAGraphWrapper: def setup_method(self): self.vllm_config = _create_vllm_config(CompilationConfig()) - self.model = SimpleMLP().to("cuda") - self.persistent_input_buffer = torch.zeros(1, 10, device="cuda") - self.input_tensor = torch.randn(1, 10, device="cuda") + self.model = 
SimpleMLP().to(DEVICE_TYPE) + self.persistent_input_buffer = torch.zeros(1, 10, device=DEVICE_TYPE) + self.input_tensor = torch.randn(1, 10, device=DEVICE_TYPE) def test_capture_and_replay(self): wrapper = CUDAGraphWrapper( @@ -428,10 +430,10 @@ class TestCudagraphIntegration: @create_new_process_for_each_test("spawn") def test_capture_replay_bypass_logic(self): - model = SimpleMLP().to("cuda") + model = SimpleMLP().to(DEVICE_TYPE) full_wrapper = CUDAGraphWrapper(model, self.vllm_config, CUDAGraphMode.FULL) max_bs = 16 - persistent_input_buffer = torch.zeros(max_bs, 10, device="cuda") + persistent_input_buffer = torch.zeros(max_bs, 10, device=DEVICE_TYPE) input_1 = persistent_input_buffer[:1] input_2 = persistent_input_buffer[:2] input_3 = persistent_input_buffer[:3] @@ -486,17 +488,17 @@ class TestCudagraphIntegration: @create_new_process_for_each_test("spawn") def test_nested_wrappers(self): """Tests a scenario with a PIECEWISE wrapper inside a FULL one.""" - model = SimpleMLP().to("cuda") + model = SimpleMLP().to(DEVICE_TYPE) full_wrapper = CUDAGraphWrapper(model, self.vllm_config, CUDAGraphMode.FULL) - input_1 = torch.randn(1, 10, device="cuda") + input_1 = torch.randn(1, 10, device=DEVICE_TYPE) # Setup: Inner model is wrapped with PIECEWISE, outer with FULL - inner_model = SimpleMLP().to("cuda") + inner_model = SimpleMLP().to(DEVICE_TYPE) piecewise_wrapper = CUDAGraphWrapper( inner_model, self.vllm_config, CUDAGraphMode.PIECEWISE ) inner_model.forward = MagicMock(wraps=inner_model.forward) - outer_model = SimpleMLP().to("cuda") + outer_model = SimpleMLP().to(DEVICE_TYPE) # When outer model is called, it calls the piecewise_wrapper outer_model.forward = MagicMock( wraps=outer_model.forward, side_effect=piecewise_wrapper diff --git a/tests/v1/determinism/test_rms_norm_batch_invariant.py b/tests/v1/determinism/test_rms_norm_batch_invariant.py index 5e5b40d09..7d3b8437a 100644 --- a/tests/v1/determinism/test_rms_norm_batch_invariant.py +++ b/tests/v1/determinism/test_rms_norm_batch_invariant.py @@ -13,6 +13,9 @@ from utils import skip_unsupported from vllm.model_executor.layers.batch_invariant import rms_norm as triton_rms_norm from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.platforms import current_platform + +DEVICE_TYPE = current_platform.device_type @skip_unsupported @@ -34,7 +37,7 @@ def test_rms_norm_batch_invariant_vs_standard( equivalent results to the standard CUDA implementation across various configurations. """ - device = torch.device("cuda") + device = torch.device(DEVICE_TYPE) # Create test input and weight torch.manual_seed(42) @@ -81,7 +84,7 @@ def test_rms_norm_3d_input( Ensures that the batch-invariant RMS norm correctly handles multi-dimensional inputs that are common in transformer models. """ - device = torch.device("cuda") + device = torch.device(DEVICE_TYPE) dtype = torch.bfloat16 eps = 1e-6 @@ -120,7 +123,7 @@ def test_rms_norm_numerical_stability(default_vllm_config): Ensures that both implementations handle edge cases like very small or large values without producing NaN or Inf. 
""" - device = torch.device("cuda") + device = torch.device(DEVICE_TYPE) dtype = torch.float16 eps = 1e-6 hidden_size = 2048 @@ -179,7 +182,7 @@ def test_rms_norm_formula(default_vllm_config): Verifies: output = input / sqrt(mean(input^2) + eps) * weight """ - device = torch.device("cuda") + device = torch.device(DEVICE_TYPE) dtype = torch.float32 # Use float32 for higher precision in formula check eps = 1e-6 hidden_size = 1024 @@ -214,7 +217,7 @@ def test_rms_norm_different_hidden_sizes(default_vllm_config, hidden_size: int): The Triton kernel uses a fixed BLOCK_SIZE=1024, so this tests that it correctly handles hidden sizes both smaller and larger than the block size. """ - device = torch.device("cuda") + device = torch.device(DEVICE_TYPE) dtype = torch.bfloat16 eps = 1e-6 batch_size = 16 @@ -251,7 +254,7 @@ def test_rms_norm_determinism(default_vllm_config): Runs the same input through the kernel multiple times and verifies identical outputs. """ - device = torch.device("cuda") + device = torch.device(DEVICE_TYPE) dtype = torch.bfloat16 eps = 1e-6 hidden_size = 4096 @@ -283,7 +286,7 @@ if __name__ == "__main__": # Run a quick smoke test print("Running quick smoke test of RMS norm implementations...") - device = torch.device("cuda") + device = torch.device(DEVICE_TYPE) batch_size = 8 hidden_size = 4096 dtype = torch.bfloat16 diff --git a/tests/v1/e2e/general/test_mamba_prefix_cache.py b/tests/v1/e2e/general/test_mamba_prefix_cache.py index 747c5defe..8b9f7bb6c 100644 --- a/tests/v1/e2e/general/test_mamba_prefix_cache.py +++ b/tests/v1/e2e/general/test_mamba_prefix_cache.py @@ -16,6 +16,7 @@ from vllm import LLM, SamplingParams, TokensPrompt from vllm.config import CacheConfig from vllm.distributed import cleanup_dist_env_and_memory from vllm.model_executor.layers.mamba.mamba_utils import MambaStateCopyFunc +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from vllm.v1.attention.backends.utils import CommonAttentionMetadata from vllm.v1.core.kv_cache_manager import KVCacheBlocks, KVCacheManager @@ -48,6 +49,7 @@ num_accepted_tokens = 1 prompt_token_ids: list[int] = [] MODEL = "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8" BLOCK_SIZE = 560 +DEVICE_TYPE = current_platform.device_type NUM_HIDDEN_LAYERS = 1 cur_step_action_idx = 0 cur_step_action: StepAction | None = None @@ -71,7 +73,7 @@ def get_fake_sample_fn() -> SamplerOutput: return SamplerOutput( sampled_token_ids=torch.tensor( [[prompt_token_ids[first_token_id_index]]], - device="cuda", + device=DEVICE_TYPE, dtype=torch.int32, ), logprobs_tensors=None, @@ -83,7 +85,9 @@ def get_fake_sample_fn() -> SamplerOutput: sampled_token_ids = accepted_tokens return SamplerOutput( sampled_token_ids=torch.tensor( - [sampled_token_ids], device="cuda", dtype=torch.int32 + [sampled_token_ids], + device=DEVICE_TYPE, + dtype=torch.int32, ), logprobs_tensors=None, ) @@ -128,17 +132,23 @@ def get_fake_propose_draft_token_ids_fn(): - 1 + num_accepted_tokens ], - device="cuda", + device=DEVICE_TYPE, dtype=torch.int32, ) valid_sampled_tokens_count = torch.tensor( - [num_accepted_tokens], device="cuda", dtype=torch.int32 + [num_accepted_tokens], + device=DEVICE_TYPE, + dtype=torch.int32, ) self._copy_valid_sampled_token_count(next_token_ids, valid_sampled_tokens_count) - return torch.tensor(proposed_draft_token_ids, device="cuda", dtype=torch.int32) + return torch.tensor( + proposed_draft_token_ids, + device=DEVICE_TYPE, + dtype=torch.int32, + ) return fake_propose_draft_token_ids_fn diff --git 
a/tests/v1/kv_offload/test_cpu_gpu.py b/tests/v1/kv_offload/test_cpu_gpu.py index 1983cca22..2da3a5e56 100644 --- a/tests/v1/kv_offload/test_cpu_gpu.py +++ b/tests/v1/kv_offload/test_cpu_gpu.py @@ -6,6 +6,7 @@ import time import pytest import torch +from vllm.platforms import current_platform from vllm.utils.torch_utils import set_random_seed from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec from vllm.v1.kv_offload.spec import ( @@ -21,7 +22,8 @@ GPU_PAGE_SIZES = [512, 1024] BLOCK_SIZE_FACTORS = [1, 3] NUM_TENSORS = [4] SEEDS = [0] -CUDA_DEVICES = ["cuda:0"] +DEVICE_TYPE = current_platform.device_type +DEVICES = [f"{DEVICE_TYPE}:0"] NUM_MAPPINGS = [3] @@ -33,7 +35,7 @@ NUM_MAPPINGS = [3] @pytest.mark.parametrize("num_cpu_blocks", NUM_CPU_BLOCKS) @pytest.mark.parametrize("num_tensors", NUM_TENSORS) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_transfer( default_vllm_config, diff --git a/tests/v1/logits_processors/test_correctness.py b/tests/v1/logits_processors/test_correctness.py index bc2cc1720..bf2979371 100644 --- a/tests/v1/logits_processors/test_correctness.py +++ b/tests/v1/logits_processors/test_correctness.py @@ -39,8 +39,9 @@ PIN_MEMORY_AVAILABLE = is_pin_memory_available() MAX_NUM_REQS = 256 VOCAB_SIZE = 1024 NUM_OUTPUT_TOKENS = 20 -CUDA_DEVICES = [ - f"{current_platform.device_type}:{i}" +DEVICE_TYPE = current_platform.device_type +DEVICES = [ + f"{DEVICE_TYPE}:{i}" for i in range(1 if current_platform.device_count() == 1 else 2) ] MAX_NUM_PROMPT_TOKENS = 64 @@ -801,7 +802,7 @@ def _assert_valid( @create_new_process_for_each_test() -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("reqs_per_logitproc", [REQS_PER_LOGITPROC]) @pytest.mark.parametrize("logitsprocs_under_test", _get_test_cases()) def test_logitsprocs( diff --git a/tests/v1/sample/test_rejection_sampler.py b/tests/v1/sample/test_rejection_sampler.py index 552a27fe2..ecfcade2b 100644 --- a/tests/v1/sample/test_rejection_sampler.py +++ b/tests/v1/sample/test_rejection_sampler.py @@ -19,7 +19,7 @@ from vllm.v1.sample.rejection_sampler import ( from vllm.v1.sample.sampler import Sampler, SamplerOutput from vllm.v1.spec_decode.metadata import SpecDecodeMetadata -DEVICE = current_platform.device_type +DEVICE_TYPE = current_platform.device_type @pytest.fixture @@ -57,7 +57,7 @@ def create_logits_tensor( will produce desired token ids on argmax""" token_ids = [tokens[:-1] for tokens in output_token_ids] num_total_tokens = sum(len(tokens) for tokens in token_ids) - logits = torch.full((num_total_tokens, vocab_size), -100.0, device=DEVICE) + logits = torch.full((num_total_tokens, vocab_size), -100.0, device=DEVICE_TYPE) start_loc = 0 for tokens in token_ids: for j, token_id in enumerate(tokens): @@ -99,9 +99,9 @@ def create_sampling_metadata( assert output_token_ids assert len(output_token_ids) > 0 - frequency_penalties = torch.tensor(frequency_penalties, device=DEVICE) - presence_penalties = torch.tensor(presence_penalties, device=DEVICE) - repetition_penalties = torch.tensor(repetition_penalties, device=DEVICE) + frequency_penalties = torch.tensor(frequency_penalties, device=DEVICE_TYPE) + presence_penalties = torch.tensor(presence_penalties, device=DEVICE_TYPE) + repetition_penalties = torch.tensor(repetition_penalties, device=DEVICE_TYPE) else: no_penalties = True frequency_penalties = torch.tensor([]) @@ 
-320,14 +320,27 @@ def test_deterministic_when_seeded( n_rep: int, ): num_tokens = batch_size * k - draft_probs = torch.rand(num_tokens, vocab_size, dtype=torch.float32, device=DEVICE) + draft_probs = torch.rand( + num_tokens, + vocab_size, + dtype=torch.float32, + device=DEVICE_TYPE, + ) draft_probs = F.softmax(draft_probs, dim=-1) target_logits = torch.rand_like(draft_probs) bonus_token_ids = torch.randint( - low=0, high=vocab_size, size=(batch_size, 1), dtype=torch.int64, device=DEVICE + low=0, + high=vocab_size, + size=(batch_size, 1), + dtype=torch.int64, + device=DEVICE_TYPE, ) draft_token_ids = torch.randint( - low=0, high=vocab_size, size=(batch_size, k), dtype=torch.int64, device=DEVICE + low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64, + device=DEVICE_TYPE, ) seeded_mask = torch.rand(batch_size, dtype=torch.float32) <= frac_seeded @@ -335,12 +348,12 @@ def test_deterministic_when_seeded( results = [] for _ in range(n_rep): seeded_seqs = { - i: torch.Generator(device=DEVICE).manual_seed(i) + i: torch.Generator(device=DEVICE_TYPE).manual_seed(i) for i in range(batch_size) if seeded_mask[i] } - temperature = torch.ones(batch_size, dtype=torch.float32, device=DEVICE) + temperature = torch.ones(batch_size, dtype=torch.float32, device=DEVICE_TYPE) sampling_metadata = create_sampling_metadata( all_greedy=False, temperature=temperature, generators=seeded_seqs ) @@ -387,7 +400,7 @@ def test_rejection_sampling_approximates_target_distribution(): much more than the distance improvement between the observed distribution and the random distribution. """ - torch.set_default_device(DEVICE) + torch.set_default_device(DEVICE_TYPE) vocab_size = 10 k = 2 num_reference_probs = 100 @@ -410,7 +423,7 @@ def test_rejection_sampling_approximates_target_distribution(): rej_sample_probs = estimate_rejection_sampling_pdf( draft_probs, target_logits, k, vocab_size, num_samples ) - rej_sample_probs = rej_sample_probs.to(DEVICE) + rej_sample_probs = rej_sample_probs.to(DEVICE_TYPE) # Average distance from reference probs. reference_vs_rejsample_dist = ( @@ -491,11 +504,11 @@ def estimate_rejection_sampling_pdf( draft_probs = draft_probs.view(num_tokens, vocab_size) # Bonus tokens not used but required. - bonus_token_ids = torch.zeros((1, 1), dtype=torch.int64, device=DEVICE).repeat( + bonus_token_ids = torch.zeros((1, 1), dtype=torch.int64, device=DEVICE_TYPE).repeat( num_samples, 1 ) - temperature = torch.ones(num_samples, dtype=torch.float32, device=DEVICE) + temperature = torch.ones(num_samples, dtype=torch.float32, device=DEVICE_TYPE) sampling_metadata = create_sampling_metadata( all_greedy=False, temperature=temperature ) @@ -600,7 +613,7 @@ def _test_masked_logits( # Create random draft probabilities. draft_probs = torch.rand( - (num_tokens, vocab_size), dtype=torch.float32, device=DEVICE + (num_tokens, vocab_size), dtype=torch.float32, device=DEVICE_TYPE ) draft_probs = F.softmax(draft_probs, dim=-1) @@ -610,7 +623,11 @@ def _test_masked_logits( draft_token_ids = draft_token_ids.tolist() # Bonus tokens not used but required - bonus_token_ids = torch.zeros((batch_size, 1), dtype=torch.int64, device=DEVICE) + bonus_token_ids = torch.zeros( + (batch_size, 1), + dtype=torch.int64, + device=DEVICE_TYPE, + ) # Create spec decode metadata spec_decode_metadata = create_spec_decode_metadata(draft_token_ids, target_logits) @@ -645,12 +662,13 @@ def test_top_k(rejection_sampler, top_k): # Randomly create top-k indices. 
top_k_indices = [ - torch.randperm(vocab_size, device=DEVICE)[:top_k] for _ in range(num_tokens) + torch.randperm(vocab_size, device=DEVICE_TYPE)[:top_k] + for _ in range(num_tokens) ] top_k_indices = torch.stack(top_k_indices) # Create logits with the uniform distribution. - target_logits = torch.zeros((num_tokens, vocab_size), device=DEVICE) + target_logits = torch.zeros((num_tokens, vocab_size), device=DEVICE_TYPE) # Increment the logits for top-k indices, a little bit more than the other # ones. If the masking is effective, the non-topk indices will never be @@ -659,11 +677,11 @@ def test_top_k(rejection_sampler, top_k): target_logits[i, top_k_indices[i]] += 0.1 # Create sampling metadata - temperature = torch.ones(batch_size, dtype=torch.float32, device=DEVICE) + temperature = torch.ones(batch_size, dtype=torch.float32, device=DEVICE_TYPE) sampling_metadata = create_sampling_metadata( all_greedy=False, temperature=temperature, - top_k=torch.tensor([top_k] * batch_size, device=DEVICE, dtype=torch.int64), + top_k=torch.tensor([top_k] * batch_size, device=DEVICE_TYPE, dtype=torch.int64), ) _test_masked_logits( @@ -686,8 +704,8 @@ def test_top_p(rejection_sampler, top_p): num_tokens = batch_size * num_draft_tokens # Create logits with the uniform distribution. - target_logits = torch.randn((num_tokens, vocab_size), device=DEVICE) - temperature = torch.ones(batch_size, dtype=torch.float32, device=DEVICE) + target_logits = torch.randn((num_tokens, vocab_size), device=DEVICE_TYPE) + temperature = torch.ones(batch_size, dtype=torch.float32, device=DEVICE_TYPE) rescaled_logits = target_logits / temperature logits_sort, logits_idx = rescaled_logits.sort(dim=-1, descending=False) @@ -706,7 +724,11 @@ def test_top_p(rejection_sampler, top_p): sampling_metadata = create_sampling_metadata( all_greedy=False, temperature=temperature, - top_p=torch.tensor([top_p] * batch_size, device=DEVICE, dtype=torch.float32), + top_p=torch.tensor( + [top_p] * batch_size, + device=DEVICE_TYPE, + dtype=torch.float32, + ), ) _test_masked_logits( @@ -732,7 +754,10 @@ def test_frequency_penalties(rejection_sampler): all_greedy=True, output_token_ids=[[2], [3], [4]], spec_token_ids=spec_tokens, - prompt_token_ids=torch.tensor([[5, 6, 7], [6, 7, 8], [7, 8, 9]], device=DEVICE), + prompt_token_ids=torch.tensor( + [[5, 6, 7], [6, 7, 8], [7, 8, 9]], + device=DEVICE_TYPE, + ), frequency_penalties=[1.5, 1.5, 0.7], presence_penalties=[0.0] * num_requests, repetition_penalties=[1.0] * num_requests, @@ -858,21 +883,26 @@ def test_sample_recovered_tokens( num_tokens = batch_size * max_spec_len # Create random draft probabilities. - draft_probs = torch.rand(num_tokens, vocab_size, dtype=torch.float32, device=DEVICE) + draft_probs = torch.rand( + num_tokens, + vocab_size, + dtype=torch.float32, + device=DEVICE_TYPE, + ) draft_probs = F.softmax(draft_probs, dim=-1) # Create random target probabilities. 
target_logits = torch.rand( - num_tokens, vocab_size, dtype=torch.float32, device=DEVICE + num_tokens, vocab_size, dtype=torch.float32, device=DEVICE_TYPE ) target_probs = F.softmax(target_logits, dim=-1) # Randomly sample draft token ids from draft probs draft_token_ids = torch.multinomial(draft_probs, num_samples=1).to(torch.int32) - temperature = torch.ones(batch_size, dtype=torch.float32, device=DEVICE) + temperature = torch.ones(batch_size, dtype=torch.float32, device=DEVICE_TYPE) generators = { - i: torch.Generator(device=DEVICE).manual_seed(i) for i in range(batch_size) + i: torch.Generator(device=DEVICE_TYPE).manual_seed(i) for i in range(batch_size) } sampling_metadata = create_sampling_metadata( all_greedy=False, temperature=temperature, generators=generators @@ -890,7 +920,7 @@ def test_sample_recovered_tokens( None if no_draft_probs else draft_probs, target_probs, sampling_metadata, - device=DEVICE, + device=DEVICE_TYPE, ) recovered_token_ids = sample_recovered_tokens( max_spec_len, @@ -900,6 +930,6 @@ def test_sample_recovered_tokens( None if no_draft_probs else draft_probs, target_probs, sampling_metadata, - device=DEVICE, + device=DEVICE_TYPE, ) assert torch.equal(recovered_token_ids, ref_recovered_token_ids) diff --git a/tests/v1/sample/test_sampler.py b/tests/v1/sample/test_sampler.py index 51f2bf5e7..c67199fa4 100644 --- a/tests/v1/sample/test_sampler.py +++ b/tests/v1/sample/test_sampler.py @@ -17,8 +17,9 @@ PIN_MEMORY_AVAILABLE = is_pin_memory_available() MAX_NUM_REQS = 256 VOCAB_SIZE = 1024 NUM_OUTPUT_TOKENS = 20 -CUDA_DEVICES = [ - f"{current_platform.device_type}:{i}" +DEVICE_TYPE = current_platform.device_type +DEVICES = [ + f"{DEVICE_TYPE}:{i}" for i in range(1 if current_platform.device_count() == 1 else 2) ] MAX_NUM_PROMPT_TOKENS = 64 @@ -199,7 +200,7 @@ def _create_weighted_output_token_list( return output_token_ids, sorted_token_ids_in_output -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("batch_size", [1, 2, 32]) @pytest.mark.parametrize("presence_penalty", [-2.0, 2.0]) def test_sampler_presence_penalty( @@ -249,7 +250,7 @@ def test_sampler_presence_penalty( assert penalized_token_id not in output_token_ids[batch_idx] -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("batch_size", [1, 2, 32]) @pytest.mark.parametrize("frequency_penalty", [-2.0, 2.0]) def test_sampler_frequency_penalty( @@ -305,7 +306,7 @@ def test_sampler_frequency_penalty( assert penalized_token_id not in distinct_sorted_token_ids_in_output -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("batch_size", [1, 2, 32]) @pytest.mark.parametrize("repetition_penalty", [0.1, 1.9]) def test_sampler_repetition_penalty( @@ -363,7 +364,7 @@ def test_sampler_repetition_penalty( ) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("batch_size", [1, 2, 32]) @pytest.mark.parametrize("num_allowed_token_ids", [0, 1, 2]) def test_sampler_allowed_token_ids( @@ -409,7 +410,7 @@ def test_sampler_allowed_token_ids( assert logits_for_req[token_id] != -float("inf") -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("batch_size", [1, 2, 32]) @pytest.mark.parametrize("bad_words_lengths", [(1,), (1, 3), (2, 2)]) def test_sampler_bad_words( diff --git 
a/tests/v1/sample/test_topk_topp_sampler.py b/tests/v1/sample/test_topk_topp_sampler.py index ce1e288a2..511f26680 100644 --- a/tests/v1/sample/test_topk_topp_sampler.py +++ b/tests/v1/sample/test_topk_topp_sampler.py @@ -7,8 +7,7 @@ from torch import Generator from vllm.platforms import current_platform from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p_pytorch -CUDA_DEVICE = "cuda" if current_platform.is_cuda() else None -DEVICE = current_platform.device_type +DEVICE_TYPE = current_platform.device_type BATCH_SIZE = 1024 VOCAB_SIZE = 128 * 1024 @@ -26,8 +25,8 @@ def reset_default_device(): def test_topk_impl_equivalence(): - torch.set_default_device(DEVICE) - generator = Generator(device=DEVICE).manual_seed(33) + torch.set_default_device(DEVICE_TYPE) + generator = Generator(device=DEVICE_TYPE).manual_seed(33) logits = torch.rand((BATCH_SIZE, VOCAB_SIZE), generator=generator) @@ -76,8 +75,8 @@ def test_flashinfer_sampler(): if not FLASHINFER_ENABLED: pytest.skip("FlashInfer not installed or not available on this platform.") - torch.set_default_device(DEVICE) - generator = Generator(device=DEVICE).manual_seed(42) + torch.set_default_device(DEVICE_TYPE) + generator = Generator(device=DEVICE_TYPE).manual_seed(42) # Generate random logits logits = torch.rand((BATCH_SIZE, VOCAB_SIZE), generator=generator) @@ -128,15 +127,15 @@ def test_flashinfer_sampler(): # ============================================================================= -@pytest.mark.skipif(CUDA_DEVICE is None, reason="CUDA not available") +@pytest.mark.skipif(DEVICE_TYPE == "cpu", reason="CUDA/XPU not available") class TestTritonTopkTopp: """Tests for the Triton top-k/top-p kernel.""" @pytest.fixture(autouse=True) def setup(self): """Set up test fixtures.""" - torch.set_default_device(CUDA_DEVICE) - self.generator = Generator(device=CUDA_DEVICE).manual_seed(42) + torch.set_default_device(DEVICE_TYPE) + self.generator = Generator(device=DEVICE_TYPE).manual_seed(42) def _compare_results( self, diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index c10822024..5d587fa3e 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -42,6 +42,7 @@ dflash_target_dir = "Qwen/Qwen3-8B" dflash_dir = "z-lab/Qwen3-8B-DFlash-b16" BLOCK_SIZE = 16 +DEVICE_TYPE = current_platform.device_type def _create_proposer( @@ -92,7 +93,7 @@ def _create_proposer( # Overwrite pard_token to avoid crash during init speculative_config.draft_model_config.hf_config.pard_token = 0 - device = current_platform.device_type + device = DEVICE_TYPE vllm_config = VllmConfig( model_config=model_config, cache_config=CacheConfig(block_size=16), @@ -124,7 +125,7 @@ def test_prepare_next_token_ids(): either the GPU tensor of sampled_token_ids with -1 for rejected tokens, or the CPU python list[list[int]] with the rejected tokens removed. """ - device = torch.device(current_platform.device_type) + device = torch.device(DEVICE_TYPE) num_requests = 4 num_speculative_tokens = 4 @@ -207,7 +208,7 @@ def test_prepare_inputs(): a, a + 1, ..., a + b - n2 - 1, a + b, a + b + 1, ..., a + b + c - n3 - 1] """ - device = torch.device(current_platform.device_type) + device = torch.device(DEVICE_TYPE) # q1 = 4, q2 = 7, q3 = 5 # n1 = 1, n2 = 3, n3 = 2 @@ -300,7 +301,7 @@ def test_prepare_inputs_padded(): from the original indices to sample from.
""" - device = torch.device(current_platform.device_type) + device = torch.device(DEVICE_TYPE) expected_token_indices_to_sample = torch.tensor( [1, 5, 6], dtype=torch.int32, device=device @@ -370,7 +371,7 @@ def test_set_inputs_first_pass_default_eagle(): - After inserting next_tokens [100, 200, 300]: [a2, a3, 100, b2, 200, c2, c3, c4, 300] """ - device = torch.device(current_platform.device_type) + device = torch.device(DEVICE_TYPE) num_speculative_tokens = 3 proposer = _create_proposer("eagle", num_speculative_tokens) @@ -471,7 +472,7 @@ def test_set_inputs_first_pass_draft_model(): - idx 5: token 21, pos 1 - idx 6: token 200, pos 2 (bonus token) """ - device = torch.device(current_platform.device_type) + device = torch.device(DEVICE_TYPE) num_speculative_tokens = 2 block_size = BLOCK_SIZE @@ -609,7 +610,7 @@ def test_set_inputs_first_pass_parallel_drafting(): - idx 9: bonus token 200 - idx 10-11: parallel_drafting_tokens, is_masked=True """ - device = torch.device(current_platform.device_type) + device = torch.device(DEVICE_TYPE) num_speculative_tokens = 3 block_size = BLOCK_SIZE @@ -859,7 +860,7 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch): monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") # Use GPU device - device = torch.device(current_platform.device_type) + device = torch.device(DEVICE_TYPE) # Setup test parameters batch_size = 2 @@ -1030,7 +1031,7 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch): ) def test_propose_tree(spec_token_tree): # Get GPU device. - device = torch.device(current_platform.device_type) + device = torch.device(DEVICE_TYPE) # Setup test parameters. batch_size = 2 diff --git a/tests/v1/spec_decode/test_eagle_step_kernel.py b/tests/v1/spec_decode/test_eagle_step_kernel.py index 319ab4a33..275a157d1 100644 --- a/tests/v1/spec_decode/test_eagle_step_kernel.py +++ b/tests/v1/spec_decode/test_eagle_step_kernel.py @@ -5,11 +5,14 @@ import pytest import torch +from vllm.platforms import current_platform from vllm.v1.spec_decode.utils import ( PADDING_SLOT_ID, eagle_step_update_slot_mapping_and_metadata, ) +DEVICE_TYPE = current_platform.device_type + # Skip if no CUDA - Triton kernel requires GPU pytest.importorskip("triton") if not torch.cuda.is_available(): @@ -47,7 +50,7 @@ def _reference_eagle_step_slot_mapping( def test_eagle_step_slot_mapping_kernel(): """Test fused kernel matches Python reference for slot mapping and metadata.""" - device = torch.device("cuda") + device = torch.device(DEVICE_TYPE) batch_size = 32 block_size = 16 max_model_len = 4096 @@ -93,7 +96,7 @@ def test_eagle_step_slot_mapping_kernel(): def test_eagle_step_slot_mapping_kernel_exceeds_max(): """Test fused kernel when position exceeds max_model_len.""" - device = torch.device("cuda") + device = torch.device(DEVICE_TYPE) batch_size = 4 block_size = 16 max_model_len = 100 @@ -130,7 +133,7 @@ def test_eagle_step_slot_mapping_kernel_exceeds_max(): def test_eagle_step_slot_mapping_kernel_cudagraph_padding(): """Test that padding threads write PADDING_SLOT_ID when input_batch_size > batch_size (cudagraph padding).""" - device = torch.device("cuda") + device = torch.device(DEVICE_TYPE) batch_size = 4 input_batch_size = 8 block_size = 16 diff --git a/tests/v1/spec_decode/test_extract_hidden_states.py b/tests/v1/spec_decode/test_extract_hidden_states.py index 9f9758b82..95004eb65 100644 --- a/tests/v1/spec_decode/test_extract_hidden_states.py +++ b/tests/v1/spec_decode/test_extract_hidden_states.py @@ -27,6 +27,7 @@ from 
vllm.v1.spec_decode.extract_hidden_states import ExtractHiddenStatesPropose from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch model_dir = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" +DEVICE_TYPE = current_platform.device_type def _create_proposer( @@ -51,7 +52,7 @@ def _create_proposer( }, ) - device = current_platform.device_type + device = DEVICE_TYPE vllm_config = VllmConfig( model_config=model_config, cache_config=CacheConfig(), @@ -101,7 +102,7 @@ def test_proposer_initialization_missing_layer_ids(): }, ) - device = current_platform.device_type + device = DEVICE_TYPE vllm_config = VllmConfig( model_config=model_config, cache_config=CacheConfig(), @@ -130,7 +131,7 @@ def test_prepare_next_token_ids_padded(): For each request we either use the sampled token (if valid and not discarded) or a backup token from the request state. """ - device = torch.device(current_platform.device_type) + device = torch.device(DEVICE_TYPE) num_requests = 4 req_ids = [f"req_{i + 1}" for i in range(num_requests)] @@ -197,7 +198,7 @@ def test_propose(): 2. Return the sampled tokens as "draft" tokens (shape [batch_size, 1]) 3. Cache the hidden states in the model's KV cache """ - device = torch.device(current_platform.device_type) + device = torch.device(DEVICE_TYPE) # Setup test parameters batch_size = 2 @@ -273,7 +274,7 @@ def test_propose(): @pytest.mark.parametrize("num_hidden_layers", [1, 4, 8]) def test_propose_different_layer_counts(num_hidden_layers): """Test that propose works correctly with different numbers of hidden layers.""" - device = torch.device(current_platform.device_type) + device = torch.device(DEVICE_TYPE) batch_size = 2 num_tokens = 5 diff --git a/tests/v1/spec_decode/test_mtp.py b/tests/v1/spec_decode/test_mtp.py index 0a48b0e7b..094611e05 100644 --- a/tests/v1/spec_decode/test_mtp.py +++ b/tests/v1/spec_decode/test_mtp.py @@ -28,6 +28,7 @@ from vllm.v1.attention.backends.registry import AttentionBackendEnum from vllm.v1.spec_decode.eagle import EagleProposer mimo_7b_dir = "XiaomiMiMo/MiMo-7B-Base" +DEVICE_TYPE = current_platform.device_type def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer: @@ -48,7 +49,7 @@ def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer: model_config=model_config, cache_config=CacheConfig(), speculative_config=speculative_config, - device_config=DeviceConfig(device=current_platform.device_type), + device_config=DeviceConfig(device=DEVICE_TYPE), parallel_config=ParallelConfig(), load_config=LoadConfig(), scheduler_config=SchedulerConfig( @@ -57,7 +58,7 @@ def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer: ), ) - return EagleProposer(vllm_config=vllm_config, device=current_platform.device_type) + return EagleProposer(vllm_config=vllm_config, device=DEVICE_TYPE) @mock.patch("vllm.v1.spec_decode.eagle.get_pp_group") @@ -118,7 +119,7 @@ def test_mtp_load_model_unified(mock_get_model, mock_get_layers, mock_get_pp_gro def test_mtp_propose(num_speculative_tokens, monkeypatch): """Test that MTP's forward method returns hidden states directly""" - device = torch.device(current_platform.device_type) + device = torch.device(DEVICE_TYPE) batch_size = 2 seq_lens = [5, 3] total_tokens = sum(seq_lens) diff --git a/tests/v1/spec_decode/test_tree_attention.py b/tests/v1/spec_decode/test_tree_attention.py index 52bc722cf..cb487acec 100644 --- a/tests/v1/spec_decode/test_tree_attention.py +++ b/tests/v1/spec_decode/test_tree_attention.py @@ -18,6 +18,8 @@ from vllm.v1.attention.backend import 
CommonAttentionMetadata from vllm.v1.attention.backends.fa_utils import is_flash_attn_varlen_func_available from vllm.v1.attention.backends.registry import AttentionBackendEnum +DEVICE_TYPE = current_platform.device_type + if not is_flash_attn_varlen_func_available(): pytest.skip( "This test requires flash_attn_varlen_func, but it's not available.", @@ -170,9 +172,9 @@ def _get_available_reference_backends() -> list[AttentionBackendEnum]: class MockAttentionLayer(torch.nn.Module): - _q_scale = torch.tensor(1.0, dtype=torch.float32, device="cuda") - _k_scale = torch.tensor(1.0, dtype=torch.float32, device="cuda") - _v_scale = torch.tensor(1.0, dtype=torch.float32, device="cuda") + _q_scale = torch.tensor(1.0, dtype=torch.float32, device=DEVICE_TYPE) + _k_scale = torch.tensor(1.0, dtype=torch.float32, device=DEVICE_TYPE) + _v_scale = torch.tensor(1.0, dtype=torch.float32, device=DEVICE_TYPE) layer_name = "mock_layer" def __init__(self): diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index d4eee19ad..3a478d210 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ b/tests/v1/worker/test_gpu_input_batch.py @@ -22,10 +22,8 @@ from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch VOCAB_SIZE = 1024 NUM_OUTPUT_TOKENS = 20 MAX_PROMPT_SIZE = 100 -CUDA_DEVICES = [ - f"{current_platform.device_type}:{i}" - for i in range(min(current_platform.device_count(), 2)) -] +DEVICE_TYPE = current_platform.device_type +DEVICES = [f"{DEVICE_TYPE}:{i}" for i in range(min(current_platform.device_count(), 2))] MAX_NUM_PROMPT_TOKENS = 64 @@ -219,7 +217,7 @@ def _construct_cached_request_state(req_id_suffix: int): ) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("batch_size", [1, 2, 32, 64]) def test_sampling_metadata_in_input_batch(device: str, batch_size: int): """ @@ -313,7 +311,7 @@ def test_sampling_metadata_in_input_batch(device: str, batch_size: int): ) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("batch_size", [32]) @pytest.mark.parametrize("swap_list", [((0, 1),)]) def test_swap_states_in_input_batch(device: str, batch_size: int, swap_list: list): @@ -400,7 +398,7 @@ def _construct_pooling_request(req_id_suffix: int, pooling_params=None): ) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_pooling_prompt_lens_not_aliased(device: str): """Verify that prompt_lens in PoolingMetadata does not share memory with the internal num_prompt_tokens pinned buffer. 
Guards against possible diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index d7695027a..0de443858 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -45,7 +45,7 @@ from vllm.v1.worker.utils import AttentionGroup, select_common_block_size BLOCK_SIZE = 16 NUM_BLOCKS = 10 -DEVICE = current_platform.device_type +DEVICE_TYPE = current_platform.device_type def initialize_kv_cache(runner: GPUModelRunner): @@ -121,7 +121,7 @@ def model_runner(): vllm_config.compilation_config.static_forward_context["layer.0"] = Attention( num_heads, head_size, 0.1 ) - runner = GPUModelRunner(vllm_config, DEVICE) + runner = GPUModelRunner(vllm_config, DEVICE_TYPE) initialize_kv_cache(runner) yield runner @@ -340,7 +340,7 @@ def test_get_nans_in_logits(model_runner, dist_init): [1.0, 2.0, 3.0], [3.0, 2.0, 1.0], ], - device=DEVICE, + device=DEVICE_TYPE, ) result = model_runner._get_nans_in_logits(logits) assert result == {"req_0": 0, "req_1": 0} @@ -350,7 +350,7 @@ def test_get_nans_in_logits(model_runner, dist_init): [1.0, float("nan"), 3.0], [4.0, float("nan"), float("nan")], ], - device=DEVICE, + device=DEVICE_TYPE, ) result = model_runner._get_nans_in_logits(logits) assert result == {"req_0": 1, "req_1": 2} @@ -360,7 +360,7 @@ def test_get_nans_in_logits(model_runner, dist_init): [1.0, 2.0, 3.0], [4.0, float("nan"), float("nan")], ], - device=DEVICE, + device=DEVICE_TYPE, ) result = model_runner._get_nans_in_logits(logits) assert result == {"req_0": 0, "req_1": 2} @@ -372,7 +372,7 @@ def test_get_nans_in_logits(model_runner, dist_init): [ [1.0, float("nan"), 3.0], ], - device=DEVICE, + device=DEVICE_TYPE, ) result = model_runner._get_nans_in_logits(logits) assert result == {"req_0": 1, "req_1": 0} @@ -383,7 +383,7 @@ def test_get_nans_in_logits(model_runner, dist_init): [1.0, 2.0, 3.0], [float("nan"), 2.0, 3.0], ], - device=DEVICE, + device=DEVICE_TYPE, ) result = model_runner._get_nans_in_logits(logits) assert result == {"req_0": 2, "req_1": 0} @@ -643,7 +643,7 @@ def test_init_kv_cache_without_kv_sharing(default_vllm_config): # Set high context length to test max context length estimation vllm_config.model_config.max_model_len = 3_000_000 vllm_ctx = vllm_config.compilation_config.static_forward_context - runner = GPUModelRunner(vllm_config, DEVICE) + runner = GPUModelRunner(vllm_config, DEVICE_TYPE) kv_cache_spec = runner.get_kv_cache_spec() assert len(kv_cache_spec) == 2 assert len(runner.shared_kv_cache_layers) == 0 @@ -711,7 +711,7 @@ def test_init_kv_cache_with_kv_sharing_valid(default_vllm_config): # Set high context length to test max context length estimation vllm_config.model_config.max_model_len = 3_000_000 vllm_ctx = vllm_config.compilation_config.static_forward_context - runner = GPUModelRunner(vllm_config, DEVICE) + runner = GPUModelRunner(vllm_config, DEVICE_TYPE) kv_cache_spec = runner.get_kv_cache_spec() assert len(kv_cache_spec) == 1 assert layer_0 in kv_cache_spec @@ -850,7 +850,7 @@ def test_hybrid_attention_mamba_tensor_shapes(): assert fwd_context is not None vllm_ctx = vllm_config.compilation_config.static_forward_context - runner = GPUModelRunner(vllm_config, DEVICE) + runner = GPUModelRunner(vllm_config, DEVICE_TYPE) current_platform.update_block_size_for_backend(vllm_config) kv_cache_spec = runner.get_kv_cache_spec() @@ -896,13 +896,13 @@ def test_hybrid_attention_mamba_tensor_shapes(): ssm_constant_shape = ssm_shape[1:] attn_blocks_constant = torch.full( - (test_block_size, 
*attn_constant_shape), device=DEVICE, fill_value=3.33 + (test_block_size, *attn_constant_shape), device=DEVICE_TYPE, fill_value=3.33 ) conv_blocks_constant = torch.full( - (test_block_size, *conv_constant_shape), device=DEVICE, fill_value=6.66 + (test_block_size, *conv_constant_shape), device=DEVICE_TYPE, fill_value=6.66 ) ssm_blocks_constant = torch.full( - (test_block_size, *ssm_constant_shape), device=DEVICE, fill_value=9.99 + (test_block_size, *ssm_constant_shape), device=DEVICE_TYPE, fill_value=9.99 ) # Fill attention blocks with constants using kv block indices @@ -997,7 +997,7 @@ def test_hybrid_block_table_initialization(): max_num_blocks_per_req=max_num_blocks_per_req, max_num_batched_tokens=max_num_batched_tokens, pin_memory=False, - device=torch.device(DEVICE), + device=torch.device(DEVICE_TYPE), kernel_block_size=kernel_block_sizes[0], cp_kv_cache_interleave_size=cp_kv_cache_interleave_size, ) @@ -1036,7 +1036,7 @@ def test_input_batch_with_kernel_block_sizes(): max_num_reqs = 10 max_model_len = 512 max_num_batched_tokens = 512 - device = torch.device(DEVICE) + device = torch.device(DEVICE_TYPE) pin_memory = False vocab_size = 50272 @@ -1083,7 +1083,7 @@ def test_hybrid_cache_integration(default_vllm_config, dist_init): num_heads, head_size, 0.1 ) - runner = GPUModelRunner(vllm_config, DEVICE) + runner = GPUModelRunner(vllm_config, DEVICE_TYPE) # Initialize KV cache with configuration attn_spec = FullAttentionSpec( @@ -1306,7 +1306,7 @@ def test_mamba_cache_raises_when_max_num_seqs_exceeds_blocks(): ) assert fwd_context is not None - runner = GPUModelRunner(vllm_config, DEVICE) + runner = GPUModelRunner(vllm_config, DEVICE_TYPE) current_platform.update_block_size_for_backend(vllm_config) kv_cache_spec = runner.get_kv_cache_spec()