Refactor hard-coded device strings in test files under tests/v1 and tests/lora (#37566)

Signed-off-by: Liao, Wei <wei.liao@intel.com>
2026-04-02 20:21:47 -07:00
parent 4a06e1246e
commit 32e0c0bfa2
28 changed files with 239 additions and 146 deletions
--- a/tests/lora/test_fused_moe_lora_kernel.py
+++ b/tests/lora/test_fused_moe_lora_kernel.py
@@ -637,7 +637,7 @@ def use_fused_moe_lora_kernel_tensor_parallel(

    set_random_seed(seed)

-    device = torch.device(f"cuda:{local_rank}")
+    device = torch.device(f"{DEVICE_TYPE}:{local_rank}")
    torch.accelerator.set_device_index(device)
    torch.set_default_device(device)
    torch.set_default_dtype(dtype)
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -60,8 +60,12 @@ pytestmark = pytest.mark.skipif(
    reason="Backend not supported",
 )

+DEVICE_TYPE = current_platform.device_type
 DEVICES = (
-    [f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2)]
+    [
+        f"{DEVICE_TYPE}:{i}"
+        for i in range(1 if torch.accelerator.device_count() == 1 else 2)
+    ]
    if current_platform.is_cuda_alike()
    else ["cpu"]
 )
@@ -196,7 +200,7 @@ def create_random_inputs(
    input_size: tuple[int, ...],
    input_range: tuple[float, float],
    input_type: torch.dtype = torch.int,
-    device: torch.device = "cuda",
+    device: torch.device = DEVICE_TYPE,
 ) -> tuple[list[torch.Tensor], list[int], list[int]]:
    """Creates random inputs.

--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -35,9 +35,9 @@ EMBEDDING_MODULES = {
    "lm_head": "output_embeddings",
 }

-
+DEVICE_TYPE = current_platform.device_type
 DEVICES = (
-    [f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2)]
+    [f"{DEVICE_TYPE}:{i}" for i in range(min(torch.accelerator.device_count(), 2))]
    if current_platform.is_cuda_alike()
    else ["cpu"]
 )
--- a/tests/lora/test_moe_lora_align_sum.py
+++ b/tests/lora/test_moe_lora_align_sum.py
@@ -6,6 +6,9 @@ import pytest
 import torch

 from vllm import _custom_ops as ops
+from vllm.platforms import current_platform
+
+DEVICE_TYPE = current_platform.device_type


 def round_up(x, base):
@@ -27,7 +30,7 @@ def sample_data(num_experts, max_loras, num_tokens, topk_num):
            topk_ids[i, j] = pool[j]
        token_lora_mapping[i] = random.randint(0, max_loras - 1)

-    return topk_ids.to("cuda"), token_lora_mapping.to("cuda")
+    return topk_ids.to(DEVICE_TYPE), token_lora_mapping.to(DEVICE_TYPE)


@pytest.mark.parametrize("num_tokens", [100, 200, 1024, 4096])  # 81920
@@ -56,14 +59,21 @@ def test_moe_lora_align_block_size(
        (max_loras * max_num_tokens_padded,),
        topk_ids.numel(),
        dtype=torch.int32,
-        device="cuda",
+        device=DEVICE_TYPE,
    )
    expert_ids = torch.full(
-        (max_loras * max_num_m_blocks,), num_experts, dtype=torch.int32, device="cuda"
+        (max_loras * max_num_m_blocks,),
+        num_experts,
+        dtype=torch.int32,
+        device=DEVICE_TYPE,
    )
-    num_tokens_post_pad = torch.zeros((max_loras,), dtype=torch.int32, device="cuda")
-    adapter_enabled = torch.ones((max_loras + 1,), dtype=torch.int32, device="cuda")
-    lora_ids = torch.arange(max_loras + 2, dtype=torch.int32, device="cuda")
+    num_tokens_post_pad = torch.zeros(
+        (max_loras,), dtype=torch.int32, device=DEVICE_TYPE
+    )
+    adapter_enabled = torch.ones(
+        (max_loras + 1,), dtype=torch.int32, device=DEVICE_TYPE
+    )
+    lora_ids = torch.arange(max_loras + 2, dtype=torch.int32, device=DEVICE_TYPE)

    # call kernel
    ops.moe_lora_align_block_size(
--- a/tests/lora/test_punica_ops.py
+++ b/tests/lora/test_punica_ops.py
@@ -9,10 +9,13 @@ import vllm.lora.ops.torch_ops as torch_ops
 import vllm.lora.ops.triton_ops as triton_ops
 from vllm.lora.ops.triton_ops import LoRAKernelMeta
 from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
+from vllm.platforms import current_platform
 from vllm.utils.torch_utils import set_random_seed

 from .utils import PunicaTensors, assert_close, generate_data_for_nslices

+DEVICE_TYPE = current_platform.device_type
+

@pytest.fixture(autouse=True)
 def reset_device(reset_default_device):
@@ -146,7 +149,9 @@ def check_lora_shrink_kernel(

    # Setup metadata information for the LoRA kernel.
    lora_meta = LoRAKernelMeta.make(
-        max_loras=num_loras, max_num_tokens=token_nums, device="cuda"
+        max_loras=num_loras,
+        max_num_tokens=token_nums,
+        device=DEVICE_TYPE,
    )
    lora_meta.prepare_tensors(data.token_lora_mapping)

@@ -219,7 +224,9 @@ def check_lora_expand_kernel(

    # Setup metadata information for the LoRA kernel.
    lora_meta = LoRAKernelMeta.make(
-        max_loras=num_loras, max_num_tokens=token_nums, device="cuda"
+        max_loras=num_loras,
+        max_num_tokens=token_nums,
+        device=DEVICE_TYPE,
    )
    lora_meta.prepare_tensors(data.token_lora_mapping)

@@ -367,7 +374,7 @@ test_params = {
 }

 DTYPES = [torch.float16, torch.bfloat16]
-DEVICES = [f"cuda:{0}"]
+DEVICES = [f"{DEVICE_TYPE}:{0}"]
 SEED = [0]


--- a/tests/lora/test_punica_ops_fp8.py
+++ b/tests/lora/test_punica_ops_fp8.py
@@ -28,9 +28,11 @@ from vllm.lora.ops.triton_ops.lora_shrink_fp8_op import (
    _SHRINK_LORA_SCALE_PTR_DICT,
 )
 from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
+from vllm.platforms import current_platform
 from vllm.utils.torch_utils import set_random_seed

-DEVICES = [f"cuda:{0}"]
+DEVICE_TYPE = current_platform.device_type
+DEVICES = [f"{DEVICE_TYPE}:{0}"]
 SEED = [0]

 _dict_lock = Lock()
--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -19,11 +19,14 @@ from vllm.config.load import LoadConfig
 from vllm.config.lora import LoRAConfig
 from vllm.lora.model_manager import LoRAMapping
 from vllm.lora.request import LoRARequest
+from vllm.platforms import current_platform
 from vllm.v1.worker.gpu_worker import Worker

 MODEL_PATH = "Qwen/Qwen3-0.6B"
 NUM_LORAS = 16

+DEVICE_TYPE = current_platform.device_type
+

@patch.dict(os.environ, {"RANK": "0"})
 def test_worker_apply_lora(qwen3_lora_files):
@@ -61,7 +64,7 @@ def test_worker_apply_lora(qwen3_lora_files):
            max_num_seqs=32,
            max_num_partial_prefills=32,
        ),
-        device_config=DeviceConfig("cuda"),
+        device_config=DeviceConfig(DEVICE_TYPE),
        cache_config=CacheConfig(
            block_size=16,
            cache_dtype="auto",
--- a/tests/lora/utils.py
+++ b/tests/lora/utils.py
@@ -9,10 +9,13 @@ import torch
 from safetensors.torch import save_file

 from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
+from vllm.platforms import current_platform
+
+DEVICE_TYPE = current_platform.device_type


 class DummyLoRAManager:
-    def __init__(self, device: torch.device = "cuda:0"):
+    def __init__(self, device: torch.device = f"{DEVICE_TYPE}:0"):
        super().__init__()
        self._loras: dict[str, LoRALayerWeights] = {}
        self._device = device
@@ -57,8 +60,8 @@ class DummyLoRAManager:
            module_name,
            rank=rank,
            lora_alpha=1,
-            lora_a=torch.rand([rank, input_dim], device="cuda"),
-            lora_b=torch.rand([output_dim, input_dim], device="cuda"),
+            lora_a=torch.rand([rank, input_dim], device=DEVICE_TYPE),
+            lora_b=torch.rand([output_dim, input_dim], device=DEVICE_TYPE),
            embeddings_tensor=embeddings_tensor,
        )
        self.set_module_lora(module_name, lora)