refactor hard-coded device strings in test files under tests/v1 and tests/lora (#37566)

Signed-off-by: Liao, Wei <wei.liao@intel.com>
This commit is contained in:
wliao2
2026-04-02 20:21:47 -07:00
committed by GitHub
parent 4a06e1246e
commit 32e0c0bfa2
28 changed files with 239 additions and 146 deletions

View File

@@ -637,7 +637,7 @@ def use_fused_moe_lora_kernel_tensor_parallel(
set_random_seed(seed)
device = torch.device(f"cuda:{local_rank}")
device = torch.device(f"{DEVICE_TYPE}:{local_rank}")
torch.accelerator.set_device_index(device)
torch.set_default_device(device)
torch.set_default_dtype(dtype)

View File

@@ -60,8 +60,12 @@ pytestmark = pytest.mark.skipif(
reason="Backend not supported",
)
DEVICE_TYPE = current_platform.device_type
DEVICES = (
[f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2)]
[
f"{DEVICE_TYPE}:{i}"
for i in range(1 if torch.accelerator.device_count() == 1 else 2)
]
if current_platform.is_cuda_alike()
else ["cpu"]
)
@@ -196,7 +200,7 @@ def create_random_inputs(
input_size: tuple[int, ...],
input_range: tuple[float, float],
input_type: torch.dtype = torch.int,
device: torch.device = "cuda",
device: torch.device = DEVICE_TYPE,
) -> tuple[list[torch.Tensor], list[int], list[int]]:
"""Creates random inputs.

View File

@@ -35,9 +35,9 @@ EMBEDDING_MODULES = {
"lm_head": "output_embeddings",
}
DEVICE_TYPE = current_platform.device_type
DEVICES = (
[f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2)]
[f"{DEVICE_TYPE}:{i}" for i in range(min(torch.accelerator.device_count(), 2))]
if current_platform.is_cuda_alike()
else ["cpu"]
)

View File

@@ -6,6 +6,9 @@ import pytest
import torch
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
DEVICE_TYPE = current_platform.device_type
def round_up(x, base):
@@ -27,7 +30,7 @@ def sample_data(num_experts, max_loras, num_tokens, topk_num):
topk_ids[i, j] = pool[j]
token_lora_mapping[i] = random.randint(0, max_loras - 1)
return topk_ids.to("cuda"), token_lora_mapping.to("cuda")
return topk_ids.to(DEVICE_TYPE), token_lora_mapping.to(DEVICE_TYPE)
@pytest.mark.parametrize("num_tokens", [100, 200, 1024, 4096]) # 81920
@@ -56,14 +59,21 @@ def test_moe_lora_align_block_size(
(max_loras * max_num_tokens_padded,),
topk_ids.numel(),
dtype=torch.int32,
device="cuda",
device=DEVICE_TYPE,
)
expert_ids = torch.full(
(max_loras * max_num_m_blocks,), num_experts, dtype=torch.int32, device="cuda"
(max_loras * max_num_m_blocks,),
num_experts,
dtype=torch.int32,
device=DEVICE_TYPE,
)
num_tokens_post_pad = torch.zeros((max_loras,), dtype=torch.int32, device="cuda")
adapter_enabled = torch.ones((max_loras + 1,), dtype=torch.int32, device="cuda")
lora_ids = torch.arange(max_loras + 2, dtype=torch.int32, device="cuda")
num_tokens_post_pad = torch.zeros(
(max_loras,), dtype=torch.int32, device=DEVICE_TYPE
)
adapter_enabled = torch.ones(
(max_loras + 1,), dtype=torch.int32, device=DEVICE_TYPE
)
lora_ids = torch.arange(max_loras + 2, dtype=torch.int32, device=DEVICE_TYPE)
# call kernel
ops.moe_lora_align_block_size(

View File

@@ -9,10 +9,13 @@ import vllm.lora.ops.torch_ops as torch_ops
import vllm.lora.ops.triton_ops as triton_ops
from vllm.lora.ops.triton_ops import LoRAKernelMeta
from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
from .utils import PunicaTensors, assert_close, generate_data_for_nslices
DEVICE_TYPE = current_platform.device_type
@pytest.fixture(autouse=True)
def reset_device(reset_default_device):
@@ -146,7 +149,9 @@ def check_lora_shrink_kernel(
# Setup metadata information for the LoRA kernel.
lora_meta = LoRAKernelMeta.make(
max_loras=num_loras, max_num_tokens=token_nums, device="cuda"
max_loras=num_loras,
max_num_tokens=token_nums,
device=DEVICE_TYPE,
)
lora_meta.prepare_tensors(data.token_lora_mapping)
@@ -219,7 +224,9 @@ def check_lora_expand_kernel(
# Setup metadata information for the LoRA kernel.
lora_meta = LoRAKernelMeta.make(
max_loras=num_loras, max_num_tokens=token_nums, device="cuda"
max_loras=num_loras,
max_num_tokens=token_nums,
device=DEVICE_TYPE,
)
lora_meta.prepare_tensors(data.token_lora_mapping)
@@ -367,7 +374,7 @@ test_params = {
}
DTYPES = [torch.float16, torch.bfloat16]
DEVICES = [f"cuda:{0}"]
DEVICES = [f"{DEVICE_TYPE}:{0}"]
SEED = [0]

View File

@@ -28,9 +28,11 @@ from vllm.lora.ops.triton_ops.lora_shrink_fp8_op import (
_SHRINK_LORA_SCALE_PTR_DICT,
)
from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
DEVICES = [f"cuda:{0}"]
DEVICE_TYPE = current_platform.device_type
DEVICES = [f"{DEVICE_TYPE}:{0}"]
SEED = [0]
_dict_lock = Lock()

View File

@@ -19,11 +19,14 @@ from vllm.config.load import LoadConfig
from vllm.config.lora import LoRAConfig
from vllm.lora.model_manager import LoRAMapping
from vllm.lora.request import LoRARequest
from vllm.platforms import current_platform
from vllm.v1.worker.gpu_worker import Worker
MODEL_PATH = "Qwen/Qwen3-0.6B"
NUM_LORAS = 16
DEVICE_TYPE = current_platform.device_type
@patch.dict(os.environ, {"RANK": "0"})
def test_worker_apply_lora(qwen3_lora_files):
@@ -61,7 +64,7 @@ def test_worker_apply_lora(qwen3_lora_files):
max_num_seqs=32,
max_num_partial_prefills=32,
),
device_config=DeviceConfig("cuda"),
device_config=DeviceConfig(DEVICE_TYPE),
cache_config=CacheConfig(
block_size=16,
cache_dtype="auto",

View File

@@ -9,10 +9,13 @@ import torch
from safetensors.torch import save_file
from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
from vllm.platforms import current_platform
DEVICE_TYPE = current_platform.device_type
class DummyLoRAManager:
def __init__(self, device: torch.device = "cuda:0"):
def __init__(self, device: torch.device = f"{DEVICE_TYPE}:0"):
super().__init__()
self._loras: dict[str, LoRALayerWeights] = {}
self._device = device
@@ -57,8 +60,8 @@ class DummyLoRAManager:
module_name,
rank=rank,
lora_alpha=1,
lora_a=torch.rand([rank, input_dim], device="cuda"),
lora_b=torch.rand([output_dim, input_dim], device="cuda"),
lora_a=torch.rand([rank, input_dim], device=DEVICE_TYPE),
lora_b=torch.rand([output_dim, input_dim], device=DEVICE_TYPE),
embeddings_tensor=embeddings_tensor,
)
self.set_module_lora(module_name, lora)