Refactor hard-coded device strings in test files under tests/v1 and tests/lora (#37566)
Signed-off-by: Liao, Wei <wei.liao@intel.com>
This commit is contained in:
@@ -637,7 +637,7 @@ def use_fused_moe_lora_kernel_tensor_parallel(
|
||||
|
||||
set_random_seed(seed)
|
||||
|
||||
device = torch.device(f"cuda:{local_rank}")
|
||||
device = torch.device(f"{DEVICE_TYPE}:{local_rank}")
|
||||
torch.accelerator.set_device_index(device)
|
||||
torch.set_default_device(device)
|
||||
torch.set_default_dtype(dtype)
|
||||
|
||||
@@ -60,8 +60,12 @@ pytestmark = pytest.mark.skipif(
|
||||
reason="Backend not supported",
|
||||
)
|
||||
|
||||
DEVICE_TYPE = current_platform.device_type
|
||||
DEVICES = (
|
||||
[f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2)]
|
||||
[
|
||||
f"{DEVICE_TYPE}:{i}"
|
||||
for i in range(1 if torch.accelerator.device_count() == 1 else 2)
|
||||
]
|
||||
if current_platform.is_cuda_alike()
|
||||
else ["cpu"]
|
||||
)
|
||||
@@ -196,7 +200,7 @@ def create_random_inputs(
|
||||
input_size: tuple[int, ...],
|
||||
input_range: tuple[float, float],
|
||||
input_type: torch.dtype = torch.int,
|
||||
device: torch.device = "cuda",
|
||||
device: torch.device = DEVICE_TYPE,
|
||||
) -> tuple[list[torch.Tensor], list[int], list[int]]:
|
||||
"""Creates random inputs.
|
||||
|
||||
|
||||
@@ -35,9 +35,9 @@ EMBEDDING_MODULES = {
|
||||
"lm_head": "output_embeddings",
|
||||
}
|
||||
|
||||
|
||||
DEVICE_TYPE = current_platform.device_type
|
||||
DEVICES = (
|
||||
[f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2)]
|
||||
[f"{DEVICE_TYPE}:{i}" for i in range(min(torch.accelerator.device_count(), 2))]
|
||||
if current_platform.is_cuda_alike()
|
||||
else ["cpu"]
|
||||
)
|
||||
|
||||
@@ -6,6 +6,9 @@ import pytest
|
||||
import torch
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
DEVICE_TYPE = current_platform.device_type
|
||||
|
||||
|
||||
def round_up(x, base):
|
||||
@@ -27,7 +30,7 @@ def sample_data(num_experts, max_loras, num_tokens, topk_num):
|
||||
topk_ids[i, j] = pool[j]
|
||||
token_lora_mapping[i] = random.randint(0, max_loras - 1)
|
||||
|
||||
return topk_ids.to("cuda"), token_lora_mapping.to("cuda")
|
||||
return topk_ids.to(DEVICE_TYPE), token_lora_mapping.to(DEVICE_TYPE)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_tokens", [100, 200, 1024, 4096]) # 81920
|
||||
@@ -56,14 +59,21 @@ def test_moe_lora_align_block_size(
|
||||
(max_loras * max_num_tokens_padded,),
|
||||
topk_ids.numel(),
|
||||
dtype=torch.int32,
|
||||
device="cuda",
|
||||
device=DEVICE_TYPE,
|
||||
)
|
||||
expert_ids = torch.full(
|
||||
(max_loras * max_num_m_blocks,), num_experts, dtype=torch.int32, device="cuda"
|
||||
(max_loras * max_num_m_blocks,),
|
||||
num_experts,
|
||||
dtype=torch.int32,
|
||||
device=DEVICE_TYPE,
|
||||
)
|
||||
num_tokens_post_pad = torch.zeros((max_loras,), dtype=torch.int32, device="cuda")
|
||||
adapter_enabled = torch.ones((max_loras + 1,), dtype=torch.int32, device="cuda")
|
||||
lora_ids = torch.arange(max_loras + 2, dtype=torch.int32, device="cuda")
|
||||
num_tokens_post_pad = torch.zeros(
|
||||
(max_loras,), dtype=torch.int32, device=DEVICE_TYPE
|
||||
)
|
||||
adapter_enabled = torch.ones(
|
||||
(max_loras + 1,), dtype=torch.int32, device=DEVICE_TYPE
|
||||
)
|
||||
lora_ids = torch.arange(max_loras + 2, dtype=torch.int32, device=DEVICE_TYPE)
|
||||
|
||||
# call kernel
|
||||
ops.moe_lora_align_block_size(
|
||||
|
||||
@@ -9,10 +9,13 @@ import vllm.lora.ops.torch_ops as torch_ops
|
||||
import vllm.lora.ops.triton_ops as triton_ops
|
||||
from vllm.lora.ops.triton_ops import LoRAKernelMeta
|
||||
from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.torch_utils import set_random_seed
|
||||
|
||||
from .utils import PunicaTensors, assert_close, generate_data_for_nslices
|
||||
|
||||
DEVICE_TYPE = current_platform.device_type
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def reset_device(reset_default_device):
|
||||
@@ -146,7 +149,9 @@ def check_lora_shrink_kernel(
|
||||
|
||||
# Setup metadata information for the LoRA kernel.
|
||||
lora_meta = LoRAKernelMeta.make(
|
||||
max_loras=num_loras, max_num_tokens=token_nums, device="cuda"
|
||||
max_loras=num_loras,
|
||||
max_num_tokens=token_nums,
|
||||
device=DEVICE_TYPE,
|
||||
)
|
||||
lora_meta.prepare_tensors(data.token_lora_mapping)
|
||||
|
||||
@@ -219,7 +224,9 @@ def check_lora_expand_kernel(
|
||||
|
||||
# Setup metadata information for the LoRA kernel.
|
||||
lora_meta = LoRAKernelMeta.make(
|
||||
max_loras=num_loras, max_num_tokens=token_nums, device="cuda"
|
||||
max_loras=num_loras,
|
||||
max_num_tokens=token_nums,
|
||||
device=DEVICE_TYPE,
|
||||
)
|
||||
lora_meta.prepare_tensors(data.token_lora_mapping)
|
||||
|
||||
@@ -367,7 +374,7 @@ test_params = {
|
||||
}
|
||||
|
||||
DTYPES = [torch.float16, torch.bfloat16]
|
||||
DEVICES = [f"cuda:{0}"]
|
||||
DEVICES = [f"{DEVICE_TYPE}:{0}"]
|
||||
SEED = [0]
|
||||
|
||||
|
||||
|
||||
@@ -28,9 +28,11 @@ from vllm.lora.ops.triton_ops.lora_shrink_fp8_op import (
|
||||
_SHRINK_LORA_SCALE_PTR_DICT,
|
||||
)
|
||||
from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.torch_utils import set_random_seed
|
||||
|
||||
DEVICES = [f"cuda:{0}"]
|
||||
DEVICE_TYPE = current_platform.device_type
|
||||
DEVICES = [f"{DEVICE_TYPE}:{0}"]
|
||||
SEED = [0]
|
||||
|
||||
_dict_lock = Lock()
|
||||
|
||||
@@ -19,11 +19,14 @@ from vllm.config.load import LoadConfig
|
||||
from vllm.config.lora import LoRAConfig
|
||||
from vllm.lora.model_manager import LoRAMapping
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.v1.worker.gpu_worker import Worker
|
||||
|
||||
MODEL_PATH = "Qwen/Qwen3-0.6B"
|
||||
NUM_LORAS = 16
|
||||
|
||||
DEVICE_TYPE = current_platform.device_type
|
||||
|
||||
|
||||
@patch.dict(os.environ, {"RANK": "0"})
|
||||
def test_worker_apply_lora(qwen3_lora_files):
|
||||
@@ -61,7 +64,7 @@ def test_worker_apply_lora(qwen3_lora_files):
|
||||
max_num_seqs=32,
|
||||
max_num_partial_prefills=32,
|
||||
),
|
||||
device_config=DeviceConfig("cuda"),
|
||||
device_config=DeviceConfig(DEVICE_TYPE),
|
||||
cache_config=CacheConfig(
|
||||
block_size=16,
|
||||
cache_dtype="auto",
|
||||
|
||||
@@ -9,10 +9,13 @@ import torch
|
||||
from safetensors.torch import save_file
|
||||
|
||||
from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
DEVICE_TYPE = current_platform.device_type
|
||||
|
||||
|
||||
class DummyLoRAManager:
|
||||
def __init__(self, device: torch.device = "cuda:0"):
|
||||
def __init__(self, device: torch.device = f"{DEVICE_TYPE}:0"):
|
||||
super().__init__()
|
||||
self._loras: dict[str, LoRALayerWeights] = {}
|
||||
self._device = device
|
||||
@@ -57,8 +60,8 @@ class DummyLoRAManager:
|
||||
module_name,
|
||||
rank=rank,
|
||||
lora_alpha=1,
|
||||
lora_a=torch.rand([rank, input_dim], device="cuda"),
|
||||
lora_b=torch.rand([output_dim, input_dim], device="cuda"),
|
||||
lora_a=torch.rand([rank, input_dim], device=DEVICE_TYPE),
|
||||
lora_b=torch.rand([output_dim, input_dim], device=DEVICE_TYPE),
|
||||
embeddings_tensor=embeddings_tensor,
|
||||
)
|
||||
self.set_module_lora(module_name, lora)
|
||||
|
||||
Reference in New Issue
Block a user