diff --git a/tests/lora/test_fused_moe_lora_kernel.py b/tests/lora/test_fused_moe_lora_kernel.py index 66a985a06..8adc20865 100644 --- a/tests/lora/test_fused_moe_lora_kernel.py +++ b/tests/lora/test_fused_moe_lora_kernel.py @@ -637,7 +637,7 @@ def use_fused_moe_lora_kernel_tensor_parallel( set_random_seed(seed) - device = torch.device(f"cuda:{local_rank}") + device = torch.device(f"{DEVICE_TYPE}:{local_rank}") torch.accelerator.set_device_index(device) torch.set_default_device(device) torch.set_default_dtype(dtype) diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 08fd03724..2a37abac6 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -60,8 +60,12 @@ pytestmark = pytest.mark.skipif( reason="Backend not supported", ) +DEVICE_TYPE = current_platform.device_type DEVICES = ( - [f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2)] + [ + f"{DEVICE_TYPE}:{i}" + for i in range(1 if torch.accelerator.device_count() == 1 else 2) + ] if current_platform.is_cuda_alike() else ["cpu"] ) @@ -196,7 +200,7 @@ def create_random_inputs( input_size: tuple[int, ...], input_range: tuple[float, float], input_type: torch.dtype = torch.int, - device: torch.device = "cuda", + device: torch.device = DEVICE_TYPE, ) -> tuple[list[torch.Tensor], list[int], list[int]]: """Creates random inputs. diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index e7addab11..e80d96f00 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -35,9 +35,9 @@ EMBEDDING_MODULES = { "lm_head": "output_embeddings", } - +DEVICE_TYPE = current_platform.device_type DEVICES = ( - [f"cuda:{i}" for i in range(1 if torch.accelerator.device_count() == 1 else 2)] + [f"{DEVICE_TYPE}:{i}" for i in range(min(torch.accelerator.device_count(), 2))] if current_platform.is_cuda_alike() else ["cpu"] ) diff --git a/tests/lora/test_moe_lora_align_sum.py b/tests/lora/test_moe_lora_align_sum.py index bb46b4d86..1451ec162 100644 --- a/tests/lora/test_moe_lora_align_sum.py +++ b/tests/lora/test_moe_lora_align_sum.py @@ -6,6 +6,9 @@ import pytest import torch from vllm import _custom_ops as ops +from vllm.platforms import current_platform + +DEVICE_TYPE = current_platform.device_type def round_up(x, base): @@ -27,7 +30,7 @@ def sample_data(num_experts, max_loras, num_tokens, topk_num): topk_ids[i, j] = pool[j] token_lora_mapping[i] = random.randint(0, max_loras - 1) - return topk_ids.to("cuda"), token_lora_mapping.to("cuda") + return topk_ids.to(DEVICE_TYPE), token_lora_mapping.to(DEVICE_TYPE) @pytest.mark.parametrize("num_tokens", [100, 200, 1024, 4096]) # 81920 @@ -56,14 +59,21 @@ def test_moe_lora_align_block_size( (max_loras * max_num_tokens_padded,), topk_ids.numel(), dtype=torch.int32, - device="cuda", + device=DEVICE_TYPE, ) expert_ids = torch.full( - (max_loras * max_num_m_blocks,), num_experts, dtype=torch.int32, device="cuda" + (max_loras * max_num_m_blocks,), + num_experts, + dtype=torch.int32, + device=DEVICE_TYPE, ) - num_tokens_post_pad = torch.zeros((max_loras,), dtype=torch.int32, device="cuda") - adapter_enabled = torch.ones((max_loras + 1,), dtype=torch.int32, device="cuda") - lora_ids = torch.arange(max_loras + 2, dtype=torch.int32, device="cuda") + num_tokens_post_pad = torch.zeros( + (max_loras,), dtype=torch.int32, device=DEVICE_TYPE + ) + adapter_enabled = torch.ones( + (max_loras + 1,), dtype=torch.int32, device=DEVICE_TYPE + ) + lora_ids = torch.arange(max_loras + 2, dtype=torch.int32, device=DEVICE_TYPE) # call 
kernel ops.moe_lora_align_block_size( diff --git a/tests/lora/test_punica_ops.py b/tests/lora/test_punica_ops.py index 8a2634e82..7706d0e2a 100644 --- a/tests/lora/test_punica_ops.py +++ b/tests/lora/test_punica_ops.py @@ -9,10 +9,13 @@ import vllm.lora.ops.torch_ops as torch_ops import vllm.lora.ops.triton_ops as triton_ops from vllm.lora.ops.triton_ops import LoRAKernelMeta from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT +from vllm.platforms import current_platform from vllm.utils.torch_utils import set_random_seed from .utils import PunicaTensors, assert_close, generate_data_for_nslices +DEVICE_TYPE = current_platform.device_type + @pytest.fixture(autouse=True) def reset_device(reset_default_device): @@ -146,7 +149,9 @@ def check_lora_shrink_kernel( # Setup metadata information for the LoRA kernel. lora_meta = LoRAKernelMeta.make( - max_loras=num_loras, max_num_tokens=token_nums, device="cuda" + max_loras=num_loras, + max_num_tokens=token_nums, + device=DEVICE_TYPE, ) lora_meta.prepare_tensors(data.token_lora_mapping) @@ -219,7 +224,9 @@ def check_lora_expand_kernel( # Setup metadata information for the LoRA kernel. lora_meta = LoRAKernelMeta.make( - max_loras=num_loras, max_num_tokens=token_nums, device="cuda" + max_loras=num_loras, + max_num_tokens=token_nums, + device=DEVICE_TYPE, ) lora_meta.prepare_tensors(data.token_lora_mapping) @@ -367,7 +374,7 @@ test_params = { } DTYPES = [torch.float16, torch.bfloat16] -DEVICES = [f"cuda:{0}"] +DEVICES = [f"{DEVICE_TYPE}:{0}"] SEED = [0] diff --git a/tests/lora/test_punica_ops_fp8.py b/tests/lora/test_punica_ops_fp8.py index 042313336..3e7fe7b27 100644 --- a/tests/lora/test_punica_ops_fp8.py +++ b/tests/lora/test_punica_ops_fp8.py @@ -28,9 +28,11 @@ from vllm.lora.ops.triton_ops.lora_shrink_fp8_op import ( _SHRINK_LORA_SCALE_PTR_DICT, ) from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT +from vllm.platforms import current_platform from vllm.utils.torch_utils import set_random_seed -DEVICES = [f"cuda:{0}"] +DEVICE_TYPE = current_platform.device_type +DEVICES = [f"{DEVICE_TYPE}:{0}"] SEED = [0] _dict_lock = Lock() diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 4af3ccf89..88763551c 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -19,11 +19,14 @@ from vllm.config.load import LoadConfig from vllm.config.lora import LoRAConfig from vllm.lora.model_manager import LoRAMapping from vllm.lora.request import LoRARequest +from vllm.platforms import current_platform from vllm.v1.worker.gpu_worker import Worker MODEL_PATH = "Qwen/Qwen3-0.6B" NUM_LORAS = 16 +DEVICE_TYPE = current_platform.device_type + @patch.dict(os.environ, {"RANK": "0"}) def test_worker_apply_lora(qwen3_lora_files): @@ -61,7 +64,7 @@ def test_worker_apply_lora(qwen3_lora_files): max_num_seqs=32, max_num_partial_prefills=32, ), - device_config=DeviceConfig("cuda"), + device_config=DeviceConfig(DEVICE_TYPE), cache_config=CacheConfig( block_size=16, cache_dtype="auto", diff --git a/tests/lora/utils.py b/tests/lora/utils.py index 6aba5299b..e5ce7a884 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -9,10 +9,13 @@ import torch from safetensors.torch import save_file from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights +from vllm.platforms import current_platform + +DEVICE_TYPE = current_platform.device_type class DummyLoRAManager: - def __init__(self, device: torch.device = "cuda:0"): + def __init__(self, device: torch.device = f"{DEVICE_TYPE}:0"): 
super().__init__() self._loras: dict[str, LoRALayerWeights] = {} self._device = device @@ -57,8 +60,8 @@ class DummyLoRAManager: module_name, rank=rank, lora_alpha=1, - lora_a=torch.rand([rank, input_dim], device="cuda"), - lora_b=torch.rand([output_dim, input_dim], device="cuda"), + lora_a=torch.rand([rank, input_dim], device=DEVICE_TYPE), + lora_b=torch.rand([output_dim, input_dim], device=DEVICE_TYPE), embeddings_tensor=embeddings_tensor, ) self.set_module_lora(module_name, lora) diff --git a/tests/v1/attention/test_attention_backends.py b/tests/v1/attention/test_attention_backends.py index 8c3a62b6e..06095b87e 100644 --- a/tests/v1/attention/test_attention_backends.py +++ b/tests/v1/attention/test_attention_backends.py @@ -40,6 +40,8 @@ BACKENDS_TO_TEST = [ "FLEX_ATTENTION_SLOW", ] +DEVICE_TYPE = current_platform.device_type + # Remove flashinfer from the list if it's not available try: import flashinfer # noqa: F401 @@ -366,7 +368,7 @@ def _test_backend_correctness( num_gpu_blocks=8192, hf_config_override=hf_config_override, ) - device = torch.device("cuda:0") + device = torch.device(f"{DEVICE_TYPE}:0") kv_cache_spec = create_standard_kv_cache_spec(vllm_config) diff --git a/tests/v1/attention/test_chunked_local_attention.py b/tests/v1/attention/test_chunked_local_attention.py index 4529c2cfc..c2798c8f2 100644 --- a/tests/v1/attention/test_chunked_local_attention.py +++ b/tests/v1/attention/test_chunked_local_attention.py @@ -7,6 +7,7 @@ import pytest import torch from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata +from vllm.platforms import current_platform from vllm.v1.attention.backends.utils import make_local_attention_virtual_batches @@ -22,6 +23,8 @@ class LocalAttentionTestData: expected_local_block_table: list[list[int]] +DEVICE_TYPE = current_platform.device_type + test_data_list = [ # Same as example in docstring of make_local_attention_virtual_batches # except block table has 9 columns instead of 10 @@ -151,7 +154,7 @@ test_data_list = [ @pytest.mark.parametrize("test_data", test_data_list) def test_local_attention_virtual_batches(test_data: LocalAttentionTestData): - device = torch.device("cuda:0") + device = torch.device(f"{DEVICE_TYPE}:0") batch_spec = test_data.batch_spec attn_chunk_size = test_data.attn_chunk_size block_size = test_data.block_size diff --git a/tests/v1/attention/test_mla_backends.py b/tests/v1/attention/test_mla_backends.py index 796912a68..e65d1d604 100644 --- a/tests/v1/attention/test_mla_backends.py +++ b/tests/v1/attention/test_mla_backends.py @@ -42,6 +42,8 @@ BACKENDS_TO_TEST = [ AttentionBackendEnum.TRITON_MLA, ] +DEVICE_TYPE = current_platform.device_type + # Remove sm100 backends from the list if not using sm100 if not torch.cuda.is_available() or torch.cuda.get_device_properties(0).major < 10: BACKENDS_TO_TEST.remove(AttentionBackendEnum.CUTLASS_MLA) @@ -763,7 +765,7 @@ def test_backend_correctness( method="ngram", num_speculative_tokens=query_len - 1 ) - device = torch.device("cuda:0") + device = torch.device(f"{DEVICE_TYPE}:0") # 1. 
Setup batch_size = batch_spec.batch_size diff --git a/tests/v1/attention/test_sparse_mla_backends.py b/tests/v1/attention/test_sparse_mla_backends.py index c49ccd24e..22acc748d 100644 --- a/tests/v1/attention/test_sparse_mla_backends.py +++ b/tests/v1/attention/test_sparse_mla_backends.py @@ -64,6 +64,8 @@ SPARSE_BACKEND_BATCH_SPECS["large_q_pure_prefill"] = BatchSpec( seq_lens=[256] * 2, query_lens=[256] * 2 ) +DEVICE_TYPE = current_platform.device_type + def _float_to_e8m0_truncate(f: float) -> float: """Simulate SM100's float -> e8m0 -> bf16 scale conversion. @@ -222,7 +224,7 @@ def test_sparse_backend_decode_correctness( batch_spec = SPARSE_BACKEND_BATCH_SPECS[batch_name] use_fp8_ds_mla_quantization = kv_cache_dtype == "fp8_ds_mla" - device = torch.device("cuda") + device = torch.device(DEVICE_TYPE) dtype = torch.bfloat16 # Model hyper-parameters (kept intentionally small for the unit test) @@ -586,7 +588,7 @@ def _triton_convert_reference_impl( def test_triton_convert_req_index_to_global_index_decode_only( block_size, num_topk_tokens ): - device = torch.device("cuda") + device = torch.device(DEVICE_TYPE) num_tokens = 8 num_requests = 4 max_blocks_per_req = 10 @@ -639,7 +641,7 @@ def test_triton_convert_req_index_to_global_index_decode_only( reason="FlashMLASparseBackend requires CUDA 9.0 or higher", ) def test_triton_convert_req_index_to_global_index_with_prefill_workspace(block_size): - device = torch.device("cuda") + device = torch.device(DEVICE_TYPE) num_requests = 4 max_blocks_per_req = 8 num_topk_tokens = 128 @@ -794,7 +796,7 @@ def test_split_indexer_prefill_chunks_single_request_overflow(): def test_triton_convert_returns_valid_counts(): """Test that return_valid_counts correctly counts non-negative indices.""" - device = torch.device("cuda") + device = torch.device(DEVICE_TYPE) num_tokens = 8 num_requests = 2 max_blocks_per_req = 10 diff --git a/tests/v1/attention/test_trtllm_attention_integration.py b/tests/v1/attention/test_trtllm_attention_integration.py index 113442bf6..12af0773c 100644 --- a/tests/v1/attention/test_trtllm_attention_integration.py +++ b/tests/v1/attention/test_trtllm_attention_integration.py @@ -55,6 +55,7 @@ class MockAttentionLayer: MODEL = "Qwen/Qwen2.5-0.5B" BLOCK_SIZE = 16 NUM_GPU_BLOCKS = 8192 +DEVICE_TYPE = current_platform.device_type BATCH_SPECS = { "decode_only": BatchSpec( @@ -172,7 +173,7 @@ def _run_trtllm_integration(batch_spec): """Run TRTLLM attention through the full FlashInfer pipeline and compare against an SDPA reference.""" set_random_seed(42) - device = torch.device("cuda:0") + device = torch.device(f"{DEVICE_TYPE}:0") vllm_config = create_vllm_config( model_name=MODEL, diff --git a/tests/v1/cudagraph/test_cudagraph_dispatch.py b/tests/v1/cudagraph/test_cudagraph_dispatch.py index 52e927cee..66e6d7dd4 100644 --- a/tests/v1/cudagraph/test_cudagraph_dispatch.py +++ b/tests/v1/cudagraph/test_cudagraph_dispatch.py @@ -23,6 +23,8 @@ from vllm.forward_context import BatchDescriptor, set_forward_context from vllm.platforms import current_platform from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher +DEVICE_TYPE = current_platform.device_type + # Helper MLP for testing class SimpleMLP(nn.Module): @@ -269,9 +271,9 @@ class TestCudagraphDispatcher: class TestCUDAGraphWrapper: def setup_method(self): self.vllm_config = _create_vllm_config(CompilationConfig()) - self.model = SimpleMLP().to("cuda") - self.persistent_input_buffer = torch.zeros(1, 10, device="cuda") - self.input_tensor = torch.randn(1, 10, device="cuda") + self.model = 
SimpleMLP().to(DEVICE_TYPE) + self.persistent_input_buffer = torch.zeros(1, 10, device=DEVICE_TYPE) + self.input_tensor = torch.randn(1, 10, device=DEVICE_TYPE) def test_capture_and_replay(self): wrapper = CUDAGraphWrapper( @@ -428,10 +430,10 @@ class TestCudagraphIntegration: @create_new_process_for_each_test("spawn") def test_capture_replay_bypass_logic(self): - model = SimpleMLP().to("cuda") + model = SimpleMLP().to(DEVICE_TYPE) full_wrapper = CUDAGraphWrapper(model, self.vllm_config, CUDAGraphMode.FULL) max_bs = 16 - persistent_input_buffer = torch.zeros(max_bs, 10, device="cuda") + persistent_input_buffer = torch.zeros(max_bs, 10, device=DEVICE_TYPE) input_1 = persistent_input_buffer[:1] input_2 = persistent_input_buffer[:2] input_3 = persistent_input_buffer[:3] @@ -486,17 +488,17 @@ class TestCudagraphIntegration: @create_new_process_for_each_test("spawn") def test_nested_wrappers(self): """Tests a scenario with a PIECEWISE wrapper inside a FULL one.""" - model = SimpleMLP().to("cuda") + model = SimpleMLP().to(DEVICE_TYPE) full_wrapper = CUDAGraphWrapper(model, self.vllm_config, CUDAGraphMode.FULL) - input_1 = torch.randn(1, 10, device="cuda") + input_1 = torch.randn(1, 10, device=DEVICE_TYPE) # Setup: Inner model is wrapped with PIECEWISE, outer with FULL - inner_model = SimpleMLP().to("cuda") + inner_model = SimpleMLP().to(DEVICE_TYPE) piecewise_wrapper = CUDAGraphWrapper( inner_model, self.vllm_config, CUDAGraphMode.PIECEWISE ) inner_model.forward = MagicMock(wraps=inner_model.forward) - outer_model = SimpleMLP().to("cuda") + outer_model = SimpleMLP().to(DEVICE_TYPE) # When outer model is called, it calls the piecewise_wrapper outer_model.forward = MagicMock( wraps=outer_model.forward, side_effect=piecewise_wrapper diff --git a/tests/v1/determinism/test_rms_norm_batch_invariant.py b/tests/v1/determinism/test_rms_norm_batch_invariant.py index 5e5b40d09..7d3b8437a 100644 --- a/tests/v1/determinism/test_rms_norm_batch_invariant.py +++ b/tests/v1/determinism/test_rms_norm_batch_invariant.py @@ -13,6 +13,9 @@ from utils import skip_unsupported from vllm.model_executor.layers.batch_invariant import rms_norm as triton_rms_norm from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.platforms import current_platform + +DEVICE_TYPE = current_platform.device_type @skip_unsupported @@ -34,7 +37,7 @@ def test_rms_norm_batch_invariant_vs_standard( equivalent results to the standard CUDA implementation across various configurations. """ - device = torch.device("cuda") + device = torch.device(DEVICE_TYPE) # Create test input and weight torch.manual_seed(42) @@ -81,7 +84,7 @@ def test_rms_norm_3d_input( Ensures that the batch-invariant RMS norm correctly handles multi-dimensional inputs that are common in transformer models. """ - device = torch.device("cuda") + device = torch.device(DEVICE_TYPE) dtype = torch.bfloat16 eps = 1e-6 @@ -120,7 +123,7 @@ def test_rms_norm_numerical_stability(default_vllm_config): Ensures that both implementations handle edge cases like very small or large values without producing NaN or Inf. 
""" - device = torch.device("cuda") + device = torch.device(DEVICE_TYPE) dtype = torch.float16 eps = 1e-6 hidden_size = 2048 @@ -179,7 +182,7 @@ def test_rms_norm_formula(default_vllm_config): Verifies: output = input / sqrt(mean(input^2) + eps) * weight """ - device = torch.device("cuda") + device = torch.device(DEVICE_TYPE) dtype = torch.float32 # Use float32 for higher precision in formula check eps = 1e-6 hidden_size = 1024 @@ -214,7 +217,7 @@ def test_rms_norm_different_hidden_sizes(default_vllm_config, hidden_size: int): The Triton kernel uses a fixed BLOCK_SIZE=1024, so this tests that it correctly handles hidden sizes both smaller and larger than the block size. """ - device = torch.device("cuda") + device = torch.device(DEVICE_TYPE) dtype = torch.bfloat16 eps = 1e-6 batch_size = 16 @@ -251,7 +254,7 @@ def test_rms_norm_determinism(default_vllm_config): Runs the same input through the kernel multiple times and verifies identical outputs. """ - device = torch.device("cuda") + device = torch.device(DEVICE_TYPE) dtype = torch.bfloat16 eps = 1e-6 hidden_size = 4096 @@ -283,7 +286,7 @@ if __name__ == "__main__": # Run a quick smoke test print("Running quick smoke test of RMS norm implementations...") - device = torch.device("cuda") + device = torch.device(DEVICE_TYPE) batch_size = 8 hidden_size = 4096 dtype = torch.bfloat16 diff --git a/tests/v1/e2e/general/test_mamba_prefix_cache.py b/tests/v1/e2e/general/test_mamba_prefix_cache.py index 747c5defe..8b9f7bb6c 100644 --- a/tests/v1/e2e/general/test_mamba_prefix_cache.py +++ b/tests/v1/e2e/general/test_mamba_prefix_cache.py @@ -16,6 +16,7 @@ from vllm import LLM, SamplingParams, TokensPrompt from vllm.config import CacheConfig from vllm.distributed import cleanup_dist_env_and_memory from vllm.model_executor.layers.mamba.mamba_utils import MambaStateCopyFunc +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from vllm.v1.attention.backends.utils import CommonAttentionMetadata from vllm.v1.core.kv_cache_manager import KVCacheBlocks, KVCacheManager @@ -48,6 +49,7 @@ num_accepted_tokens = 1 prompt_token_ids: list[int] = [] MODEL = "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8" BLOCK_SIZE = 560 +DEVICE_TYPE = current_platform.device_type NUM_HIDDEN_LAYERS = 1 cur_step_action_idx = 0 cur_step_action: StepAction | None = None @@ -71,7 +73,7 @@ def get_fake_sample_fn() -> SamplerOutput: return SamplerOutput( sampled_token_ids=torch.tensor( [[prompt_token_ids[first_token_id_index]]], - device="cuda", + device=DEVICE_TYPE, dtype=torch.int32, ), logprobs_tensors=None, @@ -83,7 +85,9 @@ def get_fake_sample_fn() -> SamplerOutput: sampled_token_ids = accepted_tokens return SamplerOutput( sampled_token_ids=torch.tensor( - [sampled_token_ids], device="cuda", dtype=torch.int32 + [sampled_token_ids], + device=DEVICE_TYPE, + dtype=torch.int32, ), logprobs_tensors=None, ) @@ -128,17 +132,23 @@ def get_fake_propose_draft_token_ids_fn(): - 1 + num_accepted_tokens ], - device="cuda", + device=DEVICE_TYPE, dtype=torch.int32, ) valid_sampled_tokens_count = torch.tensor( - [num_accepted_tokens], device="cuda", dtype=torch.int32 + [num_accepted_tokens], + device=DEVICE_TYPE, + dtype=torch.int32, ) self._copy_valid_sampled_token_count(next_token_ids, valid_sampled_tokens_count) - return torch.tensor(proposed_draft_token_ids, device="cuda", dtype=torch.int32) + return torch.tensor( + proposed_draft_token_ids, + device=DEVICE_TYPE, + dtype=torch.int32, + ) return fake_propose_draft_token_ids_fn diff --git 
a/tests/v1/kv_offload/test_cpu_gpu.py b/tests/v1/kv_offload/test_cpu_gpu.py index 1983cca22..2da3a5e56 100644 --- a/tests/v1/kv_offload/test_cpu_gpu.py +++ b/tests/v1/kv_offload/test_cpu_gpu.py @@ -6,6 +6,7 @@ import time import pytest import torch +from vllm.platforms import current_platform from vllm.utils.torch_utils import set_random_seed from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec from vllm.v1.kv_offload.spec import ( @@ -21,7 +22,8 @@ GPU_PAGE_SIZES = [512, 1024] BLOCK_SIZE_FACTORS = [1, 3] NUM_TENSORS = [4] SEEDS = [0] -CUDA_DEVICES = ["cuda:0"] +DEVICE_TYPE = current_platform.device_type +DEVICES = [f"{DEVICE_TYPE}:0"] NUM_MAPPINGS = [3] @@ -33,7 +35,7 @@ NUM_MAPPINGS = [3] @pytest.mark.parametrize("num_cpu_blocks", NUM_CPU_BLOCKS) @pytest.mark.parametrize("num_tensors", NUM_TENSORS) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_transfer( default_vllm_config, diff --git a/tests/v1/logits_processors/test_correctness.py b/tests/v1/logits_processors/test_correctness.py index bc2cc1720..bf2979371 100644 --- a/tests/v1/logits_processors/test_correctness.py +++ b/tests/v1/logits_processors/test_correctness.py @@ -39,8 +39,9 @@ PIN_MEMORY_AVAILABLE = is_pin_memory_available() MAX_NUM_REQS = 256 VOCAB_SIZE = 1024 NUM_OUTPUT_TOKENS = 20 -CUDA_DEVICES = [ - f"{current_platform.device_type}:{i}" +DEVICE_TYPE = current_platform.device_type +DEVICES = [ + f"{DEVICE_TYPE}:{i}" for i in range(1 if current_platform.device_count() == 1 else 2) ] MAX_NUM_PROMPT_TOKENS = 64 @@ -801,7 +802,7 @@ def _assert_valid( @create_new_process_for_each_test() -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("reqs_per_logitproc", [REQS_PER_LOGITPROC]) @pytest.mark.parametrize("logitsprocs_under_test", _get_test_cases()) def test_logitsprocs( diff --git a/tests/v1/sample/test_rejection_sampler.py b/tests/v1/sample/test_rejection_sampler.py index 552a27fe2..ecfcade2b 100644 --- a/tests/v1/sample/test_rejection_sampler.py +++ b/tests/v1/sample/test_rejection_sampler.py @@ -19,7 +19,7 @@ from vllm.v1.sample.rejection_sampler import ( from vllm.v1.sample.sampler import Sampler, SamplerOutput from vllm.v1.spec_decode.metadata import SpecDecodeMetadata -DEVICE = current_platform.device_type +DEVICE_TYPE = current_platform.device_type @pytest.fixture @@ -57,7 +57,7 @@ def create_logits_tensor( will produce desired token ids on argmax""" token_ids = [tokens[:-1] for tokens in output_token_ids] num_total_tokens = sum(len(tokens) for tokens in token_ids) - logits = torch.full((num_total_tokens, vocab_size), -100.0, device=DEVICE) + logits = torch.full((num_total_tokens, vocab_size), -100.0, device=DEVICE_TYPE) start_loc = 0 for tokens in token_ids: for j, token_id in enumerate(tokens): @@ -99,9 +99,9 @@ def create_sampling_metadata( assert output_token_ids assert len(output_token_ids) > 0 - frequency_penalties = torch.tensor(frequency_penalties, device=DEVICE) - presence_penalties = torch.tensor(presence_penalties, device=DEVICE) - repetition_penalties = torch.tensor(repetition_penalties, device=DEVICE) + frequency_penalties = torch.tensor(frequency_penalties, device=DEVICE_TYPE) + presence_penalties = torch.tensor(presence_penalties, device=DEVICE_TYPE) + repetition_penalties = torch.tensor(repetition_penalties, device=DEVICE_TYPE) else: no_penalties = True frequency_penalties = torch.tensor([]) @@ 
-320,14 +320,27 @@ def test_deterministic_when_seeded( n_rep: int, ): num_tokens = batch_size * k - draft_probs = torch.rand(num_tokens, vocab_size, dtype=torch.float32, device=DEVICE) + draft_probs = torch.rand( + num_tokens, + vocab_size, + dtype=torch.float32, + device=DEVICE_TYPE, + ) draft_probs = F.softmax(draft_probs, dim=-1) target_logits = torch.rand_like(draft_probs) bonus_token_ids = torch.randint( - low=0, high=vocab_size, size=(batch_size, 1), dtype=torch.int64, device=DEVICE + low=0, + high=vocab_size, + size=(batch_size, 1), + dtype=torch.int64, + device=DEVICE_TYPE, ) draft_token_ids = torch.randint( - low=0, high=vocab_size, size=(batch_size, k), dtype=torch.int64, device=DEVICE + low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64, + device=DEVICE_TYPE, ) seeded_mask = torch.rand(batch_size, dtype=torch.float32) <= frac_seeded @@ -335,12 +348,12 @@ def test_deterministic_when_seeded( results = [] for _ in range(n_rep): seeded_seqs = { - i: torch.Generator(device=DEVICE).manual_seed(i) + i: torch.Generator(device=DEVICE_TYPE).manual_seed(i) for i in range(batch_size) if seeded_mask[i] } - temperature = torch.ones(batch_size, dtype=torch.float32, device=DEVICE) + temperature = torch.ones(batch_size, dtype=torch.float32, device=DEVICE_TYPE) sampling_metadata = create_sampling_metadata( all_greedy=False, temperature=temperature, generators=seeded_seqs ) @@ -387,7 +400,7 @@ def test_rejection_sampling_approximates_target_distribution(): much more than the distance improvement between the observed distribution and the random distribution. """ - torch.set_default_device(DEVICE) + torch.set_default_device(DEVICE_TYPE) vocab_size = 10 k = 2 num_reference_probs = 100 @@ -410,7 +423,7 @@ def test_rejection_sampling_approximates_target_distribution(): rej_sample_probs = estimate_rejection_sampling_pdf( draft_probs, target_logits, k, vocab_size, num_samples ) - rej_sample_probs = rej_sample_probs.to(DEVICE) + rej_sample_probs = rej_sample_probs.to(DEVICE_TYPE) # Average distance from reference probs. reference_vs_rejsample_dist = ( @@ -491,11 +504,11 @@ def estimate_rejection_sampling_pdf( draft_probs = draft_probs.view(num_tokens, vocab_size) # Bonus tokens not used but required. - bonus_token_ids = torch.zeros((1, 1), dtype=torch.int64, device=DEVICE).repeat( + bonus_token_ids = torch.zeros((1, 1), dtype=torch.int64, device=DEVICE_TYPE).repeat( num_samples, 1 ) - temperature = torch.ones(num_samples, dtype=torch.float32, device=DEVICE) + temperature = torch.ones(num_samples, dtype=torch.float32, device=DEVICE_TYPE) sampling_metadata = create_sampling_metadata( all_greedy=False, temperature=temperature ) @@ -600,7 +613,7 @@ def _test_masked_logits( # Create random draft probabilities. draft_probs = torch.rand( - (num_tokens, vocab_size), dtype=torch.float32, device=DEVICE + (num_tokens, vocab_size), dtype=torch.float32, device=DEVICE_TYPE ) draft_probs = F.softmax(draft_probs, dim=-1) @@ -610,7 +623,11 @@ def _test_masked_logits( draft_token_ids = draft_token_ids.tolist() # Bonus tokens not used but required - bonus_token_ids = torch.zeros((batch_size, 1), dtype=torch.int64, device=DEVICE) + bonus_token_ids = torch.zeros( + (batch_size, 1), + dtype=torch.int64, + device=DEVICE_TYPE, + ) # Create spec decode metadata spec_decode_metadata = create_spec_decode_metadata(draft_token_ids, target_logits) @@ -645,12 +662,13 @@ def test_top_k(rejection_sampler, top_k): # Randomly create top-k indices. 
top_k_indices = [ - torch.randperm(vocab_size, device=DEVICE)[:top_k] for _ in range(num_tokens) + torch.randperm(vocab_size, device=DEVICE_TYPE)[:top_k] + for _ in range(num_tokens) ] top_k_indices = torch.stack(top_k_indices) # Create logits with the uniform distribution. - target_logits = torch.zeros((num_tokens, vocab_size), device=DEVICE) + target_logits = torch.zeros((num_tokens, vocab_size), device=DEVICE_TYPE) # Increment the logits for top-k indices, a little bit more than the other # ones. If the masking is effective, the non-topk indices will never be @@ -659,11 +677,11 @@ def test_top_k(rejection_sampler, top_k): target_logits[i, top_k_indices[i]] += 0.1 # Create sampling metadata - temperature = torch.ones(batch_size, dtype=torch.float32, device=DEVICE) + temperature = torch.ones(batch_size, dtype=torch.float32, device=DEVICE_TYPE) sampling_metadata = create_sampling_metadata( all_greedy=False, temperature=temperature, - top_k=torch.tensor([top_k] * batch_size, device=DEVICE, dtype=torch.int64), + top_k=torch.tensor([top_k] * batch_size, device=DEVICE_TYPE, dtype=torch.int64), ) _test_masked_logits( @@ -686,8 +704,8 @@ def test_top_p(rejection_sampler, top_p): num_tokens = batch_size * num_draft_tokens # Create logits with the uniform distribution. - target_logits = torch.randn((num_tokens, vocab_size), device=DEVICE) - temperature = torch.ones(batch_size, dtype=torch.float32, device=DEVICE) + target_logits = torch.randn((num_tokens, vocab_size), device=DEVICE_TYPE) + temperature = torch.ones(batch_size, dtype=torch.float32, device=DEVICE_TYPE) rescaled_logits = target_logits / temperature logits_sort, logits_idx = rescaled_logits.sort(dim=-1, descending=False) @@ -706,7 +724,11 @@ def test_top_p(rejection_sampler, top_p): sampling_metadata = create_sampling_metadata( all_greedy=False, temperature=temperature, - top_p=torch.tensor([top_p] * batch_size, device=DEVICE, dtype=torch.float32), + top_p=torch.tensor( + [top_p] * batch_size, + device=DEVICE_TYPE, + dtype=torch.float32, + ), ) _test_masked_logits( @@ -732,7 +754,10 @@ def test_frequency_penalties(rejection_sampler): all_greedy=True, output_token_ids=[[2], [3], [4]], spec_token_ids=spec_tokens, - prompt_token_ids=torch.tensor([[5, 6, 7], [6, 7, 8], [7, 8, 9]], device=DEVICE), + prompt_token_ids=torch.tensor( + [[5, 6, 7], [6, 7, 8], [7, 8, 9]], + device=DEVICE_TYPE, + ), frequency_penalties=[1.5, 1.5, 0.7], presence_penalties=[0.0] * num_requests, repetition_penalties=[1.0] * num_requests, @@ -858,21 +883,26 @@ def test_sample_recovered_tokens( num_tokens = batch_size * max_spec_len # Create random draft probabilities. - draft_probs = torch.rand(num_tokens, vocab_size, dtype=torch.float32, device=DEVICE) + draft_probs = torch.rand( + num_tokens, + vocab_size, + dtype=torch.float32, + device=DEVICE_TYPE, + ) draft_probs = F.softmax(draft_probs, dim=-1) # Create random target probabilities. 
target_logits = torch.rand( - num_tokens, vocab_size, dtype=torch.float32, device=DEVICE + num_tokens, vocab_size, dtype=torch.float32, device=DEVICE_TYPE ) target_probs = F.softmax(target_logits, dim=-1) # Randomly sample draft token ids from draft probs draft_token_ids = torch.multinomial(draft_probs, num_samples=1).to(torch.int32) - temperature = torch.ones(batch_size, dtype=torch.float32, device=DEVICE) + temperature = torch.ones(batch_size, dtype=torch.float32, device=DEVICE_TYPE) generators = { - i: torch.Generator(device=DEVICE).manual_seed(i) for i in range(batch_size) + i: torch.Generator(device=DEVICE_TYPE).manual_seed(i) for i in range(batch_size) } sampling_metadata = create_sampling_metadata( all_greedy=False, temperature=temperature, generators=generators @@ -890,7 +920,7 @@ def test_sample_recovered_tokens( None if no_draft_probs else draft_probs, target_probs, sampling_metadata, - device=DEVICE, + device=DEVICE_TYPE, ) recovered_token_ids = sample_recovered_tokens( max_spec_len, @@ -900,6 +930,6 @@ def test_sample_recovered_tokens( None if no_draft_probs else draft_probs, target_probs, sampling_metadata, - device=DEVICE, + device=DEVICE_TYPE, ) assert torch.equal(recovered_token_ids, ref_recovered_token_ids) diff --git a/tests/v1/sample/test_sampler.py b/tests/v1/sample/test_sampler.py index 51f2bf5e7..c67199fa4 100644 --- a/tests/v1/sample/test_sampler.py +++ b/tests/v1/sample/test_sampler.py @@ -17,8 +17,9 @@ PIN_MEMORY_AVAILABLE = is_pin_memory_available() MAX_NUM_REQS = 256 VOCAB_SIZE = 1024 NUM_OUTPUT_TOKENS = 20 -CUDA_DEVICES = [ - f"{current_platform.device_type}:{i}" +DEVICE_TYPE = current_platform.device_type +DEVICES = [ + f"{DEVICE_TYPE}:{i}" for i in range(1 if current_platform.device_count() == 1 else 2) ] MAX_NUM_PROMPT_TOKENS = 64 @@ -199,7 +200,7 @@ def _create_weighted_output_token_list( return output_token_ids, sorted_token_ids_in_output -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("batch_size", [1, 2, 32]) @pytest.mark.parametrize("presence_penalty", [-2.0, 2.0]) def test_sampler_presence_penalty( @@ -249,7 +250,7 @@ def test_sampler_presence_penalty( assert penalized_token_id not in output_token_ids[batch_idx] -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("batch_size", [1, 2, 32]) @pytest.mark.parametrize("frequency_penalty", [-2.0, 2.0]) def test_sampler_frequency_penalty( @@ -305,7 +306,7 @@ def test_sampler_frequency_penalty( assert penalized_token_id not in distinct_sorted_token_ids_in_output -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("batch_size", [1, 2, 32]) @pytest.mark.parametrize("repetition_penalty", [0.1, 1.9]) def test_sampler_repetition_penalty( @@ -363,7 +364,7 @@ def test_sampler_repetition_penalty( ) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("batch_size", [1, 2, 32]) @pytest.mark.parametrize("num_allowed_token_ids", [0, 1, 2]) def test_sampler_allowed_token_ids( @@ -409,7 +410,7 @@ def test_sampler_allowed_token_ids( assert logits_for_req[token_id] != -float("inf") -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("batch_size", [1, 2, 32]) @pytest.mark.parametrize("bad_words_lengths", [(1,), (1, 3), (2, 2)]) def test_sampler_bad_words( diff --git 
a/tests/v1/sample/test_topk_topp_sampler.py b/tests/v1/sample/test_topk_topp_sampler.py index ce1e288a2..511f26680 100644 --- a/tests/v1/sample/test_topk_topp_sampler.py +++ b/tests/v1/sample/test_topk_topp_sampler.py @@ -7,8 +7,7 @@ from torch import Generator from vllm.platforms import current_platform from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p_pytorch -CUDA_DEVICE = "cuda" if current_platform.is_cuda() else None -DEVICE = current_platform.device_type +DEVICE_TYPE = current_platform.device_type BATCH_SIZE = 1024 VOCAB_SIZE = 128 * 1024 @@ -26,8 +25,8 @@ def reset_default_device(): def test_topk_impl_equivalence(): - torch.set_default_device(DEVICE) - generator = Generator(device=DEVICE).manual_seed(33) + torch.set_default_device(DEVICE_TYPE) + generator = Generator(device=DEVICE_TYPE).manual_seed(33) logits = torch.rand((BATCH_SIZE, VOCAB_SIZE), generator=generator) @@ -76,8 +75,8 @@ def test_flashinfer_sampler(): if not FLASHINFER_ENABLED: pytest.skip("FlashInfer not installed or not available on this platform.") - torch.set_default_device(DEVICE) - generator = Generator(device=DEVICE).manual_seed(42) + torch.set_default_device(DEVICE_TYPE) + generator = Generator(device=DEVICE_TYPE).manual_seed(42) # Generate random logits logits = torch.rand((BATCH_SIZE, VOCAB_SIZE), generator=generator) @@ -128,15 +127,15 @@ def test_flashinfer_sampler(): # ============================================================================= -@pytest.mark.skipif(CUDA_DEVICE is None, reason="CUDA not available") +@pytest.mark.skipif(DEVICE_TYPE == "cpu", reason="CUDA/XPU not available") class TestTritonTopkTopp: """Tests for the Triton top-k/top-p kernel.""" @pytest.fixture(autouse=True) def setup(self): """Set up test fixtures.""" - torch.set_default_device(CUDA_DEVICE) - self.generator = Generator(device=CUDA_DEVICE).manual_seed(42) + torch.set_default_device(DEVICE_TYPE) + self.generator = Generator(device=DEVICE_TYPE).manual_seed(42) def _compare_results( self, diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index c10822024..5d587fa3e 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -42,6 +42,7 @@ dflash_target_dir = "Qwen/Qwen3-8B" dflash_dir = "z-lab/Qwen3-8B-DFlash-b16" BLOCK_SIZE = 16 +DEVICE_TYPE = current_platform.device_type def _create_proposer( @@ -92,7 +93,7 @@ def _create_proposer( # Overwrite pard_token to avoid crash during init speculative_config.draft_model_config.hf_config.pard_token = 0 - device = current_platform.device_type + device = DEVICE_TYPE vllm_config = VllmConfig( model_config=model_config, cache_config=CacheConfig(block_size=16), @@ -124,7 +125,7 @@ def test_prepare_next_token_ids(): either the GPU tensor of sampled_token_ids with -1 for rejected tokens, or the CPU python list[list[int]] with the rejected tokens removed. """ - device = torch.device(current_platform.device_type) + device = torch.device(DEVICE_TYPE) num_requests = 4 num_speculative_tokens = 4 @@ -207,7 +208,7 @@ def test_prepare_inputs(): a, a + 1, ..., a + b - n2 - 1, a + b, a + b + 1, ..., a + b + c - n3 - 1] """ - device = torch.device(current_platform.device_type) + device = torch.device(DEVICE_TYPE) # q1 = 4, q2 = 7, q3 = 5 # n1 = 1, n2 = 3, n3 = 2 @@ -300,7 +301,7 @@ def test_prepare_inputs_padded(): from the original indices to sample from.
""" - device = torch.device(current_platform.device_type) + device = torch.device(DEVICE_TYPE) expected_token_indices_to_sample = torch.tensor( [1, 5, 6], dtype=torch.int32, device=device @@ -370,7 +371,7 @@ def test_set_inputs_first_pass_default_eagle(): - After inserting next_tokens [100, 200, 300]: [a2, a3, 100, b2, 200, c2, c3, c4, 300] """ - device = torch.device(current_platform.device_type) + device = torch.device(DEVICE_TYPE) num_speculative_tokens = 3 proposer = _create_proposer("eagle", num_speculative_tokens) @@ -471,7 +472,7 @@ def test_set_inputs_first_pass_draft_model(): - idx 5: token 21, pos 1 - idx 6: token 200, pos 2 (bonus token) """ - device = torch.device(current_platform.device_type) + device = torch.device(DEVICE_TYPE) num_speculative_tokens = 2 block_size = BLOCK_SIZE @@ -609,7 +610,7 @@ def test_set_inputs_first_pass_parallel_drafting(): - idx 9: bonus token 200 - idx 10-11: parallel_drafting_tokens, is_masked=True """ - device = torch.device(current_platform.device_type) + device = torch.device(DEVICE_TYPE) num_speculative_tokens = 3 block_size = BLOCK_SIZE @@ -859,7 +860,7 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch): monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1") # Use GPU device - device = torch.device(current_platform.device_type) + device = torch.device(DEVICE_TYPE) # Setup test parameters batch_size = 2 @@ -1030,7 +1031,7 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch): ) def test_propose_tree(spec_token_tree): # Get GPU device. - device = torch.device(current_platform.device_type) + device = torch.device(DEVICE_TYPE) # Setup test parameters. batch_size = 2 diff --git a/tests/v1/spec_decode/test_eagle_step_kernel.py b/tests/v1/spec_decode/test_eagle_step_kernel.py index 319ab4a33..275a157d1 100644 --- a/tests/v1/spec_decode/test_eagle_step_kernel.py +++ b/tests/v1/spec_decode/test_eagle_step_kernel.py @@ -5,11 +5,14 @@ import pytest import torch +from vllm.platforms import current_platform from vllm.v1.spec_decode.utils import ( PADDING_SLOT_ID, eagle_step_update_slot_mapping_and_metadata, ) +DEVICE_TYPE = current_platform.device_type + # Skip if no CUDA - Triton kernel requires GPU pytest.importorskip("triton") if not torch.cuda.is_available(): @@ -47,7 +50,7 @@ def _reference_eagle_step_slot_mapping( def test_eagle_step_slot_mapping_kernel(): """Test fused kernel matches Python reference for slot mapping and metadata.""" - device = torch.device("cuda") + device = torch.device(DEVICE_TYPE) batch_size = 32 block_size = 16 max_model_len = 4096 @@ -93,7 +96,7 @@ def test_eagle_step_slot_mapping_kernel(): def test_eagle_step_slot_mapping_kernel_exceeds_max(): """Test fused kernel when position exceeds max_model_len.""" - device = torch.device("cuda") + device = torch.device(DEVICE_TYPE) batch_size = 4 block_size = 16 max_model_len = 100 @@ -130,7 +133,7 @@ def test_eagle_step_slot_mapping_kernel_exceeds_max(): def test_eagle_step_slot_mapping_kernel_cudagraph_padding(): """Test that padding threads write PADDING_SLOT_ID when input_batch_size > batch_size (cudagraph padding).""" - device = torch.device("cuda") + device = torch.device(DEVICE_TYPE) batch_size = 4 input_batch_size = 8 block_size = 16 diff --git a/tests/v1/spec_decode/test_extract_hidden_states.py b/tests/v1/spec_decode/test_extract_hidden_states.py index 9f9758b82..95004eb65 100644 --- a/tests/v1/spec_decode/test_extract_hidden_states.py +++ b/tests/v1/spec_decode/test_extract_hidden_states.py @@ -27,6 +27,7 @@ from 
vllm.v1.spec_decode.extract_hidden_states import ExtractHiddenStatesPropose from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch model_dir = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" +DEVICE_TYPE = current_platform.device_type def _create_proposer( @@ -51,7 +52,7 @@ def _create_proposer( }, ) - device = current_platform.device_type + device = DEVICE_TYPE vllm_config = VllmConfig( model_config=model_config, cache_config=CacheConfig(), @@ -101,7 +102,7 @@ def test_proposer_initialization_missing_layer_ids(): }, ) - device = current_platform.device_type + device = DEVICE_TYPE vllm_config = VllmConfig( model_config=model_config, cache_config=CacheConfig(), @@ -130,7 +131,7 @@ def test_prepare_next_token_ids_padded(): For each request we either use the sampled token (if valid and not discarded) or a backup token from the request state. """ - device = torch.device(current_platform.device_type) + device = torch.device(DEVICE_TYPE) num_requests = 4 req_ids = [f"req_{i + 1}" for i in range(num_requests)] @@ -197,7 +198,7 @@ def test_propose(): 2. Return the sampled tokens as "draft" tokens (shape [batch_size, 1]) 3. Cache the hidden states in the model's KV cache """ - device = torch.device(current_platform.device_type) + device = torch.device(DEVICE_TYPE) # Setup test parameters batch_size = 2 @@ -273,7 +274,7 @@ def test_propose(): @pytest.mark.parametrize("num_hidden_layers", [1, 4, 8]) def test_propose_different_layer_counts(num_hidden_layers): """Test that propose works correctly with different numbers of hidden layers.""" - device = torch.device(current_platform.device_type) + device = torch.device(DEVICE_TYPE) batch_size = 2 num_tokens = 5 diff --git a/tests/v1/spec_decode/test_mtp.py b/tests/v1/spec_decode/test_mtp.py index 0a48b0e7b..094611e05 100644 --- a/tests/v1/spec_decode/test_mtp.py +++ b/tests/v1/spec_decode/test_mtp.py @@ -28,6 +28,7 @@ from vllm.v1.attention.backends.registry import AttentionBackendEnum from vllm.v1.spec_decode.eagle import EagleProposer mimo_7b_dir = "XiaomiMiMo/MiMo-7B-Base" +DEVICE_TYPE = current_platform.device_type def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer: @@ -48,7 +49,7 @@ def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer: model_config=model_config, cache_config=CacheConfig(), speculative_config=speculative_config, - device_config=DeviceConfig(device=current_platform.device_type), + device_config=DeviceConfig(device=DEVICE_TYPE), parallel_config=ParallelConfig(), load_config=LoadConfig(), scheduler_config=SchedulerConfig( @@ -57,7 +58,7 @@ def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer: ), ) - return EagleProposer(vllm_config=vllm_config, device=current_platform.device_type) + return EagleProposer(vllm_config=vllm_config, device=DEVICE_TYPE) @mock.patch("vllm.v1.spec_decode.eagle.get_pp_group") @@ -118,7 +119,7 @@ def test_mtp_load_model_unified(mock_get_model, mock_get_layers, mock_get_pp_gro def test_mtp_propose(num_speculative_tokens, monkeypatch): """Test that MTP's forward method returns hidden states directly""" - device = torch.device(current_platform.device_type) + device = torch.device(DEVICE_TYPE) batch_size = 2 seq_lens = [5, 3] total_tokens = sum(seq_lens) diff --git a/tests/v1/spec_decode/test_tree_attention.py b/tests/v1/spec_decode/test_tree_attention.py index 52bc722cf..cb487acec 100644 --- a/tests/v1/spec_decode/test_tree_attention.py +++ b/tests/v1/spec_decode/test_tree_attention.py @@ -18,6 +18,8 @@ from vllm.v1.attention.backend import 
CommonAttentionMetadata from vllm.v1.attention.backends.fa_utils import is_flash_attn_varlen_func_available from vllm.v1.attention.backends.registry import AttentionBackendEnum +DEVICE_TYPE = current_platform.device_type + if not is_flash_attn_varlen_func_available(): pytest.skip( "This test requires flash_attn_varlen_func, but it's not available.", @@ -170,9 +172,9 @@ def _get_available_reference_backends() -> list[AttentionBackendEnum]: class MockAttentionLayer(torch.nn.Module): - _q_scale = torch.tensor(1.0, dtype=torch.float32, device="cuda") - _k_scale = torch.tensor(1.0, dtype=torch.float32, device="cuda") - _v_scale = torch.tensor(1.0, dtype=torch.float32, device="cuda") + _q_scale = torch.tensor(1.0, dtype=torch.float32, device=DEVICE_TYPE) + _k_scale = torch.tensor(1.0, dtype=torch.float32, device=DEVICE_TYPE) + _v_scale = torch.tensor(1.0, dtype=torch.float32, device=DEVICE_TYPE) layer_name = "mock_layer" def __init__(self): diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index d4eee19ad..3a478d210 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ b/tests/v1/worker/test_gpu_input_batch.py @@ -22,10 +22,8 @@ from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch VOCAB_SIZE = 1024 NUM_OUTPUT_TOKENS = 20 MAX_PROMPT_SIZE = 100 -CUDA_DEVICES = [ - f"{current_platform.device_type}:{i}" - for i in range(min(current_platform.device_count(), 2)) -] +DEVICE_TYPE = current_platform.device_type +DEVICES = [f"{DEVICE_TYPE}:{i}" for i in range(min(current_platform.device_count(), 2))] MAX_NUM_PROMPT_TOKENS = 64 @@ -219,7 +217,7 @@ def _construct_cached_request_state(req_id_suffix: int): ) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("batch_size", [1, 2, 32, 64]) def test_sampling_metadata_in_input_batch(device: str, batch_size: int): """ @@ -313,7 +311,7 @@ def test_sampling_metadata_in_input_batch(device: str, batch_size: int): ) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("batch_size", [32]) @pytest.mark.parametrize("swap_list", [((0, 1),)]) def test_swap_states_in_input_batch(device: str, batch_size: int, swap_list: list): @@ -400,7 +398,7 @@ def _construct_pooling_request(req_id_suffix: int, pooling_params=None): ) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_pooling_prompt_lens_not_aliased(device: str): """Verify that prompt_lens in PoolingMetadata does not share memory with the internal num_prompt_tokens pinned buffer. 
Guards against possible diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index d7695027a..0de443858 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -45,7 +45,7 @@ from vllm.v1.worker.utils import AttentionGroup, select_common_block_size BLOCK_SIZE = 16 NUM_BLOCKS = 10 -DEVICE = current_platform.device_type +DEVICE_TYPE = current_platform.device_type def initialize_kv_cache(runner: GPUModelRunner): @@ -121,7 +121,7 @@ def model_runner(): vllm_config.compilation_config.static_forward_context["layer.0"] = Attention( num_heads, head_size, 0.1 ) - runner = GPUModelRunner(vllm_config, DEVICE) + runner = GPUModelRunner(vllm_config, DEVICE_TYPE) initialize_kv_cache(runner) yield runner @@ -340,7 +340,7 @@ def test_get_nans_in_logits(model_runner, dist_init): [1.0, 2.0, 3.0], [3.0, 2.0, 1.0], ], - device=DEVICE, + device=DEVICE_TYPE, ) result = model_runner._get_nans_in_logits(logits) assert result == {"req_0": 0, "req_1": 0} @@ -350,7 +350,7 @@ def test_get_nans_in_logits(model_runner, dist_init): [1.0, float("nan"), 3.0], [4.0, float("nan"), float("nan")], ], - device=DEVICE, + device=DEVICE_TYPE, ) result = model_runner._get_nans_in_logits(logits) assert result == {"req_0": 1, "req_1": 2} @@ -360,7 +360,7 @@ def test_get_nans_in_logits(model_runner, dist_init): [1.0, 2.0, 3.0], [4.0, float("nan"), float("nan")], ], - device=DEVICE, + device=DEVICE_TYPE, ) result = model_runner._get_nans_in_logits(logits) assert result == {"req_0": 0, "req_1": 2} @@ -372,7 +372,7 @@ def test_get_nans_in_logits(model_runner, dist_init): [ [1.0, float("nan"), 3.0], ], - device=DEVICE, + device=DEVICE_TYPE, ) result = model_runner._get_nans_in_logits(logits) assert result == {"req_0": 1, "req_1": 0} @@ -383,7 +383,7 @@ def test_get_nans_in_logits(model_runner, dist_init): [1.0, 2.0, 3.0], [float("nan"), 2.0, 3.0], ], - device=DEVICE, + device=DEVICE_TYPE, ) result = model_runner._get_nans_in_logits(logits) assert result == {"req_0": 2, "req_1": 0} @@ -643,7 +643,7 @@ def test_init_kv_cache_without_kv_sharing(default_vllm_config): # Set high context length to test max context length estimation vllm_config.model_config.max_model_len = 3_000_000 vllm_ctx = vllm_config.compilation_config.static_forward_context - runner = GPUModelRunner(vllm_config, DEVICE) + runner = GPUModelRunner(vllm_config, DEVICE_TYPE) kv_cache_spec = runner.get_kv_cache_spec() assert len(kv_cache_spec) == 2 assert len(runner.shared_kv_cache_layers) == 0 @@ -711,7 +711,7 @@ def test_init_kv_cache_with_kv_sharing_valid(default_vllm_config): # Set high context length to test max context length estimation vllm_config.model_config.max_model_len = 3_000_000 vllm_ctx = vllm_config.compilation_config.static_forward_context - runner = GPUModelRunner(vllm_config, DEVICE) + runner = GPUModelRunner(vllm_config, DEVICE_TYPE) kv_cache_spec = runner.get_kv_cache_spec() assert len(kv_cache_spec) == 1 assert layer_0 in kv_cache_spec @@ -850,7 +850,7 @@ def test_hybrid_attention_mamba_tensor_shapes(): assert fwd_context is not None vllm_ctx = vllm_config.compilation_config.static_forward_context - runner = GPUModelRunner(vllm_config, DEVICE) + runner = GPUModelRunner(vllm_config, DEVICE_TYPE) current_platform.update_block_size_for_backend(vllm_config) kv_cache_spec = runner.get_kv_cache_spec() @@ -896,13 +896,13 @@ def test_hybrid_attention_mamba_tensor_shapes(): ssm_constant_shape = ssm_shape[1:] attn_blocks_constant = torch.full( - (test_block_size, 
*attn_constant_shape), device=DEVICE, fill_value=3.33 + (test_block_size, *attn_constant_shape), device=DEVICE_TYPE, fill_value=3.33 ) conv_blocks_constant = torch.full( - (test_block_size, *conv_constant_shape), device=DEVICE, fill_value=6.66 + (test_block_size, *conv_constant_shape), device=DEVICE_TYPE, fill_value=6.66 ) ssm_blocks_constant = torch.full( - (test_block_size, *ssm_constant_shape), device=DEVICE, fill_value=9.99 + (test_block_size, *ssm_constant_shape), device=DEVICE_TYPE, fill_value=9.99 ) # Fill attention blocks with constants using kv block indices @@ -997,7 +997,7 @@ def test_hybrid_block_table_initialization(): max_num_blocks_per_req=max_num_blocks_per_req, max_num_batched_tokens=max_num_batched_tokens, pin_memory=False, - device=torch.device(DEVICE), + device=torch.device(DEVICE_TYPE), kernel_block_size=kernel_block_sizes[0], cp_kv_cache_interleave_size=cp_kv_cache_interleave_size, ) @@ -1036,7 +1036,7 @@ def test_input_batch_with_kernel_block_sizes(): max_num_reqs = 10 max_model_len = 512 max_num_batched_tokens = 512 - device = torch.device(DEVICE) + device = torch.device(DEVICE_TYPE) pin_memory = False vocab_size = 50272 @@ -1083,7 +1083,7 @@ def test_hybrid_cache_integration(default_vllm_config, dist_init): num_heads, head_size, 0.1 ) - runner = GPUModelRunner(vllm_config, DEVICE) + runner = GPUModelRunner(vllm_config, DEVICE_TYPE) # Initialize KV cache with configuration attn_spec = FullAttentionSpec( @@ -1306,7 +1306,7 @@ def test_mamba_cache_raises_when_max_num_seqs_exceeds_blocks(): ) assert fwd_context is not None - runner = GPUModelRunner(vllm_config, DEVICE) + runner = GPUModelRunner(vllm_config, DEVICE_TYPE) current_platform.update_block_size_for_backend(vllm_config) kv_cache_spec = runner.get_kv_cache_spec()