refactor hard-coded device strings in test files under tests/v1 and tests/lora (#37566)
Signed-off-by: Liao, Wei <wei.liao@intel.com>
This commit is contained in:
@@ -42,6 +42,7 @@ dflash_target_dir = "Qwen/Qwen3-8B"
|
||||
dflash_dir = "z-lab/Qwen3-8B-DFlash-b16"
|
||||
|
||||
BLOCK_SIZE = 16
|
||||
DEVICE_TYPE = current_platform.device_type
|
||||
|
||||
|
||||
def _create_proposer(
|
||||
@@ -92,7 +93,7 @@ def _create_proposer(
|
||||
# Overwrite pard_token to avoid crash during init
|
||||
speculative_config.draft_model_config.hf_config.pard_token = 0
|
||||
|
||||
device = current_platform.device_type
|
||||
device = DEVICE_TYPE
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
cache_config=CacheConfig(block_size=16),
|
||||
@@ -124,7 +125,7 @@ def test_prepare_next_token_ids():
|
||||
either the GPU tensor of sampled_token_ids with -1 for rejected tokens,
|
||||
or the CPU python list[list[int]] with the rejected tokens removed.
|
||||
"""
|
||||
device = torch.device(current_platform.device_type)
|
||||
device = torch.device(DEVICE_TYPE)
|
||||
|
||||
num_requests = 4
|
||||
num_speculative_tokens = 4
|
||||
@@ -207,7 +208,7 @@ def test_prepare_inputs():
|
||||
a, a + 1, ..., a + b - n2 - 1,
|
||||
a + b, a + b + 1, ..., a + b + c - n3 - 1]
|
||||
"""
|
||||
device = torch.device(current_platform.device_type)
|
||||
device = torch.device(DEVICE_TYPE)
|
||||
|
||||
# q1 = 4, q2 = 7, q3 = 5
|
||||
# n1 = 1, n2 = 3, n3 = 2
|
||||
@@ -300,7 +301,7 @@ def test_prepare_inputs_padded():
|
||||
from the original indices to sample from.
|
||||
"""
|
||||
|
||||
device = torch.device(current_platform.device_type)
|
||||
device = torch.device(DEVICE_TYPE)
|
||||
|
||||
expected_token_indices_to_sample = torch.tensor(
|
||||
[1, 5, 6], dtype=torch.int32, device=device
|
||||
@@ -370,7 +371,7 @@ def test_set_inputs_first_pass_default_eagle():
|
||||
- After inserting next_tokens [100, 200, 300]:
|
||||
[a2, a3, 100, b2, 200, c2, c3, c4, 300]
|
||||
"""
|
||||
device = torch.device(current_platform.device_type)
|
||||
device = torch.device(DEVICE_TYPE)
|
||||
|
||||
num_speculative_tokens = 3
|
||||
proposer = _create_proposer("eagle", num_speculative_tokens)
|
||||
@@ -471,7 +472,7 @@ def test_set_inputs_first_pass_draft_model():
|
||||
- idx 5: token 21, pos 1
|
||||
- idx 6: token 200, pos 2 (bonus token)
|
||||
"""
|
||||
device = torch.device(current_platform.device_type)
|
||||
device = torch.device(DEVICE_TYPE)
|
||||
|
||||
num_speculative_tokens = 2
|
||||
block_size = BLOCK_SIZE
|
||||
@@ -609,7 +610,7 @@ def test_set_inputs_first_pass_parallel_drafting():
|
||||
- idx 9: bonus token 200
|
||||
- idx 10-11: parallel_drafting_tokens, is_masked=True
|
||||
"""
|
||||
device = torch.device(current_platform.device_type)
|
||||
device = torch.device(DEVICE_TYPE)
|
||||
|
||||
num_speculative_tokens = 3
|
||||
block_size = BLOCK_SIZE
|
||||
@@ -859,7 +860,7 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch):
|
||||
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
|
||||
|
||||
# Use GPU device
|
||||
device = torch.device(current_platform.device_type)
|
||||
device = torch.device(DEVICE_TYPE)
|
||||
|
||||
# Setup test parameters
|
||||
batch_size = 2
|
||||
@@ -1030,7 +1031,7 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch):
|
||||
)
|
||||
def test_propose_tree(spec_token_tree):
|
||||
# Get GPU device.
|
||||
device = torch.device(current_platform.device_type)
|
||||
device = torch.device(DEVICE_TYPE)
|
||||
|
||||
# Setup test parameters.
|
||||
batch_size = 2
|
||||
|
||||
@@ -5,11 +5,14 @@
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.v1.spec_decode.utils import (
|
||||
PADDING_SLOT_ID,
|
||||
eagle_step_update_slot_mapping_and_metadata,
|
||||
)
|
||||
|
||||
DEVICE_TYPE = current_platform.device_type
|
||||
|
||||
# Skip if no CUDA - Triton kernel requires GPU
|
||||
pytest.importorskip("triton")
|
||||
if not torch.cuda.is_available():
|
||||
@@ -47,7 +50,7 @@ def _reference_eagle_step_slot_mapping(
|
||||
|
||||
def test_eagle_step_slot_mapping_kernel():
|
||||
"""Test fused kernel matches Python reference for slot mapping and metadata."""
|
||||
device = torch.device("cuda")
|
||||
device = torch.device(DEVICE_TYPE)
|
||||
batch_size = 32
|
||||
block_size = 16
|
||||
max_model_len = 4096
|
||||
@@ -93,7 +96,7 @@ def test_eagle_step_slot_mapping_kernel():
|
||||
|
||||
def test_eagle_step_slot_mapping_kernel_exceeds_max():
|
||||
"""Test fused kernel when position exceeds max_model_len."""
|
||||
device = torch.device("cuda")
|
||||
device = torch.device(DEVICE_TYPE)
|
||||
batch_size = 4
|
||||
block_size = 16
|
||||
max_model_len = 100
|
||||
@@ -130,7 +133,7 @@ def test_eagle_step_slot_mapping_kernel_exceeds_max():
|
||||
def test_eagle_step_slot_mapping_kernel_cudagraph_padding():
|
||||
"""Test that padding threads write PADDING_SLOT_ID when
|
||||
input_batch_size > batch_size (cudagraph padding)."""
|
||||
device = torch.device("cuda")
|
||||
device = torch.device(DEVICE_TYPE)
|
||||
batch_size = 4
|
||||
input_batch_size = 8
|
||||
block_size = 16
|
||||
|
||||
@@ -27,6 +27,7 @@ from vllm.v1.spec_decode.extract_hidden_states import ExtractHiddenStatesPropose
|
||||
from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
|
||||
|
||||
model_dir = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
|
||||
DEVICE_TYPE = current_platform.device_type
|
||||
|
||||
|
||||
def _create_proposer(
|
||||
@@ -51,7 +52,7 @@ def _create_proposer(
|
||||
},
|
||||
)
|
||||
|
||||
device = current_platform.device_type
|
||||
device = DEVICE_TYPE
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
cache_config=CacheConfig(),
|
||||
@@ -101,7 +102,7 @@ def test_proposer_initialization_missing_layer_ids():
|
||||
},
|
||||
)
|
||||
|
||||
device = current_platform.device_type
|
||||
device = DEVICE_TYPE
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
cache_config=CacheConfig(),
|
||||
@@ -130,7 +131,7 @@ def test_prepare_next_token_ids_padded():
|
||||
For each request we either use the sampled token (if valid and not discarded)
|
||||
or a backup token from the request state.
|
||||
"""
|
||||
device = torch.device(current_platform.device_type)
|
||||
device = torch.device(DEVICE_TYPE)
|
||||
|
||||
num_requests = 4
|
||||
req_ids = [f"req_{i + 1}" for i in range(num_requests)]
|
||||
@@ -197,7 +198,7 @@ def test_propose():
|
||||
2. Return the sampled tokens as "draft" tokens (shape [batch_size, 1])
|
||||
3. Cache the hidden states in the model's KV cache
|
||||
"""
|
||||
device = torch.device(current_platform.device_type)
|
||||
device = torch.device(DEVICE_TYPE)
|
||||
|
||||
# Setup test parameters
|
||||
batch_size = 2
|
||||
@@ -273,7 +274,7 @@ def test_propose():
|
||||
@pytest.mark.parametrize("num_hidden_layers", [1, 4, 8])
|
||||
def test_propose_different_layer_counts(num_hidden_layers):
|
||||
"""Test that propose works correctly with different numbers of hidden layers."""
|
||||
device = torch.device(current_platform.device_type)
|
||||
device = torch.device(DEVICE_TYPE)
|
||||
|
||||
batch_size = 2
|
||||
num_tokens = 5
|
||||
|
||||
@@ -28,6 +28,7 @@ from vllm.v1.attention.backends.registry import AttentionBackendEnum
|
||||
from vllm.v1.spec_decode.eagle import EagleProposer
|
||||
|
||||
mimo_7b_dir = "XiaomiMiMo/MiMo-7B-Base"
|
||||
DEVICE_TYPE = current_platform.device_type
|
||||
|
||||
|
||||
def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer:
|
||||
@@ -48,7 +49,7 @@ def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer:
|
||||
model_config=model_config,
|
||||
cache_config=CacheConfig(),
|
||||
speculative_config=speculative_config,
|
||||
device_config=DeviceConfig(device=current_platform.device_type),
|
||||
device_config=DeviceConfig(device=DEVICE_TYPE),
|
||||
parallel_config=ParallelConfig(),
|
||||
load_config=LoadConfig(),
|
||||
scheduler_config=SchedulerConfig(
|
||||
@@ -57,7 +58,7 @@ def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer:
|
||||
),
|
||||
)
|
||||
|
||||
return EagleProposer(vllm_config=vllm_config, device=current_platform.device_type)
|
||||
return EagleProposer(vllm_config=vllm_config, device=DEVICE_TYPE)
|
||||
|
||||
|
||||
@mock.patch("vllm.v1.spec_decode.eagle.get_pp_group")
|
||||
@@ -118,7 +119,7 @@ def test_mtp_load_model_unified(mock_get_model, mock_get_layers, mock_get_pp_gro
|
||||
def test_mtp_propose(num_speculative_tokens, monkeypatch):
|
||||
"""Test that MTP's forward method returns hidden states directly"""
|
||||
|
||||
device = torch.device(current_platform.device_type)
|
||||
device = torch.device(DEVICE_TYPE)
|
||||
batch_size = 2
|
||||
seq_lens = [5, 3]
|
||||
total_tokens = sum(seq_lens)
|
||||
|
||||
@@ -18,6 +18,8 @@ from vllm.v1.attention.backend import CommonAttentionMetadata
|
||||
from vllm.v1.attention.backends.fa_utils import is_flash_attn_varlen_func_available
|
||||
from vllm.v1.attention.backends.registry import AttentionBackendEnum
|
||||
|
||||
DEVICE_TYPE = current_platform.device_type
|
||||
|
||||
if not is_flash_attn_varlen_func_available():
|
||||
pytest.skip(
|
||||
"This test requires flash_attn_varlen_func, but it's not available.",
|
||||
@@ -170,9 +172,9 @@ def _get_available_reference_backends() -> list[AttentionBackendEnum]:
|
||||
|
||||
|
||||
class MockAttentionLayer(torch.nn.Module):
|
||||
_q_scale = torch.tensor(1.0, dtype=torch.float32, device="cuda")
|
||||
_k_scale = torch.tensor(1.0, dtype=torch.float32, device="cuda")
|
||||
_v_scale = torch.tensor(1.0, dtype=torch.float32, device="cuda")
|
||||
_q_scale = torch.tensor(1.0, dtype=torch.float32, device=DEVICE_TYPE)
|
||||
_k_scale = torch.tensor(1.0, dtype=torch.float32, device=DEVICE_TYPE)
|
||||
_v_scale = torch.tensor(1.0, dtype=torch.float32, device=DEVICE_TYPE)
|
||||
layer_name = "mock_layer"
|
||||
|
||||
def __init__(self):
|
||||
|
||||
Reference in New Issue
Block a user