[Platform] Deprecate seed_everything (#31659)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
@@ -8,6 +8,7 @@ import torch
|
||||
import vllm.v1.attention.backends.rocm_aiter_fa # noqa: F401
|
||||
from vllm.attention.utils.fa_utils import is_flash_attn_varlen_func_available
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.torch_utils import set_random_seed
|
||||
|
||||
NUM_HEADS = [(4, 4), (8, 2)]
|
||||
HEAD_SIZES = [128, 256]
|
||||
@@ -104,7 +105,7 @@ def test_varlen_with_paged_kv(
|
||||
if not is_flash_attn_varlen_func_available():
|
||||
pytest.skip("flash_attn_varlen_func required to run this test.")
|
||||
torch.set_default_device("cuda")
|
||||
current_platform.seed_everything(0)
|
||||
set_random_seed(0)
|
||||
num_seqs = len(seq_lens)
|
||||
query_lens = [x[0] for x in seq_lens]
|
||||
kv_lens = [x[1] for x in seq_lens]
|
||||
|
||||
@@ -13,6 +13,7 @@ from vllm.attention.layer import Attention
|
||||
from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.mem_utils import get_max_shared_memory_bytes
|
||||
from vllm.utils.torch_utils import set_random_seed
|
||||
|
||||
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
|
||||
# This will change depending on the compute capability.
|
||||
@@ -150,7 +151,7 @@ def test_paged_attention(
|
||||
|
||||
global PARTITION_SIZE
|
||||
|
||||
current_platform.seed_everything(seed)
|
||||
set_random_seed(seed)
|
||||
torch.set_default_device(device)
|
||||
scale = float(1.0 / (head_size**0.5))
|
||||
num_query_heads, num_kv_heads = num_heads
|
||||
|
||||
@@ -9,6 +9,7 @@ import torch
|
||||
from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.torch_utils import set_random_seed
|
||||
|
||||
COPYING_DIRECTION = [("cuda", "cpu"), ("cuda", "cuda"), ("cpu", "cuda")]
|
||||
DTYPES = [torch.bfloat16, torch.float]
|
||||
@@ -64,7 +65,7 @@ def test_reshape_and_cache(
|
||||
) -> None:
|
||||
if kv_cache_dtype == "fp8" and head_size % 16:
|
||||
pytest.skip()
|
||||
current_platform.seed_everything(seed)
|
||||
set_random_seed(seed)
|
||||
torch.set_default_device(device)
|
||||
torch.cuda.set_device(device)
|
||||
# Create a random slot mapping.
|
||||
@@ -185,7 +186,7 @@ def test_reshape_and_cache_flash(
|
||||
kv_cache_layout: str,
|
||||
implementation: str,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
set_random_seed(seed)
|
||||
torch.set_default_device(device)
|
||||
torch.cuda.set_device(device)
|
||||
assert implementation in ["cuda", "triton"]
|
||||
@@ -355,7 +356,7 @@ def test_swap_blocks(
|
||||
if kv_cache_dtype == "fp8" and head_size % 16:
|
||||
pytest.skip()
|
||||
|
||||
current_platform.seed_everything(seed)
|
||||
set_random_seed(seed)
|
||||
|
||||
src_device = device if direction[0] == "cuda" else "cpu"
|
||||
dst_device = device if direction[1] == "cuda" else "cpu"
|
||||
@@ -444,7 +445,7 @@ def test_fp8_e4m3_conversion(
|
||||
seed: int,
|
||||
device: str,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
set_random_seed(seed)
|
||||
|
||||
low = -224.0
|
||||
high = 224.0
|
||||
@@ -507,7 +508,7 @@ def test_concat_and_cache_mla(
|
||||
device: str,
|
||||
kv_cache_dtype: str,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
set_random_seed(seed)
|
||||
torch.set_default_device(device)
|
||||
torch.cuda.set_device(device)
|
||||
|
||||
@@ -584,7 +585,7 @@ def test_concat_and_cache_ds_mla(
|
||||
if dtype.itemsize != 2:
|
||||
pytest.skip("ds_mla only supports 16-bit input")
|
||||
kv_cache_dtype = "fp8_ds_mla"
|
||||
current_platform.seed_everything(seed)
|
||||
set_random_seed(seed)
|
||||
torch.set_default_device(device)
|
||||
torch.cuda.set_device(device)
|
||||
|
||||
@@ -695,7 +696,7 @@ def test_swap_blocks_mla(
|
||||
device: str,
|
||||
kv_cache_dtype: str,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
set_random_seed(seed)
|
||||
torch.set_default_device(device)
|
||||
torch.cuda.set_device(device)
|
||||
|
||||
@@ -947,7 +948,7 @@ def test_concat_and_cache_mla_cpu(
|
||||
) -> None:
|
||||
device = "cpu"
|
||||
kv_cache_dtype = "auto"
|
||||
current_platform.seed_everything(seed)
|
||||
set_random_seed(seed)
|
||||
torch.set_default_device(device)
|
||||
|
||||
total_slots = num_blocks * block_size
|
||||
|
||||
@@ -6,6 +6,7 @@ import pytest
|
||||
import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.torch_utils import set_random_seed
|
||||
from vllm.v1.attention.backends.flash_attn import cascade_attention, merge_attn_states
|
||||
|
||||
try:
|
||||
@@ -39,7 +40,7 @@ def test_merge_kernel(
|
||||
dtype: torch.dtype,
|
||||
):
|
||||
torch.set_default_device("cuda")
|
||||
current_platform.seed_everything(0)
|
||||
set_random_seed(0)
|
||||
num_query_heads = num_heads[0]
|
||||
num_kv_heads = num_heads[1]
|
||||
assert num_query_heads % num_kv_heads == 0
|
||||
@@ -103,7 +104,7 @@ def test_cascade(
|
||||
f'to: "{fa_version_unsupported_reason(fa_version)}"'
|
||||
)
|
||||
|
||||
current_platform.seed_everything(0)
|
||||
set_random_seed(0)
|
||||
|
||||
window_size = (-1, -1)
|
||||
scale = head_size**-0.5
|
||||
|
||||
@@ -8,6 +8,7 @@ import pytest
|
||||
import torch
|
||||
|
||||
from vllm.platforms import CpuArchEnum, current_platform
|
||||
from vllm.utils.torch_utils import set_random_seed
|
||||
from vllm.v1.attention.backends.cpu_attn import _get_attn_isa
|
||||
|
||||
if not current_platform.is_cpu():
|
||||
@@ -190,7 +191,7 @@ def varlen_with_paged_kv(
|
||||
use_sink: bool,
|
||||
isa: str,
|
||||
) -> None:
|
||||
current_platform.seed_everything(0)
|
||||
set_random_seed(0)
|
||||
num_seqs = len(seq_lens)
|
||||
query_lens = [x[0] for x in seq_lens]
|
||||
kv_lens = [x[1] for x in seq_lens]
|
||||
|
||||
@@ -6,6 +6,7 @@ import pytest
|
||||
import torch
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.torch_utils import set_random_seed
|
||||
|
||||
try:
|
||||
from vllm.vllm_flash_attn import (
|
||||
@@ -129,7 +130,7 @@ def test_varlen_with_paged_kv(
|
||||
"Flash attention with quantized inputs is only "
|
||||
"supported on version 3 with bfloat16 base type"
|
||||
)
|
||||
current_platform.seed_everything(0)
|
||||
set_random_seed(0)
|
||||
num_seqs = len(seq_lens)
|
||||
query_lens = [x[0] for x in seq_lens]
|
||||
kv_lens = [x[1] for x in seq_lens]
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
import pytest
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.torch_utils import set_random_seed
|
||||
|
||||
try:
|
||||
import flashinfer
|
||||
@@ -101,7 +102,7 @@ def test_flashinfer_decode_with_paged_kv(
|
||||
sliding_window: int | None,
|
||||
) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
current_platform.seed_everything(0)
|
||||
set_random_seed(0)
|
||||
num_seqs = len(kv_lens)
|
||||
num_query_heads = num_heads[0]
|
||||
num_kv_heads = num_heads[1]
|
||||
@@ -196,7 +197,7 @@ def test_flashinfer_prefill_with_paged_kv(
|
||||
sliding_window: int | None,
|
||||
) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
current_platform.seed_everything(0)
|
||||
set_random_seed(0)
|
||||
num_seqs = len(seq_lens)
|
||||
query_lens = [x[0] for x in seq_lens]
|
||||
kv_lens = [x[1] for x in seq_lens]
|
||||
@@ -299,7 +300,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv(
|
||||
) -> None:
|
||||
pytest.skip("TODO: fix the accuracy issue")
|
||||
torch.set_default_device("cuda")
|
||||
current_platform.seed_everything(0)
|
||||
set_random_seed(0)
|
||||
num_seqs = len(seq_lens)
|
||||
query_lens = [x[0] for x in seq_lens]
|
||||
kv_lens = [x[1] for x in seq_lens]
|
||||
@@ -409,7 +410,7 @@ def test_flashinfer_decode_with_paged_fp8_kv(
|
||||
) -> None:
|
||||
# test doesn't work for num_heads = (16,16)
|
||||
torch.set_default_device("cuda")
|
||||
current_platform.seed_everything(0)
|
||||
set_random_seed(0)
|
||||
num_seqs = len(kv_lens)
|
||||
num_query_heads = num_heads[0]
|
||||
num_kv_heads = num_heads[1]
|
||||
|
||||
@@ -10,6 +10,7 @@ from tests.kernels.quantization.nvfp4_utils import (
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.math_utils import round_up
|
||||
from vllm.utils.torch_utils import set_random_seed
|
||||
|
||||
if not current_platform.is_device_capability_family(100):
|
||||
pytest.skip(
|
||||
@@ -80,7 +81,7 @@ def test_flashinfer_trtllm_decode_with_baseline(
|
||||
has_sinks: bool,
|
||||
) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
current_platform.seed_everything(42)
|
||||
set_random_seed(42)
|
||||
|
||||
q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes
|
||||
q_quant_dtype = q_quant_dtype or dtype
|
||||
@@ -279,7 +280,7 @@ def test_flashinfer_trtllm_prefill_with_baseline(
|
||||
has_sinks: bool,
|
||||
) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
current_platform.seed_everything(42)
|
||||
set_random_seed(42)
|
||||
|
||||
q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes
|
||||
q_quant_dtype = q_quant_dtype or dtype
|
||||
|
||||
@@ -5,7 +5,7 @@ import pytest
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.lightning_attn import linear_decode_forward_triton
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.torch_utils import set_random_seed
|
||||
|
||||
NUM_HEADS = [4, 8]
|
||||
HEAD_SIZES = [64]
|
||||
@@ -124,7 +124,7 @@ def test_linear_decode_forward_triton(
|
||||
torch.set_default_device("cuda")
|
||||
torch.manual_seed(42)
|
||||
torch.cuda.manual_seed_all(42)
|
||||
current_platform.seed_everything(42)
|
||||
set_random_seed(42)
|
||||
base = 0.01
|
||||
q = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype)
|
||||
k = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype)
|
||||
@@ -167,7 +167,7 @@ def test_linear_decode_forward_triton_with_padding(
|
||||
torch.set_default_device("cuda")
|
||||
torch.manual_seed(42)
|
||||
torch.cuda.manual_seed_all(42)
|
||||
current_platform.seed_everything(42)
|
||||
set_random_seed(42)
|
||||
|
||||
batch_size = 4
|
||||
base = 0.01
|
||||
@@ -231,7 +231,7 @@ def test_lightning_attention_reference(
|
||||
torch.set_default_device("cuda")
|
||||
torch.manual_seed(42)
|
||||
torch.cuda.manual_seed_all(42)
|
||||
current_platform.seed_everything(42)
|
||||
set_random_seed(42)
|
||||
|
||||
base = 0.01
|
||||
q = base * torch.randn(batch_size, num_heads, seq_len, head_size, dtype=dtype)
|
||||
|
||||
@@ -19,6 +19,7 @@ from vllm.platforms import current_platform
|
||||
from vllm.platforms.cpu import CpuPlatform
|
||||
from vllm.platforms.cuda import CudaPlatform
|
||||
from vllm.platforms.rocm import RocmPlatform
|
||||
from vllm.utils.torch_utils import set_random_seed
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
@@ -123,7 +124,7 @@ def test_mha_attn_forward(
|
||||
dtype: torch.dtype,
|
||||
device: str,
|
||||
):
|
||||
current_platform.seed_everything(0)
|
||||
set_random_seed(0)
|
||||
torch.set_default_device(device)
|
||||
torch.set_default_dtype(dtype)
|
||||
|
||||
@@ -168,7 +169,7 @@ def test_mha_attn_varlen_forward(
|
||||
dtype: torch.dtype,
|
||||
device: str,
|
||||
):
|
||||
current_platform.seed_everything(0)
|
||||
set_random_seed(0)
|
||||
torch.set_default_device(device)
|
||||
torch.set_default_dtype(dtype)
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@ import torch.nn.functional as F
|
||||
from vllm.attention.ops.chunked_prefill_paged_decode import chunked_prefill_paged_decode
|
||||
from vllm.attention.ops.prefix_prefill import context_attention_fwd
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
|
||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed
|
||||
|
||||
NUM_HEADS = [64]
|
||||
NUM_QUERIES_PER_KV = [1, 64]
|
||||
@@ -125,7 +125,7 @@ def test_contexted_kv_attention(
|
||||
):
|
||||
pytest.skip("ROCm custom paged attention does not support fp8_e5m2 KV cache")
|
||||
|
||||
current_platform.seed_everything(0)
|
||||
set_random_seed(0)
|
||||
torch.set_default_device(device)
|
||||
|
||||
# Need this, otherwise when we capture the graph the process
|
||||
@@ -346,7 +346,7 @@ def test_contexted_kv_attention_alibi(
|
||||
):
|
||||
pytest.skip("ROCm custom paged attention does not support fp8_e5m2 KV cache")
|
||||
|
||||
current_platform.seed_everything(0)
|
||||
set_random_seed(0)
|
||||
torch.set_default_device(device)
|
||||
|
||||
# Need this, otherwise when we capture the graph the process
|
||||
|
||||
@@ -8,6 +8,7 @@ import torch
|
||||
from vllm.attention.ops.triton_unified_attention import unified_attention
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils.math_utils import next_power_of_2
|
||||
from vllm.utils.torch_utils import set_random_seed
|
||||
|
||||
NUM_HEADS = [(4, 4), (8, 2)]
|
||||
HEAD_SIZES = [128, 256]
|
||||
@@ -113,7 +114,7 @@ def test_triton_unified_attn(
|
||||
) -> None:
|
||||
torch.set_default_device("cuda")
|
||||
|
||||
current_platform.seed_everything(0)
|
||||
set_random_seed(0)
|
||||
num_seqs = len(seq_lens)
|
||||
query_lens = [x[0] for x in seq_lens]
|
||||
kv_lens = [x[1] for x in seq_lens]
|
||||
|
||||
Reference in New Issue
Block a user