[Platform] Deprecate seed_everything (#31659)

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2026-01-05 10:34:04 +08:00
parent 367856de14
commit bb4337b34c
77 changed files with 219 additions and 171 deletions
--- a/tests/kernels/quantization/test_awq_triton.py
+++ b/tests/kernels/quantization/test_awq_triton.py
@@ -13,7 +13,7 @@ from vllm.model_executor.layers.quantization.awq_triton import (
    awq_dequantize_triton,
    awq_gemm_triton,
 )
-from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed

 device = "cuda"

@@ -86,7 +86,7 @@ def test_dequantize(qweight_rows, qweight_cols, group_size):
    zeros_cols = qweight_cols
    zeros_dtype = torch.int32

-    current_platform.seed_everything(0)
+    set_random_seed(0)

    qweight = torch.randint(
        0,
@@ -141,7 +141,7 @@ def test_gemm(N, K, M, splitK, group_size):
    qzeros_rows = scales_rows
    qzeros_cols = qweight_cols

-    current_platform.seed_everything(0)
+    set_random_seed(0)

    input = torch.rand((input_rows, input_cols), dtype=input_dtype, device=device)
    qweight = torch.randint(
--- a/tests/kernels/quantization/test_cutlass_w4a8_moe.py
+++ b/tests/kernels/quantization/test_cutlass_w4a8_moe.py
@@ -17,6 +17,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
 )
 from vllm.platforms import current_platform
 from vllm.scalar_type import ScalarType, scalar_types
+from vllm.utils.torch_utils import set_random_seed

 IS_SUPPORTED_BY_GPU = (
    current_platform.is_cuda() and current_platform.get_device_capability()[0] >= 9
@@ -248,7 +249,7 @@ def compute_moe_reference_output(setup: MoETestSetup) -> torch.Tensor:
@pytest.mark.parametrize("random_zero", [True, False])
 def test_cutlass_w4a8_moe_mm_end_to_end(shape, random_zero):
    num_experts, N, K = shape
-    current_platform.seed_everything(42)
+    set_random_seed(42)
    setup = make_moe_test_setup(
        num_experts=num_experts, K=K, N=N, max_blocks=64, random_zero=random_zero
    )
@@ -308,7 +309,7 @@ class W4A8MoELayer(torch.nn.Module):
    reason="W4A8 Grouped GEMM is not supported on this GPU type.",
 )
 def test_cutlass_w4a8_moe_mm_cuda_graph():
-    current_platform.seed_everything(42)
+    set_random_seed(42)
    # Fixed config for CUDA graph test (single parameter point).
    num_experts = 8
    K = 512
--- a/tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+++ b/tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
@@ -12,6 +12,7 @@ from nvfp4_utils import (
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import flashinfer_scaled_fp4_mm
+from vllm.utils.torch_utils import set_random_seed

 if not current_platform.has_device_capability(100):
    pytest.skip(
@@ -72,7 +73,7 @@ def test_flashinfer_nvfp4_gemm(
    if backend == "trtllm" and dtype == torch.float16:
        pytest.skip("Only torch.bfloat16 is supported for TRTLLM FP4 GEMM operations")

-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
    m, n, packed_k = shape
    k = packed_k * 2
    block_size = 16
--- a/tests/kernels/quantization/test_flashinfer_scaled_mm.py
+++ b/tests/kernels/quantization/test_flashinfer_scaled_mm.py
@@ -6,6 +6,7 @@ import torch
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import flashinfer_scaled_fp8_mm
+from vllm.utils.torch_utils import set_random_seed

 if not current_platform.has_device_capability(100):
    pytest.skip(
@@ -38,7 +39,7 @@ def test_flashinfer_fp8_gemm(
    device: str,
    autotune: bool,
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
    m, n, k = shape
    a = torch.randn((m, k), dtype=dtype, device=device)
    b = torch.randn((n, k), dtype=dtype, device=device) / k
--- a/tests/kernels/quantization/test_fp8_quant.py
+++ b/tests/kernels/quantization/test_fp8_quant.py
@@ -11,7 +11,7 @@ from tests.kernels.quant_utils import (
    ref_dynamic_per_token_quant,
 )
 from tests.kernels.utils import opcheck
-from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed

 DTYPES = [torch.bfloat16, torch.float]
 HIDDEN_SIZES = [17, 1024, 1025, 1026, 5137, 8193]
@@ -51,7 +51,7 @@ def opcheck_fp8_quant(
 def test_dynamic_per_token_fp8_quant(
    num_tokens: int, hidden_size: int, dtype: torch.dtype, scale_ub: bool, seed: int
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)

    x = (
        torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") + 1e-6
@@ -81,7 +81,7 @@ def test_dynamic_per_token_fp8_quant(
 def test_dynamic_per_tensor_fp8_quant(
    num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)

    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda")

@@ -101,7 +101,7 @@ def test_dynamic_per_tensor_fp8_quant(
@torch.inference_mode()
@pytest.mark.parametrize("seed", SEEDS)
 def test_fp8_quant_large(seed: int) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)

    num_tokens = 1024000  # Mistral-Nemo's max_position_embeddings
    hidden_size = 1152  # Smallest hidden_size to reproduce the error
--- a/tests/kernels/quantization/test_fp8_quant_group.py
+++ b/tests/kernels/quantization/test_fp8_quant_group.py
@@ -7,7 +7,7 @@ import torch

 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
-from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed


@pytest.mark.parametrize(
@@ -30,7 +30,7 @@ def test_quantfp8_group_functionality(
    Tests both CUDA and native implementations, column-major scales,
    and verifies consistency between implementations.
    """
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)

    x = torch.randn((batch_size, hidden_dim), dtype=torch.bfloat16, device="cuda") * 8
    expected_num_groups = (hidden_dim + group_size - 1) // group_size
@@ -83,7 +83,7 @@ def test_quantfp8_group_functionality(
@pytest.mark.parametrize("use_ue8m0", [True, False])
@torch.inference_mode()
 def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)

    group_size = 64

@@ -136,7 +136,7 @@ def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
@pytest.mark.parametrize("seed", [42])
@torch.inference_mode()
 def test_quantfp8_group_edge_cases(seed: int) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)

    batch_size = 16
    group_size = 64
--- a/tests/kernels/quantization/test_gguf.py
+++ b/tests/kernels/quantization/test_gguf.py
@@ -11,7 +11,7 @@ from huggingface_hub import snapshot_download
 import vllm._custom_ops as ops
 from vllm.model_executor.layers.fused_moe import fused_experts
 from vllm.model_executor.layers.quantization.gguf import _fused_moe_gguf
-from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed

 GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample")
 GGUF_SAMPLE_MOE = snapshot_download("SzymonOzog/test-gguf-moe-sample")
@@ -91,7 +91,7 @@ def test_dequantize(
@pytest.mark.parametrize("quant_type", QUANT_TYPES)
@torch.inference_mode()
 def test_mmvq(hidden_size: int, dtype: torch.dtype, quant_type: GGMLQuantizationType):
-    current_platform.seed_everything(0)
+    set_random_seed(0)

    tensors = get_gguf_sample_tensors(hidden_size, quant_type)
    x = torch.rand((1, hidden_size), dtype=dtype, device="cuda")
@@ -134,7 +134,7 @@ def test_mmq(
    dtype: torch.dtype,
    quant_type: GGMLQuantizationType,
 ):
-    current_platform.seed_everything(0)
+    set_random_seed(0)

    tensors = get_gguf_sample_tensors(hidden_size, quant_type)
    x = torch.rand((num_tokens, hidden_size), dtype=dtype, device="cuda")
@@ -169,7 +169,7 @@ def test_moe(
    quant_type: GGMLQuantizationType,
    top_k: int,
 ):
-    current_platform.seed_everything(0)
+    set_random_seed(0)
    H, E = 1024, 256

    x = torch.rand((num_tokens, H), dtype=dtype, device="cuda")
--- a/tests/kernels/quantization/test_int8_quant.py
+++ b/tests/kernels/quantization/test_int8_quant.py
@@ -7,7 +7,7 @@ import torch
 from tests.kernels.quant_utils import ref_dynamic_per_token_quant
 from tests.kernels.utils import opcheck
 from vllm._custom_ops import scaled_int8_quant
-from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed

 DTYPES = [torch.bfloat16, torch.float]
 HIDDEN_SIZES = [17, 1024, 1025, 1026, 5137, 8193]
@@ -46,7 +46,7 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True):
 def test_dynamic_scaled_int8_quant(
    num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)

    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000

@@ -70,7 +70,7 @@ def test_dynamic_scaled_int8_quant(
 def test_dynamic_scaled_int8_azp_quant(
    num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
    int8_traits = torch.iinfo(torch.int8)

    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 - 300
@@ -111,7 +111,7 @@ def test_dynamic_scaled_int8_azp_quant(
 def test_static_scaled_int8_quant(
    num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int, scale: float
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
    int8_traits = torch.iinfo(torch.int8)

    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
@@ -144,7 +144,7 @@ def test_static_scaled_int8_azp_quant(
    scale: float,
    azp: int,
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
    int8_traits = torch.iinfo(torch.int8)

    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 - 300
--- a/tests/kernels/quantization/test_mxfp4_qutlass.py
+++ b/tests/kernels/quantization/test_mxfp4_qutlass.py
@@ -24,6 +24,7 @@ from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_m
 from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn
 from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed

 if not torch.cuda.is_available():
    pytest.skip("CUDA required for these tests.", allow_module_level=True)
@@ -205,7 +206,7 @@ LLAMA_MODELS = {

@pytest.fixture(autouse=True)
 def _seed_each_test():
-    current_platform.seed_everything(0)
+    set_random_seed(0)
    np.random.seed(0)
    torch.random.manual_seed(0)

--- a/tests/kernels/quantization/test_nvfp4_quant.py
+++ b/tests/kernels/quantization/test_nvfp4_quant.py
@@ -6,6 +6,7 @@ import torch
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
 from vllm.scalar_type import scalar_types
+from vllm.utils.torch_utils import set_random_seed

 if not current_platform.has_device_capability(100):
    pytest.skip(
@@ -134,7 +135,7 @@ def test_quantize_to_fp4(
    seed: int,
    device: str,
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
    torch.set_default_device(device)

    m, n = shape
@@ -156,7 +157,7 @@ def test_quantize_to_fp4(
@torch.inference_mode()
 def test_quantize_to_fp4_padded(pad_shape: tuple[int, int]) -> None:
    dtype = torch.float16
-    current_platform.seed_everything(42)
+    set_random_seed(42)
    torch.set_default_device("cuda:0")

    m, n = pad_shape
--- a/tests/kernels/quantization/test_nvfp4_qutlass.py
+++ b/tests/kernels/quantization/test_nvfp4_qutlass.py
@@ -25,6 +25,7 @@ from vllm import _custom_ops as ops  # use existing nvfp4 gemm in vllm
 from vllm._custom_ops import fusedQuantizeNv
 from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed

 if not torch.cuda.is_available():
    pytest.skip("CUDA required for these tests.", allow_module_level=True)
@@ -193,7 +194,7 @@ LLAMA_MODELS = {

@pytest.fixture(autouse=True)
 def _seed_each_test():
-    current_platform.seed_everything(0)
+    set_random_seed(0)
    np.random.seed(0)
    torch.random.manual_seed(0)

--- a/tests/kernels/quantization/test_nvfp4_scaled_mm.py
+++ b/tests/kernels/quantization/test_nvfp4_scaled_mm.py
@@ -6,6 +6,7 @@ from nvfp4_utils import FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX, dequantize_nvfp4_to_dt

 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed

 if not current_platform.has_device_capability(100):
    pytest.skip(
@@ -59,7 +60,7 @@ def test_nvfp4_gemm(
    seed: int,
    device: str,
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
    m, n, packed_k = shape
    k = packed_k * 2
    block_size = 16
--- a/tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
+++ b/tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
@@ -11,6 +11,7 @@ from tests.kernels.quantization.nvfp4_utils import (
 from vllm._custom_ops import scaled_fp4_quant
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed

 if not current_platform.has_device_capability(100):
    pytest.skip(
@@ -33,7 +34,7 @@ def test_silu_mul_nvfp4_quant(
    dtype: torch.dtype,
    shape: tuple[int, int],
 ) -> None:
-    current_platform.seed_everything(42)
+    set_random_seed(42)
    device = "cuda:0"
    torch.set_default_device(device)

--- a/tests/kernels/quantization/test_triton_scaled_mm.py
+++ b/tests/kernels/quantization/test_triton_scaled_mm.py
@@ -11,6 +11,7 @@ import pytest
 import torch

 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed

 device = "cuda"

@@ -85,7 +86,7 @@ def test_scaled_mm(
 ):
    is_floating_point_type = lambda t: torch.tensor([1, 1], dtype=t).is_floating_point()

-    current_platform.seed_everything(0)
+    set_random_seed(0)

    # NOTE: There are cases, where if the matrix is large enough, an output
    # like 65504.4 can be produced, and can easily turn into inf when