[Platform] Deprecate seed_everything (#31659)

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
wangxiyuan
2026-01-05 10:34:04 +08:00
committed by GitHub
parent 367856de14
commit bb4337b34c
77 changed files with 219 additions and 171 deletions

View File

@@ -13,7 +13,7 @@ from vllm.model_executor.layers.quantization.awq_triton import (
awq_dequantize_triton,
awq_gemm_triton,
)
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
device = "cuda"
@@ -86,7 +86,7 @@ def test_dequantize(qweight_rows, qweight_cols, group_size):
zeros_cols = qweight_cols
zeros_dtype = torch.int32
current_platform.seed_everything(0)
set_random_seed(0)
qweight = torch.randint(
0,
@@ -141,7 +141,7 @@ def test_gemm(N, K, M, splitK, group_size):
qzeros_rows = scales_rows
qzeros_cols = qweight_cols
current_platform.seed_everything(0)
set_random_seed(0)
input = torch.rand((input_rows, input_cols), dtype=input_dtype, device=device)
qweight = torch.randint(

View File

@@ -17,6 +17,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
)
from vllm.platforms import current_platform
from vllm.scalar_type import ScalarType, scalar_types
from vllm.utils.torch_utils import set_random_seed
IS_SUPPORTED_BY_GPU = (
current_platform.is_cuda() and current_platform.get_device_capability()[0] >= 9
@@ -248,7 +249,7 @@ def compute_moe_reference_output(setup: MoETestSetup) -> torch.Tensor:
@pytest.mark.parametrize("random_zero", [True, False])
def test_cutlass_w4a8_moe_mm_end_to_end(shape, random_zero):
num_experts, N, K = shape
current_platform.seed_everything(42)
set_random_seed(42)
setup = make_moe_test_setup(
num_experts=num_experts, K=K, N=N, max_blocks=64, random_zero=random_zero
)
@@ -308,7 +309,7 @@ class W4A8MoELayer(torch.nn.Module):
reason="W4A8 Grouped GEMM is not supported on this GPU type.",
)
def test_cutlass_w4a8_moe_mm_cuda_graph():
current_platform.seed_everything(42)
set_random_seed(42)
# Fixed config for CUDA graph test (single parameter point).
num_experts = 8
K = 512

View File

@@ -12,6 +12,7 @@ from nvfp4_utils import (
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.utils.flashinfer import flashinfer_scaled_fp4_mm
from vllm.utils.torch_utils import set_random_seed
if not current_platform.has_device_capability(100):
pytest.skip(
@@ -72,7 +73,7 @@ def test_flashinfer_nvfp4_gemm(
if backend == "trtllm" and dtype == torch.float16:
pytest.skip("Only torch.bfloat16 is supported for TRTLLM FP4 GEMM operations")
current_platform.seed_everything(seed)
set_random_seed(seed)
m, n, packed_k = shape
k = packed_k * 2
block_size = 16

View File

@@ -6,6 +6,7 @@ import torch
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.utils.flashinfer import flashinfer_scaled_fp8_mm
from vllm.utils.torch_utils import set_random_seed
if not current_platform.has_device_capability(100):
pytest.skip(
@@ -38,7 +39,7 @@ def test_flashinfer_fp8_gemm(
device: str,
autotune: bool,
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
m, n, k = shape
a = torch.randn((m, k), dtype=dtype, device=device)
b = torch.randn((n, k), dtype=dtype, device=device) / k

View File

@@ -11,7 +11,7 @@ from tests.kernels.quant_utils import (
ref_dynamic_per_token_quant,
)
from tests.kernels.utils import opcheck
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
DTYPES = [torch.bfloat16, torch.float]
HIDDEN_SIZES = [17, 1024, 1025, 1026, 5137, 8193]
@@ -51,7 +51,7 @@ def opcheck_fp8_quant(
def test_dynamic_per_token_fp8_quant(
num_tokens: int, hidden_size: int, dtype: torch.dtype, scale_ub: bool, seed: int
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
x = (
torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") + 1e-6
@@ -81,7 +81,7 @@ def test_dynamic_per_token_fp8_quant(
def test_dynamic_per_tensor_fp8_quant(
num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda")
@@ -101,7 +101,7 @@ def test_dynamic_per_tensor_fp8_quant(
@torch.inference_mode()
@pytest.mark.parametrize("seed", SEEDS)
def test_fp8_quant_large(seed: int) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
num_tokens = 1024000 # Mistral-Nemo's max_position_embeddings
hidden_size = 1152 # Smallest hidden_size to reproduce the error

View File

@@ -7,7 +7,7 @@ import torch
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
@pytest.mark.parametrize(
@@ -30,7 +30,7 @@ def test_quantfp8_group_functionality(
Tests both CUDA and native implementations, column-major scales,
and verifies consistency between implementations.
"""
current_platform.seed_everything(seed)
set_random_seed(seed)
x = torch.randn((batch_size, hidden_dim), dtype=torch.bfloat16, device="cuda") * 8
expected_num_groups = (hidden_dim + group_size - 1) // group_size
@@ -83,7 +83,7 @@ def test_quantfp8_group_functionality(
@pytest.mark.parametrize("use_ue8m0", [True, False])
@torch.inference_mode()
def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
group_size = 64
@@ -136,7 +136,7 @@ def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
@pytest.mark.parametrize("seed", [42])
@torch.inference_mode()
def test_quantfp8_group_edge_cases(seed: int) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
batch_size = 16
group_size = 64

View File

@@ -11,7 +11,7 @@ from huggingface_hub import snapshot_download
import vllm._custom_ops as ops
from vllm.model_executor.layers.fused_moe import fused_experts
from vllm.model_executor.layers.quantization.gguf import _fused_moe_gguf
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample")
GGUF_SAMPLE_MOE = snapshot_download("SzymonOzog/test-gguf-moe-sample")
@@ -91,7 +91,7 @@ def test_dequantize(
@pytest.mark.parametrize("quant_type", QUANT_TYPES)
@torch.inference_mode()
def test_mmvq(hidden_size: int, dtype: torch.dtype, quant_type: GGMLQuantizationType):
current_platform.seed_everything(0)
set_random_seed(0)
tensors = get_gguf_sample_tensors(hidden_size, quant_type)
x = torch.rand((1, hidden_size), dtype=dtype, device="cuda")
@@ -134,7 +134,7 @@ def test_mmq(
dtype: torch.dtype,
quant_type: GGMLQuantizationType,
):
current_platform.seed_everything(0)
set_random_seed(0)
tensors = get_gguf_sample_tensors(hidden_size, quant_type)
x = torch.rand((num_tokens, hidden_size), dtype=dtype, device="cuda")
@@ -169,7 +169,7 @@ def test_moe(
quant_type: GGMLQuantizationType,
top_k: int,
):
current_platform.seed_everything(0)
set_random_seed(0)
H, E = 1024, 256
x = torch.rand((num_tokens, H), dtype=dtype, device="cuda")

View File

@@ -7,7 +7,7 @@ import torch
from tests.kernels.quant_utils import ref_dynamic_per_token_quant
from tests.kernels.utils import opcheck
from vllm._custom_ops import scaled_int8_quant
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
DTYPES = [torch.bfloat16, torch.float]
HIDDEN_SIZES = [17, 1024, 1025, 1026, 5137, 8193]
@@ -46,7 +46,7 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True):
def test_dynamic_scaled_int8_quant(
num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
@@ -70,7 +70,7 @@ def test_dynamic_scaled_int8_quant(
def test_dynamic_scaled_int8_azp_quant(
num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
int8_traits = torch.iinfo(torch.int8)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 - 300
@@ -111,7 +111,7 @@ def test_dynamic_scaled_int8_azp_quant(
def test_static_scaled_int8_quant(
num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int, scale: float
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
int8_traits = torch.iinfo(torch.int8)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
@@ -144,7 +144,7 @@ def test_static_scaled_int8_azp_quant(
scale: float,
azp: int,
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
int8_traits = torch.iinfo(torch.int8)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 - 300

View File

@@ -24,6 +24,7 @@ from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_m
from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn
from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
if not torch.cuda.is_available():
pytest.skip("CUDA required for these tests.", allow_module_level=True)
@@ -205,7 +206,7 @@ LLAMA_MODELS = {
@pytest.fixture(autouse=True)
def _seed_each_test():
current_platform.seed_everything(0)
set_random_seed(0)
np.random.seed(0)
torch.random.manual_seed(0)

View File

@@ -6,6 +6,7 @@ import torch
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.scalar_type import scalar_types
from vllm.utils.torch_utils import set_random_seed
if not current_platform.has_device_capability(100):
pytest.skip(
@@ -134,7 +135,7 @@ def test_quantize_to_fp4(
seed: int,
device: str,
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
torch.set_default_device(device)
m, n = shape
@@ -156,7 +157,7 @@ def test_quantize_to_fp4(
@torch.inference_mode()
def test_quantize_to_fp4_padded(pad_shape: tuple[int, int]) -> None:
dtype = torch.float16
current_platform.seed_everything(42)
set_random_seed(42)
torch.set_default_device("cuda:0")
m, n = pad_shape

View File

@@ -25,6 +25,7 @@ from vllm import _custom_ops as ops # use existing nvfp4 gemm in vllm
from vllm._custom_ops import fusedQuantizeNv
from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
if not torch.cuda.is_available():
pytest.skip("CUDA required for these tests.", allow_module_level=True)
@@ -193,7 +194,7 @@ LLAMA_MODELS = {
@pytest.fixture(autouse=True)
def _seed_each_test():
current_platform.seed_everything(0)
set_random_seed(0)
np.random.seed(0)
torch.random.manual_seed(0)

View File

@@ -6,6 +6,7 @@ from nvfp4_utils import FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX, dequantize_nvfp4_to_dt
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
if not current_platform.has_device_capability(100):
pytest.skip(
@@ -59,7 +60,7 @@ def test_nvfp4_gemm(
seed: int,
device: str,
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
m, n, packed_k = shape
k = packed_k * 2
block_size = 16

View File

@@ -11,6 +11,7 @@ from tests.kernels.quantization.nvfp4_utils import (
from vllm._custom_ops import scaled_fp4_quant
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
if not current_platform.has_device_capability(100):
pytest.skip(
@@ -33,7 +34,7 @@ def test_silu_mul_nvfp4_quant(
dtype: torch.dtype,
shape: tuple[int, int],
) -> None:
current_platform.seed_everything(42)
set_random_seed(42)
device = "cuda:0"
torch.set_default_device(device)

View File

@@ -11,6 +11,7 @@ import pytest
import torch
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
device = "cuda"
@@ -85,7 +86,7 @@ def test_scaled_mm(
):
is_floating_point_type = lambda t: torch.tensor([1, 1], dtype=t).is_floating_point()
current_platform.seed_everything(0)
set_random_seed(0)
# NOTE: There are cases, where if the matrix is large enough, an output
# like 65504.4 can be produced, and can easily turn into inf when