diff --git a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py index fb3329975..4978a8777 100644 --- a/benchmarks/fused_kernels/layernorm_rms_benchmarks.py +++ b/benchmarks/fused_kernels/layernorm_rms_benchmarks.py @@ -13,6 +13,7 @@ from torch.utils.benchmark import Measurement as TMeasurement from tqdm import tqdm import vllm._custom_ops as ops +from vllm.benchmarks.lib.utils import default_vllm_config from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.quantization.utils.fp8_utils import ( per_token_group_quant_fp8, @@ -291,6 +292,7 @@ def print_timers(timers: Iterable[TMeasurement]): compare.print() +@default_vllm_config() def main(): torch.set_default_device("cuda") bench_params = get_bench_params() diff --git a/benchmarks/kernels/benchmark_activation.py b/benchmarks/kernels/benchmark_activation.py index bb66e5d08..e1cec02b7 100644 --- a/benchmarks/kernels/benchmark_activation.py +++ b/benchmarks/kernels/benchmark_activation.py @@ -7,6 +7,7 @@ import itertools import torch import vllm.model_executor.layers.activation # noqa F401 +from vllm.benchmarks.lib.utils import default_vllm_config from vllm.model_executor.custom_op import op_registry from vllm.triton_utils import triton from vllm.utils.argparse_utils import FlexibleArgumentParser @@ -18,6 +19,7 @@ intermediate_size = [3072, 9728, 12288] configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size)) +@default_vllm_config() def benchmark_activation( batch_size: int, seq_len: int, diff --git a/benchmarks/kernels/bench_block_fp8_gemm.py b/benchmarks/kernels/benchmark_block_fp8_gemm.py similarity index 98% rename from benchmarks/kernels/bench_block_fp8_gemm.py rename to benchmarks/kernels/benchmark_block_fp8_gemm.py index 11e3ac7f0..8d50c3828 100644 --- a/benchmarks/kernels/bench_block_fp8_gemm.py +++ b/benchmarks/kernels/benchmark_block_fp8_gemm.py @@ -8,6 +8,7 @@ os.environ["VLLM_USE_DEEP_GEMM"] = "0" import torch +from vllm.benchmarks.lib.utils import default_vllm_config from vllm.model_executor.layers.quantization.utils.fp8_utils import ( W8A8BlockFp8LinearOp, ) @@ -40,6 +41,7 @@ DEEPSEEK_V3_SHAPES = [ ] +@default_vllm_config() def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass): """Build runner function for w8a8 block fp8 matmul.""" factor_for_scale = 1e-2 diff --git a/benchmarks/kernels/bench_fp8_gemm.py b/benchmarks/kernels/benchmark_fp8_gemm.py similarity index 100% rename from benchmarks/kernels/bench_fp8_gemm.py rename to benchmarks/kernels/benchmark_fp8_gemm.py diff --git a/benchmarks/kernels/bench_int8_gemm.py b/benchmarks/kernels/benchmark_int8_gemm.py similarity index 100% rename from benchmarks/kernels/bench_int8_gemm.py rename to benchmarks/kernels/benchmark_int8_gemm.py diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py index 2292d2f87..cc1c1cf09 100644 --- a/benchmarks/kernels/benchmark_layernorm.py +++ b/benchmarks/kernels/benchmark_layernorm.py @@ -5,12 +5,14 @@ import time import torch +from vllm.benchmarks.lib.utils import default_vllm_config from vllm.model_executor.layers.layernorm import RMSNorm from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed @torch.inference_mode() +@default_vllm_config() def main( num_tokens: int, hidden_size: int, diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py index 3e0365135..2c086870c 100644 --- a/benchmarks/kernels/benchmark_mrope.py +++ b/benchmarks/kernels/benchmark_mrope.py @@ -36,6 +36,7 @@ from typing import Any import numpy as np import torch +from vllm.benchmarks.lib.utils import default_vllm_config from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.transformers_utils.config import get_config from vllm.utils.argparse_utils import FlexibleArgumentParser @@ -78,6 +79,7 @@ def calculate_stats(times: list[float]) -> dict[str, float]: } +@default_vllm_config() def benchmark_mrope( model_name: str, num_tokens: int, diff --git a/benchmarks/kernels/bench_mxfp4_qutlass.py b/benchmarks/kernels/benchmark_mxfp4_qutlass.py similarity index 100% rename from benchmarks/kernels/bench_mxfp4_qutlass.py rename to benchmarks/kernels/benchmark_mxfp4_qutlass.py diff --git a/benchmarks/kernels/bench_nvfp4_gemm.py b/benchmarks/kernels/benchmark_nvfp4_gemm.py similarity index 100% rename from benchmarks/kernels/bench_nvfp4_gemm.py rename to benchmarks/kernels/benchmark_nvfp4_gemm.py diff --git a/benchmarks/kernels/bench_nvfp4_quant.py b/benchmarks/kernels/benchmark_nvfp4_quant.py similarity index 100% rename from benchmarks/kernels/bench_nvfp4_quant.py rename to benchmarks/kernels/benchmark_nvfp4_quant.py diff --git a/benchmarks/kernels/bench_nvfp4_qutlass.py b/benchmarks/kernels/benchmark_nvfp4_qutlass.py similarity index 100% rename from benchmarks/kernels/bench_nvfp4_qutlass.py rename to benchmarks/kernels/benchmark_nvfp4_qutlass.py diff --git a/benchmarks/kernels/bench_per_token_quant_fp8.py b/benchmarks/kernels/benchmark_per_token_quant_fp8.py similarity index 99% rename from benchmarks/kernels/bench_per_token_quant_fp8.py rename to benchmarks/kernels/benchmark_per_token_quant_fp8.py index 7792cfd03..6ce97e303 100644 --- a/benchmarks/kernels/bench_per_token_quant_fp8.py +++ b/benchmarks/kernels/benchmark_per_token_quant_fp8.py @@ -7,6 +7,7 @@ from unittest.mock import patch import pandas as pd import torch +from vllm.benchmarks.lib.utils import default_vllm_config from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape from vllm.triton_utils import triton @@ -84,6 +85,7 @@ def calculate_diff( configs = [] +@default_vllm_config() def benchmark_quantization( batch_size, hidden_size, diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 7a1bc050b..5e1df3b29 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -5,6 +5,7 @@ import itertools import torch +from vllm.benchmarks.lib.utils import default_vllm_config from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.triton_utils import triton from vllm.utils.argparse_utils import FlexibleArgumentParser @@ -29,6 +30,7 @@ def get_benchmark(head_size, rotary_dim, is_neox_style, device): args={}, ) ) + @default_vllm_config() def benchmark(batch_size, seq_len, num_heads, provider): dtype = torch.bfloat16 max_position = 8192 diff --git a/vllm/benchmarks/lib/utils.py b/vllm/benchmarks/lib/utils.py index d3b6be869..99a3bf927 100644 --- a/vllm/benchmarks/lib/utils.py +++ b/vllm/benchmarks/lib/utils.py @@ -5,6 +5,7 @@ import argparse import json import math import os +from contextlib import contextmanager from typing import Any @@ -117,3 +118,14 @@ def write_to_json(filename: str, records: list) -> None: cls=InfEncoder, default=lambda o: f"<{type(o).__name__} is not JSON serializable>", ) + + +@contextmanager +def default_vllm_config(): + """Set a default VllmConfig for cases that directly test CustomOps or pathways + that use get_current_vllm_config() outside of a full engine context. + """ + from vllm.config import VllmConfig, set_current_vllm_config + + with set_current_vllm_config(VllmConfig()): + yield