# Patch summary (review notes only; the patch text below is unchanged).
#
# 1. Adds tests/benchmarks/test_bench_startup.py: a pytest test marked
#    `benchmark` that runs the command ["vllm", "bench", "startup"] through
#    subprocess.run with captured text output, prints stdout and stderr, and
#    asserts that the return code is 0.
# 2. Changes vllm/benchmarks/startup.py: run_startup_in_subprocess now takes
#    `engine_args` instead of `engine_args_dict`; the in-subprocess import of
#    EngineArgs and the `EngineArgs(**engine_args_dict)` rebuild are removed;
#    the code in the `def main(...)` hunk no longer calls
#    `dataclasses.asdict(engine_args)` and passes `engine_args` itself in the
#    multiprocessing.Process args tuple.
#
# NOTE(review): the removed comment said the dict conversion was "for
# pickling" - confirm that the engine_args object can be pickled under the
# multiprocessing start method in use before relying on this change.
# NOTE(review): `dataclasses` may no longer be referenced in startup.py after
# this change - verify, and drop the import if it is now unused.
# NOTE(review): the hunk headers declare multi-line hunks (e.g. "+1,19") but
# the patch text sits on a single line; the line breaks appear to have been
# lost and must be restored before the patch can be applied.
diff --git a/tests/benchmarks/test_bench_startup.py b/tests/benchmarks/test_bench_startup.py new file mode 100644 index 000000000..44c9bc9b7 --- /dev/null +++ b/tests/benchmarks/test_bench_startup.py @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import subprocess + +import pytest + + +@pytest.mark.benchmark +def test_bench_startup(): + command = [ + "vllm", + "bench", + "startup", + ] + result = subprocess.run(command, capture_output=True, text=True) + print(result.stdout) + print(result.stderr) + + assert result.returncode == 0, f"Benchmark failed: {result.stderr}" diff --git a/vllm/benchmarks/startup.py b/vllm/benchmarks/startup.py index 086f7bf62..d0d46a963 100644 --- a/vllm/benchmarks/startup.py +++ b/vllm/benchmarks/startup.py @@ -55,7 +55,7 @@ def cold_startup(): os.environ.pop("VLLM_CACHE_ROOT", None) -def run_startup_in_subprocess(engine_args_dict, result_queue): +def run_startup_in_subprocess(engine_args, result_queue): """ Run LLM startup in a subprocess and return timing metrics via a queue. This ensures complete isolation between iterations. @@ -63,9 +63,6 @@ def run_startup_in_subprocess(engine_args_dict, result_queue): try: # Import inside the subprocess to avoid issues with forking from vllm import LLM - from vllm.engine.arg_utils import EngineArgs - - engine_args = EngineArgs(**engine_args_dict) # Measure total startup time start_time = time.perf_counter() @@ -200,15 +197,13 @@ def main(args: argparse.Namespace): Create LLM instance in a subprocess and measure startup time. Returns timing metrics, using subprocess for complete isolation. """ - # Convert engine_args to dictionary for pickling - engine_args_dict = dataclasses.asdict(engine_args) # Create a queue for inter-process communication result_queue = multiprocessing.Queue() process = multiprocessing.Process( target=run_startup_in_subprocess, args=( - engine_args_dict, + engine_args, result_queue, ), )