diff --git a/docs/benchmarking/sweeps.md b/docs/benchmarking/sweeps.md
index e0a7a1b6d..5571db0a5 100644
--- a/docs/benchmarking/sweeps.md
+++ b/docs/benchmarking/sweeps.md
@@ -112,6 +112,7 @@ Example command:
 vllm bench sweep serve_sla \
     --serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
     --bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 100' \
+    --sla-variable max_concurrency \
     --serve-params benchmarks/serve_hparams.json \
     --bench-params benchmarks/bench_hparams.json \
     -o benchmarks/results
@@ -119,8 +120,8 @@ vllm bench sweep serve_sla \
 
 The algorithm for scanning through different values of `sla_variable` can be summarized as follows:
 
-1. Run the benchmark once with `sla_variable = 1` to simulate serial inference. This results in the lowest possible latency and throughput.
-2. Run the benchmark once with `sla_variable = num_prompts` to simulate batch inference over the whole dataset. This results in the highest possible latency and throughput.
+1. Run the benchmark by sending requests one at a time (serial inference). This results in the lowest possible latency and throughput.
+2. Run the benchmark by sending all requests at once (batch inference). This results in the highest possible latency and throughput.
 3. Estimate the maximum value of `sla_variable` that can be supported by the server without oversaturating it.
 4. Run the benchmark over intermediate values of `sla_variable` uniformly using the remaining iterations.
 
@@ -129,6 +130,9 @@ You can override the number of iterations in the algorithm by setting `--sla-ite
 !!! tip
     This is our equivalent of [GuideLLM's `--profile sweep`](https://github.com/vllm-project/guidellm/blob/v0.5.3/src/guidellm/benchmark/profiles.py#L575).
 
+    In general, `--sla-variable max_concurrency` produces more reliable results because it directly controls the workload imposed on the vLLM engine.
+    Nevertheless, we default to `--sla-variable request_rate` to maintain behavior similar to GuideLLM's.
+
 ## Startup Benchmark
 
 `vllm bench sweep startup` runs `vllm bench startup` across parameter combinations to compare cold/warm startup time for different engine settings.
@@ -197,23 +201,32 @@ Control the variables to plot via `--var-x` and `--var-y`, optionally applying `
 Example commands for visualizing [SLA Scanner](#sla-scanner) results:
 
 ```bash
-# Latency increases as the request rate increases
-vllm bench sweep plot benchmarks/results/ \
-    --var-x request_rate \
-    --var-y p99_ttft_ms \
-    --row-by random_input_len \
-    --col-by random_output_len \
+# Name of the directory that stores the results
+TIMESTAMP=$1
+
+# Latency increases as the workload increases
+vllm bench sweep plot benchmarks/results/$TIMESTAMP \
+    --var-x max_concurrency \
+    --var-y median_ttft_ms \
+    --col-by _benchmark_name \
     --curve-by max_num_seqs,max_num_batched_tokens \
-    --filter-by 'request_rate<=128'
+    --fig-name latency_curve
+
+# Throughput saturates as workload increases
+vllm bench sweep plot benchmarks/results/$TIMESTAMP \
+    --var-x max_concurrency \
+    --var-y total_token_throughput \
+    --col-by _benchmark_name \
+    --curve-by max_num_seqs,max_num_batched_tokens \
+    --fig-name throughput_curve
 
 # Tradeoff between latency and throughput
-vllm bench sweep plot benchmarks/results/ \
-    --var-x request_throughput \
+vllm bench sweep plot benchmarks/results/$TIMESTAMP \
+    --var-x total_token_throughput \
     --var-y median_ttft_ms \
-    --row-by random_input_len \
-    --col-by random_output_len \
+    --col-by _benchmark_name \
     --curve-by max_num_seqs,max_num_batched_tokens \
-    --filter-by 'request_rate<=128'
+    --fig-name latency_throughput
 ```
 
 !!! tip
diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index a8b6b2161..0cd76d891 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -60,6 +60,8 @@ except ImportError:
 
 logger = logging.getLogger(__name__)
 
+DEFAULT_NUM_PROMPTS = 1000
+
 # -----------------------------------------------------------------------------
 # Data Classes
 # -----------------------------------------------------------------------------
@@ -1338,7 +1340,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
     parser.add_argument(
         "--num-prompts",
         type=int,
-        default=1000,
+        default=DEFAULT_NUM_PROMPTS,
         help="Number of prompts to process.",
     )
     parser.add_argument(
diff --git a/vllm/benchmarks/sweep/plot.py b/vllm/benchmarks/sweep/plot.py
index 53c7db387..4f9184f95 100644
--- a/vllm/benchmarks/sweep/plot.py
+++ b/vllm/benchmarks/sweep/plot.py
@@ -324,6 +324,11 @@ def _plot_fig(
     df = filter_by.apply(df)
     df = bin_by.apply(df)
 
+    if len(df) == 0:
+        print(f"No data to plot. Filters: {filter_by}")
+        print("[END FIGURE]")
+        return
+
     # Sort by curve_by columns alphabetically for consistent legend ordering
     if curve_by:
         df = df.sort_values(by=curve_by)
@@ -570,13 +575,13 @@ class SweepPlotArgs:
         parser.add_argument(
             "--var-x",
             type=str,
-            default="request_throughput",
+            default="total_token_throughput",
             help="The variable for the x-axis.",
         )
         parser.add_argument(
             "--var-y",
             type=str,
-            default="p99_ttft_ms",
+            default="median_ttft_ms",
             help="The variable for the y-axis",
         )
         parser.add_argument(
diff --git a/vllm/benchmarks/sweep/serve.py b/vllm/benchmarks/sweep/serve.py
index 7420f2518..4ab2dab5f 100644
--- a/vllm/benchmarks/sweep/serve.py
+++ b/vllm/benchmarks/sweep/serve.py
@@ -138,12 +138,16 @@ def _get_comb_base_path(
     output_dir: Path,
     serve_comb: ParameterSweepItem,
     bench_comb: ParameterSweepItem,
+    *,
+    extra_parts: tuple[str, ...] = (),
 ):
     parts = list[str]()
     if serve_comb:
         parts.extend(("SERVE-", serve_comb.name))
     if bench_comb:
         parts.extend(("BENCH-", bench_comb.name))
+    if extra_parts:
+        parts.extend(extra_parts)
 
     return output_dir / sanitize_filename("-".join(parts))
 
diff --git a/vllm/benchmarks/sweep/serve_sla.py b/vllm/benchmarks/sweep/serve_sla.py
index 89169ec15..38d54ea42 100644
--- a/vllm/benchmarks/sweep/serve_sla.py
+++ b/vllm/benchmarks/sweep/serve_sla.py
@@ -10,6 +10,7 @@ from typing import ClassVar, Literal, get_args
 import numpy as np
 from typing_extensions import assert_never
 
+from vllm.benchmarks.datasets import DEFAULT_NUM_PROMPTS
 from vllm.utils.import_utils import PlaceholderModule
 
 from .param_sweep import ParameterSweep, ParameterSweepItem
@@ -65,7 +66,12 @@ def run_comb_sla(
         bench_cmd,
         serve_comb=serve_comb,
         bench_comb=bench_comb_sla,
-        base_path=_get_comb_base_path(output_dir, serve_comb, bench_comb_sla),
+        base_path=_get_comb_base_path(
+            output_dir,
+            serve_comb,
+            bench_comb,
+            extra_parts=("SLA-", f"{sla_variable}={sla_value}"),
+        ),
         num_runs=num_runs,
         dry_run=dry_run,
         link_vars=link_vars,
@@ -93,11 +99,25 @@ def explore_sla(
     if sla_iters < 2:
         raise ValueError("`sla_iters` should be at least 2")
 
+    dataset_size = DEFAULT_NUM_PROMPTS
+    if "num_prompts" in bench_comb:
+        dataset_size = int(bench_comb["num_prompts"])  # type: ignore
+    else:
+        for i, arg in enumerate(bench_cmd):
+            if arg == "--num-prompts" and i + 1 < len(bench_cmd):
+                dataset_size = int(bench_cmd[i + 1])
+                break
+            elif arg.startswith("--num-prompts="):
+                dataset_size = int(arg.split("=", 1)[1])
+                break
+
+    print(f"Dataset size: {dataset_size}")
+
     serial_comb_data = run_comb_sla(
         server,
         bench_cmd,
         serve_comb=serve_comb,
-        bench_comb=bench_comb,
+        bench_comb=bench_comb | {"max_concurrency": 1},
         output_dir=output_dir,
         num_runs=num_runs,
         dry_run=dry_run,
@@ -109,13 +129,13 @@ def explore_sla(
         server,
         bench_cmd,
         serve_comb=serve_comb,
-        bench_comb=bench_comb,
+        bench_comb=bench_comb | {"max_concurrency": dataset_size},
         output_dir=output_dir,
         num_runs=num_runs,
         dry_run=dry_run,
         link_vars=link_vars,
         sla_variable=sla_variable,
-        sla_value=int(bench_comb.get("num_prompts", 1000)),  # type: ignore
+        sla_value=dataset_size,
     )
 
     if serial_comb_data is None or batch_comb_data is None:
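For reference, below is a minimal standalone sketch of the dataset-size detection that this patch inlines in `explore_sla`. The helper name `infer_dataset_size` is invented here for illustration and is not part of the patch; the logic and precedence are copied from the hunk above: an explicit `num_prompts` in the benchmark parameter combination wins over `--num-prompts` in the benchmark command, with `DEFAULT_NUM_PROMPTS` (1000) as the fallback that keeps the scanner in sync with the `vllm bench serve` default.

```python
# Hypothetical extraction of the logic added to explore_sla() above.
DEFAULT_NUM_PROMPTS = 1000  # mirrors the new constant in vllm/benchmarks/datasets.py


def infer_dataset_size(bench_comb: dict, bench_cmd: list[str]) -> int:
    # A sweep parameter combination overrides the benchmark command.
    if "num_prompts" in bench_comb:
        return int(bench_comb["num_prompts"])
    # Otherwise, accept both `--num-prompts 100` and `--num-prompts=100`.
    for i, arg in enumerate(bench_cmd):
        if arg == "--num-prompts" and i + 1 < len(bench_cmd):
            return int(bench_cmd[i + 1])
        if arg.startswith("--num-prompts="):
            return int(arg.split("=", 1)[1])
    return DEFAULT_NUM_PROMPTS


if __name__ == "__main__":
    cmd = ["vllm", "bench", "serve", "--num-prompts", "100"]
    assert infer_dataset_size({}, cmd) == 100
    assert infer_dataset_size({"num_prompts": 50}, cmd) == 50
    assert infer_dataset_size({}, ["vllm", "bench", "serve"]) == DEFAULT_NUM_PROMPTS
```

The inferred size then defines the two endpoints of the SLA scan: the serial run pins `max_concurrency` to 1, and the batch run sets `max_concurrency` to the dataset size so that all requests are in flight at once.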