diff --git a/docs/benchmarking/sweeps.md b/docs/benchmarking/sweeps.md
index 5571db0a5..156b9c0c0 100644
--- a/docs/benchmarking/sweeps.md
+++ b/docs/benchmarking/sweeps.md
@@ -102,36 +102,39 @@ By default, each parameter combination is benchmarked 3 times to make the result
 !!! tip
     You can use the `--resume` option to continue the parameter sweep if an unexpected error occurs, e.g., timeout when connecting to HF Hub.
 
-### SLA Scanner
+### Workload Explorer
 
-`vllm bench sweep serve_sla` is a variant of `vllm bench sweep serve` that scans through values of request rate or concurrency (choose using `--sla-variable`) in order to find the tradeoff between latency and throughput. The results can then be [visualized](#visualization) to determine the feasible SLAs.
+`vllm bench sweep serve_workload` is a variant of `vllm bench sweep serve` that explores different workload levels in order to find the tradeoff between latency and throughput. The results can then be [visualized](#visualization) to determine the feasible SLAs.
+
+The workload can be expressed in terms of request rate or concurrency (choose using `--workload-var`).
 
 Example command:
 
 ```bash
-vllm bench sweep serve_sla \
+vllm bench sweep serve_workload \
     --serve-cmd 'vllm serve meta-llama/Llama-2-7b-chat-hf' \
     --bench-cmd 'vllm bench serve --model meta-llama/Llama-2-7b-chat-hf --backend vllm --endpoint /v1/completions --dataset-name sharegpt --dataset-path benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 100' \
-    --sla-variable max_concurrency \
+    --workload-var max_concurrency \
     --serve-params benchmarks/serve_hparams.json \
-    --bench-params benchmarks/bench_hparams.json
+    --bench-params benchmarks/bench_hparams.json \
+    --num-runs 1 \
     -o benchmarks/results
 ```
 
-The algorithm for scanning through different values of `sla_variable` can be summarized as follows:
+The algorithm for exploring different workload levels can be summarized as follows:
 
-1. Run the benchmark by sending requests one at a time (serial inference). This results in the lowest possible latency and throughput.
-2. Run the benchmark by sending all requests at once (batch inference). This results in the highest possible latency and throughput.
-3. Estimate the maximum value of `sla_variable` that can be supported by the server without oversaturating it.
-4. Run the benchmark over intermediate values of `sla_variable` uniformly using the remaining iterations.
+1. Run the benchmark by sending requests one at a time (serial inference, lowest workload). This results in the lowest possible latency and throughput.
+2. Run the benchmark by sending all requests at once (batch inference, highest workload). This results in the highest possible latency and throughput.
+3. Estimate the values of `workload_var` corresponding to Steps 1 and 2.
+4. Run the benchmark over intermediate values of `workload_var`, spaced uniformly between these estimates, using the remaining iterations.
 
-You can override the number of iterations in the algorithm by setting `--sla-iters`.
+You can override the number of iterations in the algorithm by setting `--workload-iters`.
 
 !!! tip
     This is our equivalent of [GuideLLM's `--profile sweep`](https://github.com/vllm-project/guidellm/blob/v0.5.3/src/guidellm/benchmark/profiles.py#L575).
 
-    In general, `--sla-variable max_concurrency` produces more reliable results because it directly controls the workload imposed on the vLLM engine.
-    Nevertheless, we default to `--sla-variable request_rate` to maintain similar behavior as GuideLLM.
+    In general, `--workload-var max_concurrency` produces more reliable results because it directly controls the workload imposed on the vLLM engine.
+    Nevertheless, we default to `--workload-var request_rate` to maintain behavior similar to GuideLLM's.
 
 ## Startup Benchmark
 
@@ -198,7 +201,7 @@ vllm bench sweep startup \
 
 Control the variables to plot via `--var-x` and `--var-y`, optionally applying `--filter-by` and `--bin-by` to the values. The plot is organized according to `--fig-by`, `--row-by`, `--col-by`, and `--curve-by`.
 
-Example commands for visualizing [SLA Scanner](#sla-scanner) results:
+Example commands for visualizing [Workload Explorer](#workload-explorer) results:
 
 ```bash
 # Name of the directory that stores the results
diff --git a/docs/cli/bench/sweep/serve_sla.md b/docs/cli/bench/sweep/serve_sla.md
deleted file mode 100644
index 688d64f0b..000000000
--- a/docs/cli/bench/sweep/serve_sla.md
+++ /dev/null
@@ -1,9 +0,0 @@
-# vllm bench sweep serve_sla
-
-## JSON CLI Arguments
-
---8<-- "docs/cli/json_tip.inc.md"
-
-## Arguments
-
---8<-- "docs/generated/argparse/bench_sweep_serve_sla.inc.md"
diff --git a/docs/cli/bench/sweep/serve_workload.md b/docs/cli/bench/sweep/serve_workload.md
new file mode 100644
index 000000000..8c21788e8
--- /dev/null
+++ b/docs/cli/bench/sweep/serve_workload.md
@@ -0,0 +1,9 @@
+# vllm bench sweep serve_workload
+
+## JSON CLI Arguments
+
+--8<-- "docs/cli/json_tip.inc.md"
+
+## Arguments
+
+--8<-- "docs/generated/argparse/bench_sweep_serve_workload.inc.md"
diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py
index 801cc8a05..9d87f88f5 100644
--- a/docs/mkdocs/hooks/generate_argparse.py
+++ b/docs/mkdocs/hooks/generate_argparse.py
@@ -100,8 +100,8 @@ bench_sweep_plot_pareto = auto_mock(
     "vllm.benchmarks.sweep.plot_pareto", "SweepPlotParetoArgs"
 )
 bench_sweep_serve = auto_mock("vllm.benchmarks.sweep.serve", "SweepServeArgs")
-bench_sweep_serve_sla = auto_mock(
-    "vllm.benchmarks.sweep.serve_sla", "SweepServeSLAArgs"
+bench_sweep_serve_workload = auto_mock(
+    "vllm.benchmarks.sweep.serve_workload", "SweepServeWorkloadArgs"
 )
 bench_throughput = auto_mock("vllm.benchmarks", "throughput")
 AsyncEngineArgs = auto_mock("vllm.engine.arg_utils", "AsyncEngineArgs")
@@ -229,7 +229,9 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
         "bench_sweep_plot": create_parser(bench_sweep_plot.add_cli_args),
         "bench_sweep_plot_pareto": create_parser(bench_sweep_plot_pareto.add_cli_args),
         "bench_sweep_serve": create_parser(bench_sweep_serve.add_cli_args),
-        "bench_sweep_serve_sla": create_parser(bench_sweep_serve_sla.add_cli_args),
+        "bench_sweep_serve_workload": create_parser(
+            bench_sweep_serve_workload.add_cli_args
+        ),
         "bench_throughput": create_parser(bench_throughput.add_cli_args),
     }
diff --git a/vllm/benchmarks/sweep/cli.py b/vllm/benchmarks/sweep/cli.py
index a752000f9..75549105f 100644
--- a/vllm/benchmarks/sweep/cli.py
+++ b/vllm/benchmarks/sweep/cli.py
@@ -10,14 +10,14 @@ from .plot_pareto import SweepPlotParetoArgs
 from .plot_pareto import main as plot_pareto_main
 from .serve import SweepServeArgs
 from .serve import main as serve_main
-from .serve_sla import SweepServeSLAArgs
-from .serve_sla import main as serve_sla_main
+from .serve_workload import SweepServeWorkloadArgs
+from .serve_workload import main as serve_workload_main
 from .startup import SweepStartupArgs
 from .startup import main as startup_main
 
 SUBCOMMANDS = (
     (SweepServeArgs, serve_main),
-    (SweepServeSLAArgs, serve_sla_main),
+    (SweepServeWorkloadArgs, serve_workload_main),
     (SweepStartupArgs, startup_main),
     (SweepPlotArgs, plot_main),
     (SweepPlotParetoArgs, plot_pareto_main),
diff --git a/vllm/benchmarks/sweep/serve_sla.py b/vllm/benchmarks/sweep/serve_workload.py
similarity index 61%
rename from vllm/benchmarks/sweep/serve_sla.py
rename to vllm/benchmarks/sweep/serve_workload.py
index 38d54ea42..3da403a84 100644
--- a/vllm/benchmarks/sweep/serve_sla.py
+++ b/vllm/benchmarks/sweep/serve_workload.py
@@ -28,25 +28,32 @@ except ImportError:
     pd = PlaceholderModule("pandas")
 
 
-SLAVariable = Literal["request_rate", "max_concurrency"]
+WorkloadVariable = Literal["request_rate", "max_concurrency"]
 
 
-def _estimate_sla_value(run_data: dict[str, object], sla_variable: SLAVariable):
+def _estimate_workload_value(
+    run_data: dict[str, object],
+    workload_var: WorkloadVariable,
+):
     request_throughput = float(run_data["request_throughput"])  # type: ignore
 
-    if sla_variable == "request_rate":
+    if workload_var == "request_rate":
         return request_throughput
 
-    if sla_variable == "max_concurrency":
+    if workload_var == "max_concurrency":
         mean_latency_ms = float(run_data["mean_e2el_ms"])  # type: ignore
         return request_throughput * mean_latency_ms / 1000
 
-    assert_never(sla_variable)
+    assert_never(workload_var)
 
 
-def _estimate_sla_avg(runs: list[dict[str, object]], sla_variable: SLAVariable):
-    return sum(_estimate_sla_value(run, sla_variable) for run in runs) / len(runs)
+def _estimate_workload_avg(
+    runs: list[dict[str, object]],
+    workload_var: WorkloadVariable,
+):
+    total = sum(_estimate_workload_value(run, workload_var) for run in runs)
+    return total / len(runs)
 
 
-def run_comb_sla(
+def run_comb_workload(
     server: ServerProcess | None,
     bench_cmd: list[str],
     *,
@@ -56,21 +63,21 @@ def run_comb_sla(
     num_runs: int,
     dry_run: bool,
     link_vars: list[tuple[str, str]],
-    sla_variable: SLAVariable,
-    sla_value: int,
+    workload_var: WorkloadVariable,
+    workload_value: int,
 ) -> list[dict[str, object]] | None:
-    bench_comb_sla = bench_comb | {sla_variable: sla_value}
+    bench_comb_workload = bench_comb | {workload_var: workload_value}
 
     return run_comb(
         server,
         bench_cmd,
         serve_comb=serve_comb,
-        bench_comb=bench_comb_sla,
+        bench_comb=bench_comb_workload,
         base_path=_get_comb_base_path(
             output_dir,
             serve_comb,
             bench_comb,
-            extra_parts=("SLA-", f"{sla_variable}={sla_value}"),
+            extra_parts=("WL-", f"{workload_var}={workload_value}"),
         ),
         num_runs=num_runs,
         dry_run=dry_run,
     )
 
 
-def explore_sla(
+def explore_comb_workloads(
     server: ServerProcess | None,
     bench_cmd: list[str],
     *,
     serve_comb: ParameterSweepItem,
     bench_comb: ParameterSweepItem,
-    sla_variable: SLAVariable,
-    sla_iters: int,
+    workload_var: WorkloadVariable,
+    workload_iters: int,
     output_dir: Path,
     num_runs: int,
     dry_run: bool,
     link_vars: list[tuple[str, str]],
 ):
-    print("[SLA START]")
+    print("[WL START]")
     print(f"Serve parameters: {serve_comb.as_text() or '(None)'}")
     print(f"Bench parameters: {bench_comb.as_text() or '(None)'}")
-    print(f"Number of SLA iterations: {sla_iters}")
+    print(f"Number of workload iterations: {workload_iters}")
 
-    if sla_iters < 2:
-        raise ValueError("`sla_iters` should be at least 2")
+    if workload_iters < 2:
+        raise ValueError("`workload_iters` should be at least 2")
 
     dataset_size = DEFAULT_NUM_PROMPTS
     if "num_prompts" in bench_comb:
@@ -113,7 +120,7 @@ def explore_sla(
 
     print(f"Dataset size: {dataset_size}")
 
-    serial_comb_data = run_comb_sla(
+    serial_workload_data = run_comb_workload(
         server,
         bench_cmd,
         serve_comb=serve_comb,
@@ -122,10 +129,10 @@ def explore_sla(
         num_runs=num_runs,
         dry_run=dry_run,
         link_vars=link_vars,
-        sla_variable=sla_variable,
-        sla_value=1,
+        workload_var=workload_var,
+        workload_value=1,
     )
-    batch_comb_data = run_comb_sla(
+    batch_workload_data = run_comb_workload(
         server,
         bench_cmd,
         serve_comb=serve_comb,
@@ -134,32 +141,38 @@ def explore_sla(
         num_runs=num_runs,
         dry_run=dry_run,
         link_vars=link_vars,
-        sla_variable=sla_variable,
-        sla_value=dataset_size,
+        workload_var=workload_var,
+        workload_value=dataset_size,
     )
 
-    if serial_comb_data is None or batch_comb_data is None:
+    if serial_workload_data is None or batch_workload_data is None:
         if dry_run:
-            print("Omitting intermediate SLA iterations.")
-            print("[SLA END]")
+            print("Omitting intermediate workload iterations.")
+            print("[WL END]")
 
         return
 
-    serial_sla_value = math.ceil(_estimate_sla_avg(serial_comb_data, sla_variable))
-    print(f"Serial inference: {sla_variable}={serial_sla_value}")
+    serial_workload_value = math.ceil(
+        _estimate_workload_avg(serial_workload_data, workload_var)
+    )
+    print(f"Serial inference: {workload_var}={serial_workload_value}")
 
-    batch_sla_value = math.floor(_estimate_sla_avg(batch_comb_data, sla_variable))
-    print(f"Batch inference: {sla_variable}={batch_sla_value}")
+    batch_workload_value = math.floor(
+        _estimate_workload_avg(batch_workload_data, workload_var)
+    )
+    print(f"Batch inference: {workload_var}={batch_workload_value}")
 
     # Avoid duplicated runs for intermediate values if the range between
-    # `serial_sla_value` and `batch_sla_value` is small
-    inter_sla_values = np.linspace(serial_sla_value, batch_sla_value, sla_iters)[1:-1]
-    inter_sla_values = sorted(set(map(round, inter_sla_values)))
+    # `serial_workload_value` and `batch_workload_value` is small
+    inter_workload_values = np.linspace(
+        serial_workload_value, batch_workload_value, workload_iters
+    )[1:-1]
+    inter_workload_values = sorted(set(map(round, inter_workload_values)))
 
-    inter_combs_data: list[dict[str, object]] = []
-    for inter_sla_value in inter_sla_values:
-        print(f"Exploring: {sla_variable}={inter_sla_value}")
-        inter_comb_data = run_comb_sla(
+    inter_workloads_data: list[dict[str, object]] = []
+    for inter_workload_value in inter_workload_values:
+        print(f"Exploring: {workload_var}={inter_workload_value}")
+        inter_workload_data = run_comb_workload(
             server,
             bench_cmd,
             serve_comb=serve_comb,
@@ -168,18 +181,18 @@ def explore_sla(
             num_runs=num_runs,
             dry_run=dry_run,
             link_vars=link_vars,
-            sla_variable=sla_variable,
-            sla_value=inter_sla_value,
+            workload_var=workload_var,
+            workload_value=inter_workload_value,
         )
-        if inter_comb_data is not None:
-            inter_combs_data.extend(inter_comb_data)
+        if inter_workload_data is not None:
+            inter_workloads_data.extend(inter_workload_data)
 
-    print("[SLA END]")
+    print("[WL END]")
 
-    return serial_comb_data + inter_combs_data + batch_comb_data
+    return serial_workload_data + inter_workloads_data + batch_workload_data
 
 
-def run_slas(
+def explore_combs_workloads(
     serve_cmd: list[str],
     bench_cmd: list[str],
     after_bench_cmd: list[str],
@@ -188,17 +201,17 @@ def run_slas(
     server_ready_timeout: int,
     serve_params: ParameterSweep,
     bench_params: ParameterSweep,
-    sla_variable: SLAVariable,
-    sla_iters: int,
+    workload_var: WorkloadVariable,
+    workload_iters: int,
     output_dir: Path,
     num_runs: int,
     dry_run: bool,
     link_vars: list[tuple[str, str]],
 ):
-    if any(bench_comb.has_param(sla_variable) for bench_comb in bench_params):
+    if any(bench_comb.has_param(workload_var) for bench_comb in bench_params):
         raise ValueError(
-            f"You should not override `{sla_variable}` in `bench_params` in SLA mode, "
-            "since it is supposed to be determined automatically."
+            f"You should not override `{workload_var}` in `bench_params` "
+            "since it is supposed to be explored automatically."
         )
 
     all_data = list[dict[str, object]]()
@@ -214,13 +227,13 @@ def run_slas(
             dry_run=dry_run,
         ) as server:
             for bench_comb in bench_params:
-                comb_data = explore_sla(
+                comb_data = explore_comb_workloads(
                     server,
                     bench_cmd,
                     serve_comb=serve_comb,
                     bench_comb=bench_comb,
-                    sla_variable=sla_variable,
-                    sla_iters=sla_iters,
+                    workload_var=workload_var,
+                    workload_iters=workload_iters,
                     output_dir=output_dir,
                     num_runs=num_runs,
                     dry_run=dry_run,
@@ -240,13 +253,13 @@
 
 
 @dataclass
-class SweepServeSLAArgs(SweepServeArgs):
-    sla_variable: SLAVariable
-    sla_iters: int
+class SweepServeWorkloadArgs(SweepServeArgs):
+    workload_var: WorkloadVariable
+    workload_iters: int
 
-    parser_name: ClassVar[str] = "serve_sla"
+    parser_name: ClassVar[str] = "serve_workload"
     parser_help: ClassVar[str] = (
-        "Explore the latency-throughput space for determining SLAs."
+        "Explore the latency-throughput tradeoff for different workload levels."
     )
 
     @classmethod
@@ -256,35 +269,35 @@ class SweepServeSLAArgs(SweepServeArgs):
 
         return cls(
             **asdict(base_args),
-            sla_variable=args.sla_variable,
-            sla_iters=args.sla_iters,
+            workload_var=args.workload_var,
+            workload_iters=args.workload_iters,
         )
 
     @classmethod
     def add_cli_args(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
         parser = super().add_cli_args(parser)
 
-        sla_group = parser.add_argument_group("sla options")
-        sla_group.add_argument(
-            "--sla-variable",
+        workload_group = parser.add_argument_group("workload options")
+        workload_group.add_argument(
+            "--workload-var",
             type=str,
-            choices=get_args(SLAVariable),
+            choices=get_args(WorkloadVariable),
             default="request_rate",
             help="The variable to adjust in each iteration.",
         )
-        sla_group.add_argument(
-            "--sla-iters",
+        workload_group.add_argument(
+            "--workload-iters",
             type=int,
             default=10,
-            help="Number of iterations used to explore the latency-throughput space. "
+            help="Number of workload levels to explore. "
" "This includes the first two iterations used to interpolate the value of " - "`sla_variable` for remaining iterations.", + "`workload_var` for remaining iterations.", ) return parser -def run_main(args: SweepServeSLAArgs): +def run_main(args: SweepServeWorkloadArgs): timestamp = args.resume or datetime.now().strftime("%Y%m%d_%H%M%S") output_dir = args.output_dir / timestamp @@ -292,7 +305,7 @@ def run_main(args: SweepServeSLAArgs): raise ValueError(f"Cannot resume from non-existent directory ({output_dir})") try: - return run_slas( + return explore_combs_workloads( serve_cmd=args.serve_cmd, bench_cmd=args.bench_cmd, after_bench_cmd=args.after_bench_cmd, @@ -300,8 +313,8 @@ def run_main(args: SweepServeSLAArgs): server_ready_timeout=args.server_ready_timeout, serve_params=args.serve_params, bench_params=args.bench_params, - sla_variable=args.sla_variable, - sla_iters=args.sla_iters, + workload_var=args.workload_var, + workload_iters=args.workload_iters, output_dir=output_dir, num_runs=args.num_runs, dry_run=args.dry_run, @@ -315,11 +328,11 @@ def run_main(args: SweepServeSLAArgs): def main(args: argparse.Namespace): - run_main(SweepServeSLAArgs.from_cli_args(args)) + run_main(SweepServeWorkloadArgs.from_cli_args(args)) if __name__ == "__main__": - parser = argparse.ArgumentParser(description=SweepServeSLAArgs.parser_help) - SweepServeSLAArgs.add_cli_args(parser) + parser = argparse.ArgumentParser(description=SweepServeWorkloadArgs.parser_help) + SweepServeWorkloadArgs.add_cli_args(parser) main(parser.parse_args())