benchmarks/benchmark_latency.py

# SPDX-License-Identifier: Apache-2.0
"""Benchmark the latency of processing a single batch of requests."""

import argparse
import dataclasses
import json
import os
import time
from pathlib import Path
from typing import Any, Optional

import numpy as np
import torch
from tqdm import tqdm

from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptType
from vllm.sampling_params import BeamSearchParams
from vllm.utils import FlexibleArgumentParser


def save_to_pytorch_benchmark_format(
    args: argparse.Namespace, results: dict[str, Any]
) -> None:
    """Write the latency results as PyTorch-benchmark-format records.

    The records are produced by ``convert_to_pytorch_benchmark_format`` and,
    when that call returns something truthy, written next to the regular JSON
    output: the extension of ``args.output_json`` is replaced by
    ``.pytorch.json``.

    Args:
        args: Parsed CLI arguments; ``args.output_json`` provides the base
            path of the output file.
        results: Benchmark results; must contain the keys ``"latencies"``,
            ``"avg_latency"`` and ``"percentiles"``.
    """
    metrics = {"latency": results["latencies"]}
    extra_info = {
        "avg_latency": results["avg_latency"],
        "percentiles": results["percentiles"],
    }
    records = convert_to_pytorch_benchmark_format(
        args=args,
        metrics=metrics,
        extra_info=extra_info,
    )

    # Nothing to persist when the converter yields no records.
    if not records:
        return

    base_path, _ = os.path.splitext(args.output_json)
    write_to_json(base_path + ".pytorch.json", records)


def main(args: argparse.Namespace):
    """Run the single-batch latency benchmark, or one profiled iteration.

    Steps:

    1. Build an ``LLM`` from the engine arguments contained in ``args``.
    2. Create ``args.batch_size`` dummy prompts of ``args.input_len`` random
       token ids each.
    3. Run ``args.num_iters_warmup`` warmup iterations whose timings are
       discarded.
    4. If ``args.profile`` is set, run one iteration under
       ``torch.profiler``, print the profiler table and return.
    5. Otherwise run ``args.num_iters`` timed iterations, print the mean and
       the 10/25/50/75/90/99th percentile latencies, and, when
       ``args.output_json`` is set, write the results to that file and to
       the PyTorch benchmark format file.

    Args:
        args: Parsed command-line arguments (benchmark options plus the
            engine options registered by ``EngineArgs.add_cli_args``).

    Raises:
        ValueError: If the model's ``max_model_len`` is smaller than
            ``args.input_len + args.output_len``.
    """
    print(args)

    engine_args = EngineArgs.from_cli_args(args)

    # NOTE(woosuk): If the request cannot be processed in a single batch,
    # the engine will automatically process the request in multiple batches.
    llm = LLM(**dataclasses.asdict(engine_args))

    # Raise explicitly instead of using `assert`: assertions are removed when
    # Python runs with -O, which would silently skip this validation.
    max_model_len = llm.llm_engine.model_config.max_model_len
    if max_model_len < args.input_len + args.output_len:
        raise ValueError(
            "Please ensure that max_model_len is greater than or equal to"
            " the sum of input_len and output_len."
        )

    sampling_params = SamplingParams(
        n=args.n,
        temperature=1.0,
        top_p=1.0,
        ignore_eos=True,
        max_tokens=args.output_len,
        detokenize=not args.disable_detokenize,
    )
    print(sampling_params)

    # Random token ids drawn from [0, 10000), one row per prompt.
    dummy_prompt_token_ids = np.random.randint(
        10000, size=(args.batch_size, args.input_len)
    )
    dummy_prompts: list[PromptType] = [
        {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
    ]

    def llm_generate():
        """Run one generation pass over the dummy prompts."""
        if not args.use_beam_search:
            llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
        else:
            # In beam-search mode, --n is used as the beam width.
            llm.beam_search(
                dummy_prompts,
                BeamSearchParams(
                    beam_width=args.n,
                    max_tokens=args.output_len,
                    ignore_eos=True,
                ),
            )

    def run_to_completion(profile_dir: Optional[str] = None) -> Optional[float]:
        """Run one iteration, either profiled or timed.

        Args:
            profile_dir: When truthy, the iteration runs under
                ``torch.profiler`` with traces written to this directory and
                nothing is returned. Otherwise the iteration is timed.

        Returns:
            The wall-clock latency in seconds for a timed iteration, or
            ``None`` for a profiled iteration.
        """
        if profile_dir:
            with torch.profiler.profile(
                activities=[
                    torch.profiler.ProfilerActivity.CPU,
                    torch.profiler.ProfilerActivity.CUDA,
                ],
                on_trace_ready=torch.profiler.tensorboard_trace_handler(
                    str(profile_dir)
                ),
            ) as p:
                llm_generate()
            print(p.key_averages().table(sort_by="self_cuda_time_total"))
        else:
            start_time = time.perf_counter()
            llm_generate()
            end_time = time.perf_counter()
            latency = end_time - start_time
            return latency

    print("Warming up...")
    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
        run_to_completion(profile_dir=None)

    if args.profile:
        profile_dir = args.profile_result_dir
        if not profile_dir:
            # Default to a timestamped directory; converted to str so the
            # value matches run_to_completion's declared parameter type.
            profile_dir = str(
                Path(".") / "vllm_benchmark_result" / f"latency_result_{time.time()}"
            )
        print(f"Profiling (results will be saved to '{profile_dir}')...")
        run_to_completion(profile_dir=profile_dir)
        return

    # Benchmark.
    # NOTE: the progress-bar label below says "Profiling", but this loop is
    # the timed benchmark; the label is kept unchanged for output
    # compatibility.
    latencies = []
    for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
        latencies.append(run_to_completion(profile_dir=None))
    latencies = np.array(latencies)
    percentages = [10, 25, 50, 75, 90, 99]
    percentiles = np.percentile(latencies, percentages)
    print(f"Avg latency: {np.mean(latencies)} seconds")
    for percentage, percentile in zip(percentages, percentiles):
        print(f"{percentage}% percentile latency: {percentile} seconds")

    # Output JSON results if specified
    if args.output_json:
        results = {
            "avg_latency": np.mean(latencies),
            "latencies": latencies.tolist(),
            # The integer percentage keys are written as strings by json.
            "percentiles": dict(zip(percentages, percentiles.tolist())),
        }
        with open(args.output_json, "w") as f:
            json.dump(results, f, indent=4)
        save_to_pytorch_benchmark_format(args, results)


if __name__ == "__main__":
    # Benchmark-specific options as (flag, add_argument keyword arguments)
    # pairs, listed in the order in which they are registered.
    benchmark_options = [
        ("--input-len", {"type": int, "default": 32}),
        ("--output-len", {"type": int, "default": 128}),
        ("--batch-size", {"type": int, "default": 8}),
        (
            "--n",
            {
                "type": int,
                "default": 1,
                "help": "Number of generated sequences per prompt.",
            },
        ),
        ("--use-beam-search", {"action": "store_true"}),
        (
            "--num-iters-warmup",
            {
                "type": int,
                "default": 10,
                "help": "Number of iterations to run for warmup.",
            },
        ),
        (
            "--num-iters",
            {
                "type": int,
                "default": 30,
                "help": "Number of iterations to run.",
            },
        ),
        (
            "--profile",
            {
                "action": "store_true",
                "help": "profile the generation process of a single batch",
            },
        ),
        (
            "--profile-result-dir",
            {
                "type": str,
                "default": None,
                "help": (
                    "path to save the pytorch profiler output. Can be visualized "
                    "with ui.perfetto.dev or Tensorboard."
                ),
            },
        ),
        (
            "--output-json",
            {
                "type": str,
                "default": None,
                "help": "Path to save the latency results in JSON format.",
            },
        ),
        (
            "--disable-detokenize",
            {
                "action": "store_true",
                "help": (
                    "Do not detokenize responses (i.e. do not include "
                    "detokenization time in the latency measurement)"
                ),
            },
        ),
    ]

    cli = FlexibleArgumentParser(
        description="Benchmark the latency of processing a single batch of "
        "requests till completion."
    )
    for flag, options in benchmark_options:
        cli.add_argument(flag, **options)

    # Engine options are registered after the benchmark-specific ones.
    cli = EngineArgs.add_cli_args(cli)
    main(cli.parse_args())
[Misc] Add SPDX-License-Identifier headers to python source files (#12628) - Add SPDX license headers to python source files - Check for SPDX headers using pre-commit commit 9d7ef44c3cfb72ca4c32e1c677d99259d10d4745 Author: Russell Bryant <rbryant@redhat.com> Date: Fri Jan 31 14:18:24 2025 -0500 Add SPDX license headers to python source files This commit adds SPDX license headers to python source files as recommended to the project by the Linux Foundation. These headers provide a concise way that is both human and machine readable for communicating license information for each source file. It helps avoid any ambiguity about the license of the code and can also be easily used by tools to help manage license compliance. The Linux Foundation runs license scans against the codebase to help ensure we are in compliance with the licenses of the code we use, including dependencies. Having these headers in place helps that tool do its job. More information can be found on the SPDX site: - https://spdx.dev/learn/handling-license-info/ Signed-off-by: Russell Bryant <rbryant@redhat.com> commit 5a1cf1cb3b80759131c73f6a9dddebccac039dea Author: Russell Bryant <rbryant@redhat.com> Date: Fri Jan 31 14:36:32 2025 -0500 Check for SPDX headers using pre-commit Signed-off-by: Russell Bryant <rbryant@redhat.com> --------- Signed-off-by: Russell Bryant <rbryant@redhat.com> 2025-02-02 14:58:18 -05:00			`# SPDX-License-Identifier: Apache-2.0`
Add script for benchmarking serving throughput (#145) 2023-06-14 19:55:38 -07:00			`"""Benchmark the latency of processing a single batch of requests."""`
Run v1 benchmark and integrate with PyTorch OSS benchmark database (#13068) Signed-off-by: Huy Do <huydhn@gmail.com> 2025-02-17 00:16:32 -08:00
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00			`import argparse`
[Misc] Make benchmarks use EngineArgs (#9529) 2024-10-22 17:40:38 -05:00			`import dataclasses`
Add JSON output support for benchmark_latency and benchmark_throughput (#4848) 2024-05-16 10:02:56 -07:00			`import json`
Run v1 benchmark and integrate with PyTorch OSS benchmark database (#13068) Signed-off-by: Huy Do <huydhn@gmail.com> 2025-02-17 00:16:32 -08:00			`import os`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00			`import time`
Save pytorch profiler output for latency benchmark (#1871) * Save profiler output * Apply feedback from code review 2023-12-05 20:55:55 -08:00			`from pathlib import Path`
Update deprecated Python 3.8 typing (#13971) 2025-03-03 01:34:51 +00:00			`from typing import Any, Optional`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00
			`import numpy as np`
			`import torch`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`from tqdm import tqdm`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00
Convert `benchmarks` to `ruff format` (#18068) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> 2025-05-13 14:43:29 +01:00			`from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json`
Change the name to vLLM (#150) 2023-06-17 03:07:40 -07:00			`from vllm import LLM, SamplingParams`
[Misc] Make benchmarks use EngineArgs (#9529) 2024-10-22 17:40:38 -05:00			`from vllm.engine.arg_utils import EngineArgs`
[Core] rename`PromptInputs` and `inputs` (#8876) 2024-09-27 11:35:15 +08:00			`from vllm.inputs import PromptType`
[Bugfix] fix beam search input errors and latency benchmark script (#11875) Signed-off-by: Ye Qi <yeq@meta.com> Co-authored-by: yeq <yeq@devgpu004.lla3.facebook.com> 2025-01-09 01:36:39 -08:00			`from vllm.sampling_params import BeamSearchParams`
[Frontend] Add FlexibleArgumentParser to support both underscore and dash in names (#5718) 2024-06-20 19:00:13 -04:00			`from vllm.utils import FlexibleArgumentParser`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00

Convert `benchmarks` to `ruff format` (#18068) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> 2025-05-13 14:43:29 +01:00			`def save_to_pytorch_benchmark_format(`
			`args: argparse.Namespace, results: dict[str, Any]`
			`) -> None:`
Run v1 benchmark and integrate with PyTorch OSS benchmark database (#13068) Signed-off-by: Huy Do <huydhn@gmail.com> 2025-02-17 00:16:32 -08:00			`pt_records = convert_to_pytorch_benchmark_format(`
			`args=args,`
			`metrics={"latency": results["latencies"]},`
Convert `benchmarks` to `ruff format` (#18068) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> 2025-05-13 14:43:29 +01:00			`extra_info={k: results[k] for k in ["avg_latency", "percentiles"]},`
			`)`
Run v1 benchmark and integrate with PyTorch OSS benchmark database (#13068) Signed-off-by: Huy Do <huydhn@gmail.com> 2025-02-17 00:16:32 -08:00			`if pt_records:`
			`pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"`
Fix some issues with benchmark data output (#13641) Signed-off-by: Huy Do <huydhn@gmail.com> 2025-02-23 18:23:18 -08:00			`write_to_json(pt_file, pt_records)`
Run v1 benchmark and integrate with PyTorch OSS benchmark database (#13068) Signed-off-by: Huy Do <huydhn@gmail.com> 2025-02-17 00:16:32 -08:00

Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00			`def main(args: argparse.Namespace):`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`print(args)`

[Misc] Make benchmarks use EngineArgs (#9529) 2024-10-22 17:40:38 -05:00			`engine_args = EngineArgs.from_cli_args(args)`

Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`# NOTE(woosuk): If the request cannot be processed in a single batch,`
Rename servers to engines (#152) 2023-06-17 17:25:21 +08:00			`# the engine will automatically process the request in multiple batches.`
[Misc] Make benchmarks use EngineArgs (#9529) 2024-10-22 17:40:38 -05:00			`llm = LLM(**dataclasses.asdict(engine_args))`
[Bugfix] Fix benchmark script bug: inaccurate stats for vllm backend when max_model_len < input_len + output_len (#13691) Signed-off-by: WangErXiao <863579016@qq.com> 2025-02-22 14:10:38 +08:00			`assert llm.llm_engine.model_config.max_model_len >= (`
Convert `benchmarks` to `ruff format` (#18068) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> 2025-05-13 14:43:29 +01:00			`args.input_len + args.output_len`
			`), (`
			`"Please ensure that max_model_len is greater than"`
			`" the sum of input_len and output_len."`
			`)`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00
Enhance SamplingParams (#96) 2023-05-11 15:45:30 -07:00			`sampling_params = SamplingParams(`
			`n=args.n,`
[core] remove beam search from the core (#9105) 2024-10-06 22:47:04 -07:00			`temperature=1.0,`
Enhance SamplingParams (#96) 2023-05-11 15:45:30 -07:00			`top_p=1.0,`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`ignore_eos=True,`
Enhance SamplingParams (#96) 2023-05-11 15:45:30 -07:00			`max_tokens=args.output_len,`
[Benchmarks] Make detokenization optional in benchmark scripts (#11697) Signed-off-by: Jeremy Arnold <Jeremy.Arnold@amd.com> 2025-03-07 10:09:00 -06:00			`detokenize=not args.disable_detokenize,`
Enhance SamplingParams (#96) 2023-05-11 15:45:30 -07:00			`)`
Implement block copy kernel to optimize beam search (#32) 2023-04-07 17:45:07 -07:00			`print(sampling_params)`
Convert `benchmarks` to `ruff format` (#18068) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> 2025-05-13 14:43:29 +01:00			`dummy_prompt_token_ids = np.random.randint(`
			`10000, size=(args.batch_size, args.input_len)`
			`)`
			`dummy_prompts: list[PromptType] = [`
			`{"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()`
			`]`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00
[Bugfix] fix beam search input errors and latency benchmark script (#11875) Signed-off-by: Ye Qi <yeq@meta.com> Co-authored-by: yeq <yeq@devgpu004.lla3.facebook.com> 2025-01-09 01:36:39 -08:00			`def llm_generate():`
			`if not args.use_beam_search:`
Convert `benchmarks` to `ruff format` (#18068) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> 2025-05-13 14:43:29 +01:00			`llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)`
[Bugfix] fix beam search input errors and latency benchmark script (#11875) Signed-off-by: Ye Qi <yeq@meta.com> Co-authored-by: yeq <yeq@devgpu004.lla3.facebook.com> 2025-01-09 01:36:39 -08:00			`else:`
			`llm.beam_search(`
			`dummy_prompts,`
			`BeamSearchParams(`
			`beam_width=args.n,`
			`max_tokens=args.output_len,`
			`ignore_eos=True,`
Run v1 benchmark and integrate with PyTorch OSS benchmark database (#13068) Signed-off-by: Huy Do <huydhn@gmail.com> 2025-02-17 00:16:32 -08:00			`),`
			`)`
[Bugfix] fix beam search input errors and latency benchmark script (#11875) Signed-off-by: Ye Qi <yeq@meta.com> Co-authored-by: yeq <yeq@devgpu004.lla3.facebook.com> 2025-01-09 01:36:39 -08:00
Save pytorch profiler output for latency benchmark (#1871) * Save profiler output * Apply feedback from code review 2023-12-05 20:55:55 -08:00			`def run_to_completion(profile_dir: Optional[str] = None):`
			`if profile_dir:`
			`with torch.profiler.profile(`
Convert `benchmarks` to `ruff format` (#18068) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> 2025-05-13 14:43:29 +01:00			`activities=[`
			`torch.profiler.ProfilerActivity.CPU,`
			`torch.profiler.ProfilerActivity.CUDA,`
			`],`
			`on_trace_ready=torch.profiler.tensorboard_trace_handler(`
			`str(profile_dir)`
			`),`
Run v1 benchmark and integrate with PyTorch OSS benchmark database (#13068) Signed-off-by: Huy Do <huydhn@gmail.com> 2025-02-17 00:16:32 -08:00			`) as p:`
[Bugfix] fix beam search input errors and latency benchmark script (#11875) Signed-off-by: Ye Qi <yeq@meta.com> Co-authored-by: yeq <yeq@devgpu004.lla3.facebook.com> 2025-01-09 01:36:39 -08:00			`llm_generate()`
[Misc] sort torch profiler table by kernel timing (#11813) 2025-01-07 20:57:04 -06:00			`print(p.key_averages().table(sort_by="self_cuda_time_total"))`
Add profile option to latency benchmark script (#1839) 2023-11-29 23:42:52 -08:00			`else:`
			`start_time = time.perf_counter()`
[Bugfix] fix beam search input errors and latency benchmark script (#11875) Signed-off-by: Ye Qi <yeq@meta.com> Co-authored-by: yeq <yeq@devgpu004.lla3.facebook.com> 2025-01-09 01:36:39 -08:00			`llm_generate()`
Add profile option to latency benchmark script (#1839) 2023-11-29 23:42:52 -08:00			`end_time = time.perf_counter()`
			`latency = end_time - start_time`
			`return latency`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`print("Warming up...")`
[CI/Benchmark] add more iteration and use median for robust latency benchmark (#3889) 2024-04-06 14:32:30 -07:00			`for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):`
			`run_to_completion(profile_dir=None)`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00
Add profile option to latency benchmark script (#1839) 2023-11-29 23:42:52 -08:00			`if args.profile:`
Save pytorch profiler output for latency benchmark (#1871) * Save profiler output * Apply feedback from code review 2023-12-05 20:55:55 -08:00			`profile_dir = args.profile_result_dir`
			`if not profile_dir:`
Convert `benchmarks` to `ruff format` (#18068) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> 2025-05-13 14:43:29 +01:00			`profile_dir = (`
			`Path(".") / "vllm_benchmark_result" / f"latency_result_{time.time()}"`
			`)`
Save pytorch profiler output for latency benchmark (#1871) * Save profiler output * Apply feedback from code review 2023-12-05 20:55:55 -08:00			`print(f"Profiling (results will be saved to '{profile_dir}')...")`
[Minor] Fix benchmark_latency script (#2765) 2024-02-05 12:45:37 -08:00			`run_to_completion(profile_dir=profile_dir)`
Add profile option to latency benchmark script (#1839) 2023-11-29 23:42:52 -08:00			`return`

Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00			`# Benchmark.`
			`latencies = []`
Fix latency benchmark script (#118) 2023-05-22 17:03:40 -07:00			`for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):`
Fix latency benchmark script (#2035) 2023-12-11 11:19:08 -08:00			`latencies.append(run_to_completion(profile_dir=None))`
[CI/Benchmark] add more iteration and use median for robust latency benchmark (#3889) 2024-04-06 14:32:30 -07:00			`latencies = np.array(latencies)`
[CI] the readability of benchmarking and prepare for dashboard (#5571) [CI] Improve the readability of performance benchmarking results and prepare for upcoming performance dashboard (#5571) 2024-06-17 11:41:08 -07:00			`percentages = [10, 25, 50, 75, 90, 99]`
[CI/Benchmark] add more iteration and use median for robust latency benchmark (#3889) 2024-04-06 14:32:30 -07:00			`percentiles = np.percentile(latencies, percentages)`
Run v1 benchmark and integrate with PyTorch OSS benchmark database (#13068) Signed-off-by: Huy Do <huydhn@gmail.com> 2025-02-17 00:16:32 -08:00			`print(f"Avg latency: {np.mean(latencies)} seconds")`
[CI/Benchmark] add more iteration and use median for robust latency benchmark (#3889) 2024-04-06 14:32:30 -07:00			`for percentage, percentile in zip(percentages, percentiles):`
Run v1 benchmark and integrate with PyTorch OSS benchmark database (#13068) Signed-off-by: Huy Do <huydhn@gmail.com> 2025-02-17 00:16:32 -08:00			`print(f"{percentage}% percentile latency: {percentile} seconds")`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00
Add JSON output support for benchmark_latency and benchmark_throughput (#4848) 2024-05-16 10:02:56 -07:00			`# Output JSON results if specified`
			`if args.output_json:`
			`results = {`
			`"avg_latency": np.mean(latencies),`
			`"latencies": latencies.tolist(),`
			`"percentiles": dict(zip(percentages, percentiles.tolist())),`
			`}`
			`with open(args.output_json, "w") as f:`
			`json.dump(results, f, indent=4)`
Run v1 benchmark and integrate with PyTorch OSS benchmark database (#13068) Signed-off-by: Huy Do <huydhn@gmail.com> 2025-02-17 00:16:32 -08:00			`save_to_pytorch_benchmark_format(args, results)`
Add JSON output support for benchmark_latency and benchmark_throughput (#4848) 2024-05-16 10:02:56 -07:00
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00
Run v1 benchmark and integrate with PyTorch OSS benchmark database (#13068) Signed-off-by: Huy Do <huydhn@gmail.com> 2025-02-17 00:16:32 -08:00			`if __name__ == "__main__":`
[Frontend] Add FlexibleArgumentParser to support both underscore and dash in names (#5718) 2024-06-20 19:00:13 -04:00			`parser = FlexibleArgumentParser(`
Run v1 benchmark and integrate with PyTorch OSS benchmark database (#13068) Signed-off-by: Huy Do <huydhn@gmail.com> 2025-02-17 00:16:32 -08:00			`description="Benchmark the latency of processing a single batch of "`
Convert `benchmarks` to `ruff format` (#18068) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> 2025-05-13 14:43:29 +01:00			`"requests till completion."`
			`)`
Run v1 benchmark and integrate with PyTorch OSS benchmark database (#13068) Signed-off-by: Huy Do <huydhn@gmail.com> 2025-02-17 00:16:32 -08:00			`parser.add_argument("--input-len", type=int, default=32)`
			`parser.add_argument("--output-len", type=int, default=128)`
			`parser.add_argument("--batch-size", type=int, default=8)`
			`parser.add_argument(`
			`"--n",`
			`type=int,`
			`default=1,`
			`help="Number of generated sequences per prompt.",`
			`)`
			`parser.add_argument("--use-beam-search", action="store_true")`
			`parser.add_argument(`
			`"--num-iters-warmup",`
			`type=int,`
			`default=10,`
			`help="Number of iterations to run for warmup.",`
			`)`
Convert `benchmarks` to `ruff format` (#18068) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> 2025-05-13 14:43:29 +01:00			`parser.add_argument(`
			`"--num-iters", type=int, default=30, help="Number of iterations to run."`
			`)`
Add profile option to latency benchmark script (#1839) 2023-11-29 23:42:52 -08:00			`parser.add_argument(`
Run v1 benchmark and integrate with PyTorch OSS benchmark database (#13068) Signed-off-by: Huy Do <huydhn@gmail.com> 2025-02-17 00:16:32 -08:00			`"--profile",`
			`action="store_true",`
			`help="profile the generation process of a single batch",`
			`)`
Save pytorch profiler output for latency benchmark (#1871) * Save profiler output * Apply feedback from code review 2023-12-05 20:55:55 -08:00			`parser.add_argument(`
Run v1 benchmark and integrate with PyTorch OSS benchmark database (#13068) Signed-off-by: Huy Do <huydhn@gmail.com> 2025-02-17 00:16:32 -08:00			`"--profile-result-dir",`
Save pytorch profiler output for latency benchmark (#1871) * Save profiler output * Apply feedback from code review 2023-12-05 20:55:55 -08:00			`type=str,`
			`default=None,`
Convert `benchmarks` to `ruff format` (#18068) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> 2025-05-13 14:43:29 +01:00			`help=(`
			`"path to save the pytorch profiler output. Can be visualized "`
			`"with ui.perfetto.dev or Tensorboard."`
			`),`
Run v1 benchmark and integrate with PyTorch OSS benchmark database (#13068) Signed-off-by: Huy Do <huydhn@gmail.com> 2025-02-17 00:16:32 -08:00			`)`
Add JSON output support for benchmark_latency and benchmark_throughput (#4848) 2024-05-16 10:02:56 -07:00			`parser.add_argument(`
Run v1 benchmark and integrate with PyTorch OSS benchmark database (#13068) Signed-off-by: Huy Do <huydhn@gmail.com> 2025-02-17 00:16:32 -08:00			`"--output-json",`
Add JSON output support for benchmark_latency and benchmark_throughput (#4848) 2024-05-16 10:02:56 -07:00			`type=str,`
			`default=None,`
Run v1 benchmark and integrate with PyTorch OSS benchmark database (#13068) Signed-off-by: Huy Do <huydhn@gmail.com> 2025-02-17 00:16:32 -08:00			`help="Path to save the latency results in JSON format.",`
			`)`
[Benchmarks] Make detokenization optional in benchmark scripts (#11697) Signed-off-by: Jeremy Arnold <Jeremy.Arnold@amd.com> 2025-03-07 10:09:00 -06:00			`parser.add_argument(`
			`"--disable-detokenize",`
			`action="store_true",`
Convert `benchmarks` to `ruff format` (#18068) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> 2025-05-13 14:43:29 +01:00			`help=(`
			`"Do not detokenize responses (i.e. do not include "`
			`"detokenization time in the latency measurement)"`
			`),`
[Benchmarks] Make detokenization optional in benchmark scripts (#11697) Signed-off-by: Jeremy Arnold <Jeremy.Arnold@amd.com> 2025-03-07 10:09:00 -06:00			`)`
[Misc] Make benchmarks use EngineArgs (#9529) 2024-10-22 17:40:38 -05:00
			`parser = EngineArgs.add_cli_args(parser)`
Optimize tensor parallel execution speed (#17) 2023-04-01 00:51:08 +08:00			`args = parser.parse_args()`
			`main(args)`