Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Author: Harry Mellor
Date: 2025-10-05 15:06:22 +01:00
Committed by: GitHub
Parent: 17edd8a807
Commit: d6953beb91
1508 changed files with 115244 additions and 94146 deletions
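
Almost all of the changes below are mechanical reformatting: yapf's parenthesis-aligned continuations are re-indented to ruff's black-compatible hanging indents with trailing commas (or collapsed to one line where the call now fits), single quotes are normalized to double quotes, and isort's wrapped imports are flattened. A minimal before/after sketch of that transformation, using a hypothetical helper rather than any function from the diff:

# Illustrative only: the same call in the old yapf style and in the
# ruff (black-compatible) style this commit converts the tree to.
def write_results(path: str, *, indent: int = 2, sort_keys: bool = False) -> None:
    """Hypothetical stand-in for any multi-argument call touched by this diff."""
    print(path, indent, sort_keys)

# Before (yapf): arguments aligned under the opening parenthesis.
write_results("results.json",
              indent=4,
              sort_keys=True)

# After (ruff format): hanging indent with a trailing comma, or a single
# line if the whole call fits within the line-length limit.
write_results(
    "results.json",
    indent=4,
    sort_keys=True,
)

The first hunk below shows the same effect on imports: the parenthesis-wrapped two-line import from vllm.benchmarks.lib.utils becomes a single line.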


View File

@@ -13,20 +13,20 @@ import numpy as np
from tqdm import tqdm
import vllm.envs as envs
from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
write_to_json)
from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json
from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptType
from vllm.sampling_params import BeamSearchParams
def save_to_pytorch_benchmark_format(args: argparse.Namespace,
results: dict[str, Any]) -> None:
def save_to_pytorch_benchmark_format(
args: argparse.Namespace, results: dict[str, Any]
) -> None:
pt_records = convert_to_pytorch_benchmark_format(
args=args,
metrics={"latency": results["latencies"]},
extra_info={k: results[k]
for k in ["avg_latency", "percentiles"]})
extra_info={k: results[k] for k in ["avg_latency", "percentiles"]},
)
if pt_records:
pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
write_to_json(pt_file, pt_records)
@@ -49,10 +49,9 @@ def add_cli_args(parser: argparse.ArgumentParser):
default=10,
help="Number of iterations to run for warmup.",
)
parser.add_argument("--num-iters",
type=int,
default=30,
help="Number of iterations to run.")
parser.add_argument(
"--num-iters", type=int, default=30, help="Number of iterations to run."
)
parser.add_argument(
"--profile",
action="store_true",
@@ -67,8 +66,10 @@ def add_cli_args(parser: argparse.ArgumentParser):
parser.add_argument(
"--disable-detokenize",
action="store_true",
help=("Do not detokenize responses (i.e. do not include "
"detokenization time in the latency measurement)"),
help=(
"Do not detokenize responses (i.e. do not include "
"detokenization time in the latency measurement)"
),
)
parser = EngineArgs.add_cli_args(parser)
@@ -81,7 +82,8 @@ def main(args: argparse.Namespace):
if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
raise OSError(
"The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
"Please set it to a valid path to use torch profiler.")
"Please set it to a valid path to use torch profiler."
)
engine_args = EngineArgs.from_cli_args(args)
# Lazy import to avoid importing LLM when the bench command is not selected.
@@ -91,9 +93,11 @@ def main(args: argparse.Namespace):
# the engine will automatically process the request in multiple batches.
llm = LLM(**dataclasses.asdict(engine_args))
assert llm.llm_engine.model_config.max_model_len >= (
args.input_len +
args.output_len), ("Please ensure that max_model_len is greater than"
" the sum of input_len and output_len.")
args.input_len + args.output_len
), (
"Please ensure that max_model_len is greater than"
" the sum of input_len and output_len."
)
sampling_params = SamplingParams(
n=args.n,
@@ -103,18 +107,16 @@ def main(args: argparse.Namespace):
max_tokens=args.output_len,
detokenize=not args.disable_detokenize,
)
dummy_prompt_token_ids = np.random.randint(10000,
size=(args.batch_size,
args.input_len))
dummy_prompts: list[PromptType] = [{
"prompt_token_ids": batch
} for batch in dummy_prompt_token_ids.tolist()]
dummy_prompt_token_ids = np.random.randint(
10000, size=(args.batch_size, args.input_len)
)
dummy_prompts: list[PromptType] = [
{"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
]
def llm_generate():
if not args.use_beam_search:
llm.generate(dummy_prompts,
sampling_params=sampling_params,
use_tqdm=False)
llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
else:
llm.beam_search(
dummy_prompts,

View File

@@ -62,6 +62,7 @@ class StreamedResponseHandler:
@dataclass
class RequestFuncInput:
"""The input for the request function."""
prompt: str
api_url: str
prompt_len: int
@@ -80,13 +81,13 @@ class RequestFuncInput:
@dataclass
class RequestFuncOutput:
"""The output of the request function including metrics."""
generated_text: str = ""
success: bool = False
latency: float = 0.0
output_tokens: int = 0
ttft: float = 0.0 # Time to first token
itl: list[float] = field(
default_factory=list) # list of inter-token latencies
itl: list[float] = field(default_factory=list) # list of inter-token latencies
tpot: float = 0.0 # avg next-token latencies
prompt_len: int = 0
error: str = ""
@@ -99,8 +100,7 @@ class RequestFunc(Protocol):
request_func_input: RequestFuncInput,
session: aiohttp.ClientSession,
pbar: Optional[tqdm] = None,
) -> Awaitable[RequestFuncOutput]:
...
) -> Awaitable[RequestFuncOutput]: ...
async def async_request_openai_completions(
@@ -118,13 +118,14 @@ async def async_request_openai_completions(
The output of the request function.
"""
api_url = request_func_input.api_url
assert api_url.endswith(
("completions", "profile")
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
assert api_url.endswith(("completions", "profile")), (
"OpenAI Completions API URL must end with 'completions' or 'profile'."
)
payload = {
"model": request_func_input.model_name
if request_func_input.model_name else request_func_input.model,
if request_func_input.model_name
else request_func_input.model,
"prompt": request_func_input.prompt,
"temperature": 0.0,
"repetition_penalty": 1.0,
@@ -139,9 +140,7 @@ async def async_request_openai_completions(
payload["ignore_eos"] = request_func_input.ignore_eos
if request_func_input.extra_body:
payload.update(request_func_input.extra_body)
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
}
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
if request_func_input.extra_headers:
headers |= request_func_input.extra_headers
if request_func_input.request_id:
@@ -155,8 +154,7 @@ async def async_request_openai_completions(
output.start_time = st
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
async with session.post(url=api_url, json=payload, headers=headers) as response:
if response.status == 200:
first_chunk_received = False
handler = StreamedResponseHandler()
@@ -195,21 +193,20 @@ async def async_request_openai_completions(
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
output.itl.append(timestamp - most_recent_timestamp)
most_recent_timestamp = timestamp
generated_text += text or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
output.output_tokens = usage.get("completion_tokens")
if first_chunk_received:
output.success = True
else:
output.success = False
output.error = (
"Never received a valid chunk to calculate TTFT."
"This response will be marked as failed!")
"This response will be marked as failed!"
)
output.generated_text = generated_text
output.latency = most_recent_timestamp - st
else:
@@ -232,7 +229,8 @@ async def async_request_openai_chat_completions(
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(("chat/completions", "profile")), (
"OpenAI Chat Completions API URL must end with 'chat/completions'.")
"OpenAI Chat Completions API URL must end with 'chat/completions'."
)
content = [{"type": "text", "text": request_func_input.prompt}]
if request_func_input.multi_modal_content:
@@ -243,25 +241,18 @@ async def async_request_openai_chat_completions(
content.append(mm_content)
else:
raise TypeError(
"multi_modal_content must be a dict or list[dict] "
"for openai-chat"
"multi_modal_content must be a dict or list[dict] for openai-chat"
)
payload = {
"model":
request_func_input.model_name
if request_func_input.model_name else request_func_input.model,
"model": request_func_input.model_name
if request_func_input.model_name
else request_func_input.model,
"messages": [
{
"role": "user",
"content": content
},
{"role": "user", "content": content},
],
"temperature":
0.0,
"max_completion_tokens":
request_func_input.output_len,
"stream":
True,
"temperature": 0.0,
"max_completion_tokens": request_func_input.output_len,
"stream": True,
"stream_options": {
"include_usage": True,
},
@@ -288,8 +279,7 @@ async def async_request_openai_chat_completions(
output.start_time = st
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
async with session.post(url=api_url, json=payload, headers=headers) as response:
if response.status == 200:
handler = StreamedResponseHandler()
async for chunk_bytes in response.content.iter_any():
@@ -320,13 +310,11 @@ async def async_request_openai_chat_completions(
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
output.itl.append(timestamp - most_recent_timestamp)
generated_text += content or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
output.output_tokens = usage.get("completion_tokens")
most_recent_timestamp = timestamp
@@ -356,27 +344,22 @@ async def async_request_openai_audio(
api_url = request_func_input.api_url
assert api_url.endswith(("transcriptions", "translations")), (
"OpenAI Chat Completions API URL must end with 'transcriptions' ")
"OpenAI Chat Completions API URL must end with 'transcriptions' "
)
"or `translations`."
content = [{"type": "text", "text": request_func_input.prompt}]
payload = {
"model":
request_func_input.model_name
if request_func_input.model_name else request_func_input.model,
"temperature":
0.0,
"max_completion_tokens":
request_func_input.output_len,
"stream":
True,
"language":
"en",
"model": request_func_input.model_name
if request_func_input.model_name
else request_func_input.model,
"temperature": 0.0,
"max_completion_tokens": request_func_input.output_len,
"stream": True,
"language": "en",
# Flattened due to multipart/form-data
"stream_include_usage":
True,
"stream_continuous_usage_stats":
True,
"stream_include_usage": True,
"stream_continuous_usage_stats": True,
}
if request_func_input.extra_body:
payload.update(request_func_input.extra_body)
@@ -413,9 +396,9 @@ async def async_request_openai_audio(
output.start_time = st
most_recent_timestamp = st
try:
async with session.post(url=api_url,
data=form,
headers=headers) as response:
async with session.post(
url=api_url, data=form, headers=headers
) as response:
if response.status == 200:
handler = StreamedResponseHandler()
@@ -426,15 +409,13 @@ async def async_request_openai_audio(
messages = handler.add_chunk(chunk_bytes)
for message in messages:
chunk = message.decode("utf-8").removeprefix(
"data: ")
chunk = message.decode("utf-8").removeprefix("data: ")
if chunk != "[DONE]":
timestamp = time.perf_counter()
data = json.loads(chunk)
if choices := data.get("choices"):
content = choices[0]["delta"].get(
"content")
content = choices[0]["delta"].get("content")
# First token
if ttft == 0.0:
ttft = timestamp - st
@@ -443,12 +424,14 @@ async def async_request_openai_audio(
# Decoding phase
else:
output.itl.append(
timestamp - most_recent_timestamp)
timestamp - most_recent_timestamp
)
generated_text += content or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
"completion_tokens"
)
most_recent_timestamp = timestamp
@@ -474,9 +457,9 @@ async def async_request_openai_embeddings(
pbar: Optional[tqdm] = None,
):
api_url = request_func_input.api_url
assert api_url.endswith(
"embeddings"
), "OpenAI Embeddings API URL must end with 'embeddings'."
assert api_url.endswith("embeddings"), (
"OpenAI Embeddings API URL must end with 'embeddings'."
)
headers = {
"Content-Type": "application/json",
@@ -492,19 +475,13 @@ async def async_request_openai_embeddings(
st = time.perf_counter()
output.start_time = st
try:
async with session.post(
url=api_url,
headers=headers,
json=payload
) as response:
async with session.post(url=api_url, headers=headers, json=payload) as response:
if response.status == 200:
output.latency = time.perf_counter() - st
data = await response.json()
output.success = True
output.generated_text = ""
output.prompt_len = data.get(
"usage", {}).get(
"prompt_tokens", 0)
output.prompt_len = data.get("usage", {}).get("prompt_tokens", 0)
else:
output.success = False
output.error = response.reason or ""
@@ -527,7 +504,7 @@ ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = {
}
OPENAI_COMPATIBLE_BACKENDS = [
k for k, v in ASYNC_REQUEST_FUNCS.items()
if v in (async_request_openai_completions,
async_request_openai_chat_completions)
k
for k, v in ASYNC_REQUEST_FUNCS.items()
if v in (async_request_openai_completions, async_request_openai_chat_completions)
]
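
For orientation when reading the streaming handlers above: RequestFuncOutput records ttft (time to first token), itl (the list of inter-token latencies), and tpot (average next-token latency). A tiny worked sketch of how those quantities relate, using made-up timestamps that are not part of the diff:

# Hypothetical numbers, purely to illustrate the ttft/itl/tpot fields above.
token_timestamps = [0.50, 0.55, 0.61, 0.66]  # seconds after the request was sent

ttft = token_timestamps[0]  # time to first token: 0.50 s
itl = [b - a for a, b in zip(token_timestamps, token_timestamps[1:])]  # ~[0.05, 0.06, 0.05]
tpot = sum(itl) / len(itl)  # average next-token latency: ~0.053 s
print(ttft, itl, tpot)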

View File

@@ -8,8 +8,7 @@ import time
import aiohttp
from tqdm.asyncio import tqdm
from .endpoint_request_func import (RequestFunc, RequestFuncInput,
RequestFuncOutput)
from .endpoint_request_func import RequestFunc, RequestFuncInput, RequestFuncOutput
async def wait_for_endpoint(
@@ -21,30 +20,29 @@ async def wait_for_endpoint(
) -> RequestFuncOutput:
"""
Wait for an endpoint to become available before starting benchmarks.
Args:
request_func: The async request function to call
test_input: The RequestFuncInput to test with
timeout_seconds: Maximum time to wait in seconds (default: 10 minutes)
retry_interval: Time between retries in seconds (default: 5 seconds)
Returns:
RequestFuncOutput: The successful response
Raises:
ValueError: If the endpoint doesn't become available within the timeout
"""
deadline = time.perf_counter() + timeout_seconds
output = RequestFuncOutput(success=False)
print(f"Waiting for endpoint to become up in {timeout_seconds} seconds")
with tqdm(
total=timeout_seconds,
total=timeout_seconds,
bar_format="{desc} |{bar}| {elapsed} elapsed, {remaining} remaining",
unit="s",
) as pbar:
while True:
while True:
# update progress bar
remaining = deadline - time.perf_counter()
elapsed = timeout_seconds - remaining
@@ -58,16 +56,17 @@ async def wait_for_endpoint(
# ping the endpoint using request_func
try:
output = await request_func(
request_func_input=test_input, session=session)
request_func_input=test_input, session=session
)
if output.success:
pbar.close()
return output
except aiohttp.ClientConnectorError:
pass
# retry after a delay
sleep_duration = min(retry_interval, remaining)
if sleep_duration > 0:
await asyncio.sleep(sleep_duration)
return output
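
The reformatting leaves the ready-check logic itself untouched: keep probing the endpoint with the request function until it answers successfully or the deadline expires, sleeping between attempts. A self-contained sketch of that wait-until-ready pattern (not the vLLM implementation; the probe here is a stand-in, and the real wait_for_endpoint returns a RequestFuncOutput rather than a bool):

import asyncio
import time

async def probe() -> bool:
    """Hypothetical stand-in for awaiting the benchmark's request function."""
    return False  # would flip to True once the server answers successfully

async def wait_until_ready(timeout_seconds: float, retry_interval: float) -> bool:
    # Same loop shape as wait_for_endpoint above: compute a deadline, poll, sleep, repeat.
    deadline = time.perf_counter() + timeout_seconds
    while True:
        if await probe():
            return True
        remaining = deadline - time.perf_counter()
        if remaining <= 0:
            return False
        await asyncio.sleep(min(retry_interval, remaining))

print(asyncio.run(wait_until_ready(timeout_seconds=1.0, retry_interval=0.2)))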

View File

@@ -8,9 +8,9 @@ import os
from typing import Any
def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
metrics: dict[str, list],
extra_info: dict[str, Any]) -> list:
def convert_to_pytorch_benchmark_format(
args: argparse.Namespace, metrics: dict[str, list], extra_info: dict[str, Any]
) -> list:
"""
Save the benchmark results in the format used by PyTorch OSS benchmark with
on metric per record
@@ -38,12 +38,12 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
},
}
tp = record["benchmark"]["extra_info"]["args"].get(
"tensor_parallel_size")
tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size")
# Save tensor_parallel_size parameter if it's part of the metadata
if not tp and "tensor_parallel_size" in extra_info:
record["benchmark"]["extra_info"]["args"][
"tensor_parallel_size"] = extra_info["tensor_parallel_size"]
record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = (
extra_info["tensor_parallel_size"]
)
records.append(record)
@@ -51,7 +51,6 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
class InfEncoder(json.JSONEncoder):
def clear_inf(self, o: Any):
if isinstance(o, dict):
return {

View File

@@ -15,6 +15,7 @@ On the client side, run:
--request-rate <request_rate. Default inf> \
--num-prompts <num_prompts. Default 1000>
"""
import argparse
import asyncio
import gc
@@ -36,20 +37,22 @@ import numpy as np
from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase
from vllm.benchmarks.datasets import (SampleRequest, add_dataset_parser,
get_samples)
from vllm.benchmarks.datasets import SampleRequest, add_dataset_parser, get_samples
from vllm.benchmarks.lib.endpoint_request_func import (
ASYNC_REQUEST_FUNCS, OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput,
RequestFuncOutput)
ASYNC_REQUEST_FUNCS,
OPENAI_COMPATIBLE_BACKENDS,
RequestFuncInput,
RequestFuncOutput,
)
from vllm.benchmarks.lib.ready_checker import wait_for_endpoint
from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
write_to_json)
from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json
from vllm.transformers_utils.tokenizer import get_tokenizer
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
TERM_PLOTLIB_AVAILABLE = ((importlib.util.find_spec("termplotlib") is not None)
and (shutil.which("gnuplot") is not None))
TERM_PLOTLIB_AVAILABLE = (importlib.util.find_spec("termplotlib") is not None) and (
shutil.which("gnuplot") is not None
)
class TaskType(Enum):
@@ -110,8 +113,11 @@ def _get_current_request_rate(
total_requests: int,
request_rate: float,
) -> float:
if (ramp_up_strategy and ramp_up_start_rps is not None
and ramp_up_end_rps is not None):
if (
ramp_up_strategy
and ramp_up_start_rps is not None
and ramp_up_end_rps is not None
):
progress = request_index / max(total_requests - 1, 1)
if ramp_up_strategy == "linear":
increase = (ramp_up_end_rps - ramp_up_start_rps) * progress
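
The linear branch above maps a request's position in the stream to a rate between the start and end RPS. The same arithmetic, reproduced standalone for illustration (the function name is not from the diff):

def linear_ramp_rate(
    request_index: int, total_requests: int, start_rps: float, end_rps: float
) -> float:
    # Mirrors the linear case of _get_current_request_rate above.
    progress = request_index / max(total_requests - 1, 1)
    return start_rps + (end_rps - start_rps) * progress

# Ramping from 1 to 5 RPS over five requests: 1.0, 2.0, 3.0, 4.0, 5.0
print([linear_ramp_rate(i, 5, 1.0, 5.0) for i in range(5)])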
@@ -158,10 +164,10 @@ async def get_request(
The ending request rate for ramp-up.
"""
assert burstiness > 0, (
f"A positive burstiness factor is expected, but given {burstiness}.")
f"A positive burstiness factor is expected, but given {burstiness}."
)
# Convert to list to get length for ramp-up calculations
if isinstance(input_requests,
Iterable) and not isinstance(input_requests, list):
if isinstance(input_requests, Iterable) and not isinstance(input_requests, list):
input_requests = list(input_requests)
total_requests = len(input_requests)
@@ -172,8 +178,13 @@ async def get_request(
delay_ts = []
for request_index, request in enumerate(input_requests):
current_request_rate = _get_current_request_rate(
ramp_up_strategy, ramp_up_start_rps, ramp_up_end_rps,
request_index, total_requests, request_rate)
ramp_up_strategy,
ramp_up_start_rps,
ramp_up_end_rps,
request_index,
total_requests,
request_rate,
)
request_rates.append(current_request_rate)
if current_request_rate == float("inf"):
delay_ts.append(0)
@@ -213,8 +224,8 @@ async def get_request(
def calculate_metrics_for_embeddings(
outputs: list[RequestFuncOutput], dur_s: float,
selected_percentiles: list[float]) -> EmbedBenchmarkMetrics:
outputs: list[RequestFuncOutput], dur_s: float, selected_percentiles: list[float]
) -> EmbedBenchmarkMetrics:
"""Calculate the metrics for the embedding requests.
Args:
@@ -238,7 +249,8 @@ def calculate_metrics_for_embeddings(
warnings.warn(
"All requests failed. This is likely due to a misconfiguration "
"on the benchmark arguments.",
stacklevel=2)
stacklevel=2,
)
metrics = EmbedBenchmarkMetrics(
completed=completed,
total_input=total_input,
@@ -247,8 +259,9 @@ def calculate_metrics_for_embeddings(
mean_e2el_ms=np.mean(e2els or 0) * 1000,
std_e2el_ms=np.std(e2els or 0) * 1000,
median_e2el_ms=np.median(e2els or 0) * 1000,
percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
for p in selected_percentiles],
percentiles_e2el_ms=[
(p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles
],
)
return metrics
@@ -294,8 +307,10 @@ def calculate_metrics(
# bundled together
# Note : this may inflate the output token count slightly
output_len = len(
tokenizer(outputs[i].generated_text,
add_special_tokens=False).input_ids)
tokenizer(
outputs[i].generated_text, add_special_tokens=False
).input_ids
)
actual_output_lens.append(output_len)
total_input += input_requests[i].prompt_len
tpot = 0
@@ -318,16 +333,19 @@ def calculate_metrics(
if "ttft" in goodput_config_dict:
valid_metrics.append(ttfts)
slo_values.append(goodput_config_dict["ttft"] /
MILLISECONDS_TO_SECONDS_CONVERSION)
slo_values.append(
goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION
)
if "tpot" in goodput_config_dict:
valid_metrics.append(all_tpots)
slo_values.append(goodput_config_dict["tpot"] /
MILLISECONDS_TO_SECONDS_CONVERSION)
slo_values.append(
goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION
)
if "e2el" in goodput_config_dict:
valid_metrics.append(e2els)
slo_values.append(goodput_config_dict["e2el"] /
MILLISECONDS_TO_SECONDS_CONVERSION)
slo_values.append(
goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION
)
for req_metric in zip(*valid_metrics):
is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
@@ -338,7 +356,8 @@ def calculate_metrics(
warnings.warn(
"All requests failed. This is likely due to a misconfiguration "
"on the benchmark arguments.",
stacklevel=2)
stacklevel=2,
)
# Calculate max output tokens per second metric
max_output_tokens_per_s = 0.0
@@ -347,10 +366,10 @@ def calculate_metrics(
# Find the time range across all successful requests
successful_outputs = [output for output in outputs if output.success]
if successful_outputs:
min_start_time = min(output.start_time
for output in successful_outputs)
max_end_time = max(output.start_time + output.latency
for output in successful_outputs)
min_start_time = min(output.start_time for output in successful_outputs)
max_end_time = max(
output.start_time + output.latency for output in successful_outputs
)
# Create second buckets (ceiling to ensure we capture all time)
duration_seconds = int(np.ceil(max_end_time - min_start_time)) + 1
@@ -374,8 +393,9 @@ def calculate_metrics(
# Track concurrent requests for each second this request was active
request_start_second = int(output.start_time - min_start_time)
request_end_second = int((output.start_time + output.latency) -
min_start_time)
request_end_second = int(
(output.start_time + output.latency) - min_start_time
)
for second in range(request_start_second, request_end_second + 1):
concurrent_requests_per_second[second] += 1
@@ -384,18 +404,22 @@ def calculate_metrics(
# concurrent requests
if len(tokens_per_second) > 0:
max_output_tokens_per_s = float(np.max(tokens_per_second))
max_concurrent_requests = int(
np.max(concurrent_requests_per_second))
max_concurrent_requests = int(np.max(concurrent_requests_per_second))
if TERM_PLOTLIB_AVAILABLE:
import termplotlib as tpl
fig = tpl.figure()
fig.plot(np.arange(len(tokens_per_second)),
tokens_per_second,
title="Output tokens per second")
fig.plot(np.arange(len(concurrent_requests_per_second)),
concurrent_requests_per_second,
title="Concurrent requests per second")
fig.plot(
np.arange(len(tokens_per_second)),
tokens_per_second,
title="Output tokens per second",
)
fig.plot(
np.arange(len(concurrent_requests_per_second)),
concurrent_requests_per_second,
title="Concurrent requests per second",
)
fig.show()
else:
print("tip: install termplotlib and gnuplot to plot the metrics")
@@ -408,27 +432,31 @@ def calculate_metrics(
request_goodput=good_completed / dur_s,
output_throughput=sum(actual_output_lens) / dur_s,
total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
mean_ttft_ms=np.mean(ttfts or 0) *
1000, # ttfts is empty if streaming is not supported by the endpoint
mean_ttft_ms=np.mean(ttfts or 0)
* 1000, # ttfts is empty if streaming is not supported by the endpoint
std_ttft_ms=np.std(ttfts or 0) * 1000,
median_ttft_ms=np.median(ttfts or 0) * 1000,
percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000)
for p in selected_percentiles],
percentiles_ttft_ms=[
(p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles
],
mean_tpot_ms=np.mean(tpots or 0) * 1000,
std_tpot_ms=np.std(tpots or 0) * 1000,
median_tpot_ms=np.median(tpots or 0) * 1000,
percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000)
for p in selected_percentiles],
percentiles_tpot_ms=[
(p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles
],
mean_itl_ms=np.mean(itls or 0) * 1000,
std_itl_ms=np.std(itls or 0) * 1000,
median_itl_ms=np.median(itls or 0) * 1000,
percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
for p in selected_percentiles],
percentiles_itl_ms=[
(p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles
],
mean_e2el_ms=np.mean(e2els or 0) * 1000,
std_e2el_ms=np.std(e2els or 0) * 1000,
median_e2el_ms=np.median(e2els or 0) * 1000,
percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
for p in selected_percentiles],
percentiles_e2el_ms=[
(p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles
],
max_output_tokens_per_s=max_output_tokens_per_s,
max_concurrent_requests=max_concurrent_requests,
)
@@ -462,8 +490,11 @@ async def benchmark(
ramp_up_end_rps: Optional[int] = None,
ready_check_timeout_sec: int = 600,
):
task_type = (TaskType.EMBEDDING if api_url.endswith("/v1/embeddings") else
TaskType.GENERATION)
task_type = (
TaskType.EMBEDDING
if api_url.endswith("/v1/embeddings")
else TaskType.GENERATION
)
if endpoint_type in ASYNC_REQUEST_FUNCS:
if task_type == TaskType.EMBEDDING:
request_func = ASYNC_REQUEST_FUNCS["openai-embeddings"]
@@ -498,10 +529,14 @@ async def benchmark(
input_requests[0].multi_modal_data,
)
assert (test_mm_content is None or isinstance(test_mm_content, dict)
or (isinstance(test_mm_content, list)
and all(isinstance(item, dict) for item in test_mm_content))
), "multi_modal_data must be a dict or list[dict]"
assert (
test_mm_content is None
or isinstance(test_mm_content, dict)
or (
isinstance(test_mm_content, list)
and all(isinstance(item, dict) for item in test_mm_content)
)
), "multi_modal_data must be a dict or list[dict]"
test_input = RequestFuncInput(
model=model_id,
model_name=model_name,
@@ -527,7 +562,8 @@ async def benchmark(
raise ValueError(
"Initial test run failed - Please make sure benchmark "
"arguments are correctly specified. "
f"Error: {test_output.error}")
f"Error: {test_output.error}"
)
else:
print("Initial test run completed. Starting main benchmark run...")
else:
@@ -536,33 +572,38 @@ async def benchmark(
if lora_modules:
# For each input request, choose a LoRA module at random.
lora_modules = iter(
[random.choice(lora_modules) for _ in range(len(input_requests))])
[random.choice(lora_modules) for _ in range(len(input_requests))]
)
if profile:
print("Starting profiler...")
profile_input = RequestFuncInput(model=model_id,
model_name=model_name,
prompt=test_prompt,
api_url=base_url + "/start_profile",
prompt_len=test_prompt_len,
output_len=test_output_len,
logprobs=logprobs,
multi_modal_content=test_mm_content,
ignore_eos=ignore_eos,
extra_headers=extra_headers,
extra_body=extra_body)
profile_output = await request_func(request_func_input=profile_input,
session=session)
profile_input = RequestFuncInput(
model=model_id,
model_name=model_name,
prompt=test_prompt,
api_url=base_url + "/start_profile",
prompt_len=test_prompt_len,
output_len=test_output_len,
logprobs=logprobs,
multi_modal_content=test_mm_content,
ignore_eos=ignore_eos,
extra_headers=extra_headers,
extra_body=extra_body,
)
profile_output = await request_func(
request_func_input=profile_input, session=session
)
if profile_output.success:
print("Profiler started")
distribution = ("Poisson process"
if burstiness == 1.0 else "Gamma distribution")
distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution"
if ramp_up_strategy is not None:
print(f"Traffic ramp-up strategy: {ramp_up_strategy}.")
print(f"Will increase RPS from {ramp_up_start_rps} to "
f"{ramp_up_end_rps} RPS over the duration of the benchmark.")
print(
f"Will increase RPS from {ramp_up_start_rps} to "
f"{ramp_up_end_rps} RPS over the duration of the benchmark."
)
else:
print(f"Traffic request rate: {request_rate}")
@@ -575,18 +616,17 @@ async def benchmark(
# and it will simplify the code in limited_request_func.
# semaphore = (asyncio.Semaphore(max_concurrency)
# if max_concurrency else contextlib.nullcontext())
semaphore = (asyncio.Semaphore(max_concurrency)
if max_concurrency else None)
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
async def limited_request_func(request_func_input, session, pbar):
if semaphore is None:
return await request_func(request_func_input=request_func_input,
session=session,
pbar=pbar)
return await request_func(
request_func_input=request_func_input, session=session, pbar=pbar
)
async with semaphore:
return await request_func(request_func_input=request_func_input,
session=session,
pbar=pbar)
return await request_func(
request_func_input=request_func_input, session=session, pbar=pbar
)
benchmark_start_time = time.perf_counter()
tasks: list[asyncio.Task] = []
@@ -595,23 +635,27 @@ async def benchmark(
last_int_rps = -1
if ramp_up_strategy is not None and ramp_up_start_rps is not None:
last_int_rps = ramp_up_start_rps
rps_change_events.append({
"rps": last_int_rps,
"timestamp": datetime.now().isoformat(),
})
rps_change_events.append(
{
"rps": last_int_rps,
"timestamp": datetime.now().isoformat(),
}
)
async for request, current_request_rate in get_request(
input_requests, request_rate, burstiness, ramp_up_strategy,
ramp_up_start_rps, ramp_up_end_rps):
input_requests,
request_rate,
burstiness,
ramp_up_strategy,
ramp_up_start_rps,
ramp_up_end_rps,
):
if ramp_up_strategy is not None:
current_int_rps = int(current_request_rate)
if current_int_rps > last_int_rps:
timestamp = datetime.now().isoformat()
for rps_val in range(last_int_rps + 1, current_int_rps + 1):
rps_change_events.append({
"rps": rps_val,
"timestamp": timestamp
})
rps_change_events.append({"rps": rps_val, "timestamp": timestamp})
last_int_rps = current_int_rps
prompt, prompt_len, output_len, mm_content, request_id = (
request.prompt,
@@ -641,9 +685,11 @@ async def benchmark(
)
tasks.append(
asyncio.create_task(
limited_request_func(request_func_input=request_func_input,
session=session,
pbar=pbar)))
limited_request_func(
request_func_input=request_func_input, session=session, pbar=pbar
)
)
)
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
if pbar is not None:
@@ -668,35 +714,48 @@ async def benchmark(
)
actual_output_lens = 0
print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
if max_concurrency is not None:
print("{:<40} {:<10}".format("Maximum request concurrency:",
max_concurrency))
if request_rate != float('inf'):
print("{:<40} {:<10.2f}".format("Request rate configured (RPS):",
request_rate))
print("{:<40} {:<10.2f}".format("Benchmark duration (s):",
benchmark_duration))
print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency))
if request_rate != float("inf"):
print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate))
print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
if isinstance(metrics, BenchmarkMetrics):
print("{:<40} {:<10}".format("Total generated tokens:",
metrics.total_output))
print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
metrics.request_throughput))
print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
print(
"{:<40} {:<10.2f}".format(
"Request throughput (req/s):", metrics.request_throughput
)
)
if goodput_config_dict:
print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
metrics.request_goodput))
print(
"{:<40} {:<10.2f}".format(
"Request goodput (req/s):", metrics.request_goodput
)
)
if isinstance(metrics, BenchmarkMetrics):
print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
metrics.output_throughput))
print("{:<40} {:<10.2f}".format(
"Peak output token throughput (tok/s):",
metrics.max_output_tokens_per_s))
print("{:<40} {:<10.2f}".format("Peak concurrent requests:",
metrics.max_concurrent_requests))
print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
metrics.total_token_throughput))
print(
"{:<40} {:<10.2f}".format(
"Output token throughput (tok/s):", metrics.output_throughput
)
)
print(
"{:<40} {:<10.2f}".format(
"Peak output token throughput (tok/s):", metrics.max_output_tokens_per_s
)
)
print(
"{:<40} {:<10.2f}".format(
"Peak concurrent requests:", metrics.max_concurrent_requests
)
)
print(
"{:<40} {:<10.2f}".format(
"Total Token throughput (tok/s):", metrics.total_token_throughput
)
)
if isinstance(metrics, BenchmarkMetrics):
result = {
@@ -705,8 +764,7 @@ async def benchmark(
"total_input_tokens": metrics.total_input,
"total_output_tokens": metrics.total_output,
"request_throughput": metrics.request_throughput,
"request_goodput":
metrics.request_goodput if goodput_config_dict else None,
"request_goodput": metrics.request_goodput if goodput_config_dict else None,
"output_throughput": metrics.output_throughput,
"total_token_throughput": metrics.total_token_throughput,
"input_lens": [output.prompt_len for output in outputs],
@@ -744,30 +802,36 @@ async def benchmark(
# metric.
if metric_attribute_name not in selected_percentile_metrics:
return
print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
print("{:<40} {:<10.2f}".format(
f"Mean {metric_name} (ms):",
getattr(metrics, f"mean_{metric_attribute_name}_ms")))
print("{:<40} {:<10.2f}".format(
f"Median {metric_name} (ms):",
getattr(metrics, f"median_{metric_attribute_name}_ms")))
print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
print(
"{:<40} {:<10.2f}".format(
f"Mean {metric_name} (ms):",
getattr(metrics, f"mean_{metric_attribute_name}_ms"),
)
)
print(
"{:<40} {:<10.2f}".format(
f"Median {metric_name} (ms):",
getattr(metrics, f"median_{metric_attribute_name}_ms"),
)
)
result[f"mean_{metric_attribute_name}_ms"] = getattr(
metrics, f"mean_{metric_attribute_name}_ms")
metrics, f"mean_{metric_attribute_name}_ms"
)
result[f"median_{metric_attribute_name}_ms"] = getattr(
metrics, f"median_{metric_attribute_name}_ms")
metrics, f"median_{metric_attribute_name}_ms"
)
result[f"std_{metric_attribute_name}_ms"] = getattr(
metrics, f"std_{metric_attribute_name}_ms")
for p, value in getattr(metrics,
f"percentiles_{metric_attribute_name}_ms"):
metrics, f"std_{metric_attribute_name}_ms"
)
for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"):
p_word = str(int(p)) if int(p) == p else str(p)
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):",
value))
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
result[f"p{p_word}_{metric_attribute_name}_ms"] = value
if task_type == TaskType.GENERATION:
process_one_metric("ttft", "TTFT", "Time to First Token")
process_one_metric("tpot", "TPOT",
"Time per Output Token (excl. 1st token)")
process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
process_one_metric("itl", "ITL", "Inter-token Latency")
process_one_metric("e2el", "E2EL", "End-to-end Latency")
@@ -783,8 +847,9 @@ async def benchmark(
output_len=test_output_len,
logprobs=logprobs,
)
profile_output = await request_func(request_func_input=profile_input,
session=session)
profile_output = await request_func(
request_func_input=profile_input, session=session
)
if profile_output.success:
print("Profiler stopped")
@@ -803,12 +868,14 @@ def check_goodput_args(args):
raise ValueError(
f"Invalid metric name found, {slo_name}: {slo_val}. "
"The service level objective name should be one of "
f"{str(VALID_NAMES)}. ")
f"{str(VALID_NAMES)}. "
)
if slo_val < 0:
raise ValueError(
f"Invalid value found, {slo_name}: {slo_val}. "
"The service level objective value should be "
"non-negative.")
"non-negative."
)
return goodput_config_dict
@@ -821,31 +888,42 @@ def parse_goodput(slo_pairs):
except ValueError as err:
raise argparse.ArgumentTypeError(
"Invalid format found for service level objectives. "
"Specify service level objectives for goodput as \"KEY:VALUE\" "
'Specify service level objectives for goodput as "KEY:VALUE" '
"pairs, where the key is a metric name, and the value is a "
"number in milliseconds.") from err
"number in milliseconds."
) from err
return goodput_config_dict
def save_to_pytorch_benchmark_format(args: argparse.Namespace,
results: dict[str, Any],
file_name: str) -> None:
def save_to_pytorch_benchmark_format(
args: argparse.Namespace, results: dict[str, Any], file_name: str
) -> None:
metrics = [
"median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms",
"mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms",
"median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms"
"median_ttft_ms",
"mean_ttft_ms",
"std_ttft_ms",
"p99_ttft_ms",
"mean_tpot_ms",
"median_tpot_ms",
"std_tpot_ms",
"p99_tpot_ms",
"median_itl_ms",
"mean_itl_ms",
"std_itl_ms",
"p99_itl_ms",
]
# These raw data might be useful, but they are rather big. They can be added
# later if needed
ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
pt_records = convert_to_pytorch_benchmark_format(
args=args,
metrics={k: [results[k]]
for k in metrics if k in results},
metrics={k: [results[k]] for k in metrics if k in results},
extra_info={
k: results[k]
for k in results if k not in metrics and k not in ignored_metrics
})
for k in results
if k not in metrics and k not in ignored_metrics
},
)
if pt_records:
# Don't use json suffix here as we don't want CI to pick it up
pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json"
@@ -866,7 +944,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
type=str,
default="openai",
choices=list(ASYNC_REQUEST_FUNCS.keys()),
help="The type of backend or endpoint to use for the benchmark."
help="The type of backend or endpoint to use for the benchmark.",
)
parser.add_argument(
"--base-url",
@@ -888,9 +966,9 @@ def add_cli_args(parser: argparse.ArgumentParser):
metavar="KEY=VALUE",
nargs="*",
help="Key-value pairs (e.g, --header x-additional-info=0.3.3) "
"for headers to be passed with each request. These headers override " \
"per backend constants and values set via environment variable, and " \
"will be overriden by other arguments (such as request ids)."
"for headers to be passed with each request. These headers override "
"per backend constants and values set via environment variable, and "
"will be overriden by other arguments (such as request ids).",
)
parser.add_argument(
"--max-concurrency",
@@ -915,19 +993,20 @@ def add_cli_args(parser: argparse.ArgumentParser):
parser.add_argument(
"--tokenizer",
type=str,
help=
"Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
)
parser.add_argument("--use-beam-search", action="store_true")
parser.add_argument(
"--logprobs",
type=int,
default=None,
help=("Number of logprobs-per-token to compute & return as part of "
"the request. If unspecified, then either (1) if beam search "
"is disabled, no logprobs are computed & a single dummy "
"logprob is returned for each token; or (2) if beam search "
"is enabled 1 logprob per token is computed"),
help=(
"Number of logprobs-per-token to compute & return as part of "
"the request. If unspecified, then either (1) if beam search "
"is disabled, no logprobs are computed & a single dummy "
"logprob is returned for each token; or (2) if beam search "
"is enabled 1 logprob per token is computed"
),
)
parser.add_argument(
"--request-rate",
@@ -1010,32 +1089,34 @@ def add_cli_args(parser: argparse.ArgumentParser):
"--ignore-eos",
action="store_true",
help="Set ignore_eos flag when sending the benchmark request."
"Warning: ignore_eos is not supported in deepspeed_mii and tgi.")
"Warning: ignore_eos is not supported in deepspeed_mii and tgi.",
)
parser.add_argument(
"--percentile-metrics",
type=str,
default="ttft,tpot,itl",
help="Comma-separated list of selected metrics to report percentils. "
"This argument specifies the metrics to report percentiles. "
"Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". ")
'Allowed metric names are "ttft", "tpot", "itl", "e2el". ',
)
parser.add_argument(
"--metric-percentiles",
type=str,
default="99",
help="Comma-separated list of percentiles for selected metrics. "
"To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
"Default value is \"99\"."
"Use \"--percentile-metrics\" to select metrics.",
'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". '
'Default value is "99".'
'Use "--percentile-metrics" to select metrics.',
)
parser.add_argument(
"--goodput",
nargs="+",
required=False,
help="Specify service level objectives for goodput as \"KEY:VALUE\" "
help='Specify service level objectives for goodput as "KEY:VALUE" '
"pairs, where the key is a metric name, and the value is in "
"milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
'milliseconds. Multiple "KEY:VALUE" pairs can be provided, '
"separated by spaces. Allowed request level metric names are "
"\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
'"ttft", "tpot", "e2el". For more context on the definition of '
"goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
"and the blog: https://hao-ai-lab.github.io/blogs/distserve",
)
@@ -1052,22 +1133,19 @@ def add_cli_args(parser: argparse.ArgumentParser):
"--top-p",
type=float,
default=None,
help="Top-p sampling parameter. Only has effect on "
"openai-compatible backends.",
help="Top-p sampling parameter. Only has effect on openai-compatible backends.",
)
sampling_group.add_argument(
"--top-k",
type=int,
default=None,
help="Top-k sampling parameter. Only has effect on "
"openai-compatible backends.",
help="Top-k sampling parameter. Only has effect on openai-compatible backends.",
)
sampling_group.add_argument(
"--min-p",
type=float,
default=None,
help="Min-p sampling parameter. Only has effect on "
"openai-compatible backends.",
help="Min-p sampling parameter. Only has effect on openai-compatible backends.",
)
sampling_group.add_argument(
"--temperature",
@@ -1100,29 +1178,34 @@ def add_cli_args(parser: argparse.ArgumentParser):
)
parser.add_argument(
'--tokenizer-mode',
"--tokenizer-mode",
type=str,
default="auto",
choices=['auto', 'slow', 'mistral', 'custom'],
choices=["auto", "slow", "mistral", "custom"],
help='The tokenizer mode.\n\n* "auto" will use the '
'fast tokenizer if available.\n* "slow" will '
'always use the slow tokenizer. \n* '
"always use the slow tokenizer. \n* "
'"mistral" will always use the `mistral_common` tokenizer. \n*'
'"custom" will use --tokenizer to select the preregistered tokenizer.')
'"custom" will use --tokenizer to select the preregistered tokenizer.',
)
parser.add_argument("--served-model-name",
type=str,
default=None,
help="The model name used in the API. "
"If not specified, the model name will be the "
"same as the ``--model`` argument. ")
parser.add_argument(
"--served-model-name",
type=str,
default=None,
help="The model name used in the API. "
"If not specified, the model name will be the "
"same as the ``--model`` argument. ",
)
parser.add_argument("--lora-modules",
nargs='+',
default=None,
help="A subset of LoRA module names passed in when "
"launching the server. For each request, the "
"script chooses a LoRA module at random.")
parser.add_argument(
"--lora-modules",
nargs="+",
default=None,
help="A subset of LoRA module names passed in when "
"launching the server. For each request, the "
"script chooses a LoRA module at random.",
)
parser.add_argument(
"--ramp-up-strategy",
@@ -1132,7 +1215,8 @@ def add_cli_args(parser: argparse.ArgumentParser):
help="The ramp-up strategy. This would be used to "
"ramp up the request rate from initial RPS to final "
"RPS rate (specified by --ramp-up-start-rps and "
"--ramp-up-end-rps.) over the duration of the benchmark.")
"--ramp-up-end-rps.) over the duration of the benchmark.",
)
parser.add_argument(
"--ramp-up-start-rps",
type=int,
@@ -1153,7 +1237,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
default=600,
help="Maximum time to wait for the endpoint to become ready "
"in seconds (default: 600 seconds / 10 minutes). If set to 0, "
"the ready check will be skipped."
"the ready check will be skipped.",
)
@@ -1172,19 +1256,19 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
raise ValueError(
"When using ramp-up, do not specify --request-rate. "
"The request rate will be controlled by ramp-up parameters. "
"Please remove the --request-rate argument.")
"Please remove the --request-rate argument."
)
if args.ramp_up_start_rps is None or args.ramp_up_end_rps is None:
raise ValueError(
"When using --ramp-up-strategy, both --ramp-up-start-rps and "
"--ramp-up-end-rps must be specified")
"--ramp-up-end-rps must be specified"
)
if args.ramp_up_start_rps < 0 or args.ramp_up_end_rps < 0:
raise ValueError("Ramp-up start and end RPS must be non-negative")
if args.ramp_up_start_rps > args.ramp_up_end_rps:
raise ValueError("Ramp-up start RPS must be less than end RPS")
if (args.ramp_up_strategy == "exponential"
and args.ramp_up_start_rps == 0):
raise ValueError(
"For exponential ramp-up, the start RPS cannot be 0.")
if args.ramp_up_strategy == "exponential" and args.ramp_up_start_rps == 0:
raise ValueError("For exponential ramp-up, the start RPS cannot be 0.")
label = args.label
model_id = args.model
@@ -1208,17 +1292,19 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
kvstring = item.split("=", 1)
headers[kvstring[0].strip()] = kvstring[1].strip()
else:
raise ValueError(
"Invalid header format. Please use KEY=VALUE format.")
raise ValueError("Invalid header format. Please use KEY=VALUE format.")
tokenizer = get_tokenizer(tokenizer_id,
tokenizer_mode=tokenizer_mode,
trust_remote_code=args.trust_remote_code)
tokenizer = get_tokenizer(
tokenizer_id,
tokenizer_mode=tokenizer_mode,
trust_remote_code=args.trust_remote_code,
)
if args.dataset_name is None:
raise ValueError(
"Please specify '--dataset-name' and the corresponding "
"'--dataset-path' if required.")
"'--dataset-path' if required."
)
# Load the dataset.
input_requests = get_samples(args, tokenizer)
@@ -1235,13 +1321,15 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
"frequency_penalty": args.frequency_penalty,
"presence_penalty": args.presence_penalty,
"repetition_penalty": args.repetition_penalty,
}.items() if v is not None
}.items()
if v is not None
}
# Sampling parameters are only supported by openai-compatible backend.
if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
raise ValueError("Sampling parameters are only supported by "
"openai-compatible backends.")
raise ValueError(
"Sampling parameters are only supported by openai-compatible backends."
)
if "temperature" not in sampling_params:
sampling_params["temperature"] = 0.0 # Default to greedy decoding.
@@ -1264,9 +1352,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
disable_tqdm=args.disable_tqdm,
profile=args.profile,
selected_percentile_metrics=args.percentile_metrics.split(","),
selected_percentiles=[
float(p) for p in args.metric_percentiles.split(",")
],
selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],
ignore_eos=args.ignore_eos,
goodput_config_dict=goodput_config_dict,
max_concurrency=args.max_concurrency,
@@ -1285,7 +1371,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
# Setup
current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
result_json["date"] = current_dt
result_json["endpoint_type"] = args.backend # for backward compatibility
result_json["endpoint_type"] = args.backend # for backward compatibility
result_json["backend"] = args.backend
result_json["label"] = label
result_json["model_id"] = model_id
@@ -1300,11 +1386,13 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
result_json[kvstring[0].strip()] = kvstring[1].strip()
else:
raise ValueError(
"Invalid metadata format. Please use KEY=VALUE format.")
"Invalid metadata format. Please use KEY=VALUE format."
)
# Traffic
result_json["request_rate"] = (args.request_rate if args.request_rate
< float("inf") else "inf")
result_json["request_rate"] = (
args.request_rate if args.request_rate < float("inf") else "inf"
)
result_json["burstiness"] = args.burstiness
result_json["max_concurrency"] = args.max_concurrency
@@ -1319,12 +1407,12 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
if not args.save_detailed:
# Remove fields with too many data points
for field in [
"input_lens",
"output_lens",
"ttfts",
"itls",
"generated_texts",
"errors",
"input_lens",
"output_lens",
"ttfts",
"itls",
"generated_texts",
"errors",
]:
if field in result_json:
del result_json[field]
@@ -1334,8 +1422,11 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
# Save to file
if args.save_result or args.append_result:
base_model_id = model_id.split("/")[-1]
max_concurrency_str = (f"-concurrency{args.max_concurrency}"
if args.max_concurrency is not None else "")
max_concurrency_str = (
f"-concurrency{args.max_concurrency}"
if args.max_concurrency is not None
else ""
)
label = label or args.backend
if args.ramp_up_strategy is not None:
file_name = f"{label}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa
@@ -1346,9 +1437,9 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
if args.result_dir:
os.makedirs(args.result_dir, exist_ok=True)
file_name = os.path.join(args.result_dir, file_name)
with open(file_name,
mode="a+" if args.append_result else "w",
encoding="utf-8") as outfile:
with open(
file_name, mode="a+" if args.append_result else "w", encoding="utf-8"
) as outfile:
# Append a newline.
if args.append_result and outfile.tell() != 0:
outfile.write("\n")
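
One behavioral note from the serve benchmark above: traffic is labeled a "Poisson process" when burstiness == 1.0 and a "Gamma distribution" otherwise. The sampling code itself is outside this excerpt; a plausible sketch, assuming the gamma shape is the burstiness factor and the scale is chosen so the mean delay stays at 1 / request_rate:

import numpy as np

def sample_delay(request_rate: float, burstiness: float) -> float:
    # Assumption: shape = burstiness and scale = 1 / (request_rate * burstiness),
    # so the expected delay is 1 / request_rate regardless of burstiness.
    # With burstiness == 1.0 this reduces to an exponential draw, i.e. Poisson arrivals.
    theta = 1.0 / (request_rate * burstiness)
    return float(np.random.gamma(shape=burstiness, scale=theta))

print([round(sample_delay(request_rate=2.0, burstiness=1.0), 3) for _ in range(5)])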

View File

@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark offline inference throughput."""
import argparse
import dataclasses
import json
@@ -13,18 +14,21 @@ from typing import Any, Optional, Union
import torch
import uvloop
from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
PreTrainedTokenizerBase)
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
from vllm.benchmarks.datasets import (AIMODataset, BurstGPTDataset,
ConversationDataset,
InstructCoderDataset,
PrefixRepetitionRandomDataset,
RandomDataset, SampleRequest,
ShareGPTDataset, SonnetDataset,
VisionArenaDataset)
from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
write_to_json)
from vllm.benchmarks.datasets import (
AIMODataset,
BurstGPTDataset,
ConversationDataset,
InstructCoderDataset,
PrefixRepetitionRandomDataset,
RandomDataset,
SampleRequest,
ShareGPTDataset,
SonnetDataset,
VisionArenaDataset,
)
from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.inputs import TextPrompt, TokensPrompt
from vllm.lora.request import LoRARequest
@@ -41,23 +45,30 @@ def run_vllm(
disable_detokenize: bool = False,
) -> tuple[float, Optional[list[RequestOutput]]]:
from vllm import LLM, SamplingParams
llm = LLM(**dataclasses.asdict(engine_args))
assert all(
llm.llm_engine.model_config.max_model_len >= (
request.prompt_len + request.expected_output_len)
for request in requests), (
"Please ensure that max_model_len is greater than the sum of"
" prompt_len and expected_output_len for all requests.")
llm.llm_engine.model_config.max_model_len
>= (request.prompt_len + request.expected_output_len)
for request in requests
), (
"Please ensure that max_model_len is greater than the sum of"
" prompt_len and expected_output_len for all requests."
)
# Add the requests to the engine.
prompts: list[Union[TextPrompt, TokensPrompt]] = []
sampling_params: list[SamplingParams] = []
for request in requests:
prompts.append(
TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"],
multi_modal_data=request.multi_modal_data)
if "prompt_token_ids" in request.prompt else \
TextPrompt(prompt=request.prompt,
multi_modal_data=request.multi_modal_data))
TokensPrompt(
prompt_token_ids=request.prompt["prompt_token_ids"],
multi_modal_data=request.multi_modal_data,
)
if "prompt_token_ids" in request.prompt
else TextPrompt(
prompt=request.prompt, multi_modal_data=request.multi_modal_data
)
)
sampling_params.append(
SamplingParams(
n=n,
@@ -66,7 +77,8 @@ def run_vllm(
ignore_eos=True,
max_tokens=request.expected_output_len,
detokenize=not disable_detokenize,
))
)
)
lora_requests: Optional[list[LoRARequest]] = None
if engine_args.enable_lora:
lora_requests = [request.lora_request for request in requests]
@@ -78,10 +90,9 @@ def run_vllm(
start = time.perf_counter()
if do_profile:
llm.start_profile()
outputs = llm.generate(prompts,
sampling_params,
lora_request=lora_requests,
use_tqdm=True)
outputs = llm.generate(
prompts, sampling_params, lora_request=lora_requests, use_tqdm=True
)
if do_profile:
llm.stop_profile()
end = time.perf_counter()
@@ -101,7 +112,8 @@ def run_vllm(
beam_width=n,
max_tokens=output_len,
ignore_eos=True,
))
),
)
if do_profile:
llm.stop_profile()
end = time.perf_counter()
@@ -109,25 +121,29 @@ def run_vllm(
def run_vllm_chat(
requests: list[SampleRequest],
n: int,
engine_args: EngineArgs,
do_profile: bool,
disable_detokenize: bool = False) -> tuple[float, list[RequestOutput]]:
requests: list[SampleRequest],
n: int,
engine_args: EngineArgs,
do_profile: bool,
disable_detokenize: bool = False,
) -> tuple[float, list[RequestOutput]]:
"""
Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
multimodal models as it properly handles multimodal inputs and chat
formatting. For non-multimodal models, use run_vllm() instead.
"""
from vllm import LLM, SamplingParams
llm = LLM(**dataclasses.asdict(engine_args))
assert all(
llm.llm_engine.model_config.max_model_len >= (
request.prompt_len + request.expected_output_len)
for request in requests), (
"Please ensure that max_model_len is greater than the sum of "
"prompt_len and expected_output_len for all requests.")
llm.llm_engine.model_config.max_model_len
>= (request.prompt_len + request.expected_output_len)
for request in requests
), (
"Please ensure that max_model_len is greater than the sum of "
"prompt_len and expected_output_len for all requests."
)
prompts = []
sampling_params: list[SamplingParams] = []
@@ -141,7 +157,8 @@ def run_vllm_chat(
ignore_eos=True,
max_tokens=request.expected_output_len,
detokenize=not disable_detokenize,
))
)
)
start = time.perf_counter()
if do_profile:
llm.start_profile()
@@ -162,7 +179,8 @@ async def run_vllm_async(
) -> float:
from vllm import SamplingParams
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args)
build_async_engine_client_from_engine_args,
)
async with build_async_engine_client_from_engine_args(
engine_args,
@@ -170,11 +188,13 @@ async def run_vllm_async(
) as llm:
model_config = await llm.get_model_config()
assert all(
model_config.max_model_len >= (request.prompt_len +
request.expected_output_len)
for request in requests), (
"Please ensure that max_model_len is greater than the sum of"
" prompt_len and expected_output_len for all requests.")
model_config.max_model_len
>= (request.prompt_len + request.expected_output_len)
for request in requests
), (
"Please ensure that max_model_len is greater than the sum of"
" prompt_len and expected_output_len for all requests."
)
# Add the requests to the engine.
prompts: list[Union[TextPrompt, TokensPrompt]] = []
@@ -182,11 +202,15 @@ async def run_vllm_async(
lora_requests: list[Optional[LoRARequest]] = []
for request in requests:
prompts.append(
TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"],
multi_modal_data=request.multi_modal_data)
if "prompt_token_ids" in request.prompt else \
TextPrompt(prompt=request.prompt,
multi_modal_data=request.multi_modal_data))
TokensPrompt(
prompt_token_ids=request.prompt["prompt_token_ids"],
multi_modal_data=request.multi_modal_data,
)
if "prompt_token_ids" in request.prompt
else TextPrompt(
prompt=request.prompt, multi_modal_data=request.multi_modal_data
)
)
sampling_params.append(
SamplingParams(
n=n,
@@ -195,19 +219,18 @@ async def run_vllm_async(
ignore_eos=True,
max_tokens=request.expected_output_len,
detokenize=not disable_detokenize,
))
)
)
lora_requests.append(request.lora_request)
generators = []
start = time.perf_counter()
if do_profile:
await llm.start_profile()
for i, (prompt, sp,
lr) in enumerate(zip(prompts, sampling_params, lora_requests)):
generator = llm.generate(prompt,
sp,
lora_request=lr,
request_id=f"test{i}")
for i, (prompt, sp, lr) in enumerate(
zip(prompts, sampling_params, lora_requests)
):
generator = llm.generate(prompt, sp, lora_request=lr, request_id=f"test{i}")
generators.append(generator)
all_gens = merge_async_iterators(*generators)
async for i, res in all_gens:
@@ -228,7 +251,8 @@ def run_hf(
disable_detokenize: bool = False,
) -> float:
llm = AutoModelForCausalLM.from_pretrained(
        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code
    )
if llm.config.model_type == "llama":
# To enable padding in the HF backend.
tokenizer.pad_token = tokenizer.eos_token
@@ -251,14 +275,15 @@ def run_hf(
# Check if we can add more requests to the batch.
next_prompt_len = requests[i + 1].prompt_len
next_output_len = requests[i + 1].expected_output_len
            if (
                max(max_prompt_len, next_prompt_len)
                + max(max_output_len, next_output_len)
            ) <= 2048:
# We can add more requests to the batch.
continue
# Generate the sequences.
        input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids
llm_outputs = llm.generate(
input_ids=input_ids.cuda(),
do_sample=True,
@@ -281,8 +306,9 @@ def run_hf(
return end - start
def save_to_pytorch_benchmark_format(
    args: argparse.Namespace, results: dict[str, Any]
) -> None:
pt_records = convert_to_pytorch_benchmark_format(
args=args,
metrics={
@@ -290,9 +316,9 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
"tokens_per_second": [results["tokens_per_second"]],
},
extra_info={
            k: results[k] for k in ["elapsed_time", "num_requests", "total_num_tokens"]
        },
    )
if pt_records:
# Don't use json suffix here as we don't want CI to pick it up
pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
@@ -324,7 +350,8 @@ def get_requests(args, tokenizer):
sample_kwargs["enable_multimodal_chat"] = True
elif args.dataset_name == "sonnet":
assert tokenizer.chat_template or tokenizer.default_chat_template, (
"Tokenizer/model must have chat template for sonnet dataset.")
"Tokenizer/model must have chat template for sonnet dataset."
)
dataset_cls = SonnetDataset
sample_kwargs["prefix_len"] = args.prefix_len
sample_kwargs["return_prompt_formatted"] = True
@@ -333,21 +360,21 @@ def get_requests(args, tokenizer):
elif args.dataset_name == "hf":
if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
dataset_cls = VisionArenaDataset
            common_kwargs["dataset_subset"] = None
            common_kwargs["dataset_split"] = "train"
sample_kwargs["enable_multimodal_chat"] = True
elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
dataset_cls = InstructCoderDataset
common_kwargs['dataset_split'] = "train"
common_kwargs["dataset_split"] = "train"
elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
dataset_cls = ConversationDataset
            common_kwargs["dataset_subset"] = args.hf_subset
            common_kwargs["dataset_split"] = args.hf_split
sample_kwargs["enable_multimodal_chat"] = True
elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
dataset_cls = AIMODataset
            common_kwargs["dataset_subset"] = None
            common_kwargs["dataset_split"] = "train"
elif args.dataset_name == "prefix_repetition":
dataset_cls = PrefixRepetitionRandomDataset
sample_kwargs["prefix_len"] = args.prefix_repetition_prefix_len
@@ -373,8 +400,11 @@ def filter_requests_for_dp(requests, data_parallel_size):
global_rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
data_parallel_rank = global_rank // (world_size // data_parallel_size)
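    # Illustrative mapping (hypothetical values, not from the benchmark itself):
    # with WORLD_SIZE=8 and data_parallel_size=2, global ranks 0-3 resolve to
    # data_parallel_rank 0 and keep requests 0, 2, 4, ..., while ranks 4-7
    # resolve to data_parallel_rank 1 and keep requests 1, 3, 5, ...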
    return [
        r
        for i, r in enumerate(requests)
        if i % data_parallel_size == data_parallel_rank
    ]
def validate_args(args):
@@ -387,7 +417,8 @@ def validate_args(args):
warnings.warn(
"The '--dataset' argument will be deprecated in the next release. "
"Please use '--dataset-name' and '--dataset-path' instead.",
            stacklevel=2,
        )
args.dataset_path = args.dataset
if not getattr(args, "tokenizer", None):
@@ -404,9 +435,8 @@ def validate_args(args):
and not args.dataset_path
and args.dataset_name not in {"prefix_repetition"}
):
        print("When dataset path is not set, it will default to random dataset")
        args.dataset_name = "random"
if args.input_len is None:
raise ValueError("input_len must be provided for a random dataset")
@@ -414,41 +444,55 @@ def validate_args(args):
# --hf-subset and --hf-split: only used
# when dataset_name is 'hf'
if args.dataset_name != "hf" and (
getattr(args, "hf_subset", None) is not None
or getattr(args, "hf_split", None) is not None):
warnings.warn("--hf-subset and --hf-split will be ignored \
getattr(args, "hf_subset", None) is not None
or getattr(args, "hf_split", None) is not None
):
warnings.warn(
"--hf-subset and --hf-split will be ignored \
since --dataset-name is not 'hf'.",
stacklevel=2)
stacklevel=2,
)
elif args.dataset_name == "hf":
if args.dataset_path in (
            VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
            | ConversationDataset.SUPPORTED_DATASET_PATHS
        ):
            assert args.backend == "vllm-chat", (
                f"{args.dataset_path} needs to use vllm-chat as the backend."
            )  # noqa: E501
        elif args.dataset_path in (
            InstructCoderDataset.SUPPORTED_DATASET_PATHS
            | AIMODataset.SUPPORTED_DATASET_PATHS
        ):
            assert args.backend == "vllm", (
                f"{args.dataset_path} needs to use vllm as the backend."
            )  # noqa: E501
        else:
            raise ValueError(f"{args.dataset_path} is not supported by hf dataset.")
# --random-range-ratio: only used when dataset_name is 'random'
    if args.dataset_name != "random" and args.random_range_ratio is not None:
        warnings.warn(
            "--random-range-ratio will be ignored since \
                --dataset-name is not 'random'.",
            stacklevel=2,
        )
# --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
# set.
if args.dataset_name not in {"random", "sonnet", None
} and args.prefix_len is not None:
warnings.warn("--prefix-len will be ignored since --dataset-name\
if (
args.dataset_name not in {"random", "sonnet", None}
and args.prefix_len is not None
):
warnings.warn(
"--prefix-len will be ignored since --dataset-name\
is not 'random', 'sonnet', or not set.",
stacklevel=2)
stacklevel=2,
)
# === LoRA Settings ===
if getattr(args, "enable_lora", False) and args.backend != "vllm":
        raise ValueError("LoRA benchmarking is only supported for vLLM backend")
if getattr(args, "enable_lora", False) and args.lora_path is None:
raise ValueError("LoRA path must be provided when enable_lora is True")
@@ -458,8 +502,10 @@ def validate_args(args):
if args.backend != "hf" and args.hf_max_batch_size is not None:
raise ValueError("HF max batch size is only for HF backend.")
if args.backend in {"hf", "mii"} and getattr(args, "quantization",
None) is not None:
if (
args.backend in {"hf", "mii"}
and getattr(args, "quantization", None) is not None
):
raise ValueError("Quantization is only for vLLM backend.")
if args.backend == "mii" and args.dtype != "auto":
@@ -467,12 +513,11 @@ def validate_args(args):
if args.backend == "mii" and args.n != 1:
raise ValueError("n must be 1 for MII backend.")
if args.backend == "mii" and args.tokenizer != args.model:
        raise ValueError("Tokenizer must be the same as the model for MII backend.")
if args.data_parallel_size > 1 and (
args.distributed_executor_backend != "external_launcher"
or args.async_engine):
args.distributed_executor_backend != "external_launcher" or args.async_engine
):
# --data-parallel is not supported fully.
# Old issue: https://github.com/vllm-project/vllm/issues/16222
# Currently we only support data parallel with external launcher
@@ -485,19 +530,19 @@ def validate_args(args):
def add_cli_args(parser: argparse.ArgumentParser):
parser.add_argument("--backend",
type=str,
choices=["vllm", "hf", "mii", "vllm-chat"],
default="vllm")
parser.add_argument(
"--backend",
type=str,
choices=["vllm", "hf", "mii", "vllm-chat"],
default="vllm",
)
parser.add_argument(
"--dataset-name",
type=str,
        choices=["sharegpt", "random", "sonnet", "burstgpt", "hf", "prefix_repetition"],
        help="Name of the dataset to benchmark on.",
        default="sharegpt",
    )
parser.add_argument(
"--dataset",
type=str,
@@ -505,57 +550,70 @@ def add_cli_args(parser: argparse.ArgumentParser):
help="Path to the ShareGPT dataset, will be deprecated in\
the next release. The dataset is expected to "
"be a json in form of list[dict[..., conversations: "
"list[dict[..., value: <prompt_or_response>]]]]")
parser.add_argument("--dataset-path",
type=str,
default=None,
help="Path to the dataset")
parser.add_argument("--input-len",
type=int,
default=None,
help="Input prompt length for each request")
parser.add_argument("--output-len",
type=int,
default=None,
help="Output length for each request. Overrides the "
"output length from the dataset.")
parser.add_argument("--n",
type=int,
default=1,
help="Number of generated sequences per prompt.")
parser.add_argument("--num-prompts",
type=int,
default=1000,
help="Number of prompts to process.")
parser.add_argument("--hf-max-batch-size",
type=int,
default=None,
help="Maximum batch size for HF backend.")
"list[dict[..., value: <prompt_or_response>]]]]",
)
parser.add_argument(
'--output-json',
"--dataset-path", type=str, default=None, help="Path to the dataset"
)
parser.add_argument(
"--input-len",
type=int,
default=None,
help="Input prompt length for each request",
)
parser.add_argument(
"--output-len",
type=int,
default=None,
help="Output length for each request. Overrides the "
"output length from the dataset.",
)
parser.add_argument(
"--n", type=int, default=1, help="Number of generated sequences per prompt."
)
parser.add_argument(
"--num-prompts", type=int, default=1000, help="Number of prompts to process."
)
parser.add_argument(
"--hf-max-batch-size",
type=int,
default=None,
help="Maximum batch size for HF backend.",
)
parser.add_argument(
"--output-json",
type=str,
default=None,
help='Path to save the throughput results in JSON format.')
parser.add_argument("--async-engine",
action='store_true',
default=False,
help="Use vLLM async engine rather than LLM class.")
parser.add_argument("--disable-frontend-multiprocessing",
action='store_true',
default=False,
help="Disable decoupled async engine frontend.")
help="Path to save the throughput results in JSON format.",
)
parser.add_argument(
"--async-engine",
action="store_true",
default=False,
help="Use vLLM async engine rather than LLM class.",
)
parser.add_argument(
"--disable-frontend-multiprocessing",
action="store_true",
default=False,
help="Disable decoupled async engine frontend.",
)
parser.add_argument(
"--disable-detokenize",
action="store_true",
help=("Do not detokenize the response (i.e. do not include "
"detokenization time in the measurement)"))
help=(
"Do not detokenize the response (i.e. do not include "
"detokenization time in the measurement)"
),
)
# LoRA
parser.add_argument(
"--lora-path",
type=str,
default=None,
help="Path to the lora adapters to use. This can be an absolute path, "
"a relative path, or a Hugging Face model identifier.")
"a relative path, or a Hugging Face model identifier.",
)
parser.add_argument(
"--prefix-len",
type=int,
@@ -575,24 +633,24 @@ def add_cli_args(parser: argparse.ArgumentParser):
)
    # hf dataset
parser.add_argument("--hf-subset",
type=str,
default=None,
help="Subset of the HF dataset.")
parser.add_argument("--hf-split",
type=str,
default=None,
help="Split of the HF dataset.")
parser.add_argument(
"--hf-subset", type=str, default=None, help="Subset of the HF dataset."
)
parser.add_argument(
"--hf-split", type=str, default=None, help="Split of the HF dataset."
)
parser.add_argument(
"--profile",
action="store_true",
default=False,
help="Use Torch Profiler. The env variable "
"VLLM_TORCH_PROFILER_DIR must be set to enable profiler.")
"VLLM_TORCH_PROFILER_DIR must be set to enable profiler.",
)
# prefix repetition dataset
prefix_repetition_group = parser.add_argument_group(
"prefix repetition dataset options")
"prefix repetition dataset options"
)
prefix_repetition_group.add_argument(
"--prefix-repetition-prefix-len",
type=int,
@@ -634,10 +692,10 @@ def main(args: argparse.Namespace):
random.seed(args.seed)
# Sample the requests.
tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer, trust_remote_code=args.trust_remote_code
    )
requests = get_requests(args, tokenizer)
    is_multi_modal = any(request.multi_modal_data is not None for request in requests)
request_outputs: Optional[list[RequestOutput]] = None
if args.backend == "vllm":
if args.async_engine:
@@ -649,24 +707,37 @@ def main(args: argparse.Namespace):
disable_frontend_multiprocessing=args.disable_frontend_multiprocessing,
disable_detokenize=args.disable_detokenize,
do_profile=args.profile,
                )
            )
else:
elapsed_time, request_outputs = run_vllm(
                requests,
                args.n,
                EngineArgs.from_cli_args(args),
                disable_detokenize=args.disable_detokenize,
                do_profile=args.profile,
            )
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
if args.profile:
            raise NotImplementedError("Profiling not implemented yet for backend='hf'.")
        elapsed_time = run_hf(
            requests,
            args.model,
            tokenizer,
            args.n,
            args.hf_max_batch_size,
            args.trust_remote_code,
            args.disable_detokenize,
        )
elif args.backend == "vllm-chat":
elapsed_time, request_outputs = run_vllm_chat(
            requests,
            args.n,
            EngineArgs.from_cli_args(args),
            disable_detokenize=args.disable_detokenize,
            do_profile=args.profile,
        )
else:
raise ValueError(f"Unknown backend: {args.backend}")
@@ -678,28 +749,31 @@ def main(args: argparse.Namespace):
for ro in request_outputs:
if not isinstance(ro, RequestOutput):
continue
            total_prompt_tokens += (
                len(ro.prompt_token_ids) if ro.prompt_token_ids else 0
            )
            total_output_tokens += sum(len(o.token_ids) for o in ro.outputs if o)
total_num_tokens = total_prompt_tokens + total_output_tokens
else:
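        # When per-request outputs were not collected (e.g. the HF or
        # async-engine paths), fall back to the expected lengths from the
        # dataset rather than the actual generated token counts.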
        total_num_tokens = sum(r.prompt_len + r.expected_output_len for r in requests)
total_output_tokens = sum(r.expected_output_len for r in requests)
total_prompt_tokens = total_num_tokens - total_output_tokens
if is_multi_modal and args.backend != "vllm-chat":
print("\033[91mWARNING\033[0m: Multi-modal request with "
f"{args.backend} backend detected. The "
"following metrics are not accurate because image tokens are not"
" counted. See vllm-project/vllm/issues/9778 for details.")
print(
"\033[91mWARNING\033[0m: Multi-modal request with "
f"{args.backend} backend detected. The "
"following metrics are not accurate because image tokens are not"
" counted. See vllm-project/vllm/issues/9778 for details."
)
# TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
# vllm-chat backend counts the image tokens now
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
f"{total_output_tokens / elapsed_time:.2f} output tokens/s")
print(
f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
f"{total_output_tokens / elapsed_time:.2f} output tokens/s"
)
print(f"Total num prompt tokens: {total_prompt_tokens}")
print(f"Total num output tokens: {total_output_tokens}")