Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Author: Harry Mellor
Date: 2025-10-05 15:06:22 +01:00
Committed by: GitHub
Parent: 17edd8a807
Commit: d6953beb91
1508 changed files with 115244 additions and 94146 deletions
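
Almost all of the changes below are mechanical reformatting: yapf's parenthesis-aligned continuations are re-indented to ruff's black-compatible hanging indents with trailing commas (or collapsed to one line where the call now fits), single quotes are normalized to double quotes, and isort's wrapped imports are flattened. A minimal before/after sketch of that transformation, using a hypothetical helper rather than any function from the diff:

# Illustrative only: the same call in the old yapf style and in the
# ruff (black-compatible) style this commit converts the tree to.
def write_results(path: str, *, indent: int = 2, sort_keys: bool = False) -> None:
    """Hypothetical stand-in for any multi-argument call touched by this diff."""
    print(path, indent, sort_keys)

# Before (yapf): arguments aligned under the opening parenthesis.
write_results("results.json",
              indent=4,
              sort_keys=True)

# After (ruff format): hanging indent with a trailing comma, or a single
# line if the whole call fits within the line-length limit.
write_results(
    "results.json",
    indent=4,
    sort_keys=True,
)

The first hunk below shows the same effect on imports: the parenthesis-wrapped two-line import from vllm.benchmarks.lib.utils becomes a single line.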


View File

@@ -13,20 +13,20 @@ import numpy as np
from tqdm import tqdm
import vllm.envs as envs
from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
write_to_json)
from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json
from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptType
from vllm.sampling_params import BeamSearchParams
def save_to_pytorch_benchmark_format(args: argparse.Namespace,
results: dict[str, Any]) -> None:
def save_to_pytorch_benchmark_format(
args: argparse.Namespace, results: dict[str, Any]
) -> None:
pt_records = convert_to_pytorch_benchmark_format(
args=args,
metrics={"latency": results["latencies"]},
extra_info={k: results[k]
for k in ["avg_latency", "percentiles"]})
extra_info={k: results[k] for k in ["avg_latency", "percentiles"]},
)
if pt_records:
pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
write_to_json(pt_file, pt_records)
@@ -49,10 +49,9 @@ def add_cli_args(parser: argparse.ArgumentParser):
default=10,
help="Number of iterations to run for warmup.",
)
parser.add_argument("--num-iters",
type=int,
default=30,
help="Number of iterations to run.")
parser.add_argument(
"--num-iters", type=int, default=30, help="Number of iterations to run."
)
parser.add_argument(
"--profile",
action="store_true",
@@ -67,8 +66,10 @@ def add_cli_args(parser: argparse.ArgumentParser):
parser.add_argument(
"--disable-detokenize",
action="store_true",
help=("Do not detokenize responses (i.e. do not include "
"detokenization time in the latency measurement)"),
help=(
"Do not detokenize responses (i.e. do not include "
"detokenization time in the latency measurement)"
),
)
parser = EngineArgs.add_cli_args(parser)
@@ -81,7 +82,8 @@ def main(args: argparse.Namespace):
if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
raise OSError(
"The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
"Please set it to a valid path to use torch profiler.")
"Please set it to a valid path to use torch profiler."
)
engine_args = EngineArgs.from_cli_args(args)
# Lazy import to avoid importing LLM when the bench command is not selected.
@@ -91,9 +93,11 @@ def main(args: argparse.Namespace):
# the engine will automatically process the request in multiple batches.
llm = LLM(**dataclasses.asdict(engine_args))
assert llm.llm_engine.model_config.max_model_len >= (
args.input_len +
args.output_len), ("Please ensure that max_model_len is greater than"
" the sum of input_len and output_len.")
args.input_len + args.output_len
), (
"Please ensure that max_model_len is greater than"
" the sum of input_len and output_len."
)
sampling_params = SamplingParams(
n=args.n,
@@ -103,18 +107,16 @@ def main(args: argparse.Namespace):
max_tokens=args.output_len,
detokenize=not args.disable_detokenize,
)
dummy_prompt_token_ids = np.random.randint(10000,
size=(args.batch_size,
args.input_len))
dummy_prompts: list[PromptType] = [{
"prompt_token_ids": batch
} for batch in dummy_prompt_token_ids.tolist()]
dummy_prompt_token_ids = np.random.randint(
10000, size=(args.batch_size, args.input_len)
)
dummy_prompts: list[PromptType] = [
{"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
]
def llm_generate():
if not args.use_beam_search:
llm.generate(dummy_prompts,
sampling_params=sampling_params,
use_tqdm=False)
llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
else:
llm.beam_search(
dummy_prompts,

View File

@@ -62,6 +62,7 @@ class StreamedResponseHandler:
@dataclass
class RequestFuncInput:
"""The input for the request function."""
prompt: str
api_url: str
prompt_len: int
@@ -80,13 +81,13 @@ class RequestFuncInput:
@dataclass
class RequestFuncOutput:
"""The output of the request function including metrics."""
generated_text: str = ""
success: bool = False
latency: float = 0.0
output_tokens: int = 0
ttft: float = 0.0 # Time to first token
itl: list[float] = field(
default_factory=list) # list of inter-token latencies
itl: list[float] = field(default_factory=list) # list of inter-token latencies
tpot: float = 0.0 # avg next-token latencies
prompt_len: int = 0
error: str = ""
@@ -99,8 +100,7 @@ class RequestFunc(Protocol):
request_func_input: RequestFuncInput,
session: aiohttp.ClientSession,
pbar: Optional[tqdm] = None,
) -> Awaitable[RequestFuncOutput]:
...
) -> Awaitable[RequestFuncOutput]: ...
async def async_request_openai_completions(
@@ -118,13 +118,14 @@ async def async_request_openai_completions(
The output of the request function.
"""
api_url = request_func_input.api_url
assert api_url.endswith(
("completions", "profile")
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
assert api_url.endswith(("completions", "profile")), (
"OpenAI Completions API URL must end with 'completions' or 'profile'."
)
payload = {
"model": request_func_input.model_name
if request_func_input.model_name else request_func_input.model,
if request_func_input.model_name
else request_func_input.model,
"prompt": request_func_input.prompt,
"temperature": 0.0,
"repetition_penalty": 1.0,
@@ -139,9 +140,7 @@ async def async_request_openai_completions(
payload["ignore_eos"] = request_func_input.ignore_eos
if request_func_input.extra_body:
payload.update(request_func_input.extra_body)
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
}
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
if request_func_input.extra_headers:
headers |= request_func_input.extra_headers
if request_func_input.request_id:
@@ -155,8 +154,7 @@ async def async_request_openai_completions(
output.start_time = st
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
async with session.post(url=api_url, json=payload, headers=headers) as response:
if response.status == 200:
first_chunk_received = False
handler = StreamedResponseHandler()
@@ -195,21 +193,20 @@ async def async_request_openai_completions(
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
output.itl.append(timestamp - most_recent_timestamp)
most_recent_timestamp = timestamp
generated_text += text or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
output.output_tokens = usage.get("completion_tokens")
if first_chunk_received:
output.success = True
else:
output.success = False
output.error = (
"Never received a valid chunk to calculate TTFT."
"This response will be marked as failed!")
"This response will be marked as failed!"
)
output.generated_text = generated_text
output.latency = most_recent_timestamp - st
else:
@@ -232,7 +229,8 @@ async def async_request_openai_chat_completions(
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(("chat/completions", "profile")), (
"OpenAI Chat Completions API URL must end with 'chat/completions'.")
"OpenAI Chat Completions API URL must end with 'chat/completions'."
)
content = [{"type": "text", "text": request_func_input.prompt}]
if request_func_input.multi_modal_content:
@@ -243,25 +241,18 @@ async def async_request_openai_chat_completions(
content.append(mm_content)
else:
raise TypeError(
"multi_modal_content must be a dict or list[dict] "
"for openai-chat"
"multi_modal_content must be a dict or list[dict] for openai-chat"
)
payload = {
"model":
request_func_input.model_name
if request_func_input.model_name else request_func_input.model,
"model": request_func_input.model_name
if request_func_input.model_name
else request_func_input.model,
"messages": [
{
"role": "user",
"content": content
},
{"role": "user", "content": content},
],
"temperature":
0.0,
"max_completion_tokens":
request_func_input.output_len,
"stream":
True,
"temperature": 0.0,
"max_completion_tokens": request_func_input.output_len,
"stream": True,
"stream_options": {
"include_usage": True,
},
@@ -288,8 +279,7 @@ async def async_request_openai_chat_completions(
output.start_time = st
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
async with session.post(url=api_url, json=payload, headers=headers) as response:
if response.status == 200:
handler = StreamedResponseHandler()
async for chunk_bytes in response.content.iter_any():
@@ -320,13 +310,11 @@ async def async_request_openai_chat_completions(
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
output.itl.append(timestamp - most_recent_timestamp)
generated_text += content or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
output.output_tokens = usage.get("completion_tokens")
most_recent_timestamp = timestamp
@@ -356,27 +344,22 @@ async def async_request_openai_audio(
api_url = request_func_input.api_url
assert api_url.endswith(("transcriptions", "translations")), (
"OpenAI Chat Completions API URL must end with 'transcriptions' ")
"OpenAI Chat Completions API URL must end with 'transcriptions' "
)
"or `translations`."
content = [{"type": "text", "text": request_func_input.prompt}]
payload = {
"model":
request_func_input.model_name
if request_func_input.model_name else request_func_input.model,
"temperature":
0.0,
"max_completion_tokens":
request_func_input.output_len,
"stream":
True,
"language":
"en",
"model": request_func_input.model_name
if request_func_input.model_name
else request_func_input.model,
"temperature": 0.0,
"max_completion_tokens": request_func_input.output_len,
"stream": True,
"language": "en",
# Flattened due to multipart/form-data
"stream_include_usage":
True,
"stream_continuous_usage_stats":
True,
"stream_include_usage": True,
"stream_continuous_usage_stats": True,
}
if request_func_input.extra_body:
payload.update(request_func_input.extra_body)
@@ -413,9 +396,9 @@ async def async_request_openai_audio(
output.start_time = st
most_recent_timestamp = st
try:
async with session.post(url=api_url,
data=form,
headers=headers) as response:
async with session.post(
url=api_url, data=form, headers=headers
) as response:
if response.status == 200:
handler = StreamedResponseHandler()
@@ -426,15 +409,13 @@ async def async_request_openai_audio(
messages = handler.add_chunk(chunk_bytes)
for message in messages:
chunk = message.decode("utf-8").removeprefix(
"data: ")
chunk = message.decode("utf-8").removeprefix("data: ")
if chunk != "[DONE]":
timestamp = time.perf_counter()
data = json.loads(chunk)
if choices := data.get("choices"):
content = choices[0]["delta"].get(
"content")
content = choices[0]["delta"].get("content")
# First token
if ttft == 0.0:
ttft = timestamp - st
@@ -443,12 +424,14 @@ async def async_request_openai_audio(
# Decoding phase
else:
output.itl.append(
timestamp - most_recent_timestamp)
timestamp - most_recent_timestamp
)
generated_text += content or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
"completion_tokens"
)
most_recent_timestamp = timestamp
@@ -474,9 +457,9 @@ async def async_request_openai_embeddings(
pbar: Optional[tqdm] = None,
):
api_url = request_func_input.api_url
assert api_url.endswith(
"embeddings"
), "OpenAI Embeddings API URL must end with 'embeddings'."
assert api_url.endswith("embeddings"), (
"OpenAI Embeddings API URL must end with 'embeddings'."
)
headers = {
"Content-Type": "application/json",
@@ -492,19 +475,13 @@ async def async_request_openai_embeddings(
st = time.perf_counter()
output.start_time = st
try:
async with session.post(
url=api_url,
headers=headers,
json=payload
) as response:
async with session.post(url=api_url, headers=headers, json=payload) as response:
if response.status == 200:
output.latency = time.perf_counter() - st
data = await response.json()
output.success = True
output.generated_text = ""
output.prompt_len = data.get(
"usage", {}).get(
"prompt_tokens", 0)
output.prompt_len = data.get("usage", {}).get("prompt_tokens", 0)
else:
output.success = False
output.error = response.reason or ""
@@ -527,7 +504,7 @@ ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = {
}
OPENAI_COMPATIBLE_BACKENDS = [
k for k, v in ASYNC_REQUEST_FUNCS.items()
if v in (async_request_openai_completions,
async_request_openai_chat_completions)
k
for k, v in ASYNC_REQUEST_FUNCS.items()
if v in (async_request_openai_completions, async_request_openai_chat_completions)
]
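
For orientation when reading the streaming handlers above: RequestFuncOutput records ttft (time to first token), itl (the list of inter-token latencies), and tpot (average next-token latency). A tiny worked sketch of how those quantities relate, using made-up timestamps that are not part of the diff:

# Hypothetical numbers, purely to illustrate the ttft/itl/tpot fields above.
token_timestamps = [0.50, 0.55, 0.61, 0.66]  # seconds after the request was sent

ttft = token_timestamps[0]  # time to first token: 0.50 s
itl = [b - a for a, b in zip(token_timestamps, token_timestamps[1:])]  # ~[0.05, 0.06, 0.05]
tpot = sum(itl) / len(itl)  # average next-token latency: ~0.053 s
print(ttft, itl, tpot)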

View File

@@ -8,8 +8,7 @@ import time
import aiohttp
from tqdm.asyncio import tqdm
from .endpoint_request_func import (RequestFunc, RequestFuncInput,
RequestFuncOutput)
from .endpoint_request_func import RequestFunc, RequestFuncInput, RequestFuncOutput
async def wait_for_endpoint(
@@ -21,30 +20,29 @@ async def wait_for_endpoint(
) -> RequestFuncOutput:
"""
Wait for an endpoint to become available before starting benchmarks.
Args:
request_func: The async request function to call
test_input: The RequestFuncInput to test with
timeout_seconds: Maximum time to wait in seconds (default: 10 minutes)
retry_interval: Time between retries in seconds (default: 5 seconds)
Returns:
RequestFuncOutput: The successful response
Raises:
ValueError: If the endpoint doesn't become available within the timeout
"""
deadline = time.perf_counter() + timeout_seconds
output = RequestFuncOutput(success=False)
print(f"Waiting for endpoint to become up in {timeout_seconds} seconds")
with tqdm(
total=timeout_seconds,
total=timeout_seconds,
bar_format="{desc} |{bar}| {elapsed} elapsed, {remaining} remaining",
unit="s",
) as pbar:
while True:
while True:
# update progress bar
remaining = deadline - time.perf_counter()
elapsed = timeout_seconds - remaining
@@ -58,16 +56,17 @@ async def wait_for_endpoint(
# ping the endpoint using request_func
try:
output = await request_func(
request_func_input=test_input, session=session)
request_func_input=test_input, session=session
)
if output.success:
pbar.close()
return output
except aiohttp.ClientConnectorError:
pass
# retry after a delay
sleep_duration = min(retry_interval, remaining)
if sleep_duration > 0:
await asyncio.sleep(sleep_duration)
return output
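
The reformatting leaves the ready-check logic itself untouched: keep probing the endpoint with the request function until it answers successfully or the deadline expires, sleeping between attempts. A self-contained sketch of that wait-until-ready pattern (not the vLLM implementation; the probe here is a stand-in, and the real wait_for_endpoint returns a RequestFuncOutput rather than a bool):

import asyncio
import time

async def probe() -> bool:
    """Hypothetical stand-in for awaiting the benchmark's request function."""
    return False  # would flip to True once the server answers successfully

async def wait_until_ready(timeout_seconds: float, retry_interval: float) -> bool:
    # Same loop shape as wait_for_endpoint above: compute a deadline, poll, sleep, repeat.
    deadline = time.perf_counter() + timeout_seconds
    while True:
        if await probe():
            return True
        remaining = deadline - time.perf_counter()
        if remaining <= 0:
            return False
        await asyncio.sleep(min(retry_interval, remaining))

print(asyncio.run(wait_until_ready(timeout_seconds=1.0, retry_interval=0.2)))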

View File

@@ -8,9 +8,9 @@ import os
from typing import Any
def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
metrics: dict[str, list],
extra_info: dict[str, Any]) -> list:
def convert_to_pytorch_benchmark_format(
args: argparse.Namespace, metrics: dict[str, list], extra_info: dict[str, Any]
) -> list:
"""
Save the benchmark results in the format used by PyTorch OSS benchmark with
on metric per record
@@ -38,12 +38,12 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
},
}
tp = record["benchmark"]["extra_info"]["args"].get(
"tensor_parallel_size")
tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size")
# Save tensor_parallel_size parameter if it's part of the metadata
if not tp and "tensor_parallel_size" in extra_info:
record["benchmark"]["extra_info"]["args"][
"tensor_parallel_size"] = extra_info["tensor_parallel_size"]
record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = (
extra_info["tensor_parallel_size"]
)
records.append(record)
@@ -51,7 +51,6 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
class InfEncoder(json.JSONEncoder):
def clear_inf(self, o: Any):
if isinstance(o, dict):
return {

View File

@@ -15,6 +15,7 @@ On the client side, run:
--request-rate <request_rate. Default inf> \
--num-prompts <num_prompts. Default 1000>
"""
import argparse
import asyncio
import gc
@@ -36,20 +37,22 @@ import numpy as np
from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase
from vllm.benchmarks.datasets import (SampleRequest, add_dataset_parser,
get_samples)
from vllm.benchmarks.datasets import SampleRequest, add_dataset_parser, get_samples
from vllm.benchmarks.lib.endpoint_request_func import (
ASYNC_REQUEST_FUNCS, OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput,
RequestFuncOutput)
ASYNC_REQUEST_FUNCS,
OPENAI_COMPATIBLE_BACKENDS,
RequestFuncInput,
RequestFuncOutput,
)
from vllm.benchmarks.lib.ready_checker import wait_for_endpoint
from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
write_to_json)
from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json
from vllm.transformers_utils.tokenizer import get_tokenizer
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
TERM_PLOTLIB_AVAILABLE = ((importlib.util.find_spec("termplotlib") is not None)
and (shutil.which("gnuplot") is not None))
TERM_PLOTLIB_AVAILABLE = (importlib.util.find_spec("termplotlib") is not None) and (
shutil.which("gnuplot") is not None
)
class TaskType(Enum):
@@ -110,8 +113,11 @@ def _get_current_request_rate(
total_requests: int,
request_rate: float,
) -> float:
if (ramp_up_strategy and ramp_up_start_rps is not None
and ramp_up_end_rps is not None):
if (
ramp_up_strategy
and ramp_up_start_rps is not None
and ramp_up_end_rps is not None
):
progress = request_index / max(total_requests - 1, 1)
if ramp_up_strategy == "linear":
increase = (ramp_up_end_rps - ramp_up_start_rps) * progress
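
The linear branch above maps a request's position in the stream to a rate between the start and end RPS. The same arithmetic, reproduced standalone for illustration (the function name is not from the diff):

def linear_ramp_rate(
    request_index: int, total_requests: int, start_rps: float, end_rps: float
) -> float:
    # Mirrors the linear case of _get_current_request_rate above.
    progress = request_index / max(total_requests - 1, 1)
    return start_rps + (end_rps - start_rps) * progress

# Ramping from 1 to 5 RPS over five requests: 1.0, 2.0, 3.0, 4.0, 5.0
print([linear_ramp_rate(i, 5, 1.0, 5.0) for i in range(5)])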
@@ -158,10 +164,10 @@ async def get_request(
The ending request rate for ramp-up.
"""
assert burstiness > 0, (
f"A positive burstiness factor is expected, but given {burstiness}.")
f"A positive burstiness factor is expected, but given {burstiness}."
)
# Convert to list to get length for ramp-up calculations
if isinstance(input_requests,
Iterable) and not isinstance(input_requests, list):
if isinstance(input_requests, Iterable) and not isinstance(input_requests, list):
input_requests = list(input_requests)
total_requests = len(input_requests)
@@ -172,8 +178,13 @@ async def get_request(
delay_ts = []
for request_index, request in enumerate(input_requests):
current_request_rate = _get_current_request_rate(
ramp_up_strategy, ramp_up_start_rps, ramp_up_end_rps,
request_index, total_requests, request_rate)
ramp_up_strategy,
ramp_up_start_rps,
ramp_up_end_rps,
request_index,
total_requests,
request_rate,
)
request_rates.append(current_request_rate)
if current_request_rate == float("inf"):
delay_ts.append(0)
@@ -213,8 +224,8 @@ async def get_request(
def calculate_metrics_for_embeddings(
outputs: list[RequestFuncOutput], dur_s: float,
selected_percentiles: list[float]) -> EmbedBenchmarkMetrics:
outputs: list[RequestFuncOutput], dur_s: float, selected_percentiles: list[float]
) -> EmbedBenchmarkMetrics:
"""Calculate the metrics for the embedding requests.
Args:
@@ -238,7 +249,8 @@ def calculate_metrics_for_embeddings(
warnings.warn(
"All requests failed. This is likely due to a misconfiguration "
"on the benchmark arguments.",
stacklevel=2)
stacklevel=2,
)
metrics = EmbedBenchmarkMetrics(
completed=completed,
total_input=total_input,
@@ -247,8 +259,9 @@ def calculate_metrics_for_embeddings(
mean_e2el_ms=np.mean(e2els or 0) * 1000,
std_e2el_ms=np.std(e2els or 0) * 1000,
median_e2el_ms=np.median(e2els or 0) * 1000,
percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
for p in selected_percentiles],
percentiles_e2el_ms=[
(p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles
],
)
return metrics
@@ -294,8 +307,10 @@ def calculate_metrics(
# bundled together
# Note : this may inflate the output token count slightly
output_len = len(
tokenizer(outputs[i].generated_text,
add_special_tokens=False).input_ids)
tokenizer(
outputs[i].generated_text, add_special_tokens=False
).input_ids
)
actual_output_lens.append(output_len)
total_input += input_requests[i].prompt_len
tpot = 0
@@ -318,16 +333,19 @@ def calculate_metrics(
if "ttft" in goodput_config_dict:
valid_metrics.append(ttfts)
slo_values.append(goodput_config_dict["ttft"] /
MILLISECONDS_TO_SECONDS_CONVERSION)
slo_values.append(
goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION
)
if "tpot" in goodput_config_dict:
valid_metrics.append(all_tpots)
slo_values.append(goodput_config_dict["tpot"] /
MILLISECONDS_TO_SECONDS_CONVERSION)
slo_values.append(
goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION
)
if "e2el" in goodput_config_dict:
valid_metrics.append(e2els)
slo_values.append(goodput_config_dict["e2el"] /
MILLISECONDS_TO_SECONDS_CONVERSION)
slo_values.append(
goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION
)
for req_metric in zip(*valid_metrics):
is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
@@ -338,7 +356,8 @@ def calculate_metrics(
warnings.warn(
"All requests failed. This is likely due to a misconfiguration "
"on the benchmark arguments.",
stacklevel=2)
stacklevel=2,
)
# Calculate max output tokens per second metric
max_output_tokens_per_s = 0.0
@@ -347,10 +366,10 @@ def calculate_metrics(
# Find the time range across all successful requests
successful_outputs = [output for output in outputs if output.success]
if successful_outputs:
min_start_time = min(output.start_time
for output in successful_outputs)
max_end_time = max(output.start_time + output.latency
for output in successful_outputs)
min_start_time = min(output.start_time for output in successful_outputs)
max_end_time = max(
output.start_time + output.latency for output in successful_outputs
)
# Create second buckets (ceiling to ensure we capture all time)
duration_seconds = int(np.ceil(max_end_time - min_start_time)) + 1
@@ -374,8 +393,9 @@ def calculate_metrics(
# Track concurrent requests for each second this request was active
request_start_second = int(output.start_time - min_start_time)
request_end_second = int((output.start_time + output.latency) -
min_start_time)
request_end_second = int(
(output.start_time + output.latency) - min_start_time
)
for second in range(request_start_second, request_end_second + 1):
concurrent_requests_per_second[second] += 1
@@ -384,18 +404,22 @@ def calculate_metrics(
# concurrent requests
if len(tokens_per_second) > 0:
max_output_tokens_per_s = float(np.max(tokens_per_second))
max_concurrent_requests = int(
np.max(concurrent_requests_per_second))
max_concurrent_requests = int(np.max(concurrent_requests_per_second))
if TERM_PLOTLIB_AVAILABLE:
import termplotlib as tpl
fig = tpl.figure()
fig.plot(np.arange(len(tokens_per_second)),
tokens_per_second,
title="Output tokens per second")
fig.plot(np.arange(len(concurrent_requests_per_second)),
concurrent_requests_per_second,
title="Concurrent requests per second")
fig.plot(
np.arange(len(tokens_per_second)),
tokens_per_second,
title="Output tokens per second",
)
fig.plot(
np.arange(len(concurrent_requests_per_second)),
concurrent_requests_per_second,
title="Concurrent requests per second",
)
fig.show()
else:
print("tip: install termplotlib and gnuplot to plot the metrics")
@@ -408,27 +432,31 @@ def calculate_metrics(
request_goodput=good_completed / dur_s,
output_throughput=sum(actual_output_lens) / dur_s,
total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
mean_ttft_ms=np.mean(ttfts or 0) *
1000, # ttfts is empty if streaming is not supported by the endpoint
mean_ttft_ms=np.mean(ttfts or 0)
* 1000, # ttfts is empty if streaming is not supported by the endpoint
std_ttft_ms=np.std(ttfts or 0) * 1000,
median_ttft_ms=np.median(ttfts or 0) * 1000,
percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000)
for p in selected_percentiles],
percentiles_ttft_ms=[
(p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles
],
mean_tpot_ms=np.mean(tpots or 0) * 1000,
std_tpot_ms=np.std(tpots or 0) * 1000,
median_tpot_ms=np.median(tpots or 0) * 1000,
percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000)
for p in selected_percentiles],
percentiles_tpot_ms=[
(p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles
],
mean_itl_ms=np.mean(itls or 0) * 1000,
std_itl_ms=np.std(itls or 0) * 1000,
median_itl_ms=np.median(itls or 0) * 1000,
percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
for p in selected_percentiles],
percentiles_itl_ms=[
(p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles
],
mean_e2el_ms=np.mean(e2els or 0) * 1000,
std_e2el_ms=np.std(e2els or 0) * 1000,
median_e2el_ms=np.median(e2els or 0) * 1000,
percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
for p in selected_percentiles],
percentiles_e2el_ms=[
(p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles
],
max_output_tokens_per_s=max_output_tokens_per_s,
max_concurrent_requests=max_concurrent_requests,
)
@@ -462,8 +490,11 @@ async def benchmark(
ramp_up_end_rps: Optional[int] = None,
ready_check_timeout_sec: int = 600,
):
task_type = (TaskType.EMBEDDING if api_url.endswith("/v1/embeddings") else
TaskType.GENERATION)
task_type = (
TaskType.EMBEDDING
if api_url.endswith("/v1/embeddings")
else TaskType.GENERATION
)
if endpoint_type in ASYNC_REQUEST_FUNCS:
if task_type == TaskType.EMBEDDING:
request_func = ASYNC_REQUEST_FUNCS["openai-embeddings"]
@@ -498,10 +529,14 @@ async def benchmark(
input_requests[0].multi_modal_data,
)
assert (test_mm_content is None or isinstance(test_mm_content, dict)
or (isinstance(test_mm_content, list)
and all(isinstance(item, dict) for item in test_mm_content))
), "multi_modal_data must be a dict or list[dict]"
assert (
test_mm_content is None
or isinstance(test_mm_content, dict)
or (
isinstance(test_mm_content, list)
and all(isinstance(item, dict) for item in test_mm_content)
)
), "multi_modal_data must be a dict or list[dict]"
test_input = RequestFuncInput(
model=model_id,
model_name=model_name,
@@ -527,7 +562,8 @@ async def benchmark(
raise ValueError(
"Initial test run failed - Please make sure benchmark "
"arguments are correctly specified. "
f"Error: {test_output.error}")
f"Error: {test_output.error}"
)
else:
print("Initial test run completed. Starting main benchmark run...")
else:
@@ -536,33 +572,38 @@ async def benchmark(
if lora_modules:
# For each input request, choose a LoRA module at random.
lora_modules = iter(
[random.choice(lora_modules) for _ in range(len(input_requests))])
[random.choice(lora_modules) for _ in range(len(input_requests))]
)
if profile:
print("Starting profiler...")
profile_input = RequestFuncInput(model=model_id,
model_name=model_name,
prompt=test_prompt,
api_url=base_url + "/start_profile",
prompt_len=test_prompt_len,
output_len=test_output_len,
logprobs=logprobs,
multi_modal_content=test_mm_content,
ignore_eos=ignore_eos,
extra_headers=extra_headers,
extra_body=extra_body)
profile_output = await request_func(request_func_input=profile_input,
session=session)
profile_input = RequestFuncInput(
model=model_id,
model_name=model_name,
prompt=test_prompt,
api_url=base_url + "/start_profile",
prompt_len=test_prompt_len,
output_len=test_output_len,
logprobs=logprobs,
multi_modal_content=test_mm_content,
ignore_eos=ignore_eos,
extra_headers=extra_headers,
extra_body=extra_body,
)
profile_output = await request_func(
request_func_input=profile_input, session=session
)
if profile_output.success:
print("Profiler started")
distribution = ("Poisson process"
if burstiness == 1.0 else "Gamma distribution")
distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution"
if ramp_up_strategy is not None:
print(f"Traffic ramp-up strategy: {ramp_up_strategy}.")
print(f"Will increase RPS from {ramp_up_start_rps} to "
f"{ramp_up_end_rps} RPS over the duration of the benchmark.")
print(
f"Will increase RPS from {ramp_up_start_rps} to "
f"{ramp_up_end_rps} RPS over the duration of the benchmark."
)
else:
print(f"Traffic request rate: {request_rate}")
@@ -575,18 +616,17 @@ async def benchmark(
# and it will simplify the code in limited_request_func.
# semaphore = (asyncio.Semaphore(max_concurrency)
# if max_concurrency else contextlib.nullcontext())
semaphore = (asyncio.Semaphore(max_concurrency)
if max_concurrency else None)
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
async def limited_request_func(request_func_input, session, pbar):
if semaphore is None:
return await request_func(request_func_input=request_func_input,
session=session,
pbar=pbar)
return await request_func(
request_func_input=request_func_input, session=session, pbar=pbar
)
async with semaphore:
return await request_func(request_func_input=request_func_input,
session=session,
pbar=pbar)
return await request_func(
request_func_input=request_func_input, session=session, pbar=pbar
)
benchmark_start_time = time.perf_counter()
tasks: list[asyncio.Task] = []
@@ -595,23 +635,27 @@ async def benchmark(
last_int_rps = -1
if ramp_up_strategy is not None and ramp_up_start_rps is not None:
last_int_rps = ramp_up_start_rps
rps_change_events.append({
"rps": last_int_rps,
"timestamp": datetime.now().isoformat(),
})
rps_change_events.append(
{
"rps": last_int_rps,
"timestamp": datetime.now().isoformat(),
}
)
async for request, current_request_rate in get_request(
input_requests, request_rate, burstiness, ramp_up_strategy,
ramp_up_start_rps, ramp_up_end_rps):
input_requests,
request_rate,
burstiness,
ramp_up_strategy,
ramp_up_start_rps,
ramp_up_end_rps,
):
if ramp_up_strategy is not None:
current_int_rps = int(current_request_rate)
if current_int_rps > last_int_rps:
timestamp = datetime.now().isoformat()
for rps_val in range(last_int_rps + 1, current_int_rps + 1):
rps_change_events.append({
"rps": rps_val,
"timestamp": timestamp
})
rps_change_events.append({"rps": rps_val, "timestamp": timestamp})
last_int_rps = current_int_rps
prompt, prompt_len, output_len, mm_content, request_id = (
request.prompt,
@@ -641,9 +685,11 @@ async def benchmark(
)
tasks.append(
asyncio.create_task(
limited_request_func(request_func_input=request_func_input,
session=session,
pbar=pbar)))
limited_request_func(
request_func_input=request_func_input, session=session, pbar=pbar
)
)
)
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
if pbar is not None:
@@ -668,35 +714,48 @@ async def benchmark(
)
actual_output_lens = 0
print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
if max_concurrency is not None:
print("{:<40} {:<10}".format("Maximum request concurrency:",
max_concurrency))
if request_rate != float('inf'):
print("{:<40} {:<10.2f}".format("Request rate configured (RPS):",
request_rate))
print("{:<40} {:<10.2f}".format("Benchmark duration (s):",
benchmark_duration))
print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency))
if request_rate != float("inf"):
print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate))
print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
if isinstance(metrics, BenchmarkMetrics):
print("{:<40} {:<10}".format("Total generated tokens:",
metrics.total_output))
print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
metrics.request_throughput))
print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
print(
"{:<40} {:<10.2f}".format(
"Request throughput (req/s):", metrics.request_throughput
)
)
if goodput_config_dict:
print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
metrics.request_goodput))
print(
"{:<40} {:<10.2f}".format(
"Request goodput (req/s):", metrics.request_goodput
)
)
if isinstance(metrics, BenchmarkMetrics):
print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
metrics.output_throughput))
print("{:<40} {:<10.2f}".format(
"Peak output token throughput (tok/s):",
metrics.max_output_tokens_per_s))
print("{:<40} {:<10.2f}".format("Peak concurrent requests:",
metrics.max_concurrent_requests))
print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
metrics.total_token_throughput))
print(
"{:<40} {:<10.2f}".format(
"Output token throughput (tok/s):", metrics.output_throughput
)
)
print(
"{:<40} {:<10.2f}".format(
"Peak output token throughput (tok/s):", metrics.max_output_tokens_per_s
)
)
print(
"{:<40} {:<10.2f}".format(
"Peak concurrent requests:", metrics.max_concurrent_requests
)
)
print(
"{:<40} {:<10.2f}".format(
"Total Token throughput (tok/s):", metrics.total_token_throughput
)
)
if isinstance(metrics, BenchmarkMetrics):
result = {
@@ -705,8 +764,7 @@ async def benchmark(
"total_input_tokens": metrics.total_input,
"total_output_tokens": metrics.total_output,
"request_throughput": metrics.request_throughput,
"request_goodput":
metrics.request_goodput if goodput_config_dict else None,
"request_goodput": metrics.request_goodput if goodput_config_dict else None,
"output_throughput": metrics.output_throughput,
"total_token_throughput": metrics.total_token_throughput,
"input_lens": [output.prompt_len for output in outputs],
@@ -744,30 +802,36 @@ async def benchmark(
# metric.
if metric_attribute_name not in selected_percentile_metrics:
return
print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
print("{:<40} {:<10.2f}".format(
f"Mean {metric_name} (ms):",
getattr(metrics, f"mean_{metric_attribute_name}_ms")))
print("{:<40} {:<10.2f}".format(
f"Median {metric_name} (ms):",
getattr(metrics, f"median_{metric_attribute_name}_ms")))
print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
print(
"{:<40} {:<10.2f}".format(
f"Mean {metric_name} (ms):",
getattr(metrics, f"mean_{metric_attribute_name}_ms"),
)
)
print(
"{:<40} {:<10.2f}".format(
f"Median {metric_name} (ms):",
getattr(metrics, f"median_{metric_attribute_name}_ms"),
)
)
result[f"mean_{metric_attribute_name}_ms"] = getattr(
metrics, f"mean_{metric_attribute_name}_ms")
metrics, f"mean_{metric_attribute_name}_ms"
)
result[f"median_{metric_attribute_name}_ms"] = getattr(
metrics, f"median_{metric_attribute_name}_ms")
metrics, f"median_{metric_attribute_name}_ms"
)
result[f"std_{metric_attribute_name}_ms"] = getattr(
metrics, f"std_{metric_attribute_name}_ms")
for p, value in getattr(metrics,
f"percentiles_{metric_attribute_name}_ms"):
metrics, f"std_{metric_attribute_name}_ms"
)
for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"):
p_word = str(int(p)) if int(p) == p else str(p)
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):",
value))
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
result[f"p{p_word}_{metric_attribute_name}_ms"] = value
if task_type == TaskType.GENERATION:
process_one_metric("ttft", "TTFT", "Time to First Token")
process_one_metric("tpot", "TPOT",
"Time per Output Token (excl. 1st token)")
process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
process_one_metric("itl", "ITL", "Inter-token Latency")
process_one_metric("e2el", "E2EL", "End-to-end Latency")
@@ -783,8 +847,9 @@ async def benchmark(
output_len=test_output_len,
logprobs=logprobs,
)
profile_output = await request_func(request_func_input=profile_input,
session=session)
profile_output = await request_func(
request_func_input=profile_input, session=session
)
if profile_output.success:
print("Profiler stopped")
@@ -803,12 +868,14 @@ def check_goodput_args(args):
raise ValueError(
f"Invalid metric name found, {slo_name}: {slo_val}. "
"The service level objective name should be one of "
f"{str(VALID_NAMES)}. ")
f"{str(VALID_NAMES)}. "
)
if slo_val < 0:
raise ValueError(
f"Invalid value found, {slo_name}: {slo_val}. "
"The service level objective value should be "
"non-negative.")
"non-negative."
)
return goodput_config_dict
@@ -821,31 +888,42 @@ def parse_goodput(slo_pairs):
except ValueError as err:
raise argparse.ArgumentTypeError(
"Invalid format found for service level objectives. "
"Specify service level objectives for goodput as \"KEY:VALUE\" "
'Specify service level objectives for goodput as "KEY:VALUE" '
"pairs, where the key is a metric name, and the value is a "
"number in milliseconds.") from err
"number in milliseconds."
) from err
return goodput_config_dict
def save_to_pytorch_benchmark_format(args: argparse.Namespace,
results: dict[str, Any],
file_name: str) -> None:
def save_to_pytorch_benchmark_format(
args: argparse.Namespace, results: dict[str, Any], file_name: str
) -> None:
metrics = [
"median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms",
"mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms",
"median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms"
"median_ttft_ms",
"mean_ttft_ms",
"std_ttft_ms",
"p99_ttft_ms",
"mean_tpot_ms",
"median_tpot_ms",
"std_tpot_ms",
"p99_tpot_ms",
"median_itl_ms",
"mean_itl_ms",
"std_itl_ms",
"p99_itl_ms",
]
# These raw data might be useful, but they are rather big. They can be added
# later if needed
ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
pt_records = convert_to_pytorch_benchmark_format(
args=args,
metrics={k: [results[k]]
for k in metrics if k in results},
metrics={k: [results[k]] for k in metrics if k in results},
extra_info={
k: results[k]
for k in results if k not in metrics and k not in ignored_metrics
})
for k in results
if k not in metrics and k not in ignored_metrics
},
)
if pt_records:
# Don't use json suffix here as we don't want CI to pick it up
pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json"
@@ -866,7 +944,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
type=str,
default="openai",
choices=list(ASYNC_REQUEST_FUNCS.keys()),
help="The type of backend or endpoint to use for the benchmark."
help="The type of backend or endpoint to use for the benchmark.",
)
parser.add_argument(
"--base-url",
@@ -888,9 +966,9 @@ def add_cli_args(parser: argparse.ArgumentParser):
metavar="KEY=VALUE",
nargs="*",
help="Key-value pairs (e.g, --header x-additional-info=0.3.3) "
"for headers to be passed with each request. These headers override " \
"per backend constants and values set via environment variable, and " \
"will be overriden by other arguments (such as request ids)."
"for headers to be passed with each request. These headers override "
"per backend constants and values set via environment variable, and "
"will be overriden by other arguments (such as request ids).",
)
parser.add_argument(
"--max-concurrency",
@@ -915,19 +993,20 @@ def add_cli_args(parser: argparse.ArgumentParser):
parser.add_argument(
"--tokenizer",
type=str,
help=
"Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
)
parser.add_argument("--use-beam-search", action="store_true")
parser.add_argument(
"--logprobs",
type=int,
default=None,
help=("Number of logprobs-per-token to compute & return as part of "
"the request. If unspecified, then either (1) if beam search "
"is disabled, no logprobs are computed & a single dummy "
"logprob is returned for each token; or (2) if beam search "
"is enabled 1 logprob per token is computed"),
help=(
"Number of logprobs-per-token to compute & return as part of "
"the request. If unspecified, then either (1) if beam search "
"is disabled, no logprobs are computed & a single dummy "
"logprob is returned for each token; or (2) if beam search "
"is enabled 1 logprob per token is computed"
),
)
parser.add_argument(
"--request-rate",
@@ -1010,32 +1089,34 @@ def add_cli_args(parser: argparse.ArgumentParser):
"--ignore-eos",
action="store_true",
help="Set ignore_eos flag when sending the benchmark request."
"Warning: ignore_eos is not supported in deepspeed_mii and tgi.")
"Warning: ignore_eos is not supported in deepspeed_mii and tgi.",
)
parser.add_argument(
"--percentile-metrics",
type=str,
default="ttft,tpot,itl",
help="Comma-separated list of selected metrics to report percentils. "
"This argument specifies the metrics to report percentiles. "
"Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". ")
'Allowed metric names are "ttft", "tpot", "itl", "e2el". ',
)
parser.add_argument(
"--metric-percentiles",
type=str,
default="99",
help="Comma-separated list of percentiles for selected metrics. "
"To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
"Default value is \"99\"."
"Use \"--percentile-metrics\" to select metrics.",
'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". '
'Default value is "99".'
'Use "--percentile-metrics" to select metrics.',
)
parser.add_argument(
"--goodput",
nargs="+",
required=False,
help="Specify service level objectives for goodput as \"KEY:VALUE\" "
help='Specify service level objectives for goodput as "KEY:VALUE" '
"pairs, where the key is a metric name, and the value is in "
"milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
'milliseconds. Multiple "KEY:VALUE" pairs can be provided, '
"separated by spaces. Allowed request level metric names are "
"\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
'"ttft", "tpot", "e2el". For more context on the definition of '
"goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
"and the blog: https://hao-ai-lab.github.io/blogs/distserve",
)
@@ -1052,22 +1133,19 @@ def add_cli_args(parser: argparse.ArgumentParser):
"--top-p",
type=float,
default=None,
help="Top-p sampling parameter. Only has effect on "
"openai-compatible backends.",
help="Top-p sampling parameter. Only has effect on openai-compatible backends.",
)
sampling_group.add_argument(
"--top-k",
type=int,
default=None,
help="Top-k sampling parameter. Only has effect on "
"openai-compatible backends.",
help="Top-k sampling parameter. Only has effect on openai-compatible backends.",
)
sampling_group.add_argument(
"--min-p",
type=float,
default=None,
help="Min-p sampling parameter. Only has effect on "
"openai-compatible backends.",
help="Min-p sampling parameter. Only has effect on openai-compatible backends.",
)
sampling_group.add_argument(
"--temperature",
@@ -1100,29 +1178,34 @@ def add_cli_args(parser: argparse.ArgumentParser):
)
parser.add_argument(
'--tokenizer-mode',
"--tokenizer-mode",
type=str,
default="auto",
choices=['auto', 'slow', 'mistral', 'custom'],
choices=["auto", "slow", "mistral", "custom"],
help='The tokenizer mode.\n\n* "auto" will use the '
'fast tokenizer if available.\n* "slow" will '
'always use the slow tokenizer. \n* '
"always use the slow tokenizer. \n* "
'"mistral" will always use the `mistral_common` tokenizer. \n*'
'"custom" will use --tokenizer to select the preregistered tokenizer.')
'"custom" will use --tokenizer to select the preregistered tokenizer.',
)
parser.add_argument("--served-model-name",
type=str,
default=None,
help="The model name used in the API. "
"If not specified, the model name will be the "
"same as the ``--model`` argument. ")
parser.add_argument(
"--served-model-name",
type=str,
default=None,
help="The model name used in the API. "
"If not specified, the model name will be the "
"same as the ``--model`` argument. ",
)
parser.add_argument("--lora-modules",
nargs='+',
default=None,
help="A subset of LoRA module names passed in when "
"launching the server. For each request, the "
"script chooses a LoRA module at random.")
parser.add_argument(
"--lora-modules",
nargs="+",
default=None,
help="A subset of LoRA module names passed in when "
"launching the server. For each request, the "
"script chooses a LoRA module at random.",
)
parser.add_argument(
"--ramp-up-strategy",
@@ -1132,7 +1215,8 @@ def add_cli_args(parser: argparse.ArgumentParser):
help="The ramp-up strategy. This would be used to "
"ramp up the request rate from initial RPS to final "
"RPS rate (specified by --ramp-up-start-rps and "
"--ramp-up-end-rps.) over the duration of the benchmark.")
"--ramp-up-end-rps.) over the duration of the benchmark.",
)
parser.add_argument(
"--ramp-up-start-rps",
type=int,
@@ -1153,7 +1237,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
default=600,
help="Maximum time to wait for the endpoint to become ready "
"in seconds (default: 600 seconds / 10 minutes). If set to 0, "
"the ready check will be skipped."
"the ready check will be skipped.",
)
@@ -1172,19 +1256,19 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
raise ValueError(
"When using ramp-up, do not specify --request-rate. "
"The request rate will be controlled by ramp-up parameters. "
"Please remove the --request-rate argument.")
"Please remove the --request-rate argument."
)
if args.ramp_up_start_rps is None or args.ramp_up_end_rps is None:
raise ValueError(
"When using --ramp-up-strategy, both --ramp-up-start-rps and "
"--ramp-up-end-rps must be specified")
"--ramp-up-end-rps must be specified"
)
if args.ramp_up_start_rps < 0 or args.ramp_up_end_rps < 0:
raise ValueError("Ramp-up start and end RPS must be non-negative")
if args.ramp_up_start_rps > args.ramp_up_end_rps:
raise ValueError("Ramp-up start RPS must be less than end RPS")
if (args.ramp_up_strategy == "exponential"
and args.ramp_up_start_rps == 0):
raise ValueError(
"For exponential ramp-up, the start RPS cannot be 0.")
if args.ramp_up_strategy == "exponential" and args.ramp_up_start_rps == 0:
raise ValueError("For exponential ramp-up, the start RPS cannot be 0.")
label = args.label
model_id = args.model
@@ -1208,17 +1292,19 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
kvstring = item.split("=", 1)
headers[kvstring[0].strip()] = kvstring[1].strip()
else:
raise ValueError(
"Invalid header format. Please use KEY=VALUE format.")
raise ValueError("Invalid header format. Please use KEY=VALUE format.")
tokenizer = get_tokenizer(tokenizer_id,
tokenizer_mode=tokenizer_mode,
trust_remote_code=args.trust_remote_code)
tokenizer = get_tokenizer(
tokenizer_id,
tokenizer_mode=tokenizer_mode,
trust_remote_code=args.trust_remote_code,
)
if args.dataset_name is None:
raise ValueError(
"Please specify '--dataset-name' and the corresponding "
"'--dataset-path' if required.")
"'--dataset-path' if required."
)
# Load the dataset.
input_requests = get_samples(args, tokenizer)
@@ -1235,13 +1321,15 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
"frequency_penalty": args.frequency_penalty,
"presence_penalty": args.presence_penalty,
"repetition_penalty": args.repetition_penalty,
}.items() if v is not None
}.items()
if v is not None
}
# Sampling parameters are only supported by openai-compatible backend.
if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
raise ValueError("Sampling parameters are only supported by "
"openai-compatible backends.")
raise ValueError(
"Sampling parameters are only supported by openai-compatible backends."
)
if "temperature" not in sampling_params:
sampling_params["temperature"] = 0.0 # Default to greedy decoding.
@@ -1264,9 +1352,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
disable_tqdm=args.disable_tqdm,
profile=args.profile,
selected_percentile_metrics=args.percentile_metrics.split(","),
selected_percentiles=[
float(p) for p in args.metric_percentiles.split(",")
],
selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],
ignore_eos=args.ignore_eos,
goodput_config_dict=goodput_config_dict,
max_concurrency=args.max_concurrency,
@@ -1285,7 +1371,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
# Setup
current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
result_json["date"] = current_dt
result_json["endpoint_type"] = args.backend # for backward compatibility
result_json["endpoint_type"] = args.backend # for backward compatibility
result_json["backend"] = args.backend
result_json["label"] = label
result_json["model_id"] = model_id
@@ -1300,11 +1386,13 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
result_json[kvstring[0].strip()] = kvstring[1].strip()
else:
raise ValueError(
"Invalid metadata format. Please use KEY=VALUE format.")
"Invalid metadata format. Please use KEY=VALUE format."
)
# Traffic
result_json["request_rate"] = (args.request_rate if args.request_rate
< float("inf") else "inf")
result_json["request_rate"] = (
args.request_rate if args.request_rate < float("inf") else "inf"
)
result_json["burstiness"] = args.burstiness
result_json["max_concurrency"] = args.max_concurrency
@@ -1319,12 +1407,12 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
if not args.save_detailed:
# Remove fields with too many data points
for field in [
"input_lens",
"output_lens",
"ttfts",
"itls",
"generated_texts",
"errors",
"input_lens",
"output_lens",
"ttfts",
"itls",
"generated_texts",
"errors",
]:
if field in result_json:
del result_json[field]
@@ -1334,8 +1422,11 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
# Save to file
if args.save_result or args.append_result:
base_model_id = model_id.split("/")[-1]
max_concurrency_str = (f"-concurrency{args.max_concurrency}"
if args.max_concurrency is not None else "")
max_concurrency_str = (
f"-concurrency{args.max_concurrency}"
if args.max_concurrency is not None
else ""
)
label = label or args.backend
if args.ramp_up_strategy is not None:
file_name = f"{label}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa
@@ -1346,9 +1437,9 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
if args.result_dir:
os.makedirs(args.result_dir, exist_ok=True)
file_name = os.path.join(args.result_dir, file_name)
with open(file_name,
mode="a+" if args.append_result else "w",
encoding="utf-8") as outfile:
with open(
file_name, mode="a+" if args.append_result else "w", encoding="utf-8"
) as outfile:
# Append a newline.
if args.append_result and outfile.tell() != 0:
outfile.write("\n")
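
One behavioral note from the serve benchmark above: traffic is labeled a "Poisson process" when burstiness == 1.0 and a "Gamma distribution" otherwise. The sampling code itself is outside this excerpt; a plausible sketch, assuming the gamma shape is the burstiness factor and the scale is chosen so the mean delay stays at 1 / request_rate:

import numpy as np

def sample_delay(request_rate: float, burstiness: float) -> float:
    # Assumption: shape = burstiness and scale = 1 / (request_rate * burstiness),
    # so the expected delay is 1 / request_rate regardless of burstiness.
    # With burstiness == 1.0 this reduces to an exponential draw, i.e. Poisson arrivals.
    theta = 1.0 / (request_rate * burstiness)
    return float(np.random.gamma(shape=burstiness, scale=theta))

print([round(sample_delay(request_rate=2.0, burstiness=1.0), 3) for _ in range(5)])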

View File

@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark offline inference throughput."""
import argparse
import dataclasses
import json
@@ -13,18 +14,21 @@ from typing import Any, Optional, Union
import torch
import uvloop
from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
PreTrainedTokenizerBase)
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
from vllm.benchmarks.datasets import (AIMODataset, BurstGPTDataset,
ConversationDataset,
InstructCoderDataset,
PrefixRepetitionRandomDataset,
RandomDataset, SampleRequest,
ShareGPTDataset, SonnetDataset,
VisionArenaDataset)
from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
write_to_json)
from vllm.benchmarks.datasets import (
AIMODataset,
BurstGPTDataset,
ConversationDataset,
InstructCoderDataset,
PrefixRepetitionRandomDataset,
RandomDataset,
SampleRequest,
ShareGPTDataset,
SonnetDataset,
VisionArenaDataset,
)
from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.inputs import TextPrompt, TokensPrompt
from vllm.lora.request import LoRARequest
@@ -41,23 +45,30 @@ def run_vllm(
disable_detokenize: bool = False,
) -> tuple[float, Optional[list[RequestOutput]]]:
from vllm import LLM, SamplingParams
llm = LLM(**dataclasses.asdict(engine_args))
assert all(
llm.llm_engine.model_config.max_model_len >= (
request.prompt_len + request.expected_output_len)
for request in requests), (
"Please ensure that max_model_len is greater than the sum of"
" prompt_len and expected_output_len for all requests.")
llm.llm_engine.model_config.max_model_len
>= (request.prompt_len + request.expected_output_len)
for request in requests
), (
"Please ensure that max_model_len is greater than the sum of"
" prompt_len and expected_output_len for all requests."
)
# Add the requests to the engine.
prompts: list[Union[TextPrompt, TokensPrompt]] = []
sampling_params: list[SamplingParams] = []
for request in requests:
prompts.append(
TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"],
multi_modal_data=request.multi_modal_data)
if "prompt_token_ids" in request.prompt else \
TextPrompt(prompt=request.prompt,
multi_modal_data=request.multi_modal_data))
TokensPrompt(
prompt_token_ids=request.prompt["prompt_token_ids"],
multi_modal_data=request.multi_modal_data,
)
if "prompt_token_ids" in request.prompt
else TextPrompt(
prompt=request.prompt, multi_modal_data=request.multi_modal_data
)
)
sampling_params.append(
SamplingParams(
n=n,
@@ -66,7 +77,8 @@ def run_vllm(
ignore_eos=True,
max_tokens=request.expected_output_len,
detokenize=not disable_detokenize,
))
)
)
lora_requests: Optional[list[LoRARequest]] = None
if engine_args.enable_lora:
lora_requests = [request.lora_request for request in requests]
@@ -78,10 +90,9 @@ def run_vllm(
start = time.perf_counter()
if do_profile:
llm.start_profile()
outputs = llm.generate(prompts,
sampling_params,
lora_request=lora_requests,
use_tqdm=True)
outputs = llm.generate(
prompts, sampling_params, lora_request=lora_requests, use_tqdm=True
)
if do_profile:
llm.stop_profile()
end = time.perf_counter()
@@ -101,7 +112,8 @@ def run_vllm(
beam_width=n,
max_tokens=output_len,
ignore_eos=True,
))
),
)
if do_profile:
llm.stop_profile()
end = time.perf_counter()
@@ -109,25 +121,29 @@ def run_vllm(
def run_vllm_chat(
requests: list[SampleRequest],
n: int,
engine_args: EngineArgs,
do_profile: bool,
disable_detokenize: bool = False) -> tuple[float, list[RequestOutput]]:
requests: list[SampleRequest],
n: int,
engine_args: EngineArgs,
do_profile: bool,
disable_detokenize: bool = False,
) -> tuple[float, list[RequestOutput]]:
"""
Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
multimodal models as it properly handles multimodal inputs and chat
formatting. For non-multimodal models, use run_vllm() instead.
"""
from vllm import LLM, SamplingParams
llm = LLM(**dataclasses.asdict(engine_args))
assert all(
llm.llm_engine.model_config.max_model_len >= (
request.prompt_len + request.expected_output_len)
for request in requests), (
"Please ensure that max_model_len is greater than the sum of "
"prompt_len and expected_output_len for all requests.")
llm.llm_engine.model_config.max_model_len
>= (request.prompt_len + request.expected_output_len)
for request in requests
), (
"Please ensure that max_model_len is greater than the sum of "
"prompt_len and expected_output_len for all requests."
)
prompts = []
sampling_params: list[SamplingParams] = []
@@ -141,7 +157,8 @@ def run_vllm_chat(
ignore_eos=True,
max_tokens=request.expected_output_len,
detokenize=not disable_detokenize,
))
)
)
start = time.perf_counter()
if do_profile:
llm.start_profile()
@@ -162,7 +179,8 @@ async def run_vllm_async(
) -> float:
from vllm import SamplingParams
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args)
build_async_engine_client_from_engine_args,
)
async with build_async_engine_client_from_engine_args(
engine_args,
@@ -170,11 +188,13 @@ async def run_vllm_async(
) as llm:
model_config = await llm.get_model_config()
assert all(
model_config.max_model_len >= (request.prompt_len +
request.expected_output_len)
for request in requests), (
"Please ensure that max_model_len is greater than the sum of"
" prompt_len and expected_output_len for all requests.")
model_config.max_model_len
>= (request.prompt_len + request.expected_output_len)
for request in requests
), (
"Please ensure that max_model_len is greater than the sum of"
" prompt_len and expected_output_len for all requests."
)
# Add the requests to the engine.
prompts: list[Union[TextPrompt, TokensPrompt]] = []
@@ -182,11 +202,15 @@ async def run_vllm_async(
lora_requests: list[Optional[LoRARequest]] = []
for request in requests:
prompts.append(
TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"],
multi_modal_data=request.multi_modal_data)
if "prompt_token_ids" in request.prompt else \
TextPrompt(prompt=request.prompt,
multi_modal_data=request.multi_modal_data))
TokensPrompt(
prompt_token_ids=request.prompt["prompt_token_ids"],
multi_modal_data=request.multi_modal_data,
)
if "prompt_token_ids" in request.prompt
else TextPrompt(
prompt=request.prompt, multi_modal_data=request.multi_modal_data
)
)
sampling_params.append(
SamplingParams(
n=n,
@@ -195,19 +219,18 @@ async def run_vllm_async(
ignore_eos=True,
max_tokens=request.expected_output_len,
detokenize=not disable_detokenize,
))
)
)
lora_requests.append(request.lora_request)
generators = []
start = time.perf_counter()
if do_profile:
await llm.start_profile()
for i, (prompt, sp,
lr) in enumerate(zip(prompts, sampling_params, lora_requests)):
generator = llm.generate(prompt,
sp,
lora_request=lr,
request_id=f"test{i}")
for i, (prompt, sp, lr) in enumerate(
zip(prompts, sampling_params, lora_requests)
):
generator = llm.generate(prompt, sp, lora_request=lr, request_id=f"test{i}")
generators.append(generator)
all_gens = merge_async_iterators(*generators)
async for i, res in all_gens:
@@ -228,7 +251,8 @@ def run_hf(
disable_detokenize: bool = False,
) -> float:
llm = AutoModelForCausalLM.from_pretrained(
        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code
    )
if llm.config.model_type == "llama":
# To enable padding in the HF backend.
tokenizer.pad_token = tokenizer.eos_token
@@ -251,14 +275,15 @@ def run_hf(
# Check if we can add more requests to the batch.
next_prompt_len = requests[i + 1].prompt_len
next_output_len = requests[i + 1].expected_output_len
            if (
                max(max_prompt_len, next_prompt_len)
                + max(max_output_len, next_output_len)
            ) <= 2048:
# We can add more requests to the batch.
continue
# Generate the sequences.
        input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids
llm_outputs = llm.generate(
input_ids=input_ids.cuda(),
do_sample=True,
@@ -281,8 +306,9 @@ def run_hf(
return end - start
def save_to_pytorch_benchmark_format(
    args: argparse.Namespace, results: dict[str, Any]
) -> None:
pt_records = convert_to_pytorch_benchmark_format(
args=args,
metrics={
@@ -290,9 +316,9 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
"tokens_per_second": [results["tokens_per_second"]],
},
extra_info={
            k: results[k] for k in ["elapsed_time", "num_requests", "total_num_tokens"]
        },
    )
if pt_records:
# Don't use json suffix here as we don't want CI to pick it up
pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
@@ -324,7 +350,8 @@ def get_requests(args, tokenizer):
sample_kwargs["enable_multimodal_chat"] = True
elif args.dataset_name == "sonnet":
assert tokenizer.chat_template or tokenizer.default_chat_template, (
"Tokenizer/model must have chat template for sonnet dataset.")
"Tokenizer/model must have chat template for sonnet dataset."
)
dataset_cls = SonnetDataset
sample_kwargs["prefix_len"] = args.prefix_len
sample_kwargs["return_prompt_formatted"] = True
@@ -333,21 +360,21 @@ def get_requests(args, tokenizer):
elif args.dataset_name == "hf":
if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
dataset_cls = VisionArenaDataset
            common_kwargs["dataset_subset"] = None
            common_kwargs["dataset_split"] = "train"
sample_kwargs["enable_multimodal_chat"] = True
elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
dataset_cls = InstructCoderDataset
common_kwargs['dataset_split'] = "train"
common_kwargs["dataset_split"] = "train"
elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
dataset_cls = ConversationDataset
            common_kwargs["dataset_subset"] = args.hf_subset
            common_kwargs["dataset_split"] = args.hf_split
sample_kwargs["enable_multimodal_chat"] = True
elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
dataset_cls = AIMODataset
            common_kwargs["dataset_subset"] = None
            common_kwargs["dataset_split"] = "train"
elif args.dataset_name == "prefix_repetition":
dataset_cls = PrefixRepetitionRandomDataset
sample_kwargs["prefix_len"] = args.prefix_repetition_prefix_len
@@ -373,8 +400,11 @@ def filter_requests_for_dp(requests, data_parallel_size):
global_rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
data_parallel_rank = global_rank // (world_size // data_parallel_size)
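    # Illustrative mapping (hypothetical values, not from the benchmark itself):
    # with WORLD_SIZE=8 and data_parallel_size=2, global ranks 0-3 resolve to
    # data_parallel_rank 0 and keep requests 0, 2, 4, ..., while ranks 4-7
    # resolve to data_parallel_rank 1 and keep requests 1, 3, 5, ...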
    return [
        r
        for i, r in enumerate(requests)
        if i % data_parallel_size == data_parallel_rank
    ]
def validate_args(args):
@@ -387,7 +417,8 @@ def validate_args(args):
warnings.warn(
"The '--dataset' argument will be deprecated in the next release. "
"Please use '--dataset-name' and '--dataset-path' instead.",
            stacklevel=2,
        )
args.dataset_path = args.dataset
if not getattr(args, "tokenizer", None):
@@ -404,9 +435,8 @@ def validate_args(args):
and not args.dataset_path
and args.dataset_name not in {"prefix_repetition"}
):
        print("When dataset path is not set, it will default to random dataset")
        args.dataset_name = "random"
if args.input_len is None:
raise ValueError("input_len must be provided for a random dataset")
@@ -414,41 +444,55 @@ def validate_args(args):
# --hf-subset and --hf-split: only used
# when dataset_name is 'hf'
if args.dataset_name != "hf" and (
getattr(args, "hf_subset", None) is not None
or getattr(args, "hf_split", None) is not None):
warnings.warn("--hf-subset and --hf-split will be ignored \
getattr(args, "hf_subset", None) is not None
or getattr(args, "hf_split", None) is not None
):
warnings.warn(
"--hf-subset and --hf-split will be ignored \
since --dataset-name is not 'hf'.",
stacklevel=2)
stacklevel=2,
)
elif args.dataset_name == "hf":
if args.dataset_path in (
            VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
            | ConversationDataset.SUPPORTED_DATASET_PATHS
        ):
            assert args.backend == "vllm-chat", (
                f"{args.dataset_path} needs to use vllm-chat as the backend."
            )  # noqa: E501
        elif args.dataset_path in (
            InstructCoderDataset.SUPPORTED_DATASET_PATHS
            | AIMODataset.SUPPORTED_DATASET_PATHS
        ):
            assert args.backend == "vllm", (
                f"{args.dataset_path} needs to use vllm as the backend."
            )  # noqa: E501
        else:
            raise ValueError(f"{args.dataset_path} is not supported by hf dataset.")
# --random-range-ratio: only used when dataset_name is 'random'
    if args.dataset_name != "random" and args.random_range_ratio is not None:
        warnings.warn(
            "--random-range-ratio will be ignored since \
                --dataset-name is not 'random'.",
            stacklevel=2,
        )
# --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
# set.
if args.dataset_name not in {"random", "sonnet", None
} and args.prefix_len is not None:
warnings.warn("--prefix-len will be ignored since --dataset-name\
if (
args.dataset_name not in {"random", "sonnet", None}
and args.prefix_len is not None
):
warnings.warn(
"--prefix-len will be ignored since --dataset-name\
is not 'random', 'sonnet', or not set.",
stacklevel=2)
stacklevel=2,
)
# === LoRA Settings ===
if getattr(args, "enable_lora", False) and args.backend != "vllm":
        raise ValueError("LoRA benchmarking is only supported for vLLM backend")
if getattr(args, "enable_lora", False) and args.lora_path is None:
raise ValueError("LoRA path must be provided when enable_lora is True")
@@ -458,8 +502,10 @@ def validate_args(args):
if args.backend != "hf" and args.hf_max_batch_size is not None:
raise ValueError("HF max batch size is only for HF backend.")
if args.backend in {"hf", "mii"} and getattr(args, "quantization",
None) is not None:
if (
args.backend in {"hf", "mii"}
and getattr(args, "quantization", None) is not None
):
raise ValueError("Quantization is only for vLLM backend.")
if args.backend == "mii" and args.dtype != "auto":
@@ -467,12 +513,11 @@ def validate_args(args):
if args.backend == "mii" and args.n != 1:
raise ValueError("n must be 1 for MII backend.")
if args.backend == "mii" and args.tokenizer != args.model:
        raise ValueError("Tokenizer must be the same as the model for MII backend.")
if args.data_parallel_size > 1 and (
args.distributed_executor_backend != "external_launcher"
or args.async_engine):
args.distributed_executor_backend != "external_launcher" or args.async_engine
):
# --data-parallel is not supported fully.
# Old issue: https://github.com/vllm-project/vllm/issues/16222
# Currently we only support data parallel with external launcher
@@ -485,19 +530,19 @@ def validate_args(args):
def add_cli_args(parser: argparse.ArgumentParser):
parser.add_argument("--backend",
type=str,
choices=["vllm", "hf", "mii", "vllm-chat"],
default="vllm")
parser.add_argument(
"--backend",
type=str,
choices=["vllm", "hf", "mii", "vllm-chat"],
default="vllm",
)
parser.add_argument(
"--dataset-name",
type=str,
        choices=["sharegpt", "random", "sonnet", "burstgpt", "hf", "prefix_repetition"],
        help="Name of the dataset to benchmark on.",
        default="sharegpt",
    )
parser.add_argument(
"--dataset",
type=str,
@@ -505,57 +550,70 @@ def add_cli_args(parser: argparse.ArgumentParser):
help="Path to the ShareGPT dataset, will be deprecated in\
the next release. The dataset is expected to "
"be a json in form of list[dict[..., conversations: "
"list[dict[..., value: <prompt_or_response>]]]]")
parser.add_argument("--dataset-path",
type=str,
default=None,
help="Path to the dataset")
parser.add_argument("--input-len",
type=int,
default=None,
help="Input prompt length for each request")
parser.add_argument("--output-len",
type=int,
default=None,
help="Output length for each request. Overrides the "
"output length from the dataset.")
parser.add_argument("--n",
type=int,
default=1,
help="Number of generated sequences per prompt.")
parser.add_argument("--num-prompts",
type=int,
default=1000,
help="Number of prompts to process.")
parser.add_argument("--hf-max-batch-size",
type=int,
default=None,
help="Maximum batch size for HF backend.")
"list[dict[..., value: <prompt_or_response>]]]]",
)
parser.add_argument(
'--output-json',
"--dataset-path", type=str, default=None, help="Path to the dataset"
)
parser.add_argument(
"--input-len",
type=int,
default=None,
help="Input prompt length for each request",
)
parser.add_argument(
"--output-len",
type=int,
default=None,
help="Output length for each request. Overrides the "
"output length from the dataset.",
)
parser.add_argument(
"--n", type=int, default=1, help="Number of generated sequences per prompt."
)
parser.add_argument(
"--num-prompts", type=int, default=1000, help="Number of prompts to process."
)
parser.add_argument(
"--hf-max-batch-size",
type=int,
default=None,
help="Maximum batch size for HF backend.",
)
parser.add_argument(
"--output-json",
type=str,
default=None,
help='Path to save the throughput results in JSON format.')
parser.add_argument("--async-engine",
action='store_true',
default=False,
help="Use vLLM async engine rather than LLM class.")
parser.add_argument("--disable-frontend-multiprocessing",
action='store_true',
default=False,
help="Disable decoupled async engine frontend.")
help="Path to save the throughput results in JSON format.",
)
parser.add_argument(
"--async-engine",
action="store_true",
default=False,
help="Use vLLM async engine rather than LLM class.",
)
parser.add_argument(
"--disable-frontend-multiprocessing",
action="store_true",
default=False,
help="Disable decoupled async engine frontend.",
)
parser.add_argument(
"--disable-detokenize",
action="store_true",
help=("Do not detokenize the response (i.e. do not include "
"detokenization time in the measurement)"))
help=(
"Do not detokenize the response (i.e. do not include "
"detokenization time in the measurement)"
),
)
# LoRA
parser.add_argument(
"--lora-path",
type=str,
default=None,
help="Path to the lora adapters to use. This can be an absolute path, "
"a relative path, or a Hugging Face model identifier.")
"a relative path, or a Hugging Face model identifier.",
)
parser.add_argument(
"--prefix-len",
type=int,
@@ -575,24 +633,24 @@ def add_cli_args(parser: argparse.ArgumentParser):
)
    # hf dataset
parser.add_argument("--hf-subset",
type=str,
default=None,
help="Subset of the HF dataset.")
parser.add_argument("--hf-split",
type=str,
default=None,
help="Split of the HF dataset.")
parser.add_argument(
"--hf-subset", type=str, default=None, help="Subset of the HF dataset."
)
parser.add_argument(
"--hf-split", type=str, default=None, help="Split of the HF dataset."
)
parser.add_argument(
"--profile",
action="store_true",
default=False,
help="Use Torch Profiler. The env variable "
"VLLM_TORCH_PROFILER_DIR must be set to enable profiler.")
"VLLM_TORCH_PROFILER_DIR must be set to enable profiler.",
)
# prefix repetition dataset
prefix_repetition_group = parser.add_argument_group(
"prefix repetition dataset options")
"prefix repetition dataset options"
)
prefix_repetition_group.add_argument(
"--prefix-repetition-prefix-len",
type=int,
@@ -634,10 +692,10 @@ def main(args: argparse.Namespace):
random.seed(args.seed)
# Sample the requests.
tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer, trust_remote_code=args.trust_remote_code
    )
requests = get_requests(args, tokenizer)
    is_multi_modal = any(request.multi_modal_data is not None for request in requests)
request_outputs: Optional[list[RequestOutput]] = None
if args.backend == "vllm":
if args.async_engine:
@@ -649,24 +707,37 @@ def main(args: argparse.Namespace):
disable_frontend_multiprocessing=args.disable_frontend_multiprocessing,
disable_detokenize=args.disable_detokenize,
do_profile=args.profile,
                )
            )
else:
elapsed_time, request_outputs = run_vllm(
                requests,
                args.n,
                EngineArgs.from_cli_args(args),
                disable_detokenize=args.disable_detokenize,
                do_profile=args.profile,
            )
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
if args.profile:
            raise NotImplementedError("Profiling not implemented yet for backend='hf'.")
        elapsed_time = run_hf(
            requests,
            args.model,
            tokenizer,
            args.n,
            args.hf_max_batch_size,
            args.trust_remote_code,
            args.disable_detokenize,
        )
elif args.backend == "vllm-chat":
elapsed_time, request_outputs = run_vllm_chat(
            requests,
            args.n,
            EngineArgs.from_cli_args(args),
            disable_detokenize=args.disable_detokenize,
            do_profile=args.profile,
        )
else:
raise ValueError(f"Unknown backend: {args.backend}")
@@ -678,28 +749,31 @@ def main(args: argparse.Namespace):
for ro in request_outputs:
if not isinstance(ro, RequestOutput):
continue
            total_prompt_tokens += (
                len(ro.prompt_token_ids) if ro.prompt_token_ids else 0
            )
            total_output_tokens += sum(len(o.token_ids) for o in ro.outputs if o)
total_num_tokens = total_prompt_tokens + total_output_tokens
else:
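        # When per-request outputs were not collected (e.g. the HF or
        # async-engine paths), fall back to the expected lengths from the
        # dataset rather than the actual generated token counts.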
        total_num_tokens = sum(r.prompt_len + r.expected_output_len for r in requests)
total_output_tokens = sum(r.expected_output_len for r in requests)
total_prompt_tokens = total_num_tokens - total_output_tokens
if is_multi_modal and args.backend != "vllm-chat":
print("\033[91mWARNING\033[0m: Multi-modal request with "
f"{args.backend} backend detected. The "
"following metrics are not accurate because image tokens are not"
" counted. See vllm-project/vllm/issues/9778 for details.")
print(
"\033[91mWARNING\033[0m: Multi-modal request with "
f"{args.backend} backend detected. The "
"following metrics are not accurate because image tokens are not"
" counted. See vllm-project/vllm/issues/9778 for details."
)
# TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
# vllm-chat backend counts the image tokens now
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
f"{total_output_tokens / elapsed_time:.2f} output tokens/s")
print(
f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
f"{total_output_tokens / elapsed_time:.2f} output tokens/s"
)
print(f"Total num prompt tokens: {total_prompt_tokens}")
print(f"Total num output tokens: {total_output_tokens}")