Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
File diff suppressed because it is too large
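The hunks below are formatting-only rewrites produced by switching the repository from yapf + isort to ruff: aligned hanging indents and parenthesized import wrapping are replaced by ruff's Black-style layout, with no behavioral changes. A minimal sketch of the kind of rewrite applied throughout, reconstructed from the first hunk (indentation and the `...` bodies are illustrative placeholders, and the function is defined twice only to show before and after side by side):

import argparse
from typing import Any


# Before (yapf + isort): continuation arguments aligned under the opening parenthesis.
def save_to_pytorch_benchmark_format(args: argparse.Namespace,
                                     results: dict[str, Any]) -> None:
    ...


# After (ruff format): arguments break onto their own indented line, closing parenthesis dedented.
def save_to_pytorch_benchmark_format(
    args: argparse.Namespace, results: dict[str, Any]
) -> None:
    ...


# Imports that fit on one line are also collapsed, e.g. (taken from the first hunk):
# from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
#                                        write_to_json)
# becomes:
# from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json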
@@ -13,20 +13,20 @@ import numpy as np
from tqdm import tqdm

import vllm.envs as envs
from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
write_to_json)
from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json
from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptType
from vllm.sampling_params import BeamSearchParams


def save_to_pytorch_benchmark_format(args: argparse.Namespace,
results: dict[str, Any]) -> None:
def save_to_pytorch_benchmark_format(
args: argparse.Namespace, results: dict[str, Any]
) -> None:
pt_records = convert_to_pytorch_benchmark_format(
args=args,
metrics={"latency": results["latencies"]},
extra_info={k: results[k]
for k in ["avg_latency", "percentiles"]})
extra_info={k: results[k] for k in ["avg_latency", "percentiles"]},
)
if pt_records:
pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
write_to_json(pt_file, pt_records)
@@ -49,10 +49,9 @@ def add_cli_args(parser: argparse.ArgumentParser):
default=10,
help="Number of iterations to run for warmup.",
)
parser.add_argument("--num-iters",
type=int,
default=30,
help="Number of iterations to run.")
parser.add_argument(
"--num-iters", type=int, default=30, help="Number of iterations to run."
)
parser.add_argument(
"--profile",
action="store_true",
@@ -67,8 +66,10 @@ def add_cli_args(parser: argparse.ArgumentParser):
parser.add_argument(
"--disable-detokenize",
action="store_true",
help=("Do not detokenize responses (i.e. do not include "
"detokenization time in the latency measurement)"),
help=(
"Do not detokenize responses (i.e. do not include "
"detokenization time in the latency measurement)"
),
)

parser = EngineArgs.add_cli_args(parser)
@@ -81,7 +82,8 @@ def main(args: argparse.Namespace):
if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
raise OSError(
"The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
"Please set it to a valid path to use torch profiler.")
"Please set it to a valid path to use torch profiler."
)
engine_args = EngineArgs.from_cli_args(args)

# Lazy import to avoid importing LLM when the bench command is not selected.
@@ -91,9 +93,11 @@ def main(args: argparse.Namespace):
# the engine will automatically process the request in multiple batches.
llm = LLM(**dataclasses.asdict(engine_args))
assert llm.llm_engine.model_config.max_model_len >= (
args.input_len +
args.output_len), ("Please ensure that max_model_len is greater than"
" the sum of input_len and output_len.")
args.input_len + args.output_len
), (
"Please ensure that max_model_len is greater than"
" the sum of input_len and output_len."
)

sampling_params = SamplingParams(
n=args.n,
@@ -103,18 +107,16 @@ def main(args: argparse.Namespace):
max_tokens=args.output_len,
detokenize=not args.disable_detokenize,
)
dummy_prompt_token_ids = np.random.randint(10000,
size=(args.batch_size,
args.input_len))
dummy_prompts: list[PromptType] = [{
"prompt_token_ids": batch
} for batch in dummy_prompt_token_ids.tolist()]
dummy_prompt_token_ids = np.random.randint(
10000, size=(args.batch_size, args.input_len)
)
dummy_prompts: list[PromptType] = [
{"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
]

def llm_generate():
if not args.use_beam_search:
llm.generate(dummy_prompts,
sampling_params=sampling_params,
use_tqdm=False)
llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
else:
llm.beam_search(
dummy_prompts,
@@ -62,6 +62,7 @@ class StreamedResponseHandler:
@dataclass
class RequestFuncInput:
"""The input for the request function."""

prompt: str
api_url: str
prompt_len: int
@@ -80,13 +81,13 @@ class RequestFuncInput:
@dataclass
class RequestFuncOutput:
"""The output of the request function including metrics."""

generated_text: str = ""
success: bool = False
latency: float = 0.0
output_tokens: int = 0
ttft: float = 0.0 # Time to first token
itl: list[float] = field(
default_factory=list) # list of inter-token latencies
itl: list[float] = field(default_factory=list) # list of inter-token latencies
tpot: float = 0.0 # avg next-token latencies
prompt_len: int = 0
error: str = ""
@@ -99,8 +100,7 @@ class RequestFunc(Protocol):
request_func_input: RequestFuncInput,
session: aiohttp.ClientSession,
pbar: Optional[tqdm] = None,
) -> Awaitable[RequestFuncOutput]:
...
) -> Awaitable[RequestFuncOutput]: ...


async def async_request_openai_completions(
@@ -118,13 +118,14 @@ async def async_request_openai_completions(
The output of the request function.
"""
api_url = request_func_input.api_url
assert api_url.endswith(
("completions", "profile")
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
assert api_url.endswith(("completions", "profile")), (
"OpenAI Completions API URL must end with 'completions' or 'profile'."
)

payload = {
"model": request_func_input.model_name
if request_func_input.model_name else request_func_input.model,
if request_func_input.model_name
else request_func_input.model,
"prompt": request_func_input.prompt,
"temperature": 0.0,
"repetition_penalty": 1.0,
@@ -139,9 +140,7 @@ async def async_request_openai_completions(
payload["ignore_eos"] = request_func_input.ignore_eos
if request_func_input.extra_body:
payload.update(request_func_input.extra_body)
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
}
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
if request_func_input.extra_headers:
headers |= request_func_input.extra_headers
if request_func_input.request_id:
@@ -155,8 +154,7 @@ async def async_request_openai_completions(
output.start_time = st
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
async with session.post(url=api_url, json=payload, headers=headers) as response:
if response.status == 200:
first_chunk_received = False
handler = StreamedResponseHandler()
@@ -195,21 +193,20 @@ async def async_request_openai_completions(

# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
output.itl.append(timestamp - most_recent_timestamp)

most_recent_timestamp = timestamp
generated_text += text or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
output.output_tokens = usage.get("completion_tokens")
if first_chunk_received:
output.success = True
else:
output.success = False
output.error = (
"Never received a valid chunk to calculate TTFT."
"This response will be marked as failed!")
"This response will be marked as failed!"
)
output.generated_text = generated_text
output.latency = most_recent_timestamp - st
else:
@@ -232,7 +229,8 @@ async def async_request_openai_chat_completions(
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(("chat/completions", "profile")), (
"OpenAI Chat Completions API URL must end with 'chat/completions'.")
"OpenAI Chat Completions API URL must end with 'chat/completions'."
)

content = [{"type": "text", "text": request_func_input.prompt}]
if request_func_input.multi_modal_content:
@@ -243,25 +241,18 @@ async def async_request_openai_chat_completions(
|
||||
content.append(mm_content)
|
||||
else:
|
||||
raise TypeError(
|
||||
"multi_modal_content must be a dict or list[dict] "
|
||||
"for openai-chat"
|
||||
"multi_modal_content must be a dict or list[dict] for openai-chat"
|
||||
)
|
||||
payload = {
|
||||
"model":
|
||||
request_func_input.model_name
|
||||
if request_func_input.model_name else request_func_input.model,
|
||||
"model": request_func_input.model_name
|
||||
if request_func_input.model_name
|
||||
else request_func_input.model,
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": content
|
||||
},
|
||||
{"role": "user", "content": content},
|
||||
],
|
||||
"temperature":
|
||||
0.0,
|
||||
"max_completion_tokens":
|
||||
request_func_input.output_len,
|
||||
"stream":
|
||||
True,
|
||||
"temperature": 0.0,
|
||||
"max_completion_tokens": request_func_input.output_len,
|
||||
"stream": True,
|
||||
"stream_options": {
|
||||
"include_usage": True,
|
||||
},
|
||||
@@ -288,8 +279,7 @@ async def async_request_openai_chat_completions(
|
||||
output.start_time = st
|
||||
most_recent_timestamp = st
|
||||
try:
|
||||
async with session.post(url=api_url, json=payload,
|
||||
headers=headers) as response:
|
||||
async with session.post(url=api_url, json=payload, headers=headers) as response:
|
||||
if response.status == 200:
|
||||
handler = StreamedResponseHandler()
|
||||
async for chunk_bytes in response.content.iter_any():
|
||||
@@ -320,13 +310,11 @@ async def async_request_openai_chat_completions(
|
||||
|
||||
# Decoding phase
|
||||
else:
|
||||
output.itl.append(timestamp -
|
||||
most_recent_timestamp)
|
||||
output.itl.append(timestamp - most_recent_timestamp)
|
||||
|
||||
generated_text += content or ""
|
||||
elif usage := data.get("usage"):
|
||||
output.output_tokens = usage.get(
|
||||
"completion_tokens")
|
||||
output.output_tokens = usage.get("completion_tokens")
|
||||
|
||||
most_recent_timestamp = timestamp
|
||||
|
||||
@@ -356,27 +344,22 @@ async def async_request_openai_audio(
|
||||
|
||||
api_url = request_func_input.api_url
|
||||
assert api_url.endswith(("transcriptions", "translations")), (
|
||||
"OpenAI Chat Completions API URL must end with 'transcriptions' ")
|
||||
"OpenAI Chat Completions API URL must end with 'transcriptions' "
|
||||
)
|
||||
"or `translations`."
|
||||
|
||||
content = [{"type": "text", "text": request_func_input.prompt}]
|
||||
payload = {
|
||||
"model":
|
||||
request_func_input.model_name
|
||||
if request_func_input.model_name else request_func_input.model,
|
||||
"temperature":
|
||||
0.0,
|
||||
"max_completion_tokens":
|
||||
request_func_input.output_len,
|
||||
"stream":
|
||||
True,
|
||||
"language":
|
||||
"en",
|
||||
"model": request_func_input.model_name
|
||||
if request_func_input.model_name
|
||||
else request_func_input.model,
|
||||
"temperature": 0.0,
|
||||
"max_completion_tokens": request_func_input.output_len,
|
||||
"stream": True,
|
||||
"language": "en",
|
||||
# Flattened due to multipart/form-data
|
||||
"stream_include_usage":
|
||||
True,
|
||||
"stream_continuous_usage_stats":
|
||||
True,
|
||||
"stream_include_usage": True,
|
||||
"stream_continuous_usage_stats": True,
|
||||
}
|
||||
if request_func_input.extra_body:
|
||||
payload.update(request_func_input.extra_body)
|
||||
@@ -413,9 +396,9 @@ async def async_request_openai_audio(
|
||||
output.start_time = st
|
||||
most_recent_timestamp = st
|
||||
try:
|
||||
async with session.post(url=api_url,
|
||||
data=form,
|
||||
headers=headers) as response:
|
||||
async with session.post(
|
||||
url=api_url, data=form, headers=headers
|
||||
) as response:
|
||||
if response.status == 200:
|
||||
handler = StreamedResponseHandler()
|
||||
|
||||
@@ -426,15 +409,13 @@ async def async_request_openai_audio(
|
||||
|
||||
messages = handler.add_chunk(chunk_bytes)
|
||||
for message in messages:
|
||||
chunk = message.decode("utf-8").removeprefix(
|
||||
"data: ")
|
||||
chunk = message.decode("utf-8").removeprefix("data: ")
|
||||
if chunk != "[DONE]":
|
||||
timestamp = time.perf_counter()
|
||||
data = json.loads(chunk)
|
||||
|
||||
if choices := data.get("choices"):
|
||||
content = choices[0]["delta"].get(
|
||||
"content")
|
||||
content = choices[0]["delta"].get("content")
|
||||
# First token
|
||||
if ttft == 0.0:
|
||||
ttft = timestamp - st
|
||||
@@ -443,12 +424,14 @@ async def async_request_openai_audio(
|
||||
# Decoding phase
|
||||
else:
|
||||
output.itl.append(
|
||||
timestamp - most_recent_timestamp)
|
||||
timestamp - most_recent_timestamp
|
||||
)
|
||||
|
||||
generated_text += content or ""
|
||||
elif usage := data.get("usage"):
|
||||
output.output_tokens = usage.get(
|
||||
"completion_tokens")
|
||||
"completion_tokens"
|
||||
)
|
||||
|
||||
most_recent_timestamp = timestamp
|
||||
|
||||
@@ -474,9 +457,9 @@ async def async_request_openai_embeddings(
|
||||
pbar: Optional[tqdm] = None,
|
||||
):
|
||||
api_url = request_func_input.api_url
|
||||
assert api_url.endswith(
|
||||
"embeddings"
|
||||
), "OpenAI Embeddings API URL must end with 'embeddings'."
|
||||
assert api_url.endswith("embeddings"), (
|
||||
"OpenAI Embeddings API URL must end with 'embeddings'."
|
||||
)
|
||||
|
||||
headers = {
|
||||
"Content-Type": "application/json",
|
||||
@@ -492,19 +475,13 @@ async def async_request_openai_embeddings(
|
||||
st = time.perf_counter()
|
||||
output.start_time = st
|
||||
try:
|
||||
async with session.post(
|
||||
url=api_url,
|
||||
headers=headers,
|
||||
json=payload
|
||||
) as response:
|
||||
async with session.post(url=api_url, headers=headers, json=payload) as response:
|
||||
if response.status == 200:
|
||||
output.latency = time.perf_counter() - st
|
||||
data = await response.json()
|
||||
output.success = True
|
||||
output.generated_text = ""
|
||||
output.prompt_len = data.get(
|
||||
"usage", {}).get(
|
||||
"prompt_tokens", 0)
|
||||
output.prompt_len = data.get("usage", {}).get("prompt_tokens", 0)
|
||||
else:
|
||||
output.success = False
|
||||
output.error = response.reason or ""
|
||||
@@ -527,7 +504,7 @@ ASYNC_REQUEST_FUNCS: dict[str, RequestFunc] = {
|
||||
}
|
||||
|
||||
OPENAI_COMPATIBLE_BACKENDS = [
|
||||
k for k, v in ASYNC_REQUEST_FUNCS.items()
|
||||
if v in (async_request_openai_completions,
|
||||
async_request_openai_chat_completions)
|
||||
k
|
||||
for k, v in ASYNC_REQUEST_FUNCS.items()
|
||||
if v in (async_request_openai_completions, async_request_openai_chat_completions)
|
||||
]
|
||||
|
||||
@@ -8,8 +8,7 @@ import time
|
||||
import aiohttp
|
||||
from tqdm.asyncio import tqdm
|
||||
|
||||
from .endpoint_request_func import (RequestFunc, RequestFuncInput,
|
||||
RequestFuncOutput)
|
||||
from .endpoint_request_func import RequestFunc, RequestFuncInput, RequestFuncOutput
|
||||
|
||||
|
||||
async def wait_for_endpoint(
|
||||
@@ -21,30 +20,29 @@ async def wait_for_endpoint(
|
||||
) -> RequestFuncOutput:
|
||||
"""
|
||||
Wait for an endpoint to become available before starting benchmarks.
|
||||
|
||||
|
||||
Args:
|
||||
request_func: The async request function to call
|
||||
test_input: The RequestFuncInput to test with
|
||||
timeout_seconds: Maximum time to wait in seconds (default: 10 minutes)
|
||||
retry_interval: Time between retries in seconds (default: 5 seconds)
|
||||
|
||||
|
||||
Returns:
|
||||
RequestFuncOutput: The successful response
|
||||
|
||||
|
||||
Raises:
|
||||
ValueError: If the endpoint doesn't become available within the timeout
|
||||
"""
|
||||
deadline = time.perf_counter() + timeout_seconds
|
||||
output = RequestFuncOutput(success=False)
|
||||
print(f"Waiting for endpoint to become up in {timeout_seconds} seconds")
|
||||
|
||||
|
||||
with tqdm(
|
||||
total=timeout_seconds,
|
||||
total=timeout_seconds,
|
||||
bar_format="{desc} |{bar}| {elapsed} elapsed, {remaining} remaining",
|
||||
unit="s",
|
||||
) as pbar:
|
||||
|
||||
while True:
|
||||
while True:
|
||||
# update progress bar
|
||||
remaining = deadline - time.perf_counter()
|
||||
elapsed = timeout_seconds - remaining
|
||||
@@ -58,16 +56,17 @@ async def wait_for_endpoint(
|
||||
# ping the endpoint using request_func
|
||||
try:
|
||||
output = await request_func(
|
||||
request_func_input=test_input, session=session)
|
||||
request_func_input=test_input, session=session
|
||||
)
|
||||
if output.success:
|
||||
pbar.close()
|
||||
return output
|
||||
except aiohttp.ClientConnectorError:
|
||||
pass
|
||||
|
||||
|
||||
# retry after a delay
|
||||
sleep_duration = min(retry_interval, remaining)
|
||||
if sleep_duration > 0:
|
||||
await asyncio.sleep(sleep_duration)
|
||||
|
||||
|
||||
return output
|
||||
|
||||
@@ -8,9 +8,9 @@ import os
|
||||
from typing import Any
|
||||
|
||||
|
||||
def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
|
||||
metrics: dict[str, list],
|
||||
extra_info: dict[str, Any]) -> list:
|
||||
def convert_to_pytorch_benchmark_format(
|
||||
args: argparse.Namespace, metrics: dict[str, list], extra_info: dict[str, Any]
|
||||
) -> list:
|
||||
"""
|
||||
Save the benchmark results in the format used by PyTorch OSS benchmark with
|
||||
on metric per record
|
||||
@@ -38,12 +38,12 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
|
||||
},
|
||||
}
|
||||
|
||||
tp = record["benchmark"]["extra_info"]["args"].get(
|
||||
"tensor_parallel_size")
|
||||
tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size")
|
||||
# Save tensor_parallel_size parameter if it's part of the metadata
|
||||
if not tp and "tensor_parallel_size" in extra_info:
|
||||
record["benchmark"]["extra_info"]["args"][
|
||||
"tensor_parallel_size"] = extra_info["tensor_parallel_size"]
|
||||
record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = (
|
||||
extra_info["tensor_parallel_size"]
|
||||
)
|
||||
|
||||
records.append(record)
|
||||
|
||||
@@ -51,7 +51,6 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
|
||||
|
||||
|
||||
class InfEncoder(json.JSONEncoder):
|
||||
|
||||
def clear_inf(self, o: Any):
|
||||
if isinstance(o, dict):
|
||||
return {
|
||||
|
||||
@@ -15,6 +15,7 @@ On the client side, run:
|
||||
--request-rate <request_rate. Default inf> \
|
||||
--num-prompts <num_prompts. Default 1000>
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import gc
|
||||
@@ -36,20 +37,22 @@ import numpy as np
|
||||
from tqdm.asyncio import tqdm
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
from vllm.benchmarks.datasets import (SampleRequest, add_dataset_parser,
|
||||
get_samples)
|
||||
from vllm.benchmarks.datasets import SampleRequest, add_dataset_parser, get_samples
|
||||
from vllm.benchmarks.lib.endpoint_request_func import (
|
||||
ASYNC_REQUEST_FUNCS, OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput,
|
||||
RequestFuncOutput)
|
||||
ASYNC_REQUEST_FUNCS,
|
||||
OPENAI_COMPATIBLE_BACKENDS,
|
||||
RequestFuncInput,
|
||||
RequestFuncOutput,
|
||||
)
|
||||
from vllm.benchmarks.lib.ready_checker import wait_for_endpoint
|
||||
from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
|
||||
write_to_json)
|
||||
from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
|
||||
|
||||
TERM_PLOTLIB_AVAILABLE = ((importlib.util.find_spec("termplotlib") is not None)
|
||||
and (shutil.which("gnuplot") is not None))
|
||||
TERM_PLOTLIB_AVAILABLE = (importlib.util.find_spec("termplotlib") is not None) and (
|
||||
shutil.which("gnuplot") is not None
|
||||
)
|
||||
|
||||
|
||||
class TaskType(Enum):
|
||||
@@ -110,8 +113,11 @@ def _get_current_request_rate(
|
||||
total_requests: int,
|
||||
request_rate: float,
|
||||
) -> float:
|
||||
if (ramp_up_strategy and ramp_up_start_rps is not None
|
||||
and ramp_up_end_rps is not None):
|
||||
if (
|
||||
ramp_up_strategy
|
||||
and ramp_up_start_rps is not None
|
||||
and ramp_up_end_rps is not None
|
||||
):
|
||||
progress = request_index / max(total_requests - 1, 1)
|
||||
if ramp_up_strategy == "linear":
|
||||
increase = (ramp_up_end_rps - ramp_up_start_rps) * progress
|
||||
@@ -158,10 +164,10 @@ async def get_request(
|
||||
The ending request rate for ramp-up.
|
||||
"""
|
||||
assert burstiness > 0, (
|
||||
f"A positive burstiness factor is expected, but given {burstiness}.")
|
||||
f"A positive burstiness factor is expected, but given {burstiness}."
|
||||
)
|
||||
# Convert to list to get length for ramp-up calculations
|
||||
if isinstance(input_requests,
|
||||
Iterable) and not isinstance(input_requests, list):
|
||||
if isinstance(input_requests, Iterable) and not isinstance(input_requests, list):
|
||||
input_requests = list(input_requests)
|
||||
|
||||
total_requests = len(input_requests)
|
||||
@@ -172,8 +178,13 @@ async def get_request(
|
||||
delay_ts = []
|
||||
for request_index, request in enumerate(input_requests):
|
||||
current_request_rate = _get_current_request_rate(
|
||||
ramp_up_strategy, ramp_up_start_rps, ramp_up_end_rps,
|
||||
request_index, total_requests, request_rate)
|
||||
ramp_up_strategy,
|
||||
ramp_up_start_rps,
|
||||
ramp_up_end_rps,
|
||||
request_index,
|
||||
total_requests,
|
||||
request_rate,
|
||||
)
|
||||
request_rates.append(current_request_rate)
|
||||
if current_request_rate == float("inf"):
|
||||
delay_ts.append(0)
|
||||
@@ -213,8 +224,8 @@ async def get_request(
|
||||
|
||||
|
||||
def calculate_metrics_for_embeddings(
|
||||
outputs: list[RequestFuncOutput], dur_s: float,
|
||||
selected_percentiles: list[float]) -> EmbedBenchmarkMetrics:
|
||||
outputs: list[RequestFuncOutput], dur_s: float, selected_percentiles: list[float]
|
||||
) -> EmbedBenchmarkMetrics:
|
||||
"""Calculate the metrics for the embedding requests.
|
||||
|
||||
Args:
|
||||
@@ -238,7 +249,8 @@ def calculate_metrics_for_embeddings(
|
||||
warnings.warn(
|
||||
"All requests failed. This is likely due to a misconfiguration "
|
||||
"on the benchmark arguments.",
|
||||
stacklevel=2)
|
||||
stacklevel=2,
|
||||
)
|
||||
metrics = EmbedBenchmarkMetrics(
|
||||
completed=completed,
|
||||
total_input=total_input,
|
||||
@@ -247,8 +259,9 @@ def calculate_metrics_for_embeddings(
|
||||
mean_e2el_ms=np.mean(e2els or 0) * 1000,
|
||||
std_e2el_ms=np.std(e2els or 0) * 1000,
|
||||
median_e2el_ms=np.median(e2els or 0) * 1000,
|
||||
percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
|
||||
for p in selected_percentiles],
|
||||
percentiles_e2el_ms=[
|
||||
(p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles
|
||||
],
|
||||
)
|
||||
return metrics
|
||||
|
||||
@@ -294,8 +307,10 @@ def calculate_metrics(
|
||||
# bundled together
|
||||
# Note : this may inflate the output token count slightly
|
||||
output_len = len(
|
||||
tokenizer(outputs[i].generated_text,
|
||||
add_special_tokens=False).input_ids)
|
||||
tokenizer(
|
||||
outputs[i].generated_text, add_special_tokens=False
|
||||
).input_ids
|
||||
)
|
||||
actual_output_lens.append(output_len)
|
||||
total_input += input_requests[i].prompt_len
|
||||
tpot = 0
|
||||
@@ -318,16 +333,19 @@ def calculate_metrics(
|
||||
|
||||
if "ttft" in goodput_config_dict:
|
||||
valid_metrics.append(ttfts)
|
||||
slo_values.append(goodput_config_dict["ttft"] /
|
||||
MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||
slo_values.append(
|
||||
goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION
|
||||
)
|
||||
if "tpot" in goodput_config_dict:
|
||||
valid_metrics.append(all_tpots)
|
||||
slo_values.append(goodput_config_dict["tpot"] /
|
||||
MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||
slo_values.append(
|
||||
goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION
|
||||
)
|
||||
if "e2el" in goodput_config_dict:
|
||||
valid_metrics.append(e2els)
|
||||
slo_values.append(goodput_config_dict["e2el"] /
|
||||
MILLISECONDS_TO_SECONDS_CONVERSION)
|
||||
slo_values.append(
|
||||
goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION
|
||||
)
|
||||
|
||||
for req_metric in zip(*valid_metrics):
|
||||
is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
|
||||
@@ -338,7 +356,8 @@ def calculate_metrics(
|
||||
warnings.warn(
|
||||
"All requests failed. This is likely due to a misconfiguration "
|
||||
"on the benchmark arguments.",
|
||||
stacklevel=2)
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
# Calculate max output tokens per second metric
|
||||
max_output_tokens_per_s = 0.0
|
||||
@@ -347,10 +366,10 @@ def calculate_metrics(
|
||||
# Find the time range across all successful requests
|
||||
successful_outputs = [output for output in outputs if output.success]
|
||||
if successful_outputs:
|
||||
min_start_time = min(output.start_time
|
||||
for output in successful_outputs)
|
||||
max_end_time = max(output.start_time + output.latency
|
||||
for output in successful_outputs)
|
||||
min_start_time = min(output.start_time for output in successful_outputs)
|
||||
max_end_time = max(
|
||||
output.start_time + output.latency for output in successful_outputs
|
||||
)
|
||||
|
||||
# Create second buckets (ceiling to ensure we capture all time)
|
||||
duration_seconds = int(np.ceil(max_end_time - min_start_time)) + 1
|
||||
@@ -374,8 +393,9 @@ def calculate_metrics(
|
||||
|
||||
# Track concurrent requests for each second this request was active
|
||||
request_start_second = int(output.start_time - min_start_time)
|
||||
request_end_second = int((output.start_time + output.latency) -
|
||||
min_start_time)
|
||||
request_end_second = int(
|
||||
(output.start_time + output.latency) - min_start_time
|
||||
)
|
||||
|
||||
for second in range(request_start_second, request_end_second + 1):
|
||||
concurrent_requests_per_second[second] += 1
|
||||
@@ -384,18 +404,22 @@ def calculate_metrics(
|
||||
# concurrent requests
|
||||
if len(tokens_per_second) > 0:
|
||||
max_output_tokens_per_s = float(np.max(tokens_per_second))
|
||||
max_concurrent_requests = int(
|
||||
np.max(concurrent_requests_per_second))
|
||||
max_concurrent_requests = int(np.max(concurrent_requests_per_second))
|
||||
|
||||
if TERM_PLOTLIB_AVAILABLE:
|
||||
import termplotlib as tpl
|
||||
|
||||
fig = tpl.figure()
|
||||
fig.plot(np.arange(len(tokens_per_second)),
|
||||
tokens_per_second,
|
||||
title="Output tokens per second")
|
||||
fig.plot(np.arange(len(concurrent_requests_per_second)),
|
||||
concurrent_requests_per_second,
|
||||
title="Concurrent requests per second")
|
||||
fig.plot(
|
||||
np.arange(len(tokens_per_second)),
|
||||
tokens_per_second,
|
||||
title="Output tokens per second",
|
||||
)
|
||||
fig.plot(
|
||||
np.arange(len(concurrent_requests_per_second)),
|
||||
concurrent_requests_per_second,
|
||||
title="Concurrent requests per second",
|
||||
)
|
||||
fig.show()
|
||||
else:
|
||||
print("tip: install termplotlib and gnuplot to plot the metrics")
|
||||
@@ -408,27 +432,31 @@ def calculate_metrics(
|
||||
request_goodput=good_completed / dur_s,
|
||||
output_throughput=sum(actual_output_lens) / dur_s,
|
||||
total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
|
||||
mean_ttft_ms=np.mean(ttfts or 0) *
|
||||
1000, # ttfts is empty if streaming is not supported by the endpoint
|
||||
mean_ttft_ms=np.mean(ttfts or 0)
|
||||
* 1000, # ttfts is empty if streaming is not supported by the endpoint
|
||||
std_ttft_ms=np.std(ttfts or 0) * 1000,
|
||||
median_ttft_ms=np.median(ttfts or 0) * 1000,
|
||||
percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000)
|
||||
for p in selected_percentiles],
|
||||
percentiles_ttft_ms=[
|
||||
(p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles
|
||||
],
|
||||
mean_tpot_ms=np.mean(tpots or 0) * 1000,
|
||||
std_tpot_ms=np.std(tpots or 0) * 1000,
|
||||
median_tpot_ms=np.median(tpots or 0) * 1000,
|
||||
percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000)
|
||||
for p in selected_percentiles],
|
||||
percentiles_tpot_ms=[
|
||||
(p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles
|
||||
],
|
||||
mean_itl_ms=np.mean(itls or 0) * 1000,
|
||||
std_itl_ms=np.std(itls or 0) * 1000,
|
||||
median_itl_ms=np.median(itls or 0) * 1000,
|
||||
percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
|
||||
for p in selected_percentiles],
|
||||
percentiles_itl_ms=[
|
||||
(p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles
|
||||
],
|
||||
mean_e2el_ms=np.mean(e2els or 0) * 1000,
|
||||
std_e2el_ms=np.std(e2els or 0) * 1000,
|
||||
median_e2el_ms=np.median(e2els or 0) * 1000,
|
||||
percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
|
||||
for p in selected_percentiles],
|
||||
percentiles_e2el_ms=[
|
||||
(p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles
|
||||
],
|
||||
max_output_tokens_per_s=max_output_tokens_per_s,
|
||||
max_concurrent_requests=max_concurrent_requests,
|
||||
)
|
||||
@@ -462,8 +490,11 @@ async def benchmark(
|
||||
ramp_up_end_rps: Optional[int] = None,
|
||||
ready_check_timeout_sec: int = 600,
|
||||
):
|
||||
task_type = (TaskType.EMBEDDING if api_url.endswith("/v1/embeddings") else
|
||||
TaskType.GENERATION)
|
||||
task_type = (
|
||||
TaskType.EMBEDDING
|
||||
if api_url.endswith("/v1/embeddings")
|
||||
else TaskType.GENERATION
|
||||
)
|
||||
if endpoint_type in ASYNC_REQUEST_FUNCS:
|
||||
if task_type == TaskType.EMBEDDING:
|
||||
request_func = ASYNC_REQUEST_FUNCS["openai-embeddings"]
|
||||
@@ -498,10 +529,14 @@ async def benchmark(
|
||||
input_requests[0].multi_modal_data,
|
||||
)
|
||||
|
||||
assert (test_mm_content is None or isinstance(test_mm_content, dict)
|
||||
or (isinstance(test_mm_content, list)
|
||||
and all(isinstance(item, dict) for item in test_mm_content))
|
||||
), "multi_modal_data must be a dict or list[dict]"
|
||||
assert (
|
||||
test_mm_content is None
|
||||
or isinstance(test_mm_content, dict)
|
||||
or (
|
||||
isinstance(test_mm_content, list)
|
||||
and all(isinstance(item, dict) for item in test_mm_content)
|
||||
)
|
||||
), "multi_modal_data must be a dict or list[dict]"
|
||||
test_input = RequestFuncInput(
|
||||
model=model_id,
|
||||
model_name=model_name,
|
||||
@@ -527,7 +562,8 @@ async def benchmark(
|
||||
raise ValueError(
|
||||
"Initial test run failed - Please make sure benchmark "
|
||||
"arguments are correctly specified. "
|
||||
f"Error: {test_output.error}")
|
||||
f"Error: {test_output.error}"
|
||||
)
|
||||
else:
|
||||
print("Initial test run completed. Starting main benchmark run...")
|
||||
else:
|
||||
@@ -536,33 +572,38 @@ async def benchmark(
|
||||
if lora_modules:
|
||||
# For each input request, choose a LoRA module at random.
|
||||
lora_modules = iter(
|
||||
[random.choice(lora_modules) for _ in range(len(input_requests))])
|
||||
[random.choice(lora_modules) for _ in range(len(input_requests))]
|
||||
)
|
||||
|
||||
if profile:
|
||||
print("Starting profiler...")
|
||||
profile_input = RequestFuncInput(model=model_id,
|
||||
model_name=model_name,
|
||||
prompt=test_prompt,
|
||||
api_url=base_url + "/start_profile",
|
||||
prompt_len=test_prompt_len,
|
||||
output_len=test_output_len,
|
||||
logprobs=logprobs,
|
||||
multi_modal_content=test_mm_content,
|
||||
ignore_eos=ignore_eos,
|
||||
extra_headers=extra_headers,
|
||||
extra_body=extra_body)
|
||||
profile_output = await request_func(request_func_input=profile_input,
|
||||
session=session)
|
||||
profile_input = RequestFuncInput(
|
||||
model=model_id,
|
||||
model_name=model_name,
|
||||
prompt=test_prompt,
|
||||
api_url=base_url + "/start_profile",
|
||||
prompt_len=test_prompt_len,
|
||||
output_len=test_output_len,
|
||||
logprobs=logprobs,
|
||||
multi_modal_content=test_mm_content,
|
||||
ignore_eos=ignore_eos,
|
||||
extra_headers=extra_headers,
|
||||
extra_body=extra_body,
|
||||
)
|
||||
profile_output = await request_func(
|
||||
request_func_input=profile_input, session=session
|
||||
)
|
||||
if profile_output.success:
|
||||
print("Profiler started")
|
||||
|
||||
distribution = ("Poisson process"
|
||||
if burstiness == 1.0 else "Gamma distribution")
|
||||
distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution"
|
||||
|
||||
if ramp_up_strategy is not None:
|
||||
print(f"Traffic ramp-up strategy: {ramp_up_strategy}.")
|
||||
print(f"Will increase RPS from {ramp_up_start_rps} to "
|
||||
f"{ramp_up_end_rps} RPS over the duration of the benchmark.")
|
||||
print(
|
||||
f"Will increase RPS from {ramp_up_start_rps} to "
|
||||
f"{ramp_up_end_rps} RPS over the duration of the benchmark."
|
||||
)
|
||||
else:
|
||||
print(f"Traffic request rate: {request_rate}")
|
||||
|
||||
@@ -575,18 +616,17 @@ async def benchmark(
|
||||
# and it will simplify the code in limited_request_func.
|
||||
# semaphore = (asyncio.Semaphore(max_concurrency)
|
||||
# if max_concurrency else contextlib.nullcontext())
|
||||
semaphore = (asyncio.Semaphore(max_concurrency)
|
||||
if max_concurrency else None)
|
||||
semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
|
||||
|
||||
async def limited_request_func(request_func_input, session, pbar):
|
||||
if semaphore is None:
|
||||
return await request_func(request_func_input=request_func_input,
|
||||
session=session,
|
||||
pbar=pbar)
|
||||
return await request_func(
|
||||
request_func_input=request_func_input, session=session, pbar=pbar
|
||||
)
|
||||
async with semaphore:
|
||||
return await request_func(request_func_input=request_func_input,
|
||||
session=session,
|
||||
pbar=pbar)
|
||||
return await request_func(
|
||||
request_func_input=request_func_input, session=session, pbar=pbar
|
||||
)
|
||||
|
||||
benchmark_start_time = time.perf_counter()
|
||||
tasks: list[asyncio.Task] = []
|
||||
@@ -595,23 +635,27 @@ async def benchmark(
|
||||
last_int_rps = -1
|
||||
if ramp_up_strategy is not None and ramp_up_start_rps is not None:
|
||||
last_int_rps = ramp_up_start_rps
|
||||
rps_change_events.append({
|
||||
"rps": last_int_rps,
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
})
|
||||
rps_change_events.append(
|
||||
{
|
||||
"rps": last_int_rps,
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
}
|
||||
)
|
||||
|
||||
async for request, current_request_rate in get_request(
|
||||
input_requests, request_rate, burstiness, ramp_up_strategy,
|
||||
ramp_up_start_rps, ramp_up_end_rps):
|
||||
input_requests,
|
||||
request_rate,
|
||||
burstiness,
|
||||
ramp_up_strategy,
|
||||
ramp_up_start_rps,
|
||||
ramp_up_end_rps,
|
||||
):
|
||||
if ramp_up_strategy is not None:
|
||||
current_int_rps = int(current_request_rate)
|
||||
if current_int_rps > last_int_rps:
|
||||
timestamp = datetime.now().isoformat()
|
||||
for rps_val in range(last_int_rps + 1, current_int_rps + 1):
|
||||
rps_change_events.append({
|
||||
"rps": rps_val,
|
||||
"timestamp": timestamp
|
||||
})
|
||||
rps_change_events.append({"rps": rps_val, "timestamp": timestamp})
|
||||
last_int_rps = current_int_rps
|
||||
prompt, prompt_len, output_len, mm_content, request_id = (
|
||||
request.prompt,
|
||||
@@ -641,9 +685,11 @@ async def benchmark(
|
||||
)
|
||||
tasks.append(
|
||||
asyncio.create_task(
|
||||
limited_request_func(request_func_input=request_func_input,
|
||||
session=session,
|
||||
pbar=pbar)))
|
||||
limited_request_func(
|
||||
request_func_input=request_func_input, session=session, pbar=pbar
|
||||
)
|
||||
)
|
||||
)
|
||||
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
|
||||
|
||||
if pbar is not None:
|
||||
@@ -668,35 +714,48 @@ async def benchmark(
|
||||
)
|
||||
actual_output_lens = 0
|
||||
|
||||
print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
|
||||
print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
|
||||
print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
|
||||
if max_concurrency is not None:
|
||||
print("{:<40} {:<10}".format("Maximum request concurrency:",
|
||||
max_concurrency))
|
||||
if request_rate != float('inf'):
|
||||
print("{:<40} {:<10.2f}".format("Request rate configured (RPS):",
|
||||
request_rate))
|
||||
print("{:<40} {:<10.2f}".format("Benchmark duration (s):",
|
||||
benchmark_duration))
|
||||
print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency))
|
||||
if request_rate != float("inf"):
|
||||
print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate))
|
||||
print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
|
||||
print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
|
||||
if isinstance(metrics, BenchmarkMetrics):
|
||||
print("{:<40} {:<10}".format("Total generated tokens:",
|
||||
metrics.total_output))
|
||||
print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
|
||||
metrics.request_throughput))
|
||||
print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
"Request throughput (req/s):", metrics.request_throughput
|
||||
)
|
||||
)
|
||||
if goodput_config_dict:
|
||||
print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
|
||||
metrics.request_goodput))
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
"Request goodput (req/s):", metrics.request_goodput
|
||||
)
|
||||
)
|
||||
if isinstance(metrics, BenchmarkMetrics):
|
||||
print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
|
||||
metrics.output_throughput))
|
||||
print("{:<40} {:<10.2f}".format(
|
||||
"Peak output token throughput (tok/s):",
|
||||
metrics.max_output_tokens_per_s))
|
||||
print("{:<40} {:<10.2f}".format("Peak concurrent requests:",
|
||||
metrics.max_concurrent_requests))
|
||||
print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
|
||||
metrics.total_token_throughput))
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
"Output token throughput (tok/s):", metrics.output_throughput
|
||||
)
|
||||
)
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
"Peak output token throughput (tok/s):", metrics.max_output_tokens_per_s
|
||||
)
|
||||
)
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
"Peak concurrent requests:", metrics.max_concurrent_requests
|
||||
)
|
||||
)
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
"Total Token throughput (tok/s):", metrics.total_token_throughput
|
||||
)
|
||||
)
|
||||
|
||||
if isinstance(metrics, BenchmarkMetrics):
|
||||
result = {
|
||||
@@ -705,8 +764,7 @@ async def benchmark(
|
||||
"total_input_tokens": metrics.total_input,
|
||||
"total_output_tokens": metrics.total_output,
|
||||
"request_throughput": metrics.request_throughput,
|
||||
"request_goodput":
|
||||
metrics.request_goodput if goodput_config_dict else None,
|
||||
"request_goodput": metrics.request_goodput if goodput_config_dict else None,
|
||||
"output_throughput": metrics.output_throughput,
|
||||
"total_token_throughput": metrics.total_token_throughput,
|
||||
"input_lens": [output.prompt_len for output in outputs],
|
||||
@@ -744,30 +802,36 @@ async def benchmark(
|
||||
# metric.
|
||||
if metric_attribute_name not in selected_percentile_metrics:
|
||||
return
|
||||
print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
|
||||
print("{:<40} {:<10.2f}".format(
|
||||
f"Mean {metric_name} (ms):",
|
||||
getattr(metrics, f"mean_{metric_attribute_name}_ms")))
|
||||
print("{:<40} {:<10.2f}".format(
|
||||
f"Median {metric_name} (ms):",
|
||||
getattr(metrics, f"median_{metric_attribute_name}_ms")))
|
||||
print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
f"Mean {metric_name} (ms):",
|
||||
getattr(metrics, f"mean_{metric_attribute_name}_ms"),
|
||||
)
|
||||
)
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
f"Median {metric_name} (ms):",
|
||||
getattr(metrics, f"median_{metric_attribute_name}_ms"),
|
||||
)
|
||||
)
|
||||
result[f"mean_{metric_attribute_name}_ms"] = getattr(
|
||||
metrics, f"mean_{metric_attribute_name}_ms")
|
||||
metrics, f"mean_{metric_attribute_name}_ms"
|
||||
)
|
||||
result[f"median_{metric_attribute_name}_ms"] = getattr(
|
||||
metrics, f"median_{metric_attribute_name}_ms")
|
||||
metrics, f"median_{metric_attribute_name}_ms"
|
||||
)
|
||||
result[f"std_{metric_attribute_name}_ms"] = getattr(
|
||||
metrics, f"std_{metric_attribute_name}_ms")
|
||||
for p, value in getattr(metrics,
|
||||
f"percentiles_{metric_attribute_name}_ms"):
|
||||
metrics, f"std_{metric_attribute_name}_ms"
|
||||
)
|
||||
for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"):
|
||||
p_word = str(int(p)) if int(p) == p else str(p)
|
||||
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):",
|
||||
value))
|
||||
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
|
||||
result[f"p{p_word}_{metric_attribute_name}_ms"] = value
|
||||
|
||||
if task_type == TaskType.GENERATION:
|
||||
process_one_metric("ttft", "TTFT", "Time to First Token")
|
||||
process_one_metric("tpot", "TPOT",
|
||||
"Time per Output Token (excl. 1st token)")
|
||||
process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
|
||||
process_one_metric("itl", "ITL", "Inter-token Latency")
|
||||
process_one_metric("e2el", "E2EL", "End-to-end Latency")
|
||||
|
||||
@@ -783,8 +847,9 @@ async def benchmark(
|
||||
output_len=test_output_len,
|
||||
logprobs=logprobs,
|
||||
)
|
||||
profile_output = await request_func(request_func_input=profile_input,
|
||||
session=session)
|
||||
profile_output = await request_func(
|
||||
request_func_input=profile_input, session=session
|
||||
)
|
||||
if profile_output.success:
|
||||
print("Profiler stopped")
|
||||
|
||||
@@ -803,12 +868,14 @@ def check_goodput_args(args):
|
||||
raise ValueError(
|
||||
f"Invalid metric name found, {slo_name}: {slo_val}. "
|
||||
"The service level objective name should be one of "
|
||||
f"{str(VALID_NAMES)}. ")
|
||||
f"{str(VALID_NAMES)}. "
|
||||
)
|
||||
if slo_val < 0:
|
||||
raise ValueError(
|
||||
f"Invalid value found, {slo_name}: {slo_val}. "
|
||||
"The service level objective value should be "
|
||||
"non-negative.")
|
||||
"non-negative."
|
||||
)
|
||||
return goodput_config_dict
|
||||
|
||||
|
||||
@@ -821,31 +888,42 @@ def parse_goodput(slo_pairs):
|
||||
except ValueError as err:
|
||||
raise argparse.ArgumentTypeError(
|
||||
"Invalid format found for service level objectives. "
|
||||
"Specify service level objectives for goodput as \"KEY:VALUE\" "
|
||||
'Specify service level objectives for goodput as "KEY:VALUE" '
|
||||
"pairs, where the key is a metric name, and the value is a "
|
||||
"number in milliseconds.") from err
|
||||
"number in milliseconds."
|
||||
) from err
|
||||
return goodput_config_dict
|
||||
|
||||
|
||||
def save_to_pytorch_benchmark_format(args: argparse.Namespace,
|
||||
results: dict[str, Any],
|
||||
file_name: str) -> None:
|
||||
def save_to_pytorch_benchmark_format(
|
||||
args: argparse.Namespace, results: dict[str, Any], file_name: str
|
||||
) -> None:
|
||||
metrics = [
|
||||
"median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms",
|
||||
"mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms",
|
||||
"median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms"
|
||||
"median_ttft_ms",
|
||||
"mean_ttft_ms",
|
||||
"std_ttft_ms",
|
||||
"p99_ttft_ms",
|
||||
"mean_tpot_ms",
|
||||
"median_tpot_ms",
|
||||
"std_tpot_ms",
|
||||
"p99_tpot_ms",
|
||||
"median_itl_ms",
|
||||
"mean_itl_ms",
|
||||
"std_itl_ms",
|
||||
"p99_itl_ms",
|
||||
]
|
||||
# These raw data might be useful, but they are rather big. They can be added
|
||||
# later if needed
|
||||
ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
|
||||
pt_records = convert_to_pytorch_benchmark_format(
|
||||
args=args,
|
||||
metrics={k: [results[k]]
|
||||
for k in metrics if k in results},
|
||||
metrics={k: [results[k]] for k in metrics if k in results},
|
||||
extra_info={
|
||||
k: results[k]
|
||||
for k in results if k not in metrics and k not in ignored_metrics
|
||||
})
|
||||
for k in results
|
||||
if k not in metrics and k not in ignored_metrics
|
||||
},
|
||||
)
|
||||
if pt_records:
|
||||
# Don't use json suffix here as we don't want CI to pick it up
|
||||
pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json"
|
||||
@@ -866,7 +944,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
|
||||
type=str,
|
||||
default="openai",
|
||||
choices=list(ASYNC_REQUEST_FUNCS.keys()),
|
||||
help="The type of backend or endpoint to use for the benchmark."
|
||||
help="The type of backend or endpoint to use for the benchmark.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--base-url",
|
||||
@@ -888,9 +966,9 @@ def add_cli_args(parser: argparse.ArgumentParser):
|
||||
metavar="KEY=VALUE",
|
||||
nargs="*",
|
||||
help="Key-value pairs (e.g, --header x-additional-info=0.3.3) "
|
||||
"for headers to be passed with each request. These headers override " \
|
||||
"per backend constants and values set via environment variable, and " \
|
||||
"will be overriden by other arguments (such as request ids)."
|
||||
"for headers to be passed with each request. These headers override "
|
||||
"per backend constants and values set via environment variable, and "
|
||||
"will be overriden by other arguments (such as request ids).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-concurrency",
|
||||
@@ -915,19 +993,20 @@ def add_cli_args(parser: argparse.ArgumentParser):
|
||||
parser.add_argument(
|
||||
"--tokenizer",
|
||||
type=str,
|
||||
help=
|
||||
"Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
|
||||
help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
|
||||
)
|
||||
parser.add_argument("--use-beam-search", action="store_true")
|
||||
parser.add_argument(
|
||||
"--logprobs",
|
||||
type=int,
|
||||
default=None,
|
||||
help=("Number of logprobs-per-token to compute & return as part of "
|
||||
"the request. If unspecified, then either (1) if beam search "
|
||||
"is disabled, no logprobs are computed & a single dummy "
|
||||
"logprob is returned for each token; or (2) if beam search "
|
||||
"is enabled 1 logprob per token is computed"),
|
||||
help=(
|
||||
"Number of logprobs-per-token to compute & return as part of "
|
||||
"the request. If unspecified, then either (1) if beam search "
|
||||
"is disabled, no logprobs are computed & a single dummy "
|
||||
"logprob is returned for each token; or (2) if beam search "
|
||||
"is enabled 1 logprob per token is computed"
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--request-rate",
|
||||
@@ -1010,32 +1089,34 @@ def add_cli_args(parser: argparse.ArgumentParser):
|
||||
"--ignore-eos",
|
||||
action="store_true",
|
||||
help="Set ignore_eos flag when sending the benchmark request."
|
||||
"Warning: ignore_eos is not supported in deepspeed_mii and tgi.")
|
||||
"Warning: ignore_eos is not supported in deepspeed_mii and tgi.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--percentile-metrics",
|
||||
type=str,
|
||||
default="ttft,tpot,itl",
|
||||
help="Comma-separated list of selected metrics to report percentils. "
|
||||
"This argument specifies the metrics to report percentiles. "
|
||||
"Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". ")
|
||||
'Allowed metric names are "ttft", "tpot", "itl", "e2el". ',
|
||||
)
|
||||
parser.add_argument(
|
||||
"--metric-percentiles",
|
||||
type=str,
|
||||
default="99",
|
||||
help="Comma-separated list of percentiles for selected metrics. "
|
||||
"To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
|
||||
"Default value is \"99\"."
|
||||
"Use \"--percentile-metrics\" to select metrics.",
|
||||
'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". '
|
||||
'Default value is "99".'
|
||||
'Use "--percentile-metrics" to select metrics.',
|
||||
)
|
||||
parser.add_argument(
|
||||
"--goodput",
|
||||
nargs="+",
|
||||
required=False,
|
||||
help="Specify service level objectives for goodput as \"KEY:VALUE\" "
|
||||
help='Specify service level objectives for goodput as "KEY:VALUE" '
|
||||
"pairs, where the key is a metric name, and the value is in "
|
||||
"milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
|
||||
'milliseconds. Multiple "KEY:VALUE" pairs can be provided, '
|
||||
"separated by spaces. Allowed request level metric names are "
|
||||
"\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
|
||||
'"ttft", "tpot", "e2el". For more context on the definition of '
|
||||
"goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
|
||||
"and the blog: https://hao-ai-lab.github.io/blogs/distserve",
|
||||
)
|
||||
@@ -1052,22 +1133,19 @@ def add_cli_args(parser: argparse.ArgumentParser):
|
||||
"--top-p",
|
||||
type=float,
|
||||
default=None,
|
||||
help="Top-p sampling parameter. Only has effect on "
|
||||
"openai-compatible backends.",
|
||||
help="Top-p sampling parameter. Only has effect on openai-compatible backends.",
|
||||
)
|
||||
sampling_group.add_argument(
|
||||
"--top-k",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Top-k sampling parameter. Only has effect on "
|
||||
"openai-compatible backends.",
|
||||
help="Top-k sampling parameter. Only has effect on openai-compatible backends.",
|
||||
)
|
||||
sampling_group.add_argument(
|
||||
"--min-p",
|
||||
type=float,
|
||||
default=None,
|
||||
help="Min-p sampling parameter. Only has effect on "
|
||||
"openai-compatible backends.",
|
||||
help="Min-p sampling parameter. Only has effect on openai-compatible backends.",
|
||||
)
|
||||
sampling_group.add_argument(
|
||||
"--temperature",
|
||||
@@ -1100,29 +1178,34 @@ def add_cli_args(parser: argparse.ArgumentParser):
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--tokenizer-mode',
|
||||
"--tokenizer-mode",
|
||||
type=str,
|
||||
default="auto",
|
||||
choices=['auto', 'slow', 'mistral', 'custom'],
|
||||
choices=["auto", "slow", "mistral", "custom"],
|
||||
help='The tokenizer mode.\n\n* "auto" will use the '
|
||||
'fast tokenizer if available.\n* "slow" will '
|
||||
'always use the slow tokenizer. \n* '
|
||||
"always use the slow tokenizer. \n* "
|
||||
'"mistral" will always use the `mistral_common` tokenizer. \n*'
|
||||
'"custom" will use --tokenizer to select the preregistered tokenizer.')
|
||||
'"custom" will use --tokenizer to select the preregistered tokenizer.',
|
||||
)
|
||||
|
||||
parser.add_argument("--served-model-name",
|
||||
type=str,
|
||||
default=None,
|
||||
help="The model name used in the API. "
|
||||
"If not specified, the model name will be the "
|
||||
"same as the ``--model`` argument. ")
|
||||
parser.add_argument(
|
||||
"--served-model-name",
|
||||
type=str,
|
||||
default=None,
|
||||
help="The model name used in the API. "
|
||||
"If not specified, the model name will be the "
|
||||
"same as the ``--model`` argument. ",
|
||||
)
|
||||
|
||||
parser.add_argument("--lora-modules",
|
||||
nargs='+',
|
||||
default=None,
|
||||
help="A subset of LoRA module names passed in when "
|
||||
"launching the server. For each request, the "
|
||||
"script chooses a LoRA module at random.")
|
||||
parser.add_argument(
|
||||
"--lora-modules",
|
||||
nargs="+",
|
||||
default=None,
|
||||
help="A subset of LoRA module names passed in when "
|
||||
"launching the server. For each request, the "
|
||||
"script chooses a LoRA module at random.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--ramp-up-strategy",
|
||||
@@ -1132,7 +1215,8 @@ def add_cli_args(parser: argparse.ArgumentParser):
|
||||
help="The ramp-up strategy. This would be used to "
|
||||
"ramp up the request rate from initial RPS to final "
|
||||
"RPS rate (specified by --ramp-up-start-rps and "
|
||||
"--ramp-up-end-rps.) over the duration of the benchmark.")
|
||||
"--ramp-up-end-rps.) over the duration of the benchmark.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ramp-up-start-rps",
|
||||
type=int,
|
||||
@@ -1153,7 +1237,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
|
||||
default=600,
|
||||
help="Maximum time to wait for the endpoint to become ready "
|
||||
"in seconds (default: 600 seconds / 10 minutes). If set to 0, "
|
||||
"the ready check will be skipped."
|
||||
"the ready check will be skipped.",
|
||||
)
|
||||
|
||||
|
||||
@@ -1172,19 +1256,19 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
|
||||
raise ValueError(
|
||||
"When using ramp-up, do not specify --request-rate. "
|
||||
"The request rate will be controlled by ramp-up parameters. "
|
||||
"Please remove the --request-rate argument.")
|
||||
"Please remove the --request-rate argument."
|
||||
)
|
||||
if args.ramp_up_start_rps is None or args.ramp_up_end_rps is None:
|
||||
raise ValueError(
|
||||
"When using --ramp-up-strategy, both --ramp-up-start-rps and "
|
||||
"--ramp-up-end-rps must be specified")
|
||||
"--ramp-up-end-rps must be specified"
|
||||
)
|
||||
if args.ramp_up_start_rps < 0 or args.ramp_up_end_rps < 0:
|
||||
raise ValueError("Ramp-up start and end RPS must be non-negative")
|
||||
if args.ramp_up_start_rps > args.ramp_up_end_rps:
|
||||
raise ValueError("Ramp-up start RPS must be less than end RPS")
|
||||
if (args.ramp_up_strategy == "exponential"
|
||||
and args.ramp_up_start_rps == 0):
|
||||
raise ValueError(
|
||||
"For exponential ramp-up, the start RPS cannot be 0.")
|
||||
if args.ramp_up_strategy == "exponential" and args.ramp_up_start_rps == 0:
|
||||
raise ValueError("For exponential ramp-up, the start RPS cannot be 0.")
|
||||
|
||||
label = args.label
|
||||
model_id = args.model
|
||||
@@ -1208,17 +1292,19 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
|
||||
kvstring = item.split("=", 1)
|
||||
headers[kvstring[0].strip()] = kvstring[1].strip()
|
||||
else:
|
||||
raise ValueError(
|
||||
"Invalid header format. Please use KEY=VALUE format.")
|
||||
raise ValueError("Invalid header format. Please use KEY=VALUE format.")
|
||||
|
||||
tokenizer = get_tokenizer(tokenizer_id,
|
||||
tokenizer_mode=tokenizer_mode,
|
||||
trust_remote_code=args.trust_remote_code)
|
||||
tokenizer = get_tokenizer(
|
||||
tokenizer_id,
|
||||
tokenizer_mode=tokenizer_mode,
|
||||
trust_remote_code=args.trust_remote_code,
|
||||
)
|
||||
|
||||
if args.dataset_name is None:
|
||||
raise ValueError(
|
||||
"Please specify '--dataset-name' and the corresponding "
|
||||
"'--dataset-path' if required.")
|
||||
"'--dataset-path' if required."
|
||||
)
|
||||
|
||||
# Load the dataset.
|
||||
input_requests = get_samples(args, tokenizer)
|
||||
@@ -1235,13 +1321,15 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
|
||||
"frequency_penalty": args.frequency_penalty,
|
||||
"presence_penalty": args.presence_penalty,
|
||||
"repetition_penalty": args.repetition_penalty,
|
||||
}.items() if v is not None
|
||||
}.items()
|
||||
if v is not None
|
||||
}
|
||||
|
||||
# Sampling parameters are only supported by openai-compatible backend.
|
||||
if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
|
||||
raise ValueError("Sampling parameters are only supported by "
|
||||
"openai-compatible backends.")
|
||||
raise ValueError(
|
||||
"Sampling parameters are only supported by openai-compatible backends."
|
||||
)
|
||||
|
||||
if "temperature" not in sampling_params:
|
||||
sampling_params["temperature"] = 0.0 # Default to greedy decoding.
|
||||
@@ -1264,9 +1352,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
disable_tqdm=args.disable_tqdm,
profile=args.profile,
selected_percentile_metrics=args.percentile_metrics.split(","),
selected_percentiles=[
float(p) for p in args.metric_percentiles.split(",")
],
selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],
ignore_eos=args.ignore_eos,
goodput_config_dict=goodput_config_dict,
max_concurrency=args.max_concurrency,
@@ -1285,7 +1371,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
# Setup
current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
result_json["date"] = current_dt
result_json["endpoint_type"] = args.backend  # for backward compatibility
result_json["endpoint_type"] = args.backend  # for backward compatibility
result_json["backend"] = args.backend
result_json["label"] = label
result_json["model_id"] = model_id
@@ -1300,11 +1386,13 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
result_json[kvstring[0].strip()] = kvstring[1].strip()
else:
raise ValueError(
"Invalid metadata format. Please use KEY=VALUE format.")
"Invalid metadata format. Please use KEY=VALUE format."
)

# Traffic
result_json["request_rate"] = (args.request_rate if args.request_rate
< float("inf") else "inf")
result_json["request_rate"] = (
args.request_rate if args.request_rate < float("inf") else "inf"
)
result_json["burstiness"] = args.burstiness
result_json["max_concurrency"] = args.max_concurrency
@@ -1319,12 +1407,12 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
if not args.save_detailed:
# Remove fields with too many data points
for field in [
"input_lens",
"output_lens",
"ttfts",
"itls",
"generated_texts",
"errors",
"input_lens",
"output_lens",
"ttfts",
"itls",
"generated_texts",
"errors",
]:
if field in result_json:
del result_json[field]
@@ -1334,8 +1422,11 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
# Save to file
if args.save_result or args.append_result:
base_model_id = model_id.split("/")[-1]
max_concurrency_str = (f"-concurrency{args.max_concurrency}"
if args.max_concurrency is not None else "")
max_concurrency_str = (
f"-concurrency{args.max_concurrency}"
if args.max_concurrency is not None
else ""
)
label = label or args.backend
if args.ramp_up_strategy is not None:
file_name = f"{label}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
@@ -1346,9 +1437,9 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
if args.result_dir:
os.makedirs(args.result_dir, exist_ok=True)
file_name = os.path.join(args.result_dir, file_name)
with open(file_name,
mode="a+" if args.append_result else "w",
encoding="utf-8") as outfile:
with open(
file_name, mode="a+" if args.append_result else "w", encoding="utf-8"
) as outfile:
# Append a newline.
if args.append_result and outfile.tell() != 0:
outfile.write("\n")
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark offline inference throughput."""

import argparse
import dataclasses
import json
@@ -13,18 +14,21 @@ from typing import Any, Optional, Union
import torch
import uvloop
from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
PreTrainedTokenizerBase)
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase

from vllm.benchmarks.datasets import (AIMODataset, BurstGPTDataset,
ConversationDataset,
InstructCoderDataset,
PrefixRepetitionRandomDataset,
RandomDataset, SampleRequest,
ShareGPTDataset, SonnetDataset,
VisionArenaDataset)
from vllm.benchmarks.lib.utils import (convert_to_pytorch_benchmark_format,
write_to_json)
from vllm.benchmarks.datasets import (
AIMODataset,
BurstGPTDataset,
ConversationDataset,
InstructCoderDataset,
PrefixRepetitionRandomDataset,
RandomDataset,
SampleRequest,
ShareGPTDataset,
SonnetDataset,
VisionArenaDataset,
)
from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.inputs import TextPrompt, TokensPrompt
from vllm.lora.request import LoRARequest
@@ -41,23 +45,30 @@ def run_vllm(
disable_detokenize: bool = False,
) -> tuple[float, Optional[list[RequestOutput]]]:
from vllm import LLM, SamplingParams

llm = LLM(**dataclasses.asdict(engine_args))
assert all(
llm.llm_engine.model_config.max_model_len >= (
request.prompt_len + request.expected_output_len)
for request in requests), (
"Please ensure that max_model_len is greater than the sum of"
" prompt_len and expected_output_len for all requests.")
llm.llm_engine.model_config.max_model_len
>= (request.prompt_len + request.expected_output_len)
for request in requests
), (
"Please ensure that max_model_len is greater than the sum of"
" prompt_len and expected_output_len for all requests."
)
# Add the requests to the engine.
prompts: list[Union[TextPrompt, TokensPrompt]] = []
sampling_params: list[SamplingParams] = []
for request in requests:
prompts.append(
TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"],
multi_modal_data=request.multi_modal_data)
if "prompt_token_ids" in request.prompt else \
TextPrompt(prompt=request.prompt,
multi_modal_data=request.multi_modal_data))
TokensPrompt(
prompt_token_ids=request.prompt["prompt_token_ids"],
multi_modal_data=request.multi_modal_data,
)
if "prompt_token_ids" in request.prompt
else TextPrompt(
prompt=request.prompt, multi_modal_data=request.multi_modal_data
)
)
sampling_params.append(
SamplingParams(
n=n,
@@ -66,7 +77,8 @@ def run_vllm(
ignore_eos=True,
max_tokens=request.expected_output_len,
detokenize=not disable_detokenize,
))
)
)
lora_requests: Optional[list[LoRARequest]] = None
if engine_args.enable_lora:
lora_requests = [request.lora_request for request in requests]
@@ -78,10 +90,9 @@ def run_vllm(
start = time.perf_counter()
if do_profile:
llm.start_profile()
outputs = llm.generate(prompts,
sampling_params,
lora_request=lora_requests,
use_tqdm=True)
outputs = llm.generate(
prompts, sampling_params, lora_request=lora_requests, use_tqdm=True
)
if do_profile:
llm.stop_profile()
end = time.perf_counter()
@@ -101,7 +112,8 @@ def run_vllm(
beam_width=n,
max_tokens=output_len,
ignore_eos=True,
))
),
)
if do_profile:
llm.stop_profile()
end = time.perf_counter()
@@ -109,25 +121,29 @@ def run_vllm(


def run_vllm_chat(
requests: list[SampleRequest],
n: int,
engine_args: EngineArgs,
do_profile: bool,
disable_detokenize: bool = False) -> tuple[float, list[RequestOutput]]:
requests: list[SampleRequest],
n: int,
engine_args: EngineArgs,
do_profile: bool,
disable_detokenize: bool = False,
) -> tuple[float, list[RequestOutput]]:
"""
Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
multimodal models as it properly handles multimodal inputs and chat
formatting. For non-multimodal models, use run_vllm() instead.
"""
from vllm import LLM, SamplingParams

llm = LLM(**dataclasses.asdict(engine_args))

assert all(
llm.llm_engine.model_config.max_model_len >= (
request.prompt_len + request.expected_output_len)
for request in requests), (
"Please ensure that max_model_len is greater than the sum of "
"prompt_len and expected_output_len for all requests.")
llm.llm_engine.model_config.max_model_len
>= (request.prompt_len + request.expected_output_len)
for request in requests
), (
"Please ensure that max_model_len is greater than the sum of "
"prompt_len and expected_output_len for all requests."
)

prompts = []
sampling_params: list[SamplingParams] = []
@@ -141,7 +157,8 @@ def run_vllm_chat(
ignore_eos=True,
max_tokens=request.expected_output_len,
detokenize=not disable_detokenize,
))
)
)
start = time.perf_counter()
if do_profile:
llm.start_profile()
@@ -162,7 +179,8 @@ async def run_vllm_async(
) -> float:
from vllm import SamplingParams
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args)
build_async_engine_client_from_engine_args,
)

async with build_async_engine_client_from_engine_args(
engine_args,
@@ -170,11 +188,13 @@ async def run_vllm_async(
) as llm:
model_config = await llm.get_model_config()
assert all(
model_config.max_model_len >= (request.prompt_len +
request.expected_output_len)
for request in requests), (
"Please ensure that max_model_len is greater than the sum of"
" prompt_len and expected_output_len for all requests.")
model_config.max_model_len
>= (request.prompt_len + request.expected_output_len)
for request in requests
), (
"Please ensure that max_model_len is greater than the sum of"
" prompt_len and expected_output_len for all requests."
)

# Add the requests to the engine.
prompts: list[Union[TextPrompt, TokensPrompt]] = []
@@ -182,11 +202,15 @@ async def run_vllm_async(
lora_requests: list[Optional[LoRARequest]] = []
for request in requests:
prompts.append(
TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"],
multi_modal_data=request.multi_modal_data)
if "prompt_token_ids" in request.prompt else \
TextPrompt(prompt=request.prompt,
multi_modal_data=request.multi_modal_data))
TokensPrompt(
prompt_token_ids=request.prompt["prompt_token_ids"],
multi_modal_data=request.multi_modal_data,
)
if "prompt_token_ids" in request.prompt
else TextPrompt(
prompt=request.prompt, multi_modal_data=request.multi_modal_data
)
)
sampling_params.append(
SamplingParams(
n=n,
@@ -195,19 +219,18 @@ async def run_vllm_async(
ignore_eos=True,
max_tokens=request.expected_output_len,
detokenize=not disable_detokenize,
))
)
)
lora_requests.append(request.lora_request)

generators = []
start = time.perf_counter()
if do_profile:
await llm.start_profile()
for i, (prompt, sp,
lr) in enumerate(zip(prompts, sampling_params, lora_requests)):
generator = llm.generate(prompt,
sp,
lora_request=lr,
request_id=f"test{i}")
for i, (prompt, sp, lr) in enumerate(
zip(prompts, sampling_params, lora_requests)
):
generator = llm.generate(prompt, sp, lora_request=lr, request_id=f"test{i}")
generators.append(generator)
all_gens = merge_async_iterators(*generators)
async for i, res in all_gens:
@@ -228,7 +251,8 @@ def run_hf(
disable_detokenize: bool = False,
) -> float:
llm = AutoModelForCausalLM.from_pretrained(
model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code
)
if llm.config.model_type == "llama":
# To enable padding in the HF backend.
tokenizer.pad_token = tokenizer.eos_token
@@ -251,14 +275,15 @@ def run_hf(
# Check if we can add more requests to the batch.
next_prompt_len = requests[i + 1].prompt_len
next_output_len = requests[i + 1].expected_output_len
if (max(max_prompt_len, next_prompt_len) +
max(max_output_len, next_output_len)) <= 2048:
if (
max(max_prompt_len, next_prompt_len)
+ max(max_output_len, next_output_len)
) <= 2048:
# We can add more requests to the batch.
continue

# Generate the sequences.
input_ids = tokenizer(batch, return_tensors="pt",
padding=True).input_ids
input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids
llm_outputs = llm.generate(
input_ids=input_ids.cuda(),
do_sample=True,
@@ -281,8 +306,9 @@ def run_hf(
return end - start


def save_to_pytorch_benchmark_format(args: argparse.Namespace,
results: dict[str, Any]) -> None:
def save_to_pytorch_benchmark_format(
args: argparse.Namespace, results: dict[str, Any]
) -> None:
pt_records = convert_to_pytorch_benchmark_format(
args=args,
metrics={
@@ -290,9 +316,9 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
"tokens_per_second": [results["tokens_per_second"]],
},
extra_info={
k: results[k]
for k in ["elapsed_time", "num_requests", "total_num_tokens"]
})
k: results[k] for k in ["elapsed_time", "num_requests", "total_num_tokens"]
},
)
if pt_records:
# Don't use json suffix here as we don't want CI to pick it up
pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
@@ -324,7 +350,8 @@ def get_requests(args, tokenizer):
sample_kwargs["enable_multimodal_chat"] = True
elif args.dataset_name == "sonnet":
assert tokenizer.chat_template or tokenizer.default_chat_template, (
"Tokenizer/model must have chat template for sonnet dataset.")
"Tokenizer/model must have chat template for sonnet dataset."
)
dataset_cls = SonnetDataset
sample_kwargs["prefix_len"] = args.prefix_len
sample_kwargs["return_prompt_formatted"] = True
@@ -333,21 +360,21 @@ def get_requests(args, tokenizer):
elif args.dataset_name == "hf":
if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
dataset_cls = VisionArenaDataset
common_kwargs['dataset_subset'] = None
common_kwargs['dataset_split'] = "train"
common_kwargs["dataset_subset"] = None
common_kwargs["dataset_split"] = "train"
sample_kwargs["enable_multimodal_chat"] = True
elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
dataset_cls = InstructCoderDataset
common_kwargs['dataset_split'] = "train"
common_kwargs["dataset_split"] = "train"
elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
dataset_cls = ConversationDataset
common_kwargs['dataset_subset'] = args.hf_subset
common_kwargs['dataset_split'] = args.hf_split
common_kwargs["dataset_subset"] = args.hf_subset
common_kwargs["dataset_split"] = args.hf_split
sample_kwargs["enable_multimodal_chat"] = True
elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
dataset_cls = AIMODataset
common_kwargs['dataset_subset'] = None
common_kwargs['dataset_split'] = "train"
common_kwargs["dataset_subset"] = None
common_kwargs["dataset_split"] = "train"
elif args.dataset_name == "prefix_repetition":
dataset_cls = PrefixRepetitionRandomDataset
sample_kwargs["prefix_len"] = args.prefix_repetition_prefix_len
@@ -373,8 +400,11 @@ def filter_requests_for_dp(requests, data_parallel_size):
global_rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
data_parallel_rank = global_rank // (world_size // data_parallel_size)
return [r for i, r in enumerate(requests)
if i % data_parallel_size == data_parallel_rank]
return [
r
for i, r in enumerate(requests)
if i % data_parallel_size == data_parallel_rank
]


def validate_args(args):
@@ -387,7 +417,8 @@ def validate_args(args):
warnings.warn(
"The '--dataset' argument will be deprecated in the next release. "
"Please use '--dataset-name' and '--dataset-path' instead.",
stacklevel=2)
stacklevel=2,
)
args.dataset_path = args.dataset

if not getattr(args, "tokenizer", None):
@@ -404,9 +435,8 @@ def validate_args(args):
and not args.dataset_path
and args.dataset_name not in {"prefix_repetition"}
):
print(
"When dataset path is not set, it will default to random dataset")
args.dataset_name = 'random'
print("When dataset path is not set, it will default to random dataset")
args.dataset_name = "random"
if args.input_len is None:
raise ValueError("input_len must be provided for a random dataset")
@@ -414,41 +444,55 @@ def validate_args(args):
# --hf-subset and --hf-split: only used
# when dataset_name is 'hf'
if args.dataset_name != "hf" and (
getattr(args, "hf_subset", None) is not None
or getattr(args, "hf_split", None) is not None):
warnings.warn("--hf-subset and --hf-split will be ignored \
getattr(args, "hf_subset", None) is not None
or getattr(args, "hf_split", None) is not None
):
warnings.warn(
"--hf-subset and --hf-split will be ignored \
since --dataset-name is not 'hf'.",
stacklevel=2)
stacklevel=2,
)
elif args.dataset_name == "hf":
if args.dataset_path in (
VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
| ConversationDataset.SUPPORTED_DATASET_PATHS):
assert args.backend == "vllm-chat", f"{args.dataset_path} needs to use vllm-chat as the backend." #noqa: E501
elif args.dataset_path in (InstructCoderDataset.SUPPORTED_DATASET_PATHS
| AIMODataset.SUPPORTED_DATASET_PATHS):
assert args.backend == "vllm", f"{args.dataset_path} needs to use vllm as the backend." #noqa: E501
VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
| ConversationDataset.SUPPORTED_DATASET_PATHS
):
assert args.backend == "vllm-chat", (
f"{args.dataset_path} needs to use vllm-chat as the backend."
)  # noqa: E501
elif args.dataset_path in (
InstructCoderDataset.SUPPORTED_DATASET_PATHS
| AIMODataset.SUPPORTED_DATASET_PATHS
):
assert args.backend == "vllm", (
f"{args.dataset_path} needs to use vllm as the backend."
)  # noqa: E501
else:
raise ValueError(
f"{args.dataset_path} is not supported by hf dataset.")
raise ValueError(f"{args.dataset_path} is not supported by hf dataset.")

# --random-range-ratio: only used when dataset_name is 'random'
if args.dataset_name != 'random' and args.random_range_ratio is not None:
warnings.warn("--random-range-ratio will be ignored since \
if args.dataset_name != "random" and args.random_range_ratio is not None:
warnings.warn(
"--random-range-ratio will be ignored since \
--dataset-name is not 'random'.",
stacklevel=2)
stacklevel=2,
)

# --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
# set.
if args.dataset_name not in {"random", "sonnet", None
} and args.prefix_len is not None:
warnings.warn("--prefix-len will be ignored since --dataset-name\
if (
args.dataset_name not in {"random", "sonnet", None}
and args.prefix_len is not None
):
warnings.warn(
"--prefix-len will be ignored since --dataset-name\
is not 'random', 'sonnet', or not set.",
stacklevel=2)
stacklevel=2,
)

# === LoRA Settings ===
if getattr(args, "enable_lora", False) and args.backend != "vllm":
raise ValueError(
"LoRA benchmarking is only supported for vLLM backend")
raise ValueError("LoRA benchmarking is only supported for vLLM backend")
if getattr(args, "enable_lora", False) and args.lora_path is None:
raise ValueError("LoRA path must be provided when enable_lora is True")
@@ -458,8 +502,10 @@ def validate_args(args):
if args.backend != "hf" and args.hf_max_batch_size is not None:
raise ValueError("HF max batch size is only for HF backend.")

if args.backend in {"hf", "mii"} and getattr(args, "quantization",
None) is not None:
if (
args.backend in {"hf", "mii"}
and getattr(args, "quantization", None) is not None
):
raise ValueError("Quantization is only for vLLM backend.")

if args.backend == "mii" and args.dtype != "auto":
@@ -467,12 +513,11 @@ def validate_args(args):
if args.backend == "mii" and args.n != 1:
raise ValueError("n must be 1 for MII backend.")
if args.backend == "mii" and args.tokenizer != args.model:
raise ValueError(
"Tokenizer must be the same as the model for MII backend.")
raise ValueError("Tokenizer must be the same as the model for MII backend.")

if args.data_parallel_size > 1 and (
args.distributed_executor_backend != "external_launcher"
or args.async_engine):
args.distributed_executor_backend != "external_launcher" or args.async_engine
):
# --data-parallel is not supported fully.
# Old issue: https://github.com/vllm-project/vllm/issues/16222
# Currently we only support data parallel with external launcher
@@ -485,19 +530,19 @@ def validate_args(args):


def add_cli_args(parser: argparse.ArgumentParser):
parser.add_argument("--backend",
type=str,
choices=["vllm", "hf", "mii", "vllm-chat"],
default="vllm")
parser.add_argument(
"--backend",
type=str,
choices=["vllm", "hf", "mii", "vllm-chat"],
default="vllm",
)
parser.add_argument(
"--dataset-name",
type=str,
choices=[
"sharegpt", "random", "sonnet", "burstgpt", "hf",
"prefix_repetition"
],
choices=["sharegpt", "random", "sonnet", "burstgpt", "hf", "prefix_repetition"],
help="Name of the dataset to benchmark on.",
default="sharegpt")
default="sharegpt",
)
parser.add_argument(
"--dataset",
type=str,
@@ -505,57 +550,70 @@ def add_cli_args(parser: argparse.ArgumentParser):
help="Path to the ShareGPT dataset, will be deprecated in\
the next release. The dataset is expected to "
"be a json in form of list[dict[..., conversations: "
"list[dict[..., value: <prompt_or_response>]]]]")
parser.add_argument("--dataset-path",
type=str,
default=None,
help="Path to the dataset")
parser.add_argument("--input-len",
type=int,
default=None,
help="Input prompt length for each request")
parser.add_argument("--output-len",
type=int,
default=None,
help="Output length for each request. Overrides the "
"output length from the dataset.")
parser.add_argument("--n",
type=int,
default=1,
help="Number of generated sequences per prompt.")
parser.add_argument("--num-prompts",
type=int,
default=1000,
help="Number of prompts to process.")
parser.add_argument("--hf-max-batch-size",
type=int,
default=None,
help="Maximum batch size for HF backend.")
"list[dict[..., value: <prompt_or_response>]]]]",
)
parser.add_argument(
'--output-json',
"--dataset-path", type=str, default=None, help="Path to the dataset"
)
parser.add_argument(
"--input-len",
type=int,
default=None,
help="Input prompt length for each request",
)
parser.add_argument(
"--output-len",
type=int,
default=None,
help="Output length for each request. Overrides the "
"output length from the dataset.",
)
parser.add_argument(
"--n", type=int, default=1, help="Number of generated sequences per prompt."
)
parser.add_argument(
"--num-prompts", type=int, default=1000, help="Number of prompts to process."
)
parser.add_argument(
"--hf-max-batch-size",
type=int,
default=None,
help="Maximum batch size for HF backend.",
)
parser.add_argument(
"--output-json",
type=str,
default=None,
help='Path to save the throughput results in JSON format.')
parser.add_argument("--async-engine",
action='store_true',
default=False,
help="Use vLLM async engine rather than LLM class.")
parser.add_argument("--disable-frontend-multiprocessing",
action='store_true',
default=False,
help="Disable decoupled async engine frontend.")
help="Path to save the throughput results in JSON format.",
)
parser.add_argument(
"--async-engine",
action="store_true",
default=False,
help="Use vLLM async engine rather than LLM class.",
)
parser.add_argument(
"--disable-frontend-multiprocessing",
action="store_true",
default=False,
help="Disable decoupled async engine frontend.",
)
parser.add_argument(
"--disable-detokenize",
action="store_true",
help=("Do not detokenize the response (i.e. do not include "
"detokenization time in the measurement)"))
help=(
"Do not detokenize the response (i.e. do not include "
"detokenization time in the measurement)"
),
)
# LoRA
parser.add_argument(
"--lora-path",
type=str,
default=None,
help="Path to the lora adapters to use. This can be an absolute path, "
"a relative path, or a Hugging Face model identifier.")
"a relative path, or a Hugging Face model identifier.",
)
parser.add_argument(
"--prefix-len",
type=int,
@@ -575,24 +633,24 @@ def add_cli_args(parser: argparse.ArgumentParser):
)

# hf dtaset
parser.add_argument("--hf-subset",
type=str,
default=None,
help="Subset of the HF dataset.")
parser.add_argument("--hf-split",
type=str,
default=None,
help="Split of the HF dataset.")
parser.add_argument(
"--hf-subset", type=str, default=None, help="Subset of the HF dataset."
)
parser.add_argument(
"--hf-split", type=str, default=None, help="Split of the HF dataset."
)
parser.add_argument(
"--profile",
action="store_true",
default=False,
help="Use Torch Profiler. The env variable "
"VLLM_TORCH_PROFILER_DIR must be set to enable profiler.")
"VLLM_TORCH_PROFILER_DIR must be set to enable profiler.",
)

# prefix repetition dataset
prefix_repetition_group = parser.add_argument_group(
"prefix repetition dataset options")
"prefix repetition dataset options"
)
prefix_repetition_group.add_argument(
"--prefix-repetition-prefix-len",
type=int,
@@ -634,10 +692,10 @@ def main(args: argparse.Namespace):
random.seed(args.seed)
# Sample the requests.
tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer, trust_remote_code=args.trust_remote_code)
args.tokenizer, trust_remote_code=args.trust_remote_code
)
requests = get_requests(args, tokenizer)
is_multi_modal = any(request.multi_modal_data is not None
for request in requests)
is_multi_modal = any(request.multi_modal_data is not None for request in requests)
request_outputs: Optional[list[RequestOutput]] = None
if args.backend == "vllm":
if args.async_engine:
@@ -649,24 +707,37 @@ def main(args: argparse.Namespace):
disable_frontend_multiprocessing=args.disable_frontend_multiprocessing,
disable_detokenize=args.disable_detokenize,
do_profile=args.profile,
))
)
)
else:
elapsed_time, request_outputs = run_vllm(
requests, args.n, EngineArgs.from_cli_args(args),
requests,
args.n,
EngineArgs.from_cli_args(args),
disable_detokenize=args.disable_detokenize,
do_profile=args.profile)
do_profile=args.profile,
)
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
if args.profile:
raise NotImplementedError(
"Profiling not implemented yet for backend='hf'.")
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
args.hf_max_batch_size, args.trust_remote_code,
args.disable_detokenize)
raise NotImplementedError("Profiling not implemented yet for backend='hf'.")
elapsed_time = run_hf(
requests,
args.model,
tokenizer,
args.n,
args.hf_max_batch_size,
args.trust_remote_code,
args.disable_detokenize,
)
elif args.backend == "vllm-chat":
elapsed_time, request_outputs = run_vllm_chat(
requests, args.n, EngineArgs.from_cli_args(args),
disable_detokenize=args.disable_detokenize, do_profile=args.profile)
requests,
args.n,
EngineArgs.from_cli_args(args),
disable_detokenize=args.disable_detokenize,
do_profile=args.profile,
)
else:
raise ValueError(f"Unknown backend: {args.backend}")
@@ -678,28 +749,31 @@ def main(args: argparse.Namespace):
for ro in request_outputs:
if not isinstance(ro, RequestOutput):
continue
total_prompt_tokens += len(
ro.prompt_token_ids) if ro.prompt_token_ids else 0
total_output_tokens += sum(
len(o.token_ids) for o in ro.outputs if o)
total_prompt_tokens += (
len(ro.prompt_token_ids) if ro.prompt_token_ids else 0
)
total_output_tokens += sum(len(o.token_ids) for o in ro.outputs if o)
total_num_tokens = total_prompt_tokens + total_output_tokens
else:
total_num_tokens = sum(r.prompt_len + r.expected_output_len
for r in requests)
total_num_tokens = sum(r.prompt_len + r.expected_output_len for r in requests)
total_output_tokens = sum(r.expected_output_len for r in requests)
total_prompt_tokens = total_num_tokens - total_output_tokens

if is_multi_modal and args.backend != "vllm-chat":
print("\033[91mWARNING\033[0m: Multi-modal request with "
f"{args.backend} backend detected. The "
"following metrics are not accurate because image tokens are not"
" counted. See vllm-project/vllm/issues/9778 for details.")
print(
"\033[91mWARNING\033[0m: Multi-modal request with "
f"{args.backend} backend detected. The "
"following metrics are not accurate because image tokens are not"
" counted. See vllm-project/vllm/issues/9778 for details."
)
# TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
# vllm-chat backend counts the image tokens now

print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
f"{total_output_tokens / elapsed_time:.2f} output tokens/s")
print(
f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
f"{total_output_tokens / elapsed_time:.2f} output tokens/s"
)
print(f"Total num prompt tokens: {total_prompt_tokens}")
print(f"Total num output tokens: {total_output_tokens}")