[CI] Update performance benchmark: upgrade trt-llm to r24.07, and add SGLang (#7412)

This commit is contained in:
Kuntai Du
2024-10-04 14:01:44 -07:00
committed by GitHub
parent 05d686432f
commit fbb74420e7
18 changed files with 1149 additions and 1273 deletions

View File

@@ -26,6 +26,7 @@ class RequestFuncInput:
use_beam_search: bool = False
logprobs: Optional[int] = None
multi_modal_content: Optional[dict] = None
ignore_eos: bool = False
@dataclass
@@ -55,6 +56,7 @@ async def async_request_tgi(
"do_sample": True,
"temperature": 0.01, # TGI does not accept 0.0 temperature.
"top_p": 0.99, # TGI does not accept 1.0 top_p.
# TGI does not accept ignore_eos flag.
}
payload = {
"inputs": request_func_input.prompt,
@@ -129,6 +131,8 @@ async def async_request_trt_llm(
"max_tokens": request_func_input.output_len,
"stream": True,
}
if request_func_input.ignore_eos:
payload["min_length"] = request_func_input.output_len
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
@@ -240,6 +244,7 @@ async def async_request_openai_completions(
"max_tokens": request_func_input.output_len,
"logprobs": request_func_input.logprobs,
"stream": True,
"ignore_eos": request_func_input.ignore_eos,
}
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
@@ -327,6 +332,7 @@ async def async_request_openai_chat_completions(
"temperature": 0.0,
"max_tokens": request_func_input.output_len,
"stream": True,
"ignore_eos": request_func_input.ignore_eos,
}
headers = {
"Content-Type": "application/json",
@@ -430,4 +436,5 @@ ASYNC_REQUEST_FUNCS = {
"openai-chat": async_request_openai_chat_completions,
"tensorrt-llm": async_request_trt_llm,
"scalellm": async_request_openai_completions,
"sglang": async_request_openai_completions,
}