[CI] Update performance benchmark: upgrade trt-llm to r24.07, and add SGLang (#7412)
@@ -26,6 +26,7 @@ class RequestFuncInput:
     use_beam_search: bool = False
     logprobs: Optional[int] = None
     multi_modal_content: Optional[dict] = None
+    ignore_eos: bool = False


 @dataclass
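For context, the new field plugs into the per-request descriptor that every backend request function consumes. A minimal usage sketch follows; the fields not visible in this hunk (prompt, api_url, prompt_len, output_len, model) are assumed from the rest of backend_request_func.py, and all values are placeholders:

    # Usage sketch; fields other than ignore_eos are assumed from the
    # surrounding dataclass, of which this hunk only shows the tail.
    request_func_input = RequestFuncInput(
        prompt="The quick brown fox",
        api_url="http://localhost:8000/v1/completions",  # placeholder
        prompt_len=5,
        output_len=128,
        model="meta-llama/Meta-Llama-3-8B",  # placeholder model name
        ignore_eos=True,  # new field: ask the backend to decode past EOS
    )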
@@ -55,6 +56,7 @@ async def async_request_tgi(
             "do_sample": True,
             "temperature": 0.01,  # TGI does not accept 0.0 temperature.
             "top_p": 0.99,  # TGI does not accept 1.0 top_p.
+            # TGI does not accept ignore_eos flag.
         }
         payload = {
             "inputs": request_func_input.prompt,
@@ -129,6 +131,8 @@ async def async_request_trt_llm(
             "max_tokens": request_func_input.output_len,
             "stream": True,
         }
+        if request_func_input.ignore_eos:
+            payload["min_length"] = request_func_input.output_len
         output = RequestFuncOutput()
         output.prompt_len = request_func_input.prompt_len

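TensorRT-LLM's endpoint has no ignore_eos flag, so the patch emulates it: pinning min_length to the requested output length forces the server to keep decoding even if it samples EOS early, yielding exactly output_len tokens. A hypothetical helper (not part of the patch) that isolates the idea:

    def trt_llm_length_params(output_len: int, ignore_eos: bool) -> dict:
        # Hypothetical sketch: setting min_length == max_tokens makes
        # TRT-LLM decode past EOS, approximating an ignore_eos switch.
        params = {"max_tokens": output_len, "stream": True}
        if ignore_eos:
            params["min_length"] = output_len
        return params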
@@ -240,6 +244,7 @@ async def async_request_openai_completions(
             "max_tokens": request_func_input.output_len,
             "logprobs": request_func_input.logprobs,
             "stream": True,
+            "ignore_eos": request_func_input.ignore_eos,
         }
         headers = {
             "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
@@ -327,6 +332,7 @@ async def async_request_openai_chat_completions(
             "temperature": 0.0,
             "max_tokens": request_func_input.output_len,
             "stream": True,
+            "ignore_eos": request_func_input.ignore_eos,
         }
         headers = {
             "Content-Type": "application/json",
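Both OpenAI-style request functions forward the flag verbatim as an extra JSON body field; servers whose OpenAI-compatible frontend recognizes it (vLLM does, and this PR routes SGLang through the same function) can honor it. An illustrative chat payload after this change, with placeholder model and message content:

    # Illustrative payload; model name and message are placeholders.
    payload = {
        "model": "meta-llama/Meta-Llama-3-8B-Instruct",
        "messages": [{"role": "user", "content": "Benchmark prompt"}],
        "temperature": 0.0,
        "max_tokens": 128,
        "stream": True,
        "ignore_eos": True,  # forwarded verbatim from RequestFuncInput
    }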
@@ -430,4 +436,5 @@ ASYNC_REQUEST_FUNCS = {
     "openai-chat": async_request_openai_chat_completions,
     "tensorrt-llm": async_request_trt_llm,
     "scalellm": async_request_openai_completions,
+    "sglang": async_request_openai_completions,
 }
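With the new registry entry, sending benchmark traffic to an SGLang server reuses the OpenAI completions path unchanged. A minimal dispatch sketch; the URL, port, and model are placeholders, and the printed RequestFuncOutput fields are assumed from elsewhere in this file:

    import asyncio

    async def main() -> None:
        # "sglang" maps to async_request_openai_completions, since
        # SGLang serves an OpenAI-compatible completions API.
        request_func = ASYNC_REQUEST_FUNCS["sglang"]
        result = await request_func(RequestFuncInput(
            prompt="The quick brown fox",
            api_url="http://localhost:30000/v1/completions",  # placeholder
            prompt_len=5,
            output_len=64,
            model="meta-llama/Meta-Llama-3-8B",  # placeholder
            ignore_eos=True,
        ))
        # success/latency are assumed RequestFuncOutput fields.
        print(result.success, result.latency)

    asyncio.run(main())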