[CI] Update performance benchmark: upgrade trt-llm to r24.07, and add SGLang (#7412)

2024-10-04 14:01:44 -07:00
parent 05d686432f
commit fbb74420e7
18 changed files with 1149 additions and 1273 deletions
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -26,6 +26,7 @@ class RequestFuncInput:
    use_beam_search: bool = False
    logprobs: Optional[int] = None
    multi_modal_content: Optional[dict] = None
+    ignore_eos: bool = False


@dataclass
@@ -55,6 +56,7 @@ async def async_request_tgi(
            "do_sample": True,
            "temperature": 0.01,  # TGI does not accept 0.0 temperature.
            "top_p": 0.99,  # TGI does not accept 1.0 top_p.
+            # TGI does not accept ignore_eos flag.
        }
        payload = {
            "inputs": request_func_input.prompt,
@@ -129,6 +131,8 @@ async def async_request_trt_llm(
            "max_tokens": request_func_input.output_len,
            "stream": True,
        }
+        if request_func_input.ignore_eos:
+            payload["min_length"] = request_func_input.output_len
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

@@ -240,6 +244,7 @@ async def async_request_openai_completions(
            "max_tokens": request_func_input.output_len,
            "logprobs": request_func_input.logprobs,
            "stream": True,
+            "ignore_eos": request_func_input.ignore_eos,
        }
        headers = {
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
@@ -327,6 +332,7 @@ async def async_request_openai_chat_completions(
            "temperature": 0.0,
            "max_tokens": request_func_input.output_len,
            "stream": True,
+            "ignore_eos": request_func_input.ignore_eos,
        }
        headers = {
            "Content-Type": "application/json",
@@ -430,4 +436,5 @@ ASYNC_REQUEST_FUNCS = {
    "openai-chat": async_request_openai_chat_completions,
    "tensorrt-llm": async_request_trt_llm,
    "scalellm": async_request_openai_completions,
+    "sglang": async_request_openai_completions,
 }