Benchmark serving structured output (#10880)

Signed-off-by: Chendi Xue <chendi.xue@intel.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2024-12-04 15:28:21 -06:00
parent 10398b4706
commit 82eb5ea8f3
2 changed files with 887 additions and 0 deletions
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -24,6 +24,7 @@ class RequestFuncInput:
    model: str
    best_of: int = 1
    logprobs: Optional[int] = None
+    extra_body: Optional[dict] = None
    multi_modal_content: Optional[dict] = None
    ignore_eos: bool = False

@@ -36,6 +37,7 @@ class RequestFuncOutput:
    ttft: float = 0.0  # Time to first token
    itl: List[float] = field(
        default_factory=list)  # List of inter-token latencies
+    tpot: float = 0.0  # avg next-token latencies
    prompt_len: int = 0
    error: str = ""

@@ -242,6 +244,8 @@ async def async_request_openai_completions(
            "stream": True,
            "ignore_eos": request_func_input.ignore_eos,
        }
+        if request_func_input.extra_body:
+            payload.update(request_func_input.extra_body)
        headers = {
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
        }
@@ -336,6 +340,8 @@ async def async_request_openai_chat_completions(
            "stream": True,
            "ignore_eos": request_func_input.ignore_eos,
        }
+        if request_func_input.extra_body:
+            payload.update(request_func_input.extra_body)
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",