[benchmark] add peak throughput metrics and plot (#23867)

Signed-off-by: simon-mo <simon.mo@hey.com>
2025-09-17 22:30:02 -07:00
parent b7433ca1a4
commit a904ea78ea
2 changed files with 134 additions and 69 deletions
--- a/vllm/benchmarks/lib/endpoint_request_func.py
+++ b/vllm/benchmarks/lib/endpoint_request_func.py
@@ -89,6 +89,7 @@ class RequestFuncOutput:
    tpot: float = 0.0  # avg next-token latencies
    prompt_len: int = 0
    error: str = ""
+    start_time: float = 0.0


 async def async_request_openai_completions(
@@ -140,6 +141,7 @@ async def async_request_openai_completions(

    generated_text = ""
    st = time.perf_counter()
+    output.start_time = st
    most_recent_timestamp = st
    try:
        async with session.post(url=api_url, json=payload,
@@ -272,6 +274,7 @@ async def async_request_openai_chat_completions(
    generated_text = ""
    ttft = 0.0
    st = time.perf_counter()
+    output.start_time = st
    most_recent_timestamp = st
    try:
        async with session.post(url=api_url, json=payload,
@@ -396,6 +399,7 @@ async def async_request_openai_audio(
        generated_text = ""
        ttft = 0.0
        st = time.perf_counter()
+        output.start_time = st
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url,
@@ -475,6 +479,7 @@ async def async_request_openai_embeddings(

    output = RequestFuncOutput()
    st = time.perf_counter()
+    output.start_time = st
    try:
        async with session.post(
            url=api_url,