[Doc] Add typing hints / mypy types cleanup (#3816)

Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2024-04-11 17:17:21 -07:00
parent e46a60aa4c
commit c2b4a1bce9
11 changed files with 90 additions and 64 deletions
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -27,8 +27,8 @@ class RequestFuncInput:
 class RequestFuncOutput:
    generated_text: str = ""
    success: bool = False
-    latency: float = 0
-    ttft: float = 0  # Time to first token
+    latency: float = 0.0
+    ttft: float = 0.0  # Time to first token
    itl: List[float] = field(
        default_factory=list)  # List of inter-token latencies
    prompt_len: int = 0
@@ -58,23 +58,24 @@ async def async_request_tgi(
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

-        ttft = 0
+        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload) as response:
                if response.status == 200:
-                    async for chunk in response.content:
-                        chunk = chunk.strip()
-                        if not chunk:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
                            continue

-                        chunk = remove_prefix(chunk.decode("utf-8"), "data:")
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+                                              "data:")

                        data = json.loads(chunk)
                        timestamp = time.perf_counter()
                        # First token
-                        if ttft == 0:
+                        if ttft == 0.0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft

@@ -119,23 +120,24 @@ async def async_request_trt_llm(
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

-        ttft = 0
+        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload) as response:
                if response.status == 200:
-                    async for chunk in response.content:
-                        chunk = chunk.strip()
-                        if not chunk:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
                            continue

-                        chunk = remove_prefix(chunk.decode("utf-8"), "data:")
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+                                              "data:")

                        data = json.loads(chunk)
                        timestamp = time.perf_counter()
                        # First token
-                        if ttft == 0:
+                        if ttft == 0.0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft

@@ -151,7 +153,7 @@ async def async_request_trt_llm(
                    output.success = True

                else:
-                    output.error = response.reason
+                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
@@ -195,7 +197,7 @@ async def async_request_deepspeed_mii(
                    output.generated_text = parsed_resp["text"][0]
                    output.success = True
                else:
-                    output.error = response.reason
+                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
@@ -234,19 +236,20 @@ async def async_request_openai_completions(
        output.prompt_len = request_func_input.prompt_len

        generated_text = ""
-        ttft = 0
+        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
-                    async for chunk in response.content:
-                        chunk = chunk.strip()
-                        if not chunk:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
                            continue

-                        chunk = remove_prefix(chunk.decode("utf-8"), "data: ")
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+                                              "data: ")
                        if chunk == "[DONE]":
                            latency = time.perf_counter() - st
                        else:
@@ -255,7 +258,7 @@ async def async_request_openai_completions(
                            if data["choices"][0]["text"]:
                                timestamp = time.perf_counter()
                                # First token
-                                if ttft == 0:
+                                if ttft == 0.0:
                                    ttft = time.perf_counter() - st
                                    output.ttft = ttft

@@ -315,19 +318,20 @@ async def async_request_openai_chat_completions(
        output.prompt_len = request_func_input.prompt_len

        generated_text = ""
-        ttft = 0
+        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
-                    async for chunk in response.content:
-                        chunk = chunk.strip()
-                        if not chunk:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
                            continue

-                        chunk = remove_prefix(chunk.decode("utf-8"), "data: ")
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+                                              "data: ")
                        if chunk == "[DONE]":
                            latency = time.perf_counter() - st
                        else:
@@ -337,7 +341,7 @@ async def async_request_openai_chat_completions(
                            delta = data["choices"][0]["delta"]
                            if delta.get("content", None):
                                # First token
-                                if ttft == 0:
+                                if ttft == 0.0:
                                    ttft = time.perf_counter() - st
                                    output.ttft = ttft

@@ -354,7 +358,7 @@ async def async_request_openai_chat_completions(
                    output.success = True
                    output.latency = latency
                else:
-                    output.error = response.reason
+                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False