[Bugfix] Fix vLLM UsageInfo and logprobs None AssertionError with empty token_ids (#9034)
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
This commit is contained in:
@@ -435,6 +435,12 @@ class OpenAIServingChat(OpenAIServing):
                 logprobs = None

                 delta_text = output.text

                 if not delta_text and not output.token_ids and \
                     not previous_num_tokens[i]:
                     # Chunked prefill case, don't return empty chunks
                     continue

                 delta_message: Optional[DeltaMessage]

                 # handle streaming deltas for tools with named tool_choice
||||
@@ -274,8 +274,6 @@ class OpenAIServingCompletion(OpenAIServing):
             for output in res.outputs:
                 i = output.index + prompt_idx * num_choices
                 # TODO(simon): optimize the performance by avoiding full
                 # text O(n^2) sending.

                 assert request.max_tokens is not None
                 if request.echo and request.max_tokens == 0:
@@ -307,6 +305,11 @@ class OpenAIServingCompletion(OpenAIServing):
                 delta_token_ids = output.token_ids
                 out_logprobs = output.logprobs

                 if not delta_text and not delta_token_ids \
                         and not previous_num_tokens[i]:
                     # Chunked prefill case, don't return empty chunks
                     continue

                 if request.logprobs is not None:
                     assert out_logprobs is not None, (
                         "Did not output logprobs")
Reference in New Issue
Block a user