[Bugfix] Fix completion API stream returning two stop chunks (#3450)

Co-authored-by: Dylan Hawk <dylanwawk@gmail.com>
2024-03-25 10:14:34 -07:00
parent c13ad1b7bd
commit 0b4997e05c
2 changed files with 25 additions and 27 deletions
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -266,6 +266,16 @@ class OpenAIServingCompletion(OpenAIServing):
                    previous_texts[i] = output.text
                    previous_num_tokens[i] = len(output.token_ids)
                    finish_reason = output.finish_reason
+                    if output.finish_reason is not None:  # return final usage
+                        prompt_tokens = len(res.prompt_token_ids)
+                        completion_tokens = len(output.token_ids)
+                        final_usage = UsageInfo(
+                            prompt_tokens=prompt_tokens,
+                            completion_tokens=completion_tokens,
+                            total_tokens=prompt_tokens + completion_tokens,
+                        )
+                    else:
+                        final_usage = None
                    response_json = CompletionStreamResponse(
                        id=request_id,
                        created=created_time,
@@ -277,34 +287,10 @@ class OpenAIServingCompletion(OpenAIServing):
                                logprobs=logprobs,
                                finish_reason=finish_reason,
                            )
-                        ]).model_dump_json()
+                        ],
+                        usage=final_usage,
+                    ).model_dump_json(exclude_unset=True)
                    yield f"data: {response_json}\n\n"
-
-                    if output.finish_reason is not None:  # return final usage
-                        logprobs = LogProbs(
-                        ) if request.logprobs is not None else None
-                        prompt_tokens = len(res.prompt_token_ids)
-                        completion_tokens = len(output.token_ids)
-                        final_usage = UsageInfo(
-                            prompt_tokens=prompt_tokens,
-                            completion_tokens=completion_tokens,
-                            total_tokens=prompt_tokens + completion_tokens,
-                        )
-                        response_json = CompletionStreamResponse(
-                            id=request_id,
-                            created=created_time,
-                            model=model_name,
-                            choices=[
-                                CompletionResponseStreamChoice(
-                                    index=i,
-                                    text="",
-                                    logprobs=logprobs,
-                                    finish_reason=output.finish_reason,
-                                )
-                            ],
-                            usage=final_usage,
-                        ).model_dump_json()
-                        yield f"data: {response_json}\n\n"
        except ValueError as e:
            # TODO: Use a vllm-specific Validation Error
            data = self.create_streaming_error_response(str(e))