[Feature][Frontend]: Continued stream_options implementation also in CompletionRequest (#5319)

2024-06-10 17:22:09 +03:00
parent 6b29d6fe70
commit 774d1035e4
4 changed files with 180 additions and 126 deletions
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -264,7 +264,8 @@ class OpenAIServingCompletion(OpenAIServing):
                        )
                    else:
                        final_usage = None
-                    response_json = CompletionStreamResponse(
+
+                    chunk = CompletionStreamResponse(
                        id=request_id,
                        created=created_time,
                        model=model_name,
@@ -276,10 +277,27 @@ class OpenAIServingCompletion(OpenAIServing):
                                finish_reason=finish_reason,
                                stop_reason=stop_reason,
                            )
-                        ],
-                        usage=final_usage,
-                    ).model_dump_json(exclude_unset=True)
+                        ])
+                    if (request.stream_options
+                            and request.stream_options.include_usage):
+                        chunk.usage = None
+
+                    response_json = chunk.model_dump_json(exclude_unset=True)
                    yield f"data: {response_json}\n\n"
+
+            if (request.stream_options
+                    and request.stream_options.include_usage):
+                final_usage_chunk = CompletionStreamResponse(
+                    id=request_id,
+                    created=created_time,
+                    model=model_name,
+                    choices=[],
+                    usage=final_usage,
+                )
+                final_usage_data = (final_usage_chunk.model_dump_json(
+                    exclude_unset=True, exclude_none=True))
+                yield f"data: {final_usage_data}\n\n"
+
        except ValueError as e:
            # TODO: Use a vllm-specific Validation Error
            data = self.create_streaming_error_response(str(e))