[gpt-oss] Add IncompleteDetails to ResponsesResponse (#24561)

Signed-off-by: Andrew Xia <axia@meta.com>
This commit is contained in:
Andrew Xia
2025-09-15 13:07:55 -07:00
committed by GitHub
parent 94b03f88dd
commit 25aba2b6a3
7 changed files with 67 additions and 25 deletions

View File

@@ -112,6 +112,7 @@ class HarmonyContext(ConversationContext):
available_tools: list[str],
):
self._messages = messages
self.finish_reason: Optional[str] = None
self.available_tools = available_tools
self._tool_sessions: dict[str, Union[ClientSession, Tool]] = {}
self.called_tools: set[str] = set()
@@ -135,7 +136,8 @@ class HarmonyContext(ConversationContext):
if self.parser.current_channel in {"analysis", "commentary"}:
self.num_reasoning_tokens += 1
def append_output(self, output) -> None:
def append_output(self, output: Union[RequestOutput,
list[Message]]) -> None:
if isinstance(output, RequestOutput):
output_token_ids = output.outputs[0].token_ids
self.parser = get_streamable_parser_for_assistant()
@@ -150,6 +152,8 @@ class HarmonyContext(ConversationContext):
# Move current turn to previous turn for next turn's calculations
self.previous_turn = self.current_turn.copy()
output_msgs = self.parser.messages
# The responses finish reason is set in the last message
self.finish_reason = output.outputs[0].finish_reason
else:
# Tool output.
output_msgs = output
@@ -157,18 +161,18 @@ class HarmonyContext(ConversationContext):
def _update_prefill_token_usage(self, output: RequestOutput) -> None:
"""Update token usage statistics for the prefill phase of generation.
The prefill phase processes the input prompt tokens. This method:
1. Counts the prompt tokens for this turn
2. Calculates tool output tokens for multi-turn conversations
3. Updates cached token counts
4. Tracks state for next turn calculations
Tool output tokens are calculated as:
current_prompt_tokens - last_turn_prompt_tokens -
current_prompt_tokens - last_turn_prompt_tokens -
last_turn_output_tokens
This represents tokens added between turns (typically tool responses).
Args:
output: The RequestOutput containing prompt token information
"""
@@ -214,18 +218,18 @@ class HarmonyContext(ConversationContext):
def _update_decode_token_usage(self, output: RequestOutput) -> int:
"""Update token usage statistics for the decode phase of generation.
The decode phase processes the generated output tokens. This method:
1. Counts output tokens from all completion outputs
2. Updates the total output token count
3. Tracks tokens generated in the current turn
In streaming mode, this is called for each token generated.
In non-streaming mode, this is called once with all output tokens.
Args:
output: The RequestOutput containing generated token information
Returns:
int: Number of output tokens processed in this call
"""
@@ -385,7 +389,8 @@ class StreamingHarmonyContext(HarmonyContext):
def messages(self) -> list:
return self.parser.messages
def append_output(self, output) -> None:
def append_output(self, output: Union[RequestOutput,
list[Message]]) -> None:
if isinstance(output, RequestOutput):
# append_output is called for each output token in streaming case,
# so we only want to add the prompt tokens once for each message.