[gpt-oss] Add IncompleteDetails to ResponsesResponse (#24561)

Signed-off-by: Andrew Xia <axia@meta.com>
This commit is contained in:
Andrew Xia
2025-09-15 13:07:55 -07:00
committed by GitHub
parent 94b03f88dd
commit 25aba2b6a3
7 changed files with 67 additions and 25 deletions

View File

@@ -74,6 +74,20 @@ async def test_basic_with_reasoning_effort(client: OpenAI, model_name: str):
assert response.status == "completed"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_max_tokens(client: OpenAI, model_name: str):
    """A tiny max_output_tokens budget must cut generation short, so the
    response is reported as incomplete with reason "max_output_tokens"."""
    resp = await client.responses.create(
        model=model_name,
        input="What is the first paragraph of Moby Dick?",
        reasoning={"effort": "low"},
        max_output_tokens=30,
    )
    # The server should still return a response object, just flagged as
    # truncated rather than completed.
    assert resp is not None
    assert resp.status == "incomplete"
    assert resp.incomplete_details.reason == "max_output_tokens"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_chat(client: OpenAI, model_name: str):

View File

@@ -112,6 +112,7 @@ class HarmonyContext(ConversationContext):
available_tools: list[str],
):
self._messages = messages
self.finish_reason: Optional[str] = None
self.available_tools = available_tools
self._tool_sessions: dict[str, Union[ClientSession, Tool]] = {}
self.called_tools: set[str] = set()
@@ -135,7 +136,8 @@ class HarmonyContext(ConversationContext):
if self.parser.current_channel in {"analysis", "commentary"}:
self.num_reasoning_tokens += 1
def append_output(self, output) -> None:
def append_output(self, output: Union[RequestOutput,
list[Message]]) -> None:
if isinstance(output, RequestOutput):
output_token_ids = output.outputs[0].token_ids
self.parser = get_streamable_parser_for_assistant()
@@ -150,6 +152,8 @@ class HarmonyContext(ConversationContext):
# Move current turn to previous turn for next turn's calculations
self.previous_turn = self.current_turn.copy()
output_msgs = self.parser.messages
# The responses finish reason is set in the last message
self.finish_reason = output.outputs[0].finish_reason
else:
# Tool output.
output_msgs = output
@@ -385,7 +389,8 @@ class StreamingHarmonyContext(HarmonyContext):
def messages(self) -> list:
return self.parser.messages
def append_output(self, output) -> None:
def append_output(self, output: Union[RequestOutput,
list[Message]]) -> None:
if isinstance(output, RequestOutput):
# append_output is called for each output token in streaming case,
# so we only want to add the prompt tokens once for each message.

View File

@@ -387,7 +387,9 @@ def parse_remaining_state(
id=f"msg_{random_uuid()}",
content=[output_text],
role="assistant",
status="completed",
# if the parser still has messages (ie if the generator got cut
# abruptly), this should be incomplete
status="incomplete",
type="message",
)
return [text_item]

View File

@@ -30,7 +30,7 @@ except ImportError: # For newer openai versions (>= 1.100.0)
from openai.types.responses import (ResponseFormatTextConfig as
ResponseTextConfig)
from openai.types.responses.response import ToolChoice
from openai.types.responses.response import IncompleteDetails, ToolChoice
from openai.types.responses.tool import Tool
from openai.types.shared import Metadata, Reasoning
from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter,
@@ -1868,7 +1868,7 @@ class ResponsesResponse(OpenAIBaseModel):
id: str = Field(default_factory=lambda: f"resp_{random_uuid()}")
created_at: int = Field(default_factory=lambda: int(time.time()))
# error: Optional[ResponseError] = None
# incomplete_details: Optional[IncompleteDetails] = None
incomplete_details: Optional[IncompleteDetails] = None
instructions: Optional[str] = None
metadata: Optional[Metadata] = None
model: str
@@ -1904,9 +1904,18 @@ class ResponsesResponse(OpenAIBaseModel):
status: ResponseStatus,
usage: Optional[ResponseUsage] = None,
) -> "ResponsesResponse":
incomplete_details: Optional[IncompleteDetails] = None
if status == 'incomplete':
incomplete_details = IncompleteDetails(reason='max_output_tokens')
# TODO: implement the other reason for incomplete_details,
# which is content_filter
# incomplete_details = IncompleteDetails(reason='content_filter')
return cls(
id=request.request_id,
created_at=created_time,
incomplete_details=incomplete_details,
instructions=request.instructions,
metadata=request.metadata,
model=model_name,

View File

@@ -27,7 +27,7 @@ from openai.types.responses import (ResponseCreatedEvent,
ResponseReasoningItem,
ResponseReasoningTextDeltaEvent,
ResponseReasoningTextDoneEvent,
response_text_delta_event)
ResponseStatus, response_text_delta_event)
from openai.types.responses.response_output_text import (Logprob,
LogprobTopLogprob)
# yapf: enable
@@ -461,10 +461,22 @@ class OpenAIServingResponses(OpenAIServing):
# TODO: Use a vllm-specific Validation Error
return self.create_error_response(str(e))
# NOTE: Implementation of status is still WIP, but for now
# we guarantee that if the status is not "completed", it is accurate.
# "completed" is implemented as the "catch-all" for now.
status: ResponseStatus = "completed"
if self.use_harmony:
assert isinstance(context, HarmonyContext)
output = self._make_response_output_items_with_harmony(context)
num_tool_output_tokens = context.num_tool_output_tokens
if len(output) > 0:
if context.finish_reason == "length":
status = "incomplete"
elif context.finish_reason == "abort":
status = "cancelled"
else:
status = "incomplete"
else:
assert isinstance(context, SimpleContext)
final_res = context.last_output
@@ -501,7 +513,7 @@ class OpenAIServingResponses(OpenAIServing):
model_name=model_name,
created_time=created_time,
output=output,
status="completed",
status=status,
usage=usage,
)
@@ -658,7 +670,7 @@ class OpenAIServingResponses(OpenAIServing):
self,
context: HarmonyContext,
) -> list[ResponseOutputItem]:
output_items = []
output_items: list[ResponseOutputItem] = []
num_init_messages = context.num_init_messages
for msg in context.messages[num_init_messages:]:
output_items.extend(parse_output_message(msg))