[responsesAPI] fix incomplete_messages for simple/parsable context (#31836)

Signed-off-by: Andrew Xia <axia@fb.com>
Co-authored-by: Andrew Xia <axia@fb.com>
This commit is contained in:
Andrew Xia
2026-01-09 16:00:57 -05:00
committed by GitHub
parent 0a0aa07747
commit 1f8b7c536b
4 changed files with 44 additions and 0 deletions

View File

@@ -58,6 +58,7 @@ async def test_basic(client: OpenAI, model_name: str):
assert response is not None
print("response: ", response)
assert response.status == "completed"
assert response.incomplete_details is None
@pytest.mark.asyncio
@@ -184,3 +185,17 @@ async def test_mcp_tool_call(client: OpenAI, model_name: str):
assert len(response.input_messages) == 1
assert len(response.output_messages) == 3
assert "312" in response.output_messages[2]["message"]
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_max_tokens(client: OpenAI, model_name: str):
    """A response truncated by max_output_tokens must report incomplete status.

    With a 30-token budget the model cannot finish the requested paragraph,
    so the server should mark the response ``incomplete`` and set
    ``incomplete_details.reason`` to ``"max_output_tokens"``.
    """
    resp = await client.responses.create(
        model=model_name,
        input="What is the first paragraph of Moby Dick?",
        reasoning={"effort": "low"},
        max_output_tokens=30,
    )
    assert resp is not None
    assert resp.status == "incomplete"
    assert resp.incomplete_details.reason == "max_output_tokens"

View File

@@ -40,6 +40,7 @@ async def test_basic(client: OpenAI, model_name: str):
assert response is not None
print("response: ", response)
assert response.status == "completed"
assert response.incomplete_details is None
@pytest.mark.asyncio
@@ -132,3 +133,17 @@ async def test_streaming_output_consistency(client: OpenAI, model_name: str):
f"Streaming: {streaming_text!r}\n"
f"Final: {final_output_text!r}"
)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_max_tokens(client: OpenAI, model_name: str):
    """Verify that hitting the max_output_tokens cap yields an incomplete response.

    The 30-token cap is far too small for the requested paragraph, so the
    response status must be ``incomplete`` with reason ``"max_output_tokens"``.
    """
    # Build the request once so the truncation trigger (max_output_tokens=30)
    # is easy to spot next to the rest of the parameters.
    request_kwargs = {
        "model": model_name,
        "input": "What is the first paragraph of Moby Dick?",
        "reasoning": {"effort": "low"},
        "max_output_tokens": 30,
    }
    response = await client.responses.create(**request_kwargs)
    assert response is not None
    assert response.status == "incomplete"
    assert response.incomplete_details.reason == "max_output_tokens"

View File

@@ -51,7 +51,13 @@ class ResponsesParser:
if tool_parser_cls is not None:
self.tool_parser_instance = tool_parser_cls(tokenizer)
# Store the last finish_reason to determine response status
self.finish_reason: str | None = None
def process(self, output: CompletionOutput) -> "ResponsesParser":
# Store the finish_reason from the output
self.finish_reason = output.finish_reason
reasoning_content, content = self.reasoning_parser_instance.extract_reasoning(
output.text, request=self.request
)

View File

@@ -692,6 +692,10 @@ class OpenAIServingResponses(OpenAIServing):
# TODO: Calculate usage.
# assert final_res.prompt_token_ids is not None
num_tool_output_tokens = 0
# Check finish reason from the parser
if context.parser.finish_reason == "length":
status = "incomplete"
else:
assert isinstance(context, SimpleContext)
# Use final_output which has accumulated text/token_ids/logprobs
@@ -703,6 +707,10 @@ class OpenAIServingResponses(OpenAIServing):
# finish_reason='error' indicates retryable internal error
self._raise_if_error(final_output.finish_reason, request.request_id)
# Check if generation was stopped due to max_tokens
if final_output.finish_reason == "length":
status = "incomplete"
output = self._make_response_output_items(request, final_output, tokenizer)
if request.enable_response_messages: