[responsesAPI] fix incomplete_messages for simple/parsable context (#31836)
Signed-off-by: Andrew Xia <axia@fb.com> Co-authored-by: Andrew Xia <axia@fb.com>
This commit is contained in:
@@ -58,6 +58,7 @@ async def test_basic(client: OpenAI, model_name: str):
|
||||
assert response is not None
|
||||
print("response: ", response)
|
||||
assert response.status == "completed"
|
||||
assert response.incomplete_details is None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -184,3 +185,17 @@ async def test_mcp_tool_call(client: OpenAI, model_name: str):
|
||||
assert len(response.input_messages) == 1
|
||||
assert len(response.output_messages) == 3
|
||||
assert "312" in response.output_messages[2]["message"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_max_tokens(client: OpenAI, model_name: str):
    """Check that a response cut off by ``max_output_tokens`` is incomplete.

    The request caps output at 30 tokens for a prompt that asks for a whole
    paragraph, then expects ``status == "incomplete"`` with
    ``incomplete_details.reason == "max_output_tokens"``.
    """
    response = await client.responses.create(
        model=model_name,
        input="What is the first paragraph of Moby Dick?",
        reasoning={"effort": "low"},
        max_output_tokens=30,
    )
    assert response is not None
    assert response.status == "incomplete"
    # Guard before dereferencing: without this, a missing incomplete_details
    # would surface as an AttributeError instead of a clear assertion failure.
    assert response.incomplete_details is not None
    assert response.incomplete_details.reason == "max_output_tokens"
|
||||
|
||||
@@ -40,6 +40,7 @@ async def test_basic(client: OpenAI, model_name: str):
|
||||
assert response is not None
|
||||
print("response: ", response)
|
||||
assert response.status == "completed"
|
||||
assert response.incomplete_details is None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@@ -132,3 +133,17 @@ async def test_streaming_output_consistency(client: OpenAI, model_name: str):
|
||||
f"Streaming: {streaming_text!r}\n"
|
||||
f"Final: {final_output_text!r}"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_max_tokens(client: OpenAI, model_name: str):
    """Verify that truncation by ``max_output_tokens`` is reported.

    Sends a request limited to 30 output tokens and asserts that the
    response status is ``"incomplete"`` and that the reported reason is
    ``"max_output_tokens"``.
    """
    response = await client.responses.create(
        model=model_name,
        input="What is the first paragraph of Moby Dick?",
        reasoning={"effort": "low"},
        max_output_tokens=30,
    )
    assert response is not None
    assert response.status == "incomplete"
    # incomplete_details must be populated before its reason is read;
    # asserting it explicitly gives a readable failure if it is None.
    assert response.incomplete_details is not None
    assert response.incomplete_details.reason == "max_output_tokens"
|
||||
|
||||
@@ -51,7 +51,13 @@ class ResponsesParser:
|
||||
if tool_parser_cls is not None:
|
||||
self.tool_parser_instance = tool_parser_cls(tokenizer)
|
||||
|
||||
# Store the last finish_reason to determine response status
|
||||
self.finish_reason: str | None = None
|
||||
|
||||
def process(self, output: CompletionOutput) -> "ResponsesParser":
|
||||
# Store the finish_reason from the output
|
||||
self.finish_reason = output.finish_reason
|
||||
|
||||
reasoning_content, content = self.reasoning_parser_instance.extract_reasoning(
|
||||
output.text, request=self.request
|
||||
)
|
||||
|
||||
@@ -692,6 +692,10 @@ class OpenAIServingResponses(OpenAIServing):
|
||||
# TODO: Calculate usage.
|
||||
# assert final_res.prompt_token_ids is not None
|
||||
num_tool_output_tokens = 0
|
||||
|
||||
# Check finish reason from the parser
|
||||
if context.parser.finish_reason == "length":
|
||||
status = "incomplete"
|
||||
else:
|
||||
assert isinstance(context, SimpleContext)
|
||||
# Use final_output which has accumulated text/token_ids/logprobs
|
||||
@@ -703,6 +707,10 @@ class OpenAIServingResponses(OpenAIServing):
|
||||
# finish_reason='error' indicates retryable internal error
|
||||
self._raise_if_error(final_output.finish_reason, request.request_id)
|
||||
|
||||
# Check if generation was stopped due to max_tokens
|
||||
if final_output.finish_reason == "length":
|
||||
status = "incomplete"
|
||||
|
||||
output = self._make_response_output_items(request, final_output, tokenizer)
|
||||
|
||||
if request.enable_response_messages:
|
||||
|
||||
Reference in New Issue
Block a user