From 1f8b7c536be40975573eeebf36204286cfb4e4e9 Mon Sep 17 00:00:00 2001
From: Andrew Xia
Date: Fri, 9 Jan 2026 16:00:57 -0500
Subject: [PATCH] [responsesAPI] fix incomplete_messages for simple/parsable
 context (#31836)

Signed-off-by: Andrew Xia
Co-authored-by: Andrew Xia
---
 .../openai/test_response_api_parsable_context.py  | 15 +++++++++++++++
 .../openai/test_response_api_simple.py            | 15 +++++++++++++++
 .../entrypoints/openai/parser/responses_parser.py |  6 ++++++
 vllm/entrypoints/openai/serving_responses.py      |  8 ++++++++
 4 files changed, 44 insertions(+)

diff --git a/tests/entrypoints/openai/test_response_api_parsable_context.py b/tests/entrypoints/openai/test_response_api_parsable_context.py
index 6d97602f3..1e2fd3751 100644
--- a/tests/entrypoints/openai/test_response_api_parsable_context.py
+++ b/tests/entrypoints/openai/test_response_api_parsable_context.py
@@ -58,6 +58,7 @@ async def test_basic(client: OpenAI, model_name: str):
     assert response is not None
     print("response: ", response)
     assert response.status == "completed"
+    assert response.incomplete_details is None


 @pytest.mark.asyncio
@@ -184,3 +185,17 @@ async def test_mcp_tool_call(client: OpenAI, model_name: str):
     assert len(response.input_messages) == 1
     assert len(response.output_messages) == 3
     assert "312" in response.output_messages[2]["message"]
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_max_tokens(client: OpenAI, model_name: str):
+    response = await client.responses.create(
+        model=model_name,
+        input="What is the first paragraph of Moby Dick?",
+        reasoning={"effort": "low"},
+        max_output_tokens=30,
+    )
+    assert response is not None
+    assert response.status == "incomplete"
+    assert response.incomplete_details.reason == "max_output_tokens"
diff --git a/tests/entrypoints/openai/test_response_api_simple.py b/tests/entrypoints/openai/test_response_api_simple.py
index 02e06297f..e39e6ab7c 100644
--- a/tests/entrypoints/openai/test_response_api_simple.py
+++ b/tests/entrypoints/openai/test_response_api_simple.py
@@ -40,6 +40,7 @@ async def test_basic(client: OpenAI, model_name: str):
     assert response is not None
     print("response: ", response)
     assert response.status == "completed"
+    assert response.incomplete_details is None


 @pytest.mark.asyncio
@@ -132,3 +133,17 @@ async def test_streaming_output_consistency(client: OpenAI, model_name: str):
         f"Streaming: {streaming_text!r}\n"
         f"Final: {final_output_text!r}"
     )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_max_tokens(client: OpenAI, model_name: str):
+    response = await client.responses.create(
+        model=model_name,
+        input="What is the first paragraph of Moby Dick?",
+        reasoning={"effort": "low"},
+        max_output_tokens=30,
+    )
+    assert response is not None
+    assert response.status == "incomplete"
+    assert response.incomplete_details.reason == "max_output_tokens"
diff --git a/vllm/entrypoints/openai/parser/responses_parser.py b/vllm/entrypoints/openai/parser/responses_parser.py
index 26f185b85..14a6f5cb7 100644
--- a/vllm/entrypoints/openai/parser/responses_parser.py
+++ b/vllm/entrypoints/openai/parser/responses_parser.py
@@ -51,7 +51,13 @@ class ResponsesParser:
         if tool_parser_cls is not None:
             self.tool_parser_instance = tool_parser_cls(tokenizer)

+        # Store the last finish_reason to determine response status
+        self.finish_reason: str | None = None
+
     def process(self, output: CompletionOutput) -> "ResponsesParser":
+        # Store the finish_reason from the output
+        self.finish_reason = output.finish_reason
+
         reasoning_content, content = self.reasoning_parser_instance.extract_reasoning(
             output.text, request=self.request
         )
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index 8bde4d482..c62fa0f7d 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -692,6 +692,10 @@ class OpenAIServingResponses(OpenAIServing):
             # TODO: Calculate usage.
             # assert final_res.prompt_token_ids is not None
             num_tool_output_tokens = 0
+
+            # Check finish reason from the parser
+            if context.parser.finish_reason == "length":
+                status = "incomplete"
         else:
             assert isinstance(context, SimpleContext)
             # Use final_output which has accumulated text/token_ids/logprobs
@@ -703,6 +707,10 @@ class OpenAIServingResponses(OpenAIServing):
             # finish_reason='error' indicates retryable internal error
             self._raise_if_error(final_output.finish_reason, request.request_id)

+            # Check if generation was stopped due to max_tokens
+            if final_output.finish_reason == "length":
+                status = "incomplete"
+
             output = self._make_response_output_items(request, final_output, tokenizer)

             if request.enable_response_messages:
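
As context for reviewers, below is a minimal client-side sketch of the behavior
this patch surfaces, mirroring the new test_max_tokens tests. It is an
illustration only: the base_url, api_key, and model name are placeholder values
for a local vLLM deployment, not values taken from this change.

    from openai import OpenAI

    # Placeholder endpoint/credentials for a locally served vLLM instance.
    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

    response = client.responses.create(
        model="my-model",  # placeholder: any model served by this instance
        input="What is the first paragraph of Moby Dick?",
        max_output_tokens=30,
    )

    # With this patch, a generation truncated by max_output_tokens is reported
    # as "incomplete" (with incomplete_details populated) instead of "completed".
    if response.status == "incomplete":
        print("truncated:", response.incomplete_details.reason)  # "max_output_tokens"
    else:
        print("completed:", response.output_text)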