class TestShouldContinueFinalMessage:
    """Tests for should_continue_final_message function.

    This function enables Anthropic-style partial message completion, where
    users can provide an incomplete assistant message and have the model
    continue from where it left off.
    """

    def test_string_input_returns_false(self):
        """String input is always a user message, so should not continue."""
        assert should_continue_final_message("Hello, world!") is False

    def test_empty_list_returns_false(self):
        """Empty list should not continue."""
        assert should_continue_final_message([]) is False

    def test_completed_message_returns_false(self):
        """Completed message should not be continued."""
        output_item = ResponseOutputMessage(
            id="msg_123",
            content=[
                ResponseOutputText(
                    annotations=[],
                    text="The answer is 42.",
                    type="output_text",
                    logprobs=None,
                )
            ],
            role="assistant",
            status="completed",
            type="message",
        )
        assert should_continue_final_message([output_item]) is False

    def test_in_progress_message_returns_true(self):
        """In-progress message should be continued.

        This is the key use case for partial message completion.
        Example: The user provides "The best answer is (" and wants
        the model to continue from there.
        """
        output_item = ResponseOutputMessage(
            id="msg_123",
            content=[
                ResponseOutputText(
                    annotations=[],
                    text="The best answer is (",
                    type="output_text",
                    logprobs=None,
                )
            ],
            role="assistant",
            status="in_progress",
            type="message",
        )
        assert should_continue_final_message([output_item]) is True

    def test_incomplete_message_returns_true(self):
        """Incomplete message should be continued."""
        output_item = ResponseOutputMessage(
            id="msg_123",
            content=[
                ResponseOutputText(
                    annotations=[],
                    text="The answer",
                    type="output_text",
                    logprobs=None,
                )
            ],
            role="assistant",
            status="incomplete",
            type="message",
        )
        assert should_continue_final_message([output_item]) is True

    def test_in_progress_reasoning_returns_true(self):
        """In-progress reasoning should be continued."""
        reasoning_item = ResponseReasoningItem(
            id="reasoning_123",
            summary=[],
            type="reasoning",
            content=[
                Content(
                    text="Let me think about this...",
                    type="reasoning_text",
                )
            ],
            encrypted_content=None,
            status="in_progress",
        )
        assert should_continue_final_message([reasoning_item]) is True

    def test_incomplete_reasoning_returns_true(self):
        """Incomplete reasoning should be continued."""
        reasoning_item = ResponseReasoningItem(
            id="reasoning_123",
            summary=[],
            type="reasoning",
            content=[
                Content(
                    text="Let me think",
                    type="reasoning_text",
                )
            ],
            encrypted_content=None,
            status="incomplete",
        )
        assert should_continue_final_message([reasoning_item]) is True

        # Same check with a plain-dict reasoning item (e.g. raw JSON input).
        reasoning_item = {
            "id": "reasoning_123",
            "summary": [],
            "type": "reasoning",
            "content": [],
            "status": "incomplete",
        }
        assert should_continue_final_message([reasoning_item]) is True

    def test_completed_reasoning_returns_false(self):
        """Completed reasoning should not be continued."""
        reasoning_item = ResponseReasoningItem(
            id="reasoning_123",
            summary=[],
            type="reasoning",
            content=[
                Content(
                    text="I have thought about this.",
                    type="reasoning_text",
                )
            ],
            encrypted_content=None,
            status="completed",
        )
        assert should_continue_final_message([reasoning_item]) is False

    def test_reasoning_with_none_status_returns_false(self):
        """Reasoning with None status should not be continued."""
        reasoning_item = ResponseReasoningItem(
            id="reasoning_123",
            summary=[],
            type="reasoning",
            content=[
                Content(
                    text="Some reasoning",
                    type="reasoning_text",
                )
            ],
            encrypted_content=None,
            status=None,
        )
        assert should_continue_final_message([reasoning_item]) is False

    def test_only_last_item_matters(self):
        """Only the last item in the list determines continuation."""
        completed_item = ResponseOutputMessage(
            id="msg_1",
            content=[
                ResponseOutputText(
                    annotations=[],
                    text="Complete message.",
                    type="output_text",
                    logprobs=None,
                )
            ],
            role="assistant",
            status="completed",
            type="message",
        )
        in_progress_item = ResponseOutputMessage(
            id="msg_2",
            content=[
                ResponseOutputText(
                    annotations=[],
                    text="Partial message...",
                    type="output_text",
                    logprobs=None,
                )
            ],
            role="assistant",
            status="in_progress",
            type="message",
        )

        # In-progress as last item -> should continue
        assert should_continue_final_message([completed_item, in_progress_item]) is True

        # Completed as last item -> should not continue
        assert (
            should_continue_final_message([in_progress_item, completed_item]) is False
        )

    def test_tool_call_returns_false(self):
        """Tool calls should not trigger continuation."""
        tool_call = ResponseFunctionToolCall(
            id="fc_123",
            call_id="call_123",
            type="function_call",
            status="in_progress",
            name="get_weather",
            arguments='{"location": "NYC"}',
        )
        assert should_continue_final_message([tool_call]) is False

        # Same check with a plain-dict tool call (e.g. raw JSON input).
        tool_call = {
            "id": "msg_123",
            "call_id": "call_123",
            "type": "function_call",
            "status": "in_progress",
            "name": "get_weather",
            "arguments": '{"location": "NYC"}',
        }
        assert should_continue_final_message([tool_call]) is False

    # Tests for dict inputs (e.g., from curl requests)
    def test_dict_in_progress_message_returns_true(self):
        """Dict with in_progress status should be continued (curl input)."""
        dict_item = {
            "id": "msg_123",
            "type": "message",
            "role": "assistant",
            "status": "in_progress",
            "content": [{"type": "output_text", "text": "The answer is ("}],
        }
        assert should_continue_final_message([dict_item]) is True

    def test_dict_incomplete_message_returns_true(self):
        """Dict with incomplete status should be continued (curl input)."""
        dict_item = {
            "id": "msg_123",
            "type": "message",
            "role": "assistant",
            "status": "incomplete",
            "content": [{"type": "output_text", "text": "Partial answer"}],
        }
        assert should_continue_final_message([dict_item]) is True

    def test_dict_completed_message_returns_false(self):
        """Dict with completed status should not be continued (curl input)."""
        dict_item = {
            "id": "msg_123",
            "type": "message",
            "role": "assistant",
            "status": "completed",
            "content": [{"type": "output_text", "text": "Complete answer."}],
        }
        assert should_continue_final_message([dict_item]) is False

    def test_dict_reasoning_in_progress_returns_true(self):
        """Dict reasoning item with in_progress status should be continued."""
        dict_item = {
            "id": "reasoning_123",
            "type": "reasoning",
            "status": "in_progress",
            "content": [{"type": "reasoning_text", "text": "Let me think..."}],
        }
        assert should_continue_final_message([dict_item]) is True

    def test_dict_without_status_returns_false(self):
        """Dict without status field should not be continued."""
        dict_item = {
            "id": "msg_123",
            "type": "message",
            "role": "assistant",
            "content": [{"type": "output_text", "text": "Some text"}],
        }
        assert should_continue_final_message([dict_item]) is False

    def test_dict_with_none_status_returns_false(self):
        """Dict with None status should not be continued."""
        dict_item = {
            "id": "msg_123",
            "type": "message",
            "role": "assistant",
            "status": None,
            "content": [{"type": "output_text", "text": "Some text"}],
        }
        assert should_continue_final_message([dict_item]) is False
def should_continue_final_message(
    request_input: str | list[ResponseInputOutputItem],
) -> bool:
    """Return True when the final input item is a partial assistant message
    that should be continued rather than starting a new generation.

    This enables partial message completion similar to Anthropic's Messages
    API, where users can provide an incomplete assistant message and have the
    model continue from where it left off.

    An item is considered partial when:

    1. it is a message or reasoning item — either a typed
       ``ResponseOutputMessage`` / ``ResponseReasoningItem`` object or an
       equivalent plain dict (e.g. a raw JSON body from curl), and
    2. its status is "in_progress" or "incomplete".

    Tool calls and items with any other (or missing) status never trigger
    continuation.

    Args:
        request_input: The input to the Responses API request.

    Returns:
        True if the final message should be continued, False otherwise.
    """
    # A plain string input is always a user message.
    if isinstance(request_input, str):
        return False

    # An empty input list has nothing to continue.
    if not request_input:
        return False

    last_item = request_input[-1]

    # Plain-dict items (e.g. raw JSON from curl): only "message" and
    # "reasoning" items support partial completion; an absent "type" is
    # treated as "message".
    if isinstance(last_item, dict):
        if last_item.get("type", "message") not in ("message", "reasoning"):
            return False
        return last_item.get("status") in ("in_progress", "incomplete")

    # Typed message / reasoning items: continue only when the message was
    # left unfinished. Both types carry the same optional `status` field,
    # so a single merged check suffices.
    if isinstance(last_item, (ResponseOutputMessage, ResponseReasoningItem)):
        return last_item.status in ("in_progress", "incomplete")

    # Anything else (tool calls, unknown item types) starts a new message.
    return False
request_instructions: str | None = None,