[responseAPI] support partial message generation (#32100)

Signed-off-by: Andrew Xia <axia@fb.com> Signed-off-by: Andrew Xia <mitandrewxia@gmail.com> Signed-off-by: Lu Fang <30275821+houseroad@users.noreply.github.com> Co-authored-by: Andrew Xia <axia@fb.com> Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com>
2026-01-13 13:41:26 -05:00
parent 6beef12b9b
commit af54d2e2d0
3 changed files with 337 additions and 0 deletions
--- a/tests/entrypoints/test_responses_utils.py
+++ b/tests/entrypoints/test_responses_utils.py
@@ -21,6 +21,7 @@ from vllm.entrypoints.responses_utils import (
    _maybe_combine_reasoning_and_tool_call,
    construct_chat_messages_with_tool_call,
    convert_tool_responses_to_completions_format,
+    should_continue_final_message,
 )


@@ -165,6 +166,285 @@ class TestResponsesUtils:
        assert formatted_item["content"] == "dongyi"


+class TestShouldContinueFinalMessage:
+    """Tests for the should_continue_final_message function.
+
+    The function enables Anthropic-style partial message completion, where
+    users provide an incomplete assistant message (or reasoning item) as the
+    last input item and the model continues from where it left off.
+    """
+
+    def test_string_input_returns_false(self):
+        """String input is always a user message, so should not continue."""
+        assert should_continue_final_message("Hello, world!") is False
+
+    def test_empty_list_returns_false(self):
+        """Empty list should not continue."""
+        assert should_continue_final_message([]) is False
+
+    def test_completed_message_returns_false(self):
+        """Completed message should not be continued."""
+        output_item = ResponseOutputMessage(
+            id="msg_123",
+            content=[
+                ResponseOutputText(
+                    annotations=[],
+                    text="The answer is 42.",
+                    type="output_text",
+                    logprobs=None,
+                )
+            ],
+            role="assistant",
+            status="completed",
+            type="message",
+        )
+        assert should_continue_final_message([output_item]) is False
+
+    def test_in_progress_message_returns_true(self):
+        """In-progress message should be continued.
+
+        This is the key use case for partial message completion.
+        Example: The user provides "The best answer is (" and wants
+        the model to continue from there.
+        """
+        output_item = ResponseOutputMessage(
+            id="msg_123",
+            content=[
+                ResponseOutputText(
+                    annotations=[],
+                    text="The best answer is (",
+                    type="output_text",
+                    logprobs=None,
+                )
+            ],
+            role="assistant",
+            status="in_progress",
+            type="message",
+        )
+        assert should_continue_final_message([output_item]) is True
+
+    def test_incomplete_message_returns_true(self):
+        """Incomplete message should be continued."""
+        output_item = ResponseOutputMessage(
+            id="msg_123",
+            content=[
+                ResponseOutputText(
+                    annotations=[],
+                    text="The answer",
+                    type="output_text",
+                    logprobs=None,
+                )
+            ],
+            role="assistant",
+            status="incomplete",
+            type="message",
+        )
+        assert should_continue_final_message([output_item]) is True
+
+    def test_in_progress_reasoning_returns_true(self):
+        """In-progress reasoning should be continued."""
+        reasoning_item = ResponseReasoningItem(
+            id="reasoning_123",
+            summary=[],
+            type="reasoning",
+            content=[
+                Content(
+                    text="Let me think about this...",
+                    type="reasoning_text",
+                )
+            ],
+            encrypted_content=None,
+            status="in_progress",
+        )
+        assert should_continue_final_message([reasoning_item]) is True
+
+    def test_incomplete_reasoning_returns_true(self):
+        """Incomplete reasoning (typed item or plain dict) should be continued."""
+        reasoning_item = ResponseReasoningItem(
+            id="reasoning_123",
+            summary=[],
+            type="reasoning",
+            content=[
+                Content(
+                    text="Let me think",
+                    type="reasoning_text",
+                )
+            ],
+            encrypted_content=None,
+            status="incomplete",
+        )
+        assert should_continue_final_message([reasoning_item]) is True
+
+        reasoning_item = {
+            "id": "reasoning_123",
+            "summary": [],
+            "type": "reasoning",
+            "content": [],
+            "status": "incomplete",
+        }
+        assert should_continue_final_message([reasoning_item]) is True
+
+    def test_completed_reasoning_returns_false(self):
+        """Completed reasoning should not be continued."""
+        reasoning_item = ResponseReasoningItem(
+            id="reasoning_123",
+            summary=[],
+            type="reasoning",
+            content=[
+                Content(
+                    text="I have thought about this.",
+                    type="reasoning_text",
+                )
+            ],
+            encrypted_content=None,
+            status="completed",
+        )
+        assert should_continue_final_message([reasoning_item]) is False
+
+    def test_reasoning_with_none_status_returns_false(self):
+        """Reasoning with None status should not be continued."""
+        reasoning_item = ResponseReasoningItem(
+            id="reasoning_123",
+            summary=[],
+            type="reasoning",
+            content=[
+                Content(
+                    text="Some reasoning",
+                    type="reasoning_text",
+                )
+            ],
+            encrypted_content=None,
+            status=None,
+        )
+        assert should_continue_final_message([reasoning_item]) is False
+
+    def test_only_last_item_matters(self):
+        """Only the last item in the list determines continuation."""
+        completed_item = ResponseOutputMessage(
+            id="msg_1",
+            content=[
+                ResponseOutputText(
+                    annotations=[],
+                    text="Complete message.",
+                    type="output_text",
+                    logprobs=None,
+                )
+            ],
+            role="assistant",
+            status="completed",
+            type="message",
+        )
+        in_progress_item = ResponseOutputMessage(
+            id="msg_2",
+            content=[
+                ResponseOutputText(
+                    annotations=[],
+                    text="Partial message...",
+                    type="output_text",
+                    logprobs=None,
+                )
+            ],
+            role="assistant",
+            status="in_progress",
+            type="message",
+        )
+
+        # In-progress as last item -> should continue
+        assert should_continue_final_message([completed_item, in_progress_item]) is True
+
+        # Completed as last item -> should not continue
+        assert (
+            should_continue_final_message([in_progress_item, completed_item]) is False
+        )
+
+    def test_tool_call_returns_false(self):
+        """In-progress tool calls (typed or dict) should not trigger continuation."""
+        tool_call = ResponseFunctionToolCall(
+            id="fc_123",
+            call_id="call_123",
+            type="function_call",
+            status="in_progress",
+            name="get_weather",
+            arguments='{"location": "NYC"}',
+        )
+        assert should_continue_final_message([tool_call]) is False
+
+        tool_call = {
+            "id": "msg_123",
+            "call_id": "call_123",
+            "type": "function_call",
+            "status": "in_progress",
+            "name": "get_weather",
+            "arguments": '{"location": "NYC"}',
+        }
+        assert should_continue_final_message([tool_call]) is False
+
+    # Tests for plain-dict message/reasoning items (e.g., from curl requests)
+    def test_dict_in_progress_message_returns_true(self):
+        """Dict with in_progress status should be continued (curl input)."""
+        dict_item = {
+            "id": "msg_123",
+            "type": "message",
+            "role": "assistant",
+            "status": "in_progress",
+            "content": [{"type": "output_text", "text": "The answer is ("}],
+        }
+        assert should_continue_final_message([dict_item]) is True
+
+    def test_dict_incomplete_message_returns_true(self):
+        """Dict with incomplete status should be continued (curl input)."""
+        dict_item = {
+            "id": "msg_123",
+            "type": "message",
+            "role": "assistant",
+            "status": "incomplete",
+            "content": [{"type": "output_text", "text": "Partial answer"}],
+        }
+        assert should_continue_final_message([dict_item]) is True
+
+    def test_dict_completed_message_returns_false(self):
+        """Dict with completed status should not be continued (curl input)."""
+        dict_item = {
+            "id": "msg_123",
+            "type": "message",
+            "role": "assistant",
+            "status": "completed",
+            "content": [{"type": "output_text", "text": "Complete answer."}],
+        }
+        assert should_continue_final_message([dict_item]) is False
+
+    def test_dict_reasoning_in_progress_returns_true(self):
+        """Dict reasoning item with in_progress status should be continued."""
+        dict_item = {
+            "id": "reasoning_123",
+            "type": "reasoning",
+            "status": "in_progress",
+            "content": [{"type": "reasoning_text", "text": "Let me think..."}],
+        }
+        assert should_continue_final_message([dict_item]) is True
+
+    def test_dict_without_status_returns_false(self):
+        """Dict without status field should not be continued."""
+        dict_item = {
+            "id": "msg_123",
+            "type": "message",
+            "role": "assistant",
+            "content": [{"type": "output_text", "text": "Some text"}],
+        }
+        assert should_continue_final_message([dict_item]) is False
+
+    def test_dict_with_none_status_returns_false(self):
+        """Dict with None status should not be continued."""
+        dict_item = {
+            "id": "msg_123",
+            "type": "message",
+            "role": "assistant",
+            "status": None,
+            "content": [{"type": "output_text", "text": "Some text"}],
+        }
+        assert should_continue_final_message([dict_item]) is False
+
+
 class TestMaybeCombineReasoningAndToolCall:
    """Tests for _maybe_combine_reasoning_and_tool_call function."""

--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -111,6 +111,7 @@ from vllm.entrypoints.responses_utils import (
    construct_input_messages,
    construct_tool_dicts,
    extract_tool_types,
+    should_continue_final_message,
 )
 from vllm.entrypoints.tool_server import ToolServer
 from vllm.inputs.data import TokensPrompt
@@ -590,6 +591,10 @@ class OpenAIServingResponses(OpenAIServing):
            prev_response_output=prev_response.output if prev_response else None,
        )

+        # Check if we should continue the final message (partial completion)
+        # This enables Anthropic-style partial message completion where the
+        # user provides an incomplete assistant message to continue from.
+        continue_final = should_continue_final_message(request.input)
        chat_template_kwargs = dict(
            reasoning_effort=None
            if request.reasoning is None
@@ -604,6 +609,11 @@ class OpenAIServingResponses(OpenAIServing):
            tool_parser=self.tool_parser,
            chat_template=self.chat_template,
            chat_template_content_format=self.chat_template_content_format,
+            # When continuing a partial message, we set continue_final_message=True
+            # and add_generation_prompt=False so the model continues the message
+            # rather than starting a new one.
+            add_generation_prompt=not continue_final,
+            continue_final_message=continue_final,
            chat_template_kwargs=chat_template_kwargs,
        )
        return messages, engine_prompts
--- a/vllm/entrypoints/responses_utils.py
+++ b/vllm/entrypoints/responses_utils.py
@@ -28,6 +28,53 @@ from vllm.entrypoints.openai.engine.protocol import (
 )


+def should_continue_final_message(
+    request_input: str | list[ResponseInputOutputItem],
+) -> bool:
+    """
+    Determine if the last input item is a partial assistant message or
+    reasoning item that should be continued rather than starting a new one.
+
+    This enables partial message completion similar to Anthropic's Messages API,
+    where users can provide an incomplete assistant message and have the model
+    continue from where it left off.
+
+    The last item is considered partial if:
+    1. It's a ResponseOutputMessage or ResponseReasoningItem, or a dict whose
+       "type" is "message" (the default) with role "assistant", or "reasoning"
+    2. Its status is "in_progress" or "incomplete"
+
+    Args:
+        request_input: The input to the Responses API request
+
+    Returns:
+        True if the final message should be continued, False otherwise
+    """
+    # A plain string is always a user message; an empty list has no last item.
+    if isinstance(request_input, str) or not request_input:
+        return False
+
+    last_item = request_input[-1]
+    partial_statuses = ("in_progress", "incomplete")
+
+    if isinstance(last_item, dict):
+        # only support partial completion for messages and reasoning for now
+        item_type = last_item.get("type", "message")
+        if item_type not in ("message", "reasoning"):
+            return False
+        # Input messages (user/system/developer) may also carry a status, but
+        # only an assistant message can be continued by the model.
+        if item_type == "message" and last_item.get("role") != "assistant":
+            return False
+        return last_item.get("status") in partial_statuses
+
+    # Typed items: ResponseOutputMessage carries the assistant role already.
+    if isinstance(last_item, (ResponseOutputMessage, ResponseReasoningItem)):
+        return last_item.status in partial_statuses
+
+    return False
+
+
 def construct_input_messages(
    *,
    request_instructions: str | None = None,