[responseAPI] support partial message generation (#32100)
Signed-off-by: Andrew Xia <axia@fb.com> Signed-off-by: Andrew Xia <mitandrewxia@gmail.com> Signed-off-by: Lu Fang <30275821+houseroad@users.noreply.github.com> Co-authored-by: Andrew Xia <axia@fb.com> Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com>
This commit is contained in:
@@ -21,6 +21,7 @@ from vllm.entrypoints.responses_utils import (
|
||||
_maybe_combine_reasoning_and_tool_call,
|
||||
construct_chat_messages_with_tool_call,
|
||||
convert_tool_responses_to_completions_format,
|
||||
should_continue_final_message,
|
||||
)
|
||||
|
||||
|
||||
@@ -165,6 +166,285 @@ class TestResponsesUtils:
|
||||
assert formatted_item["content"] == "dongyi"
|
||||
|
||||
|
||||
class TestShouldContinueFinalMessage:
    """Behavioural checks for ``should_continue_final_message``.

    The function under test supports Anthropic-style partial message
    completion: the caller supplies an unfinished assistant message and the
    model is expected to keep writing from that point.
    """

    # ------------------------------------------------------------------
    # Fixture builders (not collected by pytest: names lack "test" prefix)
    # ------------------------------------------------------------------
    @staticmethod
    def _output_message(text, status, msg_id="msg_123"):
        """Build an assistant output message holding a single text part."""
        text_part = ResponseOutputText(
            annotations=[],
            text=text,
            type="output_text",
            logprobs=None,
        )
        return ResponseOutputMessage(
            id=msg_id,
            content=[text_part],
            role="assistant",
            status=status,
            type="message",
        )

    @staticmethod
    def _reasoning(text, status):
        """Build a reasoning item holding a single reasoning-text part."""
        thought = Content(text=text, type="reasoning_text")
        return ResponseReasoningItem(
            id="reasoning_123",
            summary=[],
            type="reasoning",
            content=[thought],
            encrypted_content=None,
            status=status,
        )

    # ------------------------------------------------------------------
    # Trivial inputs
    # ------------------------------------------------------------------
    def test_string_input_returns_false(self):
        """A plain string is a user message, so nothing is continued."""
        outcome = should_continue_final_message("Hello, world!")
        assert outcome is False

    def test_empty_list_returns_false(self):
        """An empty input list has no final item to continue."""
        outcome = should_continue_final_message([])
        assert outcome is False

    # ------------------------------------------------------------------
    # Typed output messages
    # ------------------------------------------------------------------
    def test_completed_message_returns_false(self):
        """A message marked completed is left alone."""
        finished = self._output_message("The answer is 42.", "completed")
        assert should_continue_final_message([finished]) is False

    def test_in_progress_message_returns_true(self):
        """A message marked in_progress is continued.

        This is the central scenario for partial message completion, e.g.
        the caller sends "The best answer is (" and expects the model to
        carry on from there.
        """
        partial = self._output_message("The best answer is (", "in_progress")
        assert should_continue_final_message([partial]) is True

    def test_incomplete_message_returns_true(self):
        """A message marked incomplete is continued."""
        partial = self._output_message("The answer", "incomplete")
        assert should_continue_final_message([partial]) is True

    # ------------------------------------------------------------------
    # Typed reasoning items
    # ------------------------------------------------------------------
    def test_in_progress_reasoning_returns_true(self):
        """A reasoning item marked in_progress is continued."""
        thinking = self._reasoning("Let me think about this...", "in_progress")
        assert should_continue_final_message([thinking]) is True

    def test_incomplete_reasoning_returns_true(self):
        """A reasoning item marked incomplete is continued (object or dict)."""
        typed_item = self._reasoning("Let me think", "incomplete")
        assert should_continue_final_message([typed_item]) is True

        raw_item = {
            "id": "reasoning_123",
            "summary": [],
            "type": "reasoning",
            "content": [],
            "status": "incomplete",
        }
        assert should_continue_final_message([raw_item]) is True

    def test_completed_reasoning_returns_false(self):
        """A reasoning item marked completed is left alone."""
        done = self._reasoning("I have thought about this.", "completed")
        assert should_continue_final_message([done]) is False

    def test_reasoning_with_none_status_returns_false(self):
        """A reasoning item whose status is None is left alone."""
        statusless = self._reasoning("Some reasoning", None)
        assert should_continue_final_message([statusless]) is False

    # ------------------------------------------------------------------
    # Ordering and non-message items
    # ------------------------------------------------------------------
    def test_only_last_item_matters(self):
        """The decision depends solely on the final list element."""
        finished = self._output_message(
            "Complete message.", "completed", msg_id="msg_1"
        )
        partial = self._output_message(
            "Partial message...", "in_progress", msg_id="msg_2"
        )

        # Partial item last -> continue.
        partial_last = [finished, partial]
        assert should_continue_final_message(partial_last) is True

        # Completed item last -> do not continue.
        finished_last = [partial, finished]
        assert should_continue_final_message(finished_last) is False

    def test_tool_call_returns_false(self):
        """Function tool calls never trigger continuation (object or dict)."""
        typed_call = ResponseFunctionToolCall(
            id="fc_123",
            call_id="call_123",
            type="function_call",
            status="in_progress",
            name="get_weather",
            arguments='{"location": "NYC"}',
        )
        assert should_continue_final_message([typed_call]) is False

        raw_call = {
            "id": "msg_123",
            "call_id": "call_123",
            "type": "function_call",
            "status": "in_progress",
            "name": "get_weather",
            "arguments": '{"location": "NYC"}',
        }
        assert should_continue_final_message([raw_call]) is False

    # ------------------------------------------------------------------
    # Plain-dict inputs (e.g. request bodies sent with curl)
    # ------------------------------------------------------------------
    def test_dict_in_progress_message_returns_true(self):
        """A dict message with status in_progress is continued."""
        payload = {
            "id": "msg_123",
            "type": "message",
            "role": "assistant",
            "status": "in_progress",
            "content": [{"type": "output_text", "text": "The answer is ("}],
        }
        assert should_continue_final_message([payload]) is True

    def test_dict_incomplete_message_returns_true(self):
        """A dict message with status incomplete is continued."""
        payload = {
            "id": "msg_123",
            "type": "message",
            "role": "assistant",
            "status": "incomplete",
            "content": [{"type": "output_text", "text": "Partial answer"}],
        }
        assert should_continue_final_message([payload]) is True

    def test_dict_completed_message_returns_false(self):
        """A dict message with status completed is left alone."""
        payload = {
            "id": "msg_123",
            "type": "message",
            "role": "assistant",
            "status": "completed",
            "content": [{"type": "output_text", "text": "Complete answer."}],
        }
        assert should_continue_final_message([payload]) is False

    def test_dict_reasoning_in_progress_returns_true(self):
        """A dict reasoning item with status in_progress is continued."""
        payload = {
            "id": "reasoning_123",
            "type": "reasoning",
            "status": "in_progress",
            "content": [{"type": "reasoning_text", "text": "Let me think..."}],
        }
        assert should_continue_final_message([payload]) is True

    def test_dict_without_status_returns_false(self):
        """A dict message lacking a status key is left alone."""
        payload = {
            "id": "msg_123",
            "type": "message",
            "role": "assistant",
            "content": [{"type": "output_text", "text": "Some text"}],
        }
        assert should_continue_final_message([payload]) is False

    def test_dict_with_none_status_returns_false(self):
        """A dict message whose status is None is left alone."""
        payload = {
            "id": "msg_123",
            "type": "message",
            "role": "assistant",
            "status": None,
            "content": [{"type": "output_text", "text": "Some text"}],
        }
        assert should_continue_final_message([payload]) is False
|
||||
|
||||
|
||||
class TestMaybeCombineReasoningAndToolCall:
|
||||
"""Tests for _maybe_combine_reasoning_and_tool_call function."""
|
||||
|
||||
|
||||
@@ -111,6 +111,7 @@ from vllm.entrypoints.responses_utils import (
|
||||
construct_input_messages,
|
||||
construct_tool_dicts,
|
||||
extract_tool_types,
|
||||
should_continue_final_message,
|
||||
)
|
||||
from vllm.entrypoints.tool_server import ToolServer
|
||||
from vllm.inputs.data import TokensPrompt
|
||||
@@ -590,6 +591,10 @@ class OpenAIServingResponses(OpenAIServing):
|
||||
prev_response_output=prev_response.output if prev_response else None,
|
||||
)
|
||||
|
||||
# Check if we should continue the final message (partial completion)
|
||||
# This enables Anthropic-style partial message completion where the
|
||||
# user provides an incomplete assistant message to continue from.
|
||||
continue_final = should_continue_final_message(request.input)
|
||||
chat_template_kwargs = dict(
|
||||
reasoning_effort=None
|
||||
if request.reasoning is None
|
||||
@@ -604,6 +609,11 @@ class OpenAIServingResponses(OpenAIServing):
|
||||
tool_parser=self.tool_parser,
|
||||
chat_template=self.chat_template,
|
||||
chat_template_content_format=self.chat_template_content_format,
|
||||
# When continuing a partial message, we set continue_final_message=True
|
||||
# and add_generation_prompt=False so the model continues the message
|
||||
# rather than starting a new one.
|
||||
add_generation_prompt=not continue_final,
|
||||
continue_final_message=continue_final,
|
||||
chat_template_kwargs=chat_template_kwargs,
|
||||
)
|
||||
return messages, engine_prompts
|
||||
|
||||
@@ -28,6 +28,53 @@ from vllm.entrypoints.openai.engine.protocol import (
|
||||
)
|
||||
|
||||
|
||||
def should_continue_final_message(
    request_input: str | list[ResponseInputOutputItem],
) -> bool:
    """
    Determine if the last input message is a partial assistant message
    that should be continued rather than starting a new generation.

    This enables partial message completion similar to Anthropic's Messages API,
    where users can provide an incomplete assistant message and have the model
    continue from where it left off.

    A message is considered partial if:
        1. It's a ResponseOutputMessage or ResponseReasoningItem, or a plain
           dict whose "type" is "message" or "reasoning" (a dict with no
           "type" key is treated as a message)
        2. Its status is "in_progress" or "incomplete"
        3. For dict messages that carry a "role", that role is "assistant"

    Only the final element of the list is inspected; earlier items never
    influence the result.

    Args:
        request_input: The input to the Responses API request

    Returns:
        True if the final message should be continued, False otherwise
    """
    partial_statuses = ("in_progress", "incomplete")

    # A bare string is always a user message, and an empty list has no
    # final item that could be continued.
    if isinstance(request_input, str) or not request_input:
        return False

    last_item = request_input[-1]

    # Typed output messages and reasoning items share the same status rule.
    if isinstance(last_item, (ResponseOutputMessage, ResponseReasoningItem)):
        return last_item.status in partial_statuses

    if isinstance(last_item, dict):
        # only support partial completion for messages for now
        item_type = last_item.get("type", "message")
        if item_type not in ("message", "reasoning"):
            return False
        # A raw dict message may carry any role. Continuing a user/system
        # turn would make the model extend the caller's own text, so only
        # assistant messages qualify. A missing role keeps the previous
        # lenient behaviour.
        if (
            item_type == "message"
            and last_item.get("role", "assistant") != "assistant"
        ):
            return False
        return last_item.get("status") in partial_statuses

    return False
|
||||
|
||||
|
||||
def construct_input_messages(
|
||||
*,
|
||||
request_instructions: str | None = None,
|
||||
|
||||
Reference in New Issue
Block a user