[2/N] move responses/serving _make_response_output_items logic to parser (#33281)

Signed-off-by: Andrew Xia <axia@fb.com>
Signed-off-by: Andrew Xia <axia@meta.com>
Co-authored-by: Andrew Xia <axia@fb.com>
This commit is contained in:
Andrew Xia
2026-02-05 00:46:15 -05:00
committed by GitHub
parent c1395f72cd
commit 9595afda18
2 changed files with 242 additions and 100 deletions

View File

@@ -63,7 +63,6 @@ from vllm.engine.protocol import EngineClient
from vllm.entrypoints.chat_utils import (
ChatCompletionMessageParam,
ChatTemplateContentFormatOption,
make_tool_call_id,
)
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.mcp.tool_server import ToolServer
@@ -915,114 +914,57 @@ class OpenAIServingResponses(OpenAIServing):
final_output: CompletionOutput,
tokenizer: TokenizerLike,
) -> list[ResponseOutputItem]:
if self.parser and self.parser.reasoning_parser_cls:
try:
reasoning_parser = self.parser.reasoning_parser_cls(tokenizer)
except RuntimeError as e:
logger.exception("Error in reasoning parser creation.")
raise e
reasoning, content = reasoning_parser.extract_reasoning(
final_output.text, request=request
)
else:
reasoning = None
content = final_output.text
# Log complete response if output logging is enabled
if self.enable_log_outputs and self.request_logger:
output_text = ""
if content:
output_text = content
elif reasoning:
output_text = f"[reasoning: {reasoning}]"
if output_text:
self.request_logger.log_outputs(
request_id=request.request_id,
outputs=output_text,
output_token_ids=final_output.token_ids,
finish_reason=final_output.finish_reason,
is_streaming=False,
delta=False,
)
reasoning_item = None
message_item = None
if reasoning:
reasoning_item = ResponseReasoningItem(
id=f"rs_{random_uuid()}",
summary=[],
type="reasoning",
content=[
ResponseReasoningTextContent(text=reasoning, type="reasoning_text")
],
status=None, # NOTE: Only the last output item has status.
self.request_logger.log_outputs(
request_id=request.request_id,
outputs=final_output.text,
output_token_ids=final_output.token_ids,
finish_reason=final_output.finish_reason,
is_streaming=False,
delta=False,
)
tool_calls, content = self._parse_tool_calls_from_content(
request=request,
tokenizer=tokenizer,
content=content,
enable_auto_tools=self.enable_auto_tools,
tool_parser_cls=self.parser.tool_parser_cls if self.parser else None,
)
if content or (self.use_harmony and tool_calls):
res_text_part = None
if content:
res_text_part = ResponseOutputText(
text=content,
annotations=[], # TODO
type="output_text",
logprobs=(
self._create_response_logprobs(
token_ids=final_output.token_ids,
logprobs=final_output.logprobs,
tokenizer=tokenizer,
top_logprobs=request.top_logprobs,
)
if request.is_include_output_logprobs()
else None
),
)
message_item = ResponseOutputMessage(
# Compute logprobs if requested
logprobs = None
if request.is_include_output_logprobs() and final_output.logprobs:
logprobs = self._create_response_logprobs(
token_ids=final_output.token_ids,
logprobs=final_output.logprobs,
tokenizer=tokenizer,
top_logprobs=request.top_logprobs,
)
# Use parser to extract and create response output items
if self.parser:
parser = self.parser(tokenizer)
return parser.extract_response_outputs(
model_output=final_output.text,
request=request,
enable_auto_tools=self.enable_auto_tools,
tool_call_id_type=self.tool_call_id_type,
logprobs=logprobs,
)
# Fallback when no parser is configured
return [
ResponseOutputMessage(
id=f"msg_{random_uuid()}",
content=[res_text_part] if res_text_part else [],
content=[
ResponseOutputText(
text=final_output.text,
annotations=[],
type="output_text",
logprobs=logprobs,
)
]
if final_output.text
else [],
role="assistant",
status="completed",
type="message",
)
outputs = []
if reasoning_item:
outputs.append(reasoning_item)
if message_item:
outputs.append(message_item)
if tool_calls:
# We use a simple counter for history_tool_call_count because
# we don't track the history of tool calls in the Responses API yet.
# This means that the tool call index will start from 0 for each
# request.
tool_call_items = []
for history_tool_call_cnt, tool_call in enumerate(tool_calls):
tool_call_items.append(
ResponseFunctionToolCall(
id=f"fc_{random_uuid()}",
call_id=tool_call.id
if tool_call.id
else make_tool_call_id(
id_type=self.tool_call_id_type,
func_name=tool_call.name,
idx=history_tool_call_cnt,
),
type="function_call",
status="completed",
name=tool_call.name,
arguments=tool_call.arguments,
)
)
outputs.extend(tool_call_items)
return outputs
]
def _make_response_output_items_with_harmony(
self,

View File

@@ -1,23 +1,46 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json
from abc import abstractmethod
from collections.abc import Sequence
from functools import cached_property
from openai.types.responses import (
ResponseFunctionToolCall,
ResponseOutputItem,
ResponseOutputMessage,
ResponseOutputText,
ResponseReasoningItem,
ToolChoiceFunction,
)
from openai.types.responses.response_output_text import Logprob
from openai.types.responses.response_reasoning_item import (
Content as ResponseReasoningTextContent,
)
from pydantic import TypeAdapter
from vllm.entrypoints.chat_utils import make_tool_call_id
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionNamedToolChoiceParam,
ChatCompletionRequest,
)
from vllm.entrypoints.openai.engine.protocol import (
DeltaMessage,
ExtractedToolCallInformation,
FunctionCall,
FunctionDefinition,
)
from vllm.entrypoints.openai.responses.protocol import (
ResponsesRequest,
)
from vllm.logger import init_logger
from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
from vllm.tokenizers import TokenizerLike
from vllm.tool_parsers.abstract_tool_parser import ToolParser
from vllm.utils import random_uuid
logger = init_logger(__name__)
class Parser:
@@ -128,6 +151,33 @@ class Parser:
The extracted content token IDs.
"""
@abstractmethod
def extract_response_outputs(
    self,
    model_output: str,
    request: ResponsesRequest,
    enable_auto_tools: bool = False,
    tool_call_id_type: str = "random",
    logprobs: list[Logprob] | None = None,
) -> list[ResponseOutputItem]:
    """
    Turn a finished model generation into Responses API output items.

    This is the non-streaming entry point: the whole generated string is
    available up front, so implementations split it into reasoning,
    message content, and tool calls, and wrap each piece in the matching
    ResponseOutputItem.

    Args:
        model_output: The full text produced by the model.
        request: The request that produced ``model_output``.
        enable_auto_tools: Whether automatic tool call parsing is enabled.
        tool_call_id_type: How tool call IDs are generated
            ("random", etc).
        logprobs: Logprobs already computed for the output text, or
            None when there are none to attach.

    Returns:
        A list of ResponseOutputItem objects.
    """
@abstractmethod
def extract_reasoning(
self,
@@ -260,6 +310,156 @@ class DelegatingParser(Parser):
return None, model_output
return self._reasoning_parser.extract_reasoning(model_output, request)
def extract_response_outputs(
    self,
    model_output: str,
    request: ResponsesRequest,
    enable_auto_tools: bool = False,
    tool_call_id_type: str = "random",
    logprobs: list[Logprob] | None = None,
) -> list[ResponseOutputItem]:
    """
    Build the Responses API output items for a complete model output.

    The text is first split into reasoning and content via
    ``extract_reasoning``; the content is then passed through
    ``_parse_tool_calls``, which may consume it. Items are emitted in the
    order: reasoning item, assistant message, function tool calls. Each
    kind is emitted only when it is non-empty.

    Args:
        model_output: The complete model-generated string.
        request: The request object used to generate the output.
        enable_auto_tools: Forwarded to ``_parse_tool_calls``.
        tool_call_id_type: ID scheme handed to ``make_tool_call_id`` for
            tool calls that do not already carry an ID.
        logprobs: Pre-computed logprobs attached to the output text part.

    Returns:
        The list of output items; may be empty.
    """
    reasoning, content = self.extract_reasoning(model_output, request)
    tool_calls, content = self._parse_tool_calls(
        request=request,
        content=content,
        enable_auto_tools=enable_auto_tools,
    )

    items: list[ResponseOutputItem] = []

    if reasoning:
        reasoning_text = ResponseReasoningTextContent(
            text=reasoning, type="reasoning_text"
        )
        items.append(
            ResponseReasoningItem(
                id=f"rs_{random_uuid()}",
                summary=[],
                type="reasoning",
                content=[reasoning_text],
                status=None,  # NOTE: Only the last output item has status.
            )
        )

    if content:
        text_part = ResponseOutputText(
            text=content,
            annotations=[],
            type="output_text",
            logprobs=logprobs,
        )
        items.append(
            ResponseOutputMessage(
                id=f"msg_{random_uuid()}",
                content=[text_part],
                role="assistant",
                status="completed",
                type="message",
            )
        )

    if tool_calls:
        # The enumerate index stands in for a history tool call count:
        # tool call history is not tracked in the Responses API yet, so
        # the index restarts from 0 on every request.
        items.extend(
            [
                ResponseFunctionToolCall(
                    id=f"fc_{random_uuid()}",
                    call_id=call.id
                    or make_tool_call_id(
                        id_type=tool_call_id_type,
                        func_name=call.name,
                        idx=position,
                    ),
                    type="function_call",
                    status="completed",
                    name=call.name,
                    arguments=call.arguments,
                )
                for position, call in enumerate(tool_calls)
            ]
        )

    return items
def _parse_tool_calls(
    self,
    request: ResponsesRequest,
    content: str | None,
    enable_auto_tools: bool,
) -> tuple[list[FunctionCall], str | None]:
    """
    Parse tool calls from content according to ``request.tool_choice``.

    TODO(qandrew): merge _parse_tool_calls_from_content
    for ChatCompletions into this function

    Branches, checked in order:
      * ``ToolChoiceFunction`` (Responses API style forced call): the
        whole content becomes the arguments of the named function.
      * ``ChatCompletionNamedToolChoiceParam`` (Chat Completion API style
        forced call): same, using ``tool_choice.function.name``.
      * ``"required"``: content is validated as a JSON list of function
        definitions, one call per entry.
      * ``"auto"`` or ``None``, with a tool parser configured and
        ``enable_auto_tools`` set: the tool parser extracts the calls.

    Returns:
        A tuple of (function_calls, remaining_content). The remaining
        content is None in the forced and required branches. When no
        tool calls are parsed the result is ``([], content)`` with the
        content unchanged.
    """
    choice = request.tool_choice

    if choice and isinstance(choice, ToolChoiceFunction):
        # Forced Function Call (Responses API style)
        assert content is not None
        forced = FunctionCall(name=choice.name, arguments=content)
        return [forced], None  # Clear content since tool is called.

    if choice and isinstance(choice, ChatCompletionNamedToolChoiceParam):
        # Forced Function Call (Chat Completion API style)
        assert content is not None
        forced = FunctionCall(name=choice.function.name, arguments=content)
        return [forced], None  # Clear content since tool is called.

    if choice == "required":
        # Required tool calls - parse JSON
        assert content is not None
        definitions = TypeAdapter(list[FunctionDefinition]).validate_json(content)
        required_calls = [
            FunctionCall(
                name=definition.name,
                arguments=json.dumps(definition.parameters, ensure_ascii=False),
            )
            for definition in definitions
        ]
        return required_calls, None  # Clear content since tool is called.

    # Automatic parsing needs a tool parser, the auto-tools switch, and a
    # tool_choice of "auto" or None; otherwise nothing is parsed.
    if (
        self._tool_parser is None
        or not enable_auto_tools
        or not (choice == "auto" or choice is None)
    ):
        return [], content

    # Automatic Tool Call Parsing
    parsed = self._tool_parser.extract_tool_calls(
        "" if content is None else content,
        request=request,  # type: ignore
    )
    if parsed is None or not parsed.tools_called:
        # No tool calls
        return [], content

    auto_calls = [
        FunctionCall(
            id=call.id,
            name=call.function.name,
            arguments=call.function.arguments,
        )
        for call in parsed.tool_calls
    ]
    leftover = parsed.content
    # Whitespace-only leftover text is reported as no content at all.
    if leftover and not leftover.strip():
        leftover = None
    return auto_calls, leftover
def extract_reasoning_streaming(
self,
previous_text: str,