[2/N] move responses/serving _make_response_output_items logic to parser (#33281)

Signed-off-by: Andrew Xia <axia@fb.com> Signed-off-by: Andrew Xia <axia@meta.com> Co-authored-by: Andrew Xia <axia@fb.com>
2026-02-05 00:46:15 -05:00
parent c1395f72cd
commit 9595afda18
2 changed files with 242 additions and 100 deletions
--- a/vllm/entrypoints/openai/responses/serving.py
+++ b/vllm/entrypoints/openai/responses/serving.py
@@ -63,7 +63,6 @@ from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import (
    ChatCompletionMessageParam,
    ChatTemplateContentFormatOption,
-    make_tool_call_id,
 )
 from vllm.entrypoints.logger import RequestLogger
 from vllm.entrypoints.mcp.tool_server import ToolServer
@@ -915,114 +914,57 @@ class OpenAIServingResponses(OpenAIServing):
        final_output: CompletionOutput,
        tokenizer: TokenizerLike,
    ) -> list[ResponseOutputItem]:
-        if self.parser and self.parser.reasoning_parser_cls:
-            try:
-                reasoning_parser = self.parser.reasoning_parser_cls(tokenizer)
-            except RuntimeError as e:
-                logger.exception("Error in reasoning parser creation.")
-                raise e
-
-            reasoning, content = reasoning_parser.extract_reasoning(
-                final_output.text, request=request
-            )
-        else:
-            reasoning = None
-            content = final_output.text
-
        # Log complete response if output logging is enabled
        if self.enable_log_outputs and self.request_logger:
-            output_text = ""
-            if content:
-                output_text = content
-            elif reasoning:
-                output_text = f"[reasoning: {reasoning}]"
-
-            if output_text:
-                self.request_logger.log_outputs(
-                    request_id=request.request_id,
-                    outputs=output_text,
-                    output_token_ids=final_output.token_ids,
-                    finish_reason=final_output.finish_reason,
-                    is_streaming=False,
-                    delta=False,
-                )
-
-        reasoning_item = None
-        message_item = None
-        if reasoning:
-            reasoning_item = ResponseReasoningItem(
-                id=f"rs_{random_uuid()}",
-                summary=[],
-                type="reasoning",
-                content=[
-                    ResponseReasoningTextContent(text=reasoning, type="reasoning_text")
-                ],
-                status=None,  # NOTE: Only the last output item has status.
+            self.request_logger.log_outputs(
+                request_id=request.request_id,
+                outputs=final_output.text,
+                output_token_ids=final_output.token_ids,
+                finish_reason=final_output.finish_reason,
+                is_streaming=False,
+                delta=False,
            )
-        tool_calls, content = self._parse_tool_calls_from_content(
-            request=request,
-            tokenizer=tokenizer,
-            content=content,
-            enable_auto_tools=self.enable_auto_tools,
-            tool_parser_cls=self.parser.tool_parser_cls if self.parser else None,
-        )

-        if content or (self.use_harmony and tool_calls):
-            res_text_part = None
-            if content:
-                res_text_part = ResponseOutputText(
-                    text=content,
-                    annotations=[],  # TODO
-                    type="output_text",
-                    logprobs=(
-                        self._create_response_logprobs(
-                            token_ids=final_output.token_ids,
-                            logprobs=final_output.logprobs,
-                            tokenizer=tokenizer,
-                            top_logprobs=request.top_logprobs,
-                        )
-                        if request.is_include_output_logprobs()
-                        else None
-                    ),
-                )
-            message_item = ResponseOutputMessage(
+        # Compute logprobs if requested
+        logprobs = None
+        if request.is_include_output_logprobs() and final_output.logprobs:
+            logprobs = self._create_response_logprobs(
+                token_ids=final_output.token_ids,
+                logprobs=final_output.logprobs,
+                tokenizer=tokenizer,
+                top_logprobs=request.top_logprobs,
+            )
+
+        # Use parser to extract and create response output items
+        if self.parser:
+            parser = self.parser(tokenizer)
+            return parser.extract_response_outputs(
+                model_output=final_output.text,
+                request=request,
+                enable_auto_tools=self.enable_auto_tools,
+                tool_call_id_type=self.tool_call_id_type,
+                logprobs=logprobs,
+            )
+
+        # Fallback when no parser is configured
+        return [
+            ResponseOutputMessage(
                id=f"msg_{random_uuid()}",
-                content=[res_text_part] if res_text_part else [],
+                content=[
+                    ResponseOutputText(
+                        text=final_output.text,
+                        annotations=[],
+                        type="output_text",
+                        logprobs=logprobs,
+                    )
+                ]
+                if final_output.text
+                else [],
                role="assistant",
                status="completed",
                type="message",
            )
-        outputs = []
-
-        if reasoning_item:
-            outputs.append(reasoning_item)
-        if message_item:
-            outputs.append(message_item)
-        if tool_calls:
-            # We use a simple counter for history_tool_call_count because
-            # we don't track the history of tool calls in the Responses API yet.
-            # This means that the tool call index will start from 0 for each
-            # request.
-            tool_call_items = []
-            for history_tool_call_cnt, tool_call in enumerate(tool_calls):
-                tool_call_items.append(
-                    ResponseFunctionToolCall(
-                        id=f"fc_{random_uuid()}",
-                        call_id=tool_call.id
-                        if tool_call.id
-                        else make_tool_call_id(
-                            id_type=self.tool_call_id_type,
-                            func_name=tool_call.name,
-                            idx=history_tool_call_cnt,
-                        ),
-                        type="function_call",
-                        status="completed",
-                        name=tool_call.name,
-                        arguments=tool_call.arguments,
-                    )
-                )
-            outputs.extend(tool_call_items)
-        return outputs
+        ]

    def _make_response_output_items_with_harmony(
        self,
--- a/vllm/parser/abstract_parser.py
+++ b/vllm/parser/abstract_parser.py
@@ -1,23 +1,46 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+import json
 from abc import abstractmethod
 from collections.abc import Sequence
 from functools import cached_property

+from openai.types.responses import (
+    ResponseFunctionToolCall,
+    ResponseOutputItem,
+    ResponseOutputMessage,
+    ResponseOutputText,
+    ResponseReasoningItem,
+    ToolChoiceFunction,
+)
+from openai.types.responses.response_output_text import Logprob
+from openai.types.responses.response_reasoning_item import (
+    Content as ResponseReasoningTextContent,
+)
+from pydantic import TypeAdapter
+
+from vllm.entrypoints.chat_utils import make_tool_call_id
 from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionNamedToolChoiceParam,
    ChatCompletionRequest,
 )
 from vllm.entrypoints.openai.engine.protocol import (
    DeltaMessage,
    ExtractedToolCallInformation,
+    FunctionCall,
+    FunctionDefinition,
 )
 from vllm.entrypoints.openai.responses.protocol import (
    ResponsesRequest,
 )
+from vllm.logger import init_logger
 from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
 from vllm.tokenizers import TokenizerLike
 from vllm.tool_parsers.abstract_tool_parser import ToolParser
+from vllm.utils import random_uuid
+
+logger = init_logger(__name__)


 class Parser:
@@ -128,6 +151,33 @@ class Parser:
            The extracted content token IDs.
        """

+    @abstractmethod
+    def extract_response_outputs(
+        self,
+        model_output: str,
+        request: ResponsesRequest,
+        enable_auto_tools: bool = False,
+        tool_call_id_type: str = "random",
+        logprobs: list[Logprob] | None = None,
+    ) -> list[ResponseOutputItem]:
+        """
+        Extract reasoning, content, and tool calls from a complete
+        model-generated string and return as ResponseOutputItem objects.
+
+        Used for non-streaming responses where we have the entire model
+        response available before sending to the client.
+
+        Args:
+            model_output: The complete model-generated string.
+            request: The request object used to generate the output.
+            enable_auto_tools: Whether to enable automatic tool call parsing.
+            tool_call_id_type: Type of tool call ID generation ("random", etc).
+            logprobs: Pre-computed logprobs for the output text, if any.
+
+        Returns:
+            A list of ResponseOutputItem objects.
+        """
+
    @abstractmethod
    def extract_reasoning(
        self,
@@ -260,6 +310,156 @@ class DelegatingParser(Parser):
            return None, model_output
        return self._reasoning_parser.extract_reasoning(model_output, request)

+    def extract_response_outputs(
+        self,
+        model_output: str,
+        request: ResponsesRequest,
+        enable_auto_tools: bool = False,
+        tool_call_id_type: str = "random",
+        logprobs: list[Logprob] | None = None,
+    ) -> list[ResponseOutputItem]:
+        # First extract reasoning
+        reasoning, content = self.extract_reasoning(model_output, request)
+
+        # Then parse tool calls from the content
+        tool_calls, content = self._parse_tool_calls(
+            request=request,
+            content=content,
+            enable_auto_tools=enable_auto_tools,
+        )
+
+        # Build output items
+        outputs: list[ResponseOutputItem] = []
+
+        # Add reasoning item if present
+        if reasoning:
+            reasoning_item = ResponseReasoningItem(
+                id=f"rs_{random_uuid()}",
+                summary=[],
+                type="reasoning",
+                content=[
+                    ResponseReasoningTextContent(text=reasoning, type="reasoning_text")
+                ],
+                status=None,  # NOTE: Only the last output item has status.
+            )
+            outputs.append(reasoning_item)
+
+        # Add message item if there's content
+        if content:
+            res_text_part = ResponseOutputText(
+                text=content,
+                annotations=[],
+                type="output_text",
+                logprobs=logprobs,
+            )
+            message_item = ResponseOutputMessage(
+                id=f"msg_{random_uuid()}",
+                content=[res_text_part],
+                role="assistant",
+                status="completed",
+                type="message",
+            )
+            outputs.append(message_item)
+
+        if tool_calls:
+            # We use a simple counter for history_tool_call_count because
+            # we don't track the history of tool calls in the Responses API yet.
+            # This means that the tool call index will start from 0 for each
+            # request.
+            for history_tool_call_cnt, tool_call in enumerate(tool_calls):
+                tool_call_item = ResponseFunctionToolCall(
+                    id=f"fc_{random_uuid()}",
+                    call_id=tool_call.id
+                    if tool_call.id
+                    else make_tool_call_id(
+                        id_type=tool_call_id_type,
+                        func_name=tool_call.name,
+                        idx=history_tool_call_cnt,
+                    ),
+                    type="function_call",
+                    status="completed",
+                    name=tool_call.name,
+                    arguments=tool_call.arguments,
+                )
+                outputs.append(tool_call_item)
+
+        return outputs
+
+    def _parse_tool_calls(
+        self,
+        request: ResponsesRequest,
+        content: str | None,
+        enable_auto_tools: bool,
+    ) -> tuple[list[FunctionCall], str | None]:
+        """
+        TODO(qandrew): merge _parse_tool_calls_from_content
+        for ChatCompletions into this function
+        Parse tool calls from content based on request tool_choice settings.
+
+        Returns:
+            A tuple of (function_calls, remaining_content) if tool calls
+            were parsed
+        """
+        function_calls: list[FunctionCall] = []
+
+        if request.tool_choice and isinstance(request.tool_choice, ToolChoiceFunction):
+            # Forced Function Call (Responses API style)
+            assert content is not None
+            function_calls.append(
+                FunctionCall(name=request.tool_choice.name, arguments=content)
+            )
+            return function_calls, None  # Clear content since tool is called.
+
+        if request.tool_choice and isinstance(
+            request.tool_choice, ChatCompletionNamedToolChoiceParam
+        ):
+            # Forced Function Call (Chat Completion API style)
+            assert content is not None
+            function_calls.append(
+                FunctionCall(name=request.tool_choice.function.name, arguments=content)
+            )
+            return function_calls, None  # Clear content since tool is called.
+
+        if request.tool_choice == "required":
+            # Required tool calls - parse JSON
+            assert content is not None
+            tool_calls = TypeAdapter(list[FunctionDefinition]).validate_json(content)
+            function_calls.extend(
+                FunctionCall(
+                    name=tool_call.name,
+                    arguments=json.dumps(tool_call.parameters, ensure_ascii=False),
+                )
+                for tool_call in tool_calls
+            )
+            return function_calls, None  # Clear content since tool is called.
+
+        if (
+            self._tool_parser is not None
+            and enable_auto_tools
+            and (request.tool_choice == "auto" or request.tool_choice is None)
+        ):
+            # Automatic Tool Call Parsing
+            tool_call_info = self._tool_parser.extract_tool_calls(
+                content if content is not None else "",
+                request=request,  # type: ignore
+            )
+            if tool_call_info is not None and tool_call_info.tools_called:
+                function_calls.extend(
+                    FunctionCall(
+                        id=tool_call.id,
+                        name=tool_call.function.name,
+                        arguments=tool_call.function.arguments,
+                    )
+                    for tool_call in tool_call_info.tool_calls
+                )
+                remaining_content = tool_call_info.content
+                if remaining_content and remaining_content.strip() == "":
+                    remaining_content = None
+                return function_calls, remaining_content
+
+        # No tool calls
+        return [], content
+
    def extract_reasoning_streaming(
        self,
        previous_text: str,