diff --git a/tests/entrypoints/openai/responses/test_function_call_parsing.py b/tests/entrypoints/openai/responses/test_function_call_parsing.py index d487759c1..8b4d7c739 100644 --- a/tests/entrypoints/openai/responses/test_function_call_parsing.py +++ b/tests/entrypoints/openai/responses/test_function_call_parsing.py @@ -7,7 +7,7 @@ import json import pytest from openai.types.responses import ResponseFunctionToolCall -from vllm.entrypoints.openai.engine.protocol import ResponsesRequest +from vllm.entrypoints.openai.responses.protocol import ResponsesRequest def test_function_call_dict_converted_to_object(): @@ -253,7 +253,7 @@ def test_function_call_validation_failure_logs_debug(caplog): } # Mock the logger to verify debug was called - with patch("vllm.entrypoints.openai.engine.protocol.logger") as mock_logger: + with patch("vllm.entrypoints.openai.responses.protocol.logger") as mock_logger: with pytest.raises(ValueError): ResponsesRequest(**request_data) diff --git a/tests/entrypoints/openai/test_protocol.py b/tests/entrypoints/openai/test_protocol.py index c4baf2f77..db5d7d692 100644 --- a/tests/entrypoints/openai/test_protocol.py +++ b/tests/entrypoints/openai/test_protocol.py @@ -4,7 +4,7 @@ from openai_harmony import ( Message, ) -from vllm.entrypoints.openai.engine.protocol import ( +from vllm.entrypoints.openai.responses.protocol import ( serialize_message, serialize_messages, ) diff --git a/tests/entrypoints/openai/test_serving_responses.py b/tests/entrypoints/openai/test_serving_responses.py index a79ee66a9..1b87cf0a9 100644 --- a/tests/entrypoints/openai/test_serving_responses.py +++ b/tests/entrypoints/openai/test_serving_responses.py @@ -14,8 +14,9 @@ from openai.types.responses.tool import ( ) from vllm.entrypoints.context import ConversationContext -from vllm.entrypoints.openai.engine.protocol import ErrorResponse, ResponsesRequest -from vllm.entrypoints.openai.serving_responses import ( +from vllm.entrypoints.openai.engine.protocol import ErrorResponse +from vllm.entrypoints.openai.responses.protocol import ResponsesRequest +from vllm.entrypoints.openai.responses.serving import ( OpenAIServingResponses, _extract_allowed_tools_from_mcp_requests, extract_tool_types, diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py index 6f61a0114..ff164bcaa 100644 --- a/vllm/entrypoints/context.py +++ b/vllm/entrypoints/context.py @@ -24,9 +24,6 @@ from vllm.entrypoints.chat_utils import ( from vllm.entrypoints.constants import MCP_PREFIX from vllm.entrypoints.openai.engine.protocol import ( FunctionCall, - ResponseInputOutputItem, - ResponseRawMessageAndToken, - ResponsesRequest, ) from vllm.entrypoints.openai.parser.harmony_utils import ( get_encoding, @@ -36,6 +33,11 @@ from vllm.entrypoints.openai.parser.harmony_utils import ( from vllm.entrypoints.openai.parser.responses_parser import ( get_responses_parser_for_simple_context, ) +from vllm.entrypoints.openai.responses.protocol import ( + ResponseInputOutputItem, + ResponseRawMessageAndToken, + ResponsesRequest, +) from vllm.entrypoints.responses_utils import construct_tool_dicts from vllm.entrypoints.tool import Tool from vllm.entrypoints.tool_server import ToolServer diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 86942f22f..acaf5f8d4 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -14,7 +14,7 @@ import socket import tempfile import uuid from argparse import Namespace -from collections.abc import AsyncGenerator, AsyncIterator, Awaitable +from collections.abc import AsyncIterator, Awaitable from contextlib import asynccontextmanager from http import HTTPStatus from typing import Annotated, Any @@ -49,9 +49,6 @@ from vllm.entrypoints.openai.engine.protocol import ( CompletionResponse, ErrorInfo, ErrorResponse, - ResponsesRequest, - ResponsesResponse, - StreamingResponsesResponse, TranscriptionRequest, TranscriptionResponseVariant, TranslationRequest, @@ -59,12 +56,12 @@ from vllm.entrypoints.openai.engine.protocol import ( ) from vllm.entrypoints.openai.engine.serving import OpenAIServing from vllm.entrypoints.openai.orca_metrics import metrics_header +from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_models import ( BaseModelPath, OpenAIServingModels, ) -from vllm.entrypoints.openai.serving_responses import OpenAIServingResponses from vllm.entrypoints.openai.serving_transcription import ( OpenAIServingTranscription, OpenAIServingTranslation, @@ -311,112 +308,6 @@ async def show_version(): return JSONResponse(content=ver) -async def _convert_stream_to_sse_events( - generator: AsyncGenerator[StreamingResponsesResponse, None], -) -> AsyncGenerator[str, None]: - """Convert the generator to a stream of events in SSE format""" - async for event in generator: - event_type = getattr(event, "type", "unknown") - # https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#event_stream_format - event_data = ( - f"event: {event_type}\ndata: {event.model_dump_json(indent=None)}\n\n" - ) - yield event_data - - -@router.post( - "/v1/responses", - dependencies=[Depends(validate_json_request)], - responses={ - HTTPStatus.OK.value: {"content": {"text/event-stream": {}}}, - HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, - HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse}, - HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, - }, -) -@with_cancellation -async def create_responses(request: ResponsesRequest, raw_request: Request): - handler = responses(raw_request) - if handler is None: - return base(raw_request).create_error_response( - message="The model does not support Responses API" - ) - try: - generator = await handler.create_responses(request, raw_request) - except Exception as e: - raise HTTPException( - status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e) - ) from e - - if isinstance(generator, ErrorResponse): - return JSONResponse( - content=generator.model_dump(), status_code=generator.error.code - ) - elif isinstance(generator, ResponsesResponse): - return JSONResponse(content=generator.model_dump()) - - return StreamingResponse( - content=_convert_stream_to_sse_events(generator), media_type="text/event-stream" - ) - - -@router.get("/v1/responses/{response_id}") -async def retrieve_responses( - response_id: str, - raw_request: Request, - starting_after: int | None = None, - stream: bool | None = False, -): - handler = responses(raw_request) - if handler is None: - return base(raw_request).create_error_response( - message="The model does not support Responses API" - ) - - try: - response = await handler.retrieve_responses( - response_id, - starting_after=starting_after, - stream=stream, - ) - except Exception as e: - raise HTTPException( - status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e) - ) from e - - if isinstance(response, ErrorResponse): - return JSONResponse( - content=response.model_dump(), status_code=response.error.code - ) - elif isinstance(response, ResponsesResponse): - return JSONResponse(content=response.model_dump()) - return StreamingResponse( - content=_convert_stream_to_sse_events(response), media_type="text/event-stream" - ) - - -@router.post("/v1/responses/{response_id}/cancel") -async def cancel_responses(response_id: str, raw_request: Request): - handler = responses(raw_request) - if handler is None: - return base(raw_request).create_error_response( - message="The model does not support Responses API" - ) - - try: - response = await handler.cancel_responses(response_id) - except Exception as e: - raise HTTPException( - status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e) - ) from e - - if isinstance(response, ErrorResponse): - return JSONResponse( - content=response.model_dump(), status_code=response.error.code - ) - return JSONResponse(content=response.model_dump()) - - @router.post( "/v1/messages", dependencies=[Depends(validate_json_request)], @@ -844,6 +735,12 @@ def build_app(args: Namespace) -> FastAPI: ) register_chat_api_router(app) + + from vllm.entrypoints.openai.responses.api_router import ( + attach_router as register_responses_api_router, + ) + + register_responses_api_router(app) from vllm.entrypoints.sagemaker.routes import register_sagemaker_routes register_sagemaker_routes(router) diff --git a/vllm/entrypoints/openai/engine/protocol.py b/vllm/entrypoints/openai/engine/protocol.py index 4ada3f073..7166b54cb 100644 --- a/vllm/entrypoints/openai/engine/protocol.py +++ b/vllm/entrypoints/openai/engine/protocol.py @@ -11,63 +11,14 @@ from typing import Annotated, Any, ClassVar, Literal, TypeAlias import regex as re import torch from fastapi import HTTPException, UploadFile -from openai.types.responses import ( - ResponseCodeInterpreterCallCodeDeltaEvent, - ResponseCodeInterpreterCallCodeDoneEvent, - ResponseCodeInterpreterCallCompletedEvent, - ResponseCodeInterpreterCallInProgressEvent, - ResponseCodeInterpreterCallInterpretingEvent, - ResponseContentPartAddedEvent, - ResponseContentPartDoneEvent, - ResponseFunctionToolCall, - ResponseInputItemParam, - ResponseMcpCallArgumentsDeltaEvent, - ResponseMcpCallArgumentsDoneEvent, - ResponseMcpCallCompletedEvent, - ResponseMcpCallInProgressEvent, - ResponseOutputItem, - ResponseOutputItemAddedEvent, - ResponseOutputItemDoneEvent, - ResponsePrompt, - ResponseReasoningTextDeltaEvent, - ResponseReasoningTextDoneEvent, - ResponseStatus, - ResponseWebSearchCallCompletedEvent, - ResponseWebSearchCallInProgressEvent, - ResponseWebSearchCallSearchingEvent, -) -from openai.types.responses import ( - ResponseCompletedEvent as OpenAIResponseCompletedEvent, -) -from openai.types.responses import ResponseCreatedEvent as OpenAIResponseCreatedEvent -from openai.types.responses import ( - ResponseInProgressEvent as OpenAIResponseInProgressEvent, -) -from openai.types.responses.response_reasoning_item import ( - Content as ResponseReasoningTextContent, -) -from openai_harmony import Message as OpenAIHarmonyMessage - -# Backward compatibility for OpenAI client versions -try: # For older openai versions (< 1.100.0) - from openai.types.responses import ResponseTextConfig -except ImportError: # For newer openai versions (>= 1.100.0) - from openai.types.responses import ResponseFormatTextConfig as ResponseTextConfig - - -from openai.types.responses.response import IncompleteDetails, ToolChoice -from openai.types.responses.tool import Tool -from openai.types.shared import Metadata, Reasoning from pydantic import ( BaseModel, ConfigDict, Field, - ValidationError, - field_serializer, model_validator, ) -from vllm.entrypoints.chat_utils import ChatCompletionMessageParam, make_tool_call_id +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.exceptions import VLLMValidationError from vllm.logger import init_logger from vllm.logprobs import Logprob @@ -277,237 +228,6 @@ def get_logits_processors( return None -ResponseInputOutputItem: TypeAlias = ResponseInputItemParam | ResponseOutputItem - - -class ResponsesRequest(OpenAIBaseModel): - # Ordered by official OpenAI API documentation - # https://platform.openai.com/docs/api-reference/responses/create - background: bool | None = False - include: ( - list[ - Literal[ - "code_interpreter_call.outputs", - "computer_call_output.output.image_url", - "file_search_call.results", - "message.input_image.image_url", - "message.output_text.logprobs", - "reasoning.encrypted_content", - ], - ] - | None - ) = None - input: str | list[ResponseInputOutputItem] - instructions: str | None = None - max_output_tokens: int | None = None - max_tool_calls: int | None = None - metadata: Metadata | None = None - model: str | None = None - logit_bias: dict[str, float] | None = None - parallel_tool_calls: bool | None = True - previous_response_id: str | None = None - prompt: ResponsePrompt | None = None - reasoning: Reasoning | None = None - service_tier: Literal["auto", "default", "flex", "scale", "priority"] = "auto" - store: bool | None = True - stream: bool | None = False - temperature: float | None = None - text: ResponseTextConfig | None = None - tool_choice: ToolChoice = "auto" - tools: list[Tool] = Field(default_factory=list) - top_logprobs: int | None = 0 - top_p: float | None = None - top_k: int | None = None - truncation: Literal["auto", "disabled"] | None = "disabled" - user: str | None = None - - # --8<-- [start:responses-extra-params] - request_id: str = Field( - default_factory=lambda: f"resp_{random_uuid()}", - description=( - "The request_id related to this request. If the caller does " - "not set it, a random_uuid will be generated. This id is used " - "through out the inference process and return in response." - ), - ) - mm_processor_kwargs: dict[str, Any] | None = Field( - default=None, - description=("Additional kwargs to pass to the HF processor."), - ) - priority: int = Field( - default=0, - description=( - "The priority of the request (lower means earlier handling; " - "default: 0). Any priority other than 0 will raise an error " - "if the served model does not use priority scheduling." - ), - ) - cache_salt: str | None = Field( - default=None, - description=( - "If specified, the prefix cache will be salted with the provided " - "string to prevent an attacker to guess prompts in multi-user " - "environments. The salt should be random, protected from " - "access by 3rd parties, and long enough to be " - "unpredictable (e.g., 43 characters base64-encoded, corresponding " - "to 256 bit)." - ), - ) - - enable_response_messages: bool = Field( - default=False, - description=( - "Dictates whether or not to return messages as part of the " - "response object. Currently only supported for" - "non-background and gpt-oss only. " - ), - ) - # similar to input_messages / output_messages in ResponsesResponse - # we take in previous_input_messages (ie in harmony format) - # this cannot be used in conjunction with previous_response_id - # TODO: consider supporting non harmony messages as well - previous_input_messages: list[OpenAIHarmonyMessage | dict] | None = None - # --8<-- [end:responses-extra-params] - - _DEFAULT_SAMPLING_PARAMS = { - "temperature": 1.0, - "top_p": 1.0, - "top_k": 0, - } - - def to_sampling_params( - self, - default_max_tokens: int, - default_sampling_params: dict | None = None, - ) -> SamplingParams: - if self.max_output_tokens is None: - max_tokens = default_max_tokens - else: - max_tokens = min(self.max_output_tokens, default_max_tokens) - - default_sampling_params = default_sampling_params or {} - if (temperature := self.temperature) is None: - temperature = default_sampling_params.get( - "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"] - ) - if (top_p := self.top_p) is None: - top_p = default_sampling_params.get( - "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"] - ) - if (top_k := self.top_k) is None: - top_k = default_sampling_params.get( - "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"] - ) - stop_token_ids = default_sampling_params.get("stop_token_ids") - - # Structured output - structured_outputs = None - if self.text is not None and self.text.format is not None: - response_format = self.text.format - if ( - response_format.type == "json_schema" - and response_format.schema_ is not None - ): - structured_outputs = StructuredOutputsParams( - json=response_format.schema_ - ) - elif response_format.type == "json_object": - raise NotImplementedError("json_object is not supported") - - # TODO: add more parameters - return SamplingParams.from_optional( - temperature=temperature, - top_p=top_p, - top_k=top_k, - max_tokens=max_tokens, - logprobs=self.top_logprobs if self.is_include_output_logprobs() else None, - stop_token_ids=stop_token_ids, - output_kind=( - RequestOutputKind.DELTA if self.stream else RequestOutputKind.FINAL_ONLY - ), - structured_outputs=structured_outputs, - logit_bias=self.logit_bias, - skip_clone=True, # Created fresh per request, safe to skip clone - ) - - def is_include_output_logprobs(self) -> bool: - """Check if the request includes output logprobs.""" - if self.include is None: - return False - return ( - isinstance(self.include, list) - and "message.output_text.logprobs" in self.include - ) - - @model_validator(mode="before") - def validate_background(cls, data): - if not data.get("background"): - return data - if not data.get("store", True): - raise ValueError("background can only be used when `store` is true") - return data - - @model_validator(mode="before") - def validate_prompt(cls, data): - if data.get("prompt") is not None: - raise VLLMValidationError( - "prompt template is not supported", parameter="prompt" - ) - return data - - @model_validator(mode="before") - def check_cache_salt_support(cls, data): - if data.get("cache_salt") is not None and ( - not isinstance(data["cache_salt"], str) or not data["cache_salt"] - ): - raise ValueError( - "Parameter 'cache_salt' must be a non-empty string if provided." - ) - return data - - @model_validator(mode="before") - def function_call_parsing(cls, data): - """Parse function_call dictionaries into ResponseFunctionToolCall objects. - This ensures Pydantic can properly resolve union types in the input field. - Function calls provided as dicts are converted to ResponseFunctionToolCall - objects before validation, while invalid structures are left for Pydantic - to reject with appropriate error messages. - """ - - input_data = data.get("input") - - # Early return for None, strings, or bytes - # (strings are iterable but shouldn't be processed) - if input_data is None or isinstance(input_data, (str, bytes)): - return data - - # Convert iterators (like ValidatorIterator) to list - if not isinstance(input_data, list): - try: - input_data = list(input_data) - except TypeError: - # Not iterable, leave as-is for Pydantic to handle - return data - - processed_input = [] - for item in input_data: - if isinstance(item, dict) and item.get("type") == "function_call": - try: - processed_input.append(ResponseFunctionToolCall(**item)) - except ValidationError: - # Let Pydantic handle validation for malformed function calls - logger.debug( - "Failed to parse function_call to ResponseFunctionToolCall, " - "leaving for Pydantic validation" - ) - processed_input.append(item) - else: - processed_input.append(item) - - data["input"] = processed_input - return data - - class CompletionRequest(OpenAIBaseModel): # Ordered by official OpenAI API documentation # https://platform.openai.com/docs/api-reference/completions/create @@ -997,256 +717,6 @@ class TranscriptionStreamResponse(OpenAIBaseModel): usage: UsageInfo | None = Field(default=None) -class InputTokensDetails(OpenAIBaseModel): - cached_tokens: int - input_tokens_per_turn: list[int] = Field(default_factory=list) - cached_tokens_per_turn: list[int] = Field(default_factory=list) - - -class OutputTokensDetails(OpenAIBaseModel): - reasoning_tokens: int = 0 - tool_output_tokens: int = 0 - output_tokens_per_turn: list[int] = Field(default_factory=list) - tool_output_tokens_per_turn: list[int] = Field(default_factory=list) - - -class ResponseUsage(OpenAIBaseModel): - input_tokens: int - input_tokens_details: InputTokensDetails - output_tokens: int - output_tokens_details: OutputTokensDetails - total_tokens: int - - -def serialize_message(msg): - """ - Serializes a single message - """ - if isinstance(msg, dict): - return msg - elif hasattr(msg, "to_dict"): - return msg.to_dict() - else: - # fallback to pyandic dump - return msg.model_dump_json() - - -def serialize_messages(msgs): - """ - Serializes multiple messages - """ - return [serialize_message(msg) for msg in msgs] if msgs else None - - -class ResponseRawMessageAndToken(OpenAIBaseModel): - """Class to show the raw message. - If message / tokens diverge, tokens is the source of truth""" - - message: str - tokens: list[int] - type: Literal["raw_message_tokens"] = "raw_message_tokens" - - -ResponseInputOutputMessage: TypeAlias = ( - list[ChatCompletionMessageParam] | list[ResponseRawMessageAndToken] -) - - -class ResponsesResponse(OpenAIBaseModel): - id: str = Field(default_factory=lambda: f"resp_{random_uuid()}") - created_at: int = Field(default_factory=lambda: int(time.time())) - # error: Optional[ResponseError] = None - incomplete_details: IncompleteDetails | None = None - instructions: str | None = None - metadata: Metadata | None = None - model: str - object: Literal["response"] = "response" - output: list[ResponseOutputItem] - parallel_tool_calls: bool - temperature: float - tool_choice: ToolChoice - tools: list[Tool] - top_p: float - background: bool - max_output_tokens: int - max_tool_calls: int | None = None - previous_response_id: str | None = None - prompt: ResponsePrompt | None = None - reasoning: Reasoning | None = None - service_tier: Literal["auto", "default", "flex", "scale", "priority"] - status: ResponseStatus - text: ResponseTextConfig | None = None - top_logprobs: int | None = None - truncation: Literal["auto", "disabled"] - usage: ResponseUsage | None = None - user: str | None = None - - # --8<-- [start:responses-response-extra-params] - # These are populated when enable_response_messages is set to True - # NOTE: custom serialization is needed - # see serialize_input_messages and serialize_output_messages - input_messages: ResponseInputOutputMessage | None = Field( - default=None, - description=( - "If enable_response_messages, we can show raw token input to model." - ), - ) - output_messages: ResponseInputOutputMessage | None = Field( - default=None, - description=( - "If enable_response_messages, we can show raw token output of model." - ), - ) - # --8<-- [end:responses-response-extra-params] - - # NOTE: openAI harmony doesn't serialize TextContent properly, - # TODO: this fixes for TextContent, but need to verify for tools etc - # https://github.com/openai/harmony/issues/78 - @field_serializer("output_messages", when_used="json") - def serialize_output_messages(self, msgs, _info): - return serialize_messages(msgs) - - # NOTE: openAI harmony doesn't serialize TextContent properly, this fixes it - # https://github.com/openai/harmony/issues/78 - @field_serializer("input_messages", when_used="json") - def serialize_input_messages(self, msgs, _info): - return serialize_messages(msgs) - - @classmethod - def from_request( - cls, - request: ResponsesRequest, - sampling_params: SamplingParams, - model_name: str, - created_time: int, - output: list[ResponseOutputItem], - status: ResponseStatus, - usage: ResponseUsage | None = None, - input_messages: ResponseInputOutputMessage | None = None, - output_messages: ResponseInputOutputMessage | None = None, - ) -> "ResponsesResponse": - incomplete_details: IncompleteDetails | None = None - if status == "incomplete": - incomplete_details = IncompleteDetails(reason="max_output_tokens") - # TODO: implement the other reason for incomplete_details, - # which is content_filter - # incomplete_details = IncompleteDetails(reason='content_filter') - return cls( - id=request.request_id, - created_at=created_time, - incomplete_details=incomplete_details, - instructions=request.instructions, - metadata=request.metadata, - model=model_name, - output=output, - input_messages=input_messages, - output_messages=output_messages, - parallel_tool_calls=request.parallel_tool_calls, - temperature=sampling_params.temperature, - tool_choice=request.tool_choice, - tools=request.tools, - top_p=sampling_params.top_p, - background=request.background, - max_output_tokens=sampling_params.max_tokens, - max_tool_calls=request.max_tool_calls, - previous_response_id=request.previous_response_id, - prompt=request.prompt, - reasoning=request.reasoning, - service_tier=request.service_tier, - status=status, - text=request.text, - top_logprobs=sampling_params.logprobs, - truncation=request.truncation, - user=request.user, - usage=usage, - ) - - -# TODO: this code can be removed once -# https://github.com/openai/openai-python/issues/2634 has been resolved -class ResponseReasoningPartDoneEvent(OpenAIBaseModel): - content_index: int - """The index of the content part that is done.""" - - item_id: str - """The ID of the output item that the content part was added to.""" - - output_index: int - """The index of the output item that the content part was added to.""" - - part: ResponseReasoningTextContent - """The content part that is done.""" - - sequence_number: int - """The sequence number of this event.""" - - type: Literal["response.reasoning_part.done"] - """The type of the event. Always `response.reasoning_part.done`.""" - - -# TODO: this code can be removed once -# https://github.com/openai/openai-python/issues/2634 has been resolved -class ResponseReasoningPartAddedEvent(OpenAIBaseModel): - content_index: int - """The index of the content part that is done.""" - - item_id: str - """The ID of the output item that the content part was added to.""" - - output_index: int - """The index of the output item that the content part was added to.""" - - part: ResponseReasoningTextContent - """The content part that is done.""" - - sequence_number: int - """The sequence number of this event.""" - - type: Literal["response.reasoning_part.added"] - """The type of the event. Always `response.reasoning_part.added`.""" - - -# vLLM Streaming Events -# Note: we override the response type with the vLLM ResponsesResponse type -class ResponseCompletedEvent(OpenAIResponseCompletedEvent): - response: ResponsesResponse # type: ignore[override] - - -class ResponseCreatedEvent(OpenAIResponseCreatedEvent): - response: ResponsesResponse # type: ignore[override] - - -class ResponseInProgressEvent(OpenAIResponseInProgressEvent): - response: ResponsesResponse # type: ignore[override] - - -StreamingResponsesResponse: TypeAlias = ( - ResponseCreatedEvent - | ResponseInProgressEvent - | ResponseCompletedEvent - | ResponseOutputItemAddedEvent - | ResponseOutputItemDoneEvent - | ResponseContentPartAddedEvent - | ResponseContentPartDoneEvent - | ResponseReasoningTextDeltaEvent - | ResponseReasoningTextDoneEvent - | ResponseReasoningPartAddedEvent - | ResponseReasoningPartDoneEvent - | ResponseCodeInterpreterCallInProgressEvent - | ResponseCodeInterpreterCallCodeDeltaEvent - | ResponseWebSearchCallInProgressEvent - | ResponseWebSearchCallSearchingEvent - | ResponseWebSearchCallCompletedEvent - | ResponseCodeInterpreterCallCodeDoneEvent - | ResponseCodeInterpreterCallInterpretingEvent - | ResponseCodeInterpreterCallCompletedEvent - | ResponseMcpCallArgumentsDeltaEvent - | ResponseMcpCallArgumentsDoneEvent - | ResponseMcpCallInProgressEvent - | ResponseMcpCallCompletedEvent -) - - ## Protocols for Audio AudioResponseFormat: TypeAlias = Literal["json", "text", "srt", "verbose_json", "vtt"] diff --git a/vllm/entrypoints/openai/engine/serving.py b/vllm/entrypoints/openai/engine/serving.py index 666d83c21..045e2d068 100644 --- a/vllm/entrypoints/openai/engine/serving.py +++ b/vllm/entrypoints/openai/engine/serving.py @@ -50,13 +50,15 @@ from vllm.entrypoints.openai.engine.protocol import ( ErrorResponse, FunctionCall, FunctionDefinition, - ResponseInputOutputItem, - ResponsesRequest, TranscriptionRequest, TranscriptionResponse, TranslationRequest, VLLMValidationError, ) +from vllm.entrypoints.openai.responses.protocol import ( + ResponseInputOutputItem, + ResponsesRequest, +) from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.pooling.classify.protocol import ( ClassificationChatRequest, diff --git a/vllm/entrypoints/openai/parser/harmony_utils.py b/vllm/entrypoints/openai/parser/harmony_utils.py index a3959c873..e901fe6d7 100644 --- a/vllm/entrypoints/openai/parser/harmony_utils.py +++ b/vllm/entrypoints/openai/parser/harmony_utils.py @@ -44,7 +44,7 @@ from openai_harmony import Role as OpenAIHarmonyRole from vllm import envs from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionToolsParam -from vllm.entrypoints.openai.engine.protocol import ( +from vllm.entrypoints.openai.responses.protocol import ( ResponseInputOutputItem, ResponsesRequest, ) diff --git a/vllm/entrypoints/openai/parser/responses_parser.py b/vllm/entrypoints/openai/parser/responses_parser.py index bda7e9952..180520a1f 100644 --- a/vllm/entrypoints/openai/parser/responses_parser.py +++ b/vllm/entrypoints/openai/parser/responses_parser.py @@ -16,7 +16,7 @@ from openai.types.responses.response_reasoning_item import ( ) from vllm.entrypoints.constants import MCP_PREFIX -from vllm.entrypoints.openai.engine.protocol import ( +from vllm.entrypoints.openai.responses.protocol import ( ResponseInputOutputItem, ResponsesRequest, ) diff --git a/vllm/entrypoints/openai/responses/__init__.py b/vllm/entrypoints/openai/responses/__init__.py new file mode 100644 index 000000000..208f01a7c --- /dev/null +++ b/vllm/entrypoints/openai/responses/__init__.py @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project diff --git a/vllm/entrypoints/openai/responses/api_router.py b/vllm/entrypoints/openai/responses/api_router.py new file mode 100644 index 000000000..958cd3fec --- /dev/null +++ b/vllm/entrypoints/openai/responses/api_router.py @@ -0,0 +1,143 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +from collections.abc import AsyncGenerator +from http import HTTPStatus + +from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request +from fastapi.responses import JSONResponse, StreamingResponse + +from vllm.entrypoints.openai.engine.protocol import ErrorResponse +from vllm.entrypoints.openai.responses.protocol import ( + ResponsesRequest, + ResponsesResponse, + StreamingResponsesResponse, +) +from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses +from vllm.entrypoints.openai.utils import validate_json_request +from vllm.entrypoints.utils import ( + with_cancellation, +) +from vllm.logger import init_logger + +logger = init_logger(__name__) + +router = APIRouter() + + +def responses(request: Request) -> OpenAIServingResponses | None: + return request.app.state.openai_serving_responses + + +async def _convert_stream_to_sse_events( + generator: AsyncGenerator[StreamingResponsesResponse, None], +) -> AsyncGenerator[str, None]: + """Convert the generator to a stream of events in SSE format""" + async for event in generator: + event_type = getattr(event, "type", "unknown") + # https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#event_stream_format + event_data = ( + f"event: {event_type}\ndata: {event.model_dump_json(indent=None)}\n\n" + ) + yield event_data + + +@router.post( + "/v1/responses", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.OK.value: {"content": {"text/event-stream": {}}}, + HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, + HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse}, + HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, + }, +) +@with_cancellation +async def create_responses(request: ResponsesRequest, raw_request: Request): + handler = responses(raw_request) + if handler is None: + base_server = raw_request.app.state.openai_serving_tokenization + return base_server.create_error_response( + message="The model does not support Responses API" + ) + try: + generator = await handler.create_responses(request, raw_request) + except Exception as e: + raise HTTPException( + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e) + ) from e + + if isinstance(generator, ErrorResponse): + return JSONResponse( + content=generator.model_dump(), status_code=generator.error.code + ) + elif isinstance(generator, ResponsesResponse): + return JSONResponse(content=generator.model_dump()) + + return StreamingResponse( + content=_convert_stream_to_sse_events(generator), media_type="text/event-stream" + ) + + +@router.get("/v1/responses/{response_id}") +async def retrieve_responses( + response_id: str, + raw_request: Request, + starting_after: int | None = None, + stream: bool | None = False, +): + handler = responses(raw_request) + if handler is None: + base_server = raw_request.app.state.openai_serving_tokenization + return base_server.create_error_response( + message="The model does not support Responses API" + ) + + try: + response = await handler.retrieve_responses( + response_id, + starting_after=starting_after, + stream=stream, + ) + except Exception as e: + raise HTTPException( + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e) + ) from e + + if isinstance(response, ErrorResponse): + return JSONResponse( + content=response.model_dump(), status_code=response.error.code + ) + elif isinstance(response, ResponsesResponse): + return JSONResponse(content=response.model_dump()) + return StreamingResponse( + content=_convert_stream_to_sse_events(response), media_type="text/event-stream" + ) + + +@router.post("/v1/responses/{response_id}/cancel") +async def cancel_responses(response_id: str, raw_request: Request): + handler = responses(raw_request) + if handler is None: + base_server = raw_request.app.state.openai_serving_tokenization + return base_server.create_error_response( + message="The model does not support Responses API" + ) + + try: + response = await handler.cancel_responses(response_id) + except Exception as e: + raise HTTPException( + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e) + ) from e + + if isinstance(response, ErrorResponse): + return JSONResponse( + content=response.model_dump(), status_code=response.error.code + ) + return JSONResponse(content=response.model_dump()) + + +def attach_router(app: FastAPI): + app.include_router(router) diff --git a/vllm/entrypoints/openai/responses/protocol.py b/vllm/entrypoints/openai/responses/protocol.py new file mode 100644 index 000000000..0ff67373a --- /dev/null +++ b/vllm/entrypoints/openai/responses/protocol.py @@ -0,0 +1,554 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Adapted from +# https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py +import time +from typing import Any, Literal, TypeAlias + +from openai.types.responses import ( + ResponseCodeInterpreterCallCodeDeltaEvent, + ResponseCodeInterpreterCallCodeDoneEvent, + ResponseCodeInterpreterCallCompletedEvent, + ResponseCodeInterpreterCallInProgressEvent, + ResponseCodeInterpreterCallInterpretingEvent, + ResponseContentPartAddedEvent, + ResponseContentPartDoneEvent, + ResponseFunctionToolCall, + ResponseInputItemParam, + ResponseMcpCallArgumentsDeltaEvent, + ResponseMcpCallArgumentsDoneEvent, + ResponseMcpCallCompletedEvent, + ResponseMcpCallInProgressEvent, + ResponseOutputItem, + ResponseOutputItemAddedEvent, + ResponseOutputItemDoneEvent, + ResponsePrompt, + ResponseReasoningTextDeltaEvent, + ResponseReasoningTextDoneEvent, + ResponseStatus, + ResponseWebSearchCallCompletedEvent, + ResponseWebSearchCallInProgressEvent, + ResponseWebSearchCallSearchingEvent, +) +from openai.types.responses import ( + ResponseCompletedEvent as OpenAIResponseCompletedEvent, +) +from openai.types.responses import ResponseCreatedEvent as OpenAIResponseCreatedEvent +from openai.types.responses import ( + ResponseInProgressEvent as OpenAIResponseInProgressEvent, +) +from openai.types.responses.tool import Tool +from openai_harmony import Message as OpenAIHarmonyMessage + +# Backward compatibility for OpenAI client versions +try: # For older openai versions (< 1.100.0) + from openai.types.responses import ResponseTextConfig +except ImportError: # For newer openai versions (>= 1.100.0) + from openai.types.responses import ResponseFormatTextConfig as ResponseTextConfig + +from openai.types.responses.response import IncompleteDetails, ToolChoice +from openai.types.responses.response_reasoning_item import ( + Content as ResponseReasoningTextContent, +) +from openai.types.shared import Metadata, Reasoning +from pydantic import ( + Field, + ValidationError, + field_serializer, + model_validator, +) + +from vllm.entrypoints.chat_utils import ChatCompletionMessageParam +from vllm.entrypoints.openai.engine.protocol import ( + OpenAIBaseModel, +) +from vllm.exceptions import VLLMValidationError +from vllm.logger import init_logger +from vllm.sampling_params import ( + RequestOutputKind, + SamplingParams, + StructuredOutputsParams, +) +from vllm.utils import random_uuid + +logger = init_logger(__name__) + + +class InputTokensDetails(OpenAIBaseModel): + cached_tokens: int + input_tokens_per_turn: list[int] = Field(default_factory=list) + cached_tokens_per_turn: list[int] = Field(default_factory=list) + + +class OutputTokensDetails(OpenAIBaseModel): + reasoning_tokens: int = 0 + tool_output_tokens: int = 0 + output_tokens_per_turn: list[int] = Field(default_factory=list) + tool_output_tokens_per_turn: list[int] = Field(default_factory=list) + + +class ResponseUsage(OpenAIBaseModel): + input_tokens: int + input_tokens_details: InputTokensDetails + output_tokens: int + output_tokens_details: OutputTokensDetails + total_tokens: int + + +def serialize_message(msg): + """ + Serializes a single message + """ + if isinstance(msg, dict): + return msg + elif hasattr(msg, "to_dict"): + return msg.to_dict() + else: + # fallback to pyandic dump + return msg.model_dump_json() + + +def serialize_messages(msgs): + """ + Serializes multiple messages + """ + return [serialize_message(msg) for msg in msgs] if msgs else None + + +class ResponseRawMessageAndToken(OpenAIBaseModel): + """Class to show the raw message. + If message / tokens diverge, tokens is the source of truth""" + + message: str + tokens: list[int] + type: Literal["raw_message_tokens"] = "raw_message_tokens" + + +ResponseInputOutputMessage: TypeAlias = ( + list[ChatCompletionMessageParam] | list[ResponseRawMessageAndToken] +) +ResponseInputOutputItem: TypeAlias = ResponseInputItemParam | ResponseOutputItem + + +class ResponsesRequest(OpenAIBaseModel): + # Ordered by official OpenAI API documentation + # https://platform.openai.com/docs/api-reference/responses/create + background: bool | None = False + include: ( + list[ + Literal[ + "code_interpreter_call.outputs", + "computer_call_output.output.image_url", + "file_search_call.results", + "message.input_image.image_url", + "message.output_text.logprobs", + "reasoning.encrypted_content", + ], + ] + | None + ) = None + input: str | list[ResponseInputOutputItem] + instructions: str | None = None + max_output_tokens: int | None = None + max_tool_calls: int | None = None + metadata: Metadata | None = None + model: str | None = None + logit_bias: dict[str, float] | None = None + parallel_tool_calls: bool | None = True + previous_response_id: str | None = None + prompt: ResponsePrompt | None = None + reasoning: Reasoning | None = None + service_tier: Literal["auto", "default", "flex", "scale", "priority"] = "auto" + store: bool | None = True + stream: bool | None = False + temperature: float | None = None + text: ResponseTextConfig | None = None + tool_choice: ToolChoice = "auto" + tools: list[Tool] = Field(default_factory=list) + top_logprobs: int | None = 0 + top_p: float | None = None + top_k: int | None = None + truncation: Literal["auto", "disabled"] | None = "disabled" + user: str | None = None + + # --8<-- [start:responses-extra-params] + request_id: str = Field( + default_factory=lambda: f"resp_{random_uuid()}", + description=( + "The request_id related to this request. If the caller does " + "not set it, a random_uuid will be generated. This id is used " + "through out the inference process and return in response." + ), + ) + mm_processor_kwargs: dict[str, Any] | None = Field( + default=None, + description=("Additional kwargs to pass to the HF processor."), + ) + priority: int = Field( + default=0, + description=( + "The priority of the request (lower means earlier handling; " + "default: 0). Any priority other than 0 will raise an error " + "if the served model does not use priority scheduling." + ), + ) + cache_salt: str | None = Field( + default=None, + description=( + "If specified, the prefix cache will be salted with the provided " + "string to prevent an attacker to guess prompts in multi-user " + "environments. The salt should be random, protected from " + "access by 3rd parties, and long enough to be " + "unpredictable (e.g., 43 characters base64-encoded, corresponding " + "to 256 bit)." + ), + ) + + enable_response_messages: bool = Field( + default=False, + description=( + "Dictates whether or not to return messages as part of the " + "response object. Currently only supported for" + "non-background and gpt-oss only. " + ), + ) + # similar to input_messages / output_messages in ResponsesResponse + # we take in previous_input_messages (ie in harmony format) + # this cannot be used in conjunction with previous_response_id + # TODO: consider supporting non harmony messages as well + previous_input_messages: list[OpenAIHarmonyMessage | dict] | None = None + # --8<-- [end:responses-extra-params] + + _DEFAULT_SAMPLING_PARAMS = { + "temperature": 1.0, + "top_p": 1.0, + "top_k": 0, + } + + def to_sampling_params( + self, + default_max_tokens: int, + default_sampling_params: dict | None = None, + ) -> SamplingParams: + if self.max_output_tokens is None: + max_tokens = default_max_tokens + else: + max_tokens = min(self.max_output_tokens, default_max_tokens) + + default_sampling_params = default_sampling_params or {} + if (temperature := self.temperature) is None: + temperature = default_sampling_params.get( + "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"] + ) + if (top_p := self.top_p) is None: + top_p = default_sampling_params.get( + "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"] + ) + if (top_k := self.top_k) is None: + top_k = default_sampling_params.get( + "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"] + ) + stop_token_ids = default_sampling_params.get("stop_token_ids") + + # Structured output + structured_outputs = None + if self.text is not None and self.text.format is not None: + response_format = self.text.format + if ( + response_format.type == "json_schema" + and response_format.schema_ is not None + ): + structured_outputs = StructuredOutputsParams( + json=response_format.schema_ + ) + elif response_format.type == "json_object": + raise NotImplementedError("json_object is not supported") + + # TODO: add more parameters + return SamplingParams.from_optional( + temperature=temperature, + top_p=top_p, + top_k=top_k, + max_tokens=max_tokens, + logprobs=self.top_logprobs if self.is_include_output_logprobs() else None, + stop_token_ids=stop_token_ids, + output_kind=( + RequestOutputKind.DELTA if self.stream else RequestOutputKind.FINAL_ONLY + ), + structured_outputs=structured_outputs, + logit_bias=self.logit_bias, + skip_clone=True, # Created fresh per request, safe to skip clone + ) + + def is_include_output_logprobs(self) -> bool: + """Check if the request includes output logprobs.""" + if self.include is None: + return False + return ( + isinstance(self.include, list) + and "message.output_text.logprobs" in self.include + ) + + @model_validator(mode="before") + def validate_background(cls, data): + if not data.get("background"): + return data + if not data.get("store", True): + raise ValueError("background can only be used when `store` is true") + return data + + @model_validator(mode="before") + def validate_prompt(cls, data): + if data.get("prompt") is not None: + raise VLLMValidationError( + "prompt template is not supported", parameter="prompt" + ) + return data + + @model_validator(mode="before") + def check_cache_salt_support(cls, data): + if data.get("cache_salt") is not None and ( + not isinstance(data["cache_salt"], str) or not data["cache_salt"] + ): + raise ValueError( + "Parameter 'cache_salt' must be a non-empty string if provided." + ) + return data + + @model_validator(mode="before") + def function_call_parsing(cls, data): + """Parse function_call dictionaries into ResponseFunctionToolCall objects. + This ensures Pydantic can properly resolve union types in the input field. + Function calls provided as dicts are converted to ResponseFunctionToolCall + objects before validation, while invalid structures are left for Pydantic + to reject with appropriate error messages. + """ + + input_data = data.get("input") + + # Early return for None, strings, or bytes + # (strings are iterable but shouldn't be processed) + if input_data is None or isinstance(input_data, (str, bytes)): + return data + + # Convert iterators (like ValidatorIterator) to list + if not isinstance(input_data, list): + try: + input_data = list(input_data) + except TypeError: + # Not iterable, leave as-is for Pydantic to handle + return data + + processed_input = [] + for item in input_data: + if isinstance(item, dict) and item.get("type") == "function_call": + try: + processed_input.append(ResponseFunctionToolCall(**item)) + except ValidationError: + # Let Pydantic handle validation for malformed function calls + logger.debug( + "Failed to parse function_call to ResponseFunctionToolCall, " + "leaving for Pydantic validation" + ) + processed_input.append(item) + else: + processed_input.append(item) + + data["input"] = processed_input + return data + + +class ResponsesResponse(OpenAIBaseModel): + id: str = Field(default_factory=lambda: f"resp_{random_uuid()}") + created_at: int = Field(default_factory=lambda: int(time.time())) + # error: Optional[ResponseError] = None + incomplete_details: IncompleteDetails | None = None + instructions: str | None = None + metadata: Metadata | None = None + model: str + object: Literal["response"] = "response" + output: list[ResponseOutputItem] + parallel_tool_calls: bool + temperature: float + tool_choice: ToolChoice + tools: list[Tool] + top_p: float + background: bool + max_output_tokens: int + max_tool_calls: int | None = None + previous_response_id: str | None = None + prompt: ResponsePrompt | None = None + reasoning: Reasoning | None = None + service_tier: Literal["auto", "default", "flex", "scale", "priority"] + status: ResponseStatus + text: ResponseTextConfig | None = None + top_logprobs: int | None = None + truncation: Literal["auto", "disabled"] + usage: ResponseUsage | None = None + user: str | None = None + + # --8<-- [start:responses-response-extra-params] + # These are populated when enable_response_messages is set to True + # NOTE: custom serialization is needed + # see serialize_input_messages and serialize_output_messages + input_messages: ResponseInputOutputMessage | None = Field( + default=None, + description=( + "If enable_response_messages, we can show raw token input to model." + ), + ) + output_messages: ResponseInputOutputMessage | None = Field( + default=None, + description=( + "If enable_response_messages, we can show raw token output of model." + ), + ) + # --8<-- [end:responses-response-extra-params] + + # NOTE: openAI harmony doesn't serialize TextContent properly, + # TODO: this fixes for TextContent, but need to verify for tools etc + # https://github.com/openai/harmony/issues/78 + @field_serializer("output_messages", when_used="json") + def serialize_output_messages(self, msgs, _info): + return serialize_messages(msgs) + + # NOTE: openAI harmony doesn't serialize TextContent properly, this fixes it + # https://github.com/openai/harmony/issues/78 + @field_serializer("input_messages", when_used="json") + def serialize_input_messages(self, msgs, _info): + return serialize_messages(msgs) + + @classmethod + def from_request( + cls, + request: ResponsesRequest, + sampling_params: SamplingParams, + model_name: str, + created_time: int, + output: list[ResponseOutputItem], + status: ResponseStatus, + usage: ResponseUsage | None = None, + input_messages: ResponseInputOutputMessage | None = None, + output_messages: ResponseInputOutputMessage | None = None, + ) -> "ResponsesResponse": + incomplete_details: IncompleteDetails | None = None + if status == "incomplete": + incomplete_details = IncompleteDetails(reason="max_output_tokens") + # TODO: implement the other reason for incomplete_details, + # which is content_filter + # incomplete_details = IncompleteDetails(reason='content_filter') + return cls( + id=request.request_id, + created_at=created_time, + incomplete_details=incomplete_details, + instructions=request.instructions, + metadata=request.metadata, + model=model_name, + output=output, + input_messages=input_messages, + output_messages=output_messages, + parallel_tool_calls=request.parallel_tool_calls, + temperature=sampling_params.temperature, + tool_choice=request.tool_choice, + tools=request.tools, + top_p=sampling_params.top_p, + background=request.background, + max_output_tokens=sampling_params.max_tokens, + max_tool_calls=request.max_tool_calls, + previous_response_id=request.previous_response_id, + prompt=request.prompt, + reasoning=request.reasoning, + service_tier=request.service_tier, + status=status, + text=request.text, + top_logprobs=sampling_params.logprobs, + truncation=request.truncation, + user=request.user, + usage=usage, + ) + + +# TODO: this code can be removed once +# https://github.com/openai/openai-python/issues/2634 has been resolved +class ResponseReasoningPartDoneEvent(OpenAIBaseModel): + content_index: int + """The index of the content part that is done.""" + + item_id: str + """The ID of the output item that the content part was added to.""" + + output_index: int + """The index of the output item that the content part was added to.""" + + part: ResponseReasoningTextContent + """The content part that is done.""" + + sequence_number: int + """The sequence number of this event.""" + + type: Literal["response.reasoning_part.done"] + """The type of the event. Always `response.reasoning_part.done`.""" + + +# TODO: this code can be removed once +# https://github.com/openai/openai-python/issues/2634 has been resolved +class ResponseReasoningPartAddedEvent(OpenAIBaseModel): + content_index: int + """The index of the content part that is done.""" + + item_id: str + """The ID of the output item that the content part was added to.""" + + output_index: int + """The index of the output item that the content part was added to.""" + + part: ResponseReasoningTextContent + """The content part that is done.""" + + sequence_number: int + """The sequence number of this event.""" + + type: Literal["response.reasoning_part.added"] + """The type of the event. Always `response.reasoning_part.added`.""" + + +# vLLM Streaming Events +# Note: we override the response type with the vLLM ResponsesResponse type +class ResponseCompletedEvent(OpenAIResponseCompletedEvent): + response: ResponsesResponse # type: ignore[override] + + +class ResponseCreatedEvent(OpenAIResponseCreatedEvent): + response: ResponsesResponse # type: ignore[override] + + +class ResponseInProgressEvent(OpenAIResponseInProgressEvent): + response: ResponsesResponse # type: ignore[override] + + +StreamingResponsesResponse: TypeAlias = ( + ResponseCreatedEvent + | ResponseInProgressEvent + | ResponseCompletedEvent + | ResponseOutputItemAddedEvent + | ResponseOutputItemDoneEvent + | ResponseContentPartAddedEvent + | ResponseContentPartDoneEvent + | ResponseReasoningTextDeltaEvent + | ResponseReasoningTextDoneEvent + | ResponseReasoningPartAddedEvent + | ResponseReasoningPartDoneEvent + | ResponseCodeInterpreterCallInProgressEvent + | ResponseCodeInterpreterCallCodeDeltaEvent + | ResponseWebSearchCallInProgressEvent + | ResponseWebSearchCallSearchingEvent + | ResponseWebSearchCallCompletedEvent + | ResponseCodeInterpreterCallCodeDoneEvent + | ResponseCodeInterpreterCallInterpretingEvent + | ResponseCodeInterpreterCallCompletedEvent + | ResponseMcpCallArgumentsDeltaEvent + | ResponseMcpCallArgumentsDoneEvent + | ResponseMcpCallInProgressEvent + | ResponseMcpCallCompletedEvent +) diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/responses/serving.py similarity index 99% rename from vllm/entrypoints/openai/serving_responses.py rename to vllm/entrypoints/openai/responses/serving.py index 821a936a2..8be40f741 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/responses/serving.py @@ -75,19 +75,7 @@ from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.engine.protocol import ( DeltaMessage, ErrorResponse, - InputTokensDetails, - OutputTokensDetails, RequestResponseMetadata, - ResponseCompletedEvent, - ResponseCreatedEvent, - ResponseInProgressEvent, - ResponseInputOutputMessage, - ResponseReasoningPartAddedEvent, - ResponseReasoningPartDoneEvent, - ResponsesRequest, - ResponsesResponse, - ResponseUsage, - StreamingResponsesResponse, VLLMValidationError, ) from vllm.entrypoints.openai.engine.serving import ( @@ -106,6 +94,20 @@ from vllm.entrypoints.openai.parser.harmony_utils import ( parse_response_input, render_for_completion, ) +from vllm.entrypoints.openai.responses.protocol import ( + InputTokensDetails, + OutputTokensDetails, + ResponseCompletedEvent, + ResponseCreatedEvent, + ResponseInProgressEvent, + ResponseInputOutputMessage, + ResponseReasoningPartAddedEvent, + ResponseReasoningPartDoneEvent, + ResponsesRequest, + ResponsesResponse, + ResponseUsage, + StreamingResponsesResponse, +) from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.responses_utils import ( construct_input_messages, @@ -590,7 +592,6 @@ class OpenAIServingResponses(OpenAIServing): prev_msg=self.msg_store.get(prev_response.id) if prev_response else None, prev_response_output=prev_response.output if prev_response else None, ) - # Check if we should continue the final message (partial completion) # This enables Anthropic-style partial message completion where the # user provides an incomplete assistant message to continue from. diff --git a/vllm/entrypoints/responses_utils.py b/vllm/entrypoints/responses_utils.py index f37d891dd..1069fa937 100644 --- a/vllm/entrypoints/responses_utils.py +++ b/vllm/entrypoints/responses_utils.py @@ -22,10 +22,8 @@ from openai.types.responses.tool import Tool from vllm import envs from vllm.entrypoints.constants import MCP_PREFIX -from vllm.entrypoints.openai.engine.protocol import ( - ChatCompletionMessageParam, - ResponseInputOutputItem, -) +from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionMessageParam +from vllm.entrypoints.openai.responses.protocol import ResponseInputOutputItem def should_continue_final_message( diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py index 63ce096d0..0593c8125 100644 --- a/vllm/reasoning/abs_reasoning_parsers.py +++ b/vllm/reasoning/abs_reasoning_parsers.py @@ -19,6 +19,8 @@ if TYPE_CHECKING: ) from vllm.entrypoints.openai.engine.protocol import ( DeltaMessage, + ) + from vllm.entrypoints.openai.responses.protocol import ( ResponsesRequest, ) from vllm.tokenizers import TokenizerLike diff --git a/vllm/reasoning/basic_parsers.py b/vllm/reasoning/basic_parsers.py index ea0aa1dce..a854ea79a 100644 --- a/vllm/reasoning/basic_parsers.py +++ b/vllm/reasoning/basic_parsers.py @@ -13,7 +13,7 @@ if TYPE_CHECKING: from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, ) - from vllm.entrypoints.openai.engine.protocol import ( + from vllm.entrypoints.openai.responses.protocol import ( ResponsesRequest, ) else: diff --git a/vllm/reasoning/minimax_m2_reasoning_parser.py b/vllm/reasoning/minimax_m2_reasoning_parser.py index 06b97d39e..3d7942be5 100644 --- a/vllm/reasoning/minimax_m2_reasoning_parser.py +++ b/vllm/reasoning/minimax_m2_reasoning_parser.py @@ -8,6 +8,8 @@ from vllm.entrypoints.openai.chat_completion.protocol import ( ) from vllm.entrypoints.openai.engine.protocol import ( DeltaMessage, +) +from vllm.entrypoints.openai.responses.protocol import ( ResponsesRequest, ) from vllm.logger import init_logger diff --git a/vllm/reasoning/mistral_reasoning_parser.py b/vllm/reasoning/mistral_reasoning_parser.py index 05ff2bff0..05dcbac5c 100644 --- a/vllm/reasoning/mistral_reasoning_parser.py +++ b/vllm/reasoning/mistral_reasoning_parser.py @@ -6,7 +6,7 @@ from functools import cached_property from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, ) -from vllm.entrypoints.openai.engine.protocol import ( +from vllm.entrypoints.openai.responses.protocol import ( ResponsesRequest, ) from vllm.logger import init_logger diff --git a/vllm/reasoning/olmo3_reasoning_parser.py b/vllm/reasoning/olmo3_reasoning_parser.py index 9ee8fe37e..89db4881e 100644 --- a/vllm/reasoning/olmo3_reasoning_parser.py +++ b/vllm/reasoning/olmo3_reasoning_parser.py @@ -15,6 +15,8 @@ from vllm.entrypoints.openai.chat_completion.protocol import ( ) from vllm.entrypoints.openai.engine.protocol import ( DeltaMessage, +) +from vllm.entrypoints.openai.responses.protocol import ( ResponsesRequest, ) from vllm.logger import init_logger diff --git a/vllm/reasoning/qwen3_reasoning_parser.py b/vllm/reasoning/qwen3_reasoning_parser.py index 9c2f7404b..fc12ce540 100644 --- a/vllm/reasoning/qwen3_reasoning_parser.py +++ b/vllm/reasoning/qwen3_reasoning_parser.py @@ -4,7 +4,9 @@ from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionRequest, ) -from vllm.entrypoints.openai.engine.protocol import ResponsesRequest +from vllm.entrypoints.openai.responses.protocol import ( + ResponsesRequest, +) from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser diff --git a/vllm/tool_parsers/abstract_tool_parser.py b/vllm/tool_parsers/abstract_tool_parser.py index feb3ce6f3..5416027ed 100644 --- a/vllm/tool_parsers/abstract_tool_parser.py +++ b/vllm/tool_parsers/abstract_tool_parser.py @@ -14,6 +14,8 @@ from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionReque from vllm.entrypoints.openai.engine.protocol import ( DeltaMessage, ExtractedToolCallInformation, +) +from vllm.entrypoints.openai.responses.protocol import ( ResponsesRequest, ResponseTextConfig, )