[Responses API] Disable response store by default (#22137)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
@@ -11,6 +11,7 @@ import jinja2
|
||||
from fastapi import Request
|
||||
from openai.types.responses import ResponseOutputMessage, ResponseOutputText
|
||||
|
||||
from vllm import envs
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
|
||||
@@ -89,15 +90,17 @@ class OpenAIServingResponses(OpenAIServing):
|
||||
logger.info("Using default chat sampling params from %s: %s",
|
||||
source, self.default_sampling_params)
|
||||
|
||||
# False by default.
|
||||
self.enable_store = envs.VLLM_ENABLE_RESPONSES_API_STORE
|
||||
# HACK(woosuk): This is a hack. We should use a better store.
|
||||
# FIXME: This causes a memory leak since we never remove responses
|
||||
# from the store.
|
||||
# FIXME: If enable_store=True, this may cause a memory leak since we
|
||||
# never remove responses from the store.
|
||||
self.response_store: dict[str, ResponsesResponse] = {}
|
||||
self.response_store_lock = asyncio.Lock()
|
||||
|
||||
# HACK(woosuk): This is a hack. We should use a better store.
|
||||
# FIXME: This causes a memory leak since we never remove messages
|
||||
# from the store.
|
||||
# FIXME: If enable_store=True, this may cause a memory leak since we
|
||||
# never remove messages from the store.
|
||||
self.msg_store: dict[str, list[ChatCompletionMessageParam]] = {}
|
||||
|
||||
self.background_tasks: dict[str, asyncio.Task] = {}
|
||||
@@ -118,6 +121,10 @@ class OpenAIServingResponses(OpenAIServing):
|
||||
if self.engine_client.errored:
|
||||
raise self.engine_client.dead_error
|
||||
|
||||
# If store is not enabled, return an error.
|
||||
if request.store and not self.enable_store:
|
||||
return self._make_store_not_supported_error()
|
||||
|
||||
# Handle the previous response ID.
|
||||
prev_response_id = request.previous_response_id
|
||||
if prev_response_id is not None:
|
||||
@@ -456,3 +463,13 @@ class OpenAIServingResponses(OpenAIServing):
|
||||
message=f"Response with id '{response_id}' not found.",
|
||||
status_code=HTTPStatus.NOT_FOUND,
|
||||
)
|
||||
|
||||
def _make_store_not_supported_error(self) -> ErrorResponse:
|
||||
return self.create_error_response(
|
||||
err_type="invalid_request_error",
|
||||
message=("`store=True` (default) is not supported. Please set "
|
||||
"`store=False` in Responses API or set "
|
||||
"`VLLM_ENABLE_RESPONSES_API_STORE=1` in the env var when "
|
||||
"starting the vLLM server."),
|
||||
status_code=HTTPStatus.BAD_REQUEST,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user