[Responses API] Disable response store by default (#22137)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-03 04:04:21 -07:00
parent aefeea0fde
commit 6d98843b31
4 changed files with 46 additions and 10 deletions
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -151,6 +151,7 @@ if TYPE_CHECKING:
    VLLM_ENABLE_CUDAGRAPH_GC: bool = False
    VLLM_LOOPBACK_IP: str = ""
    VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False
+    VLLM_ENABLE_RESPONSES_API_STORE: bool = False


 def get_default_cache_root():
@@ -1056,6 +1057,17 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE":
    lambda: bool(int(os.getenv(\
            "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE", "0"))),
+
+    # Enables support for the "store" option in the OpenAI Responses API.
+    # When set to 1, vLLM's OpenAI server keeps the input and output messages
+    # of requests that use it in memory. By default, this is disabled (0).
+    # NOTE/WARNING:
+    # 1. Messages are kept in memory only (not persisted to disk) and will be
+    #    lost when the vLLM server shuts down.
+    # 2. Enabling this option will cause a memory leak, as stored messages are
+    #    never removed from memory until the server terminates.
+    "VLLM_ENABLE_RESPONSES_API_STORE":
+    lambda: bool(int(os.getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0"))),
 }

 # --8<-- [end:env-vars-definition]