[Responses API] Disable response store by default (#22137)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
1 changed file: vllm/envs.py (12 additions, 0 deletions)
@@ -151,6 +151,7 @@ if TYPE_CHECKING:
     VLLM_ENABLE_CUDAGRAPH_GC: bool = False
     VLLM_LOOPBACK_IP: str = ""
     VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False
+    VLLM_ENABLE_RESPONSES_API_STORE: bool = False


 def get_default_cache_root():

@@ -1056,6 +1057,17 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE":
     lambda: bool(int(os.getenv(\
         "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE", "0"))),
+
+    # Enables support for the "store" option in the OpenAI Responses API.
+    # When set to 1, vLLM's OpenAI server will retain the input and output
+    # messages for those requests in memory. By default, this is disabled (0).
+    # NOTE/WARNING:
+    # 1. Messages are kept in memory only (not persisted to disk) and will be
+    #    lost when the vLLM server shuts down.
+    # 2. Enabling this option will cause a memory leak, as stored messages are
+    #    never removed from memory until the server terminates.
+    "VLLM_ENABLE_RESPONSES_API_STORE":
+    lambda: bool(int(os.getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0"))),
 }

 # --8<-- [end:env-vars-definition]
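For context on what the new flag gates, below is a minimal client-side sketch, not part of this commit. It assumes a vLLM OpenAI-compatible server launched with VLLM_ENABLE_RESPONSES_API_STORE=1, an openai Python client recent enough to include the Responses API, and that the server serves retrieval by response ID from its in-memory store; the base URL, API key, and model name are placeholders.

# Usage sketch (assumptions: server started with
#   VLLM_ENABLE_RESPONSES_API_STORE=1 vllm serve <model>
# listening on localhost:8000; the installed openai package ships the
# Responses API; retrieval by ID is backed by the in-memory store).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# store=True asks the server to keep this request/response pair in memory;
# with the env var left at its default of 0, the server does not retain it.
resp = client.responses.create(
    model="<served-model-name>",  # placeholder
    input="Say hello.",
    store=True,
)

# The stored response should remain retrievable by ID until the server
# shuts down; nothing is persisted to disk.
fetched = client.responses.retrieve(resp.id)
print(fetched.output_text)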