Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
6d98843b
Unverified
Commit
6d98843b
authored
Aug 03, 2025
by
Woosuk Kwon
Committed by
GitHub
Aug 03, 2025
Browse files
[Responses API] Disable response store by default (#22137)
Signed-off-by:
Woosuk Kwon
<
woosuk.kwon@berkeley.edu
>
parent
aefeea0f
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
46 additions
and
10 deletions
+46
-10
tests/v1/entrypoints/openai/responses/conftest.py
tests/v1/entrypoints/openai/responses/conftest.py
+8
-4
tests/v1/entrypoints/openai/responses/test_image.py
tests/v1/entrypoints/openai/responses/test_image.py
+5
-2
vllm/entrypoints/openai/serving_responses.py
vllm/entrypoints/openai/serving_responses.py
+21
-4
vllm/envs.py
vllm/envs.py
+12
-0
No files found.
tests/v1/entrypoints/openai/responses/conftest.py
View file @
6d98843b
...
@@ -21,12 +21,16 @@ def default_server_args():
...
@@ -21,12 +21,16 @@ def default_server_args():
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
server
(
default_server_args
):
def
server_with_store
(
default_server_args
):
with
RemoteOpenAIServer
(
MODEL_NAME
,
default_server_args
)
as
remote_server
:
with
RemoteOpenAIServer
(
MODEL_NAME
,
default_server_args
,
env_dict
=
{
"VLLM_ENABLE_RESPONSES_API_STORE"
:
"1"
},
)
as
remote_server
:
yield
remote_server
yield
remote_server
@
pytest_asyncio
.
fixture
@
pytest_asyncio
.
fixture
async
def
client
(
server
):
async
def
client
(
server
_with_store
):
async
with
server
.
get_async_client
()
as
async_client
:
async
with
server
_with_store
.
get_async_client
()
as
async_client
:
yield
async_client
yield
async_client
tests/v1/entrypoints/openai/responses/test_image.py
View file @
6d98843b
...
@@ -37,8 +37,11 @@ def default_image_server_args():
...
@@ -37,8 +37,11 @@ def default_image_server_args():
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
def
image_server
(
default_image_server_args
):
def
image_server
(
default_image_server_args
):
with
RemoteOpenAIServer
(
MODEL_NAME
,
with
RemoteOpenAIServer
(
default_image_server_args
)
as
remote_server
:
MODEL_NAME
,
default_image_server_args
,
env_dict
=
{
"VLLM_ENABLE_RESPONSES_API_STORE"
:
"1"
},
)
as
remote_server
:
yield
remote_server
yield
remote_server
...
...
vllm/entrypoints/openai/serving_responses.py
View file @
6d98843b
...
@@ -11,6 +11,7 @@ import jinja2
...
@@ -11,6 +11,7 @@ import jinja2
from
fastapi
import
Request
from
fastapi
import
Request
from
openai.types.responses
import
ResponseOutputMessage
,
ResponseOutputText
from
openai.types.responses
import
ResponseOutputMessage
,
ResponseOutputText
from
vllm
import
envs
from
vllm.config
import
ModelConfig
from
vllm.config
import
ModelConfig
from
vllm.engine.protocol
import
EngineClient
from
vllm.engine.protocol
import
EngineClient
from
vllm.entrypoints.chat_utils
import
(
ChatCompletionMessageParam
,
from
vllm.entrypoints.chat_utils
import
(
ChatCompletionMessageParam
,
...
@@ -89,15 +90,17 @@ class OpenAIServingResponses(OpenAIServing):
...
@@ -89,15 +90,17 @@ class OpenAIServingResponses(OpenAIServing):
logger
.
info
(
"Using default chat sampling params from %s: %s"
,
logger
.
info
(
"Using default chat sampling params from %s: %s"
,
source
,
self
.
default_sampling_params
)
source
,
self
.
default_sampling_params
)
# False by default.
self
.
enable_store
=
envs
.
VLLM_ENABLE_RESPONSES_API_STORE
# HACK(woosuk): This is a hack. We should use a better store.
# HACK(woosuk): This is a hack. We should use a better store.
# FIXME:
This
cause
s
a memory leak since we
never remove responses
# FIXME:
If enable_store=True, this may
cause a memory leak since we
# from the store.
#
never remove responses
from the store.
self
.
response_store
:
dict
[
str
,
ResponsesResponse
]
=
{}
self
.
response_store
:
dict
[
str
,
ResponsesResponse
]
=
{}
self
.
response_store_lock
=
asyncio
.
Lock
()
self
.
response_store_lock
=
asyncio
.
Lock
()
# HACK(woosuk): This is a hack. We should use a better store.
# HACK(woosuk): This is a hack. We should use a better store.
# FIXME:
This
cause
s
a memory leak since we
never remove messages
# FIXME:
If enable_store=True, this may
cause a memory leak since we
# from the store.
#
never remove messages
from the store.
self
.
msg_store
:
dict
[
str
,
list
[
ChatCompletionMessageParam
]]
=
{}
self
.
msg_store
:
dict
[
str
,
list
[
ChatCompletionMessageParam
]]
=
{}
self
.
background_tasks
:
dict
[
str
,
asyncio
.
Task
]
=
{}
self
.
background_tasks
:
dict
[
str
,
asyncio
.
Task
]
=
{}
...
@@ -118,6 +121,10 @@ class OpenAIServingResponses(OpenAIServing):
...
@@ -118,6 +121,10 @@ class OpenAIServingResponses(OpenAIServing):
if
self
.
engine_client
.
errored
:
if
self
.
engine_client
.
errored
:
raise
self
.
engine_client
.
dead_error
raise
self
.
engine_client
.
dead_error
# If store is not enabled, return an error.
if
request
.
store
and
not
self
.
enable_store
:
return
self
.
_make_store_not_supported_error
()
# Handle the previous response ID.
# Handle the previous response ID.
prev_response_id
=
request
.
previous_response_id
prev_response_id
=
request
.
previous_response_id
if
prev_response_id
is
not
None
:
if
prev_response_id
is
not
None
:
...
@@ -456,3 +463,13 @@ class OpenAIServingResponses(OpenAIServing):
...
@@ -456,3 +463,13 @@ class OpenAIServingResponses(OpenAIServing):
message
=
f
"Response with id '
{
response_id
}
' not found."
,
message
=
f
"Response with id '
{
response_id
}
' not found."
,
status_code
=
HTTPStatus
.
NOT_FOUND
,
status_code
=
HTTPStatus
.
NOT_FOUND
,
)
)
def
_make_store_not_supported_error
(
self
)
->
ErrorResponse
:
return
self
.
create_error_response
(
err_type
=
"invalid_request_error"
,
message
=
(
"`store=True` (default) is not supported. Please set "
"`store=False` in Responses API or set "
"`VLLM_ENABLE_RESPONSES_API_STORE=1` in the env var when "
"starting the vLLM server."
),
status_code
=
HTTPStatus
.
BAD_REQUEST
,
)
vllm/envs.py
View file @
6d98843b
...
@@ -151,6 +151,7 @@ if TYPE_CHECKING:
...
@@ -151,6 +151,7 @@ if TYPE_CHECKING:
VLLM_ENABLE_CUDAGRAPH_GC
:
bool
=
False
VLLM_ENABLE_CUDAGRAPH_GC
:
bool
=
False
VLLM_LOOPBACK_IP
:
str
=
""
VLLM_LOOPBACK_IP
:
str
=
""
VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE
:
bool
=
False
VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE
:
bool
=
False
VLLM_ENABLE_RESPONSES_API_STORE
:
bool
=
False
def
get_default_cache_root
():
def
get_default_cache_root
():
...
@@ -1056,6 +1057,17 @@ environment_variables: dict[str, Callable[[], Any]] = {
...
@@ -1056,6 +1057,17 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE"
:
"VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE"
:
lambda
:
bool
(
int
(
os
.
getenv
(
\
lambda
:
bool
(
int
(
os
.
getenv
(
\
"VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE"
,
"0"
))),
"VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE"
,
"0"
))),
# Enables support for the "store" option in the OpenAI Responses API.
# When set to 1, vLLM's OpenAI server will retain the input and output
# messages for those requests in memory. By default, this is disabled (0).
# NOTE/WARNING:
# 1. Messages are kept in memory only (not persisted to disk) and will be
# lost when the vLLM server shuts down.
# 2. Enabling this option will cause a memory leak, as stored messages are
# never removed from memory until the server terminates.
"VLLM_ENABLE_RESPONSES_API_STORE"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_ENABLE_RESPONSES_API_STORE"
,
"0"
))),
}
}
# --8<-- [end:env-vars-definition]
# --8<-- [end:env-vars-definition]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment