Commit 6d98843b (unverified), authored by Woosuk Kwon, committed by GitHub
Browse files

[Responses API] Disable response store by default (#22137)


Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
parent aefeea0f
...@@ -21,12 +21,16 @@ def default_server_args(): ...@@ -21,12 +21,16 @@ def default_server_args():
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(default_server_args): def server_with_store(default_server_args):
with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server: with RemoteOpenAIServer(
MODEL_NAME,
default_server_args,
env_dict={"VLLM_ENABLE_RESPONSES_API_STORE": "1"},
) as remote_server:
yield remote_server yield remote_server
@pytest_asyncio.fixture @pytest_asyncio.fixture
async def client(server): async def client(server_with_store):
async with server.get_async_client() as async_client: async with server_with_store.get_async_client() as async_client:
yield async_client yield async_client
...@@ -37,8 +37,11 @@ def default_image_server_args(): ...@@ -37,8 +37,11 @@ def default_image_server_args():
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def image_server(default_image_server_args): def image_server(default_image_server_args):
with RemoteOpenAIServer(MODEL_NAME, with RemoteOpenAIServer(
default_image_server_args) as remote_server: MODEL_NAME,
default_image_server_args,
env_dict={"VLLM_ENABLE_RESPONSES_API_STORE": "1"},
) as remote_server:
yield remote_server yield remote_server
......
...@@ -11,6 +11,7 @@ import jinja2 ...@@ -11,6 +11,7 @@ import jinja2
from fastapi import Request from fastapi import Request
from openai.types.responses import ResponseOutputMessage, ResponseOutputText from openai.types.responses import ResponseOutputMessage, ResponseOutputText
from vllm import envs
from vllm.config import ModelConfig from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
...@@ -89,15 +90,17 @@ class OpenAIServingResponses(OpenAIServing): ...@@ -89,15 +90,17 @@ class OpenAIServingResponses(OpenAIServing):
logger.info("Using default chat sampling params from %s: %s", logger.info("Using default chat sampling params from %s: %s",
source, self.default_sampling_params) source, self.default_sampling_params)
# False by default.
self.enable_store = envs.VLLM_ENABLE_RESPONSES_API_STORE
# HACK(woosuk): This is a hack. We should use a better store. # HACK(woosuk): This is a hack. We should use a better store.
# FIXME: This causes a memory leak since we never remove responses # FIXME: If enable_store=True, this may cause a memory leak since we
# from the store. # never remove responses from the store.
self.response_store: dict[str, ResponsesResponse] = {} self.response_store: dict[str, ResponsesResponse] = {}
self.response_store_lock = asyncio.Lock() self.response_store_lock = asyncio.Lock()
# HACK(woosuk): This is a hack. We should use a better store. # HACK(woosuk): This is a hack. We should use a better store.
# FIXME: This causes a memory leak since we never remove messages # FIXME: If enable_store=True, this may cause a memory leak since we
# from the store. # never remove messages from the store.
self.msg_store: dict[str, list[ChatCompletionMessageParam]] = {} self.msg_store: dict[str, list[ChatCompletionMessageParam]] = {}
self.background_tasks: dict[str, asyncio.Task] = {} self.background_tasks: dict[str, asyncio.Task] = {}
...@@ -118,6 +121,10 @@ class OpenAIServingResponses(OpenAIServing): ...@@ -118,6 +121,10 @@ class OpenAIServingResponses(OpenAIServing):
if self.engine_client.errored: if self.engine_client.errored:
raise self.engine_client.dead_error raise self.engine_client.dead_error
# If store is not enabled, return an error.
if request.store and not self.enable_store:
return self._make_store_not_supported_error()
# Handle the previous response ID. # Handle the previous response ID.
prev_response_id = request.previous_response_id prev_response_id = request.previous_response_id
if prev_response_id is not None: if prev_response_id is not None:
...@@ -456,3 +463,13 @@ class OpenAIServingResponses(OpenAIServing): ...@@ -456,3 +463,13 @@ class OpenAIServingResponses(OpenAIServing):
message=f"Response with id '{response_id}' not found.", message=f"Response with id '{response_id}' not found.",
status_code=HTTPStatus.NOT_FOUND, status_code=HTTPStatus.NOT_FOUND,
) )
def _make_store_not_supported_error(self) -> ErrorResponse:
    """Build the error response for a request that asks for `store=True`.

    The result is an `invalid_request_error` with HTTP 400 (Bad Request).
    Its message names the two remedies: send `store=False`, or start the
    server with `VLLM_ENABLE_RESPONSES_API_STORE=1`.

    Returns:
        Whatever `self.create_error_response` produces for these arguments.
    """
    # Kept as one named value so the user-facing wording is easy to find.
    explanation = (
        "`store=True` (default) is not supported. Please set "
        "`store=False` in Responses API or set "
        "`VLLM_ENABLE_RESPONSES_API_STORE=1` in the env var when "
        "starting the vLLM server."
    )
    return self.create_error_response(
        message=explanation,
        err_type="invalid_request_error",
        status_code=HTTPStatus.BAD_REQUEST,
    )
...@@ -151,6 +151,7 @@ if TYPE_CHECKING: ...@@ -151,6 +151,7 @@ if TYPE_CHECKING:
VLLM_ENABLE_CUDAGRAPH_GC: bool = False VLLM_ENABLE_CUDAGRAPH_GC: bool = False
VLLM_LOOPBACK_IP: str = "" VLLM_LOOPBACK_IP: str = ""
VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE: bool = False
VLLM_ENABLE_RESPONSES_API_STORE: bool = False
def get_default_cache_root(): def get_default_cache_root():
...@@ -1056,6 +1057,17 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -1056,6 +1057,17 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE": "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE":
lambda: bool(int(os.getenv(\ lambda: bool(int(os.getenv(\
"VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE", "0"))), "VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE", "0"))),
# Enables support for the "store" option in the OpenAI Responses API.
# When set to 1, vLLM's OpenAI server will retain the input and output
# messages for those requests in memory. By default, this is disabled (0).
# NOTE/WARNING:
# 1. Messages are kept in memory only (not persisted to disk) and will be
# lost when the vLLM server shuts down.
# 2. Enabling this option will cause a memory leak, as stored messages are
# never removed from memory until the server terminates.
"VLLM_ENABLE_RESPONSES_API_STORE":
lambda: bool(int(os.getenv("VLLM_ENABLE_RESPONSES_API_STORE", "0"))),
} }
# --8<-- [end:env-vars-definition] # --8<-- [end:env-vars-definition]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment