Unverified Commit 9312a6c0 authored by Chauncey's avatar Chauncey Committed by GitHub
Browse files

[Refactor] [8/N] to simplify the vLLM openai responsesapi_serving architecture (#32260)


Signed-off-by: default avatarchaunceyjiang <chaunceyjiang@gmail.com>
parent 6388b500
...@@ -7,7 +7,7 @@ import json ...@@ -7,7 +7,7 @@ import json
import pytest import pytest
from openai.types.responses import ResponseFunctionToolCall from openai.types.responses import ResponseFunctionToolCall
from vllm.entrypoints.openai.engine.protocol import ResponsesRequest from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
def test_function_call_dict_converted_to_object(): def test_function_call_dict_converted_to_object():
...@@ -253,7 +253,7 @@ def test_function_call_validation_failure_logs_debug(caplog): ...@@ -253,7 +253,7 @@ def test_function_call_validation_failure_logs_debug(caplog):
} }
# Mock the logger to verify debug was called # Mock the logger to verify debug was called
with patch("vllm.entrypoints.openai.engine.protocol.logger") as mock_logger: with patch("vllm.entrypoints.openai.responses.protocol.logger") as mock_logger:
with pytest.raises(ValueError): with pytest.raises(ValueError):
ResponsesRequest(**request_data) ResponsesRequest(**request_data)
......
...@@ -4,7 +4,7 @@ from openai_harmony import ( ...@@ -4,7 +4,7 @@ from openai_harmony import (
Message, Message,
) )
from vllm.entrypoints.openai.engine.protocol import ( from vllm.entrypoints.openai.responses.protocol import (
serialize_message, serialize_message,
serialize_messages, serialize_messages,
) )
......
...@@ -14,8 +14,9 @@ from openai.types.responses.tool import ( ...@@ -14,8 +14,9 @@ from openai.types.responses.tool import (
) )
from vllm.entrypoints.context import ConversationContext from vllm.entrypoints.context import ConversationContext
from vllm.entrypoints.openai.engine.protocol import ErrorResponse, ResponsesRequest from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.serving_responses import ( from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
from vllm.entrypoints.openai.responses.serving import (
OpenAIServingResponses, OpenAIServingResponses,
_extract_allowed_tools_from_mcp_requests, _extract_allowed_tools_from_mcp_requests,
extract_tool_types, extract_tool_types,
......
...@@ -24,9 +24,6 @@ from vllm.entrypoints.chat_utils import ( ...@@ -24,9 +24,6 @@ from vllm.entrypoints.chat_utils import (
from vllm.entrypoints.constants import MCP_PREFIX from vllm.entrypoints.constants import MCP_PREFIX
from vllm.entrypoints.openai.engine.protocol import ( from vllm.entrypoints.openai.engine.protocol import (
FunctionCall, FunctionCall,
ResponseInputOutputItem,
ResponseRawMessageAndToken,
ResponsesRequest,
) )
from vllm.entrypoints.openai.parser.harmony_utils import ( from vllm.entrypoints.openai.parser.harmony_utils import (
get_encoding, get_encoding,
...@@ -36,6 +33,11 @@ from vllm.entrypoints.openai.parser.harmony_utils import ( ...@@ -36,6 +33,11 @@ from vllm.entrypoints.openai.parser.harmony_utils import (
from vllm.entrypoints.openai.parser.responses_parser import ( from vllm.entrypoints.openai.parser.responses_parser import (
get_responses_parser_for_simple_context, get_responses_parser_for_simple_context,
) )
from vllm.entrypoints.openai.responses.protocol import (
ResponseInputOutputItem,
ResponseRawMessageAndToken,
ResponsesRequest,
)
from vllm.entrypoints.responses_utils import construct_tool_dicts from vllm.entrypoints.responses_utils import construct_tool_dicts
from vllm.entrypoints.tool import Tool from vllm.entrypoints.tool import Tool
from vllm.entrypoints.tool_server import ToolServer from vllm.entrypoints.tool_server import ToolServer
......
...@@ -14,7 +14,7 @@ import socket ...@@ -14,7 +14,7 @@ import socket
import tempfile import tempfile
import uuid import uuid
from argparse import Namespace from argparse import Namespace
from collections.abc import AsyncGenerator, AsyncIterator, Awaitable from collections.abc import AsyncIterator, Awaitable
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from http import HTTPStatus from http import HTTPStatus
from typing import Annotated, Any from typing import Annotated, Any
...@@ -49,9 +49,6 @@ from vllm.entrypoints.openai.engine.protocol import ( ...@@ -49,9 +49,6 @@ from vllm.entrypoints.openai.engine.protocol import (
CompletionResponse, CompletionResponse,
ErrorInfo, ErrorInfo,
ErrorResponse, ErrorResponse,
ResponsesRequest,
ResponsesResponse,
StreamingResponsesResponse,
TranscriptionRequest, TranscriptionRequest,
TranscriptionResponseVariant, TranscriptionResponseVariant,
TranslationRequest, TranslationRequest,
...@@ -59,12 +56,12 @@ from vllm.entrypoints.openai.engine.protocol import ( ...@@ -59,12 +56,12 @@ from vllm.entrypoints.openai.engine.protocol import (
) )
from vllm.entrypoints.openai.engine.serving import OpenAIServing from vllm.entrypoints.openai.engine.serving import OpenAIServing
from vllm.entrypoints.openai.orca_metrics import metrics_header from vllm.entrypoints.openai.orca_metrics import metrics_header
from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_models import ( from vllm.entrypoints.openai.serving_models import (
BaseModelPath, BaseModelPath,
OpenAIServingModels, OpenAIServingModels,
) )
from vllm.entrypoints.openai.serving_responses import OpenAIServingResponses
from vllm.entrypoints.openai.serving_transcription import ( from vllm.entrypoints.openai.serving_transcription import (
OpenAIServingTranscription, OpenAIServingTranscription,
OpenAIServingTranslation, OpenAIServingTranslation,
...@@ -311,112 +308,6 @@ async def show_version(): ...@@ -311,112 +308,6 @@ async def show_version():
return JSONResponse(content=ver) return JSONResponse(content=ver)
async def _convert_stream_to_sse_events(
generator: AsyncGenerator[StreamingResponsesResponse, None],
) -> AsyncGenerator[str, None]:
"""Convert the generator to a stream of events in SSE format"""
async for event in generator:
event_type = getattr(event, "type", "unknown")
# https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#event_stream_format
event_data = (
f"event: {event_type}\ndata: {event.model_dump_json(indent=None)}\n\n"
)
yield event_data
@router.post(
"/v1/responses",
dependencies=[Depends(validate_json_request)],
responses={
HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
},
)
@with_cancellation
async def create_responses(request: ResponsesRequest, raw_request: Request):
handler = responses(raw_request)
if handler is None:
return base(raw_request).create_error_response(
message="The model does not support Responses API"
)
try:
generator = await handler.create_responses(request, raw_request)
except Exception as e:
raise HTTPException(
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
) from e
if isinstance(generator, ErrorResponse):
return JSONResponse(
content=generator.model_dump(), status_code=generator.error.code
)
elif isinstance(generator, ResponsesResponse):
return JSONResponse(content=generator.model_dump())
return StreamingResponse(
content=_convert_stream_to_sse_events(generator), media_type="text/event-stream"
)
@router.get("/v1/responses/{response_id}")
async def retrieve_responses(
response_id: str,
raw_request: Request,
starting_after: int | None = None,
stream: bool | None = False,
):
handler = responses(raw_request)
if handler is None:
return base(raw_request).create_error_response(
message="The model does not support Responses API"
)
try:
response = await handler.retrieve_responses(
response_id,
starting_after=starting_after,
stream=stream,
)
except Exception as e:
raise HTTPException(
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
) from e
if isinstance(response, ErrorResponse):
return JSONResponse(
content=response.model_dump(), status_code=response.error.code
)
elif isinstance(response, ResponsesResponse):
return JSONResponse(content=response.model_dump())
return StreamingResponse(
content=_convert_stream_to_sse_events(response), media_type="text/event-stream"
)
@router.post("/v1/responses/{response_id}/cancel")
async def cancel_responses(response_id: str, raw_request: Request):
handler = responses(raw_request)
if handler is None:
return base(raw_request).create_error_response(
message="The model does not support Responses API"
)
try:
response = await handler.cancel_responses(response_id)
except Exception as e:
raise HTTPException(
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
) from e
if isinstance(response, ErrorResponse):
return JSONResponse(
content=response.model_dump(), status_code=response.error.code
)
return JSONResponse(content=response.model_dump())
@router.post( @router.post(
"/v1/messages", "/v1/messages",
dependencies=[Depends(validate_json_request)], dependencies=[Depends(validate_json_request)],
...@@ -844,6 +735,12 @@ def build_app(args: Namespace) -> FastAPI: ...@@ -844,6 +735,12 @@ def build_app(args: Namespace) -> FastAPI:
) )
register_chat_api_router(app) register_chat_api_router(app)
from vllm.entrypoints.openai.responses.api_router import (
attach_router as register_responses_api_router,
)
register_responses_api_router(app)
from vllm.entrypoints.sagemaker.routes import register_sagemaker_routes from vllm.entrypoints.sagemaker.routes import register_sagemaker_routes
register_sagemaker_routes(router) register_sagemaker_routes(router)
......
This diff is collapsed.
...@@ -50,13 +50,15 @@ from vllm.entrypoints.openai.engine.protocol import ( ...@@ -50,13 +50,15 @@ from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse, ErrorResponse,
FunctionCall, FunctionCall,
FunctionDefinition, FunctionDefinition,
ResponseInputOutputItem,
ResponsesRequest,
TranscriptionRequest, TranscriptionRequest,
TranscriptionResponse, TranscriptionResponse,
TranslationRequest, TranslationRequest,
VLLMValidationError, VLLMValidationError,
) )
from vllm.entrypoints.openai.responses.protocol import (
ResponseInputOutputItem,
ResponsesRequest,
)
from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.pooling.classify.protocol import ( from vllm.entrypoints.pooling.classify.protocol import (
ClassificationChatRequest, ClassificationChatRequest,
......
...@@ -44,7 +44,7 @@ from openai_harmony import Role as OpenAIHarmonyRole ...@@ -44,7 +44,7 @@ from openai_harmony import Role as OpenAIHarmonyRole
from vllm import envs from vllm import envs
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionToolsParam from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionToolsParam
from vllm.entrypoints.openai.engine.protocol import ( from vllm.entrypoints.openai.responses.protocol import (
ResponseInputOutputItem, ResponseInputOutputItem,
ResponsesRequest, ResponsesRequest,
) )
......
...@@ -16,7 +16,7 @@ from openai.types.responses.response_reasoning_item import ( ...@@ -16,7 +16,7 @@ from openai.types.responses.response_reasoning_item import (
) )
from vllm.entrypoints.constants import MCP_PREFIX from vllm.entrypoints.constants import MCP_PREFIX
from vllm.entrypoints.openai.engine.protocol import ( from vllm.entrypoints.openai.responses.protocol import (
ResponseInputOutputItem, ResponseInputOutputItem,
ResponsesRequest, ResponsesRequest,
) )
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import AsyncGenerator
from http import HTTPStatus
from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse, StreamingResponse
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.responses.protocol import (
ResponsesRequest,
ResponsesResponse,
StreamingResponsesResponse,
)
from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.utils import (
with_cancellation,
)
from vllm.logger import init_logger
logger = init_logger(__name__)
router = APIRouter()
def responses(request: Request) -> OpenAIServingResponses | None:
return request.app.state.openai_serving_responses
async def _convert_stream_to_sse_events(
generator: AsyncGenerator[StreamingResponsesResponse, None],
) -> AsyncGenerator[str, None]:
"""Convert the generator to a stream of events in SSE format"""
async for event in generator:
event_type = getattr(event, "type", "unknown")
# https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#event_stream_format
event_data = (
f"event: {event_type}\ndata: {event.model_dump_json(indent=None)}\n\n"
)
yield event_data
@router.post(
"/v1/responses",
dependencies=[Depends(validate_json_request)],
responses={
HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
},
)
@with_cancellation
async def create_responses(request: ResponsesRequest, raw_request: Request):
handler = responses(raw_request)
if handler is None:
base_server = raw_request.app.state.openai_serving_tokenization
return base_server.create_error_response(
message="The model does not support Responses API"
)
try:
generator = await handler.create_responses(request, raw_request)
except Exception as e:
raise HTTPException(
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
) from e
if isinstance(generator, ErrorResponse):
return JSONResponse(
content=generator.model_dump(), status_code=generator.error.code
)
elif isinstance(generator, ResponsesResponse):
return JSONResponse(content=generator.model_dump())
return StreamingResponse(
content=_convert_stream_to_sse_events(generator), media_type="text/event-stream"
)
@router.get("/v1/responses/{response_id}")
async def retrieve_responses(
response_id: str,
raw_request: Request,
starting_after: int | None = None,
stream: bool | None = False,
):
handler = responses(raw_request)
if handler is None:
base_server = raw_request.app.state.openai_serving_tokenization
return base_server.create_error_response(
message="The model does not support Responses API"
)
try:
response = await handler.retrieve_responses(
response_id,
starting_after=starting_after,
stream=stream,
)
except Exception as e:
raise HTTPException(
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
) from e
if isinstance(response, ErrorResponse):
return JSONResponse(
content=response.model_dump(), status_code=response.error.code
)
elif isinstance(response, ResponsesResponse):
return JSONResponse(content=response.model_dump())
return StreamingResponse(
content=_convert_stream_to_sse_events(response), media_type="text/event-stream"
)
@router.post("/v1/responses/{response_id}/cancel")
async def cancel_responses(response_id: str, raw_request: Request):
handler = responses(raw_request)
if handler is None:
base_server = raw_request.app.state.openai_serving_tokenization
return base_server.create_error_response(
message="The model does not support Responses API"
)
try:
response = await handler.cancel_responses(response_id)
except Exception as e:
raise HTTPException(
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
) from e
if isinstance(response, ErrorResponse):
return JSONResponse(
content=response.model_dump(), status_code=response.error.code
)
return JSONResponse(content=response.model_dump())
def attach_router(app: FastAPI):
app.include_router(router)
This diff is collapsed.
...@@ -75,19 +75,7 @@ from vllm.entrypoints.logger import RequestLogger ...@@ -75,19 +75,7 @@ from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.engine.protocol import ( from vllm.entrypoints.openai.engine.protocol import (
DeltaMessage, DeltaMessage,
ErrorResponse, ErrorResponse,
InputTokensDetails,
OutputTokensDetails,
RequestResponseMetadata, RequestResponseMetadata,
ResponseCompletedEvent,
ResponseCreatedEvent,
ResponseInProgressEvent,
ResponseInputOutputMessage,
ResponseReasoningPartAddedEvent,
ResponseReasoningPartDoneEvent,
ResponsesRequest,
ResponsesResponse,
ResponseUsage,
StreamingResponsesResponse,
VLLMValidationError, VLLMValidationError,
) )
from vllm.entrypoints.openai.engine.serving import ( from vllm.entrypoints.openai.engine.serving import (
...@@ -106,6 +94,20 @@ from vllm.entrypoints.openai.parser.harmony_utils import ( ...@@ -106,6 +94,20 @@ from vllm.entrypoints.openai.parser.harmony_utils import (
parse_response_input, parse_response_input,
render_for_completion, render_for_completion,
) )
from vllm.entrypoints.openai.responses.protocol import (
InputTokensDetails,
OutputTokensDetails,
ResponseCompletedEvent,
ResponseCreatedEvent,
ResponseInProgressEvent,
ResponseInputOutputMessage,
ResponseReasoningPartAddedEvent,
ResponseReasoningPartDoneEvent,
ResponsesRequest,
ResponsesResponse,
ResponseUsage,
StreamingResponsesResponse,
)
from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.responses_utils import ( from vllm.entrypoints.responses_utils import (
construct_input_messages, construct_input_messages,
...@@ -590,7 +592,6 @@ class OpenAIServingResponses(OpenAIServing): ...@@ -590,7 +592,6 @@ class OpenAIServingResponses(OpenAIServing):
prev_msg=self.msg_store.get(prev_response.id) if prev_response else None, prev_msg=self.msg_store.get(prev_response.id) if prev_response else None,
prev_response_output=prev_response.output if prev_response else None, prev_response_output=prev_response.output if prev_response else None,
) )
# Check if we should continue the final message (partial completion) # Check if we should continue the final message (partial completion)
# This enables Anthropic-style partial message completion where the # This enables Anthropic-style partial message completion where the
# user provides an incomplete assistant message to continue from. # user provides an incomplete assistant message to continue from.
......
...@@ -22,10 +22,8 @@ from openai.types.responses.tool import Tool ...@@ -22,10 +22,8 @@ from openai.types.responses.tool import Tool
from vllm import envs from vllm import envs
from vllm.entrypoints.constants import MCP_PREFIX from vllm.entrypoints.constants import MCP_PREFIX
from vllm.entrypoints.openai.engine.protocol import ( from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionMessageParam
ChatCompletionMessageParam, from vllm.entrypoints.openai.responses.protocol import ResponseInputOutputItem
ResponseInputOutputItem,
)
def should_continue_final_message( def should_continue_final_message(
......
...@@ -19,6 +19,8 @@ if TYPE_CHECKING: ...@@ -19,6 +19,8 @@ if TYPE_CHECKING:
) )
from vllm.entrypoints.openai.engine.protocol import ( from vllm.entrypoints.openai.engine.protocol import (
DeltaMessage, DeltaMessage,
)
from vllm.entrypoints.openai.responses.protocol import (
ResponsesRequest, ResponsesRequest,
) )
from vllm.tokenizers import TokenizerLike from vllm.tokenizers import TokenizerLike
......
...@@ -13,7 +13,7 @@ if TYPE_CHECKING: ...@@ -13,7 +13,7 @@ if TYPE_CHECKING:
from vllm.entrypoints.openai.chat_completion.protocol import ( from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest, ChatCompletionRequest,
) )
from vllm.entrypoints.openai.engine.protocol import ( from vllm.entrypoints.openai.responses.protocol import (
ResponsesRequest, ResponsesRequest,
) )
else: else:
......
...@@ -8,6 +8,8 @@ from vllm.entrypoints.openai.chat_completion.protocol import ( ...@@ -8,6 +8,8 @@ from vllm.entrypoints.openai.chat_completion.protocol import (
) )
from vllm.entrypoints.openai.engine.protocol import ( from vllm.entrypoints.openai.engine.protocol import (
DeltaMessage, DeltaMessage,
)
from vllm.entrypoints.openai.responses.protocol import (
ResponsesRequest, ResponsesRequest,
) )
from vllm.logger import init_logger from vllm.logger import init_logger
......
...@@ -6,7 +6,7 @@ from functools import cached_property ...@@ -6,7 +6,7 @@ from functools import cached_property
from vllm.entrypoints.openai.chat_completion.protocol import ( from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest, ChatCompletionRequest,
) )
from vllm.entrypoints.openai.engine.protocol import ( from vllm.entrypoints.openai.responses.protocol import (
ResponsesRequest, ResponsesRequest,
) )
from vllm.logger import init_logger from vllm.logger import init_logger
......
...@@ -15,6 +15,8 @@ from vllm.entrypoints.openai.chat_completion.protocol import ( ...@@ -15,6 +15,8 @@ from vllm.entrypoints.openai.chat_completion.protocol import (
) )
from vllm.entrypoints.openai.engine.protocol import ( from vllm.entrypoints.openai.engine.protocol import (
DeltaMessage, DeltaMessage,
)
from vllm.entrypoints.openai.responses.protocol import (
ResponsesRequest, ResponsesRequest,
) )
from vllm.logger import init_logger from vllm.logger import init_logger
......
...@@ -4,7 +4,9 @@ ...@@ -4,7 +4,9 @@
from vllm.entrypoints.openai.chat_completion.protocol import ( from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest, ChatCompletionRequest,
) )
from vllm.entrypoints.openai.engine.protocol import ResponsesRequest from vllm.entrypoints.openai.responses.protocol import (
ResponsesRequest,
)
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment