"vllm/vscode:/vscode.git/clone" did not exist on "98c12cffe57be141b64d47c82e65b64948446699"
Unverified commit fefce498, authored by Chauncey, committed by GitHub
Browse files

[Refactor] [6/N] to simplify the vLLM openai chat_completion serving architecture (#32240)


Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
parent a5bbbd2f
...@@ -22,6 +22,12 @@ from vllm.entrypoints.chat_utils import ( ...@@ -22,6 +22,12 @@ from vllm.entrypoints.chat_utils import (
ChatTemplateContentFormatOption, ChatTemplateContentFormatOption,
) )
from vllm.entrypoints.constants import MCP_PREFIX from vllm.entrypoints.constants import MCP_PREFIX
from vllm.entrypoints.openai.engine.protocol import (
FunctionCall,
ResponseInputOutputItem,
ResponseRawMessageAndToken,
ResponsesRequest,
)
from vllm.entrypoints.openai.parser.harmony_utils import ( from vllm.entrypoints.openai.parser.harmony_utils import (
get_encoding, get_encoding,
get_streamable_parser_for_assistant, get_streamable_parser_for_assistant,
...@@ -30,12 +36,6 @@ from vllm.entrypoints.openai.parser.harmony_utils import ( ...@@ -30,12 +36,6 @@ from vllm.entrypoints.openai.parser.harmony_utils import (
from vllm.entrypoints.openai.parser.responses_parser import ( from vllm.entrypoints.openai.parser.responses_parser import (
get_responses_parser_for_simple_context, get_responses_parser_for_simple_context,
) )
from vllm.entrypoints.openai.protocol import (
FunctionCall,
ResponseInputOutputItem,
ResponseRawMessageAndToken,
ResponsesRequest,
)
from vllm.entrypoints.responses_utils import construct_tool_dicts from vllm.entrypoints.responses_utils import construct_tool_dicts
from vllm.entrypoints.tool import Tool from vllm.entrypoints.tool import Tool
from vllm.entrypoints.tool_server import ToolServer from vllm.entrypoints.tool_server import ToolServer
......
...@@ -42,11 +42,9 @@ from vllm.entrypoints.anthropic.protocol import ( ...@@ -42,11 +42,9 @@ from vllm.entrypoints.anthropic.protocol import (
from vllm.entrypoints.anthropic.serving_messages import AnthropicServingMessages from vllm.entrypoints.anthropic.serving_messages import AnthropicServingMessages
from vllm.entrypoints.launcher import serve_http from vllm.entrypoints.launcher import serve_http
from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args
from vllm.entrypoints.openai.orca_metrics import metrics_header from vllm.entrypoints.openai.engine.protocol import (
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
ChatCompletionResponse,
CompletionRequest, CompletionRequest,
CompletionResponse, CompletionResponse,
ErrorInfo, ErrorInfo,
...@@ -59,9 +57,9 @@ from vllm.entrypoints.openai.protocol import ( ...@@ -59,9 +57,9 @@ from vllm.entrypoints.openai.protocol import (
TranslationRequest, TranslationRequest,
TranslationResponseVariant, TranslationResponseVariant,
) )
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.engine.serving import OpenAIServing
from vllm.entrypoints.openai.orca_metrics import metrics_header
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import ( from vllm.entrypoints.openai.serving_models import (
BaseModelPath, BaseModelPath,
OpenAIServingModels, OpenAIServingModels,
...@@ -475,47 +473,6 @@ async def create_messages(request: AnthropicMessagesRequest, raw_request: Reques ...@@ -475,47 +473,6 @@ async def create_messages(request: AnthropicMessagesRequest, raw_request: Reques
return StreamingResponse(content=generator, media_type="text/event-stream") return StreamingResponse(content=generator, media_type="text/event-stream")
@router.post(
    "/v1/chat/completions",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
    },
)
@with_cancellation
@load_aware_call
async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request):
    """Handle ``POST /v1/chat/completions``.

    Looks up the chat handler for this request and delegates to its
    ``create_chat_completion`` method. The outcome is mapped as follows:

    * no chat handler -> error response produced by the base handler;
    * ``ErrorResponse`` -> JSON body using the error's own code as status;
    * ``ChatCompletionResponse`` -> JSON body plus the load-metrics headers;
    * anything else -> streamed back as ``text/event-stream``.

    Raises:
        HTTPException: with status 500 when the handler itself raises.
    """
    # Client-requested format for the load-metrics response headers
    # (empty string when the header is absent).
    load_metrics_format = raw_request.headers.get(
        ENDPOINT_LOAD_METRICS_FORMAT_HEADER_LABEL, ""
    )

    serving_chat = chat(raw_request)
    if serving_chat is None:
        return base(raw_request).create_error_response(
            message="The model does not support Chat Completions API"
        )

    try:
        result = await serving_chat.create_chat_completion(request, raw_request)
    except Exception as exc:
        # Surface any handler failure to the client as a 500.
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(exc)
        ) from exc

    if isinstance(result, ErrorResponse):
        return JSONResponse(
            content=result.model_dump(), status_code=result.error.code
        )

    if isinstance(result, ChatCompletionResponse):
        return JSONResponse(
            content=result.model_dump(),
            headers=metrics_header(load_metrics_format),
        )

    # Neither an error nor a complete response: stream it.
    return StreamingResponse(content=result, media_type="text/event-stream")
@router.post( @router.post(
"/v1/completions", "/v1/completions",
dependencies=[Depends(validate_json_request)], dependencies=[Depends(validate_json_request)],
...@@ -735,8 +692,10 @@ class XRequestIdMiddleware: ...@@ -735,8 +692,10 @@ class XRequestIdMiddleware:
def _extract_content_from_chunk(chunk_data: dict) -> str: def _extract_content_from_chunk(chunk_data: dict) -> str:
"""Extract content from a streaming response chunk.""" """Extract content from a streaming response chunk."""
try: try:
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionStreamResponse, ChatCompletionStreamResponse,
)
from vllm.entrypoints.openai.engine.protocol import (
CompletionStreamResponse, CompletionStreamResponse,
) )
...@@ -880,7 +839,11 @@ def build_app(args: Namespace) -> FastAPI: ...@@ -880,7 +839,11 @@ def build_app(args: Namespace) -> FastAPI:
from vllm.entrypoints.serve import register_vllm_serve_api_routers from vllm.entrypoints.serve import register_vllm_serve_api_routers
register_vllm_serve_api_routers(app) register_vllm_serve_api_routers(app)
from vllm.entrypoints.openai.chat_completion.api_router import (
attach_router as register_chat_api_router,
)
register_chat_api_router(app)
from vllm.entrypoints.sagemaker.routes import register_sagemaker_routes from vllm.entrypoints.sagemaker.routes import register_sagemaker_routes
register_sagemaker_routes(router) register_sagemaker_routes(router)
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from http import HTTPStatus
from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse, StreamingResponse
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
ChatCompletionResponse,
)
from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.orca_metrics import metrics_header
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.utils import (
load_aware_call,
with_cancellation,
)
from vllm.logger import init_logger
# Module-level logger named after this module.
logger = init_logger(__name__)
# Router that carries the chat-completions route declared below; it is added
# to the FastAPI application by attach_router().
router = APIRouter()
# Name of the request header whose value is forwarded to metrics_header()
# when building the headers of a non-streaming chat completion response.
ENDPOINT_LOAD_METRICS_FORMAT_HEADER_LABEL = "endpoint-load-metrics-format"
def chat(request: Request) -> OpenAIServingChat | None:
    """Return the chat-completions handler stored on the application state.

    The value is read from ``request.app.state.openai_serving_chat``; the
    return annotation allows it to be ``None``.
    """
    app_state = request.app.state
    return app_state.openai_serving_chat
@router.post(
    "/v1/chat/completions",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
    },
)
@with_cancellation
@load_aware_call
async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request):
    """Handle ``POST /v1/chat/completions``.

    Looks up the chat handler via ``chat(raw_request)`` and delegates to its
    ``create_chat_completion`` method. The outcome is mapped as follows:

    * no chat handler -> error response built by the handler stored at
      ``app.state.openai_serving_tokenization``;
    * ``ErrorResponse`` -> JSON body using the error's own code as status;
    * ``ChatCompletionResponse`` -> JSON body plus the load-metrics headers;
    * anything else -> streamed back as ``text/event-stream``.

    Raises:
        HTTPException: with status 500 when the handler itself raises.
    """
    # Client-requested format for the load-metrics response headers
    # (empty string when the header is absent).
    load_metrics_format = raw_request.headers.get(
        ENDPOINT_LOAD_METRICS_FORMAT_HEADER_LABEL, ""
    )

    serving_chat = chat(raw_request)
    if serving_chat is None:
        # No chat handler is available, so another serving object from the
        # app state is used purely to produce the error payload.
        fallback_server = raw_request.app.state.openai_serving_tokenization
        return fallback_server.create_error_response(
            message="The model does not support Chat Completions API"
        )

    try:
        result = await serving_chat.create_chat_completion(request, raw_request)
    except Exception as exc:
        # Surface any handler failure to the client as a 500.
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(exc)
        ) from exc

    if isinstance(result, ErrorResponse):
        return JSONResponse(
            content=result.model_dump(), status_code=result.error.code
        )

    if isinstance(result, ChatCompletionResponse):
        return JSONResponse(
            content=result.model_dump(),
            headers=metrics_header(load_metrics_format),
        )

    # Neither an error nor a complete response: stream it.
    return StreamingResponse(content=result, media_type="text/event-stream")
def attach_router(app: FastAPI):
    """Include this module's ``router`` in the given FastAPI application."""
    app.include_router(router)
This diff is collapsed.
...@@ -23,16 +23,7 @@ from vllm.entrypoints.chat_utils import ( ...@@ -23,16 +23,7 @@ from vllm.entrypoints.chat_utils import (
make_tool_call_id, make_tool_call_id,
) )
from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.parser.harmony_utils import ( from vllm.entrypoints.openai.chat_completion.protocol import (
get_developer_message,
get_stop_tokens_for_assistant_actions,
get_streamable_parser_for_assistant,
get_system_message,
parse_chat_inputs_to_harmony_messages,
parse_chat_output,
render_for_completion,
)
from vllm.entrypoints.openai.protocol import (
ChatCompletionLogProb, ChatCompletionLogProb,
ChatCompletionLogProbs, ChatCompletionLogProbs,
ChatCompletionLogProbsContent, ChatCompletionLogProbsContent,
...@@ -43,6 +34,11 @@ from vllm.entrypoints.openai.protocol import ( ...@@ -43,6 +34,11 @@ from vllm.entrypoints.openai.protocol import (
ChatCompletionResponseStreamChoice, ChatCompletionResponseStreamChoice,
ChatCompletionStreamResponse, ChatCompletionStreamResponse,
ChatMessage, ChatMessage,
)
from vllm.entrypoints.openai.chat_completion.stream_harmony import (
extract_harmony_streaming_delta,
)
from vllm.entrypoints.openai.engine.protocol import (
DeltaFunctionCall, DeltaFunctionCall,
DeltaMessage, DeltaMessage,
DeltaToolCall, DeltaToolCall,
...@@ -52,14 +48,20 @@ from vllm.entrypoints.openai.protocol import ( ...@@ -52,14 +48,20 @@ from vllm.entrypoints.openai.protocol import (
ToolCall, ToolCall,
UsageInfo, UsageInfo,
) )
from vllm.entrypoints.openai.serving_chat_stream_harmony import ( from vllm.entrypoints.openai.engine.serving import (
extract_harmony_streaming_delta,
)
from vllm.entrypoints.openai.serving_engine import (
GenerationError, GenerationError,
OpenAIServing, OpenAIServing,
clamp_prompt_logprobs, clamp_prompt_logprobs,
) )
from vllm.entrypoints.openai.parser.harmony_utils import (
get_developer_message,
get_stop_tokens_for_assistant_actions,
get_streamable_parser_for_assistant,
get_system_message,
parse_chat_inputs_to_harmony_messages,
parse_chat_output,
render_for_completion,
)
from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.openai.utils import maybe_filter_parallel_tool_calls from vllm.entrypoints.openai.utils import maybe_filter_parallel_tool_calls
from vllm.entrypoints.utils import get_max_tokens, should_include_usage from vllm.entrypoints.utils import get_max_tokens, should_include_usage
......
...@@ -10,7 +10,7 @@ harmony parser state during streaming chat completions. ...@@ -10,7 +10,7 @@ harmony parser state during streaming chat completions.
from openai_harmony import StreamableParser from openai_harmony import StreamableParser
from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.chat_utils import make_tool_call_id
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.engine.protocol import (
DeltaFunctionCall, DeltaFunctionCall,
DeltaMessage, DeltaMessage,
DeltaToolCall, DeltaToolCall,
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
...@@ -38,22 +38,20 @@ from vllm.entrypoints.context import ( ...@@ -38,22 +38,20 @@ from vllm.entrypoints.context import (
StreamingHarmonyContext, StreamingHarmonyContext,
) )
from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionNamedToolChoiceParam, ChatCompletionNamedToolChoiceParam,
ChatCompletionRequest, ChatCompletionRequest,
ChatCompletionResponse, ChatCompletionResponse,
)
from vllm.entrypoints.openai.engine.protocol import (
CompletionRequest, CompletionRequest,
CompletionResponse, CompletionResponse,
DetokenizeRequest,
ErrorInfo, ErrorInfo,
ErrorResponse, ErrorResponse,
FunctionCall, FunctionCall,
FunctionDefinition, FunctionDefinition,
ResponseInputOutputItem, ResponseInputOutputItem,
ResponsesRequest, ResponsesRequest,
TokenizeChatRequest,
TokenizeCompletionRequest,
TokenizeResponse,
TranscriptionRequest, TranscriptionRequest,
TranscriptionResponse, TranscriptionResponse,
TranslationRequest, TranslationRequest,
...@@ -86,6 +84,12 @@ from vllm.entrypoints.responses_utils import ( ...@@ -86,6 +84,12 @@ from vllm.entrypoints.responses_utils import (
construct_input_messages, construct_input_messages,
) )
from vllm.entrypoints.serve.disagg.protocol import GenerateRequest, GenerateResponse from vllm.entrypoints.serve.disagg.protocol import GenerateRequest, GenerateResponse
from vllm.entrypoints.serve.tokenize.protocol import (
DetokenizeRequest,
TokenizeChatRequest,
TokenizeCompletionRequest,
TokenizeResponse,
)
from vllm.entrypoints.utils import _validate_truncation_size from vllm.entrypoints.utils import _validate_truncation_size
from vllm.inputs.data import PromptType, TokensPrompt from vllm.inputs.data import PromptType, TokensPrompt
from vllm.inputs.parse import ( from vllm.inputs.parse import (
......
...@@ -43,8 +43,8 @@ from openai_harmony import Message as OpenAIHarmonyMessage ...@@ -43,8 +43,8 @@ from openai_harmony import Message as OpenAIHarmonyMessage
from openai_harmony import Role as OpenAIHarmonyRole from openai_harmony import Role as OpenAIHarmonyRole
from vllm import envs from vllm import envs
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionToolsParam
ChatCompletionToolsParam, from vllm.entrypoints.openai.engine.protocol import (
ResponseInputOutputItem, ResponseInputOutputItem,
ResponsesRequest, ResponsesRequest,
) )
......
...@@ -16,7 +16,10 @@ from openai.types.responses.response_reasoning_item import ( ...@@ -16,7 +16,10 @@ from openai.types.responses.response_reasoning_item import (
) )
from vllm.entrypoints.constants import MCP_PREFIX from vllm.entrypoints.constants import MCP_PREFIX
from vllm.entrypoints.openai.protocol import ResponseInputOutputItem, ResponsesRequest from vllm.entrypoints.openai.engine.protocol import (
ResponseInputOutputItem,
ResponsesRequest,
)
from vllm.outputs import CompletionOutput from vllm.outputs import CompletionOutput
from vllm.reasoning.abs_reasoning_parsers import ReasoningParser from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
from vllm.tokenizers import TokenizerLike from vllm.tokenizers import TokenizerLike
......
...@@ -19,13 +19,15 @@ from tqdm import tqdm ...@@ -19,13 +19,15 @@ from tqdm import tqdm
from vllm.engine.arg_utils import AsyncEngineArgs, optional_type from vllm.engine.arg_utils import AsyncEngineArgs, optional_type
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest, ChatCompletionRequest,
ChatCompletionResponse, ChatCompletionResponse,
)
from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse, ErrorResponse,
OpenAIBaseModel, OpenAIBaseModel,
) )
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
from vllm.entrypoints.pooling.embed.protocol import EmbeddingRequest, EmbeddingResponse from vllm.entrypoints.pooling.embed.protocol import EmbeddingRequest, EmbeddingResponse
from vllm.entrypoints.pooling.embed.serving import OpenAIServingEmbedding from vllm.entrypoints.pooling.embed.serving import OpenAIServingEmbedding
......
...@@ -12,7 +12,7 @@ from fastapi import Request ...@@ -12,7 +12,7 @@ from fastapi import Request
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.engine.protocol import (
CompletionLogProbs, CompletionLogProbs,
CompletionRequest, CompletionRequest,
CompletionResponse, CompletionResponse,
...@@ -25,7 +25,7 @@ from vllm.entrypoints.openai.protocol import ( ...@@ -25,7 +25,7 @@ from vllm.entrypoints.openai.protocol import (
UsageInfo, UsageInfo,
VLLMValidationError, VLLMValidationError,
) )
from vllm.entrypoints.openai.serving_engine import ( from vllm.entrypoints.openai.engine.serving import (
GenerationError, GenerationError,
OpenAIServing, OpenAIServing,
clamp_prompt_logprobs, clamp_prompt_logprobs,
......
...@@ -7,7 +7,7 @@ from dataclasses import dataclass ...@@ -7,7 +7,7 @@ from dataclasses import dataclass
from http import HTTPStatus from http import HTTPStatus
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.engine.protocol import (
ErrorInfo, ErrorInfo,
ErrorResponse, ErrorResponse,
LoadLoRAAdapterRequest, LoadLoRAAdapterRequest,
......
...@@ -72,19 +72,7 @@ from vllm.entrypoints.context import ( ...@@ -72,19 +72,7 @@ from vllm.entrypoints.context import (
StreamingHarmonyContext, StreamingHarmonyContext,
) )
from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.parser.harmony_utils import ( from vllm.entrypoints.openai.engine.protocol import (
construct_harmony_previous_input_messages,
get_developer_message,
get_stop_tokens_for_assistant_actions,
get_system_message,
get_user_message,
has_custom_tools,
parse_output_message,
parse_remaining_state,
parse_response_input,
render_for_completion,
)
from vllm.entrypoints.openai.protocol import (
DeltaMessage, DeltaMessage,
ErrorResponse, ErrorResponse,
InputTokensDetails, InputTokensDetails,
...@@ -102,10 +90,22 @@ from vllm.entrypoints.openai.protocol import ( ...@@ -102,10 +90,22 @@ from vllm.entrypoints.openai.protocol import (
StreamingResponsesResponse, StreamingResponsesResponse,
VLLMValidationError, VLLMValidationError,
) )
from vllm.entrypoints.openai.serving_engine import ( from vllm.entrypoints.openai.engine.serving import (
GenerationError, GenerationError,
OpenAIServing, OpenAIServing,
) )
from vllm.entrypoints.openai.parser.harmony_utils import (
construct_harmony_previous_input_messages,
get_developer_message,
get_stop_tokens_for_assistant_actions,
get_system_message,
get_user_message,
has_custom_tools,
parse_output_message,
parse_remaining_state,
parse_response_input,
render_for_completion,
)
from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.responses_utils import ( from vllm.entrypoints.responses_utils import (
construct_input_messages, construct_input_messages,
......
...@@ -6,7 +6,7 @@ from fastapi import Request ...@@ -6,7 +6,7 @@ from fastapi import Request
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse, ErrorResponse,
RequestResponseMetadata, RequestResponseMetadata,
TranscriptionRequest, TranscriptionRequest,
......
...@@ -15,7 +15,7 @@ from transformers import PreTrainedTokenizerBase ...@@ -15,7 +15,7 @@ from transformers import PreTrainedTokenizerBase
import vllm.envs as envs import vllm.envs as envs
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.engine.protocol import (
DeltaMessage, DeltaMessage,
ErrorResponse, ErrorResponse,
RequestResponseMetadata, RequestResponseMetadata,
...@@ -32,7 +32,7 @@ from vllm.entrypoints.openai.protocol import ( ...@@ -32,7 +32,7 @@ from vllm.entrypoints.openai.protocol import (
UsageInfo, UsageInfo,
VLLMValidationError, VLLMValidationError,
) )
from vllm.entrypoints.openai.serving_engine import OpenAIServing, SpeechToTextRequest from vllm.entrypoints.openai.engine.serving import OpenAIServing, SpeechToTextRequest
from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.inputs.data import PromptType from vllm.inputs.data import PromptType
from vllm.logger import init_logger from vllm.logger import init_logger
......
...@@ -5,7 +5,7 @@ from typing import TypeVar ...@@ -5,7 +5,7 @@ from typing import TypeVar
from fastapi import Request from fastapi import Request
from fastapi.exceptions import RequestValidationError from fastapi.exceptions import RequestValidationError
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest, ChatCompletionRequest,
ChatCompletionResponseChoice, ChatCompletionResponseChoice,
ChatCompletionResponseStreamChoice, ChatCompletionResponseStreamChoice,
......
...@@ -6,7 +6,7 @@ from fastapi import APIRouter, Depends, HTTPException, Request ...@@ -6,7 +6,7 @@ from fastapi import APIRouter, Depends, HTTPException, Request
from starlette.responses import JSONResponse from starlette.responses import JSONResponse
from typing_extensions import assert_never from typing_extensions import assert_never
from vllm.entrypoints.openai.protocol import ErrorResponse from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.pooling.classify.protocol import ( from vllm.entrypoints.pooling.classify.protocol import (
ClassificationRequest, ClassificationRequest,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment