Unverified commit 4c1c501a, authored by Chauncey, committed by GitHub
Browse files

[Refactor] [10/N] to simplify the vLLM openai completion serving architecture (#32369)


Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
parent ae1eba6a
...@@ -43,20 +43,21 @@ from vllm.entrypoints.openai.chat_completion.protocol import ( ...@@ -43,20 +43,21 @@ from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest, ChatCompletionRequest,
ChatCompletionResponse, ChatCompletionResponse,
) )
from vllm.entrypoints.openai.engine.protocol import ( from vllm.entrypoints.openai.completion.protocol import (
CompletionRequest, CompletionRequest,
CompletionResponse, CompletionResponse,
)
from vllm.entrypoints.openai.engine.protocol import (
ErrorInfo, ErrorInfo,
ErrorResponse, ErrorResponse,
FunctionCall, FunctionCall,
FunctionDefinition, FunctionDefinition,
VLLMValidationError,
) )
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.openai.responses.protocol import ( from vllm.entrypoints.openai.responses.protocol import (
ResponseInputOutputItem, ResponseInputOutputItem,
ResponsesRequest, ResponsesRequest,
) )
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.openai.translations.protocol import ( from vllm.entrypoints.openai.translations.protocol import (
TranscriptionRequest, TranscriptionRequest,
TranscriptionResponse, TranscriptionResponse,
...@@ -95,6 +96,7 @@ from vllm.entrypoints.serve.tokenize.protocol import ( ...@@ -95,6 +96,7 @@ from vllm.entrypoints.serve.tokenize.protocol import (
TokenizeResponse, TokenizeResponse,
) )
from vllm.entrypoints.utils import _validate_truncation_size, sanitize_message from vllm.entrypoints.utils import _validate_truncation_size, sanitize_message
from vllm.exceptions import VLLMValidationError
from vllm.inputs.data import PromptType, TokensPrompt from vllm.inputs.data import PromptType, TokensPrompt
from vllm.inputs.parse import ( from vllm.inputs.parse import (
PromptComponents, PromptComponents,
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from fastapi import APIRouter, FastAPI, Request
from fastapi.responses import JSONResponse
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.logger import init_logger
logger = init_logger(__name__)
router = APIRouter()
def models(request: Request) -> OpenAIServingModels:
    """Return the models handler stored on the application state.

    Args:
        request: Incoming request whose ``app.state`` carries the handler.

    Returns:
        The object found at ``request.app.state.openai_serving_models``.
    """
    # The handler is read from app state on every call; nothing is cached here.
    return request.app.state.openai_serving_models
@router.get("/v1/models")
async def show_available_models(raw_request: Request):
    """Handle ``GET /v1/models``.

    Resolves the models handler for this request via ``models()``, awaits its
    ``show_available_models()`` coroutine and serialises the result.

    Args:
        raw_request: The incoming request, used only to locate the handler.

    Returns:
        A ``JSONResponse`` whose content is the ``model_dump()`` of the
        handler's result.
    """
    handler = models(raw_request)
    models_ = await handler.show_available_models()
    return JSONResponse(content=models_.model_dump())
def attach_router(app: FastAPI):
    """Register this module's ``router`` on ``app``.

    Args:
        app: The FastAPI application that should include the router.
    """
    app.include_router(router)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
@dataclass
class BaseModelPath:
    """Pair a model ``name`` with its ``model_path``.

    Both fields are required strings; the dataclass adds no validation.
    """

    # Identifier for the model (presumably the served name; confirm with callers).
    name: str
    # Location of the model (presumably a local path or hub id; confirm with callers).
    model_path: str
@dataclass
class LoRAModulePath:
name: str
path: str
base_model_name: str | None = None
...@@ -3,7 +3,6 @@ ...@@ -3,7 +3,6 @@
from asyncio import Lock from asyncio import Lock
from collections import defaultdict from collections import defaultdict
from dataclasses import dataclass
from http import HTTPStatus from http import HTTPStatus
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
...@@ -14,6 +13,7 @@ from vllm.entrypoints.openai.engine.protocol import ( ...@@ -14,6 +13,7 @@ from vllm.entrypoints.openai.engine.protocol import (
ModelList, ModelList,
ModelPermission, ModelPermission,
) )
from vllm.entrypoints.openai.models.protocol import BaseModelPath, LoRAModulePath
from vllm.entrypoints.serve.lora.protocol import ( from vllm.entrypoints.serve.lora.protocol import (
LoadLoRAAdapterRequest, LoadLoRAAdapterRequest,
UnloadLoRAAdapterRequest, UnloadLoRAAdapterRequest,
...@@ -27,19 +27,6 @@ from vllm.utils.counter import AtomicCounter ...@@ -27,19 +27,6 @@ from vllm.utils.counter import AtomicCounter
logger = init_logger(__name__) logger = init_logger(__name__)
@dataclass
class BaseModelPath:
    """Pair a model ``name`` with its ``model_path``.

    Both fields are required strings; the dataclass adds no validation.
    """

    # Identifier for the model (presumably the served name; confirm with callers).
    name: str
    # Location of the model (presumably a local path or hub id; confirm with callers).
    model_path: str
@dataclass
class LoRAModulePath:
name: str
path: str
base_model_name: str | None = None
class OpenAIServingModels: class OpenAIServingModels:
"""Shared instance to hold data about the loaded base model(s) and adapters. """Shared instance to hold data about the loaded base model(s) and adapters.
......
...@@ -76,12 +76,12 @@ from vllm.entrypoints.openai.engine.protocol import ( ...@@ -76,12 +76,12 @@ from vllm.entrypoints.openai.engine.protocol import (
DeltaMessage, DeltaMessage,
ErrorResponse, ErrorResponse,
RequestResponseMetadata, RequestResponseMetadata,
VLLMValidationError,
) )
from vllm.entrypoints.openai.engine.serving import ( from vllm.entrypoints.openai.engine.serving import (
GenerationError, GenerationError,
OpenAIServing, OpenAIServing,
) )
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.openai.parser.harmony_utils import ( from vllm.entrypoints.openai.parser.harmony_utils import (
construct_harmony_previous_input_messages, construct_harmony_previous_input_messages,
get_developer_message, get_developer_message,
...@@ -108,7 +108,6 @@ from vllm.entrypoints.openai.responses.protocol import ( ...@@ -108,7 +108,6 @@ from vllm.entrypoints.openai.responses.protocol import (
ResponseUsage, ResponseUsage,
StreamingResponsesResponse, StreamingResponsesResponse,
) )
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.responses_utils import ( from vllm.entrypoints.responses_utils import (
construct_input_messages, construct_input_messages,
construct_tool_dicts, construct_tool_dicts,
...@@ -116,6 +115,7 @@ from vllm.entrypoints.responses_utils import ( ...@@ -116,6 +115,7 @@ from vllm.entrypoints.responses_utils import (
should_continue_final_message, should_continue_final_message,
) )
from vllm.entrypoints.tool_server import ToolServer from vllm.entrypoints.tool_server import ToolServer
from vllm.exceptions import VLLMValidationError
from vllm.inputs.data import TokensPrompt from vllm.inputs.data import TokensPrompt
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.logprobs import Logprob as SampleLogprob from vllm.logprobs import Logprob as SampleLogprob
......
...@@ -28,7 +28,8 @@ from vllm.entrypoints.openai.engine.protocol import ( ...@@ -28,7 +28,8 @@ from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse, ErrorResponse,
OpenAIBaseModel, OpenAIBaseModel,
) )
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.pooling.embed.protocol import EmbeddingRequest, EmbeddingResponse from vllm.entrypoints.pooling.embed.protocol import EmbeddingRequest, EmbeddingResponse
from vllm.entrypoints.pooling.embed.serving import OpenAIServingEmbedding from vllm.entrypoints.pooling.embed.serving import OpenAIServingEmbedding
from vllm.entrypoints.pooling.score.protocol import ( from vllm.entrypoints.pooling.score.protocol import (
......
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
from http import HTTPStatus from http import HTTPStatus
from typing import Annotated from typing import Annotated
from fastapi import APIRouter, FastAPI, Form, HTTPException, Request from fastapi import APIRouter, FastAPI, Form, Request
from fastapi.responses import JSONResponse, StreamingResponse from fastapi.responses import JSONResponse, StreamingResponse
from vllm.entrypoints.openai.engine.protocol import ErrorResponse from vllm.entrypoints.openai.engine.protocol import ErrorResponse
...@@ -63,10 +63,7 @@ async def create_transcriptions( ...@@ -63,10 +63,7 @@ async def create_transcriptions(
try: try:
generator = await handler.create_transcription(audio_data, request, raw_request) generator = await handler.create_transcription(audio_data, request, raw_request)
except Exception as e: except Exception as e:
raise HTTPException( return handler.create_error_response(e)
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
) from e
if isinstance(generator, ErrorResponse): if isinstance(generator, ErrorResponse):
return JSONResponse( return JSONResponse(
content=generator.model_dump(), status_code=generator.error.code content=generator.model_dump(), status_code=generator.error.code
...@@ -103,9 +100,7 @@ async def create_translations( ...@@ -103,9 +100,7 @@ async def create_translations(
try: try:
generator = await handler.create_translation(audio_data, request, raw_request) generator = await handler.create_translation(audio_data, request, raw_request)
except Exception as e: except Exception as e:
raise HTTPException( return handler.create_error_response(e)
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
) from e
if isinstance(generator, ErrorResponse): if isinstance(generator, ErrorResponse):
return JSONResponse( return JSONResponse(
......
...@@ -10,7 +10,7 @@ from vllm.entrypoints.openai.engine.protocol import ( ...@@ -10,7 +10,7 @@ from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse, ErrorResponse,
RequestResponseMetadata, RequestResponseMetadata,
) )
from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.openai.translations.protocol import ( from vllm.entrypoints.openai.translations.protocol import (
TranscriptionRequest, TranscriptionRequest,
TranscriptionResponse, TranscriptionResponse,
......
...@@ -22,7 +22,7 @@ from vllm.entrypoints.openai.engine.protocol import ( ...@@ -22,7 +22,7 @@ from vllm.entrypoints.openai.engine.protocol import (
UsageInfo, UsageInfo,
) )
from vllm.entrypoints.openai.engine.serving import OpenAIServing, SpeechToTextRequest from vllm.entrypoints.openai.engine.serving import OpenAIServing, SpeechToTextRequest
from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.openai.translations.protocol import ( from vllm.entrypoints.openai.translations.protocol import (
TranscriptionResponse, TranscriptionResponse,
TranscriptionResponseStreamChoice, TranscriptionResponseStreamChoice,
......
...@@ -23,7 +23,7 @@ from vllm.entrypoints.openai.engine.serving import ( ...@@ -23,7 +23,7 @@ from vllm.entrypoints.openai.engine.serving import (
OpenAIServing, OpenAIServing,
ServeContext, ServeContext,
) )
from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.pooling.classify.protocol import ( from vllm.entrypoints.pooling.classify.protocol import (
ClassificationChatRequest, ClassificationChatRequest,
ClassificationCompletionRequest, ClassificationCompletionRequest,
......
...@@ -21,7 +21,7 @@ from vllm.entrypoints.openai.engine.serving import ( ...@@ -21,7 +21,7 @@ from vllm.entrypoints.openai.engine.serving import (
OpenAIServing, OpenAIServing,
ServeContext, ServeContext,
) )
from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.pooling.embed.protocol import ( from vllm.entrypoints.pooling.embed.protocol import (
EmbeddingBytesResponse, EmbeddingBytesResponse,
EmbeddingChatRequest, EmbeddingChatRequest,
......
...@@ -19,7 +19,7 @@ from vllm.entrypoints.openai.engine.protocol import ( ...@@ -19,7 +19,7 @@ from vllm.entrypoints.openai.engine.protocol import (
UsageInfo, UsageInfo,
) )
from vllm.entrypoints.openai.engine.serving import OpenAIServing from vllm.entrypoints.openai.engine.serving import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.pooling.pooling.protocol import ( from vllm.entrypoints.pooling.pooling.protocol import (
IOProcessorRequest, IOProcessorRequest,
IOProcessorResponse, IOProcessorResponse,
......
...@@ -14,7 +14,7 @@ from vllm.entrypoints.openai.engine.protocol import ( ...@@ -14,7 +14,7 @@ from vllm.entrypoints.openai.engine.protocol import (
UsageInfo, UsageInfo,
) )
from vllm.entrypoints.openai.engine.serving import OpenAIServing from vllm.entrypoints.openai.engine.serving import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.pooling.score.protocol import ( from vllm.entrypoints.pooling.score.protocol import (
RerankDocument, RerankDocument,
RerankRequest, RerankRequest,
......
...@@ -12,22 +12,26 @@ from fastapi.responses import JSONResponse, Response ...@@ -12,22 +12,26 @@ from fastapi.responses import JSONResponse, Response
from vllm.entrypoints.openai.api_server import ( from vllm.entrypoints.openai.api_server import (
base, base,
chat,
completion,
create_completion,
validate_json_request,
) )
from vllm.entrypoints.openai.chat_completion.api_router import ( from vllm.entrypoints.openai.chat_completion.api_router import (
chat,
create_chat_completion, create_chat_completion,
) )
from vllm.entrypoints.openai.chat_completion.protocol import ( from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest, ChatCompletionRequest,
) )
from vllm.entrypoints.openai.engine.protocol import ( from vllm.entrypoints.openai.completion.api_router import (
completion,
create_completion,
)
from vllm.entrypoints.openai.completion.protocol import (
CompletionRequest, CompletionRequest,
)
from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse, ErrorResponse,
) )
from vllm.entrypoints.openai.engine.serving import OpenAIServing from vllm.entrypoints.openai.engine.serving import OpenAIServing
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.pooling.classify.api_router import classify, create_classify from vllm.entrypoints.pooling.classify.api_router import classify, create_classify
from vllm.entrypoints.pooling.classify.protocol import ClassificationRequest from vllm.entrypoints.pooling.classify.protocol import ClassificationRequest
from vllm.entrypoints.pooling.embed.api_router import create_embedding, embedding from vllm.entrypoints.pooling.embed.api_router import create_embedding, embedding
......
...@@ -10,10 +10,10 @@ from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request, Respons ...@@ -10,10 +10,10 @@ from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request, Respons
from fastapi.responses import JSONResponse, StreamingResponse from fastapi.responses import JSONResponse, StreamingResponse
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.openai.api_server import validate_json_request
from vllm.entrypoints.openai.engine.protocol import ( from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse, ErrorResponse,
) )
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.serve.disagg.protocol import ( from vllm.entrypoints.serve.disagg.protocol import (
GenerateRequest, GenerateRequest,
GenerateResponse, GenerateResponse,
......
...@@ -6,10 +6,10 @@ from pydantic import BaseModel, Field ...@@ -6,10 +6,10 @@ from pydantic import BaseModel, Field
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionLogProbs from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionLogProbs
from vllm.entrypoints.openai.engine.protocol import ( from vllm.entrypoints.openai.engine.protocol import (
Logprob,
SamplingParams, SamplingParams,
StreamOptions, StreamOptions,
) )
from vllm.logprobs import Logprob
from vllm.utils import random_uuid from vllm.utils import random_uuid
......
...@@ -23,7 +23,7 @@ from vllm.entrypoints.openai.engine.protocol import ( ...@@ -23,7 +23,7 @@ from vllm.entrypoints.openai.engine.protocol import (
UsageInfo, UsageInfo,
) )
from vllm.entrypoints.openai.engine.serving import OpenAIServing, clamp_prompt_logprobs from vllm.entrypoints.openai.engine.serving import OpenAIServing, clamp_prompt_logprobs
from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.serve.disagg.protocol import ( from vllm.entrypoints.serve.disagg.protocol import (
GenerateRequest, GenerateRequest,
GenerateResponse, GenerateResponse,
......
...@@ -9,10 +9,10 @@ from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request ...@@ -9,10 +9,10 @@ from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse from fastapi.responses import JSONResponse
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.openai.api_server import validate_json_request
from vllm.entrypoints.openai.engine.protocol import ( from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse, ErrorResponse,
) )
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.serve.elastic_ep.middleware import ( from vllm.entrypoints.serve.elastic_ep.middleware import (
get_scaling_elastic_ep, get_scaling_elastic_ep,
set_scaling_elastic_ep, set_scaling_elastic_ep,
......
...@@ -7,11 +7,12 @@ from fastapi import APIRouter, Depends, FastAPI, Request ...@@ -7,11 +7,12 @@ from fastapi import APIRouter, Depends, FastAPI, Request
from fastapi.responses import JSONResponse, Response from fastapi.responses import JSONResponse, Response
from vllm import envs from vllm import envs
from vllm.entrypoints.openai.api_server import models, validate_json_request
from vllm.entrypoints.openai.engine.protocol import ( from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse, ErrorResponse,
) )
from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.models.api_router import models
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.serve.lora.protocol import ( from vllm.entrypoints.serve.lora.protocol import (
LoadLoRAAdapterRequest, LoadLoRAAdapterRequest,
UnloadLoRAAdapterRequest, UnloadLoRAAdapterRequest,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment