Unverified commit 4c1c501a, authored by Chauncey and committed by GitHub
Browse files

[Refactor] [10/N] to simplify the vLLM openai completion serving architecture (#32369)


Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
parent ae1eba6a
......@@ -43,20 +43,21 @@ from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
ChatCompletionResponse,
)
from vllm.entrypoints.openai.engine.protocol import (
from vllm.entrypoints.openai.completion.protocol import (
CompletionRequest,
CompletionResponse,
)
from vllm.entrypoints.openai.engine.protocol import (
ErrorInfo,
ErrorResponse,
FunctionCall,
FunctionDefinition,
VLLMValidationError,
)
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.openai.responses.protocol import (
ResponseInputOutputItem,
ResponsesRequest,
)
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.openai.translations.protocol import (
TranscriptionRequest,
TranscriptionResponse,
......@@ -95,6 +96,7 @@ from vllm.entrypoints.serve.tokenize.protocol import (
TokenizeResponse,
)
from vllm.entrypoints.utils import _validate_truncation_size, sanitize_message
from vllm.exceptions import VLLMValidationError
from vllm.inputs.data import PromptType, TokensPrompt
from vllm.inputs.parse import (
PromptComponents,
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from fastapi import APIRouter, FastAPI, Request
from fastapi.responses import JSONResponse
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.logger import init_logger
logger = init_logger(__name__)
router = APIRouter()
def models(request: Request) -> OpenAIServingModels:
    """Return the models-serving handler attached to the application state.

    Args:
        request: The incoming request; its ``app.state`` is expected to
            carry an ``openai_serving_models`` attribute.

    Returns:
        Whatever object is stored at
        ``request.app.state.openai_serving_models``, returned unchanged.
    """
    app_state = request.app.state
    return app_state.openai_serving_models
@router.get("/v1/models")
async def show_available_models(raw_request: Request):
    """Handle ``GET /v1/models``.

    Resolves the serving handler through ``models(raw_request)``, awaits its
    ``show_available_models()`` coroutine, and wraps the ``model_dump()`` of
    the result in a ``JSONResponse``.
    """
    serving_models = models(raw_request)
    model_list = await serving_models.show_available_models()
    payload = model_list.model_dump()
    return JSONResponse(content=payload)
def attach_router(app: FastAPI):
    """Register this module's ``router`` on the given FastAPI application.

    Args:
        app: The application that should include the module-level router.
    """
    app.include_router(router)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
@dataclass
class BaseModelPath:
    """Record pairing a model name with a model path.

    Both fields are required strings; no validation is performed here.
    """

    # Name associated with the model (presumably the served/advertised name;
    # confirm against callers).
    name: str
    # Path string for the model (presumably a local path or hub identifier;
    # confirm against callers).
    model_path: str
@dataclass
class LoRAModulePath:
name: str
path: str
base_model_name: str | None = None
......@@ -3,7 +3,6 @@
from asyncio import Lock
from collections import defaultdict
from dataclasses import dataclass
from http import HTTPStatus
from vllm.engine.protocol import EngineClient
......@@ -14,6 +13,7 @@ from vllm.entrypoints.openai.engine.protocol import (
ModelList,
ModelPermission,
)
from vllm.entrypoints.openai.models.protocol import BaseModelPath, LoRAModulePath
from vllm.entrypoints.serve.lora.protocol import (
LoadLoRAAdapterRequest,
UnloadLoRAAdapterRequest,
......@@ -27,19 +27,6 @@ from vllm.utils.counter import AtomicCounter
logger = init_logger(__name__)
@dataclass
class BaseModelPath:
    """Record pairing a model name with a model path.

    Both fields are required strings; no validation is performed here.
    """

    # Name associated with the model (presumably the served/advertised name;
    # confirm against callers).
    name: str
    # Path string for the model (presumably a local path or hub identifier;
    # confirm against callers).
    model_path: str
@dataclass
class LoRAModulePath:
name: str
path: str
base_model_name: str | None = None
class OpenAIServingModels:
"""Shared instance to hold data about the loaded base model(s) and adapters.
......
......@@ -76,12 +76,12 @@ from vllm.entrypoints.openai.engine.protocol import (
DeltaMessage,
ErrorResponse,
RequestResponseMetadata,
VLLMValidationError,
)
from vllm.entrypoints.openai.engine.serving import (
GenerationError,
OpenAIServing,
)
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.openai.parser.harmony_utils import (
construct_harmony_previous_input_messages,
get_developer_message,
......@@ -108,7 +108,6 @@ from vllm.entrypoints.openai.responses.protocol import (
ResponseUsage,
StreamingResponsesResponse,
)
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.responses_utils import (
construct_input_messages,
construct_tool_dicts,
......@@ -116,6 +115,7 @@ from vllm.entrypoints.responses_utils import (
should_continue_final_message,
)
from vllm.entrypoints.tool_server import ToolServer
from vllm.exceptions import VLLMValidationError
from vllm.inputs.data import TokensPrompt
from vllm.logger import init_logger
from vllm.logprobs import Logprob as SampleLogprob
......
......@@ -28,7 +28,8 @@ from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse,
OpenAIBaseModel,
)
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
from vllm.entrypoints.openai.models.protocol import BaseModelPath
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.pooling.embed.protocol import EmbeddingRequest, EmbeddingResponse
from vllm.entrypoints.pooling.embed.serving import OpenAIServingEmbedding
from vllm.entrypoints.pooling.score.protocol import (
......
......@@ -5,7 +5,7 @@
from http import HTTPStatus
from typing import Annotated
from fastapi import APIRouter, FastAPI, Form, HTTPException, Request
from fastapi import APIRouter, FastAPI, Form, Request
from fastapi.responses import JSONResponse, StreamingResponse
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
......@@ -63,10 +63,7 @@ async def create_transcriptions(
try:
generator = await handler.create_transcription(audio_data, request, raw_request)
except Exception as e:
raise HTTPException(
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
) from e
return handler.create_error_response(e)
if isinstance(generator, ErrorResponse):
return JSONResponse(
content=generator.model_dump(), status_code=generator.error.code
......@@ -103,9 +100,7 @@ async def create_translations(
try:
generator = await handler.create_translation(audio_data, request, raw_request)
except Exception as e:
raise HTTPException(
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
) from e
return handler.create_error_response(e)
if isinstance(generator, ErrorResponse):
return JSONResponse(
......
......@@ -10,7 +10,7 @@ from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse,
RequestResponseMetadata,
)
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.openai.translations.protocol import (
TranscriptionRequest,
TranscriptionResponse,
......
......@@ -22,7 +22,7 @@ from vllm.entrypoints.openai.engine.protocol import (
UsageInfo,
)
from vllm.entrypoints.openai.engine.serving import OpenAIServing, SpeechToTextRequest
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.openai.translations.protocol import (
TranscriptionResponse,
TranscriptionResponseStreamChoice,
......
......@@ -23,7 +23,7 @@ from vllm.entrypoints.openai.engine.serving import (
OpenAIServing,
ServeContext,
)
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.pooling.classify.protocol import (
ClassificationChatRequest,
ClassificationCompletionRequest,
......
......@@ -21,7 +21,7 @@ from vllm.entrypoints.openai.engine.serving import (
OpenAIServing,
ServeContext,
)
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.pooling.embed.protocol import (
EmbeddingBytesResponse,
EmbeddingChatRequest,
......
......@@ -19,7 +19,7 @@ from vllm.entrypoints.openai.engine.protocol import (
UsageInfo,
)
from vllm.entrypoints.openai.engine.serving import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.pooling.pooling.protocol import (
IOProcessorRequest,
IOProcessorResponse,
......
......@@ -14,7 +14,7 @@ from vllm.entrypoints.openai.engine.protocol import (
UsageInfo,
)
from vllm.entrypoints.openai.engine.serving import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.pooling.score.protocol import (
RerankDocument,
RerankRequest,
......
......@@ -12,22 +12,26 @@ from fastapi.responses import JSONResponse, Response
from vllm.entrypoints.openai.api_server import (
base,
chat,
completion,
create_completion,
validate_json_request,
)
from vllm.entrypoints.openai.chat_completion.api_router import (
chat,
create_chat_completion,
)
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
)
from vllm.entrypoints.openai.engine.protocol import (
from vllm.entrypoints.openai.completion.api_router import (
completion,
create_completion,
)
from vllm.entrypoints.openai.completion.protocol import (
CompletionRequest,
)
from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse,
)
from vllm.entrypoints.openai.engine.serving import OpenAIServing
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.pooling.classify.api_router import classify, create_classify
from vllm.entrypoints.pooling.classify.protocol import ClassificationRequest
from vllm.entrypoints.pooling.embed.api_router import create_embedding, embedding
......
......@@ -10,10 +10,10 @@ from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request, Respons
from fastapi.responses import JSONResponse, StreamingResponse
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.openai.api_server import validate_json_request
from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse,
)
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.serve.disagg.protocol import (
GenerateRequest,
GenerateResponse,
......
......@@ -6,10 +6,10 @@ from pydantic import BaseModel, Field
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionLogProbs
from vllm.entrypoints.openai.engine.protocol import (
Logprob,
SamplingParams,
StreamOptions,
)
from vllm.logprobs import Logprob
from vllm.utils import random_uuid
......
......@@ -23,7 +23,7 @@ from vllm.entrypoints.openai.engine.protocol import (
UsageInfo,
)
from vllm.entrypoints.openai.engine.serving import OpenAIServing, clamp_prompt_logprobs
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.serve.disagg.protocol import (
GenerateRequest,
GenerateResponse,
......
......@@ -9,10 +9,10 @@ from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.openai.api_server import validate_json_request
from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse,
)
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.serve.elastic_ep.middleware import (
get_scaling_elastic_ep,
set_scaling_elastic_ep,
......
......@@ -7,11 +7,12 @@ from fastapi import APIRouter, Depends, FastAPI, Request
from fastapi.responses import JSONResponse, Response
from vllm import envs
from vllm.entrypoints.openai.api_server import models, validate_json_request
from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse,
)
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.openai.models.api_router import models
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.serve.lora.protocol import (
LoadLoRAAdapterRequest,
UnloadLoRAAdapterRequest,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment