"examples/vscode:/vscode.git/clone" did not exist on "42fadebecb79290ad722f33f3094de23b121f33d"
Unverified commit fefce498, authored by Chauncey and committed by GitHub
Browse files

[Refactor] [6/N] to simplify the vLLM openai chat_completion serving architecture (#32240)


Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
parent a5bbbd2f
...@@ -11,7 +11,7 @@ from pydantic import ( ...@@ -11,7 +11,7 @@ from pydantic import (
from vllm import PoolingParams from vllm import PoolingParams
from vllm.config.pooler import get_use_activation from vllm.config.pooler import get_use_activation
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from vllm.entrypoints.openai.protocol import OpenAIBaseModel, UsageInfo from vllm.entrypoints.openai.engine.protocol import OpenAIBaseModel, UsageInfo
from vllm.utils import random_uuid from vllm.utils import random_uuid
......
...@@ -11,12 +11,14 @@ from fastapi import Request ...@@ -11,12 +11,14 @@ from fastapi import Request
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest, ChatCompletionRequest,
)
from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse, ErrorResponse,
UsageInfo, UsageInfo,
) )
from vllm.entrypoints.openai.serving_engine import ( from vllm.entrypoints.openai.engine.serving import (
ClassificationServeContext, ClassificationServeContext,
OpenAIServing, OpenAIServing,
ServeContext, ServeContext,
......
...@@ -6,7 +6,7 @@ from fastapi import APIRouter, Depends, HTTPException, Request ...@@ -6,7 +6,7 @@ from fastapi import APIRouter, Depends, HTTPException, Request
from fastapi.responses import JSONResponse, StreamingResponse from fastapi.responses import JSONResponse, StreamingResponse
from typing_extensions import assert_never from typing_extensions import assert_never
from vllm.entrypoints.openai.protocol import ErrorResponse from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.pooling.embed.protocol import ( from vllm.entrypoints.pooling.embed.protocol import (
EmbeddingBytesResponse, EmbeddingBytesResponse,
......
...@@ -10,7 +10,7 @@ from pydantic import ( ...@@ -10,7 +10,7 @@ from pydantic import (
from vllm import PoolingParams from vllm import PoolingParams
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from vllm.entrypoints.openai.protocol import OpenAIBaseModel, UsageInfo from vllm.entrypoints.openai.engine.protocol import OpenAIBaseModel, UsageInfo
from vllm.utils import random_uuid from vllm.utils import random_uuid
from vllm.utils.serial_utils import EmbedDType, EncodingFormat, Endianness from vllm.utils.serial_utils import EmbedDType, EncodingFormat, Endianness
......
...@@ -12,11 +12,11 @@ from typing_extensions import assert_never, override ...@@ -12,11 +12,11 @@ from typing_extensions import assert_never, override
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse, ErrorResponse,
UsageInfo, UsageInfo,
) )
from vllm.entrypoints.openai.serving_engine import ( from vllm.entrypoints.openai.engine.serving import (
EmbeddingServeContext, EmbeddingServeContext,
OpenAIServing, OpenAIServing,
ServeContext, ServeContext,
......
...@@ -6,7 +6,7 @@ from fastapi import APIRouter, Depends, HTTPException, Request ...@@ -6,7 +6,7 @@ from fastapi import APIRouter, Depends, HTTPException, Request
from fastapi.responses import JSONResponse, StreamingResponse from fastapi.responses import JSONResponse, StreamingResponse
from typing_extensions import assert_never from typing_extensions import assert_never
from vllm.entrypoints.openai.protocol import ErrorResponse from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.pooling.pooling.protocol import ( from vllm.entrypoints.pooling.pooling.protocol import (
IOProcessorResponse, IOProcessorResponse,
......
...@@ -9,7 +9,7 @@ from pydantic import ( ...@@ -9,7 +9,7 @@ from pydantic import (
from vllm import PoolingParams from vllm import PoolingParams
from vllm.config.pooler import get_use_activation from vllm.config.pooler import get_use_activation
from vllm.entrypoints.openai.protocol import OpenAIBaseModel, UsageInfo from vllm.entrypoints.openai.engine.protocol import OpenAIBaseModel, UsageInfo
from vllm.entrypoints.pooling.embed.protocol import ( from vllm.entrypoints.pooling.embed.protocol import (
EmbeddingChatRequest, EmbeddingChatRequest,
EmbeddingCompletionRequest, EmbeddingCompletionRequest,
......
...@@ -14,11 +14,11 @@ from typing_extensions import assert_never ...@@ -14,11 +14,11 @@ from typing_extensions import assert_never
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption
from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse, ErrorResponse,
UsageInfo, UsageInfo,
) )
from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.engine.serving import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.pooling.pooling.protocol import ( from vllm.entrypoints.pooling.pooling.protocol import (
IOProcessorRequest, IOProcessorRequest,
......
...@@ -6,7 +6,7 @@ from fastapi import APIRouter, Depends, HTTPException, Request ...@@ -6,7 +6,7 @@ from fastapi import APIRouter, Depends, HTTPException, Request
from fastapi.responses import JSONResponse from fastapi.responses import JSONResponse
from typing_extensions import assert_never from typing_extensions import assert_never
from vllm.entrypoints.openai.protocol import ErrorResponse from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.pooling.score.protocol import ( from vllm.entrypoints.pooling.score.protocol import (
RerankRequest, RerankRequest,
......
...@@ -10,7 +10,7 @@ from pydantic import ( ...@@ -10,7 +10,7 @@ from pydantic import (
from vllm import PoolingParams from vllm import PoolingParams
from vllm.config.pooler import get_use_activation from vllm.config.pooler import get_use_activation
from vllm.entrypoints.openai.protocol import OpenAIBaseModel, UsageInfo from vllm.entrypoints.openai.engine.protocol import OpenAIBaseModel, UsageInfo
from vllm.entrypoints.score_utils import ScoreContentPartParam, ScoreMultiModalParam from vllm.entrypoints.score_utils import ScoreContentPartParam, ScoreMultiModalParam
from vllm.utils import random_uuid from vllm.utils import random_uuid
......
...@@ -9,11 +9,11 @@ from fastapi import Request ...@@ -9,11 +9,11 @@ from fastapi import Request
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse, ErrorResponse,
UsageInfo, UsageInfo,
) )
from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.engine.serving import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.pooling.score.protocol import ( from vllm.entrypoints.pooling.score.protocol import (
RerankDocument, RerankDocument,
......
...@@ -22,7 +22,7 @@ from openai.types.responses.tool import Tool ...@@ -22,7 +22,7 @@ from openai.types.responses.tool import Tool
from vllm import envs from vllm import envs
from vllm.entrypoints.constants import MCP_PREFIX from vllm.entrypoints.constants import MCP_PREFIX
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.engine.protocol import (
ChatCompletionMessageParam, ChatCompletionMessageParam,
ResponseInputOutputItem, ResponseInputOutputItem,
) )
......
...@@ -14,16 +14,20 @@ from vllm.entrypoints.openai.api_server import ( ...@@ -14,16 +14,20 @@ from vllm.entrypoints.openai.api_server import (
base, base,
chat, chat,
completion, completion,
create_chat_completion,
create_completion, create_completion,
validate_json_request, validate_json_request,
) )
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.chat_completion.api_router import (
create_chat_completion,
)
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest, ChatCompletionRequest,
)
from vllm.entrypoints.openai.engine.protocol import (
CompletionRequest, CompletionRequest,
ErrorResponse, ErrorResponse,
) )
from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.engine.serving import OpenAIServing
from vllm.entrypoints.pooling.classify.api_router import classify, create_classify from vllm.entrypoints.pooling.classify.api_router import classify, create_classify
from vllm.entrypoints.pooling.classify.protocol import ClassificationRequest from vllm.entrypoints.pooling.classify.protocol import ClassificationRequest
from vllm.entrypoints.pooling.embed.api_router import create_embedding, embedding from vllm.entrypoints.pooling.embed.api_router import create_embedding, embedding
......
...@@ -11,7 +11,7 @@ from fastapi.responses import JSONResponse, StreamingResponse ...@@ -11,7 +11,7 @@ from fastapi.responses import JSONResponse, StreamingResponse
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.openai.api_server import validate_json_request from vllm.entrypoints.openai.api_server import validate_json_request
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse, ErrorResponse,
) )
from vllm.entrypoints.serve.disagg.protocol import ( from vllm.entrypoints.serve.disagg.protocol import (
......
...@@ -4,8 +4,8 @@ from typing import Any ...@@ -4,8 +4,8 @@ from typing import Any
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionLogProbs
ChatCompletionLogProbs, from vllm.entrypoints.openai.engine.protocol import (
Logprob, Logprob,
SamplingParams, SamplingParams,
StreamOptions, StreamOptions,
......
...@@ -11,16 +11,18 @@ from fastapi import Request ...@@ -11,16 +11,18 @@ from fastapi import Request
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionLogProb, ChatCompletionLogProb,
ChatCompletionLogProbs, ChatCompletionLogProbs,
ChatCompletionLogProbsContent, ChatCompletionLogProbsContent,
)
from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse, ErrorResponse,
PromptTokenUsageInfo, PromptTokenUsageInfo,
RequestResponseMetadata, RequestResponseMetadata,
UsageInfo, UsageInfo,
) )
from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs from vllm.entrypoints.openai.engine.serving import OpenAIServing, clamp_prompt_logprobs
from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.serve.disagg.protocol import ( from vllm.entrypoints.serve.disagg.protocol import (
GenerateRequest, GenerateRequest,
......
...@@ -10,7 +10,7 @@ from fastapi.responses import JSONResponse ...@@ -10,7 +10,7 @@ from fastapi.responses import JSONResponse
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.openai.api_server import validate_json_request from vllm.entrypoints.openai.api_server import validate_json_request
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse, ErrorResponse,
) )
from vllm.entrypoints.serve.elastic_ep.middleware import ( from vllm.entrypoints.serve.elastic_ep.middleware import (
......
...@@ -8,7 +8,7 @@ from fastapi.responses import JSONResponse, Response ...@@ -8,7 +8,7 @@ from fastapi.responses import JSONResponse, Response
from vllm import envs from vllm import envs
from vllm.entrypoints.openai.api_server import models, validate_json_request from vllm.entrypoints.openai.api_server import models, validate_json_request
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse, ErrorResponse,
LoadLoRAAdapterRequest, LoadLoRAAdapterRequest,
UnloadLoRAAdapterRequest, UnloadLoRAAdapterRequest,
......
...@@ -10,10 +10,12 @@ from fastapi.responses import JSONResponse ...@@ -10,10 +10,12 @@ from fastapi.responses import JSONResponse
from typing_extensions import assert_never from typing_extensions import assert_never
from vllm.entrypoints.openai.api_server import validate_json_request from vllm.entrypoints.openai.api_server import validate_json_request
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse,
)
from vllm.entrypoints.serve.tokenize.protocol import (
DetokenizeRequest, DetokenizeRequest,
DetokenizeResponse, DetokenizeResponse,
ErrorResponse,
TokenizeRequest, TokenizeRequest,
TokenizeResponse, TokenizeResponse,
) )
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any, TypeAlias
from pydantic import ConfigDict, Field, model_validator
from vllm.entrypoints.chat_utils import (
ChatCompletionMessageParam,
)
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionToolsParam,
)
from vllm.entrypoints.openai.engine.protocol import (
OpenAIBaseModel,
)
class TokenizeCompletionRequest(OpenAIBaseModel):
    # Tokenize request variant that carries a single text `prompt`.
    # Documented with `#` comments rather than a class docstring so the
    # generated schema description stays unchanged.

    # Model identifier; may be omitted (defaults to None).
    model: str | None = None
    # Required text to tokenize.
    prompt: str

    # Defaults to True for this prompt-based variant.
    add_special_tokens: bool = Field(
        description=(
            "If true (the default), special tokens (e.g. BOS) "
            "will be added to the prompt."
        ),
        default=True,
    )

    # Opt-in flag; also accepts an explicit None.
    return_token_strs: bool | None = Field(
        description=(
            "If true, also return the token strings "
            "corresponding to the token ids."
        ),
        default=False,
    )
class TokenizeChatRequest(OpenAIBaseModel):
    # Tokenize request variant that carries chat `messages` plus the options
    # controlling how the chat template is applied.
    # Documented with `#` comments rather than a class docstring so the
    # generated schema description stays unchanged.

    # Model identifier; may be omitted (defaults to None).
    model: str | None = None
    # Required list of chat messages.
    messages: list[ChatCompletionMessageParam]

    add_generation_prompt: bool = Field(
        default=True,
        description=(
            "If true, the generation prompt will be added to the chat template. "
            "This is a parameter used by chat template in tokenizer config of the "
            "model."
        ),
    )
    return_token_strs: bool | None = Field(
        default=False,
        description=(
            "If true, also return the token strings corresponding to the token ids."
        ),
    )
    continue_final_message: bool = Field(
        default=False,
        description=(
            "If this is set, the chat will be formatted so that the final "
            "message in the chat is open-ended, without any EOS tokens. The "
            "model will continue this message rather than starting a new one. "
            'This allows you to "prefill" part of the model\'s response for it. '
            "Cannot be used at the same time as `add_generation_prompt`."
        ),
    )
    # Unlike the prompt-based request, this defaults to False.
    add_special_tokens: bool = Field(
        default=False,
        description=(
            "If true, special tokens (e.g. BOS) will be added to the prompt "
            "on top of what is added by the chat template. "
            "For most models, the chat template takes care of adding the "
            "special tokens so this should be set to false (as is the "
            "default)."
        ),
    )
    chat_template: str | None = Field(
        default=None,
        description=(
            "A Jinja template to use for this conversion. "
            "As of transformers v4.44, default chat template is no longer "
            "allowed, so you must provide a chat template if the tokenizer "
            "does not define one."
        ),
    )
    chat_template_kwargs: dict[str, Any] | None = Field(
        default=None,
        description=(
            "Additional keyword args to pass to the template renderer. "
            "Will be accessible by the chat template."
        ),
    )
    mm_processor_kwargs: dict[str, Any] | None = Field(
        default=None,
        description=("Additional kwargs to pass to the HF processor."),
    )
    tools: list[ChatCompletionToolsParam] | None = Field(
        default=None,
        description=("A list of tools the model may call."),
    )

    @model_validator(mode="before")
    @classmethod
    def check_generation_prompt(cls, data):
        """Reject input that enables two mutually exclusive flags.

        Args:
            data: The raw, not-yet-validated input handed to the model.

        Returns:
            ``data`` unchanged, so that normal field validation can proceed.

        Raises:
            ValueError: If both ``continue_final_message`` and
                ``add_generation_prompt`` are truthy in ``data``.
        """
        # A "before" validator sees whatever the caller supplied, which is
        # not guaranteed to be a dict. Pass anything else through untouched
        # so regular validation reports the type problem, instead of failing
        # here with an AttributeError on `.get`.
        if not isinstance(data, dict):
            return data

        # NOTE(review): only keys present in the raw input are inspected. A
        # payload that sets `continue_final_message` but omits
        # `add_generation_prompt` is not rejected here, although that field
        # defaults to True - confirm this is intended.
        if data.get("continue_final_message") and data.get("add_generation_prompt"):
            raise ValueError(
                "Cannot set both `continue_final_message` and "
                "`add_generation_prompt` to True."
            )
        return data
# A tokenize request is either the prompt-based variant or the
# chat-messages variant.
TokenizeRequest: TypeAlias = TokenizeCompletionRequest | TokenizeChatRequest
class TokenizeResponse(OpenAIBaseModel):
    # Response model paired with the tokenize request types.

    # Required integer; presumably the number of entries in `tokens` - confirm.
    count: int
    # Required integer; presumably the model's maximum context length - confirm.
    max_model_len: int
    # Required list of integers; presumably the token ids - confirm.
    tokens: list[int]
    # Optional list of strings; defaults to None. Presumably filled only when
    # the request set `return_token_strs` - confirm against the handler.
    token_strs: list[str] | None = None
class DetokenizeRequest(OpenAIBaseModel):
    # Request model for converting integer tokens back into text.

    # Model identifier; may be omitted (defaults to None).
    model: str | None = None
    # Required list of integer tokens.
    tokens: list[int]
class DetokenizeResponse(OpenAIBaseModel):
    # Response model paired with DetokenizeRequest.

    # Required string; presumably the text decoded from the request's
    # tokens - confirm against the handler.
    prompt: str
class TokenizerInfoResponse(OpenAIBaseModel):
    """
    Response containing tokenizer configuration
    equivalent to tokenizer_config.json
    """

    # extra="allow" keeps fields that are not declared on this model, so the
    # response can carry additional tokenizer configuration keys.
    model_config = ConfigDict(extra="allow")
    # The only explicitly declared (and required) field.
    tokenizer_class: str
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment