"docs/vscode:/vscode.git/clone" did not exist on "a19e8d372651abad75dc6a3939c18f23a1ae8d40"
Unverified commit fefce498, authored by Chauncey, committed by GitHub
Browse files

[Refactor] [6/N] to simplify the vLLM openai chat_completion serving architecture (#32240)


Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
parent a5bbbd2f
...@@ -22,6 +22,12 @@ from vllm.entrypoints.chat_utils import ( ...@@ -22,6 +22,12 @@ from vllm.entrypoints.chat_utils import (
ChatTemplateContentFormatOption, ChatTemplateContentFormatOption,
) )
from vllm.entrypoints.constants import MCP_PREFIX from vllm.entrypoints.constants import MCP_PREFIX
from vllm.entrypoints.openai.engine.protocol import (
FunctionCall,
ResponseInputOutputItem,
ResponseRawMessageAndToken,
ResponsesRequest,
)
from vllm.entrypoints.openai.parser.harmony_utils import ( from vllm.entrypoints.openai.parser.harmony_utils import (
get_encoding, get_encoding,
get_streamable_parser_for_assistant, get_streamable_parser_for_assistant,
...@@ -30,12 +36,6 @@ from vllm.entrypoints.openai.parser.harmony_utils import ( ...@@ -30,12 +36,6 @@ from vllm.entrypoints.openai.parser.harmony_utils import (
from vllm.entrypoints.openai.parser.responses_parser import ( from vllm.entrypoints.openai.parser.responses_parser import (
get_responses_parser_for_simple_context, get_responses_parser_for_simple_context,
) )
from vllm.entrypoints.openai.protocol import (
FunctionCall,
ResponseInputOutputItem,
ResponseRawMessageAndToken,
ResponsesRequest,
)
from vllm.entrypoints.responses_utils import construct_tool_dicts from vllm.entrypoints.responses_utils import construct_tool_dicts
from vllm.entrypoints.tool import Tool from vllm.entrypoints.tool import Tool
from vllm.entrypoints.tool_server import ToolServer from vllm.entrypoints.tool_server import ToolServer
......
...@@ -42,11 +42,9 @@ from vllm.entrypoints.anthropic.protocol import ( ...@@ -42,11 +42,9 @@ from vllm.entrypoints.anthropic.protocol import (
from vllm.entrypoints.anthropic.serving_messages import AnthropicServingMessages from vllm.entrypoints.anthropic.serving_messages import AnthropicServingMessages
from vllm.entrypoints.launcher import serve_http from vllm.entrypoints.launcher import serve_http
from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args
from vllm.entrypoints.openai.orca_metrics import metrics_header from vllm.entrypoints.openai.engine.protocol import (
from vllm.entrypoints.openai.protocol import (
ChatCompletionRequest,
ChatCompletionResponse,
CompletionRequest, CompletionRequest,
CompletionResponse, CompletionResponse,
ErrorInfo, ErrorInfo,
...@@ -59,9 +57,9 @@ from vllm.entrypoints.openai.protocol import ( ...@@ -59,9 +57,9 @@ from vllm.entrypoints.openai.protocol import (
TranslationRequest, TranslationRequest,
TranslationResponseVariant, TranslationResponseVariant,
) )
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.engine.serving import OpenAIServing
from vllm.entrypoints.openai.orca_metrics import metrics_header
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import ( from vllm.entrypoints.openai.serving_models import (
BaseModelPath, BaseModelPath,
OpenAIServingModels, OpenAIServingModels,
...@@ -475,47 +473,6 @@ async def create_messages(request: AnthropicMessagesRequest, raw_request: Reques ...@@ -475,47 +473,6 @@ async def create_messages(request: AnthropicMessagesRequest, raw_request: Reques
return StreamingResponse(content=generator, media_type="text/event-stream") return StreamingResponse(content=generator, media_type="text/event-stream")
# NOTE(review): this definition appears only once in a side-by-side hunk, i.e.
# it looks like the removed side of the diff; an equivalent handler is defined
# in the new chat_completion router module further down.
# POST /v1/chat/completions: validates the JSON request, then delegates to the
# chat handler and wraps the result in the matching HTTP response type.
@router.post(
"/v1/chat/completions",
dependencies=[Depends(validate_json_request)],
responses={
HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
},
)
@with_cancellation
@load_aware_call
async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request):
# Header value is forwarded to metrics_header() for non-streaming responses.
metrics_header_format = raw_request.headers.get(
ENDPOINT_LOAD_METRICS_FORMAT_HEADER_LABEL, ""
)
# `chat` and `base` are helpers defined outside this hunk.
handler = chat(raw_request)
if handler is None:
return base(raw_request).create_error_response(
message="The model does not support Chat Completions API"
)
try:
generator = await handler.create_chat_completion(request, raw_request)
except Exception as e:
# Any handler failure is surfaced as HTTP 500 with the exception text.
raise HTTPException(
status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
) from e
if isinstance(generator, ErrorResponse):
# The error payload carries its own HTTP status code.
return JSONResponse(
content=generator.model_dump(), status_code=generator.error.code
)
elif isinstance(generator, ChatCompletionResponse):
return JSONResponse(
content=generator.model_dump(),
headers=metrics_header(metrics_header_format),
)
# Anything else is streamed back as server-sent events.
return StreamingResponse(content=generator, media_type="text/event-stream")
@router.post( @router.post(
"/v1/completions", "/v1/completions",
dependencies=[Depends(validate_json_request)], dependencies=[Depends(validate_json_request)],
...@@ -735,8 +692,10 @@ class XRequestIdMiddleware: ...@@ -735,8 +692,10 @@ class XRequestIdMiddleware:
def _extract_content_from_chunk(chunk_data: dict) -> str: def _extract_content_from_chunk(chunk_data: dict) -> str:
"""Extract content from a streaming response chunk.""" """Extract content from a streaming response chunk."""
try: try:
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionStreamResponse, ChatCompletionStreamResponse,
)
from vllm.entrypoints.openai.engine.protocol import (
CompletionStreamResponse, CompletionStreamResponse,
) )
...@@ -880,7 +839,11 @@ def build_app(args: Namespace) -> FastAPI: ...@@ -880,7 +839,11 @@ def build_app(args: Namespace) -> FastAPI:
from vllm.entrypoints.serve import register_vllm_serve_api_routers from vllm.entrypoints.serve import register_vllm_serve_api_routers
register_vllm_serve_api_routers(app) register_vllm_serve_api_routers(app)
from vllm.entrypoints.openai.chat_completion.api_router import (
attach_router as register_chat_api_router,
)
register_chat_api_router(app)
from vllm.entrypoints.sagemaker.routes import register_sagemaker_routes from vllm.entrypoints.sagemaker.routes import register_sagemaker_routes
register_sagemaker_routes(router) register_sagemaker_routes(router)
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from http import HTTPStatus
from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse, StreamingResponse
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
ChatCompletionResponse,
)
from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.orca_metrics import metrics_header
from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.utils import (
load_aware_call,
with_cancellation,
)
from vllm.logger import init_logger
# Module-level logger named after this module.
logger = init_logger(__name__)
# Router holding the chat-completion route; attach_router() includes it in the app.
router = APIRouter()
# Request header whose value create_chat_completion passes to metrics_header().
ENDPOINT_LOAD_METRICS_FORMAT_HEADER_LABEL = "endpoint-load-metrics-format"
def chat(request: Request) -> OpenAIServingChat | None:
    """Return the chat-completion handler stored on the application state.

    The value is read from ``request.app.state.openai_serving_chat``; per the
    return annotation it may be ``None``.
    """
    app_state = request.app.state
    return app_state.openai_serving_chat
@router.post(
    "/v1/chat/completions",
    dependencies=[Depends(validate_json_request)],
    responses={
        HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
        # The three error statuses all share the ErrorResponse model.
        **{
            status.value: {"model": ErrorResponse}
            for status in (
                HTTPStatus.BAD_REQUEST,
                HTTPStatus.NOT_FOUND,
                HTTPStatus.INTERNAL_SERVER_ERROR,
            )
        },
    },
)
@with_cancellation
@load_aware_call
async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request):
    """Serve ``POST /v1/chat/completions``.

    Delegates to the chat handler found on the application state and maps its
    result onto an HTTP response:

    * ``ErrorResponse`` -> JSON body with the status code carried by the error.
    * ``ChatCompletionResponse`` -> JSON body plus the headers produced by
      ``metrics_header`` for the requested load-metrics format.
    * anything else -> streamed as ``text/event-stream``.

    Raises:
        HTTPException: status 500 when the handler itself raises.
    """
    load_metrics_format = raw_request.headers.get(
        ENDPOINT_LOAD_METRICS_FORMAT_HEADER_LABEL, ""
    )

    serving_chat = chat(raw_request)
    if serving_chat is None:
        # No chat handler configured: build the error via the tokenization
        # handler stored on the app state.
        fallback_server = raw_request.app.state.openai_serving_tokenization
        return fallback_server.create_error_response(
            message="The model does not support Chat Completions API"
        )

    try:
        result = await serving_chat.create_chat_completion(request, raw_request)
    except Exception as exc:
        raise HTTPException(
            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(exc)
        ) from exc

    if isinstance(result, ErrorResponse):
        return JSONResponse(
            content=result.model_dump(), status_code=result.error.code
        )

    if isinstance(result, ChatCompletionResponse):
        return JSONResponse(
            content=result.model_dump(),
            headers=metrics_header(load_metrics_format),
        )

    return StreamingResponse(content=result, media_type="text/event-stream")
def attach_router(app: FastAPI):
    """Include this module's chat-completion ``router`` in *app*."""
    app.include_router(router=router)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from
# https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
import json
import time
from typing import Annotated, Any, ClassVar, Literal
import torch
from openai.types.chat.chat_completion_audio import (
ChatCompletionAudio as OpenAIChatCompletionAudio,
)
from openai.types.chat.chat_completion_message import Annotation as OpenAIAnnotation
from pydantic import (
Field,
model_validator,
)
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from vllm.entrypoints.openai.engine.protocol import (
AnyResponseFormat,
DeltaMessage,
FunctionCall,
FunctionDefinition,
LegacyStructuralTagResponseFormat,
LogitsProcessors,
OpenAIBaseModel,
StreamOptions,
StructuralTagResponseFormat,
ToolCall,
UsageInfo,
get_logits_processors,
)
from vllm.exceptions import VLLMValidationError
from vllm.logger import init_logger
from vllm.logprobs import Logprob
from vllm.sampling_params import (
BeamSearchParams,
RequestOutputKind,
SamplingParams,
StructuredOutputsParams,
)
from vllm.utils import random_uuid
# Module-level logger named after this module.
logger = init_logger(__name__)
# Integer limits of torch.long; used below as bounds for `seed` and
# `truncate_prompt_tokens`.
_LONG_INFO = torch.iinfo(torch.long)
# Message carried in a non-streaming chat completion choice
# (see ChatCompletionResponseChoice.message).
class ChatMessage(OpenAIBaseModel):
role: str
content: str | None = None
refusal: str | None = None
annotations: OpenAIAnnotation | None = None
audio: OpenAIChatCompletionAudio | None = None
function_call: FunctionCall | None = None
tool_calls: list[ToolCall] = Field(default_factory=list)
# vLLM-specific fields that are not in OpenAI spec
reasoning: str | None = None
reasoning_content: str | None = None
"""Deprecated: use `reasoning` instead."""
# Runs after field validation and unconditionally overwrites
# `reasoning_content` with `reasoning`, so a value supplied only as
# `reasoning_content` is replaced (by None when `reasoning` is unset).
@model_validator(mode="after")
def handle_deprecated_reasoning_content(self):
"""Copy reasoning to reasoning_content for backward compatibility."""
self.reasoning_content = self.reasoning
return self
# Log-probability entry for a single token; `logprob` defaults to -9999.0
# when not supplied.
class ChatCompletionLogProb(OpenAIBaseModel):
token: str
logprob: float = -9999.0
bytes: list[int] | None = None
# A ChatCompletionLogProb extended with a `top_logprobs` list of further
# ChatCompletionLogProb entries.
class ChatCompletionLogProbsContent(ChatCompletionLogProb):
# Workaround: redefine fields name cache so that it's not
# shared with the super class.
field_names: ClassVar[set[str] | None] = None
top_logprobs: list[ChatCompletionLogProb] = Field(default_factory=list)
# Container for the log-probability entries of a choice; `content` may be None.
class ChatCompletionLogProbs(OpenAIBaseModel):
content: list[ChatCompletionLogProbsContent] | None = None
# One choice of a non-streaming chat completion response.
class ChatCompletionResponseChoice(OpenAIBaseModel):
index: int
message: ChatMessage
logprobs: ChatCompletionLogProbs | None = None
# per OpenAI spec this is the default
finish_reason: str | None = "stop"
# not part of the OpenAI spec but included in vLLM for legacy reasons
stop_reason: int | str | None = None
# not part of the OpenAI spec but is useful for tracing the tokens
# in agent scenarios
token_ids: list[int] | None = None
# Non-streaming chat completion response body (object type "chat.completion").
class ChatCompletionResponse(OpenAIBaseModel):
# Generated as "chatcmpl-<random uuid>" when not supplied.
id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
object: Literal["chat.completion"] = "chat.completion"
# Defaults to the current time in whole seconds (int(time.time())).
created: int = Field(default_factory=lambda: int(time.time()))
model: str
choices: list[ChatCompletionResponseChoice]
service_tier: Literal["auto", "default", "flex", "scale", "priority"] | None = None
system_fingerprint: str | None = None
usage: UsageInfo
# vLLM-specific fields that are not in OpenAI spec
prompt_logprobs: list[dict[int, Logprob] | None] | None = None
prompt_token_ids: list[int] | None = None
kv_transfer_params: dict[str, Any] | None = Field(
default=None, description="KVTransfer parameters."
)
# One choice inside a streaming chunk; carries a `delta` instead of a full
# message, and `finish_reason` defaults to None.
class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
index: int
delta: DeltaMessage
logprobs: ChatCompletionLogProbs | None = None
finish_reason: str | None = None
stop_reason: int | str | None = None
# not part of the OpenAI spec but for tracing the tokens
token_ids: list[int] | None = None
# Streaming chunk of a chat completion (object type "chat.completion.chunk").
class ChatCompletionStreamResponse(OpenAIBaseModel):
# Generated as "chatcmpl-<random uuid>" when not supplied.
id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
# Defaults to the current time in whole seconds (int(time.time())).
created: int = Field(default_factory=lambda: int(time.time()))
model: str
choices: list[ChatCompletionResponseStreamChoice]
usage: UsageInfo | None = Field(default=None)
# not part of the OpenAI spec but for tracing the tokens
prompt_token_ids: list[int] | None = None
# Entry of ChatCompletionRequest.tools; only the "function" type is accepted.
class ChatCompletionToolsParam(OpenAIBaseModel):
type: Literal["function"] = "function"
function: FunctionDefinition
# Function reference by name, used inside ChatCompletionNamedToolChoiceParam.
class ChatCompletionNamedFunction(OpenAIBaseModel):
name: str
# Object form of ChatCompletionRequest.tool_choice that names one function.
class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel):
function: ChatCompletionNamedFunction
type: Literal["function"] = "function"
# Request body of the chat completions endpoint. The `mode="before"`
# validators declared on this class operate on the raw input mapping.
class ChatCompletionRequest(OpenAIBaseModel):
# Ordered by official OpenAI API documentation
# https://platform.openai.com/docs/api-reference/chat/create
messages: list[ChatCompletionMessageParam]
model: str | None = None
frequency_penalty: float | None = 0.0
logit_bias: dict[str, float] | None = None
logprobs: bool | None = False
top_logprobs: int | None = 0
max_tokens: int | None = Field(
default=None,
deprecated="max_tokens is deprecated in favor of "
"the max_completion_tokens field",
)
max_completion_tokens: int | None = None
n: int | None = 1
presence_penalty: float | None = 0.0
response_format: AnyResponseFormat | None = None
# Bounded to the torch.long range via _LONG_INFO.
seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
stop: str | list[str] | None = []
stream: bool | None = False
stream_options: StreamOptions | None = None
# None means "not set by the client"; to_sampling_params() then falls back
# to the supplied defaults and finally to _DEFAULT_SAMPLING_PARAMS.
temperature: float | None = None
top_p: float | None = None
tools: list[ChatCompletionToolsParam] | None = None
# When `tool_choice` is absent from the input but `tools` is non-empty,
# check_tool_usage() sets it to "auto" before this default applies.
tool_choice: (
Literal["none"]
| Literal["auto"]
| Literal["required"]
| ChatCompletionNamedToolChoiceParam
| None
) = "none"
reasoning_effort: Literal["low", "medium", "high"] | None = None
include_reasoning: bool = True
parallel_tool_calls: bool | None = True
# NOTE this will be ignored by vLLM
user: str | None = None
# The `--8<--` lines are snippet start/end markers and must stay in place.
# --8<-- [start:chat-completion-sampling-params]
# Selects to_beam_search_params() vs to_sampling_params() — presumably decided
# by the caller; TODO confirm, the switch is not visible in this module.
use_beam_search: bool = False
# top_k / min_p / repetition_penalty: None means "not set"; resolved in
# to_sampling_params() against the defaults.
top_k: int | None = None
min_p: float | None = None
repetition_penalty: float | None = None
length_penalty: float = 1.0
stop_token_ids: list[int] | None = []
include_stop_str_in_output: bool = False
ignore_eos: bool = False
min_tokens: int = 0
skip_special_tokens: bool = True
spaces_between_special_tokens: bool = True
# Accepts -1 up to the torch.long maximum, or None.
truncate_prompt_tokens: Annotated[int, Field(ge=-1, le=_LONG_INFO.max)] | None = (
None
)
# Validated by check_logprobs(): must be >= 0 or exactly -1, and a value that
# is > 0 or -1 is rejected when `stream` is set.
prompt_logprobs: int | None = None
allowed_token_ids: list[int] | None = None
bad_words: list[str] = Field(default_factory=list)
# --8<-- [end:chat-completion-sampling-params]
# The `--8<--` lines are snippet start/end markers and must stay in place.
# Each field below documents itself through its `description`, which is
# runtime text; suspected typos are therefore only flagged in comments here.
# --8<-- [start:chat-completion-extra-params]
echo: bool = Field(
default=False,
description=(
"If true, the new message will be prepended with the last message "
"if they belong to the same role."
),
)
# Defaults to True; check_generation_prompt() rejects input that sets both this
# and `continue_final_message` to truthy values.
add_generation_prompt: bool = Field(
default=True,
description=(
"If true, the generation prompt will be added to the chat template. "
"This is a parameter used by chat template in tokenizer config of the "
"model."
),
)
continue_final_message: bool = Field(
default=False,
description=(
"If this is set, the chat will be formatted so that the final "
"message in the chat is open-ended, without any EOS tokens. The "
"model will continue this message rather than starting a new one. "
'This allows you to "prefill" part of the model\'s response for it. '
"Cannot be used at the same time as `add_generation_prompt`."
),
)
add_special_tokens: bool = Field(
default=False,
description=(
"If true, special tokens (e.g. BOS) will be added to the prompt "
"on top of what is added by the chat template. "
"For most models, the chat template takes care of adding the "
"special tokens so this should be set to false (as is the "
"default)."
),
)
documents: list[dict[str, str]] | None = Field(
default=None,
description=(
"A list of dicts representing documents that will be accessible to "
"the model if it is performing RAG (retrieval-augmented generation)."
" If the template does not support RAG, this argument will have no "
"effect. We recommend that each document should be a dict containing "
'"title" and "text" keys.'
),
)
# NOTE(review): "conversion" in the description is presumably meant to read
# "conversation" — confirm before changing the user-facing text.
chat_template: str | None = Field(
default=None,
description=(
"A Jinja template to use for this conversion. "
"As of transformers v4.44, default chat template is no longer "
"allowed, so you must provide a chat template if the tokenizer "
"does not define one."
),
)
chat_template_kwargs: dict[str, Any] | None = Field(
default=None,
description=(
"Additional keyword args to pass to the template renderer. "
"Will be accessible by the chat template."
),
)
mm_processor_kwargs: dict[str, Any] | None = Field(
default=None,
description=("Additional kwargs to pass to the HF processor."),
)
# to_sampling_params() creates/fills this when `response_format` is set.
structured_outputs: StructuredOutputsParams | None = Field(
default=None,
description="Additional kwargs for structured outputs",
)
priority: int = Field(
default=0,
description=(
"The priority of the request (lower means earlier handling; "
"default: 0). Any priority other than 0 will raise an error "
"if the served model does not use priority scheduling."
),
)
# NOTE(review): "through out" / "return in response" in the description look
# like typos ("throughout" / "returned in the response") — confirm.
request_id: str = Field(
default_factory=random_uuid,
description=(
"The request_id related to this request. If the caller does "
"not set it, a random_uuid will be generated. This id is used "
"through out the inference process and return in response."
),
)
logits_processors: LogitsProcessors | None = Field(
default=None,
description=(
"A list of either qualified names of logits processors, or "
"constructor objects, to apply when sampling. A constructor is "
"a JSON object with a required 'qualname' field specifying the "
"qualified name of the processor class/factory, and optional "
"'args' and 'kwargs' fields containing positional and keyword "
"arguments. For example: {'qualname': "
"'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
"{'param': 'value'}}."
),
)
return_tokens_as_token_ids: bool | None = Field(
default=None,
description=(
"If specified with 'logprobs', tokens are represented "
" as strings of the form 'token_id:{token_id}' so that tokens "
"that are not JSON-encodable can be identified."
),
)
return_token_ids: bool | None = Field(
default=None,
description=(
"If specified, the result will include token IDs alongside the "
"generated text. In streaming mode, prompt_token_ids is included "
"only in the first chunk, and token_ids contains the delta tokens "
"for each chunk. This is useful for debugging or when you "
"need to map generated text back to input tokens."
),
)
# check_cache_salt_support() requires a non-empty string when this is provided.
cache_salt: str | None = Field(
default=None,
description=(
"If specified, the prefix cache will be salted with the provided "
"string to prevent an attacker to guess prompts in multi-user "
"environments. The salt should be random, protected from "
"access by 3rd parties, and long enough to be "
"unpredictable (e.g., 43 characters base64-encoded, corresponding "
"to 256 bit)."
),
)
# Forwarded to SamplingParams via `extra_args` in to_sampling_params().
kv_transfer_params: dict[str, Any] | None = Field(
default=None,
description="KVTransfer parameters used for disaggregated serving.",
)
# Used as the base of `extra_args` in to_sampling_params().
vllm_xargs: dict[str, str | int | float | list[str | int | float]] | None = Field(
default=None,
description=(
"Additional request parameters with (list of) string or "
"numeric values, used by custom extensions."
),
)
# --8<-- [end:chat-completion-extra-params]
# Default sampling parameters for chat completion requests
# Last-resort values: used by to_sampling_params()/to_beam_search_params() only
# when neither the request nor the `default_sampling_params` argument provides
# the key.
_DEFAULT_SAMPLING_PARAMS: dict = {
"repetition_penalty": 1.0,
"temperature": 1.0,
"top_p": 1.0,
"top_k": 0,
"min_p": 0.0,
}
def to_beam_search_params(
    self, max_tokens: int, default_sampling_params: dict
) -> BeamSearchParams:
    """Build the ``BeamSearchParams`` for this request.

    ``n`` becomes the beam width (1 when ``n`` is ``None``).  A missing
    ``temperature`` is taken from ``default_sampling_params`` and, failing
    that, from ``_DEFAULT_SAMPLING_PARAMS``.
    """
    beam_width = 1 if self.n is None else self.n

    temperature = self.temperature
    if temperature is None:
        builtin_default = self._DEFAULT_SAMPLING_PARAMS["temperature"]
        temperature = default_sampling_params.get("temperature", builtin_default)

    return BeamSearchParams(
        beam_width=beam_width,
        max_tokens=max_tokens,
        ignore_eos=self.ignore_eos,
        temperature=temperature,
        length_penalty=self.length_penalty,
        include_stop_str_in_output=self.include_stop_str_in_output,
    )
def to_sampling_params(
    self,
    max_tokens: int,
    logits_processor_pattern: str | None,
    default_sampling_params: dict,
) -> SamplingParams:
    """Translate this request into ``SamplingParams``.

    Args:
        max_tokens: Generation budget passed through unchanged.
        logits_processor_pattern: Forwarded to ``get_logits_processors``
            together with the request's ``logits_processors``.
        default_sampling_params: Fallback values consulted for
            ``repetition_penalty``, ``temperature``, ``top_p``, ``top_k``
            and ``min_p`` when the request leaves them as ``None``; keys
            missing here fall back to ``_DEFAULT_SAMPLING_PARAMS``.

    Returns:
        The ``SamplingParams`` built with ``SamplingParams.from_optional``.

    Side effects:
        When ``response_format`` is set, ``self.structured_outputs`` is
        created if needed and updated to reflect the requested format.
        ``self.vllm_xargs`` is NOT modified (it is copied before
        ``kv_transfer_params`` is added).
    """

    def _resolve(name: str):
        # Request value wins; otherwise the supplied default, then the
        # built-in default for that parameter.
        value = getattr(self, name)
        if value is None:
            value = default_sampling_params.get(
                name, self._DEFAULT_SAMPLING_PARAMS[name]
            )
        return value

    repetition_penalty = _resolve("repetition_penalty")
    temperature = _resolve("temperature")
    top_p = _resolve("top_p")
    top_k = _resolve("top_k")
    min_p = _resolve("min_p")

    # With `echo` and no explicit prompt_logprobs, reuse top_logprobs.
    prompt_logprobs = self.prompt_logprobs
    if prompt_logprobs is None and self.echo:
        prompt_logprobs = self.top_logprobs

    response_format = self.response_format
    if response_format is not None:
        # If structured outputs wasn't already enabled,
        # we must enable it for these features to work
        if self.structured_outputs is None:
            self.structured_outputs = StructuredOutputsParams()

        # Set structured output params for response format
        if response_format.type == "json_object":
            self.structured_outputs.json_object = True
        elif response_format.type == "json_schema":
            json_schema = response_format.json_schema
            assert json_schema is not None
            self.structured_outputs.json = json_schema.json_schema
        elif response_format.type == "structural_tag":
            structural_tag = response_format
            assert structural_tag is not None and isinstance(
                structural_tag,
                (
                    LegacyStructuralTagResponseFormat,
                    StructuralTagResponseFormat,
                ),
            )
            s_tag_obj = structural_tag.model_dump(by_alias=True)
            self.structured_outputs.structural_tag = json.dumps(s_tag_obj)

    # Copy before extending: writing into `self.vllm_xargs` directly would
    # leak `kv_transfer_params` into the request's own field.
    extra_args: dict[str, Any] = dict(self.vllm_xargs) if self.vllm_xargs else {}
    if self.kv_transfer_params:
        # Pass in kv_transfer_params via extra_args
        extra_args["kv_transfer_params"] = self.kv_transfer_params

    return SamplingParams.from_optional(
        n=self.n,
        presence_penalty=self.presence_penalty,
        frequency_penalty=self.frequency_penalty,
        repetition_penalty=repetition_penalty,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        min_p=min_p,
        seed=self.seed,
        stop=self.stop,
        stop_token_ids=self.stop_token_ids,
        # top_logprobs only takes effect when `logprobs` is truthy.
        logprobs=self.top_logprobs if self.logprobs else None,
        prompt_logprobs=prompt_logprobs,
        ignore_eos=self.ignore_eos,
        max_tokens=max_tokens,
        min_tokens=self.min_tokens,
        skip_special_tokens=self.skip_special_tokens,
        spaces_between_special_tokens=self.spaces_between_special_tokens,
        logits_processors=get_logits_processors(
            self.logits_processors, logits_processor_pattern
        ),
        include_stop_str_in_output=self.include_stop_str_in_output,
        truncate_prompt_tokens=self.truncate_prompt_tokens,
        output_kind=RequestOutputKind.DELTA
        if self.stream
        else RequestOutputKind.FINAL_ONLY,
        structured_outputs=self.structured_outputs,
        logit_bias=self.logit_bias,
        bad_words=self.bad_words,
        allowed_token_ids=self.allowed_token_ids,
        # An empty mapping is passed as None.
        extra_args=extra_args or None,
        skip_clone=True,  # Created fresh per request, safe to skip clone
    )
@model_validator(mode="before")
@classmethod
def validate_stream_options(cls, data):
    """Reject ``stream_options`` on a request that does not set ``stream``.

    Raises:
        VLLMValidationError: ``stream_options`` is truthy while ``stream``
            is missing or falsy.
    """
    if not data.get("stream_options"):
        return data
    if data.get("stream"):
        return data

    raise VLLMValidationError(
        "Stream options can only be defined when `stream=True`.",
        parameter="stream_options",
    )
@model_validator(mode="before")
@classmethod
def check_logprobs(cls, data):
    """Validate ``prompt_logprobs`` and ``top_logprobs`` on the raw input.

    Raises:
        VLLMValidationError: ``prompt_logprobs`` is requested (> 0 or -1)
            together with ``stream``; either value is negative other than
            -1; or ``top_logprobs`` is requested without ``logprobs``.
    """
    prompt_logprobs = data.get("prompt_logprobs")
    if prompt_logprobs is not None:
        if data.get("stream"):
            if prompt_logprobs > 0 or prompt_logprobs == -1:
                raise VLLMValidationError(
                    "`prompt_logprobs` are not available when `stream=True`.",
                    parameter="prompt_logprobs",
                )

        if prompt_logprobs < 0 and prompt_logprobs != -1:
            raise VLLMValidationError(
                "`prompt_logprobs` must be a positive value or -1.",
                parameter="prompt_logprobs",
                value=prompt_logprobs,
            )

    top_logprobs = data.get("top_logprobs")
    if top_logprobs is None:
        return data

    if top_logprobs < 0 and top_logprobs != -1:
        raise VLLMValidationError(
            "`top_logprobs` must be a positive value or -1.",
            parameter="top_logprobs",
            value=top_logprobs,
        )

    # 0 means "no top logprobs", so it does not require `logprobs`.
    if top_logprobs == -1 or top_logprobs > 0:
        if not data.get("logprobs"):
            raise VLLMValidationError(
                "when using `top_logprobs`, `logprobs` must be set to true.",
                parameter="top_logprobs",
            )

    return data
@model_validator(mode="before")
@classmethod
def check_structured_outputs_count(cls, data):
    """Allow at most one of ``json``/``regex``/``choice`` in ``structured_outputs``.

    Raises:
        ValueError: more than one of the three constraint keys is non-None;
            also re-raised as-is when *data* itself is a ``ValueError``.
    """
    if isinstance(data, ValueError):
        raise data

    constraints = data.get("structured_outputs", None)
    if constraints is None:
        return data

    count = 0
    for key in ("json", "regex", "choice"):
        if constraints.get(key) is not None:
            count += 1

    # you can only use one kind of constraints for structured outputs
    if count > 1:
        raise ValueError(
            "You can only use one kind of constraints for structured "
            "outputs ('json', 'regex' or 'choice')."
        )

    # you can only either use structured outputs or tools, not both
    # NOTE(review): this branch can never run — `count > 1` has already raised
    # above. The message suggests `count >= 1` was intended; left unchanged
    # because enabling it would start rejecting currently accepted requests.
    if count > 1 and data.get("tool_choice", "none") not in (
        "none",
        "auto",
        "required",
    ):
        raise ValueError(
            "You can only either use constraints for structured outputs "
            "or tools, not both."
        )

    return data
@model_validator(mode="before")
@classmethod
def check_tool_usage(cls, data):
    """Validate and normalise ``tool_choice`` / ``tools`` on the raw input.

    Behaviour:
        * ``tools`` given without ``tool_choice`` -> ``tool_choice`` is set
          to ``"auto"``.
        * ``tool_choice == "none"`` or ``None`` -> no further checks.
        * ``tool_choice == "required"`` with an empty ``tools`` list ->
          rewritten to ``"none"`` and ``tools`` is removed.
        * A dict ``tool_choice`` must name a function present in ``tools``.

    This validator runs before field validation, so ``tools`` entries are
    still unvalidated input. Malformed entries are treated as "no match"
    here instead of raising ``KeyError``/``TypeError``; their shape is left
    to the regular field validation.

    Note: *data* is modified in place in the two rewriting cases above.

    Raises:
        ValueError: ``tool_choice`` is set without ``tools``, has an
            unsupported value, is a malformed object, or names a function
            that is not among ``tools``.
    """
    # if "tool_choice" is not specified but tools are provided,
    # default to "auto" tool_choice
    if "tool_choice" not in data and data.get("tools"):
        data["tool_choice"] = "auto"

    # if "tool_choice" is "none" -- no validation is needed for tools
    if "tool_choice" in data and data["tool_choice"] == "none":
        return data

    # if "tool_choice" is specified -- validation
    if "tool_choice" in data and data["tool_choice"] is not None:
        # ensure that if "tool choice" is specified, tools are present
        if "tools" not in data or data["tools"] is None:
            raise ValueError("When using `tool_choice`, `tools` must be set.")

        # make sure that tool choice is either a named tool
        # OR that it's set to "auto" or "required"
        if data["tool_choice"] not in ["auto", "required"] and not isinstance(
            data["tool_choice"], dict
        ):
            raise ValueError(
                f"Invalid value for `tool_choice`: {data['tool_choice']}! "
                'Only named tools, "none", "auto" or "required" '
                "are supported."
            )

        # if tool_choice is "required" but the "tools" list is empty,
        # override the data to behave like "none" to align with
        # OpenAI's behavior.
        if (
            data["tool_choice"] == "required"
            and isinstance(data["tools"], list)
            and len(data["tools"]) == 0
        ):
            data["tool_choice"] = "none"
            del data["tools"]
            return data

        # ensure that if "tool_choice" is specified as an object,
        # it matches a valid tool
        correct_usage_message = (
            'Correct usage: `{"type": "function",'
            ' "function": {"name": "my_function"}}`'
        )
        if isinstance(data["tool_choice"], dict):
            function = data["tool_choice"].get("function")
            if not isinstance(function, dict):
                raise ValueError(
                    f"Invalid value for `function`: `{function}` in "
                    f"`tool_choice`! {correct_usage_message}"
                )
            if "name" not in function:
                raise ValueError(
                    f"Expected field `name` in `function` in "
                    f"`tool_choice`! {correct_usage_message}"
                )
            function_name = function["name"]
            if not isinstance(function_name, str) or len(function_name) == 0:
                raise ValueError(
                    f"Invalid `name` in `function`: `{function_name}`"
                    f" in `tool_choice`! {correct_usage_message}"
                )

            # Tolerant lookup: entries that are not
            # {"function": {"name": ...}} dicts simply do not match.
            tools = data["tools"]
            candidates = tools if isinstance(tools, (list, tuple)) else ()
            valid_tool = any(
                isinstance(tool, dict)
                and isinstance(tool.get("function"), dict)
                and tool["function"].get("name") == function_name
                for tool in candidates
            )
            if not valid_tool:
                raise ValueError(
                    "The tool specified in `tool_choice` does not match any"
                    " of the specified `tools`"
                )
    return data
@model_validator(mode="before")
@classmethod
def check_generation_prompt(cls, data):
    """Reject input that enables both prompt-continuation modes.

    Only keys present in the raw input are inspected, so the field default
    of ``add_generation_prompt`` does not count here.

    Raises:
        ValueError: both ``continue_final_message`` and
            ``add_generation_prompt`` are truthy in *data*.
    """
    if not data.get("continue_final_message"):
        return data
    if not data.get("add_generation_prompt"):
        return data

    raise ValueError(
        "Cannot set both `continue_final_message` and "
        "`add_generation_prompt` to True."
    )
@model_validator(mode="before")
@classmethod
def check_cache_salt_support(cls, data):
    """Require ``cache_salt``, when provided, to be a non-empty string.

    Raises:
        ValueError: ``cache_salt`` is present and not ``None`` but is either
            not a ``str`` or is empty.
    """
    salt = data.get("cache_salt")
    if salt is None:
        return data

    if isinstance(salt, str) and salt:
        return data

    raise ValueError(
        "Parameter 'cache_salt' must be a non-empty string if provided."
    )
...@@ -23,16 +23,7 @@ from vllm.entrypoints.chat_utils import ( ...@@ -23,16 +23,7 @@ from vllm.entrypoints.chat_utils import (
make_tool_call_id, make_tool_call_id,
) )
from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.parser.harmony_utils import ( from vllm.entrypoints.openai.chat_completion.protocol import (
get_developer_message,
get_stop_tokens_for_assistant_actions,
get_streamable_parser_for_assistant,
get_system_message,
parse_chat_inputs_to_harmony_messages,
parse_chat_output,
render_for_completion,
)
from vllm.entrypoints.openai.protocol import (
ChatCompletionLogProb, ChatCompletionLogProb,
ChatCompletionLogProbs, ChatCompletionLogProbs,
ChatCompletionLogProbsContent, ChatCompletionLogProbsContent,
...@@ -43,6 +34,11 @@ from vllm.entrypoints.openai.protocol import ( ...@@ -43,6 +34,11 @@ from vllm.entrypoints.openai.protocol import (
ChatCompletionResponseStreamChoice, ChatCompletionResponseStreamChoice,
ChatCompletionStreamResponse, ChatCompletionStreamResponse,
ChatMessage, ChatMessage,
)
from vllm.entrypoints.openai.chat_completion.stream_harmony import (
extract_harmony_streaming_delta,
)
from vllm.entrypoints.openai.engine.protocol import (
DeltaFunctionCall, DeltaFunctionCall,
DeltaMessage, DeltaMessage,
DeltaToolCall, DeltaToolCall,
...@@ -52,14 +48,20 @@ from vllm.entrypoints.openai.protocol import ( ...@@ -52,14 +48,20 @@ from vllm.entrypoints.openai.protocol import (
ToolCall, ToolCall,
UsageInfo, UsageInfo,
) )
from vllm.entrypoints.openai.serving_chat_stream_harmony import ( from vllm.entrypoints.openai.engine.serving import (
extract_harmony_streaming_delta,
)
from vllm.entrypoints.openai.serving_engine import (
GenerationError, GenerationError,
OpenAIServing, OpenAIServing,
clamp_prompt_logprobs, clamp_prompt_logprobs,
) )
from vllm.entrypoints.openai.parser.harmony_utils import (
get_developer_message,
get_stop_tokens_for_assistant_actions,
get_streamable_parser_for_assistant,
get_system_message,
parse_chat_inputs_to_harmony_messages,
parse_chat_output,
render_for_completion,
)
from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.openai.utils import maybe_filter_parallel_tool_calls from vllm.entrypoints.openai.utils import maybe_filter_parallel_tool_calls
from vllm.entrypoints.utils import get_max_tokens, should_include_usage from vllm.entrypoints.utils import get_max_tokens, should_include_usage
......
...@@ -10,7 +10,7 @@ harmony parser state during streaming chat completions. ...@@ -10,7 +10,7 @@ harmony parser state during streaming chat completions.
from openai_harmony import StreamableParser from openai_harmony import StreamableParser
from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.chat_utils import make_tool_call_id
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.engine.protocol import (
DeltaFunctionCall, DeltaFunctionCall,
DeltaMessage, DeltaMessage,
DeltaToolCall, DeltaToolCall,
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
...@@ -11,10 +11,6 @@ from typing import Annotated, Any, ClassVar, Literal, TypeAlias ...@@ -11,10 +11,6 @@ from typing import Annotated, Any, ClassVar, Literal, TypeAlias
import regex as re import regex as re
import torch import torch
from fastapi import HTTPException, UploadFile from fastapi import HTTPException, UploadFile
from openai.types.chat.chat_completion_audio import (
ChatCompletionAudio as OpenAIChatCompletionAudio,
)
from openai.types.chat.chat_completion_message import Annotation as OpenAIAnnotation
from openai.types.responses import ( from openai.types.responses import (
ResponseCodeInterpreterCallCodeDeltaEvent, ResponseCodeInterpreterCallCodeDeltaEvent,
ResponseCodeInterpreterCallCodeDoneEvent, ResponseCodeInterpreterCallCodeDoneEvent,
...@@ -234,20 +230,6 @@ class FunctionDefinition(OpenAIBaseModel): ...@@ -234,20 +230,6 @@ class FunctionDefinition(OpenAIBaseModel):
parameters: dict[str, Any] | None = None parameters: dict[str, Any] | None = None
class ChatCompletionToolsParam(OpenAIBaseModel):
    """One tool entry; only tools of type ``"function"`` are accepted."""

    # Discriminator; "function" is both the default and the only legal value.
    type: Literal["function"] = "function"
    # Definition of the function this tool exposes.
    function: FunctionDefinition
class ChatCompletionNamedFunction(OpenAIBaseModel):
    """Reference to a function by its name only."""

    name: str
class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel):
    """Tool-choice value that names one specific function."""

    # The function being selected, identified by name.
    function: ChatCompletionNamedFunction
    # Discriminator; "function" is both the default and the only legal value.
    type: Literal["function"] = "function"
# extra="forbid" is a workaround to have kwargs as a field, # extra="forbid" is a workaround to have kwargs as a field,
# see https://github.com/pydantic/pydantic/issues/3125 # see https://github.com/pydantic/pydantic/issues/3125
class LogitsProcessorConstructor(BaseModel): class LogitsProcessorConstructor(BaseModel):
...@@ -414,609 +396,66 @@ class ResponsesRequest(OpenAIBaseModel): ...@@ -414,609 +396,66 @@ class ResponsesRequest(OpenAIBaseModel):
) )
if (top_k := self.top_k) is None: if (top_k := self.top_k) is None:
top_k = default_sampling_params.get( top_k = default_sampling_params.get(
"top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"] "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
)
stop_token_ids = default_sampling_params.get("stop_token_ids")
# Structured output
structured_outputs = None
if self.text is not None and self.text.format is not None:
response_format = self.text.format
if (
response_format.type == "json_schema"
and response_format.schema_ is not None
):
structured_outputs = StructuredOutputsParams(
json=response_format.schema_
)
elif response_format.type == "json_object":
raise NotImplementedError("json_object is not supported")
# TODO: add more parameters
return SamplingParams.from_optional(
temperature=temperature,
top_p=top_p,
top_k=top_k,
max_tokens=max_tokens,
logprobs=self.top_logprobs if self.is_include_output_logprobs() else None,
stop_token_ids=stop_token_ids,
output_kind=(
RequestOutputKind.DELTA if self.stream else RequestOutputKind.FINAL_ONLY
),
structured_outputs=structured_outputs,
logit_bias=self.logit_bias,
skip_clone=True, # Created fresh per request, safe to skip clone
)
def is_include_output_logprobs(self) -> bool:
    """Report whether output-token logprobs were requested via ``include``.

    Returns:
        True only when ``self.include`` is a list that contains
        ``"message.output_text.logprobs"``; False in every other case,
        including when ``include`` is None.
    """
    requested = self.include
    if not isinstance(requested, list):
        # Covers None as well as any other non-list value.
        return False
    return "message.output_text.logprobs" in requested
@model_validator(mode="before")
def validate_background(cls, data):
    """Reject requests that ask for ``background`` while disabling ``store``.

    ``store`` counts as enabled when the key is absent.

    Raises:
        ValueError: if ``background`` is truthy and ``store`` is falsy.
    """
    wants_background = data.get("background")
    if wants_background and not data.get("store", True):
        raise ValueError("background can only be used when `store` is true")
    return data
@model_validator(mode="before")
def validate_prompt(cls, data):
    """Reject any request that supplies a ``prompt`` value.

    Raises:
        VLLMValidationError: if ``prompt`` is present and not None.
    """
    if data.get("prompt") is None:
        return data
    raise VLLMValidationError(
        "prompt template is not supported", parameter="prompt"
    )
@model_validator(mode="before")
def check_cache_salt_support(cls, data):
    """Require ``cache_salt``, when given, to be a non-empty string.

    Raises:
        ValueError: if ``cache_salt`` is not None and is either not a
            ``str`` or is the empty string.
    """
    salt = data.get("cache_salt")
    if salt is None:
        return data
    if isinstance(salt, str) and salt:
        return data
    raise ValueError(
        "Parameter 'cache_salt' must be a non-empty string if provided."
    )
@model_validator(mode="before")
def function_call_parsing(cls, data):
    """Pre-convert ``function_call`` dicts in ``input`` to typed objects.

    Each dict item whose ``type`` is ``"function_call"`` is replaced by a
    ``ResponseFunctionToolCall`` so that Pydantic can resolve the union type
    of the ``input`` field. Items that fail that conversion are kept as-is
    so Pydantic reports the error during normal validation.

    Returns:
        ``data``, with ``data["input"]`` replaced by the processed list
        unless ``input`` is None, a str/bytes value, or not iterable.
    """
    raw_input = data.get("input")

    # None needs no work; str/bytes are iterable but are not item lists.
    if raw_input is None or isinstance(raw_input, (str, bytes)):
        return data

    if isinstance(raw_input, list):
        items = raw_input
    else:
        # Materialize iterators (like ValidatorIterator) into a list.
        try:
            items = list(raw_input)
        except TypeError:
            # Not iterable, leave as-is for Pydantic to handle
            return data

    def _coerce(entry):
        # Only dicts tagged as function calls are candidates for conversion.
        if not (isinstance(entry, dict) and entry.get("type") == "function_call"):
            return entry
        try:
            return ResponseFunctionToolCall(**entry)
        except ValidationError:
            # Let Pydantic handle validation for malformed function calls
            logger.debug(
                "Failed to parse function_call to ResponseFunctionToolCall, "
                "leaving for Pydantic validation"
            )
            return entry

    data["input"] = [_coerce(entry) for entry in items]
    return data
class ChatCompletionRequest(OpenAIBaseModel):
# Ordered by official OpenAI API documentation
# https://platform.openai.com/docs/api-reference/chat/create
messages: list[ChatCompletionMessageParam]
model: str | None = None
frequency_penalty: float | None = 0.0
logit_bias: dict[str, float] | None = None
logprobs: bool | None = False
top_logprobs: int | None = 0
max_tokens: int | None = Field(
default=None,
deprecated="max_tokens is deprecated in favor of "
"the max_completion_tokens field",
)
max_completion_tokens: int | None = None
n: int | None = 1
presence_penalty: float | None = 0.0
response_format: AnyResponseFormat | None = None
seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
stop: str | list[str] | None = []
stream: bool | None = False
stream_options: StreamOptions | None = None
temperature: float | None = None
top_p: float | None = None
tools: list[ChatCompletionToolsParam] | None = None
tool_choice: (
Literal["none"]
| Literal["auto"]
| Literal["required"]
| ChatCompletionNamedToolChoiceParam
| None
) = "none"
reasoning_effort: Literal["low", "medium", "high"] | None = None
include_reasoning: bool = True
parallel_tool_calls: bool | None = True
# NOTE this will be ignored by vLLM
user: str | None = None
# --8<-- [start:chat-completion-sampling-params]
use_beam_search: bool = False
top_k: int | None = None
min_p: float | None = None
repetition_penalty: float | None = None
length_penalty: float = 1.0
stop_token_ids: list[int] | None = []
include_stop_str_in_output: bool = False
ignore_eos: bool = False
min_tokens: int = 0
skip_special_tokens: bool = True
spaces_between_special_tokens: bool = True
truncate_prompt_tokens: Annotated[int, Field(ge=-1, le=_LONG_INFO.max)] | None = (
None
)
prompt_logprobs: int | None = None
allowed_token_ids: list[int] | None = None
bad_words: list[str] = Field(default_factory=list)
# --8<-- [end:chat-completion-sampling-params]
# --8<-- [start:chat-completion-extra-params]
echo: bool = Field(
default=False,
description=(
"If true, the new message will be prepended with the last message "
"if they belong to the same role."
),
)
add_generation_prompt: bool = Field(
default=True,
description=(
"If true, the generation prompt will be added to the chat template. "
"This is a parameter used by chat template in tokenizer config of the "
"model."
),
)
continue_final_message: bool = Field(
default=False,
description=(
"If this is set, the chat will be formatted so that the final "
"message in the chat is open-ended, without any EOS tokens. The "
"model will continue this message rather than starting a new one. "
'This allows you to "prefill" part of the model\'s response for it. '
"Cannot be used at the same time as `add_generation_prompt`."
),
)
add_special_tokens: bool = Field(
default=False,
description=(
"If true, special tokens (e.g. BOS) will be added to the prompt "
"on top of what is added by the chat template. "
"For most models, the chat template takes care of adding the "
"special tokens so this should be set to false (as is the "
"default)."
),
)
documents: list[dict[str, str]] | None = Field(
default=None,
description=(
"A list of dicts representing documents that will be accessible to "
"the model if it is performing RAG (retrieval-augmented generation)."
" If the template does not support RAG, this argument will have no "
"effect. We recommend that each document should be a dict containing "
'"title" and "text" keys.'
),
)
chat_template: str | None = Field(
default=None,
description=(
"A Jinja template to use for this conversion. "
"As of transformers v4.44, default chat template is no longer "
"allowed, so you must provide a chat template if the tokenizer "
"does not define one."
),
)
chat_template_kwargs: dict[str, Any] | None = Field(
default=None,
description=(
"Additional keyword args to pass to the template renderer. "
"Will be accessible by the chat template."
),
)
mm_processor_kwargs: dict[str, Any] | None = Field(
default=None,
description=("Additional kwargs to pass to the HF processor."),
)
structured_outputs: StructuredOutputsParams | None = Field(
default=None,
description="Additional kwargs for structured outputs",
)
priority: int = Field(
default=0,
description=(
"The priority of the request (lower means earlier handling; "
"default: 0). Any priority other than 0 will raise an error "
"if the served model does not use priority scheduling."
),
)
request_id: str = Field(
default_factory=random_uuid,
description=(
"The request_id related to this request. If the caller does "
"not set it, a random_uuid will be generated. This id is used "
"through out the inference process and return in response."
),
)
logits_processors: LogitsProcessors | None = Field(
default=None,
description=(
"A list of either qualified names of logits processors, or "
"constructor objects, to apply when sampling. A constructor is "
"a JSON object with a required 'qualname' field specifying the "
"qualified name of the processor class/factory, and optional "
"'args' and 'kwargs' fields containing positional and keyword "
"arguments. For example: {'qualname': "
"'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
"{'param': 'value'}}."
),
)
return_tokens_as_token_ids: bool | None = Field(
default=None,
description=(
"If specified with 'logprobs', tokens are represented "
" as strings of the form 'token_id:{token_id}' so that tokens "
"that are not JSON-encodable can be identified."
),
)
return_token_ids: bool | None = Field(
default=None,
description=(
"If specified, the result will include token IDs alongside the "
"generated text. In streaming mode, prompt_token_ids is included "
"only in the first chunk, and token_ids contains the delta tokens "
"for each chunk. This is useful for debugging or when you "
"need to map generated text back to input tokens."
),
)
cache_salt: str | None = Field(
default=None,
description=(
"If specified, the prefix cache will be salted with the provided "
"string to prevent an attacker to guess prompts in multi-user "
"environments. The salt should be random, protected from "
"access by 3rd parties, and long enough to be "
"unpredictable (e.g., 43 characters base64-encoded, corresponding "
"to 256 bit)."
),
)
kv_transfer_params: dict[str, Any] | None = Field(
default=None,
description="KVTransfer parameters used for disaggregated serving.",
)
vllm_xargs: dict[str, str | int | float | list[str | int | float]] | None = Field(
default=None,
description=(
"Additional request parameters with (list of) string or "
"numeric values, used by custom extensions."
),
)
# --8<-- [end:chat-completion-extra-params]
# Default sampling parameters for chat completion requests
_DEFAULT_SAMPLING_PARAMS: dict = {
"repetition_penalty": 1.0,
"temperature": 1.0,
"top_p": 1.0,
"top_k": 0,
"min_p": 0.0,
}
def to_beam_search_params(
    self, max_tokens: int, default_sampling_params: dict
) -> BeamSearchParams:
    """Build ``BeamSearchParams`` from this request.

    Args:
        max_tokens: Token budget, passed through unchanged.
        default_sampling_params: Consulted for ``temperature`` when the
            request does not set one.

    Returns:
        Beam search parameters whose beam width is ``self.n`` (1 when
        ``n`` is None).
    """
    beam_width = 1 if self.n is None else self.n

    temperature = self.temperature
    if temperature is None:
        # Caller-supplied defaults first, then the class-level fallback.
        fallback = self._DEFAULT_SAMPLING_PARAMS["temperature"]
        temperature = default_sampling_params.get("temperature", fallback)

    return BeamSearchParams(
        beam_width=beam_width,
        max_tokens=max_tokens,
        ignore_eos=self.ignore_eos,
        temperature=temperature,
        length_penalty=self.length_penalty,
        include_stop_str_in_output=self.include_stop_str_in_output,
    )
def to_sampling_params(
    self,
    max_tokens: int,
    logits_processor_pattern: str | None,
    default_sampling_params: dict,
) -> SamplingParams:
    """Build the ``SamplingParams`` for this chat completion request.

    Args:
        max_tokens: Token budget, passed through unchanged.
        logits_processor_pattern: Forwarded to ``get_logits_processors``
            together with the request's ``logits_processors``.
        default_sampling_params: Fallback values consulted for
            ``repetition_penalty``, ``temperature``, ``top_p``, ``top_k``
            and ``min_p`` when the request leaves them as None.

    Returns:
        The object produced by ``SamplingParams.from_optional``.

    Note:
        When ``response_format`` is set, ``self.structured_outputs`` is
        created if missing and updated in place.
    """

    def _resolve(name: str):
        # Request value wins, then the supplied defaults, then the
        # class-level fallback table.
        value = getattr(self, name)
        if value is None:
            value = default_sampling_params.get(
                name, self._DEFAULT_SAMPLING_PARAMS[name]
            )
        return value

    repetition_penalty = _resolve("repetition_penalty")
    temperature = _resolve("temperature")
    top_p = _resolve("top_p")
    top_k = _resolve("top_k")
    min_p = _resolve("min_p")

    # With echo enabled and no explicit value, prompt logprobs follow
    # top_logprobs.
    prompt_logprobs = self.prompt_logprobs
    if prompt_logprobs is None and self.echo:
        prompt_logprobs = self.top_logprobs

    response_format = self.response_format
    if response_format is not None:
        # If structured outputs wasn't already enabled,
        # we must enable it for these features to work
        if self.structured_outputs is None:
            self.structured_outputs = StructuredOutputsParams()

        # Set structured output params for response format
        if response_format.type == "json_object":
            self.structured_outputs.json_object = True
        elif response_format.type == "json_schema":
            json_schema = response_format.json_schema
            assert json_schema is not None
            self.structured_outputs.json = json_schema.json_schema
        elif response_format.type == "structural_tag":
            assert isinstance(
                response_format,
                (
                    LegacyStructuralTagResponseFormat,
                    StructuralTagResponseFormat,
                ),
            )
            s_tag_obj = response_format.model_dump(by_alias=True)
            self.structured_outputs.structural_tag = json.dumps(s_tag_obj)

    # Copy the caller's dict: the kv_transfer_params entry added below
    # must not be written into the request's own vllm_xargs.
    extra_args: dict[str, Any] = dict(self.vllm_xargs) if self.vllm_xargs else {}
    if self.kv_transfer_params:
        # Pass in kv_transfer_params via extra_args
        extra_args["kv_transfer_params"] = self.kv_transfer_params

    return SamplingParams.from_optional(
        n=self.n,
        presence_penalty=self.presence_penalty,
        frequency_penalty=self.frequency_penalty,
        repetition_penalty=repetition_penalty,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        min_p=min_p,
        seed=self.seed,
        stop=self.stop,
        stop_token_ids=self.stop_token_ids,
        logprobs=self.top_logprobs if self.logprobs else None,
        prompt_logprobs=prompt_logprobs,
        ignore_eos=self.ignore_eos,
        max_tokens=max_tokens,
        min_tokens=self.min_tokens,
        skip_special_tokens=self.skip_special_tokens,
        spaces_between_special_tokens=self.spaces_between_special_tokens,
        logits_processors=get_logits_processors(
            self.logits_processors, logits_processor_pattern
        ),
        include_stop_str_in_output=self.include_stop_str_in_output,
        truncate_prompt_tokens=self.truncate_prompt_tokens,
        output_kind=RequestOutputKind.DELTA
        if self.stream
        else RequestOutputKind.FINAL_ONLY,
        structured_outputs=self.structured_outputs,
        logit_bias=self.logit_bias,
        bad_words=self.bad_words,
        allowed_token_ids=self.allowed_token_ids,
        extra_args=extra_args or None,
        skip_clone=True,  # Created fresh per request, safe to skip clone
    )
@model_validator(mode="before")
@classmethod
def validate_stream_options(cls, data):
    """Allow ``stream_options`` only on streaming requests.

    Raises:
        VLLMValidationError: if ``stream_options`` is truthy while
            ``stream`` is falsy or absent.
    """
    if not data.get("stream_options") or data.get("stream"):
        return data
    raise VLLMValidationError(
        "Stream options can only be defined when `stream=True`.",
        parameter="stream_options",
    )
@model_validator(mode="before")
@classmethod
def check_logprobs(cls, data):
    """Validate ``prompt_logprobs`` and ``top_logprobs`` before parsing.

    For both fields, negative values other than -1 are rejected (0 is
    accepted). ``prompt_logprobs`` of -1 or above 0 is not allowed with
    ``stream``; ``top_logprobs`` of -1 or above 0 requires ``logprobs``.

    Raises:
        VLLMValidationError: on any of the violations above.
    """
    prompt_lp = data.get("prompt_logprobs")
    if prompt_lp is not None:
        if data.get("stream") and (prompt_lp > 0 or prompt_lp == -1):
            raise VLLMValidationError(
                "`prompt_logprobs` are not available when `stream=True`.",
                parameter="prompt_logprobs",
            )
        if prompt_lp < 0 and prompt_lp != -1:
            raise VLLMValidationError(
                "`prompt_logprobs` must be a positive value or -1.",
                parameter="prompt_logprobs",
                value=prompt_lp,
            )

    top_lp = data.get("top_logprobs")
    if top_lp is None:
        return data

    if top_lp < 0 and top_lp != -1:
        raise VLLMValidationError(
            "`top_logprobs` must be a positive value or -1.",
            parameter="top_logprobs",
            value=top_lp,
        )
    if (top_lp == -1 or top_lp > 0) and not data.get("logprobs"):
        raise VLLMValidationError(
            "when using `top_logprobs`, `logprobs` must be set to true.",
            parameter="top_logprobs",
        )
    return data
@model_validator(mode="before")
@classmethod
def check_structured_outputs_count(cls, data):
if isinstance(data, ValueError):
raise data
if data.get("structured_outputs", None) is None:
return data
structured_outputs_kwargs = data["structured_outputs"]
count = sum(
structured_outputs_kwargs.get(k) is not None
for k in ("json", "regex", "choice")
)
# you can only use one kind of constraints for structured outputs
if count > 1:
raise ValueError(
"You can only use one kind of constraints for structured "
"outputs ('json', 'regex' or 'choice')."
)
# you can only either use structured outputs or tools, not both
if count > 1 and data.get("tool_choice", "none") not in (
"none",
"auto",
"required",
):
raise ValueError(
"You can only either use constraints for structured outputs "
"or tools, not both."
) )
return data stop_token_ids = default_sampling_params.get("stop_token_ids")
@model_validator(mode="before")
@classmethod
def check_tool_usage(cls, data):
# if "tool_choice" is not specified but tools are provided,
# default to "auto" tool_choice
if "tool_choice" not in data and data.get("tools"):
data["tool_choice"] = "auto"
# if "tool_choice" is "none" -- no validation is needed for tools
if "tool_choice" in data and data["tool_choice"] == "none":
return data
# if "tool_choice" is specified -- validation
if "tool_choice" in data and data["tool_choice"] is not None:
# ensure that if "tool choice" is specified, tools are present
if "tools" not in data or data["tools"] is None:
raise ValueError("When using `tool_choice`, `tools` must be set.")
# make sure that tool choice is either a named tool # Structured output
# OR that it's set to "auto" or "required" structured_outputs = None
if data["tool_choice"] not in ["auto", "required"] and not isinstance( if self.text is not None and self.text.format is not None:
data["tool_choice"], dict response_format = self.text.format
if (
response_format.type == "json_schema"
and response_format.schema_ is not None
): ):
raise ValueError( structured_outputs = StructuredOutputsParams(
f"Invalid value for `tool_choice`: {data['tool_choice']}! " json=response_format.schema_
'Only named tools, "none", "auto" or "required" '
"are supported."
) )
elif response_format.type == "json_object":
raise NotImplementedError("json_object is not supported")
# if tool_choice is "required" but the "tools" list is empty, # TODO: add more parameters
# override the data to behave like "none" to align with return SamplingParams.from_optional(
# OpenAI’s behavior. temperature=temperature,
if ( top_p=top_p,
data["tool_choice"] == "required" top_k=top_k,
and isinstance(data["tools"], list) max_tokens=max_tokens,
and len(data["tools"]) == 0 logprobs=self.top_logprobs if self.is_include_output_logprobs() else None,
): stop_token_ids=stop_token_ids,
data["tool_choice"] = "none" output_kind=(
del data["tools"] RequestOutputKind.DELTA if self.stream else RequestOutputKind.FINAL_ONLY
return data ),
structured_outputs=structured_outputs,
logit_bias=self.logit_bias,
skip_clone=True, # Created fresh per request, safe to skip clone
)
# ensure that if "tool_choice" is specified as an object, def is_include_output_logprobs(self) -> bool:
# it matches a valid tool """Check if the request includes output logprobs."""
correct_usage_message = ( if self.include is None:
'Correct usage: `{"type": "function",' return False
' "function": {"name": "my_function"}}`' return (
) isinstance(self.include, list)
if isinstance(data["tool_choice"], dict): and "message.output_text.logprobs" in self.include
valid_tool = False )
function = data["tool_choice"].get("function")
if not isinstance(function, dict): @model_validator(mode="before")
raise ValueError( def validate_background(cls, data):
f"Invalid value for `function`: `{function}` in " if not data.get("background"):
f"`tool_choice`! {correct_usage_message}" return data
) if not data.get("store", True):
if "name" not in function: raise ValueError("background can only be used when `store` is true")
raise ValueError(
f"Expected field `name` in `function` in "
f"`tool_choice`! {correct_usage_message}"
)
function_name = function["name"]
if not isinstance(function_name, str) or len(function_name) == 0:
raise ValueError(
f"Invalid `name` in `function`: `{function_name}`"
f" in `tool_choice`! {correct_usage_message}"
)
for tool in data["tools"]:
if tool["function"]["name"] == function_name:
valid_tool = True
break
if not valid_tool:
raise ValueError(
"The tool specified in `tool_choice` does not match any"
" of the specified `tools`"
)
return data return data
@model_validator(mode="before") @model_validator(mode="before")
@classmethod def validate_prompt(cls, data):
def check_generation_prompt(cls, data): if data.get("prompt") is not None:
if data.get("continue_final_message") and data.get("add_generation_prompt"): raise VLLMValidationError(
raise ValueError( "prompt template is not supported", parameter="prompt"
"Cannot set both `continue_final_message` and "
"`add_generation_prompt` to True."
) )
return data return data
@model_validator(mode="before") @model_validator(mode="before")
@classmethod
def check_cache_salt_support(cls, data): def check_cache_salt_support(cls, data):
if data.get("cache_salt") is not None and ( if data.get("cache_salt") is not None and (
not isinstance(data["cache_salt"], str) or not data["cache_salt"] not isinstance(data["cache_salt"], str) or not data["cache_salt"]
...@@ -1026,6 +465,48 @@ class ChatCompletionRequest(OpenAIBaseModel): ...@@ -1026,6 +465,48 @@ class ChatCompletionRequest(OpenAIBaseModel):
) )
return data return data
@model_validator(mode="before")
def function_call_parsing(cls, data):
    """Pre-convert ``function_call`` dicts in ``input`` to typed objects.

    Each dict item whose ``type`` is ``"function_call"`` is replaced by a
    ``ResponseFunctionToolCall`` so that Pydantic can resolve the union type
    of the ``input`` field. Items that fail that conversion are kept as-is
    so Pydantic reports the error during normal validation.

    Returns:
        ``data``, with ``data["input"]`` replaced by the processed list
        unless ``input`` is None, a str/bytes value, or not iterable.
    """
    raw_input = data.get("input")

    # None needs no work; str/bytes are iterable but are not item lists.
    if raw_input is None or isinstance(raw_input, (str, bytes)):
        return data

    if isinstance(raw_input, list):
        items = raw_input
    else:
        # Materialize iterators (like ValidatorIterator) into a list.
        try:
            items = list(raw_input)
        except TypeError:
            # Not iterable, leave as-is for Pydantic to handle
            return data

    def _coerce(entry):
        # Only dicts tagged as function calls are candidates for conversion.
        if not (isinstance(entry, dict) and entry.get("type") == "function_call"):
            return entry
        try:
            return ResponseFunctionToolCall(**entry)
        except ValidationError:
            # Let Pydantic handle validation for malformed function calls
            logger.debug(
                "Failed to parse function_call to ResponseFunctionToolCall, "
                "leaving for Pydantic validation"
            )
            return entry

    data["input"] = [_coerce(entry) for entry in items]
    return data
class CompletionRequest(OpenAIBaseModel): class CompletionRequest(OpenAIBaseModel):
# Ordered by official OpenAI API documentation # Ordered by official OpenAI API documentation
...@@ -1486,75 +967,6 @@ class ExtractedToolCallInformation(BaseModel): ...@@ -1486,75 +967,6 @@ class ExtractedToolCallInformation(BaseModel):
content: str | None = None content: str | None = None
class ChatMessage(OpenAIBaseModel):
    """Message payload of a chat completion response choice."""

    role: str
    content: str | None = None
    refusal: str | None = None
    annotations: OpenAIAnnotation | None = None
    audio: OpenAIChatCompletionAudio | None = None
    function_call: FunctionCall | None = None
    tool_calls: list[ToolCall] = Field(default_factory=list)

    # vLLM-specific fields that are not in OpenAI spec
    reasoning: str | None = None
    reasoning_content: str | None = None
    """Deprecated: use `reasoning` instead."""

    @model_validator(mode="after")
    def handle_deprecated_reasoning_content(self):
        """Copy reasoning to reasoning_content for backward compatibility."""
        # Unconditional: whatever reasoning_content held is overwritten.
        self.reasoning_content = self.reasoning
        return self
class ChatCompletionLogProb(OpenAIBaseModel):
    """Log-probability record for a single token."""

    token: str
    # Value used when no logprob is supplied.
    logprob: float = -9999.0
    bytes: list[int] | None = None
class ChatCompletionLogProbsContent(ChatCompletionLogProb):
    """A token's logprob record plus a list of alternative-token records."""

    # Workaround: redefine fields name cache so that it's not
    # shared with the super class.
    field_names: ClassVar[set[str] | None] = None
    top_logprobs: list[ChatCompletionLogProb] = Field(default_factory=list)
class ChatCompletionLogProbs(OpenAIBaseModel):
    """Container for per-token logprob entries; ``content`` may be None."""

    content: list[ChatCompletionLogProbsContent] | None = None
class ChatCompletionResponseChoice(OpenAIBaseModel):
    """One choice of a non-streaming chat completion response."""

    index: int
    message: ChatMessage
    logprobs: ChatCompletionLogProbs | None = None
    # per OpenAI spec this is the default
    finish_reason: str | None = "stop"
    # not part of the OpenAI spec but included in vLLM for legacy reasons
    stop_reason: int | str | None = None
    # not part of the OpenAI spec but is useful for tracing the tokens
    # in agent scenarios
    token_ids: list[int] | None = None
class ChatCompletionResponse(OpenAIBaseModel):
    """Top-level body of a non-streaming chat completion response."""

    # Generated per instance as "chatcmpl-" followed by a random uuid.
    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
    object: Literal["chat.completion"] = "chat.completion"
    # Creation time in whole seconds, taken from time.time().
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[ChatCompletionResponseChoice]
    service_tier: Literal["auto", "default", "flex", "scale", "priority"] | None = None
    system_fingerprint: str | None = None
    usage: UsageInfo

    # vLLM-specific fields that are not in OpenAI spec
    prompt_logprobs: list[dict[int, Logprob] | None] | None = None
    prompt_token_ids: list[int] | None = None
    kv_transfer_params: dict[str, Any] | None = Field(
        default=None, description="KVTransfer parameters."
    )
class DeltaMessage(OpenAIBaseModel): class DeltaMessage(OpenAIBaseModel):
role: str | None = None role: str | None = None
content: str | None = None content: str | None = None
...@@ -1570,27 +982,6 @@ class DeltaMessage(OpenAIBaseModel): ...@@ -1570,27 +982,6 @@ class DeltaMessage(OpenAIBaseModel):
return self return self
class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
    """One choice inside a streamed chat completion chunk."""

    index: int
    delta: DeltaMessage
    logprobs: ChatCompletionLogProbs | None = None
    finish_reason: str | None = None
    stop_reason: int | str | None = None
    # not part of the OpenAI spec but for tracing the tokens
    token_ids: list[int] | None = None
class ChatCompletionStreamResponse(OpenAIBaseModel):
    """Top-level body of one streamed chat completion chunk."""

    # Generated per instance as "chatcmpl-" followed by a random uuid.
    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
    object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
    # Creation time in whole seconds, taken from time.time().
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: list[ChatCompletionResponseStreamChoice]
    usage: UsageInfo | None = Field(default=None)
    # not part of the OpenAI spec but for tracing the tokens
    prompt_token_ids: list[int] | None = None
class TranscriptionResponseStreamChoice(OpenAIBaseModel): class TranscriptionResponseStreamChoice(OpenAIBaseModel):
delta: DeltaMessage delta: DeltaMessage
finish_reason: str | None = None finish_reason: str | None = None
...@@ -1856,128 +1247,6 @@ StreamingResponsesResponse: TypeAlias = ( ...@@ -1856,128 +1247,6 @@ StreamingResponsesResponse: TypeAlias = (
) )
class TokenizeCompletionRequest(OpenAIBaseModel):
    """Tokenize request carrying a plain ``prompt`` string."""

    model: str | None = None
    prompt: str

    add_special_tokens: bool = Field(
        default=True,
        description=(
            "If true (the default), special tokens (e.g. BOS) will be added to "
            "the prompt."
        ),
    )
    return_token_strs: bool | None = Field(
        default=False,
        description=(
            "If true, also return the token strings corresponding to the token ids."
        ),
    )
class TokenizeChatRequest(OpenAIBaseModel):
    """Tokenize request carrying chat ``messages`` and template options."""

    model: str | None = None
    messages: list[ChatCompletionMessageParam]

    add_generation_prompt: bool = Field(
        default=True,
        description=(
            "If true, the generation prompt will be added to the chat template. "
            "This is a parameter used by chat template in tokenizer config of the "
            "model."
        ),
    )
    return_token_strs: bool | None = Field(
        default=False,
        description=(
            "If true, also return the token strings corresponding to the token ids."
        ),
    )
    continue_final_message: bool = Field(
        default=False,
        description=(
            "If this is set, the chat will be formatted so that the final "
            "message in the chat is open-ended, without any EOS tokens. The "
            "model will continue this message rather than starting a new one. "
            'This allows you to "prefill" part of the model\'s response for it. '
            "Cannot be used at the same time as `add_generation_prompt`."
        ),
    )
    add_special_tokens: bool = Field(
        default=False,
        description=(
            "If true, special tokens (e.g. BOS) will be added to the prompt "
            "on top of what is added by the chat template. "
            "For most models, the chat template takes care of adding the "
            "special tokens so this should be set to false (as is the "
            "default)."
        ),
    )
    chat_template: str | None = Field(
        default=None,
        description=(
            "A Jinja template to use for this conversion. "
            "As of transformers v4.44, default chat template is no longer "
            "allowed, so you must provide a chat template if the tokenizer "
            "does not define one."
        ),
    )
    chat_template_kwargs: dict[str, Any] | None = Field(
        default=None,
        description=(
            "Additional keyword args to pass to the template renderer. "
            "Will be accessible by the chat template."
        ),
    )
    mm_processor_kwargs: dict[str, Any] | None = Field(
        default=None,
        description=("Additional kwargs to pass to the HF processor."),
    )
    tools: list[ChatCompletionToolsParam] | None = Field(
        default=None,
        description=("A list of tools the model may call."),
    )

    @model_validator(mode="before")
    @classmethod
    def check_generation_prompt(cls, data):
        """Reject input that sets both prompt-continuation flags.

        Only keys present in ``data`` are inspected, so a flag left at its
        field default is not seen by this check.

        Raises:
            ValueError: if ``continue_final_message`` and
                ``add_generation_prompt`` are both truthy in ``data``.
        """
        continuing = data.get("continue_final_message")
        if continuing and data.get("add_generation_prompt"):
            raise ValueError(
                "Cannot set both `continue_final_message` and "
                "`add_generation_prompt` to True."
            )
        return data
TokenizeRequest: TypeAlias = TokenizeCompletionRequest | TokenizeChatRequest
class TokenizeResponse(OpenAIBaseModel):
    """Result of a tokenize request."""

    count: int
    max_model_len: int
    tokens: list[int]
    # Optional token strings; None unless provided.
    token_strs: list[str] | None = None
class DetokenizeRequest(OpenAIBaseModel):
    """Detokenize request carrying a list of token ids."""

    model: str | None = None
    tokens: list[int]
class DetokenizeResponse(OpenAIBaseModel):
    """Result of a detokenize request; holds the resulting ``prompt`` text."""

    prompt: str
class TokenizerInfoResponse(OpenAIBaseModel):
    """
    Response containing tokenizer configuration
    equivalent to tokenizer_config.json
    """

    # Extra keys beyond the declared field are accepted.
    model_config = ConfigDict(extra="allow")
    tokenizer_class: str
class LoadLoRAAdapterRequest(BaseModel): class LoadLoRAAdapterRequest(BaseModel):
lora_name: str lora_name: str
lora_path: str lora_path: str
...@@ -2537,30 +1806,3 @@ class GenerateRequest(BaseModel): ...@@ -2537,30 +1806,3 @@ class GenerateRequest(BaseModel):
default=None, default=None,
description="KVTransfer parameters used for disaggregated serving.", description="KVTransfer parameters used for disaggregated serving.",
) )
class GenerateResponseChoice(BaseModel):
    """One choice of a ``GenerateResponse``."""

    index: int
    logprobs: ChatCompletionLogProbs | None = None
    # per OpenAI spec this is the default
    finish_reason: str | None = "stop"
    token_ids: list[int] | None = None
# Response model carrying the choices produced for a generate request.
class GenerateResponse(BaseModel):
    # Filled in by calling random_uuid when no value is supplied.
    request_id: str = Field(
        default_factory=random_uuid,
        description=(
            "The request_id related to this request. If the caller does "
            "not set it, a random_uuid will be generated. This id is used "
            "through out the inference process and return in response."
        ),
    )
    # Required list of choices.
    choices: list[GenerateResponseChoice]
    # Optional; when present, a list whose items are either None or a
    # mapping from an int key to a Logprob (keys are presumably token ids;
    # TODO confirm).
    prompt_logprobs: list[dict[int, Logprob] | None] | None = None
    # Optional free-form dict; None by default.
    kv_transfer_params: dict[str, Any] | None = Field(
        default=None,
        description="KVTransfer parameters used for disaggregated serving.",
    )
...@@ -38,22 +38,20 @@ from vllm.entrypoints.context import ( ...@@ -38,22 +38,20 @@ from vllm.entrypoints.context import (
StreamingHarmonyContext, StreamingHarmonyContext,
) )
from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionNamedToolChoiceParam, ChatCompletionNamedToolChoiceParam,
ChatCompletionRequest, ChatCompletionRequest,
ChatCompletionResponse, ChatCompletionResponse,
)
from vllm.entrypoints.openai.engine.protocol import (
CompletionRequest, CompletionRequest,
CompletionResponse, CompletionResponse,
DetokenizeRequest,
ErrorInfo, ErrorInfo,
ErrorResponse, ErrorResponse,
FunctionCall, FunctionCall,
FunctionDefinition, FunctionDefinition,
ResponseInputOutputItem, ResponseInputOutputItem,
ResponsesRequest, ResponsesRequest,
TokenizeChatRequest,
TokenizeCompletionRequest,
TokenizeResponse,
TranscriptionRequest, TranscriptionRequest,
TranscriptionResponse, TranscriptionResponse,
TranslationRequest, TranslationRequest,
...@@ -86,6 +84,12 @@ from vllm.entrypoints.responses_utils import ( ...@@ -86,6 +84,12 @@ from vllm.entrypoints.responses_utils import (
construct_input_messages, construct_input_messages,
) )
from vllm.entrypoints.serve.disagg.protocol import GenerateRequest, GenerateResponse from vllm.entrypoints.serve.disagg.protocol import GenerateRequest, GenerateResponse
from vllm.entrypoints.serve.tokenize.protocol import (
DetokenizeRequest,
TokenizeChatRequest,
TokenizeCompletionRequest,
TokenizeResponse,
)
from vllm.entrypoints.utils import _validate_truncation_size from vllm.entrypoints.utils import _validate_truncation_size
from vllm.inputs.data import PromptType, TokensPrompt from vllm.inputs.data import PromptType, TokensPrompt
from vllm.inputs.parse import ( from vllm.inputs.parse import (
......
...@@ -43,8 +43,8 @@ from openai_harmony import Message as OpenAIHarmonyMessage ...@@ -43,8 +43,8 @@ from openai_harmony import Message as OpenAIHarmonyMessage
from openai_harmony import Role as OpenAIHarmonyRole from openai_harmony import Role as OpenAIHarmonyRole
from vllm import envs from vllm import envs
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionToolsParam
ChatCompletionToolsParam, from vllm.entrypoints.openai.engine.protocol import (
ResponseInputOutputItem, ResponseInputOutputItem,
ResponsesRequest, ResponsesRequest,
) )
......
...@@ -16,7 +16,10 @@ from openai.types.responses.response_reasoning_item import ( ...@@ -16,7 +16,10 @@ from openai.types.responses.response_reasoning_item import (
) )
from vllm.entrypoints.constants import MCP_PREFIX from vllm.entrypoints.constants import MCP_PREFIX
from vllm.entrypoints.openai.protocol import ResponseInputOutputItem, ResponsesRequest from vllm.entrypoints.openai.engine.protocol import (
ResponseInputOutputItem,
ResponsesRequest,
)
from vllm.outputs import CompletionOutput from vllm.outputs import CompletionOutput
from vllm.reasoning.abs_reasoning_parsers import ReasoningParser from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
from vllm.tokenizers import TokenizerLike from vllm.tokenizers import TokenizerLike
......
...@@ -19,13 +19,15 @@ from tqdm import tqdm ...@@ -19,13 +19,15 @@ from tqdm import tqdm
from vllm.engine.arg_utils import AsyncEngineArgs, optional_type from vllm.engine.arg_utils import AsyncEngineArgs, optional_type
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest, ChatCompletionRequest,
ChatCompletionResponse, ChatCompletionResponse,
)
from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse, ErrorResponse,
OpenAIBaseModel, OpenAIBaseModel,
) )
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
from vllm.entrypoints.pooling.embed.protocol import EmbeddingRequest, EmbeddingResponse from vllm.entrypoints.pooling.embed.protocol import EmbeddingRequest, EmbeddingResponse
from vllm.entrypoints.pooling.embed.serving import OpenAIServingEmbedding from vllm.entrypoints.pooling.embed.serving import OpenAIServingEmbedding
......
...@@ -12,7 +12,7 @@ from fastapi import Request ...@@ -12,7 +12,7 @@ from fastapi import Request
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.engine.protocol import (
CompletionLogProbs, CompletionLogProbs,
CompletionRequest, CompletionRequest,
CompletionResponse, CompletionResponse,
...@@ -25,7 +25,7 @@ from vllm.entrypoints.openai.protocol import ( ...@@ -25,7 +25,7 @@ from vllm.entrypoints.openai.protocol import (
UsageInfo, UsageInfo,
VLLMValidationError, VLLMValidationError,
) )
from vllm.entrypoints.openai.serving_engine import ( from vllm.entrypoints.openai.engine.serving import (
GenerationError, GenerationError,
OpenAIServing, OpenAIServing,
clamp_prompt_logprobs, clamp_prompt_logprobs,
......
...@@ -7,7 +7,7 @@ from dataclasses import dataclass ...@@ -7,7 +7,7 @@ from dataclasses import dataclass
from http import HTTPStatus from http import HTTPStatus
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.engine.protocol import (
ErrorInfo, ErrorInfo,
ErrorResponse, ErrorResponse,
LoadLoRAAdapterRequest, LoadLoRAAdapterRequest,
......
...@@ -72,19 +72,7 @@ from vllm.entrypoints.context import ( ...@@ -72,19 +72,7 @@ from vllm.entrypoints.context import (
StreamingHarmonyContext, StreamingHarmonyContext,
) )
from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.parser.harmony_utils import ( from vllm.entrypoints.openai.engine.protocol import (
construct_harmony_previous_input_messages,
get_developer_message,
get_stop_tokens_for_assistant_actions,
get_system_message,
get_user_message,
has_custom_tools,
parse_output_message,
parse_remaining_state,
parse_response_input,
render_for_completion,
)
from vllm.entrypoints.openai.protocol import (
DeltaMessage, DeltaMessage,
ErrorResponse, ErrorResponse,
InputTokensDetails, InputTokensDetails,
...@@ -102,10 +90,22 @@ from vllm.entrypoints.openai.protocol import ( ...@@ -102,10 +90,22 @@ from vllm.entrypoints.openai.protocol import (
StreamingResponsesResponse, StreamingResponsesResponse,
VLLMValidationError, VLLMValidationError,
) )
from vllm.entrypoints.openai.serving_engine import ( from vllm.entrypoints.openai.engine.serving import (
GenerationError, GenerationError,
OpenAIServing, OpenAIServing,
) )
from vllm.entrypoints.openai.parser.harmony_utils import (
construct_harmony_previous_input_messages,
get_developer_message,
get_stop_tokens_for_assistant_actions,
get_system_message,
get_user_message,
has_custom_tools,
parse_output_message,
parse_remaining_state,
parse_response_input,
render_for_completion,
)
from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.entrypoints.responses_utils import ( from vllm.entrypoints.responses_utils import (
construct_input_messages, construct_input_messages,
......
...@@ -6,7 +6,7 @@ from fastapi import Request ...@@ -6,7 +6,7 @@ from fastapi import Request
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.engine.protocol import (
ErrorResponse, ErrorResponse,
RequestResponseMetadata, RequestResponseMetadata,
TranscriptionRequest, TranscriptionRequest,
......
...@@ -15,7 +15,7 @@ from transformers import PreTrainedTokenizerBase ...@@ -15,7 +15,7 @@ from transformers import PreTrainedTokenizerBase
import vllm.envs as envs import vllm.envs as envs
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.engine.protocol import (
DeltaMessage, DeltaMessage,
ErrorResponse, ErrorResponse,
RequestResponseMetadata, RequestResponseMetadata,
...@@ -32,7 +32,7 @@ from vllm.entrypoints.openai.protocol import ( ...@@ -32,7 +32,7 @@ from vllm.entrypoints.openai.protocol import (
UsageInfo, UsageInfo,
VLLMValidationError, VLLMValidationError,
) )
from vllm.entrypoints.openai.serving_engine import OpenAIServing, SpeechToTextRequest from vllm.entrypoints.openai.engine.serving import OpenAIServing, SpeechToTextRequest
from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.inputs.data import PromptType from vllm.inputs.data import PromptType
from vllm.logger import init_logger from vllm.logger import init_logger
......
...@@ -5,7 +5,7 @@ from typing import TypeVar ...@@ -5,7 +5,7 @@ from typing import TypeVar
from fastapi import Request from fastapi import Request
from fastapi.exceptions import RequestValidationError from fastapi.exceptions import RequestValidationError
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest, ChatCompletionRequest,
ChatCompletionResponseChoice, ChatCompletionResponseChoice,
ChatCompletionResponseStreamChoice, ChatCompletionResponseStreamChoice,
......
...@@ -6,7 +6,7 @@ from fastapi import APIRouter, Depends, HTTPException, Request ...@@ -6,7 +6,7 @@ from fastapi import APIRouter, Depends, HTTPException, Request
from starlette.responses import JSONResponse from starlette.responses import JSONResponse
from typing_extensions import assert_never from typing_extensions import assert_never
from vllm.entrypoints.openai.protocol import ErrorResponse from vllm.entrypoints.openai.engine.protocol import ErrorResponse
from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.openai.utils import validate_json_request
from vllm.entrypoints.pooling.classify.protocol import ( from vllm.entrypoints.pooling.classify.protocol import (
ClassificationRequest, ClassificationRequest,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment