[Refactor] [6/N] to simplify the vLLM openai chat_completion serving architecture (#32240)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>

[Refactor] [6/N] to simplify the vLLM openai chat_completion serving architecture (#32240)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
fefce498 · Chauncey · GitHub · a5bbbd2f · fefce498 · fefce498
Unverified Commit fefce498 authored Jan 13, 2026 by Chauncey Committed by GitHub Jan 13, 2026
20 changed files
--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -22,6 +22,12 @@ from vllm.entrypoints.chat_utils import (
    ChatTemplateContentFormatOption,
 )
 from vllm.entrypoints.constants import MCP_PREFIX
+from vllm.entrypoints.openai.engine.protocol import (
+    FunctionCall,
+    ResponseInputOutputItem,
+    ResponseRawMessageAndToken,
+    ResponsesRequest,
+)
 from vllm.entrypoints.openai.parser.harmony_utils import (
    get_encoding,
    get_streamable_parser_for_assistant,
@@ -30,12 +36,6 @@ from vllm.entrypoints.openai.parser.harmony_utils import (
 from vllm.entrypoints.openai.parser.responses_parser import (
    get_responses_parser_for_simple_context,
 )
-from vllm.entrypoints.openai.protocol import (
-    FunctionCall,
-    ResponseInputOutputItem,
-    ResponseRawMessageAndToken,
-    ResponsesRequest,
-)
 from vllm.entrypoints.responses_utils import construct_tool_dicts
 from vllm.entrypoints.tool import Tool
 from vllm.entrypoints.tool_server import ToolServer

--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -42,11 +42,9 @@ from vllm.entrypoints.anthropic.protocol import (
 from vllm.entrypoints.anthropic.serving_messages import AnthropicServingMessages
 from vllm.entrypoints.launcher import serve_http
 from vllm.entrypoints.logger import RequestLogger
+from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
 from vllm.entrypoints.openai.cli_args import make_arg_parser, validate_parsed_serve_args
-from vllm.entrypoints.openai.orca_metrics import metrics_header
+from vllm.entrypoints.openai.engine.protocol import (
-from vllm.entrypoints.openai.protocol import (
-    ChatCompletionRequest,
-    ChatCompletionResponse,
    CompletionRequest,
    CompletionResponse,
    ErrorInfo,
@@ -59,9 +57,9 @@ from vllm.entrypoints.openai.protocol import (
    TranslationRequest,
    TranslationResponseVariant,
 )
-from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+from vllm.entrypoints.openai.engine.serving import OpenAIServing
+from vllm.entrypoints.openai.orca_metrics import metrics_header
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
-from vllm.entrypoints.openai.serving_engine import OpenAIServing
 from vllm.entrypoints.openai.serving_models import (
    BaseModelPath,
    OpenAIServingModels,
@@ -475,47 +473,6 @@ async def create_messages(request: AnthropicMessagesRequest, raw_request: Reques
    return StreamingResponse(content=generator, media_type="text/event-stream")
-@router.post(
-    "/v1/chat/completions",
-    dependencies=[Depends(validate_json_request)],
-    responses={
-        HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
-        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
-        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
-        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
-    },
-)
-@with_cancellation
-@load_aware_call
-async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request):
-    metrics_header_format = raw_request.headers.get(
-        ENDPOINT_LOAD_METRICS_FORMAT_HEADER_LABEL, ""
-    )
-    handler = chat(raw_request)
-    if handler is None:
-        return base(raw_request).create_error_response(
-            message="The model does not support Chat Completions API"
-        )
-    try:
-        generator = await handler.create_chat_completion(request, raw_request)
-    except Exception as e:
-        raise HTTPException(
-            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
-        ) from e
-    if isinstance(generator, ErrorResponse):
-        return JSONResponse(
-            content=generator.model_dump(), status_code=generator.error.code
-        )
-    elif isinstance(generator, ChatCompletionResponse):
-        return JSONResponse(
-            content=generator.model_dump(),
-            headers=metrics_header(metrics_header_format),
-        )
-    return StreamingResponse(content=generator, media_type="text/event-stream")
 @router.post(
    "/v1/completions",
    dependencies=[Depends(validate_json_request)],
@@ -735,8 +692,10 @@ class XRequestIdMiddleware:
 def _extract_content_from_chunk(chunk_data: dict) -> str:
    """Extract content from a streaming response chunk."""
    try:
-        from vllm.entrypoints.openai.protocol import (
+        from vllm.entrypoints.openai.chat_completion.protocol import (
            ChatCompletionStreamResponse,
+        )
+        from vllm.entrypoints.openai.engine.protocol import (
            CompletionStreamResponse,
        )
@@ -880,7 +839,11 @@ def build_app(args: Namespace) -> FastAPI:
    from vllm.entrypoints.serve import register_vllm_serve_api_routers
    register_vllm_serve_api_routers(app)
+    from vllm.entrypoints.openai.chat_completion.api_router import (
+        attach_router as register_chat_api_router,
+    )
+    register_chat_api_router(app)
    from vllm.entrypoints.sagemaker.routes import register_sagemaker_routes
    register_sagemaker_routes(router)

--- a/vllm/entrypoints/openai/chat_completion/__init__.py
+++ b/vllm/entrypoints/openai/chat_completion/__init__.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
--- a/vllm/entrypoints/openai/chat_completion/api_router.py
+++ b/vllm/entrypoints/openai/chat_completion/api_router.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from http import HTTPStatus
+from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request
+from fastapi.responses import JSONResponse, StreamingResponse
+from vllm.entrypoints.openai.chat_completion.protocol import (
+    ChatCompletionRequest,
+    ChatCompletionResponse,
+)
+from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
+from vllm.entrypoints.openai.engine.protocol import ErrorResponse
+from vllm.entrypoints.openai.orca_metrics import metrics_header
+from vllm.entrypoints.openai.utils import validate_json_request
+from vllm.entrypoints.utils import (
+    load_aware_call,
+    with_cancellation,
+)
+from vllm.logger import init_logger
+logger = init_logger(__name__)
+router = APIRouter()
+ENDPOINT_LOAD_METRICS_FORMAT_HEADER_LABEL = "endpoint-load-metrics-format"
+def chat(request: Request) -> OpenAIServingChat | None:
+    return request.app.state.openai_serving_chat
+@router.post(
+    "/v1/chat/completions",
+    dependencies=[Depends(validate_json_request)],
+    responses={
+        HTTPStatus.OK.value: {"content": {"text/event-stream": {}}},
+        HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
+        HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
+        HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
+    },
+)
+@with_cancellation
+@load_aware_call
+async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request):
+    metrics_header_format = raw_request.headers.get(
+        ENDPOINT_LOAD_METRICS_FORMAT_HEADER_LABEL, ""
+    )
+    handler = chat(raw_request)
+    if handler is None:
+        base_server = raw_request.app.state.openai_serving_tokenization
+        return base_server.create_error_response(
+            message="The model does not support Chat Completions API"
+        )
+    try:
+        generator = await handler.create_chat_completion(request, raw_request)
+    except Exception as e:
+        raise HTTPException(
+            status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, detail=str(e)
+        ) from e
+    if isinstance(generator, ErrorResponse):
+        return JSONResponse(
+            content=generator.model_dump(), status_code=generator.error.code
+        )
+    elif isinstance(generator, ChatCompletionResponse):
+        return JSONResponse(
+            content=generator.model_dump(),
+            headers=metrics_header(metrics_header_format),
+        )
+    return StreamingResponse(content=generator, media_type="text/event-stream")
+def attach_router(app: FastAPI):
+    app.include_router(router)
--- a/vllm/entrypoints/openai/chat_completion/protocol.py
+++ b/vllm/entrypoints/openai/chat_completion/protocol.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from
+# https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
+import json
+import time
+from typing import Annotated, Any, ClassVar, Literal
+import torch
+from openai.types.chat.chat_completion_audio import (
+    ChatCompletionAudio as OpenAIChatCompletionAudio,
+)
+from openai.types.chat.chat_completion_message import Annotation as OpenAIAnnotation
+from pydantic import (
+    Field,
+    model_validator,
+)
+from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
+from vllm.entrypoints.openai.engine.protocol import (
+    AnyResponseFormat,
+    DeltaMessage,
+    FunctionCall,
+    FunctionDefinition,
+    LegacyStructuralTagResponseFormat,
+    LogitsProcessors,
+    OpenAIBaseModel,
+    StreamOptions,
+    StructuralTagResponseFormat,
+    ToolCall,
+    UsageInfo,
+    get_logits_processors,
+)
+from vllm.exceptions import VLLMValidationError
+from vllm.logger import init_logger
+from vllm.logprobs import Logprob
+from vllm.sampling_params import (
+    BeamSearchParams,
+    RequestOutputKind,
+    SamplingParams,
+    StructuredOutputsParams,
+)
+from vllm.utils import random_uuid
+logger = init_logger(__name__)
+_LONG_INFO = torch.iinfo(torch.long)
+class ChatMessage(OpenAIBaseModel):
+    role: str
+    content: str | None = None
+    refusal: str | None = None
+    annotations: OpenAIAnnotation | None = None
+    audio: OpenAIChatCompletionAudio | None = None
+    function_call: FunctionCall | None = None
+    tool_calls: list[ToolCall] = Field(default_factory=list)
+    # vLLM-specific fields that are not in OpenAI spec
+    reasoning: str | None = None
+    reasoning_content: str | None = None
+    """Deprecated: use `reasoning` instead."""
+    @model_validator(mode="after")
+    def handle_deprecated_reasoning_content(self):
+        """Copy reasoning to reasoning_content for backward compatibility."""
+        self.reasoning_content = self.reasoning
+        return self
+class ChatCompletionLogProb(OpenAIBaseModel):
+    token: str
+    logprob: float = -9999.0
+    bytes: list[int] | None = None
+class ChatCompletionLogProbsContent(ChatCompletionLogProb):
+    # Workaround: redefine fields name cache so that it's not
+    # shared with the super class.
+    field_names: ClassVar[set[str] | None] = None
+    top_logprobs: list[ChatCompletionLogProb] = Field(default_factory=list)
+class ChatCompletionLogProbs(OpenAIBaseModel):
+    content: list[ChatCompletionLogProbsContent] | None = None
+class ChatCompletionResponseChoice(OpenAIBaseModel):
+    index: int
+    message: ChatMessage
+    logprobs: ChatCompletionLogProbs | None = None
+    # per OpenAI spec this is the default
+    finish_reason: str | None = "stop"
+    # not part of the OpenAI spec but included in vLLM for legacy reasons
+    stop_reason: int | str | None = None
+    # not part of the OpenAI spec but is useful for tracing the tokens
+    # in agent scenarios
+    token_ids: list[int] | None = None
+class ChatCompletionResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
+    object: Literal["chat.completion"] = "chat.completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[ChatCompletionResponseChoice]
+    service_tier: Literal["auto", "default", "flex", "scale", "priority"] | None = None
+    system_fingerprint: str | None = None
+    usage: UsageInfo
+    # vLLM-specific fields that are not in OpenAI spec
+    prompt_logprobs: list[dict[int, Logprob] | None] | None = None
+    prompt_token_ids: list[int] | None = None
+    kv_transfer_params: dict[str, Any] | None = Field(
+        default=None, description="KVTransfer parameters."
+    )
+class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
+    index: int
+    delta: DeltaMessage
+    logprobs: ChatCompletionLogProbs | None = None
+    finish_reason: str | None = None
+    stop_reason: int | str | None = None
+    # not part of the OpenAI spec but for tracing the tokens
+    token_ids: list[int] | None = None
+class ChatCompletionStreamResponse(OpenAIBaseModel):
+    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
+    object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: list[ChatCompletionResponseStreamChoice]
+    usage: UsageInfo | None = Field(default=None)
+    # not part of the OpenAI spec but for tracing the tokens
+    prompt_token_ids: list[int] | None = None
+class ChatCompletionToolsParam(OpenAIBaseModel):
+    type: Literal["function"] = "function"
+    function: FunctionDefinition
+class ChatCompletionNamedFunction(OpenAIBaseModel):
+    name: str
+class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel):
+    function: ChatCompletionNamedFunction
+    type: Literal["function"] = "function"
+class ChatCompletionRequest(OpenAIBaseModel):
+    # Ordered by official OpenAI API documentation
+    # https://platform.openai.com/docs/api-reference/chat/create
+    messages: list[ChatCompletionMessageParam]
+    model: str | None = None
+    frequency_penalty: float | None = 0.0
+    logit_bias: dict[str, float] | None = None
+    logprobs: bool | None = False
+    top_logprobs: int | None = 0
+    max_tokens: int | None = Field(
+        default=None,
+        deprecated="max_tokens is deprecated in favor of "
+        "the max_completion_tokens field",
+    )
+    max_completion_tokens: int | None = None
+    n: int | None = 1
+    presence_penalty: float | None = 0.0
+    response_format: AnyResponseFormat | None = None
+    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
+    stop: str | list[str] | None = []
+    stream: bool | None = False
+    stream_options: StreamOptions | None = None
+    temperature: float | None = None
+    top_p: float | None = None
+    tools: list[ChatCompletionToolsParam] | None = None
+    tool_choice: (
+        Literal["none"]
+        | Literal["auto"]
+        | Literal["required"]
+        | ChatCompletionNamedToolChoiceParam
+        | None
+    ) = "none"
+    reasoning_effort: Literal["low", "medium", "high"] | None = None
+    include_reasoning: bool = True
+    parallel_tool_calls: bool | None = True
+    # NOTE this will be ignored by vLLM
+    user: str | None = None
+    # --8<-- [start:chat-completion-sampling-params]
+    use_beam_search: bool = False
+    top_k: int | None = None
+    min_p: float | None = None
+    repetition_penalty: float | None = None
+    length_penalty: float = 1.0
+    stop_token_ids: list[int] | None = []
+    include_stop_str_in_output: bool = False
+    ignore_eos: bool = False
+    min_tokens: int = 0
+    skip_special_tokens: bool = True
+    spaces_between_special_tokens: bool = True
+    truncate_prompt_tokens: Annotated[int, Field(ge=-1, le=_LONG_INFO.max)] | None = (
+        None
+    )
+    prompt_logprobs: int | None = None
+    allowed_token_ids: list[int] | None = None
+    bad_words: list[str] = Field(default_factory=list)
+    # --8<-- [end:chat-completion-sampling-params]
+    # --8<-- [start:chat-completion-extra-params]
+    echo: bool = Field(
+        default=False,
+        description=(
+            "If true, the new message will be prepended with the last message "
+            "if they belong to the same role."
+        ),
+    )
+    add_generation_prompt: bool = Field(
+        default=True,
+        description=(
+            "If true, the generation prompt will be added to the chat template. "
+            "This is a parameter used by chat template in tokenizer config of the "
+            "model."
+        ),
+    )
+    continue_final_message: bool = Field(
+        default=False,
+        description=(
+            "If this is set, the chat will be formatted so that the final "
+            "message in the chat is open-ended, without any EOS tokens. The "
+            "model will continue this message rather than starting a new one. "
+            'This allows you to "prefill" part of the model\'s response for it. '
+            "Cannot be used at the same time as `add_generation_prompt`."
+        ),
+    )
+    add_special_tokens: bool = Field(
+        default=False,
+        description=(
+            "If true, special tokens (e.g. BOS) will be added to the prompt "
+            "on top of what is added by the chat template. "
+            "For most models, the chat template takes care of adding the "
+            "special tokens so this should be set to false (as is the "
+            "default)."
+        ),
+    )
+    documents: list[dict[str, str]] | None = Field(
+        default=None,
+        description=(
+            "A list of dicts representing documents that will be accessible to "
+            "the model if it is performing RAG (retrieval-augmented generation)."
+            " If the template does not support RAG, this argument will have no "
+            "effect. We recommend that each document should be a dict containing "
+            '"title" and "text" keys.'
+        ),
+    )
+    chat_template: str | None = Field(
+        default=None,
+        description=(
+            "A Jinja template to use for this conversion. "
+            "As of transformers v4.44, default chat template is no longer "
+            "allowed, so you must provide a chat template if the tokenizer "
+            "does not define one."
+        ),
+    )
+    chat_template_kwargs: dict[str, Any] | None = Field(
+        default=None,
+        description=(
+            "Additional keyword args to pass to the template renderer. "
+            "Will be accessible by the chat template."
+        ),
+    )
+    mm_processor_kwargs: dict[str, Any] | None = Field(
+        default=None,
+        description=("Additional kwargs to pass to the HF processor."),
+    )
+    structured_outputs: StructuredOutputsParams | None = Field(
+        default=None,
+        description="Additional kwargs for structured outputs",
+    )
+    priority: int = Field(
+        default=0,
+        description=(
+            "The priority of the request (lower means earlier handling; "
+            "default: 0). Any priority other than 0 will raise an error "
+            "if the served model does not use priority scheduling."
+        ),
+    )
+    request_id: str = Field(
+        default_factory=random_uuid,
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "through out the inference process and return in response."
+        ),
+    )
+    logits_processors: LogitsProcessors | None = Field(
+        default=None,
+        description=(
+            "A list of either qualified names of logits processors, or "
+            "constructor objects, to apply when sampling. A constructor is "
+            "a JSON object with a required 'qualname' field specifying the "
+            "qualified name of the processor class/factory, and optional "
+            "'args' and 'kwargs' fields containing positional and keyword "
+            "arguments. For example: {'qualname': "
+            "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
+            "{'param': 'value'}}."
+        ),
+    )
+    return_tokens_as_token_ids: bool | None = Field(
+        default=None,
+        description=(
+            "If specified with 'logprobs', tokens are represented "
+            " as strings of the form 'token_id:{token_id}' so that tokens "
+            "that are not JSON-encodable can be identified."
+        ),
+    )
+    return_token_ids: bool | None = Field(
+        default=None,
+        description=(
+            "If specified, the result will include token IDs alongside the "
+            "generated text. In streaming mode, prompt_token_ids is included "
+            "only in the first chunk, and token_ids contains the delta tokens "
+            "for each chunk. This is useful for debugging or when you "
+            "need to map generated text back to input tokens."
+        ),
+    )
+    cache_salt: str | None = Field(
+        default=None,
+        description=(
+            "If specified, the prefix cache will be salted with the provided "
+            "string to prevent an attacker to guess prompts in multi-user "
+            "environments. The salt should be random, protected from "
+            "access by 3rd parties, and long enough to be "
+            "unpredictable (e.g., 43 characters base64-encoded, corresponding "
+            "to 256 bit)."
+        ),
+    )
+    kv_transfer_params: dict[str, Any] | None = Field(
+        default=None,
+        description="KVTransfer parameters used for disaggregated serving.",
+    )
+    vllm_xargs: dict[str, str | int | float | list[str | int | float]] | None = Field(
+        default=None,
+        description=(
+            "Additional request parameters with (list of) string or "
+            "numeric values, used by custom extensions."
+        ),
+    )
+    # --8<-- [end:chat-completion-extra-params]
+    # Default sampling parameters for chat completion requests
+    _DEFAULT_SAMPLING_PARAMS: dict = {
+        "repetition_penalty": 1.0,
+        "temperature": 1.0,
+        "top_p": 1.0,
+        "top_k": 0,
+        "min_p": 0.0,
+    }
+    def to_beam_search_params(
+        self, max_tokens: int, default_sampling_params: dict
+    ) -> BeamSearchParams:
+        n = self.n if self.n is not None else 1
+        if (temperature := self.temperature) is None:
+            temperature = default_sampling_params.get(
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+            )
+        return BeamSearchParams(
+            beam_width=n,
+            max_tokens=max_tokens,
+            ignore_eos=self.ignore_eos,
+            temperature=temperature,
+            length_penalty=self.length_penalty,
+            include_stop_str_in_output=self.include_stop_str_in_output,
+        )
+    def to_sampling_params(
+        self,
+        max_tokens: int,
+        logits_processor_pattern: str | None,
+        default_sampling_params: dict,
+    ) -> SamplingParams:
+        # Default parameters
+        if (repetition_penalty := self.repetition_penalty) is None:
+            repetition_penalty = default_sampling_params.get(
+                "repetition_penalty",
+                self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
+            )
+        if (temperature := self.temperature) is None:
+            temperature = default_sampling_params.get(
+                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
+            )
+        if (top_p := self.top_p) is None:
+            top_p = default_sampling_params.get(
+                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]
+            )
+        if (top_k := self.top_k) is None:
+            top_k = default_sampling_params.get(
+                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
+            )
+        if (min_p := self.min_p) is None:
+            min_p = default_sampling_params.get(
+                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"]
+            )
+        prompt_logprobs = self.prompt_logprobs
+        if prompt_logprobs is None and self.echo:
+            prompt_logprobs = self.top_logprobs
+        response_format = self.response_format
+        if response_format is not None:
+            # If structured outputs wasn't already enabled,
+            # we must enable it for these features to work
+            if self.structured_outputs is None:
+                self.structured_outputs = StructuredOutputsParams()
+            # Set structured output params for response format
+            if response_format.type == "json_object":
+                self.structured_outputs.json_object = True
+            elif response_format.type == "json_schema":
+                json_schema = response_format.json_schema
+                assert json_schema is not None
+                self.structured_outputs.json = json_schema.json_schema
+            elif response_format.type == "structural_tag":
+                structural_tag = response_format
+                assert structural_tag is not None and isinstance(
+                    structural_tag,
+                    (
+                        LegacyStructuralTagResponseFormat,
+                        StructuralTagResponseFormat,
+                    ),
+                )
+                s_tag_obj = structural_tag.model_dump(by_alias=True)
+                self.structured_outputs.structural_tag = json.dumps(s_tag_obj)
+        extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
+        if self.kv_transfer_params:
+            # Pass in kv_transfer_params via extra_args
+            extra_args["kv_transfer_params"] = self.kv_transfer_params
+        return SamplingParams.from_optional(
+            n=self.n,
+            presence_penalty=self.presence_penalty,
+            frequency_penalty=self.frequency_penalty,
+            repetition_penalty=repetition_penalty,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            min_p=min_p,
+            seed=self.seed,
+            stop=self.stop,
+            stop_token_ids=self.stop_token_ids,
+            logprobs=self.top_logprobs if self.logprobs else None,
+            prompt_logprobs=prompt_logprobs,
+            ignore_eos=self.ignore_eos,
+            max_tokens=max_tokens,
+            min_tokens=self.min_tokens,
+            skip_special_tokens=self.skip_special_tokens,
+            spaces_between_special_tokens=self.spaces_between_special_tokens,
+            logits_processors=get_logits_processors(
+                self.logits_processors, logits_processor_pattern
+            ),
+            include_stop_str_in_output=self.include_stop_str_in_output,
+            truncate_prompt_tokens=self.truncate_prompt_tokens,
+            output_kind=RequestOutputKind.DELTA
+            if self.stream
+            else RequestOutputKind.FINAL_ONLY,
+            structured_outputs=self.structured_outputs,
+            logit_bias=self.logit_bias,
+            bad_words=self.bad_words,
+            allowed_token_ids=self.allowed_token_ids,
+            extra_args=extra_args or None,
+            skip_clone=True,  # Created fresh per request, safe to skip clone
+        )
+    @model_validator(mode="before")
+    @classmethod
+    def validate_stream_options(cls, data):
+        if data.get("stream_options") and not data.get("stream"):
+            raise VLLMValidationError(
+                "Stream options can only be defined when `stream=True`.",
+                parameter="stream_options",
+            )
+        return data
+    @model_validator(mode="before")
+    @classmethod
+    def check_logprobs(cls, data):
+        if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
+            if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1):
+                raise VLLMValidationError(
+                    "`prompt_logprobs` are not available when `stream=True`.",
+                    parameter="prompt_logprobs",
+                )
+            if prompt_logprobs < 0 and prompt_logprobs != -1:
+                raise VLLMValidationError(
+                    "`prompt_logprobs` must be a positive value or -1.",
+                    parameter="prompt_logprobs",
+                    value=prompt_logprobs,
+                )
+        if (top_logprobs := data.get("top_logprobs")) is not None:
+            if top_logprobs < 0 and top_logprobs != -1:
+                raise VLLMValidationError(
+                    "`top_logprobs` must be a positive value or -1.",
+                    parameter="top_logprobs",
+                    value=top_logprobs,
+                )
+            if (top_logprobs == -1 or top_logprobs > 0) and not data.get("logprobs"):
+                raise VLLMValidationError(
+                    "when using `top_logprobs`, `logprobs` must be set to true.",
+                    parameter="top_logprobs",
+                )
+        return data
+    @model_validator(mode="before")
+    @classmethod
+    def check_structured_outputs_count(cls, data):
+        if isinstance(data, ValueError):
+            raise data
+        if data.get("structured_outputs", None) is None:
+            return data
+        structured_outputs_kwargs = data["structured_outputs"]
+        count = sum(
+            structured_outputs_kwargs.get(k) is not None
+            for k in ("json", "regex", "choice")
+        )
+        # you can only use one kind of constraints for structured outputs
+        if count > 1:
+            raise ValueError(
+                "You can only use one kind of constraints for structured "
+                "outputs ('json', 'regex' or 'choice')."
+            )
+        # you can only either use structured outputs or tools, not both
+        if count > 1 and data.get("tool_choice", "none") not in (
+            "none",
+            "auto",
+            "required",
+        ):
+            raise ValueError(
+                "You can only either use constraints for structured outputs "
+                "or tools, not both."
+            )
+        return data
+    @model_validator(mode="before")
+    @classmethod
+    def check_tool_usage(cls, data):
+        # if "tool_choice" is not specified but tools are provided,
+        # default to "auto" tool_choice
+        if "tool_choice" not in data and data.get("tools"):
+            data["tool_choice"] = "auto"
+        # if "tool_choice" is "none" -- no validation is needed for tools
+        if "tool_choice" in data and data["tool_choice"] == "none":
+            return data
+        # if "tool_choice" is specified -- validation
+        if "tool_choice" in data and data["tool_choice"] is not None:
+            # ensure that if "tool choice" is specified, tools are present
+            if "tools" not in data or data["tools"] is None:
+                raise ValueError("When using `tool_choice`, `tools` must be set.")
+            # make sure that tool choice is either a named tool
+            # OR that it's set to "auto" or "required"
+            if data["tool_choice"] not in ["auto", "required"] and not isinstance(
+                data["tool_choice"], dict
+            ):
+                raise ValueError(
+                    f"Invalid value for `tool_choice`: {data['tool_choice']}! "
+                    'Only named tools, "none", "auto" or "required" '
+                    "are supported."
+                )
+            # if tool_choice is "required" but the "tools" list is empty,
+            # override the data to behave like "none" to align with
+            # OpenAI’s behavior.
+            if (
+                data["tool_choice"] == "required"
+                and isinstance(data["tools"], list)
+                and len(data["tools"]) == 0
+            ):
+                data["tool_choice"] = "none"
+                del data["tools"]
+                return data
+            # ensure that if "tool_choice" is specified as an object,
+            # it matches a valid tool
+            correct_usage_message = (
+                'Correct usage: `{"type": "function",'
+                ' "function": {"name": "my_function"}}`'
+            )
+            if isinstance(data["tool_choice"], dict):
+                valid_tool = False
+                function = data["tool_choice"].get("function")
+                if not isinstance(function, dict):
+                    raise ValueError(
+                        f"Invalid value for `function`: `{function}` in "
+                        f"`tool_choice`! {correct_usage_message}"
+                    )
+                if "name" not in function:
+                    raise ValueError(
+                        f"Expected field `name` in `function` in "
+                        f"`tool_choice`! {correct_usage_message}"
+                    )
+                function_name = function["name"]
+                if not isinstance(function_name, str) or len(function_name) == 0:
+                    raise ValueError(
+                        f"Invalid `name` in `function`: `{function_name}`"
+                        f" in `tool_choice`! {correct_usage_message}"
+                    )
+                for tool in data["tools"]:
+                    if tool["function"]["name"] == function_name:
+                        valid_tool = True
+                        break
+                if not valid_tool:
+                    raise ValueError(
+                        "The tool specified in `tool_choice` does not match any"
+                        " of the specified `tools`"
+                    )
+        return data
+    @model_validator(mode="before")
+    @classmethod
+    def check_generation_prompt(cls, data):
+        if data.get("continue_final_message") and data.get("add_generation_prompt"):
+            raise ValueError(
+                "Cannot set both `continue_final_message` and "
+                "`add_generation_prompt` to True."
+            )
+        return data
+    @model_validator(mode="before")
+    @classmethod
+    def check_cache_salt_support(cls, data):
+        if data.get("cache_salt") is not None and (
+            not isinstance(data["cache_salt"], str) or not data["cache_salt"]
+        ):
+            raise ValueError(
+                "Parameter 'cache_salt' must be a non-empty string if provided."
+            )
+        return data
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -23,16 +23,7 @@ from vllm.entrypoints.chat_utils import (
    make_tool_call_id,
 )
 from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.parser.harmony_utils import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
-    get_developer_message,
-    get_stop_tokens_for_assistant_actions,
-    get_streamable_parser_for_assistant,
-    get_system_message,
-    parse_chat_inputs_to_harmony_messages,
-    parse_chat_output,
-    render_for_completion,
-)
-from vllm.entrypoints.openai.protocol import (
    ChatCompletionLogProb,
    ChatCompletionLogProbs,
    ChatCompletionLogProbsContent,
@@ -43,6 +34,11 @@ from vllm.entrypoints.openai.protocol import (
    ChatCompletionResponseStreamChoice,
    ChatCompletionStreamResponse,
    ChatMessage,
+)
+from vllm.entrypoints.openai.chat_completion.stream_harmony import (
+    extract_harmony_streaming_delta,
+)
+from vllm.entrypoints.openai.engine.protocol import (
    DeltaFunctionCall,
    DeltaMessage,
    DeltaToolCall,
@@ -52,14 +48,20 @@ from vllm.entrypoints.openai.protocol import (
    ToolCall,
    UsageInfo,
 )
-from vllm.entrypoints.openai.serving_chat_stream_harmony import (
+from vllm.entrypoints.openai.engine.serving import (
-    extract_harmony_streaming_delta,
-)
-from vllm.entrypoints.openai.serving_engine import (
    GenerationError,
    OpenAIServing,
    clamp_prompt_logprobs,
 )
+from vllm.entrypoints.openai.parser.harmony_utils import (
+    get_developer_message,
+    get_stop_tokens_for_assistant_actions,
+    get_streamable_parser_for_assistant,
+    get_system_message,
+    parse_chat_inputs_to_harmony_messages,
+    parse_chat_output,
+    render_for_completion,
+)
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.openai.utils import maybe_filter_parallel_tool_calls
 from vllm.entrypoints.utils import get_max_tokens, should_include_usage

--- a/vllm/entrypoints/openai/serving_chat_stream_harmony.py
+++ b/vllm/entrypoints/openai/serving_chat_stream_harmony.py
@@ -10,7 +10,7 @@ harmony parser state during streaming chat completions.
 from openai_harmony import StreamableParser
 from vllm.entrypoints.chat_utils import make_tool_call_id
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.engine.protocol import (
    DeltaFunctionCall,
    DeltaMessage,
    DeltaToolCall,

--- a/vllm/entrypoints/openai/engine/__init__.py
+++ b/vllm/entrypoints/openai/engine/__init__.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -11,10 +11,6 @@ from typing import Annotated, Any, ClassVar, Literal, TypeAlias
 import regex as re
 import torch
 from fastapi import HTTPException, UploadFile
-from openai.types.chat.chat_completion_audio import (
-    ChatCompletionAudio as OpenAIChatCompletionAudio,
-)
-from openai.types.chat.chat_completion_message import Annotation as OpenAIAnnotation
 from openai.types.responses import (
    ResponseCodeInterpreterCallCodeDeltaEvent,
    ResponseCodeInterpreterCallCodeDoneEvent,
@@ -234,20 +230,6 @@ class FunctionDefinition(OpenAIBaseModel):
    parameters: dict[str, Any] | None = None
-class ChatCompletionToolsParam(OpenAIBaseModel):
-    type: Literal["function"] = "function"
-    function: FunctionDefinition
-class ChatCompletionNamedFunction(OpenAIBaseModel):
-    name: str
-class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel):
-    function: ChatCompletionNamedFunction
-    type: Literal["function"] = "function"
 # extra="forbid" is a workaround to have kwargs as a field,
 # see https://github.com/pydantic/pydantic/issues/3125
 class LogitsProcessorConstructor(BaseModel):
@@ -414,609 +396,66 @@ class ResponsesRequest(OpenAIBaseModel):
            )
        if (top_k := self.top_k) is None:
            top_k = default_sampling_params.get(
                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
-            )
-        stop_token_ids = default_sampling_params.get("stop_token_ids")
-        # Structured output
-        structured_outputs = None
-        if self.text is not None and self.text.format is not None:
-            response_format = self.text.format
-            if (
-                response_format.type == "json_schema"
-                and response_format.schema_ is not None
-            ):
-                structured_outputs = StructuredOutputsParams(
-                    json=response_format.schema_
-                )
-            elif response_format.type == "json_object":
-                raise NotImplementedError("json_object is not supported")
-        # TODO: add more parameters
-        return SamplingParams.from_optional(
-            temperature=temperature,
-            top_p=top_p,
-            top_k=top_k,
-            max_tokens=max_tokens,
-            logprobs=self.top_logprobs if self.is_include_output_logprobs() else None,
-            stop_token_ids=stop_token_ids,
-            output_kind=(
-                RequestOutputKind.DELTA if self.stream else RequestOutputKind.FINAL_ONLY
-            ),
-            structured_outputs=structured_outputs,
-            logit_bias=self.logit_bias,
-            skip_clone=True,  # Created fresh per request, safe to skip clone
-        )
-    def is_include_output_logprobs(self) -> bool:
-        """Check if the request includes output logprobs."""
-        if self.include is None:
-            return False
-        return (
-            isinstance(self.include, list)
-            and "message.output_text.logprobs" in self.include
-        )
-    @model_validator(mode="before")
-    def validate_background(cls, data):
-        if not data.get("background"):
-            return data
-        if not data.get("store", True):
-            raise ValueError("background can only be used when `store` is true")
-        return data
-    @model_validator(mode="before")
-    def validate_prompt(cls, data):
-        if data.get("prompt") is not None:
-            raise VLLMValidationError(
-                "prompt template is not supported", parameter="prompt"
-            )
-        return data
-    @model_validator(mode="before")
-    def check_cache_salt_support(cls, data):
-        if data.get("cache_salt") is not None and (
-            not isinstance(data["cache_salt"], str) or not data["cache_salt"]
-        ):
-            raise ValueError(
-                "Parameter 'cache_salt' must be a non-empty string if provided."
-            )
-        return data
-    @model_validator(mode="before")
-    def function_call_parsing(cls, data):
-        """Parse function_call dictionaries into ResponseFunctionToolCall objects.
-        This ensures Pydantic can properly resolve union types in the input field.
-        Function calls provided as dicts are converted to ResponseFunctionToolCall
-        objects before validation, while invalid structures are left for Pydantic
-        to reject with appropriate error messages.
-        """
-        input_data = data.get("input")
-        # Early return for None, strings, or bytes
-        # (strings are iterable but shouldn't be processed)
-        if input_data is None or isinstance(input_data, (str, bytes)):
-            return data
-        # Convert iterators (like ValidatorIterator) to list
-        if not isinstance(input_data, list):
-            try:
-                input_data = list(input_data)
-            except TypeError:
-                # Not iterable, leave as-is for Pydantic to handle
-                return data
-        processed_input = []
-        for item in input_data:
-            if isinstance(item, dict) and item.get("type") == "function_call":
-                try:
-                    processed_input.append(ResponseFunctionToolCall(**item))
-                except ValidationError:
-                    # Let Pydantic handle validation for malformed function calls
-                    logger.debug(
-                        "Failed to parse function_call to ResponseFunctionToolCall, "
-                        "leaving for Pydantic validation"
-                    )
-                    processed_input.append(item)
-            else:
-                processed_input.append(item)
-        data["input"] = processed_input
-        return data
-class ChatCompletionRequest(OpenAIBaseModel):
-    # Ordered by official OpenAI API documentation
-    # https://platform.openai.com/docs/api-reference/chat/create
-    messages: list[ChatCompletionMessageParam]
-    model: str | None = None
-    frequency_penalty: float | None = 0.0
-    logit_bias: dict[str, float] | None = None
-    logprobs: bool | None = False
-    top_logprobs: int | None = 0
-    max_tokens: int | None = Field(
-        default=None,
-        deprecated="max_tokens is deprecated in favor of "
-        "the max_completion_tokens field",
-    )
-    max_completion_tokens: int | None = None
-    n: int | None = 1
-    presence_penalty: float | None = 0.0
-    response_format: AnyResponseFormat | None = None
-    seed: int | None = Field(None, ge=_LONG_INFO.min, le=_LONG_INFO.max)
-    stop: str | list[str] | None = []
-    stream: bool | None = False
-    stream_options: StreamOptions | None = None
-    temperature: float | None = None
-    top_p: float | None = None
-    tools: list[ChatCompletionToolsParam] | None = None
-    tool_choice: (
-        Literal["none"]
-        | Literal["auto"]
-        | Literal["required"]
-        | ChatCompletionNamedToolChoiceParam
-        | None
-    ) = "none"
-    reasoning_effort: Literal["low", "medium", "high"] | None = None
-    include_reasoning: bool = True
-    parallel_tool_calls: bool | None = True
-    # NOTE this will be ignored by vLLM
-    user: str | None = None
-    # --8<-- [start:chat-completion-sampling-params]
-    use_beam_search: bool = False
-    top_k: int | None = None
-    min_p: float | None = None
-    repetition_penalty: float | None = None
-    length_penalty: float = 1.0
-    stop_token_ids: list[int] | None = []
-    include_stop_str_in_output: bool = False
-    ignore_eos: bool = False
-    min_tokens: int = 0
-    skip_special_tokens: bool = True
-    spaces_between_special_tokens: bool = True
-    truncate_prompt_tokens: Annotated[int, Field(ge=-1, le=_LONG_INFO.max)] | None = (
-        None
-    )
-    prompt_logprobs: int | None = None
-    allowed_token_ids: list[int] | None = None
-    bad_words: list[str] = Field(default_factory=list)
-    # --8<-- [end:chat-completion-sampling-params]
-    # --8<-- [start:chat-completion-extra-params]
-    echo: bool = Field(
-        default=False,
-        description=(
-            "If true, the new message will be prepended with the last message "
-            "if they belong to the same role."
-        ),
-    )
-    add_generation_prompt: bool = Field(
-        default=True,
-        description=(
-            "If true, the generation prompt will be added to the chat template. "
-            "This is a parameter used by chat template in tokenizer config of the "
-            "model."
-        ),
-    )
-    continue_final_message: bool = Field(
-        default=False,
-        description=(
-            "If this is set, the chat will be formatted so that the final "
-            "message in the chat is open-ended, without any EOS tokens. The "
-            "model will continue this message rather than starting a new one. "
-            'This allows you to "prefill" part of the model\'s response for it. '
-            "Cannot be used at the same time as `add_generation_prompt`."
-        ),
-    )
-    add_special_tokens: bool = Field(
-        default=False,
-        description=(
-            "If true, special tokens (e.g. BOS) will be added to the prompt "
-            "on top of what is added by the chat template. "
-            "For most models, the chat template takes care of adding the "
-            "special tokens so this should be set to false (as is the "
-            "default)."
-        ),
-    )
-    documents: list[dict[str, str]] | None = Field(
-        default=None,
-        description=(
-            "A list of dicts representing documents that will be accessible to "
-            "the model if it is performing RAG (retrieval-augmented generation)."
-            " If the template does not support RAG, this argument will have no "
-            "effect. We recommend that each document should be a dict containing "
-            '"title" and "text" keys.'
-        ),
-    )
-    chat_template: str | None = Field(
-        default=None,
-        description=(
-            "A Jinja template to use for this conversion. "
-            "As of transformers v4.44, default chat template is no longer "
-            "allowed, so you must provide a chat template if the tokenizer "
-            "does not define one."
-        ),
-    )
-    chat_template_kwargs: dict[str, Any] | None = Field(
-        default=None,
-        description=(
-            "Additional keyword args to pass to the template renderer. "
-            "Will be accessible by the chat template."
-        ),
-    )
-    mm_processor_kwargs: dict[str, Any] | None = Field(
-        default=None,
-        description=("Additional kwargs to pass to the HF processor."),
-    )
-    structured_outputs: StructuredOutputsParams | None = Field(
-        default=None,
-        description="Additional kwargs for structured outputs",
-    )
-    priority: int = Field(
-        default=0,
-        description=(
-            "The priority of the request (lower means earlier handling; "
-            "default: 0). Any priority other than 0 will raise an error "
-            "if the served model does not use priority scheduling."
-        ),
-    )
-    request_id: str = Field(
-        default_factory=random_uuid,
-        description=(
-            "The request_id related to this request. If the caller does "
-            "not set it, a random_uuid will be generated. This id is used "
-            "through out the inference process and return in response."
-        ),
-    )
-    logits_processors: LogitsProcessors | None = Field(
-        default=None,
-        description=(
-            "A list of either qualified names of logits processors, or "
-            "constructor objects, to apply when sampling. A constructor is "
-            "a JSON object with a required 'qualname' field specifying the "
-            "qualified name of the processor class/factory, and optional "
-            "'args' and 'kwargs' fields containing positional and keyword "
-            "arguments. For example: {'qualname': "
-            "'my_module.MyLogitsProcessor', 'args': [1, 2], 'kwargs': "
-            "{'param': 'value'}}."
-        ),
-    )
-    return_tokens_as_token_ids: bool | None = Field(
-        default=None,
-        description=(
-            "If specified with 'logprobs', tokens are represented "
-            " as strings of the form 'token_id:{token_id}' so that tokens "
-            "that are not JSON-encodable can be identified."
-        ),
-    )
-    return_token_ids: bool | None = Field(
-        default=None,
-        description=(
-            "If specified, the result will include token IDs alongside the "
-            "generated text. In streaming mode, prompt_token_ids is included "
-            "only in the first chunk, and token_ids contains the delta tokens "
-            "for each chunk. This is useful for debugging or when you "
-            "need to map generated text back to input tokens."
-        ),
-    )
-    cache_salt: str | None = Field(
-        default=None,
-        description=(
-            "If specified, the prefix cache will be salted with the provided "
-            "string to prevent an attacker to guess prompts in multi-user "
-            "environments. The salt should be random, protected from "
-            "access by 3rd parties, and long enough to be "
-            "unpredictable (e.g., 43 characters base64-encoded, corresponding "
-            "to 256 bit)."
-        ),
-    )
-    kv_transfer_params: dict[str, Any] | None = Field(
-        default=None,
-        description="KVTransfer parameters used for disaggregated serving.",
-    )
-    vllm_xargs: dict[str, str | int | float | list[str | int | float]] | None = Field(
-        default=None,
-        description=(
-            "Additional request parameters with (list of) string or "
-            "numeric values, used by custom extensions."
-        ),
-    )
-    # --8<-- [end:chat-completion-extra-params]
-    # Default sampling parameters for chat completion requests
-    _DEFAULT_SAMPLING_PARAMS: dict = {
-        "repetition_penalty": 1.0,
-        "temperature": 1.0,
-        "top_p": 1.0,
-        "top_k": 0,
-        "min_p": 0.0,
-    }
-    def to_beam_search_params(
-        self, max_tokens: int, default_sampling_params: dict
-    ) -> BeamSearchParams:
-        n = self.n if self.n is not None else 1
-        if (temperature := self.temperature) is None:
-            temperature = default_sampling_params.get(
-                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
-            )
-        return BeamSearchParams(
-            beam_width=n,
-            max_tokens=max_tokens,
-            ignore_eos=self.ignore_eos,
-            temperature=temperature,
-            length_penalty=self.length_penalty,
-            include_stop_str_in_output=self.include_stop_str_in_output,
-        )
-    def to_sampling_params(
-        self,
-        max_tokens: int,
-        logits_processor_pattern: str | None,
-        default_sampling_params: dict,
-    ) -> SamplingParams:
-        # Default parameters
-        if (repetition_penalty := self.repetition_penalty) is None:
-            repetition_penalty = default_sampling_params.get(
-                "repetition_penalty",
-                self._DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
-            )
-        if (temperature := self.temperature) is None:
-            temperature = default_sampling_params.get(
-                "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
-            )
-        if (top_p := self.top_p) is None:
-            top_p = default_sampling_params.get(
-                "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]
-            )
-        if (top_k := self.top_k) is None:
-            top_k = default_sampling_params.get(
-                "top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
-            )
-        if (min_p := self.min_p) is None:
-            min_p = default_sampling_params.get(
-                "min_p", self._DEFAULT_SAMPLING_PARAMS["min_p"]
-            )
-        prompt_logprobs = self.prompt_logprobs
-        if prompt_logprobs is None and self.echo:
-            prompt_logprobs = self.top_logprobs
-        response_format = self.response_format
-        if response_format is not None:
-            # If structured outputs wasn't already enabled,
-            # we must enable it for these features to work
-            if self.structured_outputs is None:
-                self.structured_outputs = StructuredOutputsParams()
-            # Set structured output params for response format
-            if response_format.type == "json_object":
-                self.structured_outputs.json_object = True
-            elif response_format.type == "json_schema":
-                json_schema = response_format.json_schema
-                assert json_schema is not None
-                self.structured_outputs.json = json_schema.json_schema
-            elif response_format.type == "structural_tag":
-                structural_tag = response_format
-                assert structural_tag is not None and isinstance(
-                    structural_tag,
-                    (
-                        LegacyStructuralTagResponseFormat,
-                        StructuralTagResponseFormat,
-                    ),
-                )
-                s_tag_obj = structural_tag.model_dump(by_alias=True)
-                self.structured_outputs.structural_tag = json.dumps(s_tag_obj)
-        extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
-        if self.kv_transfer_params:
-            # Pass in kv_transfer_params via extra_args
-            extra_args["kv_transfer_params"] = self.kv_transfer_params
-        return SamplingParams.from_optional(
-            n=self.n,
-            presence_penalty=self.presence_penalty,
-            frequency_penalty=self.frequency_penalty,
-            repetition_penalty=repetition_penalty,
-            temperature=temperature,
-            top_p=top_p,
-            top_k=top_k,
-            min_p=min_p,
-            seed=self.seed,
-            stop=self.stop,
-            stop_token_ids=self.stop_token_ids,
-            logprobs=self.top_logprobs if self.logprobs else None,
-            prompt_logprobs=prompt_logprobs,
-            ignore_eos=self.ignore_eos,
-            max_tokens=max_tokens,
-            min_tokens=self.min_tokens,
-            skip_special_tokens=self.skip_special_tokens,
-            spaces_between_special_tokens=self.spaces_between_special_tokens,
-            logits_processors=get_logits_processors(
-                self.logits_processors, logits_processor_pattern
-            ),
-            include_stop_str_in_output=self.include_stop_str_in_output,
-            truncate_prompt_tokens=self.truncate_prompt_tokens,
-            output_kind=RequestOutputKind.DELTA
-            if self.stream
-            else RequestOutputKind.FINAL_ONLY,
-            structured_outputs=self.structured_outputs,
-            logit_bias=self.logit_bias,
-            bad_words=self.bad_words,
-            allowed_token_ids=self.allowed_token_ids,
-            extra_args=extra_args or None,
-            skip_clone=True,  # Created fresh per request, safe to skip clone
-        )
-    @model_validator(mode="before")
-    @classmethod
-    def validate_stream_options(cls, data):
-        if data.get("stream_options") and not data.get("stream"):
-            raise VLLMValidationError(
-                "Stream options can only be defined when `stream=True`.",
-                parameter="stream_options",
-            )
-        return data
-    @model_validator(mode="before")
-    @classmethod
-    def check_logprobs(cls, data):
-        if (prompt_logprobs := data.get("prompt_logprobs")) is not None:
-            if data.get("stream") and (prompt_logprobs > 0 or prompt_logprobs == -1):
-                raise VLLMValidationError(
-                    "`prompt_logprobs` are not available when `stream=True`.",
-                    parameter="prompt_logprobs",
-                )
-            if prompt_logprobs < 0 and prompt_logprobs != -1:
-                raise VLLMValidationError(
-                    "`prompt_logprobs` must be a positive value or -1.",
-                    parameter="prompt_logprobs",
-                    value=prompt_logprobs,
-                )
-        if (top_logprobs := data.get("top_logprobs")) is not None:
-            if top_logprobs < 0 and top_logprobs != -1:
-                raise VLLMValidationError(
-                    "`top_logprobs` must be a positive value or -1.",
-                    parameter="top_logprobs",
-                    value=top_logprobs,
-                )
-            if (top_logprobs == -1 or top_logprobs > 0) and not data.get("logprobs"):
-                raise VLLMValidationError(
-                    "when using `top_logprobs`, `logprobs` must be set to true.",
-                    parameter="top_logprobs",
-                )
-        return data
-    @model_validator(mode="before")
-    @classmethod
-    def check_structured_outputs_count(cls, data):
-        if isinstance(data, ValueError):
-            raise data
-        if data.get("structured_outputs", None) is None:
-            return data
-        structured_outputs_kwargs = data["structured_outputs"]
-        count = sum(
-            structured_outputs_kwargs.get(k) is not None
-            for k in ("json", "regex", "choice")
-        )
-        # you can only use one kind of constraints for structured outputs
-        if count > 1:
-            raise ValueError(
-                "You can only use one kind of constraints for structured "
-                "outputs ('json', 'regex' or 'choice')."
-            )
-        # you can only either use structured outputs or tools, not both
-        if count > 1 and data.get("tool_choice", "none") not in (
-            "none",
-            "auto",
-            "required",
-        ):
-            raise ValueError(
-                "You can only either use constraints for structured outputs "
-                "or tools, not both."
            )
-        return data
+        stop_token_ids = default_sampling_params.get("stop_token_ids")
-    @model_validator(mode="before")
-    @classmethod
-    def check_tool_usage(cls, data):
-        # if "tool_choice" is not specified but tools are provided,
-        # default to "auto" tool_choice
-        if "tool_choice" not in data and data.get("tools"):
-            data["tool_choice"] = "auto"
-        # if "tool_choice" is "none" -- no validation is needed for tools
-        if "tool_choice" in data and data["tool_choice"] == "none":
-            return data
-        # if "tool_choice" is specified -- validation
-        if "tool_choice" in data and data["tool_choice"] is not None:
-            # ensure that if "tool choice" is specified, tools are present
-            if "tools" not in data or data["tools"] is None:
-                raise ValueError("When using `tool_choice`, `tools` must be set.")
-            # make sure that tool choice is either a named tool
+        # Structured output
-            # OR that it's set to "auto" or "required"
+        structured_outputs = None
-            if data["tool_choice"] not in ["auto", "required"] and not isinstance(
+        if self.text is not None and self.text.format is not None:
-                data["tool_choice"], dict
+            response_format = self.text.format
+            if (
+                response_format.type == "json_schema"
+                and response_format.schema_ is not None
            ):
-                raise ValueError(
+                structured_outputs = StructuredOutputsParams(
-                    f"Invalid value for `tool_choice`: {data['tool_choice']}! "
+                    json=response_format.schema_
-                    'Only named tools, "none", "auto" or "required" '
-                    "are supported."
                )
+            elif response_format.type == "json_object":
+                raise NotImplementedError("json_object is not supported")
-            # if tool_choice is "required" but the "tools" list is empty,
+        # TODO: add more parameters
-            # override the data to behave like "none" to align with
+        return SamplingParams.from_optional(
-            # OpenAI’s behavior.
+            temperature=temperature,
-            if (
+            top_p=top_p,
-                data["tool_choice"] == "required"
+            top_k=top_k,
-                and isinstance(data["tools"], list)
+            max_tokens=max_tokens,
-                and len(data["tools"]) == 0
+            logprobs=self.top_logprobs if self.is_include_output_logprobs() else None,
-            ):
+            stop_token_ids=stop_token_ids,
-                data["tool_choice"] = "none"
+            output_kind=(
-                del data["tools"]
+                RequestOutputKind.DELTA if self.stream else RequestOutputKind.FINAL_ONLY
-                return data
+            ),
+            structured_outputs=structured_outputs,
+            logit_bias=self.logit_bias,
+            skip_clone=True,  # Created fresh per request, safe to skip clone
+        )
-            # ensure that if "tool_choice" is specified as an object,
+    def is_include_output_logprobs(self) -> bool:
-            # it matches a valid tool
+        """Check if the request includes output logprobs."""
-            correct_usage_message = (
+        if self.include is None:
-                'Correct usage: `{"type": "function",'
+            return False
-                ' "function": {"name": "my_function"}}`'
+        return (
-            )
+            isinstance(self.include, list)
-            if isinstance(data["tool_choice"], dict):
+            and "message.output_text.logprobs" in self.include
-                valid_tool = False
+        )
-                function = data["tool_choice"].get("function")
-                if not isinstance(function, dict):
+    @model_validator(mode="before")
-                    raise ValueError(
+    def validate_background(cls, data):
-                        f"Invalid value for `function`: `{function}` in "
+        if not data.get("background"):
-                        f"`tool_choice`! {correct_usage_message}"
+            return data
-                    )
+        if not data.get("store", True):
-                if "name" not in function:
+            raise ValueError("background can only be used when `store` is true")
-                    raise ValueError(
-                        f"Expected field `name` in `function` in "
-                        f"`tool_choice`! {correct_usage_message}"
-                    )
-                function_name = function["name"]
-                if not isinstance(function_name, str) or len(function_name) == 0:
-                    raise ValueError(
-                        f"Invalid `name` in `function`: `{function_name}`"
-                        f" in `tool_choice`! {correct_usage_message}"
-                    )
-                for tool in data["tools"]:
-                    if tool["function"]["name"] == function_name:
-                        valid_tool = True
-                        break
-                if not valid_tool:
-                    raise ValueError(
-                        "The tool specified in `tool_choice` does not match any"
-                        " of the specified `tools`"
-                    )
        return data
    @model_validator(mode="before")
-    @classmethod
+    def validate_prompt(cls, data):
-    def check_generation_prompt(cls, data):
+        if data.get("prompt") is not None:
-        if data.get("continue_final_message") and data.get("add_generation_prompt"):
+            raise VLLMValidationError(
-            raise ValueError(
+                "prompt template is not supported", parameter="prompt"
-                "Cannot set both `continue_final_message` and "
-                "`add_generation_prompt` to True."
            )
        return data
    @model_validator(mode="before")
-    @classmethod
    def check_cache_salt_support(cls, data):
        if data.get("cache_salt") is not None and (
            not isinstance(data["cache_salt"], str) or not data["cache_salt"]
@@ -1026,6 +465,48 @@ class ChatCompletionRequest(OpenAIBaseModel):
            )
        return data
+    @model_validator(mode="before")
+    def function_call_parsing(cls, data):
+        """Parse function_call dictionaries into ResponseFunctionToolCall objects.
+        This ensures Pydantic can properly resolve union types in the input field.
+        Function calls provided as dicts are converted to ResponseFunctionToolCall
+        objects before validation, while invalid structures are left for Pydantic
+        to reject with appropriate error messages.
+        """
+        input_data = data.get("input")
+        # Early return for None, strings, or bytes
+        # (strings are iterable but shouldn't be processed)
+        if input_data is None or isinstance(input_data, (str, bytes)):
+            return data
+        # Convert iterators (like ValidatorIterator) to list
+        if not isinstance(input_data, list):
+            try:
+                input_data = list(input_data)
+            except TypeError:
+                # Not iterable, leave as-is for Pydantic to handle
+                return data
+        processed_input = []
+        for item in input_data:
+            if isinstance(item, dict) and item.get("type") == "function_call":
+                try:
+                    processed_input.append(ResponseFunctionToolCall(**item))
+                except ValidationError:
+                    # Let Pydantic handle validation for malformed function calls
+                    logger.debug(
+                        "Failed to parse function_call to ResponseFunctionToolCall, "
+                        "leaving for Pydantic validation"
+                    )
+                    processed_input.append(item)
+            else:
+                processed_input.append(item)
+        data["input"] = processed_input
+        return data
 class CompletionRequest(OpenAIBaseModel):
    # Ordered by official OpenAI API documentation
@@ -1486,75 +967,6 @@ class ExtractedToolCallInformation(BaseModel):
    content: str | None = None
-class ChatMessage(OpenAIBaseModel):
-    role: str
-    content: str | None = None
-    refusal: str | None = None
-    annotations: OpenAIAnnotation | None = None
-    audio: OpenAIChatCompletionAudio | None = None
-    function_call: FunctionCall | None = None
-    tool_calls: list[ToolCall] = Field(default_factory=list)
-    # vLLM-specific fields that are not in OpenAI spec
-    reasoning: str | None = None
-    reasoning_content: str | None = None
-    """Deprecated: use `reasoning` instead."""
-    @model_validator(mode="after")
-    def handle_deprecated_reasoning_content(self):
-        """Copy reasoning to reasoning_content for backward compatibility."""
-        self.reasoning_content = self.reasoning
-        return self
-class ChatCompletionLogProb(OpenAIBaseModel):
-    token: str
-    logprob: float = -9999.0
-    bytes: list[int] | None = None
-class ChatCompletionLogProbsContent(ChatCompletionLogProb):
-    # Workaround: redefine fields name cache so that it's not
-    # shared with the super class.
-    field_names: ClassVar[set[str] | None] = None
-    top_logprobs: list[ChatCompletionLogProb] = Field(default_factory=list)
-class ChatCompletionLogProbs(OpenAIBaseModel):
-    content: list[ChatCompletionLogProbsContent] | None = None
-class ChatCompletionResponseChoice(OpenAIBaseModel):
-    index: int
-    message: ChatMessage
-    logprobs: ChatCompletionLogProbs | None = None
-    # per OpenAI spec this is the default
-    finish_reason: str | None = "stop"
-    # not part of the OpenAI spec but included in vLLM for legacy reasons
-    stop_reason: int | str | None = None
-    # not part of the OpenAI spec but is useful for tracing the tokens
-    # in agent scenarios
-    token_ids: list[int] | None = None
-class ChatCompletionResponse(OpenAIBaseModel):
-    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
-    object: Literal["chat.completion"] = "chat.completion"
-    created: int = Field(default_factory=lambda: int(time.time()))
-    model: str
-    choices: list[ChatCompletionResponseChoice]
-    service_tier: Literal["auto", "default", "flex", "scale", "priority"] | None = None
-    system_fingerprint: str | None = None
-    usage: UsageInfo
-    # vLLM-specific fields that are not in OpenAI spec
-    prompt_logprobs: list[dict[int, Logprob] | None] | None = None
-    prompt_token_ids: list[int] | None = None
-    kv_transfer_params: dict[str, Any] | None = Field(
-        default=None, description="KVTransfer parameters."
-    )
 class DeltaMessage(OpenAIBaseModel):
    role: str | None = None
    content: str | None = None
@@ -1570,27 +982,6 @@ class DeltaMessage(OpenAIBaseModel):
        return self
-class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
-    index: int
-    delta: DeltaMessage
-    logprobs: ChatCompletionLogProbs | None = None
-    finish_reason: str | None = None
-    stop_reason: int | str | None = None
-    # not part of the OpenAI spec but for tracing the tokens
-    token_ids: list[int] | None = None
-class ChatCompletionStreamResponse(OpenAIBaseModel):
-    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
-    object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
-    created: int = Field(default_factory=lambda: int(time.time()))
-    model: str
-    choices: list[ChatCompletionResponseStreamChoice]
-    usage: UsageInfo | None = Field(default=None)
-    # not part of the OpenAI spec but for tracing the tokens
-    prompt_token_ids: list[int] | None = None
 class TranscriptionResponseStreamChoice(OpenAIBaseModel):
    delta: DeltaMessage
    finish_reason: str | None = None
@@ -1856,128 +1247,6 @@ StreamingResponsesResponse: TypeAlias = (
 )
-class TokenizeCompletionRequest(OpenAIBaseModel):
-    model: str | None = None
-    prompt: str
-    add_special_tokens: bool = Field(
-        default=True,
-        description=(
-            "If true (the default), special tokens (e.g. BOS) will be added to "
-            "the prompt."
-        ),
-    )
-    return_token_strs: bool | None = Field(
-        default=False,
-        description=(
-            "If true, also return the token strings corresponding to the token ids."
-        ),
-    )
-class TokenizeChatRequest(OpenAIBaseModel):
-    model: str | None = None
-    messages: list[ChatCompletionMessageParam]
-    add_generation_prompt: bool = Field(
-        default=True,
-        description=(
-            "If true, the generation prompt will be added to the chat template. "
-            "This is a parameter used by chat template in tokenizer config of the "
-            "model."
-        ),
-    )
-    return_token_strs: bool | None = Field(
-        default=False,
-        description=(
-            "If true, also return the token strings corresponding to the token ids."
-        ),
-    )
-    continue_final_message: bool = Field(
-        default=False,
-        description=(
-            "If this is set, the chat will be formatted so that the final "
-            "message in the chat is open-ended, without any EOS tokens. The "
-            "model will continue this message rather than starting a new one. "
-            'This allows you to "prefill" part of the model\'s response for it. '
-            "Cannot be used at the same time as `add_generation_prompt`."
-        ),
-    )
-    add_special_tokens: bool = Field(
-        default=False,
-        description=(
-            "If true, special tokens (e.g. BOS) will be added to the prompt "
-            "on top of what is added by the chat template. "
-            "For most models, the chat template takes care of adding the "
-            "special tokens so this should be set to false (as is the "
-            "default)."
-        ),
-    )
-    chat_template: str | None = Field(
-        default=None,
-        description=(
-            "A Jinja template to use for this conversion. "
-            "As of transformers v4.44, default chat template is no longer "
-            "allowed, so you must provide a chat template if the tokenizer "
-            "does not define one."
-        ),
-    )
-    chat_template_kwargs: dict[str, Any] | None = Field(
-        default=None,
-        description=(
-            "Additional keyword args to pass to the template renderer. "
-            "Will be accessible by the chat template."
-        ),
-    )
-    mm_processor_kwargs: dict[str, Any] | None = Field(
-        default=None,
-        description=("Additional kwargs to pass to the HF processor."),
-    )
-    tools: list[ChatCompletionToolsParam] | None = Field(
-        default=None,
-        description=("A list of tools the model may call."),
-    )
-    @model_validator(mode="before")
-    @classmethod
-    def check_generation_prompt(cls, data):
-        if data.get("continue_final_message") and data.get("add_generation_prompt"):
-            raise ValueError(
-                "Cannot set both `continue_final_message` and "
-                "`add_generation_prompt` to True."
-            )
-        return data
-TokenizeRequest: TypeAlias = TokenizeCompletionRequest | TokenizeChatRequest
-class TokenizeResponse(OpenAIBaseModel):
-    count: int
-    max_model_len: int
-    tokens: list[int]
-    token_strs: list[str] | None = None
-class DetokenizeRequest(OpenAIBaseModel):
-    model: str | None = None
-    tokens: list[int]
-class DetokenizeResponse(OpenAIBaseModel):
-    prompt: str
-class TokenizerInfoResponse(OpenAIBaseModel):
-    """
-    Response containing tokenizer configuration
-    equivalent to tokenizer_config.json
-    """
-    model_config = ConfigDict(extra="allow")
-    tokenizer_class: str
 class LoadLoRAAdapterRequest(BaseModel):
    lora_name: str
    lora_path: str
@@ -2537,30 +1806,3 @@ class GenerateRequest(BaseModel):
        default=None,
        description="KVTransfer parameters used for disaggregated serving.",
    )
-class GenerateResponseChoice(BaseModel):
-    index: int
-    logprobs: ChatCompletionLogProbs | None = None
-    # per OpenAI spec this is the default
-    finish_reason: str | None = "stop"
-    token_ids: list[int] | None = None
-class GenerateResponse(BaseModel):
-    request_id: str = Field(
-        default_factory=random_uuid,
-        description=(
-            "The request_id related to this request. If the caller does "
-            "not set it, a random_uuid will be generated. This id is used "
-            "through out the inference process and return in response."
-        ),
-    )
-    choices: list[GenerateResponseChoice]
-    prompt_logprobs: list[dict[int, Logprob] | None] | None = None
-    kv_transfer_params: dict[str, Any] | None = Field(
-        default=None,
-        description="KVTransfer parameters used for disaggregated serving.",
-    )
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -38,22 +38,20 @@ from vllm.entrypoints.context import (
    StreamingHarmonyContext,
 )
 from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
    ChatCompletionNamedToolChoiceParam,
    ChatCompletionRequest,
    ChatCompletionResponse,
+)
+from vllm.entrypoints.openai.engine.protocol import (
    CompletionRequest,
    CompletionResponse,
-    DetokenizeRequest,
    ErrorInfo,
    ErrorResponse,
    FunctionCall,
    FunctionDefinition,
    ResponseInputOutputItem,
    ResponsesRequest,
-    TokenizeChatRequest,
-    TokenizeCompletionRequest,
-    TokenizeResponse,
    TranscriptionRequest,
    TranscriptionResponse,
    TranslationRequest,
@@ -86,6 +84,12 @@ from vllm.entrypoints.responses_utils import (
    construct_input_messages,
 )
 from vllm.entrypoints.serve.disagg.protocol import GenerateRequest, GenerateResponse
+from vllm.entrypoints.serve.tokenize.protocol import (
+    DetokenizeRequest,
+    TokenizeChatRequest,
+    TokenizeCompletionRequest,
+    TokenizeResponse,
+)
 from vllm.entrypoints.utils import _validate_truncation_size
 from vllm.inputs.data import PromptType, TokensPrompt
 from vllm.inputs.parse import (

--- a/vllm/entrypoints/openai/parser/harmony_utils.py
+++ b/vllm/entrypoints/openai/parser/harmony_utils.py
@@ -43,8 +43,8 @@ from openai_harmony import Message as OpenAIHarmonyMessage
 from openai_harmony import Role as OpenAIHarmonyRole
 from vllm import envs
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionToolsParam
-    ChatCompletionToolsParam,
+from vllm.entrypoints.openai.engine.protocol import (
    ResponseInputOutputItem,
    ResponsesRequest,
 )

--- a/vllm/entrypoints/openai/parser/responses_parser.py
+++ b/vllm/entrypoints/openai/parser/responses_parser.py
@@ -16,7 +16,10 @@ from openai.types.responses.response_reasoning_item import (
 )
 from vllm.entrypoints.constants import MCP_PREFIX
-from vllm.entrypoints.openai.protocol import ResponseInputOutputItem, ResponsesRequest
+from vllm.entrypoints.openai.engine.protocol import (
+    ResponseInputOutputItem,
+    ResponsesRequest,
+)
 from vllm.outputs import CompletionOutput
 from vllm.reasoning.abs_reasoning_parsers import ReasoningParser
 from vllm.tokenizers import TokenizerLike

--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -19,13 +19,15 @@ from tqdm import tqdm
 from vllm.engine.arg_utils import AsyncEngineArgs, optional_type
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
    ChatCompletionRequest,
    ChatCompletionResponse,
+)
+from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
+from vllm.entrypoints.openai.engine.protocol import (
    ErrorResponse,
    OpenAIBaseModel,
 )
-from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.entrypoints.pooling.embed.protocol import EmbeddingRequest, EmbeddingResponse
 from vllm.entrypoints.pooling.embed.serving import OpenAIServingEmbedding

--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -12,7 +12,7 @@ from fastapi import Request
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.engine.protocol import (
    CompletionLogProbs,
    CompletionRequest,
    CompletionResponse,
@@ -25,7 +25,7 @@ from vllm.entrypoints.openai.protocol import (
    UsageInfo,
    VLLMValidationError,
 )
-from vllm.entrypoints.openai.serving_engine import (
+from vllm.entrypoints.openai.engine.serving import (
    GenerationError,
    OpenAIServing,
    clamp_prompt_logprobs,

--- a/vllm/entrypoints/openai/serving_models.py
+++ b/vllm/entrypoints/openai/serving_models.py
@@ -7,7 +7,7 @@ from dataclasses import dataclass
 from http import HTTPStatus
 from vllm.engine.protocol import EngineClient
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.engine.protocol import (
    ErrorInfo,
    ErrorResponse,
    LoadLoRAAdapterRequest,

--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -72,19 +72,7 @@ from vllm.entrypoints.context import (
    StreamingHarmonyContext,
 )
 from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.parser.harmony_utils import (
+from vllm.entrypoints.openai.engine.protocol import (
-    construct_harmony_previous_input_messages,
-    get_developer_message,
-    get_stop_tokens_for_assistant_actions,
-    get_system_message,
-    get_user_message,
-    has_custom_tools,
-    parse_output_message,
-    parse_remaining_state,
-    parse_response_input,
-    render_for_completion,
-)
-from vllm.entrypoints.openai.protocol import (
    DeltaMessage,
    ErrorResponse,
    InputTokensDetails,
@@ -102,10 +90,22 @@ from vllm.entrypoints.openai.protocol import (
    StreamingResponsesResponse,
    VLLMValidationError,
 )
-from vllm.entrypoints.openai.serving_engine import (
+from vllm.entrypoints.openai.engine.serving import (
    GenerationError,
    OpenAIServing,
 )
+from vllm.entrypoints.openai.parser.harmony_utils import (
+    construct_harmony_previous_input_messages,
+    get_developer_message,
+    get_stop_tokens_for_assistant_actions,
+    get_system_message,
+    get_user_message,
+    has_custom_tools,
+    parse_output_message,
+    parse_remaining_state,
+    parse_response_input,
+    render_for_completion,
+)
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.responses_utils import (
    construct_input_messages,

--- a/vllm/entrypoints/openai/serving_transcription.py
+++ b/vllm/entrypoints/openai/serving_transcription.py
@@ -6,7 +6,7 @@ from fastapi import Request
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.engine.protocol import (
    ErrorResponse,
    RequestResponseMetadata,
    TranscriptionRequest,

--- a/vllm/entrypoints/openai/speech_to_text.py
+++ b/vllm/entrypoints/openai/speech_to_text.py
@@ -15,7 +15,7 @@ from transformers import PreTrainedTokenizerBase
 import vllm.envs as envs
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.logger import RequestLogger
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.engine.protocol import (
    DeltaMessage,
    ErrorResponse,
    RequestResponseMetadata,
@@ -32,7 +32,7 @@ from vllm.entrypoints.openai.protocol import (
    UsageInfo,
    VLLMValidationError,
 )
-from vllm.entrypoints.openai.serving_engine import OpenAIServing, SpeechToTextRequest
+from vllm.entrypoints.openai.engine.serving import OpenAIServing, SpeechToTextRequest
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.inputs.data import PromptType
 from vllm.logger import init_logger

--- a/vllm/entrypoints/openai/utils.py
+++ b/vllm/entrypoints/openai/utils.py
@@ -5,7 +5,7 @@ from typing import TypeVar
 from fastapi import Request
 from fastapi.exceptions import RequestValidationError
-from vllm.entrypoints.openai.protocol import (
+from vllm.entrypoints.openai.chat_completion.protocol import (
    ChatCompletionRequest,
    ChatCompletionResponseChoice,
    ChatCompletionResponseStreamChoice,

--- a/vllm/entrypoints/pooling/classify/api_router.py
+++ b/vllm/entrypoints/pooling/classify/api_router.py
@@ -6,7 +6,7 @@ from fastapi import APIRouter, Depends, HTTPException, Request
 from starlette.responses import JSONResponse
 from typing_extensions import assert_never
-from vllm.entrypoints.openai.protocol import ErrorResponse
+from vllm.entrypoints.openai.engine.protocol import ErrorResponse
 from vllm.entrypoints.openai.utils import validate_json_request
 from vllm.entrypoints.pooling.classify.protocol import (
    ClassificationRequest,