[Refactor] [10/N] to simplify the vLLM openai completion serving architecture (#32369)

Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>

[Refactor] [10/N] to simplify the vLLM openai completion serving architecture (#32369)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
4c1c501a · Chauncey · GitHub · ae1eba6a · 4c1c501a · 4c1c501a
Unverified commit 4c1c501a, authored Jan 15, 2026 by Chauncey; committed by GitHub on Jan 15, 2026.
3 changed files
--- a/vllm/entrypoints/serve/tokenize/api_router.py
+++ b/vllm/entrypoints/serve/tokenize/api_router.py
@@ -9,10 +9,10 @@ from fastapi.exceptions import RequestValidationError
 from fastapi.responses import JSONResponse
 from typing_extensions import assert_never
-from vllm.entrypoints.openai.api_server import validate_json_request
 from vllm.entrypoints.openai.engine.protocol import (
    ErrorResponse,
 )
+from vllm.entrypoints.openai.utils import validate_json_request
 from vllm.entrypoints.serve.tokenize.protocol import (
    DetokenizeRequest,
    DetokenizeResponse,

--- a/vllm/entrypoints/serve/tokenize/serving.py
+++ b/vllm/entrypoints/serve/tokenize/serving.py
@@ -13,7 +13,7 @@ from vllm.entrypoints.openai.engine.protocol import (
    ErrorResponse,
 )
 from vllm.entrypoints.openai.engine.serving import OpenAIServing
-from vllm.entrypoints.openai.serving_models import OpenAIServingModels
+from vllm.entrypoints.openai.models.serving import OpenAIServingModels
 from vllm.entrypoints.renderer import RenderConfig
 from vllm.entrypoints.serve.tokenize.protocol import (
    DetokenizeRequest,

--- a/vllm/entrypoints/utils.py
+++ b/vllm/entrypoints/utils.py
@@ -31,11 +31,13 @@ if TYPE_CHECKING:
    from vllm.entrypoints.openai.chat_completion.protocol import (
        ChatCompletionRequest,
    )
-    from vllm.entrypoints.openai.engine.protocol import (
+    from vllm.entrypoints.openai.completion.protocol import (
        CompletionRequest,
+    )
+    from vllm.entrypoints.openai.engine.protocol import (
        StreamOptions,
    )
-    from vllm.entrypoints.openai.serving_models import LoRAModulePath
+    from vllm.entrypoints.openai.models.protocol import LoRAModulePath
 else:
    ChatCompletionRequest = object
    CompletionRequest = object
@@ -281,7 +283,7 @@ def should_include_usage(
 def process_lora_modules(
    args_lora_modules: list[LoRAModulePath], default_mm_loras: dict[str, str] | None
 ) -> list[LoRAModulePath]:
-    from vllm.entrypoints.openai.serving_models import LoRAModulePath
+    from vllm.entrypoints.openai.models.serving import LoRAModulePath
    lora_modules = args_lora_modules
    if default_mm_loras: