# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from typing import TYPE_CHECKING

from fastapi import FastAPI

if TYPE_CHECKING:
    from argparse import Namespace

    from starlette.datastructures import State

    from vllm.engine.protocol import EngineClient
    from vllm.entrypoints.logger import RequestLogger
    from vllm.tasks import SupportedTask
else:
    # Runtime placeholder so the un-quoted ``RequestLogger | None`` annotation
    # below can be evaluated without importing the real class.
    RequestLogger = object


def register_generate_api_routers(app: FastAPI):
    """Attach the generation-related API routers to ``app``.

    The routers are attached in this order: chat completion, responses,
    completion, Anthropic. Each router module is imported inside the function,
    immediately before its ``attach_router`` is called.

    Args:
        app: The FastAPI application that receives the routers.
    """
    from vllm.entrypoints.openai.chat_completion.api_router import (
        attach_router as attach_chat_router,
    )

    attach_chat_router(app)

    from vllm.entrypoints.openai.responses.api_router import (
        attach_router as attach_responses_router,
    )

    attach_responses_router(app)

    from vllm.entrypoints.openai.completion.api_router import (
        attach_router as attach_completion_router,
    )

    attach_completion_router(app)

    from vllm.entrypoints.anthropic.api_router import (
        attach_router as attach_anthropic_router,
    )

    attach_anthropic_router(app)


async def init_generate_state(
    engine_client: "EngineClient",
    state: "State",
    args: "Namespace",
    request_logger: RequestLogger | None,
    supported_tasks: tuple["SupportedTask", ...],
):
    """Populate ``state`` with the serving handlers for generation endpoints.

    Sets the following attributes on ``state``:

    * ``openai_serving_responses``, ``openai_serving_chat``,
      ``openai_serving_completion``, ``anthropic_serving_messages`` and
      ``serving_tokens`` -- handler instances when ``"generate"`` is in
      ``supported_tasks``, otherwise ``None``.
    * ``openai_serving_render`` -- always an ``OpenAIServingRender`` instance.

    ``state.openai_serving_models`` is read, so it must already be set by the
    caller.

    Args:
        engine_client: Engine handle passed to every generate handler; its
            ``model_config``, ``renderer`` and ``io_processor`` feed the
            render handler.
        state: Application state object that receives the handlers.
        args: Parsed command-line options the handlers are configured from.
        request_logger: Optional logger forwarded to every handler.
        supported_tasks: Tasks supported by the engine; only membership of
            ``"generate"`` is checked here.
    """
    from vllm.entrypoints.anthropic.serving import AnthropicServingMessages
    from vllm.entrypoints.chat_utils import load_chat_template
    from vllm.entrypoints.mcp.tool_server import (
        DemoToolServer,
        MCPToolServer,
        ToolServer,
    )
    from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat
    from vllm.entrypoints.openai.completion.serving import OpenAIServingCompletion
    from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses
    from vllm.entrypoints.serve.disagg.serving import ServingTokens

    # Choose the tool server: the literal "demo" selects the demo server, any
    # other truthy value is handed to an MCP tool server, and a falsy value
    # means no tool server at all.
    tool_server: ToolServer | None = None
    tool_server_spec = args.tool_server
    if tool_server_spec == "demo":
        demo_server = DemoToolServer()
        await demo_server.init_and_validate()
        tool_server = demo_server
    elif tool_server_spec:
        mcp_server = MCPToolServer()
        await mcp_server.add_tool_server(tool_server_spec)
        tool_server = mcp_server

    resolved_chat_template = load_chat_template(args.chat_template)

    if "generate" in supported_tasks:
        serving_models = state.openai_serving_models

        state.openai_serving_responses = OpenAIServingResponses(
            engine_client,
            serving_models,
            request_logger=request_logger,
            chat_template=resolved_chat_template,
            chat_template_content_format=args.chat_template_content_format,
            return_tokens_as_token_ids=args.return_tokens_as_token_ids,
            enable_auto_tools=args.enable_auto_tool_choice,
            tool_parser=args.tool_call_parser,
            tool_server=tool_server,
            reasoning_parser=args.structured_outputs_config.reasoning_parser,
            enable_prompt_tokens_details=args.enable_prompt_tokens_details,
            enable_force_include_usage=args.enable_force_include_usage,
            enable_log_outputs=args.enable_log_outputs,
        )

        chat_handler = OpenAIServingChat(
            engine_client,
            serving_models,
            args.response_role,
            request_logger=request_logger,
            chat_template=resolved_chat_template,
            chat_template_content_format=args.chat_template_content_format,
            default_chat_template_kwargs=args.default_chat_template_kwargs,
            trust_request_chat_template=args.trust_request_chat_template,
            return_tokens_as_token_ids=args.return_tokens_as_token_ids,
            enable_auto_tools=args.enable_auto_tool_choice,
            exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none,
            tool_parser=args.tool_call_parser,
            reasoning_parser=args.structured_outputs_config.reasoning_parser,
            enable_prompt_tokens_details=args.enable_prompt_tokens_details,
            enable_force_include_usage=args.enable_force_include_usage,
            enable_log_outputs=args.enable_log_outputs,
            enable_log_deltas=args.enable_log_deltas,
        )
        state.openai_serving_chat = chat_handler
        # The chat handler is warmed up as soon as it is registered, before
        # the remaining handlers are built.
        chat_handler.warmup()

        state.openai_serving_completion = OpenAIServingCompletion(
            engine_client,
            serving_models,
            request_logger=request_logger,
            return_tokens_as_token_ids=args.return_tokens_as_token_ids,
            enable_prompt_tokens_details=args.enable_prompt_tokens_details,
            enable_force_include_usage=args.enable_force_include_usage,
        )

        state.anthropic_serving_messages = AnthropicServingMessages(
            engine_client,
            serving_models,
            args.response_role,
            request_logger=request_logger,
            chat_template=resolved_chat_template,
            chat_template_content_format=args.chat_template_content_format,
            return_tokens_as_token_ids=args.return_tokens_as_token_ids,
            enable_auto_tools=args.enable_auto_tool_choice,
            tool_parser=args.tool_call_parser,
            reasoning_parser=args.structured_outputs_config.reasoning_parser,
            enable_prompt_tokens_details=args.enable_prompt_tokens_details,
            enable_force_include_usage=args.enable_force_include_usage,
        )

        state.serving_tokens = ServingTokens(
            engine_client,
            serving_models,
            request_logger=request_logger,
            return_tokens_as_token_ids=args.return_tokens_as_token_ids,
            enable_prompt_tokens_details=args.enable_prompt_tokens_details,
            enable_log_outputs=args.enable_log_outputs,
            force_no_detokenize=args.tokens_only,
        )
    else:
        # Without the "generate" task every generation handler is registered
        # as None.
        state.openai_serving_responses = None
        state.openai_serving_chat = None
        state.openai_serving_completion = None
        state.anthropic_serving_messages = None
        state.serving_tokens = None

    # Render endpoints are always backed by OpenAIServingRender so that
    # /v1/chat/completions/render and /v1/completions/render work on both
    # generate-mode and render-only servers.
    from vllm.entrypoints.serve.render.serving import OpenAIServingRender

    state.openai_serving_render = OpenAIServingRender(
        model_config=engine_client.model_config,
        renderer=engine_client.renderer,
        io_processor=engine_client.io_processor,
        model_registry=state.openai_serving_models.registry,
        request_logger=request_logger,
        chat_template=resolved_chat_template,
        chat_template_content_format=args.chat_template_content_format,
        trust_request_chat_template=args.trust_request_chat_template,
        enable_auto_tools=args.enable_auto_tool_choice,
        exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none,
        tool_parser=args.tool_call_parser,
        default_chat_template_kwargs=args.default_chat_template_kwargs,
        log_error_stack=args.log_error_stack,
    )