cli_args.py 13.7 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
5
6
7
8
9
10
11
"""
This file contains the command line arguments for vLLM's
OpenAI-compatible server. It is kept in a separate file for documentation
purposes.
"""

import argparse
import json
import ssl
12
from collections.abc import Sequence
13
from dataclasses import field
14
from typing import Any, Literal
15
16

from pydantic.dataclasses import dataclass
17

18
import vllm.envs as envs
19
from vllm.config import config
20
from vllm.engine.arg_utils import AsyncEngineArgs, optional_type
21
22
23
24
25
26
27
28
from vllm.entrypoints.chat_utils import (
    ChatTemplateContentFormatOption,
    validate_chat_template,
)
from vllm.entrypoints.constants import (
    H11_MAX_HEADER_COUNT_DEFAULT,
    H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT,
)
29
from vllm.entrypoints.openai.models.protocol import LoRAModulePath
30
from vllm.logger import init_logger
31
from vllm.tool_parsers import ToolParserManager
32
from vllm.utils.argparse_utils import FlexibleArgumentParser
33

34
35
# Module-level logger, named after this module via `__name__`.
logger = init_logger(__name__)

36
37

class LoRAParserAction(argparse.Action):
38
39
40
41
    def __call__(
        self,
        parser: argparse.ArgumentParser,
        namespace: argparse.Namespace,
42
43
        values: str | Sequence[str] | None,
        option_string: str | None = None,
44
45
46
47
48
49
    ):
        if values is None:
            values = []
        if isinstance(values, str):
            raise TypeError("Expected values to be a list")

50
        lora_list: list[LoRAModulePath] = []
51
        for item in values:
52
            if item in [None, ""]:  # Skip if item is None or empty string
53
                continue
54
55
            if "=" in item and "," not in item:  # Old format: name=path
                name, path = item.split("=")
56
57
58
59
60
61
62
                lora_list.append(LoRAModulePath(name, path))
            else:  # Assume JSON format
                try:
                    lora_dict = json.loads(item)
                    lora = LoRAModulePath(**lora_dict)
                    lora_list.append(lora)
                except json.JSONDecodeError:
63
                    parser.error(f"Invalid JSON format for --lora-modules: {item}")
64
65
66
67
                except TypeError as e:
                    parser.error(
                        f"Invalid fields for --lora-modules: {item} - {str(e)}"
                    )
68
69
70
        setattr(namespace, self.dest, lora_list)


71
72
73
74
@config
@dataclass
class FrontendArgs:
    """Arguments for the OpenAI-compatible frontend server."""

    host: str | None = None
    """Host name."""
    port: int = 8000
    """Port number."""
    uds: str | None = None
    """Unix domain socket path. If set, host and port arguments are ignored."""
    uvicorn_log_level: Literal[
        "critical", "error", "warning", "info", "debug", "trace"
    ] = "info"
    """Log level for uvicorn."""
    disable_uvicorn_access_log: bool = False
    """Disable uvicorn access log."""
    allow_credentials: bool = False
    """Allow credentials."""
    allowed_origins: list[str] = field(default_factory=lambda: ["*"])
    """Allowed origins."""
    allowed_methods: list[str] = field(default_factory=lambda: ["*"])
    """Allowed methods."""
    allowed_headers: list[str] = field(default_factory=lambda: ["*"])
    """Allowed headers."""
    api_key: list[str] | None = None
    """If provided, the server will require one of these keys to be presented in
    the header."""
    lora_modules: list[LoRAModulePath] | None = None
    """LoRA modules configurations in either 'name=path' format or JSON format
    or JSON list format. Example (old format): `'name=path'` Example (new
    format): `{\"name\": \"name\", \"path\": \"lora_path\",
    \"base_model_name\": \"id\"}`"""
    chat_template: str | None = None
    """The file path to the chat template, or the template in single-line form
    for the specified model."""
    chat_template_content_format: ChatTemplateContentFormatOption = "auto"
    """The format to render message content within a chat template.

    * "string" will render the content as a string. Example: `"Hello World"`
    * "openai" will render the content as a list of dictionaries, similar to
      OpenAI schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
    trust_request_chat_template: bool = False
    """Whether to trust the chat template provided in the request. If False,
    the server will always use the chat template specified by `--chat-template`
    or the ones from tokenizer."""
    default_chat_template_kwargs: dict[str, Any] | None = None
    """Default keyword arguments to pass to the chat template renderer.
    These will be merged with request-level chat_template_kwargs,
    with request values taking precedence. Useful for setting default
    behavior for reasoning models. Example: '{"enable_thinking": false}'
    to disable thinking mode by default for Qwen3/DeepSeek models."""
    response_role: str = "assistant"
    """The role name to return if `request.add_generation_prompt=true`."""
    ssl_keyfile: str | None = None
    """The file path to the SSL key file."""
    ssl_certfile: str | None = None
    """The file path to the SSL cert file."""
    ssl_ca_certs: str | None = None
    """The CA certificates file."""
    enable_ssl_refresh: bool = False
    """Refresh SSL Context when SSL certificate files change"""
    ssl_cert_reqs: int = int(ssl.CERT_NONE)
    """Whether client certificate is required (see stdlib ssl module's)."""
    ssl_ciphers: str | None = None
    """SSL cipher suites for HTTPS (TLS 1.2 and below only).
    Example: 'ECDHE-RSA-AES256-GCM-SHA384:ECDHE-RSA-CHACHA20-POLY1305'"""
    root_path: str | None = None
    """FastAPI root_path when app is behind a path based routing proxy."""
    middleware: list[str] = field(default_factory=lambda: [])
    """Additional ASGI middleware to apply to the app. We accept multiple
    --middleware arguments. The value should be an import path. If a function
    is provided, vLLM will add it to the server using
    `@app.middleware('http')`. If a class is provided, vLLM will
    add it to the server using `app.add_middleware()`."""
    return_tokens_as_token_ids: bool = False
    """When `--max-logprobs` is specified, represents single tokens as
    strings of the form 'token_id:{token_id}' so that tokens that are not
    JSON-encodable can be identified."""
    disable_frontend_multiprocessing: bool = False
    """If specified, will run the OpenAI frontend server in the same process as
    the model serving engine."""
    enable_request_id_headers: bool = False
    """If specified, API server will add X-Request-Id header to responses."""
    enable_auto_tool_choice: bool = False
    """Enable auto tool choice for supported models. Use `--tool-call-parser`
    to specify which parser to use."""
    exclude_tools_when_tool_choice_none: bool = False
    """If specified, exclude tool definitions in prompts when
    tool_choice='none'."""
    tool_call_parser: str | None = None
    """Select the tool call parser depending on the model that you're using.
    This is used to parse the model-generated tool call into OpenAI API format.
    Required for `--enable-auto-tool-choice`. You can choose any option from
    the built-in parsers or register a plugin via `--tool-parser-plugin`."""
    tool_parser_plugin: str = ""
    """Special the tool parser plugin write to parse the model-generated tool
    into OpenAI API format, the name register in this plugin can be used in
    `--tool-call-parser`."""
    tool_server: str | None = None
    """Comma-separated list of host:port pairs (IPv4, IPv6, or hostname).
    Examples: 127.0.0.1:8000, [::1]:8000, localhost:1234. Or `demo` for demo
    purpose."""
    log_config_file: str | None = envs.VLLM_LOGGING_CONFIG_PATH
    """Path to logging config JSON file for both vllm and uvicorn"""
    max_log_len: int | None = None
    """Max number of prompt characters or prompt ID numbers being printed in
    log. The default of None means unlimited."""
    disable_fastapi_docs: bool = False
    """Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint."""
    enable_prompt_tokens_details: bool = False
    """If set to True, enable prompt_tokens_details in usage."""
    enable_server_load_tracking: bool = False
    """If set to True, enable tracking server_load_metrics in the app state."""
    enable_force_include_usage: bool = False
    """If set to True, including usage on every request."""
    enable_tokenizer_info_endpoint: bool = False
    """Enable the `/tokenizer_info` endpoint. May expose chat
    templates and other tokenizer configuration."""
    enable_log_outputs: bool = False
    """If set to True, log model outputs (generations).
    Requires --enable-log-requests."""
    enable_log_deltas: bool = True
    """If set to False, output deltas will not be logged. Relevant only if 
    --enable-log-outputs is set.
    """
    h11_max_incomplete_event_size: int = H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT
    """Maximum size (bytes) of an incomplete HTTP event (header or body) for
    h11 parser. Helps mitigate header abuse. Default: 4194304 (4 MB)."""
    h11_max_header_count: int = H11_MAX_HEADER_COUNT_DEFAULT
    """Maximum number of HTTP headers allowed in a request for h11 parser.
    Helps mitigate header abuse. Default: 256."""
    log_error_stack: bool = envs.VLLM_SERVER_DEV_MODE
    """If set to True, log the stack trace of error responses"""
    tokens_only: bool = False
    """
    If set to True, only enable the Tokens In<>Out endpoint. 
    This is intended for use in a Disaggregated Everything setup.
    """
    enable_offline_docs: bool = False
    """
    Enable offline FastAPI documentation for air-gapped environments.
    Uses vendored static assets bundled with vLLM.
    """

    @staticmethod
    def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
        """Add one `--flag` per `FrontendArgs` field to `parser`.

        The per-field `add_argument` keyword arguments come from
        `get_kwargs(FrontendArgs)`; a few fields are then adjusted by hand
        before everything is registered in a "Frontend" argument group.

        Args:
            parser: The parser to extend.

        Returns:
            The same `parser` object, after the group has been added.
        """
        # Imported here rather than at module level, as in the original code.
        from vllm.engine.arg_utils import get_kwargs

        frontend_kwargs = get_kwargs(FrontendArgs)

        # Special case: allowed_origins, allowed_methods, allowed_headers all
        # need json.loads type. Should also remove nargs; pop() is used so a
        # missing "nargs" entry is tolerated, matching the middleware handling.
        for cors_field in ("allowed_origins", "allowed_methods", "allowed_headers"):
            frontend_kwargs[cors_field]["type"] = json.loads
            frontend_kwargs[cors_field].pop("nargs", None)

        # Special case: default_chat_template_kwargs needs json.loads type
        frontend_kwargs["default_chat_template_kwargs"]["type"] = json.loads

        # Special case: LoRA modules need custom parser action and
        # optional_type(str)
        frontend_kwargs["lora_modules"]["type"] = optional_type(str)
        frontend_kwargs["lora_modules"]["action"] = LoRAParserAction

        # Special case: Middleware needs to append action
        frontend_kwargs["middleware"]["action"] = "append"
        frontend_kwargs["middleware"]["type"] = str
        frontend_kwargs["middleware"].pop("nargs", None)
        frontend_kwargs["middleware"]["default"] = []

        # Special case: Tool call parser shows built-in options.
        valid_tool_parsers = list(ToolParserManager.list_registered())
        parsers_str = ",".join(valid_tool_parsers)
        frontend_kwargs["tool_call_parser"]["metavar"] = (
            f"{{{parsers_str}}} or name registered in --tool-parser-plugin"
        )

        frontend_group = parser.add_argument_group(
            title="Frontend",
            description=FrontendArgs.__doc__,
        )

        # Field names use underscores; the CLI flags use dashes.
        for key, value in frontend_kwargs.items():
            frontend_group.add_argument(f"--{key.replace('_', '-')}", **value)

        return parser


Ethan Xu's avatar
Ethan Xu committed
265
def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
    """Populate `parser` with the options accepted by the OpenAI API server.

    A few server-level options are declared here directly. Everything else is
    registered by `FrontendArgs.add_cli_args` and
    `AsyncEngineArgs.add_cli_args`, so the argument definitions are not
    duplicated in this function.
    """
    headless_help = (
        "Run in headless mode. See multi-node data parallel "
        "documentation for more details."
    )
    api_server_count_help = (
        "How many API server processes to run. "
        "Defaults to data_parallel_size if not specified."
    )
    config_help = (
        "Read CLI options from a config file. "
        "Must be a YAML with the following options: "
        "https://docs.vllm.ai/en/latest/configuration/serve_args.html"
    )

    # Optional positional argument (nargs="?").
    parser.add_argument(
        "model_tag",
        type=str,
        nargs="?",
        help="The model tag to serve (optional if specified in config)",
    )
    parser.add_argument(
        "--headless",
        action="store_true",
        default=False,
        help=headless_help,
    )
    parser.add_argument(
        "--api-server-count",
        "-asc",
        type=int,
        default=None,
        help=api_server_count_help,
    )
    parser.add_argument("--config", help=config_help)

    # Frontend options are registered before the engine options.
    with_frontend = FrontendArgs.add_cli_args(parser)
    return AsyncEngineArgs.add_cli_args(with_frontend)
Ethan Xu's avatar
Ethan Xu committed
303
304


305
306
307
308
309
310
311
312
313
314
def validate_parsed_serve_args(args: argparse.Namespace):
    """Run cheap consistency checks on serve arguments before loading a model.

    Args:
        args: The parsed command line namespace.

    Raises:
        TypeError: If `--enable-auto-tool-choice` is set without
            `--tool-call-parser`, or `--enable-log-outputs` is set without
            `--enable-log-requests`.
    """
    # A namespace carrying a `subparser` other than "serve" is not checked.
    if getattr(args, "subparser", "serve") != "serve":
        return

    # Raises if the chat template looks invalid.
    validate_chat_template(args.chat_template)

    # Auto tool choice is only usable together with a tool call parser.
    missing_tool_parser = args.enable_auto_tool_choice and not args.tool_call_parser
    if missing_tool_parser:
        raise TypeError("Error: --enable-auto-tool-choice requires --tool-call-parser")

    # Output logging depends on request logging being enabled.
    outputs_without_requests = (
        args.enable_log_outputs and not args.enable_log_requests
    )
    if outputs_without_requests:
        raise TypeError("Error: --enable-log-outputs requires --enable-log-requests")
318
319


Ethan Xu's avatar
Ethan Xu committed
320
321
def create_parser_for_docs() -> FlexibleArgumentParser:
    """Build a parser with the api_server module as `prog` and populate it.

    Returns:
        The result of `make_arg_parser` applied to a new
        `FlexibleArgumentParser`.
    """
    docs_prog = "-m vllm.entrypoints.openai.api_server"
    return make_arg_parser(FlexibleArgumentParser(prog=docs_prog))