cli_args.py 15.9 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
5
6
7
8
9
10
11
"""
This file contains the command line arguments for vLLM's
OpenAI-compatible server. It is kept in a separate file for documentation
purposes.
"""

import argparse
import json
import ssl
12
from collections.abc import Sequence
13
from dataclasses import field
14
from typing import Any, Literal
15

16
import vllm.envs as envs
17
from vllm.config import config
18
from vllm.engine.arg_utils import AsyncEngineArgs, optional_type
19
20
21
22
23
24
25
26
from vllm.entrypoints.chat_utils import (
    ChatTemplateContentFormatOption,
    validate_chat_template,
)
from vllm.entrypoints.constants import (
    H11_MAX_HEADER_COUNT_DEFAULT,
    H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT,
)
27
from vllm.entrypoints.openai.models.protocol import LoRAModulePath
28
from vllm.logger import init_logger
29
from vllm.tool_parsers import ToolParserManager
30
from vllm.utils.argparse_utils import FlexibleArgumentParser
31

32
33
# Module-level logger, named after this module.
logger = init_logger(__name__)

34
35

class LoRAParserAction(argparse.Action):
36
37
38
39
    def __call__(
        self,
        parser: argparse.ArgumentParser,
        namespace: argparse.Namespace,
40
41
        values: str | Sequence[str] | None,
        option_string: str | None = None,
42
43
44
45
46
47
    ):
        if values is None:
            values = []
        if isinstance(values, str):
            raise TypeError("Expected values to be a list")

48
        lora_list: list[LoRAModulePath] = []
49
        for item in values:
50
            if item in [None, ""]:  # Skip if item is None or empty string
51
                continue
52
53
            if "=" in item and "," not in item:  # Old format: name=path
                name, path = item.split("=")
54
55
56
57
58
59
60
                lora_list.append(LoRAModulePath(name, path))
            else:  # Assume JSON format
                try:
                    lora_dict = json.loads(item)
                    lora = LoRAModulePath(**lora_dict)
                    lora_list.append(lora)
                except json.JSONDecodeError:
61
                    parser.error(f"Invalid JSON format for --lora-modules: {item}")
62
63
64
65
                except TypeError as e:
                    parser.error(
                        f"Invalid fields for --lora-modules: {item} - {str(e)}"
                    )
66
67
68
        setattr(namespace, self.dest, lora_list)


69
@config
class BaseFrontendArgs:
    """Base arguments for the OpenAI-compatible frontend server.

    This base class does not include host, port, and server-specific arguments
    like SSL, CORS, and HTTP server settings. Those arguments are added by
    the subclasses.
    """

    lora_modules: list[LoRAModulePath] | None = None
    """LoRA modules configurations in either 'name=path' format or JSON format
    or JSON list format. Example (old format): `'name=path'` Example (new
    format): `{\"name\": \"name\", \"path\": \"lora_path\",
    \"base_model_name\": \"id\"}`"""
    chat_template: str | None = None
    """The file path to the chat template, or the template in single-line form
    for the specified model."""
    chat_template_content_format: ChatTemplateContentFormatOption = "auto"
    """The format to render message content within a chat template.

    * "string" will render the content as a string. Example: `"Hello World"`
    * "openai" will render the content as a list of dictionaries, similar to
      OpenAI schema. Example: `[{"type": "text", "text": "Hello world!"}]`"""
    trust_request_chat_template: bool = False
    """Whether to trust the chat template provided in the request. If False,
    the server will always use the chat template specified by `--chat-template`
    or the ones from tokenizer."""
    default_chat_template_kwargs: dict[str, Any] | None = None
    """Default keyword arguments to pass to the chat template renderer.
    These will be merged with request-level chat_template_kwargs,
    with request values taking precedence. Useful for setting default
    behavior for reasoning models. Example: '{"enable_thinking": false}'
    to disable thinking mode by default for Qwen3/DeepSeek models."""
    response_role: str = "assistant"
    """The role name to return if `request.add_generation_prompt=true`."""
    return_tokens_as_token_ids: bool = False
    """When `--max-logprobs` is specified, represents single tokens as
    strings of the form 'token_id:{token_id}' so that tokens that are not
    JSON-encodable can be identified."""
    disable_frontend_multiprocessing: bool = False
    """If specified, will run the OpenAI frontend server in the same process as
    the model serving engine."""
    enable_auto_tool_choice: bool = False
    """Enable auto tool choice for supported models. Use `--tool-call-parser`
    to specify which parser to use."""
    exclude_tools_when_tool_choice_none: bool = False
    """If specified, exclude tool definitions in prompts when
    tool_choice='none'."""
    tool_call_parser: str | None = None
    """Select the tool call parser depending on the model that you're using.
    This is used to parse the model-generated tool call into OpenAI API format.
    Required for `--enable-auto-tool-choice`. You can choose any option from
    the built-in parsers or register a plugin via `--tool-parser-plugin`."""
    tool_parser_plugin: str = ""
    """Specify the tool parser plugin used to parse the model-generated tool
    calls into OpenAI API format. The names registered in this plugin can be
    used in `--tool-call-parser`."""
    tool_server: str | None = None
    """Comma-separated list of host:port pairs (IPv4, IPv6, or hostname).
    Examples: 127.0.0.1:8000, [::1]:8000, localhost:1234. Or `demo` for demo
    purpose."""
    log_config_file: str | None = envs.VLLM_LOGGING_CONFIG_PATH
    """Path to logging config JSON file for both vllm and uvicorn"""
    max_log_len: int | None = None
    """Max number of prompt characters or prompt ID numbers being printed in
    log. The default of None means unlimited."""
    enable_prompt_tokens_details: bool = False
    """If set to True, enable prompt_tokens_details in usage."""
    enable_server_load_tracking: bool = False
    """If set to True, enable tracking server_load_metrics in the app state."""
    enable_force_include_usage: bool = False
    """If set to True, include usage on every request."""
    enable_tokenizer_info_endpoint: bool = False
    """Enable the `/tokenizer_info` endpoint. May expose chat
    templates and other tokenizer configuration."""
    enable_log_outputs: bool = False
    """If set to True, log model outputs (generations).
    Requires `--enable-log-requests`. As with `--enable-log-requests`,
    information is only logged at INFO level at maximum."""
    enable_log_deltas: bool = True
    """If set to False, output deltas will not be logged. Relevant only if
    --enable-log-outputs is set.
    """
    log_error_stack: bool = envs.VLLM_SERVER_DEV_MODE
    """If set to True, log the stack trace of error responses"""
    tokens_only: bool = False
    """
    If set to True, only enable the Tokens In<>Out endpoint.
    This is intended for use in a Disaggregated Everything setup.
    """

    @classmethod
    def _customize_cli_kwargs(
        cls,
        frontend_kwargs: dict[str, Any],
    ) -> dict[str, Any]:
        """Customize argparse kwargs before arguments are registered.

        Subclasses should override this and call
        ``super()._customize_cli_kwargs(frontend_kwargs)`` first.

        Args:
            frontend_kwargs: Mapping of field name to the keyword arguments
                that will be passed to ``add_argument`` for that field. It is
                modified in place.

        Returns:
            The same ``frontend_kwargs`` mapping, after customization.
        """
        # Special case: default_chat_template_kwargs needs json.loads type
        frontend_kwargs["default_chat_template_kwargs"]["type"] = json.loads

        # Special case: LoRA modules need custom parser action and
        # optional_type(str)
        frontend_kwargs["lora_modules"]["type"] = optional_type(str)
        frontend_kwargs["lora_modules"]["action"] = LoRAParserAction

        # Special case: Tool call parser shows built-in options.
        valid_tool_parsers = list(ToolParserManager.list_registered())
        parsers_str = ",".join(valid_tool_parsers)
        frontend_kwargs["tool_call_parser"]["metavar"] = (
            f"{{{parsers_str}}} or name registered in --tool-parser-plugin"
        )
        return frontend_kwargs

    @classmethod
    def add_cli_args(cls, parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
        """Register CLI arguments for this frontend class.

        Subclasses should override ``_customize_cli_kwargs`` instead of
        this method so that base-class postprocessing is always applied.

        Args:
            parser: Parser that receives a new argument group named after
                this class (with the ``Args`` suffix removed).

        Returns:
            The same ``parser``, with one ``--option`` registered per field.
        """
        from vllm.engine.arg_utils import get_kwargs

        frontend_kwargs = get_kwargs(cls)
        frontend_kwargs = cls._customize_cli_kwargs(frontend_kwargs)

        group_name = cls.__name__.replace("Args", "")
        frontend_group = parser.add_argument_group(
            title=group_name,
            description=cls.__doc__,
        )
        for key, value in frontend_kwargs.items():
            # "flags" holds extra option strings and is not an argparse
            # keyword, so it is removed before the kwargs are forwarded.
            extra_flags = value.pop("flags", [])
            frontend_group.add_argument(
                *extra_flags, f"--{key.replace('_', '-')}", **value
            )

        return parser


@config
class FrontendArgs(BaseFrontendArgs):
    """Arguments for the OpenAI-compatible frontend server."""

    host: str | None = None
    """Host name."""
    port: int = 8000
    """Port number."""
    uds: str | None = None
    """Unix domain socket path. If set, host and port arguments are ignored."""
    uvicorn_log_level: Literal[
        "critical", "error", "warning", "info", "debug", "trace"
    ] = "info"
    """Log level for uvicorn."""
    disable_uvicorn_access_log: bool = False
    """Disable uvicorn access log."""
    disable_access_log_for_endpoints: str | None = None
    """Comma-separated list of endpoint paths to exclude from uvicorn access
    logs. This is useful to reduce log noise from high-frequency endpoints
    like health checks. Example: "/health,/metrics,/ping".
    When set, access logs for requests to these paths will be suppressed
    while keeping logs for other endpoints."""
    allow_credentials: bool = False
    """Allow credentials."""
    allowed_origins: list[str] = field(default_factory=lambda: ["*"])
    """Allowed origins."""
    allowed_methods: list[str] = field(default_factory=lambda: ["*"])
    """Allowed methods."""
    allowed_headers: list[str] = field(default_factory=lambda: ["*"])
    """Allowed headers."""
    api_key: list[str] | None = None
    """If provided, the server will require one of these keys to be presented in
    the header."""
    ssl_keyfile: str | None = None
    """The file path to the SSL key file."""
    ssl_certfile: str | None = None
    """The file path to the SSL cert file."""
    ssl_ca_certs: str | None = None
    """The CA certificates file."""
    enable_ssl_refresh: bool = False
    """Refresh SSL Context when SSL certificate files change"""
    ssl_cert_reqs: int = int(ssl.CERT_NONE)
    """Whether client certificate is required (see stdlib ssl module's
    `CERT_*` constants)."""
    ssl_ciphers: str | None = None
    """SSL cipher suites for HTTPS (TLS 1.2 and below only).
    Example: 'ECDHE-RSA-AES256-GCM-SHA384:ECDHE-RSA-CHACHA20-POLY1305'"""
    root_path: str | None = None
    """FastAPI root_path when app is behind a path based routing proxy."""
    middleware: list[str] = field(default_factory=lambda: [])
    """Additional ASGI middleware to apply to the app. We accept multiple
    --middleware arguments. The value should be an import path. If a function
    is provided, vLLM will add it to the server using
    `@app.middleware('http')`. If a class is provided, vLLM will
    add it to the server using `app.add_middleware()`."""
    enable_request_id_headers: bool = False
    """If specified, API server will add X-Request-Id header to responses."""
    disable_fastapi_docs: bool = False
    """Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint."""
    h11_max_incomplete_event_size: int = H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT
    """Maximum size (bytes) of an incomplete HTTP event (header or body) for
    h11 parser. Helps mitigate header abuse. Default: 4194304 (4 MB)."""
    h11_max_header_count: int = H11_MAX_HEADER_COUNT_DEFAULT
    """Maximum number of HTTP headers allowed in a request for h11 parser.
    Helps mitigate header abuse. Default: 256."""
    enable_offline_docs: bool = False
    """
    Enable offline FastAPI documentation for air-gapped environments.
    Uses vendored static assets bundled with vLLM.
    """
    use_gpu_for_pooling_score: bool = False
    """If set, run pooling score MaxSim on GPU in the API server process.
    Can significantly improve late-interaction scoring performance.
    https://github.com/vllm-project/vllm/pull/35330"""

    @classmethod
    def _customize_cli_kwargs(
        cls,
        frontend_kwargs: dict[str, Any],
    ) -> dict[str, Any]:
        """Apply server-specific argparse tweaks on top of the base ones.

        Args:
            frontend_kwargs: Mapping of field name to the keyword arguments
                that will be passed to ``add_argument`` for that field. It is
                modified in place.

        Returns:
            The same ``frontend_kwargs`` mapping, after customization.
        """
        frontend_kwargs = super()._customize_cli_kwargs(frontend_kwargs)

        # Special case: allowed_origins, allowed_methods, allowed_headers all
        # need json.loads type and must not carry nargs. ``pop`` with a
        # default is used so that a missing "nargs" entry is not an error,
        # matching how the other options below are handled.
        for cors_key in ("allowed_origins", "allowed_methods", "allowed_headers"):
            frontend_kwargs[cors_key]["type"] = json.loads
            frontend_kwargs[cors_key].pop("nargs", None)

        # Special case: Middleware needs to append action
        middleware_kwargs = frontend_kwargs["middleware"]
        middleware_kwargs["action"] = "append"
        middleware_kwargs["type"] = str
        middleware_kwargs.pop("nargs", None)
        middleware_kwargs["default"] = []

        # Special case: disable_access_log_for_endpoints is a single
        # comma-separated string, not a list
        frontend_kwargs["disable_access_log_for_endpoints"].pop("nargs", None)

        return frontend_kwargs
316
317


Ethan Xu's avatar
Ethan Xu committed
318
def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
    """Populate ``parser`` with every option of the OpenAI API server.

    Only a handful of top-level options are declared here. The rest are
    registered by `FrontendArgs.add_cli_args` and then
    `AsyncEngineArgs.add_cli_args`, so the definitions stay in one place.
    """
    headless_help = (
        "Run in headless mode. See multi-node data parallel documentation "
        "for more details."
    )
    server_count_help = (
        "How many API server processes to run. Defaults to "
        "data_parallel_size if not specified."
    )
    config_help = (
        "Read CLI options from a config file. Must be a YAML with the "
        "following options: "
        "https://docs.vllm.ai/en/latest/configuration/serve_args.html"
    )

    # Optional positional argument naming the model.
    parser.add_argument(
        "model_tag",
        nargs="?",
        type=str,
        help="The model tag to serve (optional if specified in config)",
    )
    parser.add_argument(
        "--headless", default=False, action="store_true", help=headless_help
    )
    parser.add_argument(
        "--api-server-count", "-asc", default=None, type=int, help=server_count_help
    )
    parser.add_argument("--config", help=config_help)

    # Frontend options are registered before engine options.
    with_frontend = FrontendArgs.add_cli_args(parser)
    return AsyncEngineArgs.add_cli_args(with_frontend)
Ethan Xu's avatar
Ethan Xu committed
356
357


358
359
360
361
362
363
364
365
366
367
def validate_parsed_serve_args(args: argparse.Namespace):
    """Run cheap sanity checks on serve arguments before any model loading.

    Nothing is checked when ``args`` carries a ``subparser`` attribute whose
    value is not ``"serve"``.

    Raises:
        TypeError: If ``--enable-auto-tool-choice`` is set without
            ``--tool-call-parser``, or ``--enable-log-outputs`` is set
            without ``--enable-log-requests``.
    """
    # A namespace without "subparser" is treated the same as "serve".
    if getattr(args, "subparser", "serve") != "serve":
        return

    # Raises if the chat template is likely invalid.
    validate_chat_template(args.chat_template)

    # Auto tool choice is only usable together with a tool call parser.
    if args.enable_auto_tool_choice:
        if not args.tool_call_parser:
            raise TypeError(
                "Error: --enable-auto-tool-choice requires --tool-call-parser"
            )

    # Output logging is only usable together with request logging.
    if args.enable_log_outputs:
        if not args.enable_log_requests:
            raise TypeError(
                "Error: --enable-log-outputs requires --enable-log-requests"
            )
371
372


Ethan Xu's avatar
Ethan Xu committed
373
374
def create_parser_for_docs() -> FlexibleArgumentParser:
    """Build a fully populated parser whose program name is the module path."""
    return make_arg_parser(
        FlexibleArgumentParser(prog="-m vllm.entrypoints.openai.api_server")
    )