[Doc]: Update `OpenAI-Compatible Server` documents (#12082)

57e729e8 · maang-h · GitHub · de0526f6 · 57e729e8 · 57e729e8
Unverified Commit 57e729e8 authored Jan 16, 2025 by maang-h Committed by GitHub Jan 15, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 38 additions and 38 deletions

vllm/engine/arg_utils.py +8 -8

vllm/entrypoints/openai/cli_args.py +30 -30

No files found.
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -238,7 +238,7 @@ class EngineArgs:
            choices=get_args(TaskOption),
            help='The task to use the model for. Each vLLM instance only '
            'supports one task, even if the same model can be used for '
-            'multiple tasks. When the model only supports one task, "auto" '
+            'multiple tasks. When the model only supports one task, ``"auto"`` '
            'can be used to select it; otherwise, you must specify explicitly '
            'which task to use.')
        parser.add_argument(
@@ -250,7 +250,7 @@ class EngineArgs:
        parser.add_argument(
            '--skip-tokenizer-init',
            action='store_true',
-            help='Skip initialization of tokenizer and detokenizer')
+            help='Skip initialization of tokenizer and detokenizer.')
        parser.add_argument(
            '--revision',
            type=nullable_str,
@@ -401,7 +401,7 @@ class EngineArgs:
        parser.add_argument(
            '--worker-use-ray',
            action='store_true',
-            help='Deprecated, use --distributed-executor-backend=ray.')
+            help='Deprecated, use ``--distributed-executor-backend=ray``.')
        parser.add_argument('--pipeline-parallel-size',
                            '-pp',
                            type=int,
@@ -430,7 +430,7 @@ class EngineArgs:
                            choices=[8, 16, 32, 64, 128],
                            help='Token block size for contiguous chunks of '
                            'tokens. This is ignored on neuron devices and '
-                            'set to max-model-len. On CUDA devices, '
+                            'set to ``--max-model-len``. On CUDA devices, '
                            'only block sizes up to 32 are supported. '
                            'On HPU devices, block size defaults to 128.')

@@ -439,12 +439,12 @@ class EngineArgs:
            action=argparse.BooleanOptionalAction,
            default=EngineArgs.enable_prefix_caching,
            help="Enables automatic prefix caching. "
-            "Use --no-enable-prefix-caching to disable explicitly.",
+            "Use ``--no-enable-prefix-caching`` to disable explicitly.",
        )
        parser.add_argument('--disable-sliding-window',
                            action='store_true',
                            help='Disables sliding window, '
-                            'capping to sliding window size')
+                            'capping to sliding window size.')
        parser.add_argument('--use-v2-block-manager',
                            action='store_true',
                            default=True,
@@ -861,7 +861,7 @@ class EngineArgs:
            "of the provided names. The model name in the model "
            "field of a response will be the first name in this "
            "list. If not specified, the model name will be the "
-            "same as the `--model` argument. Noted that this name(s) "
+            "same as the ``--model`` argument. Noted that this name(s) "
            "will also be used in `model_name` tag content of "
            "prometheus metrics, if multiple names provided, metrics "
            "tag will take the first one.")
@@ -881,7 +881,7 @@ class EngineArgs:
            default=None,
            help="Valid choices are " +
            ",".join(ALLOWED_DETAILED_TRACE_MODULES) +
-            ". It makes sense to set this only if --otlp-traces-endpoint is"
+            ". It makes sense to set this only if ``--otlp-traces-endpoint`` is"
            " set. If set, it will collect detailed traces for the specified "
            "modules. This involves use of possibly costly and or blocking "
            "operations and hence might have a performance impact.")

--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -79,29 +79,29 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
    parser.add_argument("--host",
                        type=nullable_str,
                        default=None,
-                        help="host name")
-    parser.add_argument("--port", type=int, default=8000, help="port number")
+                        help="Host name.")
+    parser.add_argument("--port", type=int, default=8000, help="Port number.")
    parser.add_argument(
        "--uvicorn-log-level",
        type=str,
        default="info",
        choices=['debug', 'info', 'warning', 'error', 'critical', 'trace'],
-        help="log level for uvicorn")
+        help="Log level for uvicorn.")
    parser.add_argument("--allow-credentials",
                        action="store_true",
-                        help="allow credentials")
+                        help="Allow credentials.")
    parser.add_argument("--allowed-origins",
                        type=json.loads,
                        default=["*"],
-                        help="allowed origins")
+                        help="Allowed origins.")
    parser.add_argument("--allowed-methods",
                        type=json.loads,
                        default=["*"],
-                        help="allowed methods")
+                        help="Allowed methods.")
    parser.add_argument("--allowed-headers",
                        type=json.loads,
                        default=["*"],
-                        help="allowed headers")
+                        help="Allowed headers.")
    parser.add_argument("--api-key",
                        type=nullable_str,
                        default=None,
@@ -115,10 +115,10 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
        action=LoRAParserAction,
        help="LoRA module configurations in either 'name=path' format"
        "or JSON format. "
-        "Example (old format): 'name=path' "
+        "Example (old format): ``'name=path'`` "
        "Example (new format): "
-        "'{\"name\": \"name\", \"local_path\": \"path\", "
-        "\"base_model_name\": \"id\"}'")
+        "``{\"name\": \"name\", \"local_path\": \"path\", "
+        "\"base_model_name\": \"id\"}``")
    parser.add_argument(
        "--prompt-adapters",
        type=nullable_str,
@@ -132,7 +132,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
                        default=None,
                        help="The file path to the chat template, "
                        "or the template in single-line form "
-                        "for the specified model")
+                        "for the specified model.")
    parser.add_argument(
        '--chat-template-content-format',
        type=str,
@@ -141,38 +141,39 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
        help='The format to render message content within a chat template.'
        '\n\n'
        '* "string" will render the content as a string. '
-        'Example: "Hello World"\n'
+        'Example: ``"Hello World"``\n'
        '* "openai" will render the content as a list of dictionaries, '
        'similar to OpenAI schema. '
-        'Example: [{"type": "text", "text": "Hello world!"}]')
+        'Example: ``[{"type": "text", "text": "Hello world!"}]``')
    parser.add_argument("--response-role",
                        type=nullable_str,
                        default="assistant",
                        help="The role name to return if "
-                        "`request.add_generation_prompt=true`.")
+                        "``request.add_generation_prompt=true``.")
    parser.add_argument("--ssl-keyfile",
                        type=nullable_str,
                        default=None,
-                        help="The file path to the SSL key file")
+                        help="The file path to the SSL key file.")
    parser.add_argument("--ssl-certfile",
                        type=nullable_str,
                        default=None,
-                        help="The file path to the SSL cert file")
+                        help="The file path to the SSL cert file.")
    parser.add_argument("--ssl-ca-certs",
                        type=nullable_str,
                        default=None,
-                        help="The CA certificates file")
+                        help="The CA certificates file.")
    parser.add_argument(
        "--ssl-cert-reqs",
        type=int,
        default=int(ssl.CERT_NONE),
-        help="Whether client certificate is required (see stdlib ssl module's)"
+        help="Whether client certificate is required (see stdlib ssl module's)."
    )
    parser.add_argument(
        "--root-path",
        type=nullable_str,
        default=None,
-        help="FastAPI root_path when app is behind a path based routing proxy")
+        help="FastAPI root_path when app is behind a path based routing proxy."
+    )
    parser.add_argument(
        "--middleware",
        type=nullable_str,
@@ -182,15 +183,15 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
        "We accept multiple --middleware arguments. "
        "The value should be an import path. "
        "If a function is provided, vLLM will add it to the server "
-        "using @app.middleware('http'). "
+        "using ``@app.middleware('http')``. "
        "If a class is provided, vLLM will add it to the server "
-        "using app.add_middleware(). ")
+        "using ``app.add_middleware()``. ")
    parser.add_argument(
        "--return-tokens-as-token-ids",
        action="store_true",
-        help="When --max-logprobs is specified, represents single tokens as "
-        "strings of the form 'token_id:{token_id}' so that tokens that "
-        "are not JSON-encodable can be identified.")
+        help="When ``--max-logprobs`` is specified, represents single tokens "
+        " as strings of the form 'token_id:{token_id}' so that tokens "
+        "that are not JSON-encodable can be identified.")
    parser.add_argument(
        "--disable-frontend-multiprocessing",
        action="store_true",
@@ -205,9 +206,8 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
        "--enable-auto-tool-choice",
        action="store_true",
        default=False,
-        help=
-        "Enable auto tool choice for supported models. Use --tool-call-parser"
-        " to specify which parser to use")
+        help="Enable auto tool choice for supported models. Use "
+        "``--tool-call-parser`` to specify which parser to use.")

    valid_tool_parsers = ToolParserManager.tool_parsers.keys()
    parser.add_argument(
@@ -219,7 +219,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
        help=
        "Select the tool call parser depending on the model that you're using."
        " This is used to parse the model-generated tool call into OpenAI API "
-        "format. Required for --enable-auto-tool-choice.")
+        "format. Required for ``--enable-auto-tool-choice``.")

    parser.add_argument(
        "--tool-parser-plugin",
@@ -228,7 +228,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
        help=
        "Special the tool parser plugin write to parse the model-generated tool"
        " into OpenAI API format, the name register in this plugin can be used "
-        "in --tool-call-parser.")
+        "in ``--tool-call-parser``.")

    parser = AsyncEngineArgs.add_cli_args(parser)

@@ -243,7 +243,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
        "--disable-fastapi-docs",
        action='store_true',
        default=False,
-        help="Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint"
+        help="Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint."
    )
    parser.add_argument(
        "--enable-prompt-tokens-details",