[CLI] add --max-tokens to `vllm complete` (#28109)

Signed-off-by: Iceber Gu <caiwei95@hotmail.com>

[CLI] add --max-tokens to `vllm complete` (#28109)
Signed-off-by: Iceber Gu <caiwei95@hotmail.com>
e0d6b4a8 · Iceber Gu · GitHub · 72b1c2ae · e0d6b4a8
Unverified Commit e0d6b4a8 authored Nov 07, 2025 by Iceber Gu Committed by GitHub Nov 07, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 14 additions and 6 deletions

vllm/entrypoints/cli/openai.py vllm/entrypoints/cli/openai.py +14 -6

No files found.
--- a/vllm/entrypoints/cli/openai.py
+++ b/vllm/entrypoints/cli/openai.py
@@ -195,10 +195,15 @@ class CompleteCommand(CLISubcommand):
    def cmd(args: argparse.Namespace) -> None:
        model_name, client = _interactive_cli(args)

+        kwargs = {
+            "model": model_name,
+            "stream": True,
+        }
+        if args.max_tokens:
+            kwargs["max_tokens"] = args.max_tokens
+
        if args.quick:
-            stream = client.completions.create(
-                model=model_name, prompt=args.quick, stream=True
-            )
+            stream = client.completions.create(prompt=args.quick, **kwargs)
            _print_completion_stream(stream)
            return

@@ -208,15 +213,18 @@ class CompleteCommand(CLISubcommand):
                input_prompt = input("> ")
            except EOFError:
                break
-            stream = client.completions.create(
-                model=model_name, prompt=input_prompt, stream=True
-            )
+            stream = client.completions.create(prompt=input_prompt, **kwargs)
            _print_completion_stream(stream)

    @staticmethod
    def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
        """Add CLI arguments for the complete command."""
        _add_query_options(parser)
+        parser.add_argument(
+            "--max-tokens",
+            type=int,
+            help="Maximum number of tokens to generate per output sequence.",
+        )
        parser.add_argument(
            "-q",
            "--quick",