[Feature] Add `vllm bench` CLI (#13993)

Signed-off-by: Randy Chen <acad.randyjhc@gmail.com> Signed-off-by: Cody Yu <hao.yu.cody@gmail.com> Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>

[Feature] Add `vllm bench` CLI (#13993)
Signed-off-by: Randy Chen <acad.randyjhc@gmail.com> Signed-off-by: Cody Yu <hao.yu.cody@gmail.com> Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
36e0c8f7 · Randy Chen · GitHub · 9f583e36 · 36e0c8f7 · 36e0c8f7
Unverified Commit 36e0c8f7 authored Mar 11, 2025 by Randy Chen Committed by GitHub Mar 12, 2025
8 changed files
--- a/vllm/benchmarks/endpoint_request_func.py
+++ b/vllm/benchmarks/endpoint_request_func.py
+# SPDX-License-Identifier: Apache-2.0
+"""The request function for API endpoints."""
+
+import json
+import os
+import sys
+import time
+import traceback
+from dataclasses import dataclass, field
+from typing import Optional
+
+import aiohttp
+from tqdm.asyncio import tqdm
+
+AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
+
+
+@dataclass
+class RequestFuncInput:
+    """The input for the request function."""
+    prompt: str
+    api_url: str
+    prompt_len: int
+    output_len: int
+    model: str
+    model_name: Optional[str] = None
+    best_of: int = 1
+    logprobs: Optional[int] = None
+    extra_body: Optional[dict] = None
+    multi_modal_content: Optional[dict] = None
+    ignore_eos: bool = False
+
+
+@dataclass
+class RequestFuncOutput:
+    """The output of the request function including metrics."""
+    generated_text: str = ""
+    success: bool = False
+    latency: float = 0.0
+    output_tokens: int = 0
+    ttft: float = 0.0  # Time to first token
+    itl: list[float] = field(
+        default_factory=list)  # list of inter-token latencies
+    tpot: float = 0.0  # avg next-token latencies
+    prompt_len: int = 0
+    error: str = ""
+
+
+async def async_request_openai_completions(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    """The async request function for the OpenAI Completions API.
+
+    Args:
+        request_func_input: The input for the request function.
+        pbar: The progress bar to display the progress.
+
+    Returns:
+        The output of the request function.
+    """
+    api_url = request_func_input.api_url
+    assert api_url.endswith(
+        ("completions", "profile")
+    ), "OpenAI Completions API URL must end with 'completions' or 'profile'."
+
+    async with aiohttp.ClientSession(trust_env=True,
+                                     timeout=AIOHTTP_TIMEOUT) as session:
+        payload = {
+            "model": request_func_input.model_name \
+                if request_func_input.model_name else request_func_input.model,
+            "prompt": request_func_input.prompt,
+            "temperature": 0.0,
+            "best_of": request_func_input.best_of,
+            "max_tokens": request_func_input.output_len,
+            "logprobs": request_func_input.logprobs,
+            "stream": True,
+            "stream_options": {
+                "include_usage": True,
+            },
+        }
+        if request_func_input.ignore_eos:
+            payload["ignore_eos"] = request_func_input.ignore_eos
+        if request_func_input.extra_body:
+            payload.update(request_func_input.extra_body)
+        headers = {
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
+        }
+
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        generated_text = ""
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(url=api_url, json=payload,
+                                    headers=headers) as response:
+                if response.status == 200:
+                    first_chunk_received = False
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = chunk_bytes.decode("utf-8").removeprefix(
+                            "data: ")
+                        if chunk != "[DONE]":
+                            data = json.loads(chunk)
+
+                            # NOTE: Some completion API might have a last
+                            # usage summary response without a token so we
+                            # want to check a token was generated
+                            if choices := data.get("choices"):
+                                # Note that text could be empty here
+                                # e.g. for special tokens
+                                text = choices[0].get("text")
+                                timestamp = time.perf_counter()
+                                # First token
+                                if not first_chunk_received:
+                                    first_chunk_received = True
+                                    ttft = time.perf_counter() - st
+                                    output.ttft = ttft
+
+                                # Decoding phase
+                                else:
+                                    output.itl.append(timestamp -
+                                                      most_recent_timestamp)
+
+                                most_recent_timestamp = timestamp
+                                generated_text += text or ""
+                            elif usage := data.get("usage"):
+                                output.output_tokens = usage.get(
+                                    "completion_tokens")
+                    if first_chunk_received:
+                        output.success = True
+                    else:
+                        output.success = False
+                        output.error = (
+                            "Never received a valid chunk to calculate TTFT."
+                            "This response will be marked as failed!")
+                    output.generated_text = generated_text
+                    output.latency = most_recent_timestamp - st
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
+# TODO: Add more request functions for different API protocols.
+ASYNC_REQUEST_FUNCS = {
+    "openai-comp": async_request_openai_completions,
+}
--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
--- a/vllm/benchmarks/utils.py
+++ b/vllm/benchmarks/utils.py
+# SPDX-License-Identifier: Apache-2.0
+
+import argparse
+import json
+import math
+import os
+from typing import Any
+
+
+def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
+                                        metrics: dict[str, list],
+                                        extra_info: dict[str, Any]) -> list:
+    """
+    Save the benchmark results in the format used by PyTorch OSS benchmark with
+    on metric per record
+    https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
+    """
+    records = []
+    if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
+        return records
+
+    for name, benchmark_values in metrics.items():
+        record = {
+            "benchmark": {
+                "name": "vLLM benchmark",
+                "extra_info": {
+                    "args": vars(args),
+                },
+            },
+            "model": {
+                "name": args.model,
+            },
+            "metric": {
+                "name": name,
+                "benchmark_values": benchmark_values,
+                "extra_info": extra_info,
+            },
+        }
+
+        tp = record["benchmark"]["extra_info"]["args"].get(
+            "tensor_parallel_size")
+        # Save tensor_parallel_size parameter if it's part of the metadata
+        if not tp and "tensor_parallel_size" in extra_info:
+            record["benchmark"]["extra_info"]["args"][
+                "tensor_parallel_size"] = extra_info["tensor_parallel_size"]
+
+        records.append(record)
+
+    return records
+
+
+class InfEncoder(json.JSONEncoder):
+
+    def clear_inf(self, o: Any):
+        if isinstance(o, dict):
+            return {k: self.clear_inf(v) for k, v in o.items()}
+        elif isinstance(o, list):
+            return [self.clear_inf(v) for v in o]
+        elif isinstance(o, float) and math.isinf(o):
+            return "inf"
+        return o
+
+    def iterencode(self, o: Any, *args, **kwargs) -> Any:
+        return super().iterencode(self.clear_inf(o), *args, **kwargs)
+
+
+def write_to_json(filename: str, records: list) -> None:
+    with open(filename, "w") as f:
+        json.dump(records, f, cls=InfEncoder)
--- a/vllm/entrypoints/cli/benchmark/__init__.py
+++ b/vllm/entrypoints/cli/benchmark/__init__.py
--- a/vllm/entrypoints/cli/benchmark/base.py
+++ b/vllm/entrypoints/cli/benchmark/base.py
+# SPDX-License-Identifier: Apache-2.0
+import argparse
+
+from vllm.entrypoints.cli.types import CLISubcommand
+from vllm.utils import FlexibleArgumentParser
+
+
+class BenchmarkSubcommandBase(CLISubcommand):
+    """ The base class of subcommands for vllm bench. """
+
+    @property
+    def help(self) -> str:
+        """The help message of the subcommand."""
+        raise NotImplementedError
+
+    def add_cli_args(self, parser: argparse.ArgumentParser) -> None:
+        """Add the CLI arguments to the parser."""
+        raise NotImplementedError
+
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        """Run the benchmark.
+
+        Args:
+            args: The arguments to the command.
+        """
+        raise NotImplementedError
+
+    def subparser_init(
+            self,
+            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
+        parser = subparsers.add_parser(
+            self.name,
+            help=self.help,
+            usage=f"vllm bench {self.name} [options]")
+        self.add_cli_args(parser)
+        return parser
--- a/vllm/entrypoints/cli/benchmark/main.py
+++ b/vllm/entrypoints/cli/benchmark/main.py
+# SPDX-License-Identifier: Apache-2.0
+import argparse
+
+import vllm.entrypoints.cli.benchmark.serve
+from vllm.entrypoints.cli.types import CLISubcommand
+from vllm.utils import FlexibleArgumentParser
+
+# TODO: Add the rest of the benchmark subcommands here,
+# e.g., throughput, latency, etc.
+BENCHMARK_CMD_MODULES = [
+    vllm.entrypoints.cli.benchmark.serve,
+]
+
+
+class BenchmarkSubcommand(CLISubcommand):
+    """ The `bench` subcommand for the vLLM CLI. """
+
+    def __init__(self):
+        self.name = "bench"
+        super().__init__()
+
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        args.dispatch_function(args)
+
+    def validate(self, args: argparse.Namespace) -> None:
+        if args.bench_type in self.cmds:
+            self.cmds[args.bench_type].validate(args)
+
+    def subparser_init(
+            self,
+            subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
+        bench_parser = subparsers.add_parser(
+            "bench",
+            help="vLLM bench subcommand.",
+            usage="vllm bench <bench_type> [options]")
+        bench_subparsers = bench_parser.add_subparsers(required=True,
+                                                       dest="bench_type")
+        self.cmds = {}
+        for cmd_module in BENCHMARK_CMD_MODULES:
+            new_cmds = cmd_module.cmd_init()
+            for cmd in new_cmds:
+                cmd.subparser_init(bench_subparsers).set_defaults(
+                    dispatch_function=cmd.cmd)
+                self.cmds[cmd.name] = cmd
+        return bench_parser
+
+
+def cmd_init() -> list[CLISubcommand]:
+    return [BenchmarkSubcommand()]
--- a/vllm/entrypoints/cli/benchmark/serve.py
+++ b/vllm/entrypoints/cli/benchmark/serve.py
+# SPDX-License-Identifier: Apache-2.0
+import argparse
+
+from vllm.benchmarks.serve import add_cli_args, main
+from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
+from vllm.entrypoints.cli.types import CLISubcommand
+
+
+class BenchmarkServingSubcommand(BenchmarkSubcommandBase):
+    """ The `serve` subcommand for vllm bench. """
+
+    def __init__(self):
+        self.name = "serve"
+        super().__init__()
+
+    @property
+    def help(self) -> str:
+        return "Benchmark the online serving throughput."
+
+    def add_cli_args(self, parser: argparse.ArgumentParser) -> None:
+        add_cli_args(parser)
+
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        main(args)
+
+
+def cmd_init() -> list[CLISubcommand]:
+    return [BenchmarkServingSubcommand()]
--- a/vllm/entrypoints/cli/main.py
+++ b/vllm/entrypoints/cli/main.py
@@ -5,6 +5,7 @@ import os
 import signal
 import sys

+import vllm.entrypoints.cli.benchmark.main
 import vllm.entrypoints.cli.openai
 import vllm.entrypoints.cli.serve
 import vllm.version
@@ -16,6 +17,7 @@ logger = init_logger(__name__)
 CMD_MODULES = [
    vllm.entrypoints.cli.openai,
    vllm.entrypoints.cli.serve,
+    vllm.entrypoints.cli.benchmark.main,
 ]