[Misc] Clean up flags in `vllm bench serve` (#25138)

Signed-off-by: Roger Wang <hey@rogerw.io>

[Misc] Clean up flags in `vllm bench serve` (#25138)
Signed-off-by: Roger Wang <hey@rogerw.io>
21da7334 · Roger Wang · GitHub · 66072b36 · 21da7334 · 21da7334
Unverified Commit 21da7334 authored Sep 18, 2025 by Roger Wang Committed by GitHub Sep 18, 2025
4 changed files
--- a/docs/contributing/benchmarks.md
+++ b/docs/contributing/benchmarks.md
@@ -156,7 +156,6 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct
 ```bash
 vllm bench serve \
  --backend openai-chat \
-  --endpoint-type openai-chat \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --endpoint /v1/chat/completions \
  --dataset-name hf \
@@ -230,7 +229,6 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct
 ```bash
 vllm bench serve \
  --backend openai-chat \
-  --endpoint-type openai-chat \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --endpoint /v1/chat/completions \
  --dataset-name hf \
@@ -245,7 +243,6 @@ vllm bench serve \
 ```bash
 vllm bench serve \
  --backend openai-chat \
-  --endpoint-type openai-chat \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --endpoint /v1/chat/completions \
  --dataset-name hf \

--- a/tests/benchmarks/test_serve_cli.py
+++ b/tests/benchmarks/test_serve_cli.py
@@ -68,7 +68,7 @@ def test_bench_serve_chat(server):
        "5",
        "--endpoint",
        "/v1/chat/completions",
-        "--endpoint-type",
+        "--backend",
        "openai-chat",
    ]
    result = subprocess.run(command, capture_output=True, text=True)

--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -1358,7 +1358,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
    elif args.dataset_name == "sonnet":
        dataset = SonnetDataset(dataset_path=args.dataset_path)
        # For the "sonnet" dataset, formatting depends on the backend.
-        if args.endpoint_type == "openai-chat":
+        if args.backend == "openai-chat":
            input_requests = dataset.sample(
                num_requests=args.num_prompts,
                input_len=args.sonnet_input_len,
@@ -1462,7 +1462,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
                "Please consider contributing if you would "
                "like to add support for additional dataset formats.")
-        if dataset_class.IS_MULTIMODAL and args.endpoint_type not in [
+        if dataset_class.IS_MULTIMODAL and args.backend not in [
                "openai-chat",
                "openai-audio",
        ]:
@@ -1470,7 +1470,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
            # endpoint-type.
            raise ValueError(
                "Multi-modal content is only supported on 'openai-chat' and "
-                "'openai-audio' endpoint-type.")
+                "'openai-audio' backends.")
        input_requests = dataset_class(
            dataset_path=args.dataset_path,
            dataset_subset=args.hf_subset,
@@ -1563,7 +1563,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
        try:
            # Enforce endpoint compatibility for multimodal datasets.
-            if args.dataset_name == "random-mm" and args.endpoint_type not in [
+            if args.dataset_name == "random-mm" and args.backend not in [
                    "openai-chat"]:
                raise ValueError(
                    "Multi-modal content (images) is only supported on "

--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -8,8 +8,8 @@ to launch the vLLM OpenAI API server:
 On the client side, run:
    vllm bench serve \
-        --endpoint-type <endpoint_type. Default 'openai'> \
+        --backend <backend or endpoint type. Default 'openai'> \
-        --label <benchmark result label. Default using endpoint_type> \
+        --label <benchmark result label. Default using backend> \
        --model <your_model> \
        --dataset-name <dataset_name. Default 'random'> \
        --request-rate <request_rate. Default inf> \
@@ -52,6 +52,21 @@ TERM_PLOTLIB_AVAILABLE = ((importlib.util.find_spec("termplotlib") is not None)
                          and (shutil.which("gnuplot") is not None))
+# TODO: Remove this in v0.11.0
+class DeprecatedEndpointTypeAction(argparse.Action):
+    """Argparse action for the deprecated --endpoint-type flag.
+    """
+    def __call__(self, _, namespace, values, option_string=None):
+        warnings.warn(
+            "'--endpoint-type' is deprecated and will be removed in v0.11.0. "
+            "Please use '--backend' instead or remove this argument if you "
+            "have already set it.",
+            stacklevel=1,
+        )
+        setattr(namespace, self.dest, values)
 class TaskType(Enum):
    GENERATION = "generation"
    EMBEDDING = "embedding"
@@ -470,7 +485,7 @@ async def benchmark(
        else:
            request_func = ASYNC_REQUEST_FUNCS[endpoint_type]
    else:
-        raise ValueError(f"Unknown endpoint_type: {endpoint_type}")
+        raise ValueError(f"Unknown backend: {endpoint_type}")
    # Reuses connections across requests to reduce TLS handshake overhead.
    connector = aiohttp.TCPConnector(
@@ -850,24 +865,28 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
 def add_cli_args(parser: argparse.ArgumentParser):
    add_dataset_parser(parser)
-    parser.add_argument(
-        "--endpoint-type",
-        type=str,
-        default="openai",
-        choices=list(ASYNC_REQUEST_FUNCS.keys()),
-    )
    parser.add_argument(
        "--label",
        type=str,
        default=None,
        help="The label (prefix) of the benchmark results. If not specified, "
-        "the endpoint type will be used as the label.",
+        "the value of '--backend' will be used as the label.",
    )
    parser.add_argument(
        "--backend",
        type=str,
-        default="vllm",
+        default="openai",
+        choices=list(ASYNC_REQUEST_FUNCS.keys()),
+        help="The type of backend or endpoint to use for the benchmark."
+    )
+    parser.add_argument(
+        "--endpoint-type",
+        type=str,
+        default=None,
        choices=list(ASYNC_REQUEST_FUNCS.keys()),
+        action=DeprecatedEndpointTypeAction,
+        help="'--endpoint-type' is deprecated and will be removed in v0.11.0. "
+        "Please use '--backend' instead.",
    )
    parser.add_argument(
        "--base-url",
@@ -1165,7 +1184,6 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
            raise ValueError(
                "For exponential ramp-up, the start RPS cannot be 0.")
-    endpoint_type = args.endpoint_type
    label = args.label
    model_id = args.model
    model_name = args.served_model_name
@@ -1228,7 +1246,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
    gc.freeze()
    benchmark_result = await benchmark(
-        endpoint_type=args.endpoint_type,
+        endpoint_type=args.backend,
        api_url=api_url,
        base_url=base_url,
        model_id=model_id,
@@ -1262,7 +1280,8 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
    # Setup
    current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
    result_json["date"] = current_dt
-    result_json["endpoint_type"] = args.endpoint_type
+    result_json["endpoint_type"] = args.backend # for backward compatibility
+    result_json["backend"] = args.backend
    result_json["label"] = label
    result_json["model_id"] = model_id
    result_json["tokenizer_id"] = tokenizer_id
@@ -1312,7 +1331,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
        base_model_id = model_id.split("/")[-1]
        max_concurrency_str = (f"-concurrency{args.max_concurrency}"
                               if args.max_concurrency is not None else "")
-        label = label or endpoint_type
+        label = label or args.backend
        if args.ramp_up_strategy is not None:
            file_name = f"{label}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
        else: