[Benchmarks] add benchmark for embedding models (#23000)

Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>

[Benchmarks] add benchmark for embedding models (#23000)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
3ecbb14b · Jiangyun Zhu · GitHub · 7d67a9d9 · 3ecbb14b · 3ecbb14b
Unverified Commit 3ecbb14b authored Aug 26, 2025 by Jiangyun Zhu Committed by GitHub Aug 25, 2025
3 changed files
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -73,7 +73,7 @@ class SampleRequest:
    Represents a single inference request for benchmarking.
    """

-    prompt: Union[str, Any]
+    prompt: Union[str, list[str]]
    prompt_len: int
    expected_output_len: int
    multi_modal_data: Optional[
@@ -409,6 +409,7 @@ class RandomDataset(BenchmarkDataset):
        range_ratio: float = DEFAULT_RANGE_RATIO,
        input_len: int = DEFAULT_INPUT_LEN,
        output_len: int = DEFAULT_OUTPUT_LEN,
+        batchsize: int = 1,
        **kwargs,
    ) -> list[SampleRequest]:

@@ -439,6 +440,21 @@ class RandomDataset(BenchmarkDataset):
                    request_id=request_id_prefix + str(i),
                )
            )
+        # only used for embeddings benchmark.
+        if batchsize > 1:
+            batch_requests = []
+            # Create batched requests
+            for i in range(0, num_requests, batchsize):
+                batch = requests[i : i + batchsize]
+                batch_requests.append(
+                    SampleRequest(
+                        prompt=[req.prompt for req in batch],
+                        prompt_len=sum(req.prompt_len for req in batch),
+                        expected_output_len=0,
+                        request_id=request_id_prefix + str(i // batchsize),
+                    )
+                )
+            requests = batch_requests
        return requests

    def get_prefix(
@@ -506,7 +522,6 @@ class RandomDataset(BenchmarkDataset):
                                        size=num_requests)
        return input_lens, output_lens, offsets

-
    def generate_token_sequence(
        self,
        *,
@@ -1105,6 +1120,13 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
              "context length sampled from [input_len * (1 - range_ratio), "
              "input_len * (1 + range_ratio)]."),
    )
+    random_group.add_argument(
+        "--random-batch-size",
+        type=int,
+        default=1,
+        help=("Batch size for random sampling. "
+              "Only used for embeddings benchmark."),
+    )

    # random multimodal dataset options
    random_mm_group = parser.add_argument_group(
@@ -1196,8 +1218,6 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
        ),
    )

-
-
    hf_group = parser.add_argument_group("hf dataset options")
    hf_group.add_argument("--hf-subset",
                          type=str,
@@ -1348,22 +1368,24 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
    else:
        # For datasets that follow a similar structure, use a mapping.
        dataset_mapping = {
-            "sharegpt":
-            lambda: ShareGPTDataset(random_seed=args.seed,
-                                    dataset_path=args.dataset_path).sample(
+            "sharegpt": lambda: ShareGPTDataset(
+                random_seed=args.seed, dataset_path=args.dataset_path
+            ).sample(
                tokenizer=tokenizer,
                num_requests=args.num_prompts,
                output_len=args.sharegpt_output_len,
                request_id_prefix=args.request_id_prefix,
            ),
-            "burstgpt":
-            lambda: BurstGPTDataset(random_seed=args.seed,
-                                    dataset_path=args.dataset_path).
-            sample(tokenizer=tokenizer, num_requests=args.num_prompts, 
-                   request_id_prefix=args.request_id_prefix,),
-            "random":
-            lambda: RandomDataset(random_seed=args.seed,
-                                  dataset_path=args.dataset_path).sample(
+            "burstgpt": lambda: BurstGPTDataset(
+                random_seed=args.seed, dataset_path=args.dataset_path
+            ).sample(
+                tokenizer=tokenizer,
+                num_requests=args.num_prompts,
+                request_id_prefix=args.request_id_prefix,
+            ),
+            "random": lambda: RandomDataset(
+                random_seed=args.seed, dataset_path=args.dataset_path
+            ).sample(
                tokenizer=tokenizer,
                num_requests=args.num_prompts,
                prefix_len=args.random_prefix_len,
@@ -1371,6 +1393,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
                output_len=args.random_output_len,
                range_ratio=args.random_range_ratio,
                request_id_prefix=args.request_id_prefix,
+                batchsize=args.random_batch_size,
            ),
            "random-mm":
            lambda: RandomMultiModalDataset(

--- a/vllm/benchmarks/lib/endpoint_request_func.py
+++ b/vllm/benchmarks/lib/endpoint_request_func.py
@@ -69,7 +69,7 @@ async def async_request_openai_completions(
    ), "OpenAI Completions API URL must end with 'completions' or 'profile'."

    payload = {
-        "model": request_func_input.model_name \
+        "model": request_func_input.model_name
        if request_func_input.model_name else request_func_input.model,
        "prompt": request_func_input.prompt,
        "temperature": 0.0,
@@ -394,12 +394,61 @@ async def async_request_openai_audio(
    return output


+async def async_request_openai_embeddings(
+    request_func_input: RequestFuncInput,
+    session: aiohttp.ClientSession,
+    pbar: Optional[tqdm] = None,
+):
+    api_url = request_func_input.api_url
+    assert api_url.endswith(
+        "embeddings"
+    ), "OpenAI Embeddings API URL must end with 'embeddings'."
+
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+    }
+
+    payload = {
+        "model": request_func_input.model,
+        "input": request_func_input.prompt,
+    }
+
+    output = RequestFuncOutput()
+    st = time.perf_counter()
+    try:
+        async with session.post(
+            url=api_url,
+            headers=headers,
+            json=payload
+        ) as response:
+            if response.status == 200:
+                output.latency = time.perf_counter() - st
+                data = await response.json()
+                output.success = True
+                output.generated_text = ""
+                output.prompt_len = data.get(
+                    "usage", {}).get(
+                    "prompt_tokens", 0)
+            else:
+                output.success = False
+                output.error = response.reason or ""
+    except Exception as e:
+        output.success = False
+        output.error = str(e)
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
 # TODO: Add more request functions for different API protocols.
 ASYNC_REQUEST_FUNCS = {
    "vllm": async_request_openai_completions,
    "openai": async_request_openai_completions,
    "openai-chat": async_request_openai_chat_completions,
    "openai-audio": async_request_openai_audio,
+    "openai-embeddings": async_request_openai_embeddings,
 }

 OPENAI_COMPATIBLE_BACKENDS = [

--- a/vllm/benchmarks/serve.py
+++ b/vllm/benchmarks/serve.py
@@ -26,6 +26,7 @@ import warnings
 from collections.abc import AsyncGenerator, Iterable
 from dataclasses import dataclass
 from datetime import datetime
+from enum import Enum
 from typing import Any, Literal, Optional

 import aiohttp
@@ -46,6 +47,11 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
 MILLISECONDS_TO_SECONDS_CONVERSION = 1000


+class TaskType(Enum):
+    GENERATION = "generation"
+    EMBEDDING = "embedding"
+
+
 @dataclass
 class BenchmarkMetrics:
    completed: int
@@ -75,6 +81,16 @@ class BenchmarkMetrics:
    std_e2el_ms: float
    percentiles_e2el_ms: list[tuple[float, float]]

+@dataclass
+class EmbedBenchmarkMetrics:
+    completed: int
+    total_input: int
+    request_throughput: float
+    total_token_throughput :float
+    mean_e2el_ms: float
+    std_e2el_ms: float
+    median_e2el_ms: float
+    percentiles_e2el_ms: float

 def _get_current_request_rate(
    ramp_up_strategy: Optional[Literal["linear", "exponential"]],
@@ -189,6 +205,51 @@ async def get_request(
        yield request, request_rates[request_index]


+def calculate_metrics_for_embeddings(
+    outputs: list[RequestFuncOutput], 
+    dur_s: float, 
+    selected_percentiles: list[float]
+) -> EmbedBenchmarkMetrics:
+    """Calculate the metrics for the embedding requests.
+
+    Args:
+        outputs: The outputs of the requests.
+        dur_s: The duration of the benchmark.
+        selected_percentiles: The percentiles to select.
+
+    Returns:
+        The calculated benchmark metrics.
+    """
+    total_input = 0
+    completed = 0
+    e2els: list[float] = []
+    for i in range(len(outputs)):
+        if outputs[i].success:
+            e2els.append(outputs[i].latency)
+            completed += 1
+            total_input += outputs[i].prompt_len
+
+    if completed == 0:
+        warnings.warn(
+            "All requests failed. This is likely due to a misconfiguration "
+            "on the benchmark arguments.",
+            stacklevel=2)
+    metrics = EmbedBenchmarkMetrics(
+        completed=completed,
+        total_input=total_input,
+        request_throughput=completed / dur_s,
+        total_token_throughput=total_input / dur_s,
+        mean_e2el_ms=np.mean(e2els or 0) * 1000,
+        std_e2el_ms=np.std(e2els or 0) * 1000,
+        median_e2el_ms=np.median(e2els or 0) * 1000,
+        percentiles_e2el_ms=[
+            (p, np.percentile(e2els or 0, p) * 1000) 
+            for p in selected_percentiles
+        ],
+    )
+    return metrics
+
+
 def calculate_metrics(
    input_requests: list[SampleRequest],
    outputs: list[RequestFuncOutput],
@@ -334,7 +395,15 @@ async def benchmark(
    ramp_up_end_rps: Optional[int] = None,
    ready_check_timeout_sec: int = 600,
 ):
+    task_type = (
+        TaskType.EMBEDDING
+        if api_url.endswith("/v1/embeddings")
+        else TaskType.GENERATION
+    )
    if endpoint_type in ASYNC_REQUEST_FUNCS:
+        if task_type == TaskType.EMBEDDING:
+            request_func = ASYNC_REQUEST_FUNCS["openai-embeddings"]
+        else:
            request_func = ASYNC_REQUEST_FUNCS[endpoint_type]
    else:
        raise ValueError(f"Unknown endpoint_type: {endpoint_type}")
@@ -513,6 +582,7 @@ async def benchmark(

    benchmark_duration = time.perf_counter() - benchmark_start_time

+    if task_type == TaskType.GENERATION:
        metrics, actual_output_lens = calculate_metrics(
            input_requests=input_requests,
            outputs=outputs,
@@ -521,6 +591,13 @@ async def benchmark(
            selected_percentiles=selected_percentiles,
            goodput_config_dict=goodput_config_dict,
        )
+    else:
+        metrics = calculate_metrics_for_embeddings(
+            outputs=outputs,
+            dur_s=benchmark_duration,
+            selected_percentiles=selected_percentiles,
+        )
+        actual_output_lens = 0

    print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
    print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
@@ -529,22 +606,28 @@ async def benchmark(
                                     max_concurrency))
    if request_rate != float('inf'):
        print("{:<40} {:<10.2f}".format("Request rate configured (RPS):",
-                                        request_rate ))
+                                        request_rate))
    print("{:<40} {:<10.2f}".format("Benchmark duration (s):",
                                    benchmark_duration))
    print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
-    print("{:<40} {:<10}".format("Total generated tokens:",
-                                 metrics.total_output))
+    if isinstance(metrics, BenchmarkMetrics):
+        print("{:<40} {:<10}".format(
+            "Total generated tokens:", metrics.total_output))
    print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
                                    metrics.request_throughput))
    if goodput_config_dict:
        print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
                                        metrics.request_goodput))
-    print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
-                                    metrics.output_throughput))
+    if isinstance(metrics, BenchmarkMetrics):
+        print(
+            "{:<40} {:<10.2f}".format(
+                "Output token throughput (tok/s):", metrics.output_throughput
+            )
+        )
    print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
                                    metrics.total_token_throughput))

+    if isinstance(metrics, BenchmarkMetrics):
        result = {
            "duration": benchmark_duration,
            "completed": metrics.completed,
@@ -562,6 +645,16 @@ async def benchmark(
            "generated_texts": [output.generated_text for output in outputs],
            "errors": [output.error for output in outputs],
        }
+    else:
+        result = {
+            "duration": benchmark_duration,
+            "completed": metrics.completed,
+            "total_input_tokens": metrics.total_input,
+            "request_throughput": metrics.request_throughput,
+            "total_token_throughput": metrics.total_token_throughput,
+            "input_lens": [output.prompt_len for output in outputs],
+            "errors": [output.error for output in outputs],
+        }

    if rps_change_events:
        result["rps_change_events"] = rps_change_events
@@ -598,9 +691,10 @@ async def benchmark(
                                            value))
            result[f"p{p_word}_{metric_attribute_name}_ms"] = value

+    if task_type == TaskType.GENERATION:
        process_one_metric("ttft", "TTFT", "Time to First Token")
-    process_one_metric("tpot", "TPOT",
-                       "Time per Output Token (excl. 1st token)")
+        process_one_metric(
+            "tpot", "TPOT", "Time per Output Token (excl. 1st token)")
        process_one_metric("itl", "ITL", "Inter-token Latency")
    process_one_metric("e2el", "E2EL", "End-to-end Latency")

@@ -732,7 +826,8 @@ def add_cli_args(parser: argparse.ArgumentParser):
        "initiated, this argument will control how many are actually allowed "
        "to execute at a time. This means that when used in combination, the "
        "actual request rate may be lower than specified with --request-rate, "
-        "if the server is not processing requests fast enough to keep up.")
+        "if the server is not processing requests fast enough to keep up.",
+    )

    parser.add_argument(
        "--model",
@@ -743,8 +838,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
    parser.add_argument(
        "--tokenizer",
        type=str,
-        help=
-        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
+        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
    )
    parser.add_argument("--use-beam-search", action="store_true")
    parser.add_argument(
@@ -968,6 +1062,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
 def main(args: argparse.Namespace) -> dict[str, Any]:
    return asyncio.run(main_async(args))

+
 async def main_async(args: argparse.Namespace) -> dict[str, Any]:
    print(args)
    random.seed(args.seed)