Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
3ecbb14b
Unverified
Commit
3ecbb14b
authored
Aug 26, 2025
by
Jiangyun Zhu
Committed by
GitHub
Aug 25, 2025
Browse files
[Benchmarks] add benchmark for embedding models (#23000)
Signed-off-by:
zjy0516
<
riverclouds.zhu@qq.com
>
parent
7d67a9d9
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
274 additions
and
107 deletions
+274
-107
vllm/benchmarks/datasets.py
vllm/benchmarks/datasets.py
+45
-22
vllm/benchmarks/lib/endpoint_request_func.py
vllm/benchmarks/lib/endpoint_request_func.py
+53
-4
vllm/benchmarks/serve.py
vllm/benchmarks/serve.py
+176
-81
No files found.
vllm/benchmarks/datasets.py
View file @
3ecbb14b
...
...
@@ -73,7 +73,7 @@ class SampleRequest:
Represents a single inference request for benchmarking.
"""
prompt
:
Union
[
str
,
Any
]
prompt
:
Union
[
str
,
list
[
str
]
]
prompt_len
:
int
expected_output_len
:
int
multi_modal_data
:
Optional
[
...
...
@@ -409,6 +409,7 @@ class RandomDataset(BenchmarkDataset):
range_ratio
:
float
=
DEFAULT_RANGE_RATIO
,
input_len
:
int
=
DEFAULT_INPUT_LEN
,
output_len
:
int
=
DEFAULT_OUTPUT_LEN
,
batchsize
:
int
=
1
,
**
kwargs
,
)
->
list
[
SampleRequest
]:
...
...
@@ -439,6 +440,21 @@ class RandomDataset(BenchmarkDataset):
request_id
=
request_id_prefix
+
str
(
i
),
)
)
# only used for embeddings benchmark.
if
batchsize
>
1
:
batch_requests
=
[]
# Create batched requests
for
i
in
range
(
0
,
num_requests
,
batchsize
):
batch
=
requests
[
i
:
i
+
batchsize
]
batch_requests
.
append
(
SampleRequest
(
prompt
=
[
req
.
prompt
for
req
in
batch
],
prompt_len
=
sum
(
req
.
prompt_len
for
req
in
batch
),
expected_output_len
=
0
,
request_id
=
request_id_prefix
+
str
(
i
//
batchsize
),
)
)
requests
=
batch_requests
return
requests
def
get_prefix
(
...
...
@@ -506,7 +522,6 @@ class RandomDataset(BenchmarkDataset):
size
=
num_requests
)
return
input_lens
,
output_lens
,
offsets
def
generate_token_sequence
(
self
,
*
,
...
...
@@ -1105,6 +1120,13 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
"context length sampled from [input_len * (1 - range_ratio), "
"input_len * (1 + range_ratio)]."
),
)
random_group
.
add_argument
(
"--random-batch-size"
,
type
=
int
,
default
=
1
,
help
=
(
"Batch size for random sampling. "
"Only used for embeddings benchmark."
),
)
# random multimodal dataset options
random_mm_group
=
parser
.
add_argument_group
(
...
...
@@ -1196,8 +1218,6 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
),
)
hf_group
=
parser
.
add_argument_group
(
"hf dataset options"
)
hf_group
.
add_argument
(
"--hf-subset"
,
type
=
str
,
...
...
@@ -1348,22 +1368,24 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
else
:
# For datasets that follow a similar structure, use a mapping.
dataset_mapping
=
{
"sharegpt"
:
lambda
:
ShareGPTDataset
(
random_seed
=
args
.
seed
,
dataset_path
=
args
.
dataset_path
).
sample
(
"sharegpt"
:
lambda
:
ShareGPTDataset
(
random_seed
=
args
.
seed
,
dataset_path
=
args
.
dataset_path
).
sample
(
tokenizer
=
tokenizer
,
num_requests
=
args
.
num_prompts
,
output_len
=
args
.
sharegpt_output_len
,
request_id_prefix
=
args
.
request_id_prefix
,
),
"burstgpt"
:
lambda
:
BurstGPTDataset
(
random_seed
=
args
.
seed
,
dataset_path
=
args
.
dataset_path
).
sample
(
tokenizer
=
tokenizer
,
num_requests
=
args
.
num_prompts
,
request_id_prefix
=
args
.
request_id_prefix
,),
"random"
:
lambda
:
RandomDataset
(
random_seed
=
args
.
seed
,
dataset_path
=
args
.
dataset_path
).
sample
(
"burstgpt"
:
lambda
:
BurstGPTDataset
(
random_seed
=
args
.
seed
,
dataset_path
=
args
.
dataset_path
).
sample
(
tokenizer
=
tokenizer
,
num_requests
=
args
.
num_prompts
,
request_id_prefix
=
args
.
request_id_prefix
,
),
"random"
:
lambda
:
RandomDataset
(
random_seed
=
args
.
seed
,
dataset_path
=
args
.
dataset_path
).
sample
(
tokenizer
=
tokenizer
,
num_requests
=
args
.
num_prompts
,
prefix_len
=
args
.
random_prefix_len
,
...
...
@@ -1371,6 +1393,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
output_len
=
args
.
random_output_len
,
range_ratio
=
args
.
random_range_ratio
,
request_id_prefix
=
args
.
request_id_prefix
,
batchsize
=
args
.
random_batch_size
,
),
"random-mm"
:
lambda
:
RandomMultiModalDataset
(
...
...
vllm/benchmarks/lib/endpoint_request_func.py
View file @
3ecbb14b
...
...
@@ -69,7 +69,7 @@ async def async_request_openai_completions(
),
"OpenAI Completions API URL must end with 'completions' or 'profile'."
payload
=
{
"model"
:
request_func_input
.
model_name
\
"model"
:
request_func_input
.
model_name
if
request_func_input
.
model_name
else
request_func_input
.
model
,
"prompt"
:
request_func_input
.
prompt
,
"temperature"
:
0.0
,
...
...
@@ -394,12 +394,61 @@ async def async_request_openai_audio(
return
output
async
def
async_request_openai_embeddings
(
request_func_input
:
RequestFuncInput
,
session
:
aiohttp
.
ClientSession
,
pbar
:
Optional
[
tqdm
]
=
None
,
):
api_url
=
request_func_input
.
api_url
assert
api_url
.
endswith
(
"embeddings"
),
"OpenAI Embeddings API URL must end with 'embeddings'."
headers
=
{
"Content-Type"
:
"application/json"
,
"Authorization"
:
f
"Bearer
{
os
.
environ
.
get
(
'OPENAI_API_KEY'
)
}
"
,
}
payload
=
{
"model"
:
request_func_input
.
model
,
"input"
:
request_func_input
.
prompt
,
}
output
=
RequestFuncOutput
()
st
=
time
.
perf_counter
()
try
:
async
with
session
.
post
(
url
=
api_url
,
headers
=
headers
,
json
=
payload
)
as
response
:
if
response
.
status
==
200
:
output
.
latency
=
time
.
perf_counter
()
-
st
data
=
await
response
.
json
()
output
.
success
=
True
output
.
generated_text
=
""
output
.
prompt_len
=
data
.
get
(
"usage"
,
{}).
get
(
"prompt_tokens"
,
0
)
else
:
output
.
success
=
False
output
.
error
=
response
.
reason
or
""
except
Exception
as
e
:
output
.
success
=
False
output
.
error
=
str
(
e
)
if
pbar
:
pbar
.
update
(
1
)
return
output
# TODO: Add more request functions for different API protocols.
ASYNC_REQUEST_FUNCS
=
{
"vllm"
:
async_request_openai_completions
,
"openai"
:
async_request_openai_completions
,
"openai-chat"
:
async_request_openai_chat_completions
,
"openai-audio"
:
async_request_openai_audio
,
"openai-embeddings"
:
async_request_openai_embeddings
,
}
OPENAI_COMPATIBLE_BACKENDS
=
[
...
...
vllm/benchmarks/serve.py
View file @
3ecbb14b
...
...
@@ -26,6 +26,7 @@ import warnings
from
collections.abc
import
AsyncGenerator
,
Iterable
from
dataclasses
import
dataclass
from
datetime
import
datetime
from
enum
import
Enum
from
typing
import
Any
,
Literal
,
Optional
import
aiohttp
...
...
@@ -46,6 +47,11 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
MILLISECONDS_TO_SECONDS_CONVERSION
=
1000
class
TaskType
(
Enum
):
GENERATION
=
"generation"
EMBEDDING
=
"embedding"
@
dataclass
class
BenchmarkMetrics
:
completed
:
int
...
...
@@ -75,6 +81,16 @@ class BenchmarkMetrics:
std_e2el_ms
:
float
percentiles_e2el_ms
:
list
[
tuple
[
float
,
float
]]
@
dataclass
class
EmbedBenchmarkMetrics
:
completed
:
int
total_input
:
int
request_throughput
:
float
total_token_throughput
:
float
mean_e2el_ms
:
float
std_e2el_ms
:
float
median_e2el_ms
:
float
percentiles_e2el_ms
:
float
def
_get_current_request_rate
(
ramp_up_strategy
:
Optional
[
Literal
[
"linear"
,
"exponential"
]],
...
...
@@ -189,6 +205,51 @@ async def get_request(
yield
request
,
request_rates
[
request_index
]
def
calculate_metrics_for_embeddings
(
outputs
:
list
[
RequestFuncOutput
],
dur_s
:
float
,
selected_percentiles
:
list
[
float
]
)
->
EmbedBenchmarkMetrics
:
"""Calculate the metrics for the embedding requests.
Args:
outputs: The outputs of the requests.
dur_s: The duration of the benchmark.
selected_percentiles: The percentiles to select.
Returns:
The calculated benchmark metrics.
"""
total_input
=
0
completed
=
0
e2els
:
list
[
float
]
=
[]
for
i
in
range
(
len
(
outputs
)):
if
outputs
[
i
].
success
:
e2els
.
append
(
outputs
[
i
].
latency
)
completed
+=
1
total_input
+=
outputs
[
i
].
prompt_len
if
completed
==
0
:
warnings
.
warn
(
"All requests failed. This is likely due to a misconfiguration "
"on the benchmark arguments."
,
stacklevel
=
2
)
metrics
=
EmbedBenchmarkMetrics
(
completed
=
completed
,
total_input
=
total_input
,
request_throughput
=
completed
/
dur_s
,
total_token_throughput
=
total_input
/
dur_s
,
mean_e2el_ms
=
np
.
mean
(
e2els
or
0
)
*
1000
,
std_e2el_ms
=
np
.
std
(
e2els
or
0
)
*
1000
,
median_e2el_ms
=
np
.
median
(
e2els
or
0
)
*
1000
,
percentiles_e2el_ms
=
[
(
p
,
np
.
percentile
(
e2els
or
0
,
p
)
*
1000
)
for
p
in
selected_percentiles
],
)
return
metrics
def
calculate_metrics
(
input_requests
:
list
[
SampleRequest
],
outputs
:
list
[
RequestFuncOutput
],
...
...
@@ -334,7 +395,15 @@ async def benchmark(
ramp_up_end_rps
:
Optional
[
int
]
=
None
,
ready_check_timeout_sec
:
int
=
600
,
):
task_type
=
(
TaskType
.
EMBEDDING
if
api_url
.
endswith
(
"/v1/embeddings"
)
else
TaskType
.
GENERATION
)
if
endpoint_type
in
ASYNC_REQUEST_FUNCS
:
if
task_type
==
TaskType
.
EMBEDDING
:
request_func
=
ASYNC_REQUEST_FUNCS
[
"openai-embeddings"
]
else
:
request_func
=
ASYNC_REQUEST_FUNCS
[
endpoint_type
]
else
:
raise
ValueError
(
f
"Unknown endpoint_type:
{
endpoint_type
}
"
)
...
...
@@ -513,6 +582,7 @@ async def benchmark(
benchmark_duration
=
time
.
perf_counter
()
-
benchmark_start_time
if
task_type
==
TaskType
.
GENERATION
:
metrics
,
actual_output_lens
=
calculate_metrics
(
input_requests
=
input_requests
,
outputs
=
outputs
,
...
...
@@ -521,6 +591,13 @@ async def benchmark(
selected_percentiles
=
selected_percentiles
,
goodput_config_dict
=
goodput_config_dict
,
)
else
:
metrics
=
calculate_metrics_for_embeddings
(
outputs
=
outputs
,
dur_s
=
benchmark_duration
,
selected_percentiles
=
selected_percentiles
,
)
actual_output_lens
=
0
print
(
"{s:{c}^{n}}"
.
format
(
s
=
' Serving Benchmark Result '
,
n
=
50
,
c
=
'='
))
print
(
"{:<40} {:<10}"
.
format
(
"Successful requests:"
,
metrics
.
completed
))
...
...
@@ -529,22 +606,28 @@ async def benchmark(
max_concurrency
))
if
request_rate
!=
float
(
'inf'
):
print
(
"{:<40} {:<10.2f}"
.
format
(
"Request rate configured (RPS):"
,
request_rate
))
request_rate
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"Benchmark duration (s):"
,
benchmark_duration
))
print
(
"{:<40} {:<10}"
.
format
(
"Total input tokens:"
,
metrics
.
total_input
))
print
(
"{:<40} {:<10}"
.
format
(
"Total generated tokens:"
,
metrics
.
total_output
))
if
isinstance
(
metrics
,
BenchmarkMetrics
):
print
(
"{:<40} {:<10}"
.
format
(
"Total generated tokens:"
,
metrics
.
total_output
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"Request throughput (req/s):"
,
metrics
.
request_throughput
))
if
goodput_config_dict
:
print
(
"{:<40} {:<10.2f}"
.
format
(
"Request goodput (req/s):"
,
metrics
.
request_goodput
))
print
(
"{:<40} {:<10.2f}"
.
format
(
"Output token throughput (tok/s):"
,
metrics
.
output_throughput
))
if
isinstance
(
metrics
,
BenchmarkMetrics
):
print
(
"{:<40} {:<10.2f}"
.
format
(
"Output token throughput (tok/s):"
,
metrics
.
output_throughput
)
)
print
(
"{:<40} {:<10.2f}"
.
format
(
"Total Token throughput (tok/s):"
,
metrics
.
total_token_throughput
))
if
isinstance
(
metrics
,
BenchmarkMetrics
):
result
=
{
"duration"
:
benchmark_duration
,
"completed"
:
metrics
.
completed
,
...
...
@@ -562,6 +645,16 @@ async def benchmark(
"generated_texts"
:
[
output
.
generated_text
for
output
in
outputs
],
"errors"
:
[
output
.
error
for
output
in
outputs
],
}
else
:
result
=
{
"duration"
:
benchmark_duration
,
"completed"
:
metrics
.
completed
,
"total_input_tokens"
:
metrics
.
total_input
,
"request_throughput"
:
metrics
.
request_throughput
,
"total_token_throughput"
:
metrics
.
total_token_throughput
,
"input_lens"
:
[
output
.
prompt_len
for
output
in
outputs
],
"errors"
:
[
output
.
error
for
output
in
outputs
],
}
if
rps_change_events
:
result
[
"rps_change_events"
]
=
rps_change_events
...
...
@@ -598,9 +691,10 @@ async def benchmark(
value
))
result
[
f
"p
{
p_word
}
_
{
metric_attribute_name
}
_ms"
]
=
value
if
task_type
==
TaskType
.
GENERATION
:
process_one_metric
(
"ttft"
,
"TTFT"
,
"Time to First Token"
)
process_one_metric
(
"tpot"
,
"TPOT"
,
"Time per Output Token (excl. 1st token)"
)
process_one_metric
(
"tpot"
,
"TPOT"
,
"Time per Output Token (excl. 1st token)"
)
process_one_metric
(
"itl"
,
"ITL"
,
"Inter-token Latency"
)
process_one_metric
(
"e2el"
,
"E2EL"
,
"End-to-end Latency"
)
...
...
@@ -732,7 +826,8 @@ def add_cli_args(parser: argparse.ArgumentParser):
"initiated, this argument will control how many are actually allowed "
"to execute at a time. This means that when used in combination, the "
"actual request rate may be lower than specified with --request-rate, "
"if the server is not processing requests fast enough to keep up."
)
"if the server is not processing requests fast enough to keep up."
,
)
parser
.
add_argument
(
"--model"
,
...
...
@@ -743,8 +838,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
parser
.
add_argument
(
"--tokenizer"
,
type
=
str
,
help
=
"Name or path of the tokenizer, if not using the default tokenizer."
,
# noqa: E501
help
=
"Name or path of the tokenizer, if not using the default tokenizer."
,
# noqa: E501
)
parser
.
add_argument
(
"--use-beam-search"
,
action
=
"store_true"
)
parser
.
add_argument
(
...
...
@@ -968,6 +1062,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
def
main
(
args
:
argparse
.
Namespace
)
->
dict
[
str
,
Any
]:
return
asyncio
.
run
(
main_async
(
args
))
async
def
main_async
(
args
:
argparse
.
Namespace
)
->
dict
[
str
,
Any
]:
print
(
args
)
random
.
seed
(
args
.
seed
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment