Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
e56f44d9
Unverified
Commit
e56f44d9
authored
May 27, 2025
by
Michael Goin
Committed by
GitHub
May 27, 2025
Browse files
Support datasets in `vllm bench serve` and sync with benchmark_[serving,datasets].py (#18566)
parent
e0cbad4e
Changes
3
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
692 additions
and
101 deletions
+692
-101
vllm/benchmarks/datasets.py
vllm/benchmarks/datasets.py
+175
-10
vllm/benchmarks/endpoint_request_func.py
vllm/benchmarks/endpoint_request_func.py
+223
-3
vllm/benchmarks/serve.py
vllm/benchmarks/serve.py
+294
-88
No files found.
vllm/benchmarks/datasets.py
View file @
e56f44d9
...
@@ -62,6 +62,7 @@ class SampleRequest:
...
@@ -62,6 +62,7 @@ class SampleRequest:
class
BenchmarkDataset
(
ABC
):
class
BenchmarkDataset
(
ABC
):
DEFAULT_SEED
=
0
DEFAULT_SEED
=
0
IS_MULTIMODAL
=
False
def
__init__
(
def
__init__
(
self
,
self
,
...
@@ -316,13 +317,15 @@ class RandomDataset(BenchmarkDataset):
...
@@ -316,13 +317,15 @@ class RandomDataset(BenchmarkDataset):
)
)
vocab_size
=
tokenizer
.
vocab_size
vocab_size
=
tokenizer
.
vocab_size
num_special_tokens
=
tokenizer
.
num_special_tokens_to_add
()
real_input_len
=
input_len
-
num_special_tokens
prefix_token_ids
=
(
np
.
random
.
randint
(
prefix_token_ids
=
(
np
.
random
.
randint
(
0
,
vocab_size
,
size
=
prefix_len
).
tolist
()
if
prefix_len
>
0
else
[])
0
,
vocab_size
,
size
=
prefix_len
).
tolist
()
if
prefix_len
>
0
else
[])
# New sampling logic: [X * (1 - b), X * (1 + b)]
# New sampling logic: [X * (1 - b), X * (1 + b)]
input_low
=
int
(
input_len
*
(
1
-
range_ratio
))
input_low
=
int
(
real_
input_len
*
(
1
-
range_ratio
))
input_high
=
int
(
input_len
*
(
1
+
range_ratio
))
input_high
=
int
(
real_
input_len
*
(
1
+
range_ratio
))
output_low
=
int
(
output_len
*
(
1
-
range_ratio
))
output_low
=
int
(
output_len
*
(
1
-
range_ratio
))
output_high
=
int
(
output_len
*
(
1
+
range_ratio
))
output_high
=
int
(
output_len
*
(
1
+
range_ratio
))
...
@@ -345,6 +348,17 @@ class RandomDataset(BenchmarkDataset):
...
@@ -345,6 +348,17 @@ class RandomDataset(BenchmarkDataset):
vocab_size
).
tolist
()
vocab_size
).
tolist
()
token_sequence
=
prefix_token_ids
+
inner_seq
token_sequence
=
prefix_token_ids
+
inner_seq
prompt
=
tokenizer
.
decode
(
token_sequence
)
prompt
=
tokenizer
.
decode
(
token_sequence
)
# After decoding the prompt we have to encode and decode it again.
# This is done because in some cases N consecutive tokens
# give a string tokenized into != N number of tokens.
# For example for GPT2Tokenizer:
# [6880, 6881] -> ['Ġcalls', 'here'] ->
# [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
# To avoid uncontrolled change of the prompt length,
# the encoded sequence is truncated before being decode again.
re_encoded_sequence
=
tokenizer
.
encode
(
prompt
,
add_special_tokens
=
False
)[:
input_lens
[
i
]]
prompt
=
tokenizer
.
decode
(
re_encoded_sequence
)
total_input_len
=
prefix_len
+
int
(
input_lens
[
i
])
total_input_len
=
prefix_len
+
int
(
input_lens
[
i
])
requests
.
append
(
requests
.
append
(
SampleRequest
(
SampleRequest
(
...
@@ -637,6 +651,7 @@ class ConversationDataset(HuggingFaceDataset):
...
@@ -637,6 +651,7 @@ class ConversationDataset(HuggingFaceDataset):
SUPPORTED_DATASET_PATHS
=
{
SUPPORTED_DATASET_PATHS
=
{
'lmms-lab/LLaVA-OneVision-Data'
,
'Aeala/ShareGPT_Vicuna_unfiltered'
'lmms-lab/LLaVA-OneVision-Data'
,
'Aeala/ShareGPT_Vicuna_unfiltered'
}
}
IS_MULTIMODAL
=
True
def
sample
(
self
,
def
sample
(
self
,
tokenizer
:
PreTrainedTokenizerBase
,
tokenizer
:
PreTrainedTokenizerBase
,
...
@@ -701,6 +716,7 @@ class VisionArenaDataset(HuggingFaceDataset):
...
@@ -701,6 +716,7 @@ class VisionArenaDataset(HuggingFaceDataset):
"lmarena-ai/vision-arena-bench-v0.1"
:
"lmarena-ai/vision-arena-bench-v0.1"
:
lambda
x
:
x
[
"turns"
][
0
][
0
][
"content"
]
lambda
x
:
x
[
"turns"
][
0
][
0
][
"content"
]
}
}
IS_MULTIMODAL
=
True
def
sample
(
def
sample
(
self
,
self
,
...
@@ -784,6 +800,64 @@ class InstructCoderDataset(HuggingFaceDataset):
...
@@ -784,6 +800,64 @@ class InstructCoderDataset(HuggingFaceDataset):
return
sampled_requests
return
sampled_requests
# -----------------------------------------------------------------------------
# MT-Bench Dataset Implementation
# -----------------------------------------------------------------------------
class
MTBenchDataset
(
HuggingFaceDataset
):
"""
MT-Bench Dataset.
https://huggingface.co/datasets/philschmid/mt-bench
We create a single turn dataset for MT-Bench.
This is similar to Spec decoding benchmark setup in vLLM
https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18
"""
# noqa: E501
DEFAULT_OUTPUT_LEN
=
256
# avg len used in SD bench in vLLM
SUPPORTED_DATASET_PATHS
=
{
"philschmid/mt-bench"
,
}
def
sample
(
self
,
tokenizer
:
PreTrainedTokenizerBase
,
num_requests
:
int
,
output_len
:
Optional
[
int
]
=
None
,
enable_multimodal_chat
:
bool
=
False
,
**
kwargs
,
)
->
list
:
output_len
=
(
output_len
if
output_len
is
not
None
else
self
.
DEFAULT_OUTPUT_LEN
)
sampled_requests
=
[]
for
item
in
self
.
data
:
if
len
(
sampled_requests
)
>=
num_requests
:
break
prompt
=
item
[
"turns"
][
0
]
# apply template
prompt
=
tokenizer
.
apply_chat_template
(
[{
"role"
:
"user"
,
"content"
:
prompt
}],
add_generation_prompt
=
True
,
tokenize
=
False
,
)
prompt_len
=
len
(
tokenizer
(
prompt
).
input_ids
)
sampled_requests
.
append
(
SampleRequest
(
prompt
=
prompt
,
prompt_len
=
prompt_len
,
expected_output_len
=
output_len
,
))
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
)
return
sampled_requests
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
# AIMO Dataset Implementation
# AIMO Dataset Implementation
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
...
@@ -858,18 +932,18 @@ def _format_zeta_prompt(
...
@@ -858,18 +932,18 @@ def _format_zeta_prompt(
sample
:
dict
,
sample
:
dict
,
original_start_marker
:
str
=
"<|editable_region_start|>"
)
->
dict
:
original_start_marker
:
str
=
"<|editable_region_start|>"
)
->
dict
:
"""Format the zeta prompt for the Next Edit Prediction (NEP) dataset.
"""Format the zeta prompt for the Next Edit Prediction (NEP) dataset.
This function formats examples from the NEP dataset
This function formats examples from the NEP dataset
into prompts and expected outputs. It could be
into prompts and expected outputs. It could be
further extended to support more NEP datasets.
further extended to support more NEP datasets.
Args:
Args:
sample: The dataset sample containing events,
sample: The dataset sample containing events,
inputs, and outputs.
inputs, and outputs.
original_start_marker: The marker indicating the
original_start_marker: The marker indicating the
start of the editable region. Defaults to
start of the editable region. Defaults to
"<|editable_region_start|>".
"<|editable_region_start|>".
Returns:
Returns:
A dictionary with the formatted prompts and expected outputs.
A dictionary with the formatted prompts and expected outputs.
"""
"""
...
@@ -919,3 +993,94 @@ class NextEditPredictionDataset(HuggingFaceDataset):
...
@@ -919,3 +993,94 @@ class NextEditPredictionDataset(HuggingFaceDataset):
break
break
self
.
maybe_oversample_requests
(
samples
,
num_requests
)
self
.
maybe_oversample_requests
(
samples
,
num_requests
)
return
samples
return
samples
# -----------------------------------------------------------------------------
# ASR Dataset Implementation
# -----------------------------------------------------------------------------
class
ASRDataset
(
HuggingFaceDataset
):
"""
Dataset class for processing a ASR dataset for transcription.
Tested on the following set:
+----------------+----------------------------------------+--------------------------+-----------------------------+
| Dataset | Domain | Speaking Style | hf-subset |
+----------------+----------------------------------------+--------------------------+-----------------------------+
| TED-LIUM | TED talks | Oratory | release1, release2, release3|
| | | | release3-speaker-adaptation |
| VoxPopuli | European Parliament | Oratory | en, de, it, fr, ... |
| LibriSpeech | Audiobook | Narrated | "LIUM/tedlium" |
| GigaSpeech | Audiobook, podcast, YouTube | Narrated, spontaneous | xs, s, m, l, xl, dev, test |
| SPGISpeech | Financial meetings | Oratory, spontaneous | S, M, L, dev, test |
| AMI | Meetings | Spontaneous | ihm, sdm |
+----------------+----------------------------------------+--------------------------+-----------------------------+
"""
# noqa: E501
SUPPORTED_DATASET_PATHS
=
{
"openslr/librispeech_asr"
,
"facebook/voxpopuli"
,
"LIUM/tedlium"
,
"edinburghcstr/ami"
,
"speechcolab/gigaspeech"
,
"kensho/spgispeech"
,
}
DEFAULT_OUTPUT_LEN
=
128
IS_MULTIMODAL
=
True
# TODO Whisper-specific. Abstract interface when more models are supported.
TRANSCRIPTION_PREAMBLE
=
(
"<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
)
skip_long_audios
:
bool
=
True
def
sample
(
self
,
tokenizer
:
PreTrainedTokenizerBase
,
num_requests
:
int
,
output_len
:
Optional
[
int
]
=
None
,
**
kwargs
,
)
->
list
:
try
:
import
librosa
except
ImportError
as
e
:
raise
ImportError
(
"librosa is required for ASRDataset. Please install it "
"using `pip install librosa`."
)
from
e
output_len
=
(
output_len
if
output_len
is
not
None
else
self
.
DEFAULT_OUTPUT_LEN
)
prompt
=
ASRDataset
.
TRANSCRIPTION_PREAMBLE
prompt_len
=
len
(
tokenizer
(
prompt
).
input_ids
)
sampled_requests
=
[]
skipped
=
0
for
item
in
self
.
data
:
if
len
(
sampled_requests
)
>=
num_requests
:
break
audio
=
item
[
"audio"
]
y
,
sr
=
audio
[
"array"
],
audio
[
"sampling_rate"
]
duration_s
=
librosa
.
get_duration
(
y
=
y
,
sr
=
sr
)
# Whisper max supported duration
if
self
.
skip_long_audios
and
duration_s
>
30
:
skipped
+=
1
continue
mm_content
=
{
"audio"
:
(
y
,
sr
)}
sampled_requests
.
append
(
SampleRequest
(
prompt
=
prompt
,
prompt_len
=
prompt_len
,
expected_output_len
=
output_len
,
multi_modal_data
=
mm_content
,
))
if
skipped
:
logger
.
warning
(
"%d samples discarded from dataset due to"
" their length being greater than"
" what Whisper supports."
,
skipped
,
)
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
)
return
sampled_requests
vllm/benchmarks/endpoint_request_func.py
View file @
e56f44d9
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
"""The request function for API endpoints."""
"""The request function for API endpoints."""
import
io
import
json
import
json
import
os
import
os
import
sys
import
sys
...
@@ -24,11 +25,11 @@ class RequestFuncInput:
...
@@ -24,11 +25,11 @@ class RequestFuncInput:
output_len
:
int
output_len
:
int
model
:
str
model
:
str
model_name
:
Optional
[
str
]
=
None
model_name
:
Optional
[
str
]
=
None
best_of
:
int
=
1
logprobs
:
Optional
[
int
]
=
None
logprobs
:
Optional
[
int
]
=
None
extra_body
:
Optional
[
dict
]
=
None
extra_body
:
Optional
[
dict
]
=
None
multi_modal_content
:
Optional
[
dict
]
=
None
multi_modal_content
:
Optional
[
dict
]
=
None
ignore_eos
:
bool
=
False
ignore_eos
:
bool
=
False
language
:
Optional
[
str
]
=
None
@
dataclass
@
dataclass
...
@@ -71,7 +72,7 @@ async def async_request_openai_completions(
...
@@ -71,7 +72,7 @@ async def async_request_openai_completions(
if
request_func_input
.
model_name
else
request_func_input
.
model
,
if
request_func_input
.
model_name
else
request_func_input
.
model
,
"prompt"
:
request_func_input
.
prompt
,
"prompt"
:
request_func_input
.
prompt
,
"temperature"
:
0.0
,
"temperature"
:
0.0
,
"
best_of"
:
request_func_input
.
best_of
,
"
repetition_penalty"
:
1.0
,
"max_tokens"
:
request_func_input
.
output_len
,
"max_tokens"
:
request_func_input
.
output_len
,
"logprobs"
:
request_func_input
.
logprobs
,
"logprobs"
:
request_func_input
.
logprobs
,
"stream"
:
True
,
"stream"
:
True
,
...
@@ -154,7 +155,226 @@ async def async_request_openai_completions(
...
@@ -154,7 +155,226 @@ async def async_request_openai_completions(
return
output
return
output
async
def
async_request_openai_chat_completions
(
request_func_input
:
RequestFuncInput
,
pbar
:
Optional
[
tqdm
]
=
None
,
)
->
RequestFuncOutput
:
api_url
=
request_func_input
.
api_url
assert
api_url
.
endswith
((
"chat/completions"
,
"profile"
)),
(
"OpenAI Chat Completions API URL must end with 'chat/completions'."
)
async
with
aiohttp
.
ClientSession
(
trust_env
=
True
,
timeout
=
AIOHTTP_TIMEOUT
)
as
session
:
content
=
[{
"type"
:
"text"
,
"text"
:
request_func_input
.
prompt
}]
if
request_func_input
.
multi_modal_content
:
content
.
append
(
request_func_input
.
multi_modal_content
)
payload
=
{
"model"
:
request_func_input
.
model_name
if
request_func_input
.
model_name
else
request_func_input
.
model
,
"messages"
:
[
{
"role"
:
"user"
,
"content"
:
content
},
],
"temperature"
:
0.0
,
"max_completion_tokens"
:
request_func_input
.
output_len
,
"stream"
:
True
,
"stream_options"
:
{
"include_usage"
:
True
,
},
}
if
request_func_input
.
ignore_eos
:
payload
[
"ignore_eos"
]
=
request_func_input
.
ignore_eos
if
request_func_input
.
extra_body
:
payload
.
update
(
request_func_input
.
extra_body
)
headers
=
{
"Content-Type"
:
"application/json"
,
"Authorization"
:
f
"Bearer
{
os
.
environ
.
get
(
'OPENAI_API_KEY'
)
}
"
,
}
output
=
RequestFuncOutput
()
output
.
prompt_len
=
request_func_input
.
prompt_len
generated_text
=
""
ttft
=
0.0
st
=
time
.
perf_counter
()
most_recent_timestamp
=
st
try
:
async
with
session
.
post
(
url
=
api_url
,
json
=
payload
,
headers
=
headers
)
as
response
:
if
response
.
status
==
200
:
async
for
chunk_bytes
in
response
.
content
:
chunk_bytes
=
chunk_bytes
.
strip
()
if
not
chunk_bytes
:
continue
chunk
=
chunk_bytes
.
decode
(
"utf-8"
).
removeprefix
(
"data: "
)
if
chunk
!=
"[DONE]"
:
timestamp
=
time
.
perf_counter
()
data
=
json
.
loads
(
chunk
)
if
choices
:
=
data
.
get
(
"choices"
):
content
=
choices
[
0
][
"delta"
].
get
(
"content"
)
# First token
if
ttft
==
0.0
:
ttft
=
timestamp
-
st
output
.
ttft
=
ttft
# Decoding phase
else
:
output
.
itl
.
append
(
timestamp
-
most_recent_timestamp
)
generated_text
+=
content
or
""
elif
usage
:
=
data
.
get
(
"usage"
):
output
.
output_tokens
=
usage
.
get
(
"completion_tokens"
)
most_recent_timestamp
=
timestamp
output
.
generated_text
=
generated_text
output
.
success
=
True
output
.
latency
=
most_recent_timestamp
-
st
else
:
output
.
error
=
response
.
reason
or
""
output
.
success
=
False
except
Exception
:
output
.
success
=
False
exc_info
=
sys
.
exc_info
()
output
.
error
=
""
.
join
(
traceback
.
format_exception
(
*
exc_info
))
if
pbar
:
pbar
.
update
(
1
)
return
output
async
def
async_request_openai_audio
(
request_func_input
:
RequestFuncInput
,
pbar
:
Optional
[
tqdm
]
=
None
,
)
->
RequestFuncOutput
:
# Lazy import without PlaceholderModule to avoid vllm dep.
import
soundfile
api_url
=
request_func_input
.
api_url
assert
api_url
.
endswith
((
"transcriptions"
,
"translations"
)),
(
"OpenAI Chat Completions API URL must end with 'transcriptions' "
)
"or `translations`."
async
with
aiohttp
.
ClientSession
(
trust_env
=
True
,
timeout
=
AIOHTTP_TIMEOUT
)
as
session
:
content
=
[{
"type"
:
"text"
,
"text"
:
request_func_input
.
prompt
}]
payload
=
{
"model"
:
request_func_input
.
model_name
if
request_func_input
.
model_name
else
request_func_input
.
model
,
"temperature"
:
0.0
,
"max_completion_tokens"
:
request_func_input
.
output_len
,
"stream"
:
True
,
"language"
:
"en"
,
# Flattened due to multipart/form-data
"stream_include_usage"
:
True
,
"stream_continuous_usage_stats"
:
True
,
}
if
request_func_input
.
extra_body
:
payload
.
update
(
request_func_input
.
extra_body
)
headers
=
{
"Authorization"
:
f
"Bearer
{
os
.
environ
.
get
(
'OPENAI_API_KEY'
)
}
"
,
}
# Send audio file
def
to_bytes
(
y
,
sr
):
buffer
=
io
.
BytesIO
()
soundfile
.
write
(
buffer
,
y
,
sr
,
format
=
"WAV"
)
buffer
.
seek
(
0
)
return
buffer
with
to_bytes
(
*
request_func_input
.
multi_modal_content
[
"audio"
])
as
f
:
form
=
aiohttp
.
FormData
()
form
.
add_field
(
"file"
,
f
,
content_type
=
"audio/wav"
)
for
key
,
value
in
payload
.
items
():
form
.
add_field
(
key
,
str
(
value
))
output
=
RequestFuncOutput
()
output
.
prompt_len
=
request_func_input
.
prompt_len
generated_text
=
""
ttft
=
0.0
st
=
time
.
perf_counter
()
most_recent_timestamp
=
st
try
:
async
with
session
.
post
(
url
=
api_url
,
data
=
form
,
headers
=
headers
)
as
response
:
if
response
.
status
==
200
:
async
for
chunk_bytes
in
response
.
content
:
chunk_bytes
=
chunk_bytes
.
strip
()
if
not
chunk_bytes
:
continue
chunk
=
chunk_bytes
.
decode
(
"utf-8"
).
removeprefix
(
"data: "
)
if
chunk
!=
"[DONE]"
:
timestamp
=
time
.
perf_counter
()
data
=
json
.
loads
(
chunk
)
if
choices
:
=
data
.
get
(
"choices"
):
content
=
choices
[
0
][
"delta"
].
get
(
"content"
)
# First token
if
ttft
==
0.0
:
ttft
=
timestamp
-
st
output
.
ttft
=
ttft
# Decoding phase
else
:
output
.
itl
.
append
(
timestamp
-
most_recent_timestamp
)
generated_text
+=
content
or
""
elif
usage
:
=
data
.
get
(
"usage"
):
output
.
output_tokens
=
usage
.
get
(
"completion_tokens"
)
most_recent_timestamp
=
timestamp
output
.
generated_text
=
generated_text
output
.
success
=
True
output
.
latency
=
most_recent_timestamp
-
st
else
:
output
.
error
=
response
.
reason
or
""
output
.
success
=
False
except
Exception
:
output
.
success
=
False
exc_info
=
sys
.
exc_info
()
output
.
error
=
""
.
join
(
traceback
.
format_exception
(
*
exc_info
))
if
pbar
:
pbar
.
update
(
1
)
return
output
# TODO: Add more request functions for different API protocols.
# TODO: Add more request functions for different API protocols.
ASYNC_REQUEST_FUNCS
=
{
ASYNC_REQUEST_FUNCS
=
{
"openai-comp"
:
async_request_openai_completions
,
"vllm"
:
async_request_openai_completions
,
"openai"
:
async_request_openai_completions
,
"openai-chat"
:
async_request_openai_chat_completions
,
"openai-audio"
:
async_request_openai_audio
,
}
}
OPENAI_COMPATIBLE_BACKENDS
=
[
k
for
k
,
v
in
ASYNC_REQUEST_FUNCS
.
items
()
if
v
in
(
async_request_openai_completions
,
async_request_openai_chat_completions
)
]
vllm/benchmarks/serve.py
View file @
e56f44d9
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment