Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ba108019
Unverified
Commit
ba108019
authored
Apr 05, 2025
by
Hyesoo Yang
Committed by
GitHub
Apr 06, 2025
Browse files
[Benchmark] Add sampling parameters to benchmark_serving. (#16022)
Signed-off-by:
Hyesoo Yang
<
hyeygit@gmail.com
>
parent
620fc2d0
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
80 additions
and
3 deletions
+80
-3
benchmarks/README.md
benchmarks/README.md
+18
-0
benchmarks/backend_request_func.py
benchmarks/backend_request_func.py
+6
-0
benchmarks/benchmark_serving.py
benchmarks/benchmark_serving.py
+56
-3
No files found.
benchmarks/README.md
View file @
ba108019
...
...
@@ -204,6 +204,24 @@ python3 vllm/benchmarks/benchmark_serving.py \
--seed
42
```
### Running With Sampling Parameters
When using OpenAI-compatible backends such as `vllm`, optional sampling
parameters can be specified. Example client command:
```bash
python3 vllm/benchmarks/benchmark_serving.py \
    --backend vllm \
    --model NousResearch/Hermes-3-Llama-3.1-8B \
    --endpoint /v1/completions \
    --dataset-name sharegpt \
    --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
    --top-k 10 \
    --top-p 0.9 \
    --temperature 0.5 \
    --num-prompts 10
```
---
## Example - Offline Throughput Benchmark
...
...
benchmarks/backend_request_func.py
View file @
ba108019
...
...
@@ -497,3 +497,9 @@ ASYNC_REQUEST_FUNCS = {
"scalellm"
:
async_request_openai_completions
,
"sglang"
:
async_request_openai_completions
,
}
OPENAI_COMPATIBLE_BACKENDS
=
[
k
for
k
,
v
in
ASYNC_REQUEST_FUNCS
.
items
()
if
v
in
(
async_request_openai_completions
,
async_request_openai_chat_completions
)
]
benchmarks/benchmark_serving.py
View file @
ba108019
...
...
@@ -34,7 +34,8 @@ from datetime import datetime
from
typing
import
Any
,
Optional
import
numpy
as
np
from
backend_request_func
import
(
ASYNC_REQUEST_FUNCS
,
RequestFuncInput
,
from
backend_request_func
import
(
ASYNC_REQUEST_FUNCS
,
OPENAI_COMPATIBLE_BACKENDS
,
RequestFuncInput
,
RequestFuncOutput
)
from
tqdm.asyncio
import
tqdm
from
transformers
import
PreTrainedTokenizerBase
...
...
@@ -260,6 +261,7 @@ async def benchmark(
goodput_config_dict
:
dict
[
str
,
float
],
max_concurrency
:
Optional
[
int
],
lora_modules
:
Optional
[
Iterable
[
str
]],
extra_body
:
Optional
[
dict
],
):
if
backend
in
ASYNC_REQUEST_FUNCS
:
request_func
=
ASYNC_REQUEST_FUNCS
[
backend
]
...
...
@@ -287,6 +289,7 @@ async def benchmark(
logprobs
=
logprobs
,
multi_modal_content
=
test_mm_content
,
ignore_eos
=
ignore_eos
,
extra_body
=
extra_body
,
)
test_output
=
await
request_func
(
request_func_input
=
test_input
)
...
...
@@ -313,7 +316,8 @@ async def benchmark(
output_len
=
test_output_len
,
logprobs
=
logprobs
,
multi_modal_content
=
test_mm_content
,
ignore_eos
=
ignore_eos
)
ignore_eos
=
ignore_eos
,
extra_body
=
extra_body
)
profile_output
=
await
request_func
(
request_func_input
=
profile_input
)
if
profile_output
.
success
:
print
(
"Profiler started"
)
...
...
@@ -363,7 +367,8 @@ async def benchmark(
output_len
=
output_len
,
logprobs
=
logprobs
,
multi_modal_content
=
mm_content
,
ignore_eos
=
ignore_eos
)
ignore_eos
=
ignore_eos
,
extra_body
=
extra_body
)
tasks
.
append
(
asyncio
.
create_task
(
limited_request_func
(
request_func_input
=
request_func_input
,
...
...
@@ -652,6 +657,26 @@ def main(args: argparse.Namespace):
raise
ValueError
(
f
"Unknown dataset:
{
args
.
dataset_name
}
"
)
from
err
goodput_config_dict
=
check_goodput_args
(
args
)
# Collect the sampling parameters.
sampling_params
=
{
k
:
v
for
k
,
v
in
{
"top_p"
:
args
.
top_p
,
"top_k"
:
args
.
top_k
,
"min_p"
:
args
.
min_p
,
"temperature"
:
args
.
temperature
}.
items
()
if
v
is
not
None
}
# Sampling parameters are only supported by openai-compatible backends.
if
sampling_params
and
args
.
backend
not
in
OPENAI_COMPATIBLE_BACKENDS
:
raise
ValueError
(
"Sampling parameters are only supported by openai-compatible "
"backends."
)
if
"temperature"
not
in
sampling_params
:
sampling_params
[
"temperature"
]
=
0.0
# Default to greedy decoding.
# Avoid GC processing "static" data - reduce pause times.
gc
.
collect
()
gc
.
freeze
()
...
...
@@ -678,6 +703,7 @@ def main(args: argparse.Namespace):
goodput_config_dict
=
goodput_config_dict
,
max_concurrency
=
args
.
max_concurrency
,
lora_modules
=
args
.
lora_modules
,
extra_body
=
sampling_params
,
))
# Save config and results to json
...
...
@@ -1000,6 +1026,33 @@ if __name__ == "__main__":
"from the sampled HF dataset."
,
)
sampling_group
=
parser
.
add_argument_group
(
"sampling parameters"
)
sampling_group
.
add_argument
(
"--top-p"
,
type
=
float
,
default
=
None
,
help
=
"Top-p sampling parameter. Only has effect on openai-compatible "
"backends."
)
sampling_group
.
add_argument
(
"--top-k"
,
type
=
int
,
default
=
None
,
help
=
"Top-k sampling parameter. Only has effect on openai-compatible "
"backends."
)
sampling_group
.
add_argument
(
"--min-p"
,
type
=
float
,
default
=
None
,
help
=
"Min-p sampling parameter. Only has effect on openai-compatible "
"backends."
)
sampling_group
.
add_argument
(
"--temperature"
,
type
=
float
,
default
=
None
,
help
=
"Temperature sampling parameter. Only has effect on "
"openai-compatible backends. If not specified, default to greedy "
"decoding (i.e. temperature==0.0)."
)
parser
.
add_argument
(
'--tokenizer-mode'
,
type
=
str
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment