Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
21da7334
Unverified
Commit
21da7334
authored
Sep 18, 2025
by
Roger Wang
Committed by
GitHub
Sep 18, 2025
Browse files
[Misc] Clean up flags in `vllm bench serve` (#25138)
Signed-off-by:
Roger Wang
<
hey@rogerw.io
>
parent
66072b36
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
39 additions
and
23 deletions
+39
-23
docs/contributing/benchmarks.md
docs/contributing/benchmarks.md
+0
-3
tests/benchmarks/test_serve_cli.py
tests/benchmarks/test_serve_cli.py
+1
-1
vllm/benchmarks/datasets.py
vllm/benchmarks/datasets.py
+4
-4
vllm/benchmarks/serve.py
vllm/benchmarks/serve.py
+34
-15
No files found.
docs/contributing/benchmarks.md
View file @
21da7334
...
...
@@ -156,7 +156,6 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct
```
bash
vllm bench serve
\
--backend
openai-chat
\
--endpoint-type
openai-chat
\
--model
Qwen/Qwen2-VL-7B-Instruct
\
--endpoint
/v1/chat/completions
\
--dataset-name
hf
\
...
...
@@ -230,7 +229,6 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct
```
bash
vllm bench serve
\
--backend
openai-chat
\
--endpoint-type
openai-chat
\
--model
Qwen/Qwen2-VL-7B-Instruct
\
--endpoint
/v1/chat/completions
\
--dataset-name
hf
\
...
...
@@ -245,7 +243,6 @@ vllm bench serve \
```
bash
vllm bench serve
\
--backend
openai-chat
\
--endpoint-type
openai-chat
\
--model
Qwen/Qwen2-VL-7B-Instruct
\
--endpoint
/v1/chat/completions
\
--dataset-name
hf
\
...
...
tests/benchmarks/test_serve_cli.py
View file @
21da7334
...
...
@@ -68,7 +68,7 @@ def test_bench_serve_chat(server):
"5"
,
"--endpoint"
,
"/v1/chat/completions"
,
"--
endpoint-type
"
,
"--
backend
"
,
"openai-chat"
,
]
result
=
subprocess
.
run
(
command
,
capture_output
=
True
,
text
=
True
)
...
...
vllm/benchmarks/datasets.py
View file @
21da7334
...
...
@@ -1358,7 +1358,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
elif
args
.
dataset_name
==
"sonnet"
:
dataset
=
SonnetDataset
(
dataset_path
=
args
.
dataset_path
)
# For the "sonnet" dataset, formatting depends on the backend.
if
args
.
endpoint_type
==
"openai-chat"
:
if
args
.
backend
==
"openai-chat"
:
input_requests
=
dataset
.
sample
(
num_requests
=
args
.
num_prompts
,
input_len
=
args
.
sonnet_input_len
,
...
...
@@ -1462,7 +1462,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
"Please consider contributing if you would "
"like to add support for additional dataset formats."
)
if
dataset_class
.
IS_MULTIMODAL
and
args
.
endpoint_type
not
in
[
if
dataset_class
.
IS_MULTIMODAL
and
args
.
backend
not
in
[
"openai-chat"
,
"openai-audio"
,
]:
...
...
@@ -1470,7 +1470,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
# endpoint-type.
raise
ValueError
(
"Multi-modal content is only supported on 'openai-chat' and "
"'openai-audio'
endpoint-type
."
)
"'openai-audio'
backends
."
)
input_requests
=
dataset_class
(
dataset_path
=
args
.
dataset_path
,
dataset_subset
=
args
.
hf_subset
,
...
...
@@ -1563,7 +1563,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
try
:
# Enforce endpoint compatibility for multimodal datasets.
if
args
.
dataset_name
==
"random-mm"
and
args
.
endpoint_type
not
in
[
if
args
.
dataset_name
==
"random-mm"
and
args
.
backend
not
in
[
"openai-chat"
]:
raise
ValueError
(
"Multi-modal content (images) is only supported on "
...
...
vllm/benchmarks/serve.py
View file @
21da7334
...
...
@@ -8,8 +8,8 @@ to launch the vLLM OpenAI API server:
On the client side, run:
vllm bench serve \
--
endpoint-type <
endpoint
_
type. Default 'openai'> \
--label <benchmark result label. Default using
endpoint_type
> \
--
backend <backend or
endpoint
type. Default 'openai'> \
--label <benchmark result label. Default using
backend
> \
--model <your_model> \
--dataset-name <dataset_name. Default 'random'> \
--request-rate <request_rate. Default inf> \
...
...
@@ -52,6 +52,21 @@ TERM_PLOTLIB_AVAILABLE = ((importlib.util.find_spec("termplotlib") is not None)
and
(
shutil
.
which
(
"gnuplot"
)
is
not
None
))
# TODO: Remove this in v0.11.0
class
DeprecatedEndpointTypeAction
(
argparse
.
Action
):
"""Argparse action for the deprecated --endpoint-type flag.
"""
def
__call__
(
self
,
_
,
namespace
,
values
,
option_string
=
None
):
warnings
.
warn
(
"'--endpoint-type' is deprecated and will be removed in v0.11.0. "
"Please use '--backend' instead or remove this argument if you "
"have already set it."
,
stacklevel
=
1
,
)
setattr
(
namespace
,
self
.
dest
,
values
)
class
TaskType
(
Enum
):
GENERATION
=
"generation"
EMBEDDING
=
"embedding"
...
...
@@ -470,7 +485,7 @@ async def benchmark(
else
:
request_func
=
ASYNC_REQUEST_FUNCS
[
endpoint_type
]
else
:
raise
ValueError
(
f
"Unknown
endpoint_type
:
{
endpoint_type
}
"
)
raise
ValueError
(
f
"Unknown
backend
:
{
endpoint_type
}
"
)
# Reuses connections across requests to reduce TLS handshake overhead.
connector
=
aiohttp
.
TCPConnector
(
...
...
@@ -850,24 +865,28 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
def
add_cli_args
(
parser
:
argparse
.
ArgumentParser
):
add_dataset_parser
(
parser
)
parser
.
add_argument
(
"--endpoint-type"
,
type
=
str
,
default
=
"openai"
,
choices
=
list
(
ASYNC_REQUEST_FUNCS
.
keys
()),
)
parser
.
add_argument
(
"--label"
,
type
=
str
,
default
=
None
,
help
=
"The label (prefix) of the benchmark results. If not specified, "
"the
endpoint type
will be used as the label."
,
"the
value of '--backend'
will be used as the label."
,
)
parser
.
add_argument
(
"--backend"
,
type
=
str
,
default
=
"vllm"
,
default
=
"openai"
,
choices
=
list
(
ASYNC_REQUEST_FUNCS
.
keys
()),
help
=
"The type of backend or endpoint to use for the benchmark."
)
parser
.
add_argument
(
"--endpoint-type"
,
type
=
str
,
default
=
None
,
choices
=
list
(
ASYNC_REQUEST_FUNCS
.
keys
()),
action
=
DeprecatedEndpointTypeAction
,
help
=
"'--endpoint-type' is deprecated and will be removed in v0.11.0. "
"Please use '--backend' instead."
,
)
parser
.
add_argument
(
"--base-url"
,
...
...
@@ -1165,7 +1184,6 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
raise
ValueError
(
"For exponential ramp-up, the start RPS cannot be 0."
)
endpoint_type
=
args
.
endpoint_type
label
=
args
.
label
model_id
=
args
.
model
model_name
=
args
.
served_model_name
...
...
@@ -1228,7 +1246,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
gc
.
freeze
()
benchmark_result
=
await
benchmark
(
endpoint_type
=
args
.
endpoint_type
,
endpoint_type
=
args
.
backend
,
api_url
=
api_url
,
base_url
=
base_url
,
model_id
=
model_id
,
...
...
@@ -1262,7 +1280,8 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
# Setup
current_dt
=
datetime
.
now
().
strftime
(
"%Y%m%d-%H%M%S"
)
result_json
[
"date"
]
=
current_dt
result_json
[
"endpoint_type"
]
=
args
.
endpoint_type
result_json
[
"endpoint_type"
]
=
args
.
backend
# for backward compatibility
result_json
[
"backend"
]
=
args
.
backend
result_json
[
"label"
]
=
label
result_json
[
"model_id"
]
=
model_id
result_json
[
"tokenizer_id"
]
=
tokenizer_id
...
...
@@ -1312,7 +1331,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
base_model_id
=
model_id
.
split
(
"/"
)[
-
1
]
max_concurrency_str
=
(
f
"-concurrency
{
args
.
max_concurrency
}
"
if
args
.
max_concurrency
is
not
None
else
""
)
label
=
label
or
endpoint_type
label
=
label
or
args
.
backend
if
args
.
ramp_up_strategy
is
not
None
:
file_name
=
f
"
{
label
}
-ramp-up-
{
args
.
ramp_up_strategy
}
-
{
args
.
ramp_up_start_rps
}
qps-
{
args
.
ramp_up_end_rps
}
qps
{
max_concurrency_str
}
-
{
base_model_id
}
-
{
current_dt
}
.json"
# noqa
else
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment