raojy / vllm_017 / Commits / 3b50924c

Commit 3b50924c, authored Mar 27, 2026 by raojy
Commit message: raw_vllm
Parent commit: fbeb8a6f
Pipeline #3455: canceled with stages
Changes: 144 · Pipelines: 1
Showing 20 changed files with 3469 additions and 0 deletions (+3469, -0)
Changed files on this page:

.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py (+414, -0)
.buildkite/performance-benchmarks/scripts/launch-server.sh (+224, -0)
.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh (+539, -0)
.buildkite/performance-benchmarks/tests/genai-perf-tests.json (+21, -0)
.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json (+26, -0)
.buildkite/performance-benchmarks/tests/latency-tests-cpu.json (+26, -0)
.buildkite/performance-benchmarks/tests/latency-tests-hpu.json (+106, -0)
.buildkite/performance-benchmarks/tests/latency-tests.json (+32, -0)
.buildkite/performance-benchmarks/tests/nightly-tests.json (+311, -0)
.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json (+130, -0)
.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json (+41, -0)
.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json (+283, -0)
.buildkite/performance-benchmarks/tests/serving-tests-cpu.json (+153, -0)
.buildkite/performance-benchmarks/tests/serving-tests-hpu.json (+161, -0)
.buildkite/performance-benchmarks/tests/serving-tests.json (+77, -0)
.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json (+27, -0)
.buildkite/performance-benchmarks/tests/throughput-tests-cpu.json (+27, -0)
.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json (+123, -0)
.buildkite/performance-benchmarks/tests/throughput-tests.json (+35, -0)
.buildkite/release-pipeline.yaml (+713, -0)
.buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py (new file, mode 100644)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import json
import os
import shlex
from importlib import util
from pathlib import Path
from typing import Any

import pandas as pd
import psutil
import regex as re
from tabulate import tabulate

# latency results and the keys that will be printed into markdown
latency_results = []
latency_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    "avg_latency": "Mean latency (ms)",
    # "P10": "P10 (s)",
    # "P25": "P25 (s)",
    "P50": "Median latency (ms)",
    # "P75": "P75 (s)",
    # "P90": "P90 (s)",
    "P99": "P99 latency (ms)",
}

# throughput tests and the keys that will be printed into markdown
throughput_results = []
throughput_results_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    "num_requests": "# of req.",
    "total_num_tokens": "Total # of tokens",
    "elapsed_time": "Elapsed time (s)",
    "requests_per_second": "Tput (req/s)",
    "tokens_per_second": "Tput (tok/s)",
}

# serving results and the keys that will be printed into markdown
serving_results = []
serving_column_mapping = {
    "test_name": "Test name",
    "model_id": "Model",
    "dataset_name": "Dataset Name",
    "input_len": "Input Len",
    "output_len": "Output Len",
    "tp_size": "TP Size",
    "pp_size": "PP Size",
    "dtype": "dtype",
    "gpu_type": "GPU",
    "completed": "# of req.",
    "qps": "qps",
    "max_concurrency": "# of max concurrency.",
    "request_throughput": "Tput (req/s)",
    "total_token_throughput": "Total Token Tput (tok/s)",
    "output_throughput": "Output Tput (tok/s)",
    # "total_input_tokens": "Total input tokens",
    # "total_output_tokens": "Total output tokens",
    "mean_ttft_ms": "Mean TTFT (ms)",
    "median_ttft_ms": "Median TTFT (ms)",
    "p99_ttft_ms": "P99 TTFT (ms)",
    "std_ttft_ms": "STD TTFT (ms)",
    "mean_tpot_ms": "Mean TPOT (ms)",
    "median_tpot_ms": "Median",
    "p99_tpot_ms": "P99",
    "std_tpot_ms": "STD TPOT (ms)",
    "mean_itl_ms": "Mean ITL (ms)",
    "median_itl_ms": "Median ITL (ms)",
    "p99_itl_ms": "P99 ITL (ms)",
}


def read_markdown(file):
    if os.path.exists(file):
        with open(file) as f:
            return f.read() + "\n"
    else:
        return f"{file} not found.\n"


def results_to_json(latency, throughput, serving):
    return json.dumps(
        {
            "latency": latency.to_dict(),
            "throughput": throughput.to_dict(),
            "serving": serving.to_dict(),
        }
    )


def get_size_with_unit(bytes, suffix="B"):
    """
    Scale bytes to its proper format
    e.g:
        1253656 => '1.20MB'
        1253656678 => '1.17GB'
    """
    factor = 1024
    for unit in ["", "K", "M", "G", "T", "P"]:
        if bytes < factor:
            return f"{bytes:.2f}{unit}{suffix}"
        bytes /= factor


def _coerce(val: str) -> Any:
    """Best-effort type coercion from string to Python types."""
    low = val.lower()
    if low == "null":
        return None
    if low == "true":
        return True
    if low == "false":
        return False
    # integers
    if re.fullmatch(r"[+-]?\d+", val):
        try:
            return int(val)
        except ValueError:
            pass
    # floats (keep 'inf'/'-inf'/'nan' as strings)
    if re.fullmatch(r"[+-]?\d*\.\d+", val):
        try:
            return float(val)
        except ValueError:
            pass
    return val


def parse_client_command(cmd: str) -> dict[str, Any]:
    """Parse the client_command shell string into {executable, script, args}."""
    toks = shlex.split(cmd)
    if len(toks) < 2:
        raise ValueError("client_command must include an executable and a script")
    executable, script = toks[0], toks[1]
    args: dict[str, Any] = {}

    i = 2
    while i < len(toks):
        t = toks[i]
        if t.startswith("--"):
            # --key=value or --key (value) or boolean flag
            if "=" in t:
                key, val = t.split("=", 1)
                if key == "--metadata":
                    md = {}
                    if val:
                        if "=" in val:
                            k, v = val.split("=", 1)
                            md[k] = _coerce(v)
                        else:
                            md[val] = True
                    args[key] = md
                else:
                    args[key] = _coerce(val)
                i += 1
                continue

            key = t

            # Special: consume metadata k=v pairs until next --flag
            if key == "--metadata":
                i += 1
                md = {}
                while i < len(toks) and not toks[i].startswith("--"):
                    pair = toks[i]
                    if "=" in pair:
                        k, v = pair.split("=", 1)
                        md[k] = _coerce(v)
                    else:
                        md[pair] = True
                    i += 1
                args[key] = md
                continue

            # Standard: check if next token is a value (not a flag)
            if i + 1 < len(toks) and not toks[i + 1].startswith("--"):
                args[key] = _coerce(toks[i + 1])
                i += 2
            else:
                # lone flag -> True
                args[key] = True
                i += 1
        else:
            # unexpected positional; skip
            i += 1

    return {"executable": executable, "script": script, "args": args}


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-r",
        "--result",
        type=str,
        default="results",
        help="Folder name for benchmark output results.",
    )
    args = parser.parse_args()
    results_folder = Path(args.result)
    if not results_folder.exists():
        raise FileNotFoundError(f"results folder does not exist: {results_folder}")

    # collect results
    for test_file in results_folder.glob("*.json"):
        with open(test_file) as f:
            raw_result = json.loads(f.read())

        if "serving" in str(test_file):
            # this result is generated via `vllm bench serve` command
            # attach the benchmarking command to raw_result
            try:
                with open(test_file.with_suffix(".commands")) as f:
                    command = json.loads(f.read())
            except OSError as e:
                print(e)
                continue

            # Parse Server Command Args
            out: dict[str, Any] = {
                "server_command": parse_client_command(command["server_command"])
            }
            parse_args = [
                "--tensor-parallel-size",
                "--pipeline-parallel-size",
                "--dtype",
            ]
            col_mapping = ["tp_size", "pp_size", "dtype"]
            for index, arg in enumerate(parse_args):
                if arg in out["server_command"]["args"]:
                    raw_result.update(
                        {col_mapping[index]: out["server_command"]["args"][arg]}
                    )

            # Parse Client Command Args
            out: dict[str, Any] = {
                "client_command": parse_client_command(command["client_command"])
            }
            parse_args = [
                "--dataset-name",
                "--random-input-len",
                "--random-output-len",
                "--request-rate",
            ]
            col_mapping = ["dataset_name", "input_len", "output_len", "qps"]
            for index, arg in enumerate(parse_args):
                if arg in out["client_command"]["args"]:
                    raw_result.update(
                        {col_mapping[index]: out["client_command"]["args"][arg]}
                    )

            # Add server and client commands
            raw_result.update(command)

            # update the test name of this result
            raw_result.update({"test_name": test_file.stem})

            # add the result to serving_results
            serving_results.append(raw_result)
            continue

        elif "latency" in f.name:
            # this result is generated via `vllm bench latency` command
            # attach the benchmarking command to raw_result
            try:
                with open(test_file.with_suffix(".commands")) as f:
                    command = json.loads(f.read())
            except OSError as e:
                print(e)
                continue
            raw_result.update(command)

            # update the test name of this result
            raw_result.update({"test_name": test_file.stem})

            # get different percentiles
            for perc in [10, 25, 50, 75, 90, 99]:
                # Multiply by 1000 to convert the time unit from s to ms
                raw_result.update(
                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}
                )
            raw_result["avg_latency"] = raw_result["avg_latency"] * 1000

            # add the result to latency_results
            latency_results.append(raw_result)
            continue

        elif "throughput" in f.name:
            # this result is generated via `vllm bench throughput` command
            # attach the benchmarking command to raw_result
            try:
                with open(test_file.with_suffix(".commands")) as f:
                    command = json.loads(f.read())
            except OSError as e:
                print(e)
                continue
            raw_result.update(command)

            # update the test name of this result
            raw_result.update({"test_name": test_file.stem})

            # add the result to throughput_results
            throughput_results.append(raw_result)
            continue

        print(f"Skipping {test_file}")

    latency_results = pd.DataFrame.from_dict(latency_results)
    serving_results = pd.DataFrame.from_dict(serving_results)
    throughput_results = pd.DataFrame.from_dict(throughput_results)

    svmem = psutil.virtual_memory()
    platform_data = {
        "Physical cores": [psutil.cpu_count(logical=False)],
        "Total cores": [psutil.cpu_count(logical=True)],
        "Total Memory": [get_size_with_unit(svmem.total)],
    }

    if util.find_spec("numa") is not None:
        from numa import info

        platform_data["Total NUMA nodes"] = [info.get_num_configured_nodes()]

    if util.find_spec("cpuinfo") is not None:
        from cpuinfo import get_cpu_info

        platform_data["CPU Brand"] = [get_cpu_info()["brand_raw"]]

    platform_results = pd.DataFrame.from_dict(
        platform_data, orient="index", columns=["Platform Info"]
    )

    raw_results_json = results_to_json(
        latency_results, throughput_results, serving_results
    )

    # remap the keys for visualization purposes
    if not latency_results.empty:
        latency_results = latency_results[
            list(latency_column_mapping.keys())
        ].rename(columns=latency_column_mapping)
    if not serving_results.empty:
        valid_columns = [
            col for col in serving_column_mapping if col in serving_results.columns
        ]
        serving_results = serving_results[valid_columns].rename(
            columns=serving_column_mapping
        )
    if not throughput_results.empty:
        throughput_results = throughput_results[
            list(throughput_results_column_mapping.keys())
        ].rename(columns=throughput_results_column_mapping)

    processed_results_json = results_to_json(
        latency_results, throughput_results, serving_results
    )

    for df in [latency_results, serving_results, throughput_results]:
        if df.empty:
            continue

        # Sort all dataframes by their respective "Test name" columns
        df.sort_values(by="Test name", inplace=True)

        # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
        # we want to turn it into "8xGPUTYPE"
        df["GPU"] = df["GPU"].apply(
            lambda x: "{}x{}".format(len(x.split("\n")), x.split("\n")[0])
        )

    # get markdown tables
    latency_md_table = tabulate(
        latency_results, headers="keys", tablefmt="pipe", showindex=False
    )
    serving_md_table = tabulate(
        serving_results, headers="keys", tablefmt="pipe", showindex=False
    )
    throughput_md_table = tabulate(
        throughput_results, headers="keys", tablefmt="pipe", showindex=False
    )
    platform_md_table = tabulate(
        platform_results, headers="keys", tablefmt="pipe", showindex=True
    )

    # document the result
    md_file = "benchmark_results.md"
    json_file = "benchmark_results.json"
    with open(results_folder / md_file, "w") as f:
        results = read_markdown(
            "../.buildkite/performance-benchmarks/"
            "performance-benchmarks-descriptions.md"
        )
        results = results.format(
            latency_tests_markdown_table=latency_md_table,
            throughput_tests_markdown_table=throughput_md_table,
            serving_tests_markdown_table=serving_md_table,
            platform_markdown_table=platform_md_table,
            benchmarking_results_in_json_string=processed_results_json,
        )
        f.write(results)

    # document benchmarking results in json
    with open(results_folder / json_file, "w") as f:
        results = (
            latency_results.to_dict(orient="records")
            + throughput_results.to_dict(orient="records")
            + serving_results.to_dict(orient="records")
        )
        f.write(json.dumps(results))
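For reference, once a benchmark run has populated a results folder with `*.json` files and their matching `*.commands` files, the converter above can be exercised directly. A minimal sketch, assuming pandas, psutil, regex, and tabulate are installed (the benchmark driver below installs tabulate and pandas itself; the rest are presumably present in the CI image):

    # Render markdown/JSON summaries from a finished benchmark run.
    # "results" is the script's own default for --result.
    python3 .buildkite/performance-benchmarks/scripts/convert-results-json-to-markdown.py --result results
    # Writes results/benchmark_results.md and results/benchmark_results.json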
.buildkite/performance-benchmarks/scripts/launch-server.sh (new file, mode 100644)
#!/bin/bash

# Currently FP8 benchmark is NOT enabled.

set -x
server_params=$1
common_params=$2

json2args() {
  # transforms the JSON string into command line args, with '_' replaced by '-'
  # example:
  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
  local json_string=$1
  local args=$(
    echo "$json_string" | jq -r '
      to_entries |
      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
      join(" ")
    '
  )
  echo "$args"
}

launch_trt_server() {
  model_path=$(echo "$common_params" | jq -r '.model')
  model_name="${model_path#*/}"
  model_type=$(echo "$server_params" | jq -r '.model_type')
  model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
  model_tp_size=$(echo "$common_params" | jq -r '.tp')
  max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
  max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
  max_seq_len=$(echo "$server_params" | jq -r '.max_seq_len')
  max_num_tokens=$(echo "$server_params" | jq -r '.max_num_tokens')
  trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')

  # create model caching directory
  cd ~
  rm -rf models
  mkdir -p models
  cd models
  models_dir=$(pwd)
  trt_model_path=${models_dir}/${model_name}-trt-ckpt
  trt_engine_path=${models_dir}/${model_name}-trt-engine

  # clone tensorrt backend
  cd /
  rm -rf tensorrtllm_backend
  git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
  git lfs install
  cd tensorrtllm_backend
  git checkout "$trt_llm_version"
  git submodule update --init --recursive

  # build trtllm engine
  cd /tensorrtllm_backend
  cd "./tensorrt_llm/examples/${model_type}"
  python3 convert_checkpoint.py \
    --model_dir "${model_path}" \
    --dtype "${model_dtype}" \
    --tp_size "${model_tp_size}" \
    --output_dir "${trt_model_path}"
  trtllm-build \
    --checkpoint_dir "${trt_model_path}" \
    --use_fused_mlp \
    --reduce_fusion disable \
    --workers 8 \
    --gpt_attention_plugin "${model_dtype}" \
    --gemm_plugin "${model_dtype}" \
    --tp_size "${model_tp_size}" \
    --max_batch_size "${max_batch_size}" \
    --max_input_len "${max_input_len}" \
    --max_seq_len "${max_seq_len}" \
    --max_num_tokens "${max_num_tokens}" \
    --output_dir "${trt_engine_path}"

  # handle triton protobuf files and launch triton server
  cd /tensorrtllm_backend
  mkdir triton_model_repo
  cp -r all_models/inflight_batcher_llm/* triton_model_repo/
  cd triton_model_repo
  rm -rf ./tensorrt_llm/1/*
  cp -r "${trt_engine_path}"/* ./tensorrt_llm/1
  python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false
  python3 ../tools/fill_template.py -i preprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5"
  python3 ../tools/fill_template.py -i postprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false"
  python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:"$max_batch_size"
  python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt "triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:False,bls_instance_count:1"
  cd /tensorrtllm_backend
  python3 scripts/launch_triton_server.py \
    --world_size="${model_tp_size}" \
    --model_repo=/tensorrtllm_backend/triton_model_repo &
}

launch_tgi_server() {
  model=$(echo "$common_params" | jq -r '.model')
  tp=$(echo "$common_params" | jq -r '.tp')
  port=$(echo "$common_params" | jq -r '.port')
  server_args=$(json2args "$server_params")

  if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
    echo "Key 'fp8' exists in common params."
    server_command="/tgi-entrypoint.sh \
      --model-id $model \
      --num-shard $tp \
      --port $port \
      --quantize fp8 \
      $server_args"
  else
    echo "Key 'fp8' does not exist in common params."
    server_command="/tgi-entrypoint.sh \
      --model-id $model \
      --num-shard $tp \
      --port $port \
      $server_args"
  fi

  echo "Server command: $server_command"
  eval "$server_command" &
}

launch_lmdeploy_server() {
  model=$(echo "$common_params" | jq -r '.model')
  tp=$(echo "$common_params" | jq -r '.tp')
  port=$(echo "$common_params" | jq -r '.port')
  server_args=$(json2args "$server_params")

  server_command="lmdeploy serve api_server $model \
    --tp $tp \
    --server-port $port \
    $server_args"

  # run the server
  echo "Server command: $server_command"
  bash -c "$server_command" &
}

launch_sglang_server() {
  model=$(echo "$common_params" | jq -r '.model')
  tp=$(echo "$common_params" | jq -r '.tp')
  port=$(echo "$common_params" | jq -r '.port')
  server_args=$(json2args "$server_params")

  if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
    echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
    model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
    server_command="python3 \
      -m sglang.launch_server \
      --tp $tp \
      --model-path $model \
      --port $port \
      $server_args"
  else
    echo "Key 'fp8' does not exist in common params."
    server_command="python3 \
      -m sglang.launch_server \
      --tp $tp \
      --model-path $model \
      --port $port \
      $server_args"
  fi

  # run the server
  echo "Server command: $server_command"
  eval "$server_command" &
}

launch_vllm_server() {
  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')

  model=$(echo "$common_params" | jq -r '.model')
  tp=$(echo "$common_params" | jq -r '.tp')
  port=$(echo "$common_params" | jq -r '.port')
  server_args=$(json2args "$server_params")

  if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
    echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
    model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
    server_command="vllm serve $model \
      -tp $tp \
      --port $port \
      $server_args"
  else
    echo "Key 'fp8' does not exist in common params."
    server_command="vllm serve $model \
      -tp $tp \
      --port $port \
      $server_args"
  fi

  # run the server
  echo "Server command: $server_command"
  eval "$server_command" &
}

main() {
  if [[ "$CURRENT_LLM_SERVING_ENGINE" == "trt" ]]; then
    launch_trt_server
  fi

  if [[ "$CURRENT_LLM_SERVING_ENGINE" == "tgi" ]]; then
    launch_tgi_server
  fi

  if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
    launch_lmdeploy_server
  fi

  if [[ "$CURRENT_LLM_SERVING_ENGINE" == "sglang" ]]; then
    launch_sglang_server
  fi

  if [[ "$CURRENT_LLM_SERVING_ENGINE" == *"vllm"* ]]; then
    launch_vllm_server
  fi
}

main
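The `json2args` transformation is the convention shared by both scripts in this commit. A standalone sketch of the same jq program, using the example from the function's own comment:

    params='{"model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1}'
    echo "$params" | jq -r '
      to_entries |
      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
      join(" ")
    '
    # prints: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1

Note that keys whose value is the empty string (e.g. "disable_log_stats": "") expand to bare flags, which is how the test JSONs below toggle boolean options.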
.buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh (new file, mode 100644)
#!/bin/bash

# This script assumes that we are already inside the vllm/ directory
# Benchmarking results will be available inside vllm/benchmarks/results/

# Do not set -e, as the mixtral 8x22B model tends to crash occasionally
# and we still want to see other benchmarking results even when mixtral crashes.
set -x
set -o pipefail

# Environment-driven debug controls (like ON_CPU=1)
DRY_RUN="${DRY_RUN:-0}"
MODEL_FILTER="${MODEL_FILTER:-}"
DTYPE_FILTER="${DTYPE_FILTER:-}"

check_gpus() {
  if command -v nvidia-smi; then
    # check the number of GPUs and GPU type.
    declare -g gpu_count=$(nvidia-smi --list-gpus | grep -c . || true)
  elif command -v amd-smi; then
    declare -g gpu_count=$(amd-smi list | grep -c 'GPU' || true)
  elif command -v hl-smi; then
    declare -g gpu_count=$(hl-smi --list | grep -ci "Module ID" || true)
  fi

  if [[ $gpu_count -gt 0 ]]; then
    echo "GPU found."
  else
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi

  declare -g arch_suffix=''

  if command -v nvidia-smi; then
    declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
  elif command -v amd-smi; then
    declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
  elif command -v hl-smi; then
    declare -g gpu_type=$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//')
    arch_suffix='-hpu'
  fi
  echo "GPU type is $gpu_type"
}

check_cpus() {
  # check the number of CPUs and NUMA nodes, and set the "GPU" type accordingly.
  declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
  if [[ $numa_count -gt 0 ]]; then
    echo "NUMA found."
    echo "$numa_count"
  else
    echo "Need at least 1 NUMA to run benchmarking."
    exit 1
  fi

  if [[ "$(uname -m)" == "aarch64" ]] || [[ "$(uname -m)" == "arm64" ]]; then
    declare -g gpu_type="arm64-cpu"
  else
    declare -g gpu_type="cpu"
  fi
  echo "GPU type is $gpu_type"
}

check_hf_token() {
  # check if HF_TOKEN is available and valid
  if [[ -z "$HF_TOKEN" ]]; then
    echo "Error: HF_TOKEN is not set."
    exit 1
  elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
    echo "Error: HF_TOKEN does not start with 'hf_'."
    exit 1
  else
    echo "HF_TOKEN is set and valid."
  fi
}

ensure_sharegpt_downloaded() {
  local FILE=ShareGPT_V3_unfiltered_cleaned_split.json
  if [ ! -f "$FILE" ]; then
    wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE
  else
    echo "$FILE already exists."
  fi
}

json2args() {
  # transforms the JSON string into command line args, with '_' replaced by '-'
  # example:
  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
  local json_string=$1
  local args=$(
    echo "$json_string" | jq -r '
      to_entries |
      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
      join(" ")
    '
  )
  echo "$args"
}

json2envs() {
  # transforms the JSON string into environment variables.
  # example:
  # input: { "VLLM_CPU_KVCACHE_SPACE": 5 }
  # output: VLLM_CPU_KVCACHE_SPACE=5
  local json_string=$1
  local args=$(
    echo "$json_string" | jq -r '
      to_entries |
      map((.key ) + "=" + (.value | tostring)) |
      join(" ")
    '
  )
  echo "$args"
}

wait_for_server() {
  local timeout_val="1200"
  timeout "$timeout_val" bash -c '
    until curl -sf http://localhost:8000/v1/models >/dev/null; do
      sleep 1
    done
  '
}

kill_processes_launched_by_current_bash() {
  # Kill all python processes launched from the current bash script
  current_shell_pid=$$
  processes=$(ps -eo pid,ppid,command | awk -v ppid="$current_shell_pid" -v proc="$1" '$2 == ppid && $3 ~ proc {print $1}')
  if [ -n "$processes" ]; then
    echo "Killing the following processes matching '$1':"
    echo "$processes"
    echo "$processes" | xargs kill -9
  else
    echo "No processes found matching '$1'."
  fi
}

kill_gpu_processes() {
  ps -aux
  lsof -t -i :8000 | xargs -r kill -9
  pgrep python3 | xargs -r kill -9
  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
  pgrep VLLM | xargs -r kill -9

  # wait until GPU memory usage is smaller than 1GB
  if command -v nvidia-smi; then
    while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
      sleep 1
    done
  elif command -v amd-smi; then
    while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
      sleep 1
    done
  elif command -v hl-smi; then
    while [ "$(hl-smi -q | grep "Used" | head -n 1 | awk '{print $3}')" -ge 1000 ]; do
      sleep 1
    done
  fi

  # remove vllm config file
  rm -rf ~/.config/vllm
}

upload_to_buildkite() {
  # upload the benchmarking results to buildkite
  # if the agent binary is not found, skip uploading the results and return 0
  # Check if buildkite-agent is available in the PATH or at /workspace/buildkite-agent
  if command -v buildkite-agent > /dev/null 2>&1; then
    BUILDKITE_AGENT_COMMAND="buildkite-agent"
  elif [ -f /workspace/buildkite-agent ]; then
    BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent"
  else
    echo "buildkite-agent binary not found. Skip uploading the results."
    return 0
  fi

  # Use the determined command to annotate and upload artifacts
  $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < "$RESULTS_FOLDER/benchmark_results.md"
  $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
}

run_benchmark_tests() {
  # run benchmark tests using `vllm bench <test_type>` command
  # $1: test type (latency or throughput)
  # $2: a json file specifying test cases
  local test_type=$1
  local test_file=$2

  # Iterate over tests
  jq -c '.[]' "$test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')
    if [[ ! "$test_name" =~ ^${test_type}_ ]]; then
      echo "In ${test_type}-test.json, test_name must start with \"${test_type}_\"."
      exit 1
    fi

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # get arguments
    bench_params=$(echo "$params" | jq -r '.parameters')
    bench_args=$(json2args "$bench_params")
    bench_environment_variables=$(echo "$params" | jq -r '.environment_variables')
    bench_envs=$(json2envs "$bench_environment_variables")

    # check if there are enough GPUs to run the test
    tp=$(echo "$bench_params" | jq -r '.tensor_parallel_size')
    if [[ "$ON_CPU" == "1" ]]; then
      pp=$(echo "$bench_params" | jq -r '.pipeline_parallel_size // 1')
      world_size=$(($tp * $pp))
      if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
        continue
      fi
    else
      if [[ $gpu_count -lt $tp ]]; then
        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
        continue
      fi
    fi

    bench_command="$bench_envs vllm bench $test_type \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $bench_args"

    echo "Running test case $test_name"
    echo "${test_type^} command: $bench_command"

    # record the benchmarking command and GPU type
    jq_output=$(jq -n \
      --arg command "$bench_command" \
      --arg gpu "$gpu_type" \
      --arg test_type "$test_type" \
      '{
        ($test_type + "_command"): $command,
        gpu_type: $gpu
      }')
    echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"

    # run the benchmark
    eval "$bench_command"

    kill_gpu_processes
  done
}

run_latency_tests() { run_benchmark_tests "latency" "$1"; }
run_startup_tests() { run_benchmark_tests "startup" "$1"; }
run_throughput_tests() { run_benchmark_tests "throughput" "$1"; }

merge_serving_tests_stream() {
  # Emit merged serving test objects, optionally filtered by MODEL_FILTER/DTYPE_FILTER in DRY_RUN mode.
  # This helper does NOT modify JSON; it only filters the stream in dry-run mode.
  local serving_test_file="$1"

  # shellcheck disable=SC2016
  local merged='
    if type == "array" then
      # Plain format: test cases array
      .[]
    elif (type == "object" and has("tests")) then
      # merge the default parameters into each test case
      . as $root
      | ($root.defaults // {}) as $d
      | ($root.tests // [])[]
      # default qps / max_concurrency from defaults if missing
      | .qps_list = (.qps_list // $d.qps_list)
      | .max_concurrency_list = (.max_concurrency_list // $d.max_concurrency_list)
      # merge envs / params: test overrides defaults
      | .server_environment_variables =
          (($d.server_environment_variables // {}) + (.server_environment_variables // {}))
      | .server_parameters =
          (($d.server_parameters // {}) + (.server_parameters // {}))
      | .client_parameters =
          (($d.client_parameters // {}) + (.client_parameters // {}))
    else
      error("Unsupported serving test file format: must be array or object with .tests")
    end
  '

  jq -c "$merged" "$serving_test_file" | \
  if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
    jq -c --arg model "$MODEL_FILTER" --arg dtype "$DTYPE_FILTER" '
      select((($model|length)==0)
             or ((.server_parameters.model // "") == $model)
             or ((.client_parameters.model // "") == $model))
      | select((($dtype|length)==0) or ((.server_parameters.dtype // "") == $dtype))
    '
  else
    cat
  fi
}

run_serving_tests() {
  # run serving tests using `vllm bench serve` command
  # $1: a json file specifying serving test cases
  #
  # Supported JSON formats:
  # 1) Plain format: top-level array
  #    [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
  #
  # 2) Default parameters field + plain format tests
  #    {
  #      "defaults": { ... },
  #      "tests": [ { "test_name": "...", "server_parameters": {...}, ... }, ... ]
  #    }
  local serving_test_file
  serving_test_file=$1

  # In dry-run mode, if filters are provided but no tests match, fail fast.
  if [[ "${DRY_RUN:-0}" == "1" && ( "${MODEL_FILTER}${DTYPE_FILTER}" != "" ) ]]; then
    local count
    count=$(merge_serving_tests_stream "$serving_test_file" | wc -l | tr -d ' ')
    if [[ "$count" -eq 0 ]]; then
      echo "No matching serving tests found in $serving_test_file for model='$MODEL_FILTER' dtype='$DTYPE_FILTER'." >&2
      return 0
    fi
  fi

  # Iterate over serving tests (merged + optionally filtered stream)
  merge_serving_tests_stream "$serving_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')
    if [[ ! "$test_name" =~ ^serving_ ]]; then
      echo "In serving-test.json, test_name must start with \"serving_\"."
      exit 1
    fi

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # get client and server arguments (after merging the default parameters)
    server_params=$(echo "$params" | jq -r '.server_parameters')
    server_envs=$(echo "$params" | jq -r '.server_environment_variables')
    client_params=$(echo "$params" | jq -r '.client_parameters')
    server_args=$(json2args "$server_params")
    server_envs=$(json2envs "$server_envs")
    client_args=$(json2args "$client_params")

    # qps_list
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"

    # max_concurrency_list (fall back to num_prompts if missing)
    max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
    if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
      num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
      max_concurrency_list="[$num_prompts]"
    fi
    max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
    echo "Running over max concurrency list $max_concurrency_list"

    # check if there are enough resources to run the test
    tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
    if [[ "$ON_CPU" == "1" ]]; then
      pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size // 1')
      world_size=$(($tp * $pp))
      if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
        continue
      fi
    else
      if [[ $gpu_count -lt $tp ]]; then
        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
        continue
      fi
    fi

    # check that server model and client model are aligned
    server_model=$(echo "$server_params" | jq -r '.model')
    client_model=$(echo "$client_params" | jq -r '.model')
    if [[ $server_model != "$client_model" ]]; then
      echo "Server model and client model must be the same. Skip testcase $test_name."
      continue
    fi

    server_command="$server_envs vllm serve \
      $server_args"

    # run the server
    echo "Running test case $test_name"
    echo "Server command: $server_command"

    # support a remote vllm server
    client_remote_args=""
    if [[ -z "${REMOTE_HOST}" && "${DRY_RUN:-0}" != "1" ]]; then
      bash -c "$server_command" &
      server_pid=$!
      # wait until the server is alive
      if wait_for_server; then
        echo ""
        echo "vLLM server is up and running."
      else
        echo ""
        echo "vLLM failed to start within the timeout period."
      fi
    elif [[ "${DRY_RUN:-0}" == "1" ]]; then
      # dry-run: don't start the server
      echo "Dry Run."
    else
      server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
      if [[ ${REMOTE_PORT} ]]; then
        client_remote_args=" --host=$REMOTE_HOST --port=$REMOTE_PORT "
      else
        client_remote_args=" --host=$REMOTE_HOST "
      fi
    fi

    # save the compilation mode and optimization level in the serving results
    # whenever they are set
    compilation_config_mode=$(echo "$server_params" | jq -r '."compilation_config.mode" // empty')
    optimization_level=$(echo "$server_params" | jq -r '.optimization_level // empty')

    # iterate over different QPS
    for qps in $qps_list; do
      # remove the surrounding single quotes from qps
      if [[ "$qps" == *"inf"* ]]; then
        qps="inf"
      fi

      # iterate over different max_concurrency
      for max_concurrency in $max_concurrency_list; do
        new_test_name="${test_name}_qps_${qps}_concurrency_${max_concurrency}"
        echo " new test name $new_test_name"

        # pass the tensor parallel size, the compilation mode, and the optimization
        # level to the client so that they can be used on the benchmark dashboard
        client_command="vllm bench serve \
          --save-result \
          --result-dir $RESULTS_FOLDER \
          --result-filename ${new_test_name}.json \
          --request-rate $qps \
          --max-concurrency $max_concurrency \
          --metadata tensor_parallel_size=$tp compilation_config.mode=$compilation_config_mode optimization_level=$optimization_level \
          $client_args $client_remote_args"

        echo "Running test case $test_name with qps $qps"
        echo "Client command: $client_command"

        if [[ "${DRY_RUN:-0}" != "1" ]]; then
          bash -c "$client_command"
        fi

        # record the benchmarking commands
        jq_output=$(jq -n \
          --arg server "$server_command" \
          --arg client "$client_command" \
          --arg gpu "$gpu_type" \
          '{
            server_command: $server,
            client_command: $client,
            gpu_type: $gpu
          }')
        echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
      done
    done

    # clean up
    if [[ "${DRY_RUN:-0}" != "1" ]]; then
      kill -9 "$server_pid"
      kill_gpu_processes
    fi
  done
}

main() {
  local ARCH
  ARCH=''
  if [[ "$ON_CPU" == "1" ]]; then
    check_cpus
    ARCH="-$gpu_type"
  else
    check_gpus
    ARCH="$arch_suffix"
  fi

  # DRY_RUN does not execute vLLM; do not require HF_TOKEN.
  if [[ "${DRY_RUN:-0}" != "1" ]]; then
    check_hf_token
  else
    echo "DRY_RUN=1 -> skip HF_TOKEN validation"
  fi

  # dependencies
  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
  (which jq) || (apt-get update && apt-get -y install jq)
  (which lsof) || (apt-get update && apt-get install -y lsof)

  # get the current IP address, required by the `vllm bench serve` command
  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
  # turn off the reporting of the status of each request, to clean up the terminal output
  export VLLM_LOGGING_LEVEL="WARNING"

  # prepare for benchmarking
  cd benchmarks || exit 1
  ensure_sharegpt_downloaded
  declare -g RESULTS_FOLDER=results/
  mkdir -p $RESULTS_FOLDER
  QUICK_BENCHMARK_ROOT=../.buildkite/performance-benchmarks/

  # dump vllm info via vllm collect-env
  env_output=$(vllm collect-env)
  echo "$env_output" > "$RESULTS_FOLDER/vllm_env.txt"

  # benchmarking
  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" || exit $?

  if [[ "${DRY_RUN:-0}" == "1" ]]; then
    echo "DRY_RUN=1 -> skip latency/startup/throughput suites"
    exit 0
  fi

  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
  run_startup_tests $QUICK_BENCHMARK_ROOT/tests/"${STARTUP_JSON:-startup-tests$ARCH.json}"
  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"

  # postprocess benchmarking results
  pip install tabulate pandas
  python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py

  upload_to_buildkite
}

main "$@"
.buildkite/performance-benchmarks/tests/genai-perf-tests.json (new file, mode 100644)
[
  {
    "test_name": "llama8B_tp1_genai_perf",
    "qps_list": [4, 8, 16, 32],
    "common_parameters": {
      "model": "meta-llama/Meta-Llama-3-8B-Instruct",
      "tp": 1,
      "port": 8000,
      "num_prompts": 500,
      "reuse_server": false
    },
    "vllm_server_parameters": {
      "disable_log_stats": "",
      "gpu_memory_utilization": 0.9,
      "max_num_seqs": 512,
      "dtype": "bfloat16"
    },
    "genai_perf_input_parameters": {}
  }
]
.buildkite/performance-benchmarks/tests/latency-tests-arm64-cpu.json (new file, mode 100644)
[
  {
    "test_name": "latency_llama8B_tp1",
    "environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "dtype": "bfloat16",
      "distributed_executor_backend": "mp",
      "block_size": 128,
      "trust_remote_code": "",
      "disable_log_stats": "",
      "enforce_eager": "",
      "max_num_batched_tokens": 2048,
      "max_num_seqs": 256,
      "num_iters_warmup": 5,
      "num_iters": 15
    }
  }
]
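For reference, `run_benchmark_tests` expands the entry above through `json2envs` and `json2args`, so the executed command should look approximately like the following sketch (truncated once the pattern is clear):

    VLLM_RPC_TIMEOUT=100000 VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
    VLLM_ENGINE_ITERATION_TIMEOUT_S=120 VLLM_CPU_KVCACHE_SPACE=40 \
    vllm bench latency \
      --output-json results/latency_llama8B_tp1.json \
      --model meta-llama/Llama-3.1-8B-Instruct \
      --tensor-parallel-size 1 \
      --load-format dummy \
      --dtype bfloat16
      # ...remaining keys follow the same underscore-to-dash mapping;
      # empty-string values (trust_remote_code, enforce_eager, ...) become bare flags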
.buildkite/performance-benchmarks/tests/latency-tests-cpu.json (new file, mode 100644)
[
  {
    "test_name": "latency_llama8B_tp2",
    "environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_SGL_KERNEL": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 2,
      "dtype": "bfloat16",
      "distributed_executor_backend": "mp",
      "block_size": 128,
      "trust_remote_code": "",
      "disable_log_stats": "",
      "enforce_eager": "",
      "max_num_batched_tokens": 2048,
      "max_num_seqs": 256,
      "num_iters_warmup": 5,
      "num_iters": 15
    }
  }
]
.buildkite/performance-benchmarks/tests/latency-tests-hpu.json (new file, mode 100644)
[
  {
    "test_name": "latency_llama8B_tp1",
    "environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "num-iters-warmup": 5,
      "num-iters": 15,
      "max-model-len": 256,
      "async-scheduling": ""
    }
  },
  {
    "test_name": "latency_llama70B_tp4",
    "environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "parameters": {
      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
      "tensor_parallel_size": 4,
      "load_format": "dummy",
      "num-iters-warmup": 5,
      "num-iters": 15,
      "max-model-len": 256,
      "async-scheduling": ""
    }
  },
  {
    "test_name": "latency_mixtral8x7B_tp2",
    "environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "parameters": {
      "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
      "tensor_parallel_size": 2,
      "load_format": "dummy",
      "num-iters-warmup": 5,
      "num-iters": 15,
      "max-model-len": 256,
      "async-scheduling": ""
    }
  },
  {
    "test_name": "latency_deepseek_r1",
    "environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "parameters": {
      "model": "deepseek-ai/DeepSeek-R1",
      "tensor_parallel_size": 8,
      "load_format": "dummy",
      "max-model-len": 2048,
      "dtype": "bfloat16"
    }
  },
  {
    "test_name": "latency_llama4_maverick_17b128e_instruct_fp8",
    "environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "parameters": {
      "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
      "tensor_parallel_size": 8,
      "max-model-len": 512,
      "max-num-seqs": 128,
      "async-scheduling": "",
      "gpu-memory-utilization": 0.95,
      "enable_expert_parallel": ""
    }
  },
  {
    "test_name": "latency_qwen3_8b",
    "environment_variables": {
      "PT_HPU_LAZY_MODE": 1,
      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
      "VLLM_CONTIGUOUS_PA": 1,
      "VLLM_DEFRAG": 1
    },
    "parameters": {
      "model": "Qwen/Qwen3-8B",
      "tensor_parallel_size": 1,
      "max-model-len": 2048,
      "max-num-seqs": 128,
      "dtype": "bfloat16",
      "async-scheduling": ""
    }
  }
]
.buildkite/performance-benchmarks/tests/latency-tests.json (new file, mode 100644)
[
  {
    "test_name": "latency_llama8B_tp1",
    "parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "num_iters_warmup": 5,
      "num_iters": 15
    }
  },
  {
    "test_name": "latency_llama70B_tp4",
    "parameters": {
      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
      "tensor_parallel_size": 4,
      "load_format": "dummy",
      "num-iters-warmup": 5,
      "num-iters": 15
    }
  },
  {
    "test_name": "latency_mixtral8x7B_tp2",
    "parameters": {
      "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
      "tensor_parallel_size": 2,
      "load_format": "dummy",
      "num-iters-warmup": 5,
      "num-iters": 15
    }
  }
]
.buildkite/performance-benchmarks/tests/nightly-tests.json (new file, mode 100644)
[
  {
    "test_name": "llama8B_tp1_sharegpt",
    "qps_list": [4, 8, 16, 32, "inf"],
    "common_parameters": {
      "model": "meta-llama/Meta-Llama-3-8B-Instruct",
      "tp": 1,
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 500,
      "port": 8000,
      "reuse_server": false
    },
    "lmdeploy_server_parameters": {"dtype": "bfloat16"},
    "lmdeploy_client_parameters": {},
    "tgi_server_parameters": {},
    "tgi_client_parameters": {"endpoint": "/generate_stream"},
    "trt_server_parameters": {
      "model_type": "llama",
      "model_dtype": "bfloat16",
      "max_batch_size": 2048,
      "max_input_len": 4096,
      "max_seq_len": 6144,
      "max_num_tokens": 16384,
      "trt_llm_version": "v0.11.0"
    },
    "trt_client_parameters": {"endpoint": "/v2/models/ensemble/generate_stream"},
    "vllm_server_parameters": {
      "disable_log_stats": "",
      "gpu_memory_utilization": 0.9,
      "max_num_seqs": 512,
      "dtype": "bfloat16"
    },
    "vllm_client_parameters": {},
    "sglang_server_parameters": {
      "disable_radix_cache": "",
      "enable_torch_compile": "",
      "dtype": "bfloat16"
    },
    "sglang_client_parameters": {}
  },
  {
    "test_name": "llama8B_tp1_sonnet_512_16",
    "qps_list": [4, 8, 16, 32, "inf"],
    "common_parameters": {
      "model": "meta-llama/Meta-Llama-3-8B-Instruct",
      "tp": 1,
      "dataset_name": "sonnet",
      "dataset_path": "./sonnet_4x.txt",
      "num_prompts": 500,
      "port": 8000,
      "sonnet_input_len": 512,
      "sonnet_output_len": 16,
      "sonnet_prefix_len": 50,
      "reuse_server": true
    },
    "lmdeploy_server_parameters": {"dtype": "bfloat16"},
    "lmdeploy_client_parameters": {},
    "tgi_server_parameters": {},
    "tgi_client_parameters": {"endpoint": "/generate_stream"},
    "trt_server_parameters": {
      "model_type": "llama",
      "model_dtype": "bfloat16",
      "max_batch_size": 2048,
      "max_input_len": 4096,
      "max_seq_len": 6144,
      "max_num_tokens": 16384,
      "trt_llm_version": "v0.11.0"
    },
    "trt_client_parameters": {"endpoint": "/v2/models/ensemble/generate_stream"},
    "vllm_server_parameters": {
      "disable_log_stats": "",
      "gpu_memory_utilization": 0.9,
      "max_num_seqs": 512,
      "dtype": "bfloat16"
    },
    "vllm_client_parameters": {},
    "sglang_server_parameters": {
      "disable_radix_cache": "",
      "enable_torch_compile": "",
      "dtype": "bfloat16"
    },
    "sglang_client_parameters": {}
  },
  {
    "test_name": "llama8B_tp1_sonnet_512_256",
    "qps_list": [4, 8, 16, 32, "inf"],
    "common_parameters": {
      "model": "meta-llama/Meta-Llama-3-8B-Instruct",
      "tp": 1,
      "dataset_name": "sonnet",
      "dataset_path": "./sonnet_4x.txt",
      "num_prompts": 500,
      "port": 8000,
      "sonnet_input_len": 512,
      "sonnet_output_len": 256,
      "sonnet_prefix_len": 50,
      "reuse_server": true
    },
    "lmdeploy_server_parameters": {"dtype": "bfloat16"},
    "lmdeploy_client_parameters": {},
    "tgi_server_parameters": {},
    "tgi_client_parameters": {"endpoint": "/generate_stream"},
    "trt_server_parameters": {
      "model_type": "llama",
      "model_dtype": "bfloat16",
      "max_batch_size": 2048,
      "max_input_len": 4096,
      "max_seq_len": 6144,
      "max_num_tokens": 16384,
      "trt_llm_version": "v0.11.0"
    },
    "trt_client_parameters": {"endpoint": "/v2/models/ensemble/generate_stream"},
    "vllm_server_parameters": {
      "disable_log_stats": "",
      "gpu_memory_utilization": 0.9,
      "max_num_seqs": 512,
      "dtype": "bfloat16"
    },
    "vllm_client_parameters": {},
    "sglang_server_parameters": {
      "disable_radix_cache": "",
      "enable_torch_compile": "",
      "dtype": "bfloat16"
    },
    "sglang_client_parameters": {}
  },
  {
    "test_name": "llama70B_tp4_sharegpt",
    "qps_list": [4, 8, 16, 32, "inf"],
    "common_parameters": {
      "model": "meta-llama/Meta-Llama-3-70B-Instruct",
      "tp": 4,
      "dataset_name": "sharegpt",
      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 500,
      "port": 8000,
      "reuse_server": false
    },
    "lmdeploy_server_parameters": {"dtype": "bfloat16"},
    "lmdeploy_client_parameters": {},
    "tgi_server_parameters": {},
    "tgi_client_parameters": {"endpoint": "/generate_stream"},
    "trt_server_parameters": {
      "model_type": "llama",
      "model_dtype": "bfloat16",
      "max_batch_size": 2048,
      "max_input_len": 4096,
      "max_seq_len": 6144,
      "max_num_tokens": 16384,
      "trt_llm_version": "v0.11.0"
    },
    "trt_client_parameters": {"endpoint": "/v2/models/ensemble/generate_stream"},
    "vllm_server_parameters": {
      "disable_log_stats": "",
      "gpu_memory_utilization": 0.9,
      "max_num_seqs": 512,
      "dtype": "bfloat16"
    },
    "vllm_client_parameters": {},
    "sglang_server_parameters": {
      "disable_radix_cache": "",
      "dtype": "bfloat16"
    },
    "sglang_client_parameters": {}
  },
  {
    "test_name": "llama70B_tp4_sonnet_512_16",
    "qps_list": [4, 8, 16, 32, "inf"],
    "common_parameters": {
      "model": "meta-llama/Meta-Llama-3-70B-Instruct",
      "tp": 4,
      "dataset_name": "sonnet",
      "dataset_path": "./sonnet_4x.txt",
      "num_prompts": 500,
      "port": 8000,
      "sonnet_input_len": 512,
      "sonnet_output_len": 16,
      "sonnet_prefix_len": 50,
      "reuse_server": true
    },
    "lmdeploy_server_parameters": {"dtype": "bfloat16"},
    "lmdeploy_client_parameters": {},
    "tgi_server_parameters": {},
    "tgi_client_parameters": {"endpoint": "/generate_stream"},
    "trt_server_parameters": {
      "model_type": "llama",
      "model_dtype": "bfloat16",
      "max_batch_size": 2048,
      "max_input_len": 4096,
      "max_seq_len": 6144,
      "max_num_tokens": 16384,
      "trt_llm_version": "v0.11.0"
    },
    "trt_client_parameters": {"endpoint": "/v2/models/ensemble/generate_stream"},
    "vllm_server_parameters": {
      "disable_log_stats": "",
      "gpu_memory_utilization": 0.9,
      "max_num_seqs": 512,
      "dtype": "bfloat16"
    },
    "vllm_client_parameters": {},
    "sglang_server_parameters": {
      "disable_radix_cache": "",
      "dtype": "bfloat16"
    },
    "sglang_client_parameters": {}
  },
  {
    "test_name": "llama70B_tp4_sonnet_512_256",
    "qps_list": [4, 8, 16, 32, "inf"],
    "common_parameters": {
      "model": "meta-llama/Meta-Llama-3-70B-Instruct",
      "tp": 4,
      "dataset_name": "sonnet",
      "dataset_path": "./sonnet_4x.txt",
      "num_prompts": 500,
      "port": 8000,
      "sonnet_input_len": 512,
      "sonnet_output_len": 256,
      "sonnet_prefix_len": 50,
      "reuse_server": true
    },
    "lmdeploy_server_parameters": {"dtype": "bfloat16"},
    "lmdeploy_client_parameters": {},
    "tgi_server_parameters": {},
    "tgi_client_parameters": {"endpoint": "/generate_stream"},
    "trt_server_parameters": {
      "model_type": "llama",
      "model_dtype": "bfloat16",
      "max_batch_size": 2048,
      "max_input_len": 4096,
      "max_seq_len": 6144,
      "max_num_tokens": 16384,
      "trt_llm_version": "v0.11.0"
    },
    "trt_client_parameters": {"endpoint": "/v2/models/ensemble/generate_stream"},
    "vllm_server_parameters": {
      "disable_log_stats": "",
      "gpu_memory_utilization": 0.9,
      "max_num_seqs": 512,
      "dtype": "bfloat16"
    },
    "vllm_client_parameters": {},
    "sglang_server_parameters": {
      "disable_radix_cache": "",
      "dtype": "bfloat16"
    },
    "sglang_client_parameters": {}
  }
]
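Each nightly entry carries one parameter block per serving engine (vllm_server_parameters, tgi_server_parameters, and so on). The runner that picks the block for `$CURRENT_LLM_SERVING_ENGINE` is not part of this commit, but the lookup presumably reduces to a jq key construction along these lines (a sketch; key names are taken from the file above):

    engine="${CURRENT_LLM_SERVING_ENGINE:-vllm}"
    jq -c --arg e "$engine" '.[0][$e + "_server_parameters"]' \
      .buildkite/performance-benchmarks/tests/nightly-tests.json
    # for engine=vllm, prints the first test's vllm_server_parameters object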
.buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json (new file, mode 100644)
{
  "defaults": {
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
    "server_environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_SGL_KERNEL": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "server_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "dtype": "bfloat16",
      "distributed_executor_backend": "mp",
      "block_size": 128,
      "trust_remote_code": "",
      "disable_log_stats": "",
      "enforce_eager": "",
      "max_num_batched_tokens": 2048,
      "max_num_seqs": 256,
      "load_format": "dummy"
    },
    "client_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "backend": "vllm",
      "ignore-eos": "",
      "num_prompts": 200
    }
  },
  "tests": [
    {
      "test_name": "serving_llama8B_tp1_sharegpt",
      "server_parameters": {"tensor_parallel_size": 1},
      "client_parameters": {"dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"}
    },
    {
      "test_name": "serving_llama8B_tp2_sharegpt",
      "server_parameters": {"tensor_parallel_size": 2},
      "client_parameters": {"dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"}
    },
    {
      "test_name": "serving_llama8B_tp1_random_128_128",
      "server_parameters": {"tensor_parallel_size": 1},
      "client_parameters": {"dataset_name": "random", "random-input-len": 128, "random-output-len": 128}
    },
    {
      "test_name": "serving_llama8B_tp2_random_128_128",
      "server_parameters": {"tensor_parallel_size": 2},
      "client_parameters": {"dataset_name": "random", "random-input-len": 128, "random-output-len": 128}
    },
    {
      "test_name": "serving_llama8B_tp1_random_128_2048",
      "server_parameters": {"tensor_parallel_size": 1},
      "client_parameters": {"dataset_name": "random", "random-input-len": 128, "random-output-len": 2048}
    },
    {
      "test_name": "serving_llama8B_tp2_random_128_2048",
      "server_parameters": {"tensor_parallel_size": 2},
      "client_parameters": {"dataset_name": "random", "random-input-len": 128, "random-output-len": 2048}
    },
    {
      "test_name": "serving_llama8B_tp1_random_2048_128",
      "server_parameters": {"tensor_parallel_size": 1},
      "client_parameters": {"dataset_name": "random", "random-input-len": 2048, "random-output-len": 128}
    },
    {
      "test_name": "serving_llama8B_tp2_random_2048_128",
      "server_parameters": {"tensor_parallel_size": 2},
      "client_parameters": {"dataset_name": "random", "random-input-len": 2048, "random-output-len": 128}
    }
  ]
}
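The `defaults`/`tests` layout above relies on the merge performed by `merge_serving_tests_stream`, where per-test keys override the defaults. The merge for a single test can be reproduced standalone with the same jq idiom; a sketch:

    jq -c '
      . as $root
      | ($root.defaults // {}) as $d
      | $root.tests[1]
      | .server_parameters = (($d.server_parameters // {}) + (.server_parameters // {}))
    ' .buildkite/performance-benchmarks/tests/serving-tests-arm64-cpu.json
    # tests[1] keeps every default server parameter, but its own
    # tensor_parallel_size (2) replaces the default value (1)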
.buildkite/performance-benchmarks/tests/serving-tests-cpu-embed.json (new file, mode 100644)
{
  "defaults": {
    "qps_list": ["inf"],
    "max_concurrency_list": [32, 64, 128],
    "server_environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_SGL_KERNEL": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "server_parameters": {
      "dtype": "bfloat16",
      "model": "jinaai/jina-embeddings-v3",
      "trust_remote_code": ""
    },
    "client_parameters": {
      "model": "jinaai/jina-embeddings-v3",
      "backend": "openai-embeddings",
      "endpoint": "/v1/embeddings",
      "dataset_name": "sharegpt",
      "dataset_path": "ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200
    }
  },
  "tests": [
    {
      "test_name": "serving_jina_embed_v3_tp1_sharegpt",
      "server_parameters": {"tensor_parallel_size": 1},
      "client_parameters": {}
    }
  ]
}
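After the defaults merge and `json2args`, the single embedding test above yields a client command roughly like the sketch below (the request rate comes from `qps_list` and the concurrency from the first entry of `max_concurrency_list`; the run then repeats for concurrency 64 and 128):

    vllm bench serve \
      --save-result \
      --result-dir results/ \
      --result-filename serving_jina_embed_v3_tp1_sharegpt_qps_inf_concurrency_32.json \
      --request-rate inf \
      --max-concurrency 32 \
      --model jinaai/jina-embeddings-v3 \
      --backend openai-embeddings \
      --endpoint /v1/embeddings \
      --dataset-name sharegpt \
      --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \
      --num-prompts 200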
.buildkite/performance-benchmarks/tests/serving-tests-cpu-text.json (new file, mode 100644)
{
  "defaults": {
    "qps_list": ["inf"],
    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
    "server_environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_SGL_KERNEL": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "server_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "dtype": "bfloat16",
      "distributed_executor_backend": "mp",
      "block_size": 128,
      "trust_remote_code": "",
      "disable_log_stats": "",
      "max_num_batched_tokens": 2048,
      "max_num_seqs": 256
    },
    "client_parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "backend": "vllm",
      "ignore-eos": "",
      "num_prompts": 200
    }
  },
  "tests": [
    {
      "test_name": "serving_llama8B_tp1_sharegpt",
      "server_parameters": {"tensor_parallel_size": 1},
      "client_parameters": {"dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"}
    },
    {
      "test_name": "serving_llama8B_tp2_sharegpt",
      "server_parameters": {"tensor_parallel_size": 2},
      "client_parameters": {"dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json"}
    },
    {
      "test_name": "serving_llama8B_tp1_random_128_128",
      "server_parameters": {"tensor_parallel_size": 1},
      "client_parameters": {"dataset_name": "random", "random-input-len": 128, "random-output-len": 128}
    },
    {
      "test_name": "serving_llama8B_tp2_random_128_128",
      "server_parameters": {"tensor_parallel_size": 2},
      "client_parameters": {"dataset_name": "random", "random-input-len": 128, "random-output-len": 128}
    },
    {
      "test_name": "serving_llama8B_tp4_random_128_128",
      "server_parameters": {"tensor_parallel_size": 4},
      "client_parameters": {"dataset_name": "random", "random-input-len": 128, "random-output-len": 128}
    },
    {
      "test_name": "serving_llama8B_tp1_random_128_2048",
      "server_parameters": {"tensor_parallel_size": 1},
      "client_parameters": {"dataset_name": "random", "random-input-len": 128, "random-output-len": 2048}
    },
    {
      "test_name": "serving_llama8B_tp2_random_128_2048",
      "server_parameters": {"tensor_parallel_size": 2},
      "client_parameters": {"dataset_name": "random", "random-input-len": 128, "random-output-len": 2048}
    },
    {
      "test_name": "serving_llama8B_tp4_random_128_2048",
      "server_parameters": {"tensor_parallel_size": 4},
      "client_parameters": {"dataset_name": "random", "random-input-len": 128, "random-output-len": 2048}
    },
    {
      "test_name": "serving_llama8B_tp1_random_2048_128",
      "server_parameters": {"tensor_parallel_size": 1},
      "client_parameters": {"dataset_name": "random", "random-input-len": 2048, "random-output-len": 128}
    },
    {
      "test_name": "serving_llama8B_tp2_random_2048_128",
      "server_parameters": {"tensor_parallel_size": 2},
      "client_parameters": {"dataset_name": "random", "random-input-len": 2048, "random-output-len": 128}
    },
    {
      "test_name": "serving_llama8B_tp4_random_2048_128",
      "server_parameters": {"tensor_parallel_size": 4},
      "client_parameters": {"dataset_name": "random", "random-input-len": 2048, "random-output-len": 128}
    },
    {
      "test_name": "serving_llama8B_int4_tp1_random_128_128",
      "server_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "tensor_parallel_size": 1},
      "client_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128}
    },
    {
      "test_name": "serving_llama8B_int4_tp2_random_128_128",
      "server_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "tensor_parallel_size": 2},
      "client_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128}
    },
    {
      "test_name": "serving_llama8B_int4_tp4_random_128_128",
      "server_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "tensor_parallel_size": 4},
      "client_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128}
    },
    {
      "test_name": "serving_llama3B_tp1_random_128_128",
      "server_parameters": {"model": "meta-llama/Llama-3.2-3B-Instruct", "tensor_parallel_size": 1},
      "client_parameters": {"model": "meta-llama/Llama-3.2-3B-Instruct", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128}
    },
    {
      "test_name": "serving_granite2B_tp1_random_128_128",
      "server_parameters": {"model": "ibm-granite/granite-3.2-2b-instruct", "tensor_parallel_size": 1},
      "client_parameters": {"model": "ibm-granite/granite-3.2-2b-instruct", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128}
    },
    {
      "test_name": "serving_qwen1.7B_tp1_random_128_128",
      "server_parameters": {"model": "Qwen/Qwen3-1.7B", "tensor_parallel_size": 1},
      "client_parameters": {"model": "Qwen/Qwen3-1.7B", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128}
    },
    {
      "test_name": "serving_qwen4B_tp1_random_128_128",
      "server_parameters": {"model": "Qwen/Qwen3-4B", "tensor_parallel_size": 1},
      "client_parameters": {"model": "Qwen/Qwen3-4B", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128}
    },
    {
      "test_name": "serving_qwen8B_tp1_random_128_128",
      "server_parameters": {"model": "Qwen/Qwen3-8B", "tensor_parallel_size": 1},
      "client_parameters": {"model": "Qwen/Qwen3-8B", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128}
    },
    {
      "test_name": "serving_glm9B_tp1_random_128_128",
      "server_parameters": {"model": "zai-org/glm-4-9b-hf", "tensor_parallel_size": 1},
      "client_parameters": {"model": "zai-org/glm-4-9b-hf", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128}
    },
    {
      "test_name": "serving_gemma7B_tp1_random_128_128",
      "server_parameters": {"model": "google/gemma-7b", "tensor_parallel_size": 1},
      "client_parameters": {"model": "google/gemma-7b", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128}
    }
  ]
}
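Since `run_serving_tests` matches `TEST_SELECTOR` as a regex against each `test_name`, individual cases from a large file like this can be run in isolation; a sketch:

    # Run only the Qwen3-8B random 128/128 case from the file above.
    TEST_SELECTOR='^serving_qwen8B_tp1_random_128_128$' \
    ON_CPU=1 \
    bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh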
.buildkite/performance-benchmarks/tests/serving-tests-cpu.json (new file, mode 100644)
(diff collapsed in this view; content not shown)
.buildkite/performance-benchmarks/tests/serving-tests-hpu.json (new file, mode 100644)
(diff collapsed in this view; content not shown)
.buildkite/performance-benchmarks/tests/serving-tests.json (new file, mode 100644)
(diff collapsed in this view; content not shown)
.buildkite/performance-benchmarks/tests/throughput-tests-arm64-cpu.json (new file, mode 100644)
[
  {
    "test_name": "throughput_llama8B_tp1",
    "environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "dtype": "bfloat16",
      "distributed_executor_backend": "mp",
      "block_size": 128,
      "trust_remote_code": "",
      "disable_log_stats": "",
      "enforce_eager": "",
      "max_num_batched_tokens": 2048,
      "max_num_seqs": 256,
      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200,
      "backend": "vllm"
    }
  }
]
.buildkite/performance-benchmarks/tests/throughput-tests-cpu.json (new file, mode 100644)
[
  {
    "test_name": "throughput_llama8B_tp2",
    "environment_variables": {
      "VLLM_RPC_TIMEOUT": 100000,
      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
      "VLLM_CPU_SGL_KERNEL": 1,
      "VLLM_CPU_KVCACHE_SPACE": 40
    },
    "parameters": {
      "model": "meta-llama/Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 2,
      "dtype": "bfloat16",
      "distributed_executor_backend": "mp",
      "block_size": 128,
      "trust_remote_code": "",
      "disable_log_stats": "",
      "enforce_eager": "",
      "max_num_batched_tokens": 2048,
      "max_num_seqs": 256,
      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
      "num_prompts": 200,
      "backend": "vllm"
    }
  }
]
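As with the latency suites, this entry expands through `json2envs`/`json2args` into a `vllm bench throughput` invocation; approximately:

    VLLM_RPC_TIMEOUT=100000 VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 VLLM_ENGINE_ITERATION_TIMEOUT_S=120 \
    VLLM_CPU_SGL_KERNEL=1 VLLM_CPU_KVCACHE_SPACE=40 \
    vllm bench throughput \
      --output-json results/throughput_llama8B_tp2.json \
      --model meta-llama/Llama-3.1-8B-Instruct \
      --tensor-parallel-size 2 \
      --dtype bfloat16 \
      --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json \
      --num-prompts 200 \
      --backend vllm
      # ...remaining keys follow the same underscore-to-dash mapping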
.buildkite/performance-benchmarks/tests/throughput-tests-hpu.json (new file, mode 100644)
(diff collapsed in this view; content not shown)
.buildkite/performance-benchmarks/tests/throughput-tests.json (new file, mode 100644)
(diff collapsed in this view; content not shown)
.buildkite/release-pipeline.yaml (new file, mode 100644)
(diff collapsed in this view; content not shown)
(This is page 1 of 8 of the commit diff; the remaining changed files continue on subsequent pages.)