Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ad385667
Commit
ad385667
authored
Oct 23, 2024
by
zhuwenwen
Browse files
Merge branch 'v0.6.3.post1-dev'
parents
be0967c1
903593d3
Changes
967
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1283 additions
and
917 deletions
+1283
-917
.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+48
-10
.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
...kite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+357
-0
.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
.../nightly-benchmarks/scripts/run-performance-benchmarks.sh
+30
-29
.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
+0
-216
.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+0
-214
.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
+0
-221
.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
...ite/nightly-benchmarks/scripts/summary-nightly-results.py
+8
-1
.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+3
-1
.buildkite/nightly-benchmarks/tests/latency-tests.json
.buildkite/nightly-benchmarks/tests/latency-tests.json
+2
-2
.buildkite/nightly-benchmarks/tests/nightly-tests.json
.buildkite/nightly-benchmarks/tests/nightly-tests.json
+237
-30
.buildkite/nightly-benchmarks/tests/serving-tests.json
.buildkite/nightly-benchmarks/tests/serving-tests.json
+6
-6
.buildkite/nightly-benchmarks/tests/throughput-tests.json
.buildkite/nightly-benchmarks/tests/throughput-tests.json
+2
-2
.buildkite/release-pipeline.yaml
.buildkite/release-pipeline.yaml
+21
-7
.buildkite/run-amd-test.sh
.buildkite/run-amd-test.sh
+75
-5
.buildkite/run-cpu-test-ppc64le.sh
.buildkite/run-cpu-test-ppc64le.sh
+39
-0
.buildkite/run-cpu-test.sh
.buildkite/run-cpu-test.sh
+19
-2
.buildkite/run-tpu-test.sh
.buildkite/run-tpu-test.sh
+1
-2
.buildkite/run-xpu-test.sh
.buildkite/run-xpu-test.sh
+1
-1
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+402
-168
.dockerignore
.dockerignore
+32
-0
No files found.
.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
View file @
ad385667
...
@@ -8,6 +8,7 @@ main() {
...
@@ -8,6 +8,7 @@ main() {
(
which wget
&&
which curl
)
||
(
apt-get update
&&
apt-get
install
-y
wget curl
)
(
which wget
&&
which curl
)
||
(
apt-get update
&&
apt-get
install
-y
wget curl
)
(
which jq
)
||
(
apt-get update
&&
apt-get
-y
install
jq
)
(
which jq
)
||
(
apt-get update
&&
apt-get
-y
install
jq
)
(
which zip
)
||
(
apt-get
install
-y
zip
)
if
[
!
-f
/workspace/buildkite-agent
]
;
then
if
[
!
-f
/workspace/buildkite-agent
]
;
then
echo
"buildkite-agent binary not found. Skip plotting the results."
echo
"buildkite-agent binary not found. Skip plotting the results."
...
@@ -24,17 +25,54 @@ main() {
...
@@ -24,17 +25,54 @@ main() {
ls
ls
ls
results/
ls
results/
# generate figures
# upload benchmark results
python3
-m
pip
install
tabulate pandas matplotlib
zip
-r
results.zip results/
python3
$VLLM_SOURCE_CODE_LOC
/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
\
/workspace/buildkite-agent artifact upload
"results.zip"
--description
$description
\
--results-folder
results/
# upload benchmarking scripts
cd
$VLLM_SOURCE_CODE_LOC
/
# upload results and figures
zip
-r
nightly-benchmarks.zip .buildkite/ benchmarks/
/workspace/buildkite-agent artifact upload
"nightly_results.png"
/workspace/buildkite-agent artifact upload
"nightly-benchmarks.zip"
/workspace/buildkite-agent artifact upload
$VLLM_SOURCE_CODE_LOC
/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
/workspace/buildkite-agent artifact upload
$VLLM_SOURCE_CODE_LOC
/.buildkite/nightly-benchmarks/tests/nightly-tests.json
cd
$VLLM_SOURCE_CODE_LOC
/.buildkite/nightly-benchmarks/
/workspace/buildkite-agent annotate
--style
"success"
--context
"nightly-benchmarks-results"
--append
< nightly_results.md
# upload benchmarking pipeline
/workspace/buildkite-agent artifact upload
"nightly-pipeline.yaml"
cd
$VLLM_SOURCE_CODE_LOC
/.buildkite/nightly-benchmarks/
/workspace/buildkite-agent annotate
--style
"success"
--context
"nightly-benchmarks-results"
--append
< nightly-annotation.md
# The figures should be generated by a separate process outside the CI/CD pipeline
# # generate figures
# python3 -m pip install tabulate pandas matplotlib
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
# --description $description \
# --results-folder results/
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
# --description $description \
# --results-folder results/ \
# --dataset sharegpt
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
# --description $description \
# --results-folder results/ \
# --dataset sonnet_2048_128
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
# --description $description \
# --results-folder results/ \
# --dataset sonnet_128_2048
# # upload results and figures
# /workspace/buildkite-agent artifact upload "nightly_results*.png"
# /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
# /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
# /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
}
}
main
"
$@
"
main
"
$@
"
\ No newline at end of file
.buildkite/nightly-benchmarks/scripts/run-
lmdeploy-nightly
.sh
→
.buildkite/nightly-benchmarks/scripts/run-
nightly-benchmarks
.sh
View file @
ad385667
#!/bin/bash
#!/bin/bash
set
-o
pipefail
set
-o
pipefail
set
-x
check_gpus
()
{
check_gpus
()
{
# check the number of GPUs and GPU type.
# check the number of GPUs and GPU type.
...
@@ -15,15 +16,66 @@ check_gpus() {
...
@@ -15,15 +16,66 @@ check_gpus() {
echo
"GPU type is
$gpu_type
"
echo
"GPU type is
$gpu_type
"
}
}
kill_gpu_processes
()
{
check_hf_token
()
{
pkill lmdeploy
||
true
# check if HF_TOKEN is available and valid
# waiting for GPU processes to be fully killed
if
[[
-z
"
$HF_TOKEN
"
]]
;
then
sleep
10
echo
"Error: HF_TOKEN is not set."
# Print the GPU memory usage
exit
1
# so that we know if all GPU processes are killed.
elif
[[
!
"
$HF_TOKEN
"
=
~ ^hf_
]]
;
then
gpu_memory_usage
=
$(
nvidia-smi
--query-gpu
=
memory.used
--format
=
csv,noheader,nounits
-i
0
)
echo
"Error: HF_TOKEN does not start with 'hf_'."
# The memory usage should be 0 MB.
exit
1
echo
"GPU 0 Memory Usage:
$gpu_memory_usage
MB"
else
echo
"HF_TOKEN is set and valid."
fi
}
#######################################
# Upload everything under $RESULTS_FOLDER to Buildkite as artifacts.
# When the agent binary is absent the upload is skipped and the function
# still reports success.
# Globals:   RESULTS_FOLDER (read)
# Outputs:   a notice on stdout when the upload is skipped
# Returns:   0 when skipped, otherwise the agent's exit status
#######################################
upload_to_buildkite() {
  local agent=/workspace/buildkite-agent

  if [ -f "$agent" ]; then
    # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
    # The glob is passed literally (quoted) to the agent rather than being
    # expanded by the shell.
    "$agent" artifact upload "$RESULTS_FOLDER/*"
  else
    echo "buildkite-agent binary not found. Skip uploading the results."
    return 0
  fi
}
#######################################
# Detect which serving engine the current container provides and export it
# as CURRENT_LLM_SERVING_ENGINE. Probes are tried in order: the `lmdeploy`
# command, /tgi-entrypoint.sh, the `trtllm-build` command, /sgl-workspace,
# /vllm-workspace. The first match wins.
# Globals:   CURRENT_LLM_SERVING_ENGINE (exported)
# Outputs:   "Container: <engine>" on stdout; an error on stderr when no
#            probe matches
# Returns:   0 when an engine was detected, 1 otherwise
#######################################
get_current_llm_serving_engine() {
  # `command -v` is a builtin, so detection does not depend on `which`
  # being installed in the image.
  if command -v lmdeploy > /dev/null; then
    echo "Container: lmdeploy"
    export CURRENT_LLM_SERVING_ENGINE=lmdeploy
    return
  fi

  if [ -e /tgi-entrypoint.sh ]; then
    echo "Container: tgi"
    export CURRENT_LLM_SERVING_ENGINE=tgi
    return
  fi

  if command -v trtllm-build > /dev/null; then
    echo "Container: tensorrt-llm"
    export CURRENT_LLM_SERVING_ENGINE=trt
    return
  fi

  if [ -e /sgl-workspace ]; then
    echo "Container: sglang"
    export CURRENT_LLM_SERVING_ENGINE=sglang
    return
  fi

  if [ -e /vllm-workspace ]; then
    echo "Container: vllm"
    # NOTE(review): an earlier comment here spoke of moving to an unrelated
    # directory to avoid importing vllm from the current folder, but this
    # function never changes directory - confirm that happens elsewhere.
    export CURRENT_LLM_SERVING_ENGINE=vllm
    return
  fi

  # Nothing matched: report it instead of silently leaving the variable
  # unset, which would make later "${CURRENT_LLM_SERVING_ENGINE}_..." lookups
  # expand to a bogus key.
  echo "Error: unable to detect the LLM serving engine of this container." >&2
  return 1
}
}
json2args
()
{
json2args
()
{
...
@@ -42,6 +94,19 @@ json2args() {
...
@@ -42,6 +94,19 @@ json2args() {
echo
"
$args
"
echo
"
$args
"
}
}
#######################################
# Kill the processes of every supported serving engine, then block until
# the first GPU reported by nvidia-smi uses less than 1000 MB of memory.
# Outputs:   a warning on stderr if the memory reading is not a number
# Returns:   0
#######################################
kill_gpu_processes() {
  local pattern used_mb

  # pkill -f matches against the full command line; a non-zero status only
  # means nothing matched, so it is deliberately ignored.
  for pattern in python python3 tritonserver pt_main_thread text-generation lmdeploy; do
    pkill -f "$pattern"
  done

  while :; do
    used_mb=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)
    used_mb=${used_mb//[[:space:]]/}

    # An empty or non-numeric reading (e.g. nvidia-smi failed) cannot be
    # compared; stop waiting instead of tripping a test-expression error.
    if [[ ! "$used_mb" =~ ^[0-9]+$ ]]; then
      echo "Could not read GPU memory usage (got '$used_mb'); not waiting." >&2
      break
    fi

    # 10# forces base 10 so a reading with leading zeros is not parsed as octal.
    if (( 10#$used_mb < 1000 )); then
      break
    fi
    sleep 1
  done
}
wait_for_server
()
{
wait_for_server
()
{
# wait for vllm server to start
# wait for vllm server to start
# return 1 if vllm server crashes
# return 1 if vllm server crashes
...
@@ -51,6 +116,14 @@ wait_for_server() {
...
@@ -51,6 +116,14 @@ wait_for_server() {
done'
&&
return
0
||
return
1
done'
&&
return
0
||
return
1
}
}
#######################################
# Ensure that the given command is available, installing the apt package of
# the same name when it is not.
# Arguments: $1 - command name (also used as the apt package name)
# Outputs:   an error on stderr when no command name is given
# Returns:   0 if the command already exists or was installed, 1 on a
#            missing argument, otherwise the failing apt-get status
#######################################
ensure_installed() {
  local cmd=${1:-}

  if [[ -z "$cmd" ]]; then
    echo "ensure_installed: missing command name" >&2
    return 1
  fi

  # `command -v` is a shell builtin, so the probe itself works on minimal
  # images that do not ship `which`.
  if ! command -v "$cmd" > /dev/null; then
    apt-get update && apt-get install -y "$cmd"
  fi
}
run_serving_tests
()
{
run_serving_tests
()
{
# run serving tests using `benchmark_serving.py`
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
# $1: a json file specifying serving test cases
...
@@ -69,8 +142,8 @@ run_serving_tests() {
...
@@ -69,8 +142,8 @@ run_serving_tests() {
continue
continue
fi
fi
#
a
ppend
lmdeploy
to the test name
# p
re
pend
the current serving engine
to the test name
test_name
=
lmdeploy
_
$test_name
test_name
=
${
CURRENT_LLM_SERVING_ENGINE
}
_
$
{
test_name
}
# get common parameters
# get common parameters
common_params
=
$(
echo
"
$params
"
| jq
-r
'.common_parameters'
)
common_params
=
$(
echo
"
$params
"
| jq
-r
'.common_parameters'
)
...
@@ -80,13 +153,11 @@ run_serving_tests() {
...
@@ -80,13 +153,11 @@ run_serving_tests() {
dataset_path
=
$(
echo
"
$common_params
"
| jq
-r
'.dataset_path'
)
dataset_path
=
$(
echo
"
$common_params
"
| jq
-r
'.dataset_path'
)
port
=
$(
echo
"
$common_params
"
| jq
-r
'.port'
)
port
=
$(
echo
"
$common_params
"
| jq
-r
'.port'
)
num_prompts
=
$(
echo
"
$common_params
"
| jq
-r
'.num_prompts'
)
num_prompts
=
$(
echo
"
$common_params
"
| jq
-r
'.num_prompts'
)
reuse_server
=
$(
echo
"
$common_params
"
| jq
-r
'.reuse_server'
)
# get client and server arguments
# get client and server arguments
server_params
=
$(
echo
"
$params
"
| jq
-r
'.lmdeploy_server_parameters'
)
server_params
=
$(
echo
"
$params
"
| jq
-r
".
${
CURRENT_LLM_SERVING_ENGINE
}
_server_parameters"
)
client_params
=
$(
echo
"
$params
"
| jq
-r
'.lmdeploy_client_parameters'
)
client_params
=
$(
echo
"
$params
"
| jq
-r
".
${
CURRENT_LLM_SERVING_ENGINE
}
_client_parameters"
)
server_args
=
$(
json2args
"
$server_params
"
)
client_args
=
$(
json2args
"
$client_params
"
)
client_args
=
$(
json2args
"
$client_params
"
)
qps_list
=
$(
echo
"
$params
"
| jq
-r
'.qps_list'
)
qps_list
=
$(
echo
"
$params
"
| jq
-r
'.qps_list'
)
qps_list
=
$(
echo
"
$qps_list
"
| jq
-r
'.[] | @sh'
)
qps_list
=
$(
echo
"
$qps_list
"
| jq
-r
'.[] | @sh'
)
...
@@ -94,40 +165,44 @@ run_serving_tests() {
...
@@ -94,40 +165,44 @@ run_serving_tests() {
# check if there is enough GPU to run the test
# check if there is enough GPU to run the test
if
[[
$gpu_count
-lt
$tp
]]
;
then
if
[[
$gpu_count
-lt
$tp
]]
;
then
echo
"Required
tensor-parallel-size
$tp
but only
$gpu_count
GPU found. Skip testcase
$test_name
."
echo
"Required
num-shard
$tp
but only
$gpu_count
GPU found. Skip testcase
$test_name
."
continue
continue
fi
fi
# prepare tokenizer
if
[[
$reuse_server
==
"true"
]]
;
then
rm
-rf
/tokenizer_cache
echo
"Reuse previous server for test case
$test_name
"
mkdir
/tokenizer_cache
else
python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
\
kill_gpu_processes
--model
"
$model
"
\
bash
$VLLM_SOURCE_CODE_LOC
/.buildkite/nightly-benchmarks/scripts/launch-server.sh
\
--cachedir
/tokenizer_cache
"
$server_params
"
"
$common_params
"
fi
server_command
=
"lmdeploy serve api_server
$model
\
--tp
$tp
\
--server-port
$port
\
$server_args
"
# run the server
echo
"Running test case
$test_name
"
echo
"Server command:
$server_command
"
bash
-c
"
$server_command
"
&
# wait until the server is alive
wait_for_server
wait_for_server
if
[
$?
-eq
0
]
;
then
if
[
$?
-eq
0
]
;
then
echo
""
echo
""
echo
"
lmdeploy
server is up and running."
echo
"
$CURRENT_LLM_SERVING_ENGINE
server is up and running."
else
else
echo
""
echo
""
echo
"
lmdeploy
failed to start within the timeout period."
echo
"
$CURRENT_LLM_SERVING_ENGINE
failed to start within the timeout period."
break
break
fi
fi
# get model name
# prepare tokenizer
model_name
=
$(
python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
)
# this is required for lmdeploy.
cd
$VLLM_SOURCE_CODE_LOC
/benchmarks
rm
-rf
/tokenizer_cache
mkdir
/tokenizer_cache
python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
\
--model
"
$model
"
\
--cachedir
/tokenizer_cache
cd
$VLLM_SOURCE_CODE_LOC
/benchmarks
# change model name for lmdeploy (it will not follow standard hf name)
if
[[
"
$CURRENT_LLM_SERVING_ENGINE
"
==
"lmdeploy"
]]
;
then
model
=
$(
python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
)
fi
# iterate over different QPS
# iterate over different QPS
for
qps
in
$qps_list
;
do
for
qps
in
$qps_list
;
do
...
@@ -140,31 +215,79 @@ run_serving_tests() {
...
@@ -140,31 +215,79 @@ run_serving_tests() {
new_test_name
=
$test_name
"_qps_"
$qps
new_test_name
=
$test_name
"_qps_"
$qps
backend
=
$CURRENT_LLM_SERVING_ENGINE
if
[[
$backend
=
"trt"
]]
;
then
backend
=
"tensorrt-llm"
fi
if
[[
"
$backend
"
==
*
"vllm"
*
]]
;
then
backend
=
"vllm"
fi
if
[[
"
$dataset_name
"
=
"sharegpt"
]]
;
then
client_command
=
"python3 benchmark_serving.py
\
--backend
$backend
\
--tokenizer /tokenizer_cache
\
--model
$model
\
--dataset-name
$dataset_name
\
--dataset-path
$dataset_path
\
--num-prompts
$num_prompts
\
--port
$port
\
--save-result
\
--result-dir
$RESULTS_FOLDER
\
--result-filename
${
new_test_name
}
.json
\
--request-rate
$qps
\
--ignore-eos
\
$client_args
"
elif
[[
"
$dataset_name
"
=
"sonnet"
]]
;
then
sonnet_input_len
=
$(
echo
"
$common_params
"
| jq
-r
'.sonnet_input_len'
)
sonnet_output_len
=
$(
echo
"
$common_params
"
| jq
-r
'.sonnet_output_len'
)
sonnet_prefix_len
=
$(
echo
"
$common_params
"
| jq
-r
'.sonnet_prefix_len'
)
client_command
=
"python3 benchmark_serving.py
\
client_command
=
"python3 benchmark_serving.py
\
--backend
lmdeploy
\
--backend
$backend
\
--tokenizer /tokenizer_cache
\
--tokenizer /tokenizer_cache
\
--model
$model
\
--dataset-name
$dataset_name
\
--dataset-name
$dataset_name
\
--dataset-path
$dataset_path
\
--dataset-path
$dataset_path
\
--num-prompts
$num_prompts
\
--num-prompts
$num_prompts
\
--sonnet-input-len
$sonnet_input_len
\
--sonnet-output-len
$sonnet_output_len
\
--sonnet-prefix-len
$sonnet_prefix_len
\
--port
$port
\
--port
$port
\
--save-result
\
--save-result
\
--result-dir
$RESULTS_FOLDER
\
--result-dir
$RESULTS_FOLDER
\
--result-filename
${
new_test_name
}
.json
\
--result-filename
${
new_test_name
}
.json
\
--request-rate
$qps
\
--request-rate
$qps
\
--model
\"
$model_name
\"
\
--ignore-eos
\
$client_args
"
$client_args
"
else
echo
"The dataset name must be either 'sharegpt' or 'sonnet'. Got
$dataset_name
."
exit
1
fi
echo
"Running test case
$test_name
with qps
$qps
"
echo
"Running test case
$test_name
with qps
$qps
"
echo
"Client command:
$client_command
"
echo
"Client command:
$client_command
"
eval
"
$client_command
"
eval
"
$client_command
"
server_command
=
"None"
# record the benchmarking commands
# record the benchmarking commands
jq_output
=
$(
jq
-n
\
jq_output
=
$(
jq
-n
\
--arg
server
"
$server_command
"
\
--arg
server
"
$server_command
"
\
--arg
client
"
$client_command
"
\
--arg
client
"
$client_command
"
\
--arg
gpu
"
$gpu_type
"
\
--arg
gpu
"
$gpu_type
"
\
--arg
engine
"
lmdeploy
"
\
--arg
engine
"
$CURRENT_LLM_SERVING_ENGINE
"
\
'{
'{
server_command: $server,
server_command: $server,
client_command: $client,
client_command: $client,
...
@@ -175,42 +298,58 @@ run_serving_tests() {
...
@@ -175,42 +298,58 @@ run_serving_tests() {
done
done
# clean up
kill_gpu_processes
rm
-rf
/root/.cache/huggingface/
*
done
done
kill_gpu_processes
}
}
upload_to_buildkite
()
{
prepare_dataset
()
{
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
# download sharegpt dataset
if
[
!
-f
/workspace/buildkite-agent
]
;
then
cd
$VLLM_SOURCE_CODE_LOC
/benchmarks
echo
"buildkite-agent binary not found. Skip uploading the results."
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
return
0
fi
# duplicate sonnet by 4x, to allow benchmarking with input length 2048
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
cd
$VLLM_SOURCE_CODE_LOC
/benchmarks
/workspace/buildkite-agent artifact upload
"
$RESULTS_FOLDER
/*"
echo
""
>
sonnet_4x.txt
}
for
_
in
{
1..4
}
do
cat
sonnet.txt
>>
sonnet_4x.txt
done
}
main
()
{
main
()
{
# check if the environment variable is successfully injected from yaml
check_gpus
check_gpus
# enter vllm directory
check_hf_token
cd
$VLLM_SOURCE_CODE_LOC
/benchmarks
get_current_llm_serving_engine
pip
install
-U
transformers
# check storage
df
-h
ensure_installed wget
ensure_installed curl
ensure_installed jq
prepare_dataset
cd
$VLLM_SOURCE_CODE_LOC
/benchmarks
declare
-g
RESULTS_FOLDER
=
results/
declare
-g
RESULTS_FOLDER
=
results/
mkdir
-p
$RESULTS_FOLDER
mkdir
-p
$RESULTS_FOLDER
BENCHMARK_ROOT
=
..
/.buildkite/nightly-benchmarks/
BENCHMARK_ROOT
=
$VLLM_SOURCE_CODE_LOC
/.buildkite/nightly-benchmarks/
python
-m
pip
install
transformers
==
4.41.2
# run the test
export
CURRENT_LLM_SERVING_ENGINE
=
lmdeploy
run_serving_tests
$BENCHMARK_ROOT
/tests/nightly-tests.json
run_serving_tests
$BENCHMARK_ROOT
/tests/nightly-tests.json
python
-m
pip
install
tabulate pandas
python
$BENCHMARK_ROOT
/scripts/summary-nightly-results.py
# upload benchmark results to buildkite
python3
-m
pip
install
tabulate pandas
python3
$BENCHMARK_ROOT
/scripts/summary-nightly-results.py
upload_to_buildkite
upload_to_buildkite
}
}
...
...
.buildkite/nightly-benchmarks/
run
-benchmarks
-suite
.sh
→
.buildkite/nightly-benchmarks/
scripts/run-performance
-benchmarks.sh
View file @
ad385667
...
@@ -68,35 +68,38 @@ wait_for_server() {
...
@@ -68,35 +68,38 @@ wait_for_server() {
done'
&&
return
0
||
return
1
done'
&&
return
0
||
return
1
}
}
kill_gpu_processes
()
{
kill_processes_launched_by_current_bash
()
{
# kill all processes on GPU.
# Kill all python processes launched from current bash script
pids
=
$(
nvidia-smi
--query-compute-apps
=
pid
--format
=
csv,noheader
)
current_shell_pid
=
$$
if
[
-z
"
$pids
"
]
;
then
processes
=
$(
ps
-eo
pid,ppid,command |
awk
-v
ppid
=
"
$current_shell_pid
"
-v
proc
=
"
$1
"
'$2 == ppid && $3 ~ proc {print $1}'
)
echo
"No GPU processes found."
if
[
-n
"
$processes
"
]
;
then
echo
"Killing the following processes matching '
$1
':"
echo
"
$processes
"
echo
"
$processes
"
| xargs
kill
-9
else
else
for
pid
in
$pids
;
do
echo
"No processes found matching '
$1
'."
kill
-9
"
$pid
"
echo
"Killed process with PID:
$pid
"
done
echo
"All GPU processes have been killed."
fi
fi
}
kill_gpu_processes
()
{
# waiting for GPU processes to be fully killed
ps
-aux
# loop while nvidia-smi returns any processes
lsof
-t
-i
:8000 | xargs
-r
kill
-9
while
[
-n
"
$(
nvidia-smi
--query-compute-apps
=
pid
--format
=
csv,noheader
)
"
]
;
do
pkill
-f
pt_main_thread
# this line doesn't work now
# ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
pkill
-f
python3
pkill
-f
/usr/bin/python3
# wait until GPU memory usage smaller than 1GB
while
[
$(
nvidia-smi
--query-gpu
=
memory.used
--format
=
csv,noheader,nounits |
head
-n
1
)
-ge
1000
]
;
do
sleep
1
sleep
1
echo
"Waiting for GPU processes to be killed"
done
done
# remove vllm config file
# remove vllm config file
rm
-rf
~/.config/vllm
rm
-rf
~/.config/vllm
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage
=
$(
nvidia-smi
--query-gpu
=
memory.used
--format
=
csv,noheader,nounits
-i
0
)
# The memory usage should be 0 MB.
echo
"GPU 0 Memory Usage:
$gpu_memory_usage
MB"
}
}
upload_to_buildkite
()
{
upload_to_buildkite
()
{
...
@@ -114,7 +117,7 @@ upload_to_buildkite() {
...
@@ -114,7 +117,7 @@ upload_to_buildkite() {
fi
fi
# Use the determined command to annotate and upload artifacts
# Use the determined command to annotate and upload artifacts
$BUILDKITE_AGENT_COMMAND
annotate
--style
"info"
--context
"
$BUILDKITE_LABEL
-benchmark-results"
<
$RESULTS_FOLDER
/benchmark_results.md
$BUILDKITE_AGENT_COMMAND
annotate
--style
"info"
--context
"
$BUILDKITE_LABEL
-benchmark-results"
<
$RESULTS_FOLDER
/benchmark_results.md
$BUILDKITE_AGENT_COMMAND
artifact upload
"
$RESULTS_FOLDER
/*"
$BUILDKITE_AGENT_COMMAND
artifact upload
"
$RESULTS_FOLDER
/*"
}
}
...
@@ -166,7 +169,7 @@ run_latency_tests() {
...
@@ -166,7 +169,7 @@ run_latency_tests() {
latency_command: $latency,
latency_command: $latency,
gpu_type: $gpu
gpu_type: $gpu
}'
)
}'
)
echo
"
$jq_output
"
>
"
$RESULTS_FOLDER
/
$test_name
.commands"
echo
"
$jq_output
"
>
"
$RESULTS_FOLDER
/
$test_name
.commands"
# run the benchmark
# run the benchmark
eval
"
$latency_command
"
eval
"
$latency_command
"
...
@@ -176,7 +179,6 @@ run_latency_tests() {
...
@@ -176,7 +179,6 @@ run_latency_tests() {
done
done
}
}
run_throughput_tests
()
{
run_throughput_tests
()
{
# run throughput tests using `benchmark_throughput.py`
# run throughput tests using `benchmark_throughput.py`
# $1: a json file specifying throughput test cases
# $1: a json file specifying throughput test cases
...
@@ -224,7 +226,7 @@ run_throughput_tests() {
...
@@ -224,7 +226,7 @@ run_throughput_tests() {
throughput_command: $command,
throughput_command: $command,
gpu_type: $gpu
gpu_type: $gpu
}'
)
}'
)
echo
"
$jq_output
"
>
"
$RESULTS_FOLDER
/
$test_name
.commands"
echo
"
$jq_output
"
>
"
$RESULTS_FOLDER
/
$test_name
.commands"
# run the benchmark
# run the benchmark
eval
"
$throughput_command
"
eval
"
$throughput_command
"
...
@@ -256,7 +258,6 @@ run_serving_tests() {
...
@@ -256,7 +258,6 @@ run_serving_tests() {
continue
continue
fi
fi
# get client and server arguments
# get client and server arguments
server_params
=
$(
echo
"
$params
"
| jq
-r
'.server_parameters'
)
server_params
=
$(
echo
"
$params
"
| jq
-r
'.server_parameters'
)
client_params
=
$(
echo
"
$params
"
| jq
-r
'.client_parameters'
)
client_params
=
$(
echo
"
$params
"
| jq
-r
'.client_parameters'
)
...
@@ -334,7 +335,7 @@ run_serving_tests() {
...
@@ -334,7 +335,7 @@ run_serving_tests() {
client_command: $client,
client_command: $client,
gpu_type: $gpu
gpu_type: $gpu
}'
)
}'
)
echo
"
$jq_output
"
>
"
$RESULTS_FOLDER
/
${
new_test_name
}
.commands"
echo
"
$jq_output
"
>
"
$RESULTS_FOLDER
/
${
new_test_name
}
.commands"
done
done
...
@@ -351,6 +352,7 @@ main() {
...
@@ -351,6 +352,7 @@ main() {
# dependencies
# dependencies
(
which wget
&&
which curl
)
||
(
apt-get update
&&
apt-get
install
-y
wget curl
)
(
which wget
&&
which curl
)
||
(
apt-get update
&&
apt-get
install
-y
wget curl
)
(
which jq
)
||
(
apt-get update
&&
apt-get
-y
install
jq
)
(
which jq
)
||
(
apt-get update
&&
apt-get
-y
install
jq
)
(
which lsof
)
||
(
apt-get update
&&
apt-get
install
-y
lsof
)
# get the current IP address, required by benchmark_serving.py
# get the current IP address, required by benchmark_serving.py
export
VLLM_HOST_IP
=
$(
hostname
-I
|
awk
'{print $1}'
)
export
VLLM_HOST_IP
=
$(
hostname
-I
|
awk
'{print $1}'
)
...
@@ -369,7 +371,6 @@ main() {
...
@@ -369,7 +371,6 @@ main() {
run_latency_tests
$QUICK_BENCHMARK_ROOT
/tests/latency-tests.json
run_latency_tests
$QUICK_BENCHMARK_ROOT
/tests/latency-tests.json
run_throughput_tests
$QUICK_BENCHMARK_ROOT
/tests/throughput-tests.json
run_throughput_tests
$QUICK_BENCHMARK_ROOT
/tests/throughput-tests.json
# postprocess benchmarking results
# postprocess benchmarking results
pip
install
tabulate pandas
pip
install
tabulate pandas
python3
$QUICK_BENCHMARK_ROOT
/scripts/convert-results-json-to-markdown.py
python3
$QUICK_BENCHMARK_ROOT
/scripts/convert-results-json-to-markdown.py
...
...
.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
deleted
100644 → 0
View file @
be0967c1
#!/bin/bash
set
-o
pipefail
#######################################
# Check the number of GPUs and the GPU type.
# Globals:   gpu_count (written) - number of lines from `nvidia-smi --list-gpus`
#            gpu_type  (written) - second whitespace-separated word of the
#                                  reported GPU name(s)
# Outputs:   status messages on stdout
# Returns:   0 on success; exits the script with status 1 when no GPU is
#            available
#######################################
check_gpus() {
  local gpu_list gpu_names

  # Only count lines from a successful call: a failing nvidia-smi that still
  # prints a message would otherwise be counted as one GPU.
  gpu_list=$(nvidia-smi --list-gpus) || gpu_list=""

  declare -g gpu_count
  gpu_count=$(grep -c . <<< "$gpu_list")

  if [[ $gpu_count -gt 0 ]]; then
    echo "GPU found."
  else
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi

  gpu_names=$(nvidia-smi --query-gpu=name --format=csv,noheader)
  declare -g gpu_type
  # Flatten all names onto one line and keep the second word.
  read -r _ gpu_type _ <<< "${gpu_names//$'\n'/ }"
  echo "GPU type is $gpu_type"
}
#######################################
# Stop the TGI processes, pause for a fixed 10 seconds, then report the
# memory in use on GPU 0 so the log shows whether the GPU was released.
# Globals:   gpu_memory_usage (written)
# Outputs:   "GPU 0 Memory Usage: <n> MB" on stdout (expected to be 0 MB)
#######################################
kill_gpu_processes() {
  # A non-zero pkill status is ignored on purpose.
  pkill text-generation || :

  # Give the killed processes time to exit.
  sleep 10

  gpu_memory_usage="$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)"
  printf 'GPU 0 Memory Usage: %s MB\n' "$gpu_memory_usage"
}
#######################################
# Transform a JSON object into command line args; '_' in keys is replaced
# by '-'.
# Example:
#   input:  { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
#   output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
# Arguments: $1 - JSON object as a string
# Outputs:   the argument string on stdout
# Returns:   0 on success, jq's exit status if the input cannot be converted
#######################################
json2args() {
  local json_string=$1
  local args

  # Declaration and assignment are split so a jq failure is not masked by
  # the exit status of `local`.
  args=$(jq -r '
    to_entries |
    map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
    join(" ")
  ' <<< "$json_string") || return

  echo "$args"
}
#######################################
# Poll the server's /generate_stream endpoint once per second until curl
# succeeds or the timeout expires.
# Arguments: $1 - port to poll (optional, default 8000)
#            $2 - timeout in seconds (optional, default 1200)
# Returns:   0 once the endpoint answers, 1 if it did not within the timeout
#######################################
wait_for_server() {
  local port=${1:-8000}
  local timeout_s=${2:-1200}

  # The port is handed to the inner shell as a positional parameter instead
  # of being spliced into the script text.
  if timeout "$timeout_s" bash -c '
    until curl -s "localhost:$1/generate_stream" > /dev/null; do
      sleep 1
    done' _ "$port"; then
    return 0
  fi
  return 1
}
#######################################
# Run serving tests using `benchmark_serving.py` against a TGI server.
# For every test case in the JSON file: start the server in the background,
# wait for it, run the client once per QPS value, record the commands used
# next to the results, then clean up.
# Globals:   TEST_SELECTOR (read, optional regex filter on test names)
#            gpu_count, gpu_type, RESULTS_FOLDER (read)
# Arguments: $1 - a json file specifying serving test cases
# Outputs:   progress messages on stdout; result and .commands files under
#            $RESULTS_FOLDER
#######################################
run_serving_tests() {
  local serving_test_file
  serving_test_file=$1

  # Iterate over serving tests. The loop is the tail of a pipeline, so the
  # variables assigned inside it do not outlive the loop.
  jq -c '.[]' "$serving_test_file" | while IFS= read -r params; do
    # get the test name
    test_name=$(echo "$params" | jq -r '.test_name')

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # prepend tgi to the test name
    test_name=tgi_$test_name

    # get common parameters
    common_params=$(echo "$params" | jq -r '.common_parameters')
    model=$(echo "$common_params" | jq -r '.model')
    tp=$(echo "$common_params" | jq -r '.tp')
    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
    port=$(echo "$common_params" | jq -r '.port')
    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')

    # get client and server arguments
    server_params=$(echo "$params" | jq -r '.tgi_server_parameters')
    client_params=$(echo "$params" | jq -r '.tgi_client_parameters')
    server_args=$(json2args "$server_params")
    client_args=$(json2args "$client_params")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"

    # check if there is enough GPU to run the test
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
      echo "Key 'fp8' exists in common params."
      server_command="/tgi-entrypoint.sh \
        --model-id $model \
        --num-shard $tp \
        --port $port \
        --quantize fp8 \
        $server_args"
    else
      echo "Key 'fp8' does not exist in common params."
      server_command="/tgi-entrypoint.sh \
        --model-id $model \
        --num-shard $tp \
        --port $port \
        $server_args"
    fi

    # run the server
    echo "Running test case $test_name"
    echo "Server command: $server_command"
    eval "$server_command" &

    # wait until the server is alive
    if wait_for_server; then
      echo ""
      echo "tgi server is up and running."
    else
      echo ""
      echo "tgi failed to start within the timeout period."
      # The server was launched in the background; clean it up before
      # giving up so a half-started process does not keep holding the GPU.
      kill_gpu_processes
      break
    fi

    # iterate over different QPS
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps="inf"
        echo "now qps is $qps"
      fi

      new_test_name=$test_name"_qps_"$qps

      client_command="python3 benchmark_serving.py \
        --backend tgi \
        --model $model \
        --dataset-name $dataset_name \
        --dataset-path $dataset_path \
        --num-prompts $num_prompts \
        --port $port \
        --save-result \
        --result-dir $RESULTS_FOLDER \
        --result-filename ${new_test_name}.json \
        --request-rate $qps \
        $client_args"

      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"

      # stdin is detached so the client can never consume the remaining
      # test cases that the enclosing `while read` loop reads from its pipe.
      eval "$client_command" < /dev/null

      # record the benchmarking commands
      jq_output=$(jq -n \
        --arg server "$server_command" \
        --arg client "$client_command" \
        --arg gpu "$gpu_type" \
        --arg engine "tgi" \
        '{
          server_command: $server,
          client_command: $client,
          gpu_type: $gpu,
          engine: $engine
        }')
      echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
    done

    # clean up
    kill_gpu_processes
    rm -rf /root/.cache/huggingface/*
  done
}
#######################################
# Upload everything under $RESULTS_FOLDER to Buildkite as artifacts.
# When the agent binary is absent the upload is skipped and the function
# still reports success.
# Globals:   RESULTS_FOLDER (read)
# Outputs:   a notice on stdout when the upload is skipped
# Returns:   0 when skipped, otherwise the agent's exit status
#######################################
upload_to_buildkite() {
  local agent=/workspace/buildkite-agent

  if [ -f "$agent" ]; then
    # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
    # The glob is passed literally (quoted) to the agent rather than being
    # expanded by the shell.
    "$agent" artifact upload "$RESULTS_FOLDER/*"
  else
    echo "buildkite-agent binary not found. Skip uploading the results."
    return 0
  fi
}
#######################################
# Entry point: verify GPUs, run the TGI nightly serving tests from the
# vllm benchmarks directory, summarize the results and upload them.
# Globals:   VLLM_SOURCE_CODE_LOC (read, required)
#            RESULTS_FOLDER, BENCHMARK_ROOT (written)
#            CURRENT_LLM_SERVING_ENGINE (exported)
#######################################
main() {
  check_gpus

  # enter vllm directory; stop here if that is impossible, otherwise the
  # relative paths below would resolve against whatever directory we are in
  cd "${VLLM_SOURCE_CODE_LOC:?VLLM_SOURCE_CODE_LOC must be set}/benchmarks" || exit 1

  declare -g RESULTS_FOLDER=results/
  mkdir -p "$RESULTS_FOLDER"
  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/

  export CURRENT_LLM_SERVING_ENGINE=tgi
  run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"

  python -m pip install tabulate pandas
  python "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
  upload_to_buildkite
}

main "$@"
.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
deleted
100644 → 0
View file @
be0967c1
#!/bin/bash
set
-o
pipefail
#######################################
# Check the number of GPUs and the GPU type.
# Globals:   gpu_count (written) - number of lines from `nvidia-smi --list-gpus`
#            gpu_type  (written) - second whitespace-separated word of the
#                                  reported GPU name(s)
# Outputs:   status messages on stdout
# Returns:   0 on success; exits the script with status 1 when no GPU is
#            available
#######################################
check_gpus() {
  local gpu_list gpu_names

  # Only count lines from a successful call: a failing nvidia-smi that still
  # prints a message would otherwise be counted as one GPU.
  gpu_list=$(nvidia-smi --list-gpus) || gpu_list=""

  declare -g gpu_count
  gpu_count=$(grep -c . <<< "$gpu_list")

  if [[ $gpu_count -gt 0 ]]; then
    echo "GPU found."
  else
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi

  gpu_names=$(nvidia-smi --query-gpu=name --format=csv,noheader)
  declare -g gpu_type
  # Flatten all names onto one line and keep the second word.
  read -r _ gpu_type _ <<< "${gpu_names//$'\n'/ }"
  echo "GPU type is $gpu_type"
}
#######################################
# Stop tritonserver, pause for a fixed 20 seconds, then report the memory
# in use on GPU 0 so the log shows whether the GPU was released.
# Globals:   gpu_memory_usage (written)
# Outputs:   "GPU 0 Memory Usage: <n> MB" on stdout (expected to be 0 MB)
#######################################
kill_gpu_processes() {
  # A non-zero pkill status is ignored on purpose.
  pkill tritonserver || :

  # Give the killed processes time to exit.
  sleep 20

  gpu_memory_usage="$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)"
  printf 'GPU 0 Memory Usage: %s MB\n' "$gpu_memory_usage"
}
#######################################
# Transform a JSON object into command line args; '_' in keys is replaced
# by '-'.
# Example:
#   input:  { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
#   output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
# Arguments: $1 - JSON object as a string
# Outputs:   the argument string on stdout
# Returns:   0 on success, jq's exit status if the input cannot be converted
#######################################
json2args() {
  local json_string=$1
  local args

  # Declaration and assignment are split so a jq failure is not masked by
  # the exit status of `local`.
  args=$(jq -r '
    to_entries |
    map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
    join(" ")
  ' <<< "$json_string") || return

  echo "$args"
}
#######################################
# Poll the server's /generate_stream endpoint once per second until curl
# succeeds or the timeout expires.
# Arguments: $1 - port to poll (optional, default 8000)
#            $2 - timeout in seconds (optional, default 1200)
# Returns:   0 once the endpoint answers, 1 if it did not within the timeout
#######################################
wait_for_server() {
  local port=${1:-8000}
  local timeout_s=${2:-1200}

  # The port is handed to the inner shell as a positional parameter instead
  # of being spliced into the script text.
  if timeout "$timeout_s" bash -c '
    until curl -s "localhost:$1/generate_stream" > /dev/null; do
      sleep 1
    done' _ "$port"; then
    return 0
  fi
  return 1
}
#######################################
# Run serving tests using `benchmark_serving.py` against TensorRT-LLM.
# For every test case in the JSON file: launch the server, wait until it
# answers, download the tokenizer, run the client once per QPS value, write
# the commands used next to the results, then clean up.
# Globals:
#   TEST_SELECTOR (read)        - optional regex; non-matching tests are skipped
#   gpu_count, gpu_type (read)  - as set by check_gpus
#   VLLM_SOURCE_CODE_LOC (read) - root of the vllm checkout
#   RESULTS_FOLDER (read)       - directory receiving results and .commands files
# Arguments:
#   $1 - a json file specifying serving test cases
#######################################
run_serving_tests() {
  local serving_test_file=$1
  local params test_name common_params model tp dataset_name dataset_path
  local port num_prompts server_params client_params client_args qps_list
  local qps new_test_name client_command server_command

  # Iterate over serving tests. The loop reads the test cases from the pipe,
  # so every command started inside it gets </dev/null and cannot swallow the
  # remaining test cases.
  jq -c '.[]' "$serving_test_file" | while IFS= read -r params; do
    test_name=$(jq -r '.test_name' <<<"$params")

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # prefix the test name with the engine
    test_name=trt_$test_name

    # get common parameters
    common_params=$(jq -r '.common_parameters' <<<"$params")
    model=$(jq -r '.model' <<<"$common_params")
    tp=$(jq -r '.tp' <<<"$common_params")
    dataset_name=$(jq -r '.dataset_name' <<<"$common_params")
    dataset_path=$(jq -r '.dataset_path' <<<"$common_params")
    port=$(jq -r '.port' <<<"$common_params")
    num_prompts=$(jq -r '.num_prompts' <<<"$common_params")

    # get client and server arguments
    server_params=$(jq -r '.trt_server_parameters' <<<"$params")
    client_params=$(jq -r '.trt_client_parameters' <<<"$params")
    client_args=$(json2args "$client_params")
    # @sh leaves numbers bare and wraps strings in single quotes ('inf')
    qps_list=$(jq -r '.qps_list | .[] | @sh' <<<"$params")
    echo "Running over qps list $qps_list"

    # check if there is enough GPU to run the test
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required model_tp_size $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    # All paths below are relative to the benchmarks directory.
    if ! cd "$VLLM_SOURCE_CODE_LOC/benchmarks"; then
      echo "Cannot enter $VLLM_SOURCE_CODE_LOC/benchmarks." >&2
      break
    fi

    echo "Running test case $test_name"
    # NOTE(review): the launch script is presumably expected to return once
    # the server has been started; the readiness check follows.
    bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh \
      "$server_params" "$common_params" </dev/null

    # wait until the server is alive
    if wait_for_server; then
      echo ""
      echo "trt server is up and running."
    else
      echo ""
      echo "trt failed to start within the timeout period."
      # Do not leave a half-started server holding the GPUs.
      kill_gpu_processes
      break
    fi

    # prepare tokenizer
    rm -rf /tokenizer_cache
    mkdir -p /tokenizer_cache
    python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
      --model "$model" \
      --cachedir /tokenizer_cache </dev/null

    # iterate over different QPS (word splitting of the list is intended)
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps="inf"
        echo "now qps is $qps"
      fi

      new_test_name="${test_name}_qps_${qps}"

      # Kept as one string because it is both executed and recorded verbatim.
      # The values come from the repository's own test file; never feed
      # untrusted input into this eval.
      client_command="python3 benchmark_serving.py \
        --backend tensorrt-llm \
        --tokenizer /tokenizer_cache \
        --model $model \
        --dataset-name $dataset_name \
        --dataset-path $dataset_path \
        --num-prompts $num_prompts \
        --port $port \
        --save-result \
        --result-dir $RESULTS_FOLDER \
        --result-filename ${new_test_name}.json \
        --request-rate $qps \
        $client_args"

      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"

      eval "$client_command" </dev/null

      # the server is started by the launch script, so no command is recorded
      server_command=""

      # record the benchmarking commands
      jq -n \
        --arg server "$server_command" \
        --arg client "$client_command" \
        --arg gpu "$gpu_type" \
        --arg engine "trt" \
        '{
          server_command: $server,
          client_command: $client,
          gpu_type: $gpu,
          engine: $engine
        }' > "$RESULTS_FOLDER/${new_test_name}.commands"
    done

    # clean up
    kill_gpu_processes
    rm -rf /root/.cache/huggingface/*
  done
}
#######################################
# Upload the benchmarking results to buildkite.
# Globals:
#   RESULTS_FOLDER (read) - directory whose contents are uploaded
# Returns:
#   0 when the agent binary is missing (upload skipped) or the upload succeeded,
#   1 when RESULTS_FOLDER is empty, otherwise the agent's exit status
#######################################
upload_to_buildkite() {
  # if the agent binary is not found, skip uploading the results, exit 0
  if [ ! -f /workspace/buildkite-agent ]; then
    echo "buildkite-agent binary not found. Skip uploading the results."
    return 0
  fi

  # With an empty RESULTS_FOLDER the pattern below would become "/*".
  if [ -z "${RESULTS_FOLDER:-}" ]; then
    echo "RESULTS_FOLDER is not set. Refusing to upload." >&2
    return 1
  fi

  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
  # The pattern is passed quoted on purpose, so the shell does not expand it.
  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
#######################################
# Entry point of the TensorRT-LLM nightly benchmark.
# Checks the GPUs, enters the benchmarks directory, upgrades transformers,
# runs every serving test from nightly-tests.json, summarizes the results and
# uploads them.
# Globals:
#   VLLM_SOURCE_CODE_LOC (read)  - root of the vllm checkout
#   RESULTS_FOLDER (written)     - where results are collected
#   BENCHMARK_ROOT (written)     - nightly-benchmarks directory, relative to
#                                  the benchmarks directory
#   CURRENT_LLM_SERVING_ENGINE (exported) - set to "trt"
# Returns:
#   exits with status 1 when the benchmarks directory cannot be entered
#######################################
main() {
  check_gpus

  # Every path below is relative to the benchmarks directory, so refuse to
  # continue from anywhere else instead of scattering files in the wrong place.
  if [[ -z "${VLLM_SOURCE_CODE_LOC:-}" ]]; then
    echo "VLLM_SOURCE_CODE_LOC is not set." >&2
    exit 1
  fi
  # enter vllm directory
  cd "$VLLM_SOURCE_CODE_LOC/benchmarks" || exit 1

  declare -g RESULTS_FOLDER=results/
  mkdir -p "$RESULTS_FOLDER"
  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/

  # update transformers package, to make sure mixtral tokenizer is available
  python -m pip install transformers -U

  export CURRENT_LLM_SERVING_ENGINE=trt
  run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"

  # the summary script needs tabulate and pandas
  python -m pip install tabulate pandas
  python "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
  upload_to_buildkite
}
# Run the benchmark, forwarding the script's arguments unchanged.
main "$@"
.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
deleted
100644 → 0
View file @
be0967c1
#!/bin/bash

# A pipeline fails when any of its stages fails, not only the last one.
set -o pipefail
#######################################
# Check the number of GPUs and the GPU type.
# Globals:
#   gpu_count (written) - number of lines printed by `nvidia-smi --list-gpus`
#   gpu_type  (written) - second word of the first GPU name
# Outputs:
#   status messages on stdout
# Returns:
#   exits with status 1 when no GPU is reported
#######################################
check_gpus() {
  # Declare first, assign afterwards: `declare -g var=$(cmd)` would replace the
  # command's exit status with that of `declare`.
  declare -g gpu_count gpu_type

  gpu_count=$(nvidia-smi --list-gpus | wc -l)
  if [[ $gpu_count -gt 0 ]]; then
    echo "GPU found."
  else
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi

  # nvidia-smi prints one name per GPU; take the second word of the first
  # name without passing the output through an unquoted expansion.
  gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader \
    | awk 'NR == 1 {print $2}')
  echo "GPU type is $gpu_type"
}
#######################################
# Kill the vllm server processes, remove the vllm config directory and report
# the memory still in use on GPU 0.
# Arguments:
#   $1 - seconds to wait for the processes to exit (optional, default 10)
# Outputs:
#   "GPU 0 Memory Usage: <n> MB" on stdout
#######################################
kill_gpu_processes() {
  local settle_seconds=${1:-10}
  local gpu_memory_usage

  # kill all processes on GPU.
  # pkill returns non-zero when nothing matched; that is not an error here.
  pkill pt_main_thread || true
  sleep "$settle_seconds"

  # remove vllm config file
  rm -rf ~/.config/vllm

  # Print the GPU memory usage
  # so that we know if all GPU processes are killed.
  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
  # The memory usage should be 0 MB.
  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
#######################################
# Transform a JSON object into command line args; '_' in keys becomes '-'.
# example:
#   input:  { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
#   output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
# Arguments:
#   $1 - JSON object as a string
# Outputs:
#   the argument string on stdout (an empty-string value leaves a bare flag)
# Returns:
#   jq's exit status, so malformed input is visible to callers that check it
#######################################
json2args() {
  local json_string=$1

  # Emit jq's output directly: capturing it with `local args=$(...)` and
  # echoing it back always reported success, even when jq failed.
  printf '%s\n' "$json_string" | jq -r '
    to_entries |
    map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
    join(" ")
  '
}
#######################################
# Wait for the vllm server to start by polling /v1/completions once per
# second until curl succeeds or the timeout expires.
# Arguments:
#   $1 - port to poll (optional, default 8000)
#   $2 - timeout in seconds (optional, default 1200)
# Returns:
#   0 once curl succeeded, 1 when the timeout expired first
#######################################
wait_for_server() {
  local port=${1:-8000}
  local timeout_seconds=${2:-1200}

  # The port is handed to the inner shell as $1 instead of being spliced into
  # the script text.
  if timeout "$timeout_seconds" bash -c '
    until curl -s "localhost:$1/v1/completions" > /dev/null; do
      sleep 1
    done' _ "$port"; then
    return 0
  fi
  return 1
}
#######################################
# Run serving tests using `benchmark_serving.py` against a vllm server.
# For every test case in the JSON file: start the OpenAI-compatible server in
# the background, wait until it answers, run the client once per QPS value,
# write the commands used next to the results, then clean up.
# Globals:
#   TEST_SELECTOR (read)       - optional regex; non-matching tests are skipped
#   gpu_count, gpu_type (read) - as set by check_gpus
#   RESULTS_FOLDER (read)      - directory receiving results and .commands files
# Arguments:
#   $1 - a json file specifying serving test cases
#######################################
run_serving_tests() {
  local serving_test_file=$1
  local params test_name common_params model tp dataset_name dataset_path
  local port num_prompts server_params client_params server_args client_args
  local qps_list qps new_test_name client_command server_command

  # Iterate over serving tests. The loop reads the test cases from the pipe,
  # so every foreground command started inside it gets </dev/null and cannot
  # swallow the remaining test cases.
  jq -c '.[]' "$serving_test_file" | while IFS= read -r params; do
    test_name=$(jq -r '.test_name' <<<"$params")

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # prefix the test name with the engine
    test_name=vllm_$test_name

    # get common parameters
    common_params=$(jq -r '.common_parameters' <<<"$params")
    model=$(jq -r '.model' <<<"$common_params")
    tp=$(jq -r '.tp' <<<"$common_params")
    dataset_name=$(jq -r '.dataset_name' <<<"$common_params")
    dataset_path=$(jq -r '.dataset_path' <<<"$common_params")
    port=$(jq -r '.port' <<<"$common_params")
    num_prompts=$(jq -r '.num_prompts' <<<"$common_params")

    # get client and server arguments
    server_params=$(jq -r '.vllm_server_parameters' <<<"$params")
    client_params=$(jq -r '.vllm_client_parameters' <<<"$params")
    server_args=$(json2args "$server_params")
    client_args=$(json2args "$client_params")
    # @sh leaves numbers bare and wraps strings in single quotes ('inf')
    qps_list=$(jq -r '.qps_list | .[] | @sh' <<<"$params")
    echo "Running over qps list $qps_list"

    # check if there is enough GPU to run the test
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    # The fp8 key only swaps the model; the server command is otherwise the
    # same, so it is built once below.
    if jq -e 'has("fp8")' <<<"$common_params" > /dev/null; then
      echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
      model=$(jq -r '.neuralmagic_quantized_model' <<<"$common_params")
    else
      echo "Key 'fp8' does not exist in common params."
    fi

    # Kept as one string because it is both executed and recorded verbatim.
    # The values come from the repository's own test file; never feed
    # untrusted input into the evals below.
    server_command="python3 \
      -m vllm.entrypoints.openai.api_server \
      -tp $tp \
      --model $model \
      --port $port \
      $server_args"

    # run the server
    echo "Running test case $test_name"
    echo "Server command: $server_command"
    eval "$server_command" &

    # wait until the server is alive
    if wait_for_server; then
      echo ""
      echo "vllm server is up and running."
    else
      echo ""
      echo "vllm failed to start within the timeout period."
      # Do not leave the background server holding the GPUs.
      kill_gpu_processes
      break
    fi

    # iterate over different QPS (word splitting of the list is intended)
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps="inf"
        echo "now qps is $qps"
      fi

      new_test_name="${test_name}_qps_${qps}"

      client_command="python3 benchmark_serving.py \
        --backend vllm \
        --model $model \
        --dataset-name $dataset_name \
        --dataset-path $dataset_path \
        --num-prompts $num_prompts \
        --port $port \
        --save-result \
        --result-dir $RESULTS_FOLDER \
        --result-filename ${new_test_name}.json \
        --request-rate $qps \
        $client_args"

      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"

      eval "$client_command" </dev/null

      # record the benchmarking commands
      jq -n \
        --arg server "$server_command" \
        --arg client "$client_command" \
        --arg gpu "$gpu_type" \
        --arg engine "vllm" \
        '{
          server_command: $server,
          client_command: $client,
          gpu_type: $gpu,
          engine: $engine
        }' > "$RESULTS_FOLDER/${new_test_name}.commands"
    done

    # clean up
    kill_gpu_processes
    rm -rf /root/.cache/huggingface/*
  done
}
#######################################
# Upload the benchmarking results to buildkite.
# Globals:
#   RESULTS_FOLDER (read) - directory whose contents are uploaded
# Returns:
#   0 when the agent binary is missing (upload skipped) or the upload succeeded,
#   1 when RESULTS_FOLDER is empty, otherwise the agent's exit status
#######################################
upload_to_buildkite() {
  # if the agent binary is not found, skip uploading the results, exit 0
  if [ ! -f /workspace/buildkite-agent ]; then
    echo "buildkite-agent binary not found. Skip uploading the results."
    return 0
  fi

  # With an empty RESULTS_FOLDER the pattern below would become "/*".
  if [ -z "${RESULTS_FOLDER:-}" ]; then
    echo "RESULTS_FOLDER is not set. Refusing to upload." >&2
    return 1
  fi

  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
  # The pattern is passed quoted on purpose, so the shell does not expand it.
  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
#######################################
# Entry point of the vllm nightly benchmark.
# Checks the GPUs, enters the benchmarks directory, runs every serving test
# from nightly-tests.json, summarizes the results and uploads them.
# Globals:
#   VLLM_SOURCE_CODE_LOC (read)  - root of the vllm checkout
#   RESULTS_FOLDER (written)     - where results are collected
#   BENCHMARK_ROOT (written)     - nightly-benchmarks directory, relative to
#                                  the benchmarks directory
#   CURRENT_LLM_SERVING_ENGINE (exported) - set to "vllm"
# Returns:
#   exits with status 1 when the benchmarks directory cannot be entered
#######################################
main() {
  check_gpus

  # Every path below is relative to the benchmarks directory, so refuse to
  # continue from anywhere else instead of scattering files in the wrong place.
  if [[ -z "${VLLM_SOURCE_CODE_LOC:-}" ]]; then
    echo "VLLM_SOURCE_CODE_LOC is not set." >&2
    exit 1
  fi
  # enter vllm directory
  cd "$VLLM_SOURCE_CODE_LOC/benchmarks" || exit 1

  declare -g RESULTS_FOLDER=results/
  mkdir -p "$RESULTS_FOLDER"
  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/

  export CURRENT_LLM_SERVING_ENGINE=vllm
  run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"

  # the summary script needs tabulate and pandas
  python3 -m pip install tabulate pandas
  python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
  upload_to_buildkite
}
# Run the benchmark, forwarding the script's arguments unchanged.
main "$@"
.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
View file @
ad385667
...
@@ -17,10 +17,17 @@ serving_column_mapping = {
...
@@ -17,10 +17,17 @@ serving_column_mapping = {
"request_throughput"
:
"Tput (req/s)"
,
"request_throughput"
:
"Tput (req/s)"
,
"mean_ttft_ms"
:
"Mean TTFT (ms)"
,
"mean_ttft_ms"
:
"Mean TTFT (ms)"
,
"std_ttft_ms"
:
"Std TTFT (ms)"
,
"std_ttft_ms"
:
"Std TTFT (ms)"
,
"median_ttft_ms"
:
"Median TTFT (ms)"
,
"mean_itl_ms"
:
"Mean ITL (ms)"
,
"mean_itl_ms"
:
"Mean ITL (ms)"
,
"std_itl_ms"
:
"Std ITL (ms)"
,
"std_itl_ms"
:
"Std ITL (ms)"
,
"input_throughput"
:
"Input Tput (tok/s)"
,
"median_itl_ms"
:
"Median ITL (ms)"
,
"mean_tpot_ms"
:
"Mean TPOT (ms)"
,
"std_tpot_ms"
:
"Std TPOT (ms)"
,
"median_tpot_ms"
:
"Median TPOT (ms)"
,
"total_token_throughput"
:
"Total Token Tput (tok/s)"
,
"output_throughput"
:
"Output Tput (tok/s)"
,
"output_throughput"
:
"Output Tput (tok/s)"
,
"total_input_tokens"
:
"Total input tokens"
,
"total_output_tokens"
:
"Total output tokens"
,
"engine"
:
"Engine"
,
"engine"
:
"Engine"
,
}
}
...
...
.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
View file @
ad385667
...
@@ -2,9 +2,11 @@
...
@@ -2,9 +2,11 @@
TOKEN
=
$(
curl
-s
-L
"https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull"
| jq
-r
.token
)
TOKEN
=
$(
curl
-s
-L
"https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull"
| jq
-r
.token
)
URL
=
"https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/
$BUILDKITE_COMMIT
"
URL
=
"https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/
$BUILDKITE_COMMIT
"
TIMEOUT_SECONDS
=
10
retries
=
0
retries
=
0
while
[
$retries
-lt
1000
]
;
do
while
[
$retries
-lt
1000
]
;
do
if
[
$(
curl
-s
-L
-H
"Authorization: Bearer
$TOKEN
"
-o
/dev/null
-w
"%{http_code}"
$URL
)
-eq
200
]
;
then
if
[
$(
curl
-s
--max-time
$TIMEOUT_SECONDS
-L
-H
"Authorization: Bearer
$TOKEN
"
-o
/dev/null
-w
"%{http_code}"
$URL
)
-eq
200
]
;
then
exit
0
exit
0
fi
fi
...
...
.buildkite/nightly-benchmarks/tests/latency-tests.json
View file @
ad385667
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
{
{
"test_name"
:
"latency_llama8B_tp1"
,
"test_name"
:
"latency_llama8B_tp1"
,
"parameters"
:
{
"parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-8B"
,
"model"
:
"meta-llama/Meta-Llama-3
.1
-8B
-Instruct
"
,
"tensor_parallel_size"
:
1
,
"tensor_parallel_size"
:
1
,
"load_format"
:
"dummy"
,
"load_format"
:
"dummy"
,
"num_iters_warmup"
:
5
,
"num_iters_warmup"
:
5
,
...
@@ -12,7 +12,7 @@
...
@@ -12,7 +12,7 @@
{
{
"test_name"
:
"latency_llama70B_tp4"
,
"test_name"
:
"latency_llama70B_tp4"
,
"parameters"
:
{
"parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-70B-Instruct"
,
"model"
:
"meta-llama/Meta-Llama-3
.1
-70B-Instruct"
,
"tensor_parallel_size"
:
4
,
"tensor_parallel_size"
:
4
,
"load_format"
:
"dummy"
,
"load_format"
:
"dummy"
,
"num-iters-warmup"
:
5
,
"num-iters-warmup"
:
5
,
...
...
.buildkite/nightly-benchmarks/tests/nightly-tests.json
View file @
ad385667
[
[
{
{
"test_name"
:
"llama8B_tp1"
,
"test_name"
:
"llama8B_tp1
_sharegpt
"
,
"qps_list"
:
[
4
],
"qps_list"
:
[
4
,
8
,
16
,
32
,
"inf"
],
"common_parameters"
:
{
"common_parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-8B"
,
"model"
:
"meta-llama/Meta-Llama-3-8B
-Instruct
"
,
"tp"
:
1
,
"tp"
:
1
,
"dataset_name"
:
"sharegpt"
,
"dataset_name"
:
"sharegpt"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
"num_prompts"
:
500
,
"num_prompts"
:
500
,
"port"
:
8000
"port"
:
8000
,
"reuse_server"
:
false
},
},
"lmdeploy_server_parameters"
:
{
"lmdeploy_server_parameters"
:
{
"dtype"
:
"bfloat16"
},
},
"lmdeploy_client_parameters"
:
{
"lmdeploy_client_parameters"
:
{
},
},
...
@@ -21,34 +23,158 @@
...
@@ -21,34 +23,158 @@
},
},
"trt_server_parameters"
:
{
"trt_server_parameters"
:
{
"model_type"
:
"llama"
,
"model_type"
:
"llama"
,
"model_dtype"
:
"float16"
,
"model_dtype"
:
"
b
float16"
,
"max_batch_size"
:
2
56
,
"max_batch_size"
:
2
048
,
"max_input_len"
:
4096
,
"max_input_len"
:
4096
,
"max_output_len"
:
4096
,
"max_seq_len"
:
6144
,
"trt_llm_version"
:
"r24.04"
"max_num_tokens"
:
16384
,
"trt_llm_version"
:
"v0.11.0"
},
},
"trt_client_parameters"
:
{
"trt_client_parameters"
:
{
"endpoint"
:
"/v2/models/ensemble/generate_stream"
"endpoint"
:
"/v2/models/ensemble/generate_stream"
},
},
"vllm_server_parameters"
:
{
"vllm_server_parameters"
:
{
"disable_log_stats"
:
""
,
"disable_log_stats"
:
""
,
"disable_log_requests"
:
""
"disable_log_requests"
:
""
,
"gpu_memory_utilization"
:
0.9
,
"num_scheduler_steps"
:
10
,
"max_num_seqs"
:
512
,
"dtype"
:
"bfloat16"
},
},
"vllm_client_parameters"
:
{
"vllm_client_parameters"
:
{
},
"sglang_server_parameters"
:
{
"disable_radix_cache"
:
""
,
"enable_torch_compile"
:
""
,
"dtype"
:
"bfloat16"
},
"sglang_client_parameters"
:
{
}
},
{
"test_name"
:
"llama8B_tp1_sonnet_512_16"
,
"qps_list"
:
[
4
,
8
,
16
,
32
,
"inf"
],
"common_parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-8B-Instruct"
,
"tp"
:
1
,
"dataset_name"
:
"sonnet"
,
"dataset_path"
:
"./sonnet_4x.txt"
,
"num_prompts"
:
500
,
"port"
:
8000
,
"sonnet_input_len"
:
512
,
"sonnet_output_len"
:
16
,
"sonnet_prefix_len"
:
50
,
"reuse_server"
:
true
},
"lmdeploy_server_parameters"
:
{
"dtype"
:
"bfloat16"
},
"lmdeploy_client_parameters"
:
{
},
"tgi_server_parameters"
:
{
},
"tgi_client_parameters"
:
{
"endpoint"
:
"/generate_stream"
},
"trt_server_parameters"
:
{
"model_type"
:
"llama"
,
"model_dtype"
:
"bfloat16"
,
"max_batch_size"
:
2048
,
"max_input_len"
:
4096
,
"max_seq_len"
:
6144
,
"max_num_tokens"
:
16384
,
"trt_llm_version"
:
"v0.11.0"
},
"trt_client_parameters"
:
{
"endpoint"
:
"/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters"
:
{
"disable_log_stats"
:
""
,
"disable_log_requests"
:
""
,
"gpu_memory_utilization"
:
0.9
,
"num_scheduler_steps"
:
10
,
"max_num_seqs"
:
512
,
"dtype"
:
"bfloat16"
},
"vllm_client_parameters"
:
{
},
"sglang_server_parameters"
:
{
"disable_radix_cache"
:
""
,
"enable_torch_compile"
:
""
,
"dtype"
:
"bfloat16"
},
"sglang_client_parameters"
:
{
}
},
{
"test_name"
:
"llama8B_tp1_sonnet_512_256"
,
"qps_list"
:
[
4
,
8
,
16
,
32
,
"inf"
],
"common_parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-8B-Instruct"
,
"tp"
:
1
,
"dataset_name"
:
"sonnet"
,
"dataset_path"
:
"./sonnet_4x.txt"
,
"num_prompts"
:
500
,
"port"
:
8000
,
"sonnet_input_len"
:
512
,
"sonnet_output_len"
:
256
,
"sonnet_prefix_len"
:
50
,
"reuse_server"
:
true
},
"lmdeploy_server_parameters"
:
{
"dtype"
:
"bfloat16"
},
"lmdeploy_client_parameters"
:
{
},
"tgi_server_parameters"
:
{
},
"tgi_client_parameters"
:
{
"endpoint"
:
"/generate_stream"
},
"trt_server_parameters"
:
{
"model_type"
:
"llama"
,
"model_dtype"
:
"bfloat16"
,
"max_batch_size"
:
2048
,
"max_input_len"
:
4096
,
"max_seq_len"
:
6144
,
"max_num_tokens"
:
16384
,
"trt_llm_version"
:
"v0.11.0"
},
"trt_client_parameters"
:
{
"endpoint"
:
"/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters"
:
{
"disable_log_stats"
:
""
,
"disable_log_requests"
:
""
,
"gpu_memory_utilization"
:
0.9
,
"num_scheduler_steps"
:
10
,
"max_num_seqs"
:
512
,
"dtype"
:
"bfloat16"
},
"vllm_client_parameters"
:
{
},
"sglang_server_parameters"
:
{
"disable_radix_cache"
:
""
,
"enable_torch_compile"
:
""
,
"dtype"
:
"bfloat16"
},
"sglang_client_parameters"
:
{
}
}
},
},
{
{
"test_name"
:
"llama70B_tp4"
,
"test_name"
:
"llama70B_tp4
_sharegpt
"
,
"qps_list"
:
[
2
],
"qps_list"
:
[
4
,
8
,
16
,
32
,
"inf"
],
"common_parameters"
:
{
"common_parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-70B-Instruct"
,
"model"
:
"meta-llama/Meta-Llama-3-70B-Instruct"
,
"tp"
:
4
,
"tp"
:
4
,
"dataset_name"
:
"sharegpt"
,
"dataset_name"
:
"sharegpt"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
"num_prompts"
:
500
,
"num_prompts"
:
500
,
"port"
:
8000
"port"
:
8000
,
"reuse_server"
:
false
},
},
"lmdeploy_server_parameters"
:
{
"lmdeploy_server_parameters"
:
{
"dtype"
:
"bfloat16"
},
},
"lmdeploy_client_parameters"
:
{
"lmdeploy_client_parameters"
:
{
},
},
...
@@ -59,34 +185,50 @@
...
@@ -59,34 +185,50 @@
},
},
"trt_server_parameters"
:
{
"trt_server_parameters"
:
{
"model_type"
:
"llama"
,
"model_type"
:
"llama"
,
"model_dtype"
:
"float16"
,
"model_dtype"
:
"
b
float16"
,
"max_batch_size"
:
2
56
,
"max_batch_size"
:
2
048
,
"max_input_len"
:
4096
,
"max_input_len"
:
4096
,
"max_output_len"
:
4096
,
"max_seq_len"
:
6144
,
"trt_llm_version"
:
"r24.04"
"max_num_tokens"
:
16384
,
"trt_llm_version"
:
"v0.11.0"
},
},
"trt_client_parameters"
:
{
"trt_client_parameters"
:
{
"endpoint"
:
"/v2/models/ensemble/generate_stream"
"endpoint"
:
"/v2/models/ensemble/generate_stream"
},
},
"vllm_server_parameters"
:
{
"vllm_server_parameters"
:
{
"disable_log_stats"
:
""
,
"disable_log_stats"
:
""
,
"disable_log_requests"
:
""
"disable_log_requests"
:
""
,
"gpu_memory_utilization"
:
0.9
,
"num_scheduler_steps"
:
10
,
"max_num_seqs"
:
512
,
"dtype"
:
"bfloat16"
},
},
"vllm_client_parameters"
:
{
"vllm_client_parameters"
:
{
},
"sglang_server_parameters"
:
{
"disable_radix_cache"
:
""
,
"dtype"
:
"bfloat16"
},
"sglang_client_parameters"
:
{
}
}
},
},
{
{
"test_name"
:
"
mixtral8x7B_tp2
"
,
"test_name"
:
"
llama70B_tp4_sonnet_512_16
"
,
"qps_list"
:
[
2
],
"qps_list"
:
[
4
,
8
,
16
,
32
,
"inf"
],
"common_parameters"
:
{
"common_parameters"
:
{
"model"
:
"m
istralai/Mixtral-8x7
B-Instruct
-v0.1
"
,
"model"
:
"m
eta-llama/Meta-Llama-3-70
B-Instruct"
,
"tp"
:
2
,
"tp"
:
4
,
"dataset_name"
:
"s
haregp
t"
,
"dataset_name"
:
"s
onne
t"
,
"dataset_path"
:
"./
ShareGPT_V3_unfiltered_cleaned_split.json
"
,
"dataset_path"
:
"./
sonnet_4x.txt
"
,
"num_prompts"
:
500
,
"num_prompts"
:
500
,
"port"
:
8000
"port"
:
8000
,
"sonnet_input_len"
:
512
,
"sonnet_output_len"
:
16
,
"sonnet_prefix_len"
:
50
,
"reuse_server"
:
true
},
},
"lmdeploy_server_parameters"
:
{
"lmdeploy_server_parameters"
:
{
"dtype"
:
"bfloat16"
},
},
"lmdeploy_client_parameters"
:
{
"lmdeploy_client_parameters"
:
{
},
},
...
@@ -97,20 +239,85 @@
...
@@ -97,20 +239,85 @@
},
},
"trt_server_parameters"
:
{
"trt_server_parameters"
:
{
"model_type"
:
"llama"
,
"model_type"
:
"llama"
,
"model_dtype"
:
"float16"
,
"model_dtype"
:
"
b
float16"
,
"max_batch_size"
:
2
56
,
"max_batch_size"
:
2
048
,
"max_input_len"
:
4096
,
"max_input_len"
:
4096
,
"max_output_len"
:
4096
,
"max_seq_len"
:
6144
,
"trt_llm_version"
:
"r24.04"
"max_num_tokens"
:
16384
,
"trt_llm_version"
:
"v0.11.0"
},
},
"trt_client_parameters"
:
{
"trt_client_parameters"
:
{
"endpoint"
:
"/v2/models/ensemble/generate_stream"
"endpoint"
:
"/v2/models/ensemble/generate_stream"
},
},
"vllm_server_parameters"
:
{
"vllm_server_parameters"
:
{
"disable_log_stats"
:
""
,
"disable_log_stats"
:
""
,
"disable_log_requests"
:
""
"disable_log_requests"
:
""
,
"gpu_memory_utilization"
:
0.9
,
"num_scheduler_steps"
:
10
,
"max_num_seqs"
:
512
,
"dtype"
:
"bfloat16"
},
},
"vllm_client_parameters"
:
{
"vllm_client_parameters"
:
{
},
"sglang_server_parameters"
:
{
"disable_radix_cache"
:
""
,
"dtype"
:
"bfloat16"
},
"sglang_client_parameters"
:
{
}
},
{
"test_name"
:
"llama70B_tp4_sonnet_512_256"
,
"qps_list"
:
[
4
,
8
,
16
,
32
,
"inf"
],
"common_parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-70B-Instruct"
,
"tp"
:
4
,
"dataset_name"
:
"sonnet"
,
"dataset_path"
:
"./sonnet_4x.txt"
,
"num_prompts"
:
500
,
"port"
:
8000
,
"sonnet_input_len"
:
512
,
"sonnet_output_len"
:
256
,
"sonnet_prefix_len"
:
50
,
"reuse_server"
:
true
},
"lmdeploy_server_parameters"
:
{
"dtype"
:
"bfloat16"
},
"lmdeploy_client_parameters"
:
{
},
"tgi_server_parameters"
:
{
},
"tgi_client_parameters"
:
{
"endpoint"
:
"/generate_stream"
},
"trt_server_parameters"
:
{
"model_type"
:
"llama"
,
"model_dtype"
:
"bfloat16"
,
"max_batch_size"
:
2048
,
"max_input_len"
:
4096
,
"max_seq_len"
:
6144
,
"max_num_tokens"
:
16384
,
"trt_llm_version"
:
"v0.11.0"
},
"trt_client_parameters"
:
{
"endpoint"
:
"/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters"
:
{
"disable_log_stats"
:
""
,
"disable_log_requests"
:
""
,
"gpu_memory_utilization"
:
0.9
,
"num_scheduler_steps"
:
10
,
"max_num_seqs"
:
512
,
"dtype"
:
"bfloat16"
},
"vllm_client_parameters"
:
{
},
"sglang_server_parameters"
:
{
"disable_radix_cache"
:
""
,
"dtype"
:
"bfloat16"
},
"sglang_client_parameters"
:
{
}
}
}
}
]
]
\ No newline at end of file
.buildkite/nightly-benchmarks/tests/serving-tests.json
View file @
ad385667
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
"test_name"
:
"serving_llama8B_tp1_sharegpt"
,
"test_name"
:
"serving_llama8B_tp1_sharegpt"
,
"qps_list"
:
[
1
,
4
,
16
,
"inf"
],
"qps_list"
:
[
1
,
4
,
16
,
"inf"
],
"server_parameters"
:
{
"server_parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-8B"
,
"model"
:
"meta-llama/Meta-Llama-3
.1
-8B
-Instruct
"
,
"tensor_parallel_size"
:
1
,
"tensor_parallel_size"
:
1
,
"swap_space"
:
16
,
"swap_space"
:
16
,
"disable_log_stats"
:
""
,
"disable_log_stats"
:
""
,
...
@@ -11,7 +11,7 @@
...
@@ -11,7 +11,7 @@
"load_format"
:
"dummy"
"load_format"
:
"dummy"
},
},
"client_parameters"
:
{
"client_parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-8B"
,
"model"
:
"meta-llama/Meta-Llama-3
.1
-8B
-Instruct
"
,
"backend"
:
"vllm"
,
"backend"
:
"vllm"
,
"dataset_name"
:
"sharegpt"
,
"dataset_name"
:
"sharegpt"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
...
@@ -22,7 +22,7 @@
...
@@ -22,7 +22,7 @@
"test_name"
:
"serving_llama70B_tp4_sharegpt"
,
"test_name"
:
"serving_llama70B_tp4_sharegpt"
,
"qps_list"
:
[
1
,
4
,
16
,
"inf"
],
"qps_list"
:
[
1
,
4
,
16
,
"inf"
],
"server_parameters"
:
{
"server_parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-70B-Instruct"
,
"model"
:
"meta-llama/Meta-Llama-3
.1
-70B-Instruct"
,
"tensor_parallel_size"
:
4
,
"tensor_parallel_size"
:
4
,
"swap_space"
:
16
,
"swap_space"
:
16
,
"disable_log_stats"
:
""
,
"disable_log_stats"
:
""
,
...
@@ -30,7 +30,7 @@
...
@@ -30,7 +30,7 @@
"load_format"
:
"dummy"
"load_format"
:
"dummy"
},
},
"client_parameters"
:
{
"client_parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-70B-Instruct"
,
"model"
:
"meta-llama/Meta-Llama-3
.1
-70B-Instruct"
,
"backend"
:
"vllm"
,
"backend"
:
"vllm"
,
"dataset_name"
:
"sharegpt"
,
"dataset_name"
:
"sharegpt"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
...
@@ -60,7 +60,7 @@
...
@@ -60,7 +60,7 @@
"test_name"
:
"serving_llama70B_tp4_sharegpt_specdecode"
,
"test_name"
:
"serving_llama70B_tp4_sharegpt_specdecode"
,
"qps_list"
:
[
2
],
"qps_list"
:
[
2
],
"server_parameters"
:
{
"server_parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-70B-Instruct"
,
"model"
:
"meta-llama/Meta-Llama-3
.1
-70B-Instruct"
,
"disable_log_requests"
:
""
,
"disable_log_requests"
:
""
,
"tensor_parallel_size"
:
4
,
"tensor_parallel_size"
:
4
,
"swap_space"
:
16
,
"swap_space"
:
16
,
...
@@ -70,7 +70,7 @@
...
@@ -70,7 +70,7 @@
"use_v2_block_manager"
:
""
"use_v2_block_manager"
:
""
},
},
"client_parameters"
:
{
"client_parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-70B-Instruct"
,
"model"
:
"meta-llama/Meta-Llama-3
.1
-70B-Instruct"
,
"backend"
:
"vllm"
,
"backend"
:
"vllm"
,
"dataset_name"
:
"sharegpt"
,
"dataset_name"
:
"sharegpt"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
...
...
.buildkite/nightly-benchmarks/tests/throughput-tests.json
View file @
ad385667
...
@@ -2,7 +2,7 @@
...
@@ -2,7 +2,7 @@
{
{
"test_name"
:
"throughput_llama8B_tp1"
,
"test_name"
:
"throughput_llama8B_tp1"
,
"parameters"
:
{
"parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-8B"
,
"model"
:
"meta-llama/Meta-Llama-3
.1
-8B
-Instruct
"
,
"tensor_parallel_size"
:
1
,
"tensor_parallel_size"
:
1
,
"load_format"
:
"dummy"
,
"load_format"
:
"dummy"
,
"dataset"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
"dataset"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
...
@@ -13,7 +13,7 @@
...
@@ -13,7 +13,7 @@
{
{
"test_name"
:
"throughput_llama70B_tp4"
,
"test_name"
:
"throughput_llama70B_tp4"
,
"parameters"
:
{
"parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-70B-Instruct"
,
"model"
:
"meta-llama/Meta-Llama-3
.1
-70B-Instruct"
,
"tensor_parallel_size"
:
4
,
"tensor_parallel_size"
:
4
,
"load_format"
:
"dummy"
,
"load_format"
:
"dummy"
,
"dataset"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
"dataset"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
...
...
.buildkite/release-pipeline.yaml
View file @
ad385667
steps
:
steps
:
-
label
:
"
Build
wheel
-
CUDA
{{matrix.cuda_version}}
"
-
label
:
"
Build
wheel
-
CUDA
12.1
"
agents
:
agents
:
queue
:
cpu_queue
queue
:
cpu_queue
commands
:
commands
:
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
buildkite_commit=$BUILDKITE_COMMIT
--build-arg
USE_SCCACHE=1
--build-arg
CUDA_VERSION={{matrix.cuda_version}}
--tag
vllm-ci:build-image
--target
build
--progress
plain
."
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=12.1.0
--tag
vllm-ci:build-image
--target
build
--progress
plain
."
-
"
mkdir
artifacts"
-
"
docker
run
--rm
-v
$(pwd)/artifacts:/artifacts_host
vllm-ci:build-image
bash
-c
'cp
-r
dist
/artifacts_host
&&
chmod
-R
a+rw
/artifacts_host'"
# rename the files to change linux -> manylinux1
-
"
for
f
in
artifacts/dist/*.whl;
do
mv
--
\"
$$f
\"
\"
$${f/linux/manylinux1}
\"
;
done"
-
"
mv
artifacts/dist/$(ls
artifacts/dist)
artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
-
"
aws
s3
cp
artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
s3://vllm-wheels/$BUILDKITE_COMMIT/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
-
"
aws
s3
cp
artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
s3://vllm-wheels/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
env
:
DOCKER_BUILDKIT
:
"
1"
-
block
:
"
Build
CUDA
11.8
wheel"
key
:
block-build-cu118-wheel
-
label
:
"
Build
wheel
-
CUDA
11.8"
depends_on
:
block-build-cu118-wheel
agents
:
queue
:
cpu_queue
commands
:
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=11.8.0
--tag
vllm-ci:build-image
--target
build
--progress
plain
."
-
"
mkdir
artifacts"
-
"
mkdir
artifacts"
-
"
docker
run
--rm
-v
$(pwd)/artifacts:/artifacts_host
vllm-ci:build-image
bash
-c
'cp
-r
dist
/artifacts_host
&&
chmod
-R
a+rw
/artifacts_host'"
-
"
docker
run
--rm
-v
$(pwd)/artifacts:/artifacts_host
vllm-ci:build-image
bash
-c
'cp
-r
dist
/artifacts_host
&&
chmod
-R
a+rw
/artifacts_host'"
# rename the files to change linux -> manylinux1
# rename the files to change linux -> manylinux1
...
@@ -12,8 +31,3 @@ steps:
...
@@ -12,8 +31,3 @@ steps:
-
"
aws
s3
cp
--recursive
artifacts/dist
s3://vllm-wheels/nightly/"
-
"
aws
s3
cp
--recursive
artifacts/dist
s3://vllm-wheels/nightly/"
env
:
env
:
DOCKER_BUILDKIT
:
"
1"
DOCKER_BUILDKIT
:
"
1"
matrix
:
setup
:
cuda_version
:
-
"
11.8.0"
-
"
12.1.0"
.buildkite/run-amd-test.sh
100644 → 100755
View file @
ad385667
# This script runs test inside the corresponding ROCm docker container.
# This script runs test inside the corresponding ROCm docker container.
set
-
ex
set
-
o
pipefail
# Print ROCm version
# Print ROCm version
echo
"--- Confirming Clean Initial State"
echo
"--- Confirming Clean Initial State"
...
@@ -70,15 +70,85 @@ HF_CACHE="$(realpath ~)/huggingface"
...
@@ -70,15 +70,85 @@ HF_CACHE="$(realpath ~)/huggingface"
mkdir
-p
${
HF_CACHE
}
mkdir
-p
${
HF_CACHE
}
HF_MOUNT
=
"/root/.cache/huggingface"
HF_MOUNT
=
"/root/.cache/huggingface"
docker run
\
commands
=
$@
echo
"Commands:
$commands
"
#ignore certain kernels tests
if
[[
$commands
==
*
" kernels "
*
]]
;
then
commands
=
"
${
commands
}
\
--ignore=kernels/test_attention.py
\
--ignore=kernels/test_attention_selector.py
\
--ignore=kernels/test_blocksparse_attention.py
\
--ignore=kernels/test_causal_conv1d.py
\
--ignore=kernels/test_cutlass.py
\
--ignore=kernels/test_encoder_decoder_attn.py
\
--ignore=kernels/test_flash_attn.py
\
--ignore=kernels/test_flashinfer.py
\
--ignore=kernels/test_gguf.py
\
--ignore=kernels/test_int8_quant.py
\
--ignore=kernels/test_machete_gemm.py
\
--ignore=kernels/test_mamba_ssm.py
\
--ignore=kernels/test_marlin_gemm.py
\
--ignore=kernels/test_moe.py
\
--ignore=kernels/test_prefix_prefill.py
\
--ignore=kernels/test_rand.py
\
--ignore=kernels/test_sampler.py"
fi
#ignore certain Entrypoints tests
if
[[
$commands
==
*
" entrypoints/openai "
*
]]
;
then
commands
=
${
commands
//
" entrypoints/openai "
/
" entrypoints/openai
\
--ignore=entrypoints/openai/test_accuracy.py
\
--ignore=entrypoints/openai/test_audio.py
\
--ignore=entrypoints/openai/test_encoder_decoder.py
\
--ignore=entrypoints/openai/test_embedding.py
\
--ignore=entrypoints/openai/test_oot_registration.py "
}
fi
PARALLEL_JOB_COUNT=8

#######################################
# Fill the empty shard placeholders of a command template for one shard.
# Arguments:
#   $1 - command string containing "--shard-id= " and/or "--num-shards= "
#   $2 - shard index to insert
#   $3 - total number of shards to insert
# Outputs:
#   The rendered command string on stdout; the template is not modified.
#######################################
render_shard_commands() {
  local rendered=$1
  rendered=${rendered//"--shard-id= "/"--shard-id=$2 "}
  rendered=${rendered//"--num-shards= "/"--num-shards=$3 "}
  printf '%s' "$rendered"
}

# If the command contains the shard flag, run all shards in parallel, one
# container per GPU index (0 .. PARALLEL_JOB_COUNT-1).
if [[ $commands == *"--shard-id="* ]]; then
  PIDS=()
  STATUS=()
  for GPU in $(seq 0 $((PARALLEL_JOB_COUNT - 1))); do
    # Render a per-shard copy. Substituting into $commands itself would
    # consume the placeholders on the first iteration and make every later
    # shard run with the first shard's id.
    shard_commands=$(render_shard_commands "$commands" "$GPU" "$PARALLEL_JOB_COUNT")
    echo "Shard ${GPU} commands:$shard_commands"
    # "|&" forwards both stdout and stderr so each output line can be
    # prefixed with the shard that produced it.
    docker run \
      --device /dev/kfd --device /dev/dri \
      --network host \
      --shm-size=16gb \
      --rm \
      -e HIP_VISIBLE_DEVICES="${GPU}" \
      -e HF_TOKEN \
      -v "${HF_CACHE}:${HF_MOUNT}" \
      -e HF_HOME="${HF_MOUNT}" \
      --name "${container_name}_${GPU}" \
      "${image_name}" \
      /bin/bash -c "${shard_commands}" \
      |& while read -r line; do echo ">>Shard $GPU: $line"; done &
    PIDS+=($!)
  done
  # Wait for all processes to finish and collect exit codes. The status is
  # captured via "||" so collection also works if errexit is enabled.
  for pid in "${PIDS[@]}"; do
    rc=0
    wait "${pid}" || rc=$?
    STATUS+=("${rc}")
  done
  # Fail with the first non-zero exit code, if any.
  for st in "${STATUS[@]}"; do
    if [[ ${st} -ne 0 ]]; then
      echo "One of the processes failed with $st"
      exit "${st}"
    fi
  done
else
  # No sharding requested: run everything in a single container on GPU 0.
  docker run \
    --device /dev/kfd --device /dev/dri \
    --network host \
    --shm-size=16gb \
    --rm \
    -e HIP_VISIBLE_DEVICES=0 \
    -e HF_TOKEN \
    -v "${HF_CACHE}:${HF_MOUNT}" \
    -e HF_HOME="${HF_MOUNT}" \
    --name "${container_name}" \
    "${image_name}" \
    /bin/bash -c "${commands}"
fi
.buildkite/run-cpu-test-ppc64le.sh
0 → 100755
View file @
ad385667
# This script builds the CPU docker image and runs model tests and an online
# serving benchmark inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

# Try building the docker image
docker build -t cpu-test -f Dockerfile.ppc64le .

# Setup cleanup: remove any stale container now and again on every exit path.
remove_docker_container() { docker rm -f cpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container

# Load HF_TOKEN with tracing suspended so the secret is not echoed into the
# build log by `set -x`, then export it so docker can pass it through by name.
set +x
source /etc/environment
export HF_TOKEN
set -x

# Run the image in the background. HF_TOKEN is forwarded by name (-e HF_TOKEN)
# so its value never appears on the command line.
# Alternative invocation with KV-cache and shared-memory settings, kept for reference:
#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
docker run -itd --entrypoint /bin/bash \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --privileged=true \
  --network host \
  -e HF_TOKEN \
  --name cpu-test cpu-test

# Run basic model test
# Mamba kernels and Danube3-4B are not supported on CPU, hence the ignores.
docker exec cpu-test bash -c "
  pip install pytest matplotlib einops transformers_stream_generator
  pytest -v -s tests/models -m \"not vlm\" \
  --ignore=tests/models/test_embedding.py \
  --ignore=tests/models/test_oot_registration.py \
  --ignore=tests/models/test_registry.py \
  --ignore=tests/models/test_jamba.py \
  --ignore=tests/models/test_mamba.py \
  --ignore=tests/models/test_danube3_4b.py"

# Online inference: start the OpenAI-compatible server in the background, wait
# up to 600 seconds for it to answer on port 8000, then run the serving benchmark.
docker exec cpu-test bash -c "
  python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
  timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
  python3 benchmarks/benchmark_serving.py \
    --backend vllm \
    --dataset-name random \
    --model facebook/opt-125m \
    --num-prompts 20 \
    --endpoint /v1/completions \
    --tokenizer facebook/opt-125m"
.buildkite/run-cpu-test.sh
View file @
ad385667
...
@@ -22,8 +22,25 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
...
@@ -22,8 +22,25 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
# Run basic model test
# Run basic model test
docker
exec
cpu-test bash
-c
"
docker
exec
cpu-test bash
-c
"
pip install pytest Pillow protobuf
pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
pytest -v -s tests/models -m
\"
not vlm
\"
--ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py"
# Mamba and Danube3-4B on CPU is not supported
pytest -v -s tests/models/encoder_decoder/language
pytest -v -s tests/models/decoder_only/language
\
--ignore=tests/models/test_fp8.py
\
--ignore=tests/models/decoder_only/language/test_jamba.py
\
--ignore=tests/models/decoder_only/language/test_mamba.py
\
--ignore=tests/models/decoder_only/language/test_granitemoe.py
\
--ignore=tests/models/decoder_only/language/test_danube3_4b.py"
# Mamba and Danube3-4B on CPU is not supported
# Run compressed-tensor test
docker
exec
cpu-test bash
-c
"
pytest -s -v
\
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup
\
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
# Run AWQ test
docker
exec
cpu-test bash
-c
"
pytest -s -v
\
tests/quantization/test_ipex_quant.py"
# online inference
# online inference
docker
exec
cpu-test bash
-c
"
docker
exec
cpu-test bash
-c
"
...
...
.buildkite/run-tpu-test.sh
View file @
ad385667
...
@@ -12,5 +12,4 @@ remove_docker_container
...
@@ -12,5 +12,4 @@ remove_docker_container
# For HF_TOKEN.
# For HF_TOKEN.
source
/etc/environment
source
/etc/environment
# Run a simple end-to-end example.
# Run a simple end-to-end example.
docker run
--privileged
--net
host
--shm-size
=
16G
-it
-e
HF_TOKEN
=
$HF_TOKEN
--name
tpu-test vllm-tpu
\
docker run
--privileged
--net
host
--shm-size
=
16G
-it
-e
HF_TOKEN
=
$HF_TOKEN
--name
tpu-test vllm-tpu /bin/bash
-c
"python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
python3 /workspace/vllm/examples/offline_inference_tpu.py
.buildkite/run-xpu-test.sh
View file @
ad385667
...
@@ -11,4 +11,4 @@ trap remove_docker_container EXIT
...
@@ -11,4 +11,4 @@ trap remove_docker_container EXIT
remove_docker_container
remove_docker_container
# Run the image and launch offline inference
# Run the image and launch offline inference
docker run
--network
host
--name
xpu-test
--device
/dev/dri
-v
/dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py
docker run
--network
host
--name
xpu-test
--device
/dev/dri
-v
/dev/dri/by-path:/dev/dri/by-path
--entrypoint
=
""
xpu-test python3 examples/offline_inference.py
.buildkite/test-pipeline.yaml
View file @
ad385667
This diff is collapsed.
Click to expand it.
.dockerignore
View file @
ad385667
/.venv
/build
dist
vllm/*.so
vllm/*.so
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
.mypy_cache
# Distribution / packaging
.Python
/build/
cmake-build-*/
CMakeUserPresets.json
develop-eggs/
/dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
Prev
1
2
3
4
5
6
…
49
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment