Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ad385667
Commit
ad385667
authored
Oct 23, 2024
by
zhuwenwen
Browse files
Merge branch 'v0.6.3.post1-dev'
parents
be0967c1
903593d3
Changes
364
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1283 additions
and
917 deletions
+1283
-917
.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+48
-10
.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
...kite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+357
-0
.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
.../nightly-benchmarks/scripts/run-performance-benchmarks.sh
+30
-29
.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
+0
-216
.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+0
-214
.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
+0
-221
.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
...ite/nightly-benchmarks/scripts/summary-nightly-results.py
+8
-1
.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+3
-1
.buildkite/nightly-benchmarks/tests/latency-tests.json
.buildkite/nightly-benchmarks/tests/latency-tests.json
+2
-2
.buildkite/nightly-benchmarks/tests/nightly-tests.json
.buildkite/nightly-benchmarks/tests/nightly-tests.json
+237
-30
.buildkite/nightly-benchmarks/tests/serving-tests.json
.buildkite/nightly-benchmarks/tests/serving-tests.json
+6
-6
.buildkite/nightly-benchmarks/tests/throughput-tests.json
.buildkite/nightly-benchmarks/tests/throughput-tests.json
+2
-2
.buildkite/release-pipeline.yaml
.buildkite/release-pipeline.yaml
+21
-7
.buildkite/run-amd-test.sh
.buildkite/run-amd-test.sh
+75
-5
.buildkite/run-cpu-test-ppc64le.sh
.buildkite/run-cpu-test-ppc64le.sh
+39
-0
.buildkite/run-cpu-test.sh
.buildkite/run-cpu-test.sh
+19
-2
.buildkite/run-tpu-test.sh
.buildkite/run-tpu-test.sh
+1
-2
.buildkite/run-xpu-test.sh
.buildkite/run-xpu-test.sh
+1
-1
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+402
-168
.dockerignore
.dockerignore
+32
-0
No files found.
Too many changes to show.
To preserve performance only
364 of 364+
files are displayed.
Plain diff
Email patch
.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
View file @
ad385667
...
...
@@ -8,6 +8,7 @@ main() {
(
which wget
&&
which curl
)
||
(
apt-get update
&&
apt-get
install
-y
wget curl
)
(
which jq
)
||
(
apt-get update
&&
apt-get
-y
install
jq
)
(
which zip
)
||
(
apt-get
install
-y
zip
)
if
[
!
-f
/workspace/buildkite-agent
]
;
then
echo
"buildkite-agent binary not found. Skip plotting the results."
...
...
@@ -24,17 +25,54 @@ main() {
ls
ls
results/
# generate figures
python3
-m
pip
install
tabulate pandas matplotlib
python3
$VLLM_SOURCE_CODE_LOC
/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
\
--description
$description
\
--results-folder
results/
# upload benchmark results
zip
-r
results.zip results/
/workspace/buildkite-agent artifact upload
"results.zip"
# upload benchmarking scripts
cd
$VLLM_SOURCE_CODE_LOC
/
zip
-r
nightly-benchmarks.zip .buildkite/ benchmarks/
/workspace/buildkite-agent artifact upload
"nightly-benchmarks.zip"
cd
$VLLM_SOURCE_CODE_LOC
/.buildkite/nightly-benchmarks/
# upload benchmarking pipeline
/workspace/buildkite-agent artifact upload
"nightly-pipeline.yaml"
cd
$VLLM_SOURCE_CODE_LOC
/.buildkite/nightly-benchmarks/
/workspace/buildkite-agent annotate
--style
"success"
--context
"nightly-benchmarks-results"
--append
< nightly-annotation.md
# The figures should be generated by a separate process outside the CI/CD pipeline
# # generate figures
# python3 -m pip install tabulate pandas matplotlib
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
# --description $description \
# --results-folder results/
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
# --description $description \
# --results-folder results/ \
# --dataset sharegpt
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
# --description $description \
# --results-folder results/ \
# --dataset sonnet_2048_128
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
# --description $description \
# --results-folder results/ \
# --dataset sonnet_128_2048
# upload results and figures
/workspace/buildkite-agent artifact upload
"nightly_results.png"
/workspace/buildkite-agent artifact upload
$VLLM_SOURCE_CODE_LOC
/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
/workspace/buildkite-agent artifact upload
$VLLM_SOURCE_CODE_LOC
/.buildkite/nightly-benchmarks/tests/nightly-tests.json
/workspace/buildkite-agent annotate
--style
"success"
--context
"nightly-benchmarks-results"
--append
< nightly_results.md
#
#
upload results and figures
#
/workspace/buildkite-agent artifact upload "nightly_results
*
.png"
#
/workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
#
/workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
#
/workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
}
main
"
$@
"
\ No newline at end of file
.buildkite/nightly-benchmarks/scripts/run-
lmdeploy-nightly
.sh
→
.buildkite/nightly-benchmarks/scripts/run-
nightly-benchmarks
.sh
View file @
ad385667
#!/bin/bash
set
-o
pipefail
set
-x
check_gpus
()
{
# check the number of GPUs and GPU type.
...
...
@@ -15,15 +16,66 @@ check_gpus() {
echo
"GPU type is
$gpu_type
"
}
kill_gpu_processes
()
{
pkill lmdeploy
||
true
# waiting for GPU processes to be fully killed
sleep
10
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage
=
$(
nvidia-smi
--query-gpu
=
memory.used
--format
=
csv,noheader,nounits
-i
0
)
# The memory usage should be 0 MB.
echo
"GPU 0 Memory Usage:
$gpu_memory_usage
MB"
#######################################
# Verify that HF_TOKEN is set and starts with "hf_".
# Globals:   HF_TOKEN (read)
# Outputs:   confirmation on stdout; error messages on stderr
# Returns:   exits the script with status 1 when the token is missing or
#            does not start with "hf_"
#######################################
check_hf_token() {
  # The checks below expand HF_TOKEN; with xtrace enabled every expansion is
  # written to the trace output, so tracing is suspended while they run.
  local xtrace_was_on=0
  if [[ $- == *x* ]]; then
    xtrace_was_on=1
    set +x
  fi

  if [[ -z "${HF_TOKEN:-}" ]]; then
    echo "Error: HF_TOKEN is not set." >&2
    exit 1
  elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
    echo "Error: HF_TOKEN does not start with 'hf_'." >&2
    exit 1
  fi

  # Restore tracing for the rest of the script.
  if (( xtrace_was_on )); then
    set -x
  fi
  echo "HF_TOKEN is set and valid."
}
#######################################
# Upload the benchmarking results to Buildkite as artifacts.
# Globals:   RESULTS_FOLDER (read) - directory holding the result files
# Outputs:   a notice on stdout when the agent binary is missing
# Returns:   0 when the upload is skipped; 1 when RESULTS_FOLDER is empty;
#            otherwise the status of buildkite-agent
#######################################
upload_to_buildkite() {
  local agent=/workspace/buildkite-agent

  # If the agent binary is not found, skip uploading the results.
  if [[ ! -f "$agent" ]]; then
    echo "buildkite-agent binary not found. Skip uploading the results."
    return 0
  fi

  # With RESULTS_FOLDER empty the pattern below would become "/*".
  if [[ -z "${RESULTS_FOLDER:-}" ]]; then
    echo "RESULTS_FOLDER is not set. Refusing to upload." >&2
    return 1
  fi

  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
  # The pattern is quoted, so the shell passes it to the agent unexpanded.
  "$agent" artifact upload "$RESULTS_FOLDER/*"
}
#######################################
# Detect which serving engine this container provides and export it.
# Probes are tried in this order: lmdeploy command, /tgi-entrypoint.sh,
# trtllm-build command, /sgl-workspace, /vllm-workspace.
# Globals:   CURRENT_LLM_SERVING_ENGINE (exported): lmdeploy | tgi | trt |
#            sglang | vllm
# Outputs:   "Container: <name>" on stdout; an error on stderr when no
#            engine is detected
# Returns:   0 when an engine was detected, 1 otherwise
#######################################
get_current_llm_serving_engine() {
  if command -v lmdeploy > /dev/null 2>&1; then
    echo "Container: lmdeploy"
    export CURRENT_LLM_SERVING_ENGINE=lmdeploy
    return 0
  fi

  if [[ -e /tgi-entrypoint.sh ]]; then
    echo "Container: tgi"
    export CURRENT_LLM_SERVING_ENGINE=tgi
    return 0
  fi

  if command -v trtllm-build > /dev/null 2>&1; then
    echo "Container: tensorrt-llm"
    export CURRENT_LLM_SERVING_ENGINE=trt
    return 0
  fi

  if [[ -e /sgl-workspace ]]; then
    echo "Container: sglang"
    export CURRENT_LLM_SERVING_ENGINE=sglang
    return 0
  fi

  if [[ -e /vllm-workspace ]]; then
    echo "Container: vllm"
    # NOTE(review): the original comment here mentioned moving to an unrelated
    # directory to avoid importing vllm from the current folder, but no
    # directory change is performed in this function.
    export CURRENT_LLM_SERVING_ENGINE=vllm
    return 0
  fi

  # Nothing matched: report it instead of returning success with the engine
  # variable left unset.
  echo "Error: could not detect the LLM serving engine of this container." >&2
  return 1
}
json2args
()
{
...
...
@@ -42,6 +94,19 @@ json2args() {
echo
"
$args
"
}
#######################################
# Kill the serving-engine processes and wait for the GPU memory to drain.
# Waits until the first GPU reported by nvidia-smi uses less than 1000 MB.
# Outputs:   a warning on stderr when the memory reading is not a number
# Returns:   0
#######################################
kill_gpu_processes() {
  local pattern used_mb

  # pkill -f matches against the full command line. A non-zero status only
  # means nothing matched, so it is ignored.
  for pattern in python python3 tritonserver pt_main_thread text-generation lmdeploy; do
    pkill -f "$pattern"
  done

  while true; do
    used_mb=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)
    used_mb=${used_mb//[[:space:]]/}

    # An empty or non-numeric reading (e.g. nvidia-smi failed) cannot be
    # compared; stop waiting instead of raising a test-expression error.
    if [[ ! "$used_mb" =~ ^[0-9]+$ ]]; then
      echo "Could not read GPU memory usage from nvidia-smi (got '$used_mb'); not waiting." >&2
      break
    fi

    if (( 10#$used_mb < 1000 )); then
      break
    fi
    sleep 1
  done
  return 0
}
wait_for_server
()
{
# wait for vllm server to start
# return 1 if vllm server crashes
...
...
@@ -51,6 +116,14 @@ wait_for_server() {
done'
&&
return
0
||
return
1
}
#######################################
# Ensure that the given command is available, installing it with apt-get
# when it is not.
# Arguments: $1 - command name (also used as the apt package name)
# Returns:   0 when the command is already available; 2 when no name was
#            given; otherwise the status of the apt-get commands
#######################################
ensure_installed() {
  local cmd=$1

  if [[ -z "$cmd" ]]; then
    echo "ensure_installed: missing command name" >&2
    return 2
  fi

  # `command -v` is a shell builtin, so the check does not depend on `which`
  # being installed.
  if ! command -v "$cmd" > /dev/null 2>&1; then
    apt-get update && apt-get install -y "$cmd"
  fi
}
run_serving_tests
()
{
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
...
...
@@ -68,10 +141,10 @@ run_serving_tests() {
echo
"Skip test case
$test_name
."
continue
fi
#
a
ppend
lmdeploy
to the test name
test_name
=
lmdeploy
_
$test_name
# p
re
pend
the current serving engine
to the test name
test_name
=
${
CURRENT_LLM_SERVING_ENGINE
}
_
$
{
test_name
}
# get common parameters
common_params
=
$(
echo
"
$params
"
| jq
-r
'.common_parameters'
)
model
=
$(
echo
"
$common_params
"
| jq
-r
'.model'
)
...
...
@@ -80,13 +153,11 @@ run_serving_tests() {
dataset_path
=
$(
echo
"
$common_params
"
| jq
-r
'.dataset_path'
)
port
=
$(
echo
"
$common_params
"
| jq
-r
'.port'
)
num_prompts
=
$(
echo
"
$common_params
"
| jq
-r
'.num_prompts'
)
reuse_server
=
$(
echo
"
$common_params
"
| jq
-r
'.reuse_server'
)
# get client and server arguments
server_params
=
$(
echo
"
$params
"
| jq
-r
'.lmdeploy_server_parameters'
)
client_params
=
$(
echo
"
$params
"
| jq
-r
'.lmdeploy_client_parameters'
)
server_args
=
$(
json2args
"
$server_params
"
)
server_params
=
$(
echo
"
$params
"
| jq
-r
".
${
CURRENT_LLM_SERVING_ENGINE
}
_server_parameters"
)
client_params
=
$(
echo
"
$params
"
| jq
-r
".
${
CURRENT_LLM_SERVING_ENGINE
}
_client_parameters"
)
client_args
=
$(
json2args
"
$client_params
"
)
qps_list
=
$(
echo
"
$params
"
| jq
-r
'.qps_list'
)
qps_list
=
$(
echo
"
$qps_list
"
| jq
-r
'.[] | @sh'
)
...
...
@@ -94,40 +165,44 @@ run_serving_tests() {
# check if there is enough GPU to run the test
if
[[
$gpu_count
-lt
$tp
]]
;
then
echo
"Required
tensor-parallel-size
$tp
but only
$gpu_count
GPU found. Skip testcase
$test_name
."
echo
"Required
num-shard
$tp
but only
$gpu_count
GPU found. Skip testcase
$test_name
."
continue
fi
# prepare tokenizer
rm
-rf
/tokenizer_cache
mkdir
/tokenizer_cache
python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
\
--model
"
$model
"
\
--cachedir
/tokenizer_cache
server_command
=
"lmdeploy serve api_server
$model
\
--tp
$tp
\
--server-port
$port
\
$server_args
"
# run the server
echo
"Running test case
$test_name
"
echo
"Server command:
$server_command
"
bash
-c
"
$server_command
"
&
if
[[
$reuse_server
==
"true"
]]
;
then
echo
"Reuse previous server for test case
$test_name
"
else
kill_gpu_processes
bash
$VLLM_SOURCE_CODE_LOC
/.buildkite/nightly-benchmarks/scripts/launch-server.sh
\
"
$server_params
"
"
$common_params
"
fi
# wait until the server is alive
wait_for_server
if
[
$?
-eq
0
]
;
then
echo
""
echo
"
lmdeploy
server is up and running."
echo
"
$CURRENT_LLM_SERVING_ENGINE
server is up and running."
else
echo
""
echo
"
lmdeploy
failed to start within the timeout period."
echo
"
$CURRENT_LLM_SERVING_ENGINE
failed to start within the timeout period."
break
fi
# get model name
model_name
=
$(
python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
)
# prepare tokenizer
# this is required for lmdeploy.
cd
$VLLM_SOURCE_CODE_LOC
/benchmarks
rm
-rf
/tokenizer_cache
mkdir
/tokenizer_cache
python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
\
--model
"
$model
"
\
--cachedir
/tokenizer_cache
cd
$VLLM_SOURCE_CODE_LOC
/benchmarks
# change model name for lmdeploy (it will not follow standard hf name)
if
[[
"
$CURRENT_LLM_SERVING_ENGINE
"
==
"lmdeploy"
]]
;
then
model
=
$(
python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
)
fi
# iterate over different QPS
for
qps
in
$qps_list
;
do
...
...
@@ -140,31 +215,79 @@ run_serving_tests() {
new_test_name
=
$test_name
"_qps_"
$qps
client_command
=
"python3 benchmark_serving.py
\
--backend lmdeploy
\
--tokenizer /tokenizer_cache
\
--dataset-name
$dataset_name
\
--dataset-path
$dataset_path
\
--num-prompts
$num_prompts
\
--port
$port
\
--save-result
\
--result-dir
$RESULTS_FOLDER
\
--result-filename
${
new_test_name
}
.json
\
--request-rate
$qps
\
--model
\"
$model_name
\"
\
$client_args
"
backend
=
$CURRENT_LLM_SERVING_ENGINE
if
[[
$backend
=
"trt"
]]
;
then
backend
=
"tensorrt-llm"
fi
if
[[
"
$backend
"
==
*
"vllm"
*
]]
;
then
backend
=
"vllm"
fi
if
[[
"
$dataset_name
"
=
"sharegpt"
]]
;
then
client_command
=
"python3 benchmark_serving.py
\
--backend
$backend
\
--tokenizer /tokenizer_cache
\
--model
$model
\
--dataset-name
$dataset_name
\
--dataset-path
$dataset_path
\
--num-prompts
$num_prompts
\
--port
$port
\
--save-result
\
--result-dir
$RESULTS_FOLDER
\
--result-filename
${
new_test_name
}
.json
\
--request-rate
$qps
\
--ignore-eos
\
$client_args
"
elif
[[
"
$dataset_name
"
=
"sonnet"
]]
;
then
sonnet_input_len
=
$(
echo
"
$common_params
"
| jq
-r
'.sonnet_input_len'
)
sonnet_output_len
=
$(
echo
"
$common_params
"
| jq
-r
'.sonnet_output_len'
)
sonnet_prefix_len
=
$(
echo
"
$common_params
"
| jq
-r
'.sonnet_prefix_len'
)
client_command
=
"python3 benchmark_serving.py
\
--backend
$backend
\
--tokenizer /tokenizer_cache
\
--model
$model
\
--dataset-name
$dataset_name
\
--dataset-path
$dataset_path
\
--num-prompts
$num_prompts
\
--sonnet-input-len
$sonnet_input_len
\
--sonnet-output-len
$sonnet_output_len
\
--sonnet-prefix-len
$sonnet_prefix_len
\
--port
$port
\
--save-result
\
--result-dir
$RESULTS_FOLDER
\
--result-filename
${
new_test_name
}
.json
\
--request-rate
$qps
\
--ignore-eos
\
$client_args
"
else
echo
"The dataset name must be either 'sharegpt' or 'sonnet'. Got
$dataset_name
."
exit
1
fi
echo
"Running test case
$test_name
with qps
$qps
"
echo
"Client command:
$client_command
"
eval
"
$client_command
"
server_command
=
"None"
# record the benchmarking commands
jq_output
=
$(
jq
-n
\
--arg
server
"
$server_command
"
\
--arg
client
"
$client_command
"
\
--arg
gpu
"
$gpu_type
"
\
--arg
engine
"
lmdeploy
"
\
--arg
engine
"
$CURRENT_LLM_SERVING_ENGINE
"
\
'{
server_command: $server,
client_command: $client,
...
...
@@ -175,42 +298,58 @@ run_serving_tests() {
done
# clean up
kill_gpu_processes
rm
-rf
/root/.cache/huggingface/
*
done
kill_gpu_processes
}
upload_to_buildkite
()
{
# upload the benchmarking results to buildkite
prepare_dataset
()
{
# if the agent binary is not found, skip uploading the results, exit 0
if
[
!
-f
/workspace/buildkite-agent
]
;
then
echo
"buildkite-agent binary not found. Skip uploading the results."
return
0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload
"
$RESULTS_FOLDER
/*"
}
# download sharegpt dataset
cd
$VLLM_SOURCE_CODE_LOC
/benchmarks
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
# duplicate sonnet by 4x, to allow benchmarking with input length 2048
cd
$VLLM_SOURCE_CODE_LOC
/benchmarks
echo
""
>
sonnet_4x.txt
for
_
in
{
1..4
}
do
cat
sonnet.txt
>>
sonnet_4x.txt
done
}
main
()
{
# check if the environment variable is successfully injected from yaml
check_gpus
# enter vllm directory
cd
$VLLM_SOURCE_CODE_LOC
/benchmarks
check_hf_token
get_current_llm_serving_engine
pip
install
-U
transformers
# check storage
df
-h
ensure_installed wget
ensure_installed curl
ensure_installed jq
prepare_dataset
cd
$VLLM_SOURCE_CODE_LOC
/benchmarks
declare
-g
RESULTS_FOLDER
=
results/
mkdir
-p
$RESULTS_FOLDER
BENCHMARK_ROOT
=
../.buildkite/nightly-benchmarks/
python
-m
pip
install
transformers
==
4.41.2
BENCHMARK_ROOT
=
$VLLM_SOURCE_CODE_LOC
/.buildkite/nightly-benchmarks/
export
CURRENT_LLM_SERVING_ENGINE
=
lmdeploy
# run the test
run_serving_tests
$BENCHMARK_ROOT
/tests/nightly-tests.json
python
-m
pip
install
tabulate pandas
python
$BENCHMARK_ROOT
/scripts/summary-nightly-results.py
# upload benchmark results to buildkite
python3
-m
pip
install
tabulate pandas
python3
$BENCHMARK_ROOT
/scripts/summary-nightly-results.py
upload_to_buildkite
}
...
...
.buildkite/nightly-benchmarks/
run
-benchmarks
-suite
.sh
→
.buildkite/nightly-benchmarks/
scripts/run-performance
-benchmarks.sh
View file @
ad385667
...
...
@@ -37,9 +37,9 @@ check_hf_token() {
ensure_sharegpt_downloaded
()
{
local
FILE
=
ShareGPT_V3_unfiltered_cleaned_split.json
if
[
!
-f
"
$FILE
"
]
;
then
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/
$FILE
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/
$FILE
else
echo
"
$FILE
already exists."
echo
"
$FILE
already exists."
fi
}
...
...
@@ -68,35 +68,38 @@ wait_for_server() {
done'
&&
return
0
||
return
1
}
kill_gpu_processes
()
{
# kill all processes on GPU.
pids
=
$(
nvidia-smi
--query-compute-apps
=
pid
--format
=
csv,noheader
)
if
[
-z
"
$pids
"
]
;
then
echo
"No GPU processes found."
kill_processes_launched_by_current_bash
()
{
# Kill all python processes launched from current bash script
current_shell_pid
=
$$
processes
=
$(
ps
-eo
pid,ppid,command |
awk
-v
ppid
=
"
$current_shell_pid
"
-v
proc
=
"
$1
"
'$2 == ppid && $3 ~ proc {print $1}'
)
if
[
-n
"
$processes
"
]
;
then
echo
"Killing the following processes matching '
$1
':"
echo
"
$processes
"
echo
"
$processes
"
| xargs
kill
-9
else
for
pid
in
$pids
;
do
kill
-9
"
$pid
"
echo
"Killed process with PID:
$pid
"
done
echo
"All GPU processes have been killed."
echo
"No processes found matching '
$1
'."
fi
}
kill_gpu_processes
()
{
# waiting for GPU processes to be fully killed
# loop while nvidia-smi returns any processes
while
[
-n
"
$(
nvidia-smi
--query-compute-apps
=
pid
--format
=
csv,noheader
)
"
]
;
do
ps
-aux
lsof
-t
-i
:8000 | xargs
-r
kill
-9
pkill
-f
pt_main_thread
# this line doesn't work now
# ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
pkill
-f
python3
pkill
-f
/usr/bin/python3
# wait until GPU memory usage smaller than 1GB
while
[
$(
nvidia-smi
--query-gpu
=
memory.used
--format
=
csv,noheader,nounits |
head
-n
1
)
-ge
1000
]
;
do
sleep
1
echo
"Waiting for GPU processes to be killed"
done
# remove vllm config file
rm
-rf
~/.config/vllm
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage
=
$(
nvidia-smi
--query-gpu
=
memory.used
--format
=
csv,noheader,nounits
-i
0
)
# The memory usage should be 0 MB.
echo
"GPU 0 Memory Usage:
$gpu_memory_usage
MB"
}
upload_to_buildkite
()
{
...
...
@@ -114,7 +117,7 @@ upload_to_buildkite() {
fi
# Use the determined command to annotate and upload artifacts
$BUILDKITE_AGENT_COMMAND
annotate
--style
"info"
--context
"
$BUILDKITE_LABEL
-benchmark-results"
<
$RESULTS_FOLDER
/benchmark_results.md
$BUILDKITE_AGENT_COMMAND
annotate
--style
"info"
--context
"
$BUILDKITE_LABEL
-benchmark-results"
<
$RESULTS_FOLDER
/benchmark_results.md
$BUILDKITE_AGENT_COMMAND
artifact upload
"
$RESULTS_FOLDER
/*"
}
...
...
@@ -166,7 +169,7 @@ run_latency_tests() {
latency_command: $latency,
gpu_type: $gpu
}'
)
echo
"
$jq_output
"
>
"
$RESULTS_FOLDER
/
$test_name
.commands"
echo
"
$jq_output
"
>
"
$RESULTS_FOLDER
/
$test_name
.commands"
# run the benchmark
eval
"
$latency_command
"
...
...
@@ -176,7 +179,6 @@ run_latency_tests() {
done
}
run_throughput_tests
()
{
# run throughput tests using `benchmark_throughput.py`
# $1: a json file specifying throughput test cases
...
...
@@ -224,7 +226,7 @@ run_throughput_tests() {
throughput_command: $command,
gpu_type: $gpu
}'
)
echo
"
$jq_output
"
>
"
$RESULTS_FOLDER
/
$test_name
.commands"
echo
"
$jq_output
"
>
"
$RESULTS_FOLDER
/
$test_name
.commands"
# run the benchmark
eval
"
$throughput_command
"
...
...
@@ -256,7 +258,6 @@ run_serving_tests() {
continue
fi
# get client and server arguments
server_params
=
$(
echo
"
$params
"
| jq
-r
'.server_parameters'
)
client_params
=
$(
echo
"
$params
"
| jq
-r
'.client_parameters'
)
...
...
@@ -334,7 +335,7 @@ run_serving_tests() {
client_command: $client,
gpu_type: $gpu
}'
)
echo
"
$jq_output
"
>
"
$RESULTS_FOLDER
/
${
new_test_name
}
.commands"
echo
"
$jq_output
"
>
"
$RESULTS_FOLDER
/
${
new_test_name
}
.commands"
done
...
...
@@ -351,6 +352,7 @@ main() {
# dependencies
(
which wget
&&
which curl
)
||
(
apt-get update
&&
apt-get
install
-y
wget curl
)
(
which jq
)
||
(
apt-get update
&&
apt-get
-y
install
jq
)
(
which lsof
)
||
(
apt-get update
&&
apt-get
install
-y
lsof
)
# get the current IP address, required by benchmark_serving.py
export
VLLM_HOST_IP
=
$(
hostname
-I
|
awk
'{print $1}'
)
...
...
@@ -369,7 +371,6 @@ main() {
run_latency_tests
$QUICK_BENCHMARK_ROOT
/tests/latency-tests.json
run_throughput_tests
$QUICK_BENCHMARK_ROOT
/tests/throughput-tests.json
# postprocess benchmarking results
pip
install
tabulate pandas
python3
$QUICK_BENCHMARK_ROOT
/scripts/convert-results-json-to-markdown.py
...
...
.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
deleted
100644 → 0
View file @
be0967c1
#!/bin/bash
set
-o
pipefail
#######################################
# Check the number of GPUs and the GPU type reported by nvidia-smi.
# Globals:   gpu_count (written) - number of lines printed by
#                                  `nvidia-smi --list-gpus`
#            gpu_type  (written) - second word of the first GPU's name
# Outputs:   status lines on stdout; the "no GPU" error on stderr
# Returns:   exits the script with status 1 when no GPU is listed
#######################################
check_gpus() {
  # Declared and assigned separately: `declare -g var=$(cmd)` always reports
  # the status of declare, hiding a failing command substitution.
  declare -g gpu_count
  gpu_count=$(nvidia-smi --list-gpus | wc -l)

  if [[ "$gpu_count" -gt 0 ]]; then
    echo "GPU found."
  else
    echo "Need at least 1 GPU to run benchmarking." >&2
    exit 1
  fi

  # Only the first GPU is inspected; e.g. a name of "NVIDIA A100-SXM4-80GB"
  # yields "A100-SXM4-80GB".
  declare -g gpu_type
  gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk 'NR == 1 { print $2 }')
  echo "GPU type is $gpu_type"
}
#######################################
# Stop the TGI server and report the memory still in use on GPU 0.
# Outputs:   "GPU 0 Memory Usage: <n> MB" on stdout
#######################################
kill_gpu_processes() {
  # pkill exits non-zero when nothing matched; that is not an error here.
  pkill text-generation || true

  # waiting for GPU processes to be fully killed
  sleep 10

  # Print the GPU memory usage so that we know if all GPU processes are
  # killed. The memory usage should be 0 MB.
  # Kept local so the reading does not leak into the script's global scope.
  local gpu_memory_usage
  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
#######################################
# Transform a JSON object into command line args; '_' in keys becomes '-'.
# Example:
#   input:  { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
#   output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
# Arguments: $1 - JSON object as a string
# Outputs:   the argument string on stdout
# Returns:   0 on success; the status of jq when the conversion fails
#            (nothing is printed in that case)
#######################################
json2args() {
  local json_string=$1
  local args

  # Assigned separately from `local` so a jq failure is not masked.
  args=$(
    printf '%s\n' "$json_string" | jq -r '
      to_entries |
      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
      join(" ")
    '
  ) || return

  echo "$args"
}
#######################################
# Wait until the server answers on /generate_stream, polling once a second.
# Arguments: $1 - port to poll (optional, default 8000)
#            $2 - timeout in seconds (optional, default 1200)
# Returns:   0 once curl succeeds, 1 when the timeout expires first
#######################################
wait_for_server() {
  local port=${1:-8000}
  local timeout_seconds=${2:-1200}

  # The port reaches the child shell as a positional argument instead of being
  # interpolated into the script text.
  if timeout "$timeout_seconds" bash -c '
    until curl -s "localhost:$1/generate_stream" > /dev/null; do
      sleep 1
    done' _ "$port"; then
    return 0
  fi
  return 1
}
#######################################
# Run serving tests against a TGI server using `benchmark_serving.py`.
# For every test case in the JSON file: start the server in the background,
# wait for it, run the client once per QPS value, record the commands used,
# then kill the server.
# Globals:   TEST_SELECTOR (read)  - optional regex; non-matching tests are skipped
#            gpu_count, gpu_type (read)
#            RESULTS_FOLDER (read) - where results and .commands files go
# Arguments: $1 - a json file specifying serving test cases
#######################################
run_serving_tests() {
  local serving_test_file
  serving_test_file=$1

  # Iterate over serving tests. The loop body runs in the pipeline's subshell,
  # so the variables assigned inside do not reach the caller.
  jq -c '.[]' "$serving_test_file" | while read -r params; do
    # get the test name
    test_name=$(echo "$params" | jq -r '.test_name')

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # prepend tgi to the test name
    test_name=tgi_$test_name

    # get common parameters
    common_params=$(echo "$params" | jq -r '.common_parameters')
    model=$(echo "$common_params" | jq -r '.model')
    tp=$(echo "$common_params" | jq -r '.tp')
    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
    port=$(echo "$common_params" | jq -r '.port')
    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')

    # get client and server arguments
    server_params=$(echo "$params" | jq -r '.tgi_server_parameters')
    client_params=$(echo "$params" | jq -r '.tgi_client_parameters')
    server_args=$(json2args "$server_params")
    client_args=$(json2args "$client_params")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"

    # check if there is enough GPU to run the test
    if [[ "$gpu_count" -lt "$tp" ]]; then
      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    # The presence of the "fp8" key (whatever its value) enables fp8 quantization.
    if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
      echo "Key 'fp8' exists in common params."
      server_command="/tgi-entrypoint.sh \
        --model-id $model \
        --num-shard $tp \
        --port $port \
        --quantize fp8 \
        $server_args"
    else
      echo "Key 'fp8' does not exist in common params."
      server_command="/tgi-entrypoint.sh \
        --model-id $model \
        --num-shard $tp \
        --port $port \
        $server_args"
    fi

    # run the server
    echo "Running test case $test_name"
    echo "Server command: $server_command"
    eval "$server_command" &

    # wait until the server is alive
    if wait_for_server; then
      echo ""
      echo "tgi server is up and running."
    else
      echo ""
      echo "tgi failed to start within the timeout period."
      # The server was started in the background above; stop it before giving
      # up so it does not keep running after this function returns.
      kill_gpu_processes
      break
    fi

    # iterate over different QPS (word splitting of $qps_list is intended)
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps="inf"
        echo "now qps is $qps"
      fi

      new_test_name="${test_name}_qps_${qps}"

      client_command="python3 benchmark_serving.py \
        --backend tgi \
        --model $model \
        --dataset-name $dataset_name \
        --dataset-path $dataset_path \
        --num-prompts $num_prompts \
        --port $port \
        --save-result \
        --result-dir $RESULTS_FOLDER \
        --result-filename ${new_test_name}.json \
        --request-rate $qps \
        $client_args"

      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"

      eval "$client_command"

      # record the benchmarking commands
      jq_output=$(jq -n \
        --arg server "$server_command" \
        --arg client "$client_command" \
        --arg gpu "$gpu_type" \
        --arg engine "tgi" \
        '{
          server_command: $server,
          client_command: $client,
          gpu_type: $gpu,
          engine: $engine
        }')
      echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
    done

    # clean up
    kill_gpu_processes
    rm -rf /root/.cache/huggingface/*
  done
}
#######################################
# Upload the benchmarking results to Buildkite as artifacts.
# Globals:   RESULTS_FOLDER (read) - directory holding the result files
# Outputs:   a notice on stdout when the agent binary is missing
# Returns:   0 when the upload is skipped; 1 when RESULTS_FOLDER is empty;
#            otherwise the status of buildkite-agent
#######################################
upload_to_buildkite() {
  local agent=/workspace/buildkite-agent

  # If the agent binary is not found, skip uploading the results.
  if [[ ! -f "$agent" ]]; then
    echo "buildkite-agent binary not found. Skip uploading the results."
    return 0
  fi

  # With RESULTS_FOLDER empty the pattern below would become "/*".
  if [[ -z "${RESULTS_FOLDER:-}" ]]; then
    echo "RESULTS_FOLDER is not set. Refusing to upload." >&2
    return 1
  fi

  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
  # The pattern is quoted, so the shell passes it to the agent unexpanded.
  "$agent" artifact upload "$RESULTS_FOLDER/*"
}
#######################################
# Entry point of the TGI nightly benchmark: check GPUs, run the serving
# tests, summarize the results and upload them.
# Globals:   VLLM_SOURCE_CODE_LOC (read, required)
#            RESULTS_FOLDER, BENCHMARK_ROOT (written)
#            CURRENT_LLM_SERVING_ENGINE (exported, set to "tgi")
#######################################
main() {
  check_gpus

  # enter vllm directory; abort instead of continuing in the wrong directory
  # when the variable is unset or the directory is missing.
  cd "${VLLM_SOURCE_CODE_LOC:?VLLM_SOURCE_CODE_LOC must be set}/benchmarks" || exit 1

  declare -g RESULTS_FOLDER=results/
  mkdir -p "$RESULTS_FOLDER"
  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/

  export CURRENT_LLM_SERVING_ENGINE=tgi
  run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"

  python -m pip install tabulate pandas
  python "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
  upload_to_buildkite
}

main "$@"
.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
deleted
100644 → 0
View file @
be0967c1
#!/bin/bash
set
-o
pipefail
#######################################
# Check the number of GPUs and the GPU type reported by nvidia-smi.
# Globals:   gpu_count (written) - number of lines printed by
#                                  `nvidia-smi --list-gpus`
#            gpu_type  (written) - second word of the first GPU's name
# Outputs:   status lines on stdout; the "no GPU" error on stderr
# Returns:   exits the script with status 1 when no GPU is listed
#######################################
check_gpus() {
  # Declared and assigned separately: `declare -g var=$(cmd)` always reports
  # the status of declare, hiding a failing command substitution.
  declare -g gpu_count
  gpu_count=$(nvidia-smi --list-gpus | wc -l)

  if [[ "$gpu_count" -gt 0 ]]; then
    echo "GPU found."
  else
    echo "Need at least 1 GPU to run benchmarking." >&2
    exit 1
  fi

  # Only the first GPU is inspected; e.g. a name of "NVIDIA A100-SXM4-80GB"
  # yields "A100-SXM4-80GB".
  declare -g gpu_type
  gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk 'NR == 1 { print $2 }')
  echo "GPU type is $gpu_type"
}
#######################################
# Stop tritonserver and report the memory still in use on GPU 0.
# Outputs:   "GPU 0 Memory Usage: <n> MB" on stdout
#######################################
kill_gpu_processes() {
  # pkill exits non-zero when nothing matched; that is not an error here.
  pkill tritonserver || true

  # waiting for GPU processes to be fully killed
  sleep 20

  # Print the GPU memory usage so that we know if all GPU processes are
  # killed. The memory usage should be 0 MB.
  # Kept local so the reading does not leak into the script's global scope.
  local gpu_memory_usage
  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
#######################################
# Transform a JSON object into command line args; '_' in keys becomes '-'.
# Example:
#   input:  { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
#   output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
# Arguments: $1 - JSON object as a string
# Outputs:   the argument string on stdout
# Returns:   0 on success; the status of jq when the conversion fails
#            (nothing is printed in that case)
#######################################
json2args() {
  local json_string=$1
  local args

  # Assigned separately from `local` so a jq failure is not masked.
  args=$(
    printf '%s\n' "$json_string" | jq -r '
      to_entries |
      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
      join(" ")
    '
  ) || return

  echo "$args"
}
#######################################
# Wait until the server answers on /generate_stream, polling once a second.
# Arguments: $1 - port to poll (optional, default 8000)
#            $2 - timeout in seconds (optional, default 1200)
# Returns:   0 once curl succeeds, 1 when the timeout expires first
#######################################
wait_for_server() {
  local port=${1:-8000}
  local timeout_seconds=${2:-1200}

  # The port reaches the child shell as a positional argument instead of being
  # interpolated into the script text.
  if timeout "$timeout_seconds" bash -c '
    until curl -s "localhost:$1/generate_stream" > /dev/null; do
      sleep 1
    done' _ "$port"; then
    return 0
  fi
  return 1
}
#######################################
# Run serving tests against a TensorRT-LLM server using `benchmark_serving.py`.
# For every test case in the JSON file: launch the server through
# launch-trt-server.sh, wait for it, download the tokenizer, run the client
# once per QPS value, record the commands used, then kill the server.
# Globals:   TEST_SELECTOR (read)  - optional regex; non-matching tests are skipped
#            gpu_count, gpu_type (read)
#            VLLM_SOURCE_CODE_LOC (read)
#            RESULTS_FOLDER (read) - where results and .commands files go
# Arguments: $1 - a json file specifying serving test cases
#######################################
run_serving_tests() {
  local serving_test_file
  serving_test_file=$1

  # Iterate over serving tests. The loop body runs in the pipeline's subshell,
  # so the variables assigned inside do not reach the caller.
  jq -c '.[]' "$serving_test_file" | while read -r params; do
    # get the test name
    test_name=$(echo "$params" | jq -r '.test_name')

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # prepend trt to the test name
    test_name=trt_$test_name

    # get common parameters
    common_params=$(echo "$params" | jq -r '.common_parameters')
    model=$(echo "$common_params" | jq -r '.model')
    tp=$(echo "$common_params" | jq -r '.tp')
    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
    port=$(echo "$common_params" | jq -r '.port')
    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')

    # get client and server arguments
    server_params=$(echo "$params" | jq -r '.trt_server_parameters')
    client_params=$(echo "$params" | jq -r '.trt_client_parameters')
    client_args=$(json2args "$client_params")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"

    # check if there is enough GPU to run the test
    if [[ "$gpu_count" -lt "$tp" ]]; then
      echo "Required model_tp_size $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    # Nothing has been launched yet, so a failed cd can simply stop the loop.
    cd "$VLLM_SOURCE_CODE_LOC/benchmarks" || break

    echo "Running test case $test_name"
    bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params"

    # wait until the server is alive
    if wait_for_server; then
      echo ""
      echo "trt server is up and running."
    else
      echo ""
      echo "trt failed to start within the timeout period."
      # Stop whatever the launch script started before giving up, so it does
      # not keep running after this function returns.
      kill_gpu_processes
      break
    fi

    # prepare tokenizer
    cd "$VLLM_SOURCE_CODE_LOC/benchmarks" || { kill_gpu_processes; break; }
    rm -rf /tokenizer_cache
    mkdir /tokenizer_cache
    python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
      --model "$model" \
      --cachedir /tokenizer_cache
    cd "$VLLM_SOURCE_CODE_LOC/benchmarks" || { kill_gpu_processes; break; }

    # iterate over different QPS (word splitting of $qps_list is intended)
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps="inf"
        echo "now qps is $qps"
      fi

      new_test_name="${test_name}_qps_${qps}"

      client_command="python3 benchmark_serving.py \
        --backend tensorrt-llm \
        --tokenizer /tokenizer_cache \
        --model $model \
        --dataset-name $dataset_name \
        --dataset-path $dataset_path \
        --num-prompts $num_prompts \
        --port $port \
        --save-result \
        --result-dir $RESULTS_FOLDER \
        --result-filename ${new_test_name}.json \
        --request-rate $qps \
        $client_args"

      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"

      eval "$client_command"

      # The server is started by launch-trt-server.sh, so there is no server
      # command line to record here.
      server_command=""

      # record the benchmarking commands
      jq_output=$(jq -n \
        --arg server "$server_command" \
        --arg client "$client_command" \
        --arg gpu "$gpu_type" \
        --arg engine "trt" \
        '{
          server_command: $server,
          client_command: $client,
          gpu_type: $gpu,
          engine: $engine
        }')
      echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
    done

    # clean up
    kill_gpu_processes
    rm -rf /root/.cache/huggingface/*
  done
}
#######################################
# Upload the benchmarking results to Buildkite as artifacts.
# Globals:   RESULTS_FOLDER (read) - directory holding the result files
# Outputs:   a notice on stdout when the agent binary is missing
# Returns:   0 when the upload is skipped; 1 when RESULTS_FOLDER is empty;
#            otherwise the status of buildkite-agent
#######################################
upload_to_buildkite() {
  local agent=/workspace/buildkite-agent

  # If the agent binary is not found, skip uploading the results.
  if [[ ! -f "$agent" ]]; then
    echo "buildkite-agent binary not found. Skip uploading the results."
    return 0
  fi

  # With RESULTS_FOLDER empty the pattern below would become "/*".
  if [[ -z "${RESULTS_FOLDER:-}" ]]; then
    echo "RESULTS_FOLDER is not set. Refusing to upload." >&2
    return 1
  fi

  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
  # The pattern is quoted, so the shell passes it to the agent unexpanded.
  "$agent" artifact upload "$RESULTS_FOLDER/*"
}
main() {
  # Entry point of the TensorRT-LLM nightly benchmark.
  # Globals: VLLM_SOURCE_CODE_LOC (read), RESULTS_FOLDER, BENCHMARK_ROOT,
  #          CURRENT_LLM_SERVING_ENGINE (written)
  # Returns: 1 if the benchmarks directory cannot be entered, otherwise
  #          the status of the final upload step.
  check_gpus

  # enter vllm directory; every path below is relative to it, so a failed
  # cd must stop the run instead of benchmarking from the wrong location
  cd "$VLLM_SOURCE_CODE_LOC/benchmarks" || {
    echo "Cannot enter $VLLM_SOURCE_CODE_LOC/benchmarks" >&2
    return 1
  }

  declare -g RESULTS_FOLDER=results/
  mkdir -p "$RESULTS_FOLDER"
  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/

  # update transformers package, to make sure mixtral tokenizer is available
  python -m pip install transformers -U

  export CURRENT_LLM_SERVING_ENGINE=trt
  run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"

  # dependencies of the summary script
  python -m pip install tabulate pandas
  python "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
  upload_to_buildkite
}
main
"
$@
"
.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
deleted
100644 → 0
View file @
be0967c1
#!/bin/bash
set
-o
pipefail
check_gpus() {
  # Check the number of GPUs and the GPU type.
  # Globals: gpu_count, gpu_type (written)
  # Outputs: progress messages on stdout
  # Exits the script with status 1 when nvidia-smi lists no GPU.

  # Declare first, assign afterwards: `declare -g x=$(cmd)` would replace
  # the command's status with that of `declare`.
  declare -g gpu_count gpu_type

  gpu_count=$(nvidia-smi --list-gpus | wc -l)
  if [[ $gpu_count -gt 0 ]]; then
    echo "GPU found."
  else
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi

  # Second whitespace-separated field of the first reported GPU name.
  # awk reads the whole stream, so nvidia-smi never sees a closed pipe.
  gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader \
    | awk 'NR == 1 { print $2 }')
  echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
  # Stop leftover benchmark processes and report GPU 0 memory usage.
  # Only processes matched by `pkill pt_main_thread` are signalled; nothing
  # here enumerates or kills other GPU users.
  # Outputs: one line with the memory in use on GPU 0
  local gpu_memory_usage

  pkill pt_main_thread
  # fixed pause before the memory reading below
  sleep 10

  # remove vllm config file
  rm -rf ~/.config/vllm

  # Print the GPU memory usage
  # so that we know if all GPU processes are killed.
  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
  # The memory usage should be 0 MB.
  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
  # Transform a flat JSON object into command line args; '_' in keys is
  # replaced by '-'.
  # example:
  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
  # An empty-string value yields a bare flag followed by a space
  # (e.g. "--disable-log-stats ").
  # Arguments: $1 - JSON object as a string
  # Outputs:   the argument string on stdout
  # Returns:   jq's exit status when the JSON cannot be processed
  local json_string=$1
  local args

  # Assignment is kept separate from `local` so a jq failure is not masked.
  args=$(jq -r '
    to_entries |
    map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
    join(" ")
  ' <<<"$json_string") || return
  printf '%s\n' "$args"
}
wait_for_server() {
  # Poll the completions endpoint once per second until curl succeeds.
  # Arguments: $1 - port to poll (optional, default 8000)
  #            $2 - overall time limit in seconds (optional, default 1200)
  # Returns:   0 once the endpoint answers, 1 when the time limit expires.
  # A crashed server is not detected separately; it only shows up as the
  # time limit running out.
  local port=${1:-8000}
  local limit=${2:-1200}

  # The port is handed to the inner shell as a positional parameter so the
  # single-quoted script needs no string interpolation.
  if timeout "$limit" bash -c '
    until curl -s "localhost:$1/v1/completions" > /dev/null; do
      sleep 1
    done' _ "$port"; then
    return 0
  fi
  return 1
}
run_serving_tests() {
  # run serving tests using `benchmark_serving.py`
  # $1: a json file specifying serving test cases
  #
  # For every test case: start a vllm OpenAI-compatible server in the
  # background, wait for it, run the client once per entry of qps_list,
  # record the commands used, then clean up.
  # Globals: TEST_SELECTOR, gpu_count, gpu_type, RESULTS_FOLDER (read)
  # Outputs: progress on stdout; <name>.json (written by the client) and
  #          <name>.commands files under $RESULTS_FOLDER

  local serving_test_file
  serving_test_file=$1

  # The loop body runs in the pipeline's subshell; these declarations only
  # make the working set explicit.
  local params test_name common_params model tp dataset_name dataset_path
  local port num_prompts server_params client_params server_args client_args
  local qps_list qps new_test_name server_command client_command jq_output

  # Iterate over serving tests
  jq -c '.[]' "$serving_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    # (the selector is left unquoted so it is treated as a regex)
    if [[ -n "$TEST_SELECTOR" && ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # append vllm to the test name
    test_name=vllm_$test_name

    # get common parameters
    common_params=$(echo "$params" | jq -r '.common_parameters')
    model=$(echo "$common_params" | jq -r '.model')
    tp=$(echo "$common_params" | jq -r '.tp')
    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
    port=$(echo "$common_params" | jq -r '.port')
    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')

    # get client and server arguments
    server_params=$(echo "$params" | jq -r '.vllm_server_parameters')
    client_params=$(echo "$params" | jq -r '.vllm_client_parameters')
    server_args=$(json2args "$server_params")
    client_args=$(json2args "$client_params")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    # @sh shell-quotes string entries, so "inf" arrives as 'inf'
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"

    # check if there is enough GPU to run the test
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    # The fp8 key only swaps the model; the server command line itself is
    # the same in both cases, so it is built once below.
    if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
      echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
      model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
    else
      echo "Key 'fp8' does not exist in common params."
    fi
    server_command="python3 \
      -m vllm.entrypoints.openai.api_server \
      -tp $tp \
      --model $model \
      --port $port \
      $server_args"

    # run the server
    echo "Running test case $test_name"
    echo "Server command: $server_command"
    eval "$server_command" &

    # wait until the server is alive
    if wait_for_server; then
      echo ""
      echo "vllm server is up and running."
    else
      echo ""
      echo "vllm failed to start within the timeout period."
      # do not leave the backgrounded server behind when giving up
      kill_gpu_processes
      break
    fi

    # iterate over different QPS
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps="inf"
        echo "now qps is $qps"
      fi

      new_test_name=$test_name"_qps_"$qps

      client_command="python3 benchmark_serving.py \
        --backend vllm \
        --model $model \
        --dataset-name $dataset_name \
        --dataset-path $dataset_path \
        --num-prompts $num_prompts \
        --port $port \
        --save-result \
        --result-dir $RESULTS_FOLDER \
        --result-filename ${new_test_name}.json \
        --request-rate $qps \
        $client_args"

      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"

      eval "$client_command"

      # record the benchmarking commands
      jq_output=$(jq -n \
        --arg server "$server_command" \
        --arg client "$client_command" \
        --arg gpu "$gpu_type" \
        --arg engine "vllm" \
        '{
          server_command: $server,
          client_command: $client,
          gpu_type: $gpu,
          engine: $engine
        }')
      echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"

    done

    # clean up
    kill_gpu_processes
    rm -rf /root/.cache/huggingface/*
  done
}
upload_to_buildkite() {
  # Publish everything under $RESULTS_FOLDER as Buildkite artifacts.
  # Globals: RESULTS_FOLDER (read)
  # Returns: 0 when the agent binary is missing (upload is skipped),
  #          otherwise the exit status of the agent.
  local agent_bin=/workspace/buildkite-agent

  if [[ -f "$agent_bin" ]]; then
    # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
    # The glob stays inside the quotes, so the pattern reaches the agent
    # unexpanded (presumably the agent resolves it itself -- see its docs).
    "$agent_bin" artifact upload "$RESULTS_FOLDER/*"
    return
  fi

  echo "buildkite-agent binary not found. Skip uploading the results."
  return 0
}
main() {
  # Entry point of the vllm nightly benchmark.
  # Globals: VLLM_SOURCE_CODE_LOC (read), RESULTS_FOLDER, BENCHMARK_ROOT,
  #          CURRENT_LLM_SERVING_ENGINE (written)
  # Returns: 1 if the benchmarks directory cannot be entered, otherwise
  #          the status of the final upload step.
  check_gpus

  # enter vllm directory; every path below is relative to it, so a failed
  # cd must stop the run instead of benchmarking from the wrong location
  cd "$VLLM_SOURCE_CODE_LOC/benchmarks" || {
    echo "Cannot enter $VLLM_SOURCE_CODE_LOC/benchmarks" >&2
    return 1
  }

  declare -g RESULTS_FOLDER=results/
  mkdir -p "$RESULTS_FOLDER"
  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/

  export CURRENT_LLM_SERVING_ENGINE=vllm
  run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"

  # dependencies of the summary script
  python3 -m pip install tabulate pandas
  python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
  upload_to_buildkite
}
main
"
$@
"
.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
View file @
ad385667
...
...
@@ -17,10 +17,17 @@ serving_column_mapping = {
"request_throughput"
:
"Tput (req/s)"
,
"mean_ttft_ms"
:
"Mean TTFT (ms)"
,
"std_ttft_ms"
:
"Std TTFT (ms)"
,
"median_ttft_ms"
:
"Median TTFT (ms)"
,
"mean_itl_ms"
:
"Mean ITL (ms)"
,
"std_itl_ms"
:
"Std ITL (ms)"
,
"input_throughput"
:
"Input Tput (tok/s)"
,
"median_itl_ms"
:
"Median ITL (ms)"
,
"mean_tpot_ms"
:
"Mean TPOT (ms)"
,
"std_tpot_ms"
:
"Std TPOT (ms)"
,
"median_tpot_ms"
:
"Median TPOT (ms)"
,
"total_token_throughput"
:
"Total Token Tput (tok/s)"
,
"output_throughput"
:
"Output Tput (tok/s)"
,
"total_input_tokens"
:
"Total input tokens"
,
"total_output_tokens"
:
"Total output tokens"
,
"engine"
:
"Engine"
,
}
...
...
.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
View file @
ad385667
...
...
@@ -2,9 +2,11 @@
TOKEN
=
$(
curl
-s
-L
"https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull"
| jq
-r
.token
)
URL
=
"https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/
$BUILDKITE_COMMIT
"
TIMEOUT_SECONDS
=
10
retries
=
0
while
[
$retries
-lt
1000
]
;
do
if
[
$(
curl
-s
-L
-H
"Authorization: Bearer
$TOKEN
"
-o
/dev/null
-w
"%{http_code}"
$URL
)
-eq
200
]
;
then
if
[
$(
curl
-s
--max-time
$TIMEOUT_SECONDS
-L
-H
"Authorization: Bearer
$TOKEN
"
-o
/dev/null
-w
"%{http_code}"
$URL
)
-eq
200
]
;
then
exit
0
fi
...
...
.buildkite/nightly-benchmarks/tests/latency-tests.json
View file @
ad385667
...
...
@@ -2,7 +2,7 @@
{
"test_name"
:
"latency_llama8B_tp1"
,
"parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-8B"
,
"model"
:
"meta-llama/Meta-Llama-3
.1
-8B
-Instruct
"
,
"tensor_parallel_size"
:
1
,
"load_format"
:
"dummy"
,
"num_iters_warmup"
:
5
,
...
...
@@ -12,7 +12,7 @@
{
"test_name"
:
"latency_llama70B_tp4"
,
"parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-70B-Instruct"
,
"model"
:
"meta-llama/Meta-Llama-3
.1
-70B-Instruct"
,
"tensor_parallel_size"
:
4
,
"load_format"
:
"dummy"
,
"num-iters-warmup"
:
5
,
...
...
.buildkite/nightly-benchmarks/tests/nightly-tests.json
View file @
ad385667
[
{
"test_name"
:
"llama8B_tp1"
,
"qps_list"
:
[
4
],
"test_name"
:
"llama8B_tp1
_sharegpt
"
,
"qps_list"
:
[
4
,
8
,
16
,
32
,
"inf"
],
"common_parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-8B"
,
"model"
:
"meta-llama/Meta-Llama-3-8B
-Instruct
"
,
"tp"
:
1
,
"dataset_name"
:
"sharegpt"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
"num_prompts"
:
500
,
"port"
:
8000
"port"
:
8000
,
"reuse_server"
:
false
},
"lmdeploy_server_parameters"
:
{
"dtype"
:
"bfloat16"
},
"lmdeploy_client_parameters"
:
{
},
...
...
@@ -21,34 +23,158 @@
},
"trt_server_parameters"
:
{
"model_type"
:
"llama"
,
"model_dtype"
:
"float16"
,
"max_batch_size"
:
2
56
,
"model_dtype"
:
"
b
float16"
,
"max_batch_size"
:
2
048
,
"max_input_len"
:
4096
,
"max_output_len"
:
4096
,
"trt_llm_version"
:
"r24.04"
"max_seq_len"
:
6144
,
"max_num_tokens"
:
16384
,
"trt_llm_version"
:
"v0.11.0"
},
"trt_client_parameters"
:
{
"endpoint"
:
"/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters"
:
{
"disable_log_stats"
:
""
,
"disable_log_requests"
:
""
,
"gpu_memory_utilization"
:
0.9
,
"num_scheduler_steps"
:
10
,
"max_num_seqs"
:
512
,
"dtype"
:
"bfloat16"
},
"vllm_client_parameters"
:
{
},
"sglang_server_parameters"
:
{
"disable_radix_cache"
:
""
,
"enable_torch_compile"
:
""
,
"dtype"
:
"bfloat16"
},
"sglang_client_parameters"
:
{
}
},
{
"test_name"
:
"llama8B_tp1_sonnet_512_16"
,
"qps_list"
:
[
4
,
8
,
16
,
32
,
"inf"
],
"common_parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-8B-Instruct"
,
"tp"
:
1
,
"dataset_name"
:
"sonnet"
,
"dataset_path"
:
"./sonnet_4x.txt"
,
"num_prompts"
:
500
,
"port"
:
8000
,
"sonnet_input_len"
:
512
,
"sonnet_output_len"
:
16
,
"sonnet_prefix_len"
:
50
,
"reuse_server"
:
true
},
"lmdeploy_server_parameters"
:
{
"dtype"
:
"bfloat16"
},
"lmdeploy_client_parameters"
:
{
},
"tgi_server_parameters"
:
{
},
"tgi_client_parameters"
:
{
"endpoint"
:
"/generate_stream"
},
"trt_server_parameters"
:
{
"model_type"
:
"llama"
,
"model_dtype"
:
"bfloat16"
,
"max_batch_size"
:
2048
,
"max_input_len"
:
4096
,
"max_seq_len"
:
6144
,
"max_num_tokens"
:
16384
,
"trt_llm_version"
:
"v0.11.0"
},
"trt_client_parameters"
:
{
"endpoint"
:
"/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters"
:
{
"disable_log_stats"
:
""
,
"disable_log_requests"
:
""
,
"gpu_memory_utilization"
:
0.9
,
"num_scheduler_steps"
:
10
,
"max_num_seqs"
:
512
,
"dtype"
:
"bfloat16"
},
"vllm_client_parameters"
:
{
},
"sglang_server_parameters"
:
{
"disable_radix_cache"
:
""
,
"enable_torch_compile"
:
""
,
"dtype"
:
"bfloat16"
},
"sglang_client_parameters"
:
{
}
},
{
"test_name"
:
"llama8B_tp1_sonnet_512_256"
,
"qps_list"
:
[
4
,
8
,
16
,
32
,
"inf"
],
"common_parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-8B-Instruct"
,
"tp"
:
1
,
"dataset_name"
:
"sonnet"
,
"dataset_path"
:
"./sonnet_4x.txt"
,
"num_prompts"
:
500
,
"port"
:
8000
,
"sonnet_input_len"
:
512
,
"sonnet_output_len"
:
256
,
"sonnet_prefix_len"
:
50
,
"reuse_server"
:
true
},
"lmdeploy_server_parameters"
:
{
"dtype"
:
"bfloat16"
},
"lmdeploy_client_parameters"
:
{
},
"tgi_server_parameters"
:
{
},
"tgi_client_parameters"
:
{
"endpoint"
:
"/generate_stream"
},
"trt_server_parameters"
:
{
"model_type"
:
"llama"
,
"model_dtype"
:
"bfloat16"
,
"max_batch_size"
:
2048
,
"max_input_len"
:
4096
,
"max_seq_len"
:
6144
,
"max_num_tokens"
:
16384
,
"trt_llm_version"
:
"v0.11.0"
},
"trt_client_parameters"
:
{
"endpoint"
:
"/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters"
:
{
"disable_log_stats"
:
""
,
"disable_log_requests"
:
""
"disable_log_requests"
:
""
,
"gpu_memory_utilization"
:
0.9
,
"num_scheduler_steps"
:
10
,
"max_num_seqs"
:
512
,
"dtype"
:
"bfloat16"
},
"vllm_client_parameters"
:
{
},
"sglang_server_parameters"
:
{
"disable_radix_cache"
:
""
,
"enable_torch_compile"
:
""
,
"dtype"
:
"bfloat16"
},
"sglang_client_parameters"
:
{
}
},
{
"test_name"
:
"llama70B_tp4"
,
"qps_list"
:
[
2
],
"test_name"
:
"llama70B_tp4
_sharegpt
"
,
"qps_list"
:
[
4
,
8
,
16
,
32
,
"inf"
],
"common_parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-70B-Instruct"
,
"tp"
:
4
,
"dataset_name"
:
"sharegpt"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
"num_prompts"
:
500
,
"port"
:
8000
"port"
:
8000
,
"reuse_server"
:
false
},
"lmdeploy_server_parameters"
:
{
"dtype"
:
"bfloat16"
},
"lmdeploy_client_parameters"
:
{
},
...
...
@@ -59,34 +185,50 @@
},
"trt_server_parameters"
:
{
"model_type"
:
"llama"
,
"model_dtype"
:
"float16"
,
"max_batch_size"
:
2
56
,
"model_dtype"
:
"
b
float16"
,
"max_batch_size"
:
2
048
,
"max_input_len"
:
4096
,
"max_output_len"
:
4096
,
"trt_llm_version"
:
"r24.04"
"max_seq_len"
:
6144
,
"max_num_tokens"
:
16384
,
"trt_llm_version"
:
"v0.11.0"
},
"trt_client_parameters"
:
{
"endpoint"
:
"/v2/models/ensemble/generate_stream"
},
},
"vllm_server_parameters"
:
{
"disable_log_stats"
:
""
,
"disable_log_requests"
:
""
"disable_log_requests"
:
""
,
"gpu_memory_utilization"
:
0.9
,
"num_scheduler_steps"
:
10
,
"max_num_seqs"
:
512
,
"dtype"
:
"bfloat16"
},
"vllm_client_parameters"
:
{
},
"sglang_server_parameters"
:
{
"disable_radix_cache"
:
""
,
"dtype"
:
"bfloat16"
},
"sglang_client_parameters"
:
{
}
},
{
"test_name"
:
"
mixtral8x7B_tp2
"
,
"qps_list"
:
[
2
],
"test_name"
:
"
llama70B_tp4_sonnet_512_16
"
,
"qps_list"
:
[
4
,
8
,
16
,
32
,
"inf"
],
"common_parameters"
:
{
"model"
:
"m
istralai/Mixtral-8x7
B-Instruct
-v0.1
"
,
"tp"
:
2
,
"dataset_name"
:
"s
haregp
t"
,
"dataset_path"
:
"./
ShareGPT_V3_unfiltered_cleaned_split.json
"
,
"model"
:
"m
eta-llama/Meta-Llama-3-70
B-Instruct"
,
"tp"
:
4
,
"dataset_name"
:
"s
onne
t"
,
"dataset_path"
:
"./
sonnet_4x.txt
"
,
"num_prompts"
:
500
,
"port"
:
8000
"port"
:
8000
,
"sonnet_input_len"
:
512
,
"sonnet_output_len"
:
16
,
"sonnet_prefix_len"
:
50
,
"reuse_server"
:
true
},
"lmdeploy_server_parameters"
:
{
"dtype"
:
"bfloat16"
},
"lmdeploy_client_parameters"
:
{
},
...
...
@@ -97,20 +239,85 @@
},
"trt_server_parameters"
:
{
"model_type"
:
"llama"
,
"model_dtype"
:
"float16"
,
"max_batch_size"
:
2
56
,
"model_dtype"
:
"
b
float16"
,
"max_batch_size"
:
2
048
,
"max_input_len"
:
4096
,
"max_output_len"
:
4096
,
"trt_llm_version"
:
"r24.04"
"max_seq_len"
:
6144
,
"max_num_tokens"
:
16384
,
"trt_llm_version"
:
"v0.11.0"
},
"trt_client_parameters"
:
{
"endpoint"
:
"/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters"
:
{
"disable_log_stats"
:
""
,
"disable_log_requests"
:
""
,
"gpu_memory_utilization"
:
0.9
,
"num_scheduler_steps"
:
10
,
"max_num_seqs"
:
512
,
"dtype"
:
"bfloat16"
},
"vllm_client_parameters"
:
{
},
"sglang_server_parameters"
:
{
"disable_radix_cache"
:
""
,
"dtype"
:
"bfloat16"
},
"sglang_client_parameters"
:
{
}
},
{
"test_name"
:
"llama70B_tp4_sonnet_512_256"
,
"qps_list"
:
[
4
,
8
,
16
,
32
,
"inf"
],
"common_parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-70B-Instruct"
,
"tp"
:
4
,
"dataset_name"
:
"sonnet"
,
"dataset_path"
:
"./sonnet_4x.txt"
,
"num_prompts"
:
500
,
"port"
:
8000
,
"sonnet_input_len"
:
512
,
"sonnet_output_len"
:
256
,
"sonnet_prefix_len"
:
50
,
"reuse_server"
:
true
},
"lmdeploy_server_parameters"
:
{
"dtype"
:
"bfloat16"
},
"lmdeploy_client_parameters"
:
{
},
"tgi_server_parameters"
:
{
},
"tgi_client_parameters"
:
{
"endpoint"
:
"/generate_stream"
},
"trt_server_parameters"
:
{
"model_type"
:
"llama"
,
"model_dtype"
:
"bfloat16"
,
"max_batch_size"
:
2048
,
"max_input_len"
:
4096
,
"max_seq_len"
:
6144
,
"max_num_tokens"
:
16384
,
"trt_llm_version"
:
"v0.11.0"
},
"trt_client_parameters"
:
{
"endpoint"
:
"/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters"
:
{
"disable_log_stats"
:
""
,
"disable_log_requests"
:
""
"disable_log_requests"
:
""
,
"gpu_memory_utilization"
:
0.9
,
"num_scheduler_steps"
:
10
,
"max_num_seqs"
:
512
,
"dtype"
:
"bfloat16"
},
"vllm_client_parameters"
:
{
},
"sglang_server_parameters"
:
{
"disable_radix_cache"
:
""
,
"dtype"
:
"bfloat16"
},
"sglang_client_parameters"
:
{
}
}
]
\ No newline at end of file
.buildkite/nightly-benchmarks/tests/serving-tests.json
View file @
ad385667
...
...
@@ -3,7 +3,7 @@
"test_name"
:
"serving_llama8B_tp1_sharegpt"
,
"qps_list"
:
[
1
,
4
,
16
,
"inf"
],
"server_parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-8B"
,
"model"
:
"meta-llama/Meta-Llama-3
.1
-8B
-Instruct
"
,
"tensor_parallel_size"
:
1
,
"swap_space"
:
16
,
"disable_log_stats"
:
""
,
...
...
@@ -11,7 +11,7 @@
"load_format"
:
"dummy"
},
"client_parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-8B"
,
"model"
:
"meta-llama/Meta-Llama-3
.1
-8B
-Instruct
"
,
"backend"
:
"vllm"
,
"dataset_name"
:
"sharegpt"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
...
...
@@ -22,7 +22,7 @@
"test_name"
:
"serving_llama70B_tp4_sharegpt"
,
"qps_list"
:
[
1
,
4
,
16
,
"inf"
],
"server_parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-70B-Instruct"
,
"model"
:
"meta-llama/Meta-Llama-3
.1
-70B-Instruct"
,
"tensor_parallel_size"
:
4
,
"swap_space"
:
16
,
"disable_log_stats"
:
""
,
...
...
@@ -30,7 +30,7 @@
"load_format"
:
"dummy"
},
"client_parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-70B-Instruct"
,
"model"
:
"meta-llama/Meta-Llama-3
.1
-70B-Instruct"
,
"backend"
:
"vllm"
,
"dataset_name"
:
"sharegpt"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
...
...
@@ -60,7 +60,7 @@
"test_name"
:
"serving_llama70B_tp4_sharegpt_specdecode"
,
"qps_list"
:
[
2
],
"server_parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-70B-Instruct"
,
"model"
:
"meta-llama/Meta-Llama-3
.1
-70B-Instruct"
,
"disable_log_requests"
:
""
,
"tensor_parallel_size"
:
4
,
"swap_space"
:
16
,
...
...
@@ -70,7 +70,7 @@
"use_v2_block_manager"
:
""
},
"client_parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-70B-Instruct"
,
"model"
:
"meta-llama/Meta-Llama-3
.1
-70B-Instruct"
,
"backend"
:
"vllm"
,
"dataset_name"
:
"sharegpt"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
...
...
.buildkite/nightly-benchmarks/tests/throughput-tests.json
View file @
ad385667
...
...
@@ -2,7 +2,7 @@
{
"test_name"
:
"throughput_llama8B_tp1"
,
"parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-8B"
,
"model"
:
"meta-llama/Meta-Llama-3
.1
-8B
-Instruct
"
,
"tensor_parallel_size"
:
1
,
"load_format"
:
"dummy"
,
"dataset"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
...
...
@@ -13,7 +13,7 @@
{
"test_name"
:
"throughput_llama70B_tp4"
,
"parameters"
:
{
"model"
:
"meta-llama/Meta-Llama-3-70B-Instruct"
,
"model"
:
"meta-llama/Meta-Llama-3
.1
-70B-Instruct"
,
"tensor_parallel_size"
:
4
,
"load_format"
:
"dummy"
,
"dataset"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
...
...
.buildkite/release-pipeline.yaml
View file @
ad385667
steps
:
-
label
:
"
Build
wheel
-
CUDA
{{matrix.cuda_version}}
"
-
label
:
"
Build
wheel
-
CUDA
12.1
"
agents
:
queue
:
cpu_queue
commands
:
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
buildkite_commit=$BUILDKITE_COMMIT
--build-arg
USE_SCCACHE=1
--build-arg
CUDA_VERSION={{matrix.cuda_version}}
--tag
vllm-ci:build-image
--target
build
--progress
plain
."
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=12.1.0
--tag
vllm-ci:build-image
--target
build
--progress
plain
."
-
"
mkdir
artifacts"
-
"
docker
run
--rm
-v
$(pwd)/artifacts:/artifacts_host
vllm-ci:build-image
bash
-c
'cp
-r
dist
/artifacts_host
&&
chmod
-R
a+rw
/artifacts_host'"
# rename the files to change linux -> manylinux1
-
"
for
f
in
artifacts/dist/*.whl;
do
mv
--
\"
$$f
\"
\"
$${f/linux/manylinux1}
\"
;
done"
-
"
mv
artifacts/dist/$(ls
artifacts/dist)
artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
-
"
aws
s3
cp
artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
s3://vllm-wheels/$BUILDKITE_COMMIT/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
-
"
aws
s3
cp
artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
s3://vllm-wheels/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
env
:
DOCKER_BUILDKIT
:
"
1"
-
block
:
"
Build
CUDA
11.8
wheel"
key
:
block-build-cu118-wheel
-
label
:
"
Build
wheel
-
CUDA
11.8"
depends_on
:
block-build-cu118-wheel
agents
:
queue
:
cpu_queue
commands
:
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=11.8.0
--tag
vllm-ci:build-image
--target
build
--progress
plain
."
-
"
mkdir
artifacts"
-
"
docker
run
--rm
-v
$(pwd)/artifacts:/artifacts_host
vllm-ci:build-image
bash
-c
'cp
-r
dist
/artifacts_host
&&
chmod
-R
a+rw
/artifacts_host'"
# rename the files to change linux -> manylinux1
...
...
@@ -12,8 +31,3 @@ steps:
-
"
aws
s3
cp
--recursive
artifacts/dist
s3://vllm-wheels/nightly/"
env
:
DOCKER_BUILDKIT
:
"
1"
matrix
:
setup
:
cuda_version
:
-
"
11.8.0"
-
"
12.1.0"
.buildkite/run-amd-test.sh
100644 → 100755
View file @
ad385667
# This script runs test inside the corresponding ROCm docker container.
set
-
ex
set
-
o
pipefail
# Print ROCm version
echo
"--- Confirming Clean Initial State"
...
...
@@ -70,15 +70,85 @@ HF_CACHE="$(realpath ~)/huggingface"
mkdir
-p
${
HF_CACHE
}
HF_MOUNT
=
"/root/.cache/huggingface"
docker run
\
commands
=
$@
echo
"Commands:
$commands
"
#ignore certain kernels tests
if
[[
$commands
==
*
" kernels "
*
]]
;
then
commands
=
"
${
commands
}
\
--ignore=kernels/test_attention.py
\
--ignore=kernels/test_attention_selector.py
\
--ignore=kernels/test_blocksparse_attention.py
\
--ignore=kernels/test_causal_conv1d.py
\
--ignore=kernels/test_cutlass.py
\
--ignore=kernels/test_encoder_decoder_attn.py
\
--ignore=kernels/test_flash_attn.py
\
--ignore=kernels/test_flashinfer.py
\
--ignore=kernels/test_gguf.py
\
--ignore=kernels/test_int8_quant.py
\
--ignore=kernels/test_machete_gemm.py
\
--ignore=kernels/test_mamba_ssm.py
\
--ignore=kernels/test_marlin_gemm.py
\
--ignore=kernels/test_moe.py
\
--ignore=kernels/test_prefix_prefill.py
\
--ignore=kernels/test_rand.py
\
--ignore=kernels/test_sampler.py"
fi
#ignore certain Entrypoints tests
if
[[
$commands
==
*
" entrypoints/openai "
*
]]
;
then
commands
=
${
commands
//
" entrypoints/openai "
/
" entrypoints/openai
\
--ignore=entrypoints/openai/test_accuracy.py
\
--ignore=entrypoints/openai/test_audio.py
\
--ignore=entrypoints/openai/test_encoder_decoder.py
\
--ignore=entrypoints/openai/test_embedding.py
\
--ignore=entrypoints/openai/test_oot_registration.py "
}
fi
PARALLEL_JOB_COUNT
=
8
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if
[[
$commands
==
*
"--shard-id="
*
]]
;
then
for
GPU
in
$(
seq
0
$((
$PARALLEL_JOB_COUNT
-
1
))
)
;
do
#replace shard arguments
commands
=
${
commands
//
"--shard-id= "
/
"--shard-id=
${
GPU
}
"
}
commands
=
${
commands
//
"--num-shards= "
/
"--num-shards=
${
PARALLEL_JOB_COUNT
}
"
}
echo
"Shard
${
GPU
}
commands:
$commands
"
docker run
\
--device
/dev/kfd
--device
/dev/dri
\
--network
host
\
--shm-size
=
16gb
\
--rm
\
-e
HIP_VISIBLE_DEVICES
=
${
GPU
}
\
-e
HF_TOKEN
\
-v
${
HF_CACHE
}
:
${
HF_MOUNT
}
\
-e
HF_HOME
=
${
HF_MOUNT
}
\
--name
${
container_name
}
\
--name
${
container_name
}
_
${
GPU
}
\
${
image_name
}
\
/bin/bash
-c
"
${
@
}
"
/bin/bash
-c
"
${
commands
}
"
\
|&
while
read
-r
line
;
do
echo
">>Shard
$GPU
:
$line
"
;
done
&
PIDS+
=(
$!
)
done
#wait for all processes to finish and collect exit codes
for
pid
in
${
PIDS
[@]
}
;
do
wait
${
pid
}
STATUS+
=(
$?
)
done
for
st
in
${
STATUS
[@]
}
;
do
if
[[
${
st
}
-ne
0
]]
;
then
echo
"One of the processes failed with
$st
"
exit
${
st
}
fi
done
else
docker run
\
--device
/dev/kfd
--device
/dev/dri
\
--network
host
\
--shm-size
=
16gb
\
--rm
\
-e
HIP_VISIBLE_DEVICES
=
0
\
-e
HF_TOKEN
\
-v
${
HF_CACHE
}
:
${
HF_MOUNT
}
\
-e
HF_HOME
=
${
HF_MOUNT
}
\
--name
${
container_name
}
\
${
image_name
}
\
/bin/bash
-c
"
${
commands
}
"
fi
.buildkite/run-cpu-test-ppc64le.sh
0 → 100755
View file @
ad385667
#!/bin/bash
# This script builds the CPU docker image and runs the model tests and an
# online-serving benchmark inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

# Try building the docker image
docker build -t cpu-test -f Dockerfile.ppc64le .

# Setup cleanup: remove the container on every exit path, and once up front
# in case an earlier run left one behind.
remove_docker_container() { docker rm -f cpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image. HF_TOKEN is read from /etc/environment.
# NOTE: the variant kept below for reference also set
# VLLM_CPU_KVCACHE_SPACE=4 and --shm-size=4g; the live command does not.
source /etc/environment
#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN="$HF_TOKEN" --name cpu-test cpu-test

# Run basic model test
docker exec cpu-test bash -c "
  pip install pytest matplotlib einops transformers_stream_generator
  pytest -v -s tests/models -m \"not vlm\" \
    --ignore=tests/models/test_embedding.py \
    --ignore=tests/models/test_oot_registration.py \
    --ignore=tests/models/test_registry.py \
    --ignore=tests/models/test_jamba.py \
    --ignore=tests/models/test_mamba.py \
    --ignore=tests/models/test_danube3_4b.py" # Mamba kernels and Danube3-4B are not supported on CPU

# online inference: start the server in the background, wait up to 600s
# for it to answer, then run the serving benchmark against it
docker exec cpu-test bash -c "
  python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
  timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
  python3 benchmarks/benchmark_serving.py \
    --backend vllm \
    --dataset-name random \
    --model facebook/opt-125m \
    --num-prompts 20 \
    --endpoint /v1/completions \
    --tokenizer facebook/opt-125m"
.buildkite/run-cpu-test.sh
View file @
ad385667
...
...
@@ -22,8 +22,25 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
# Run basic model test
docker
exec
cpu-test bash
-c
"
pip install pytest Pillow protobuf
pytest -v -s tests/models -m
\"
not vlm
\"
--ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py"
# Mamba and Danube3-4B on CPU is not supported
pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
pytest -v -s tests/models/encoder_decoder/language
pytest -v -s tests/models/decoder_only/language
\
--ignore=tests/models/test_fp8.py
\
--ignore=tests/models/decoder_only/language/test_jamba.py
\
--ignore=tests/models/decoder_only/language/test_mamba.py
\
--ignore=tests/models/decoder_only/language/test_granitemoe.py
\
--ignore=tests/models/decoder_only/language/test_danube3_4b.py"
# Mamba and Danube3-4B on CPU is not supported
# Run compressed-tensor test
docker
exec
cpu-test bash
-c
"
pytest -s -v
\
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup
\
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
# Run AWQ test
docker
exec
cpu-test bash
-c
"
pytest -s -v
\
tests/quantization/test_ipex_quant.py"
# online inference
docker
exec
cpu-test bash
-c
"
...
...
.buildkite/run-tpu-test.sh
View file @
ad385667
...
...
@@ -12,5 +12,4 @@ remove_docker_container
# For HF_TOKEN.
source
/etc/environment
# Run a simple end-to-end example.
docker run
--privileged
--net
host
--shm-size
=
16G
-it
-e
HF_TOKEN
=
$HF_TOKEN
--name
tpu-test vllm-tpu
\
python3 /workspace/vllm/examples/offline_inference_tpu.py
docker run
--privileged
--net
host
--shm-size
=
16G
-it
-e
HF_TOKEN
=
$HF_TOKEN
--name
tpu-test vllm-tpu /bin/bash
-c
"python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
.buildkite/run-xpu-test.sh
View file @
ad385667
...
...
@@ -11,4 +11,4 @@ trap remove_docker_container EXIT
remove_docker_container
# Run the image and launch offline inference
docker run
--network
host
--name
xpu-test
--device
/dev/dri
-v
/dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py
docker run
--network
host
--name
xpu-test
--device
/dev/dri
-v
/dev/dri/by-path:/dev/dri/by-path
--entrypoint
=
""
xpu-test python3 examples/offline_inference.py
.buildkite/test-pipeline.yaml
View file @
ad385667
...
...
@@ -5,264 +5,498 @@
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
# to generate the final pipeline yaml file.
# Documentation
# label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
# fast_check_only(bool): run this test on fastcheck pipeline only
# optional(bool): never run this test by default (i.e. need to unblock manually)
# command(str): the single command to run for tests. incompatible with commands.
# commands(list): the list of commands to run for test. incompatible with command.
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
# in this case, commands must be specified. the first command runs on first host, the second
# command runs on the second host.
# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
# source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run.
# When adding a test
# - If the test belongs to an existing group, add it there
# - If the test is short, add to any existing step
# - If the test takes more than 10min, then it is okay to create a new step.
# Note that all steps execute in parallel.
steps
:
-
label
:
Async Engine, Inputs, Utils, Worker Test
##### fast check tests #####
-
label
:
Documentation Build
# 2min
working_dir
:
"
/vllm-workspace/test_docs/docs"
fast_check
:
true
fast_check_only
:
t
rue
no_gpu
:
T
rue
commands
:
-
pytest -v -s async_engine
# Async Engine
-
pip install -r requirements-docs.txt
-
SPHINXOPTS=\"-W\" make html
# Check API reference (if it fails, you may have missing mock imports)
-
grep \"sig sig-object py\" build/html/dev/sampling_params.html
-
label
:
Async Engine, Inputs, Utils, Worker Test
# 24min
fast_check
:
true
source_file_dependencies
:
-
vllm/
-
tests/mq_llm_engine
-
tests/async_engine
-
tests/test_inputs
-
tests/multimodal
-
tests/test_utils
-
tests/worker
commands
:
-
pytest -v -s mq_llm_engine
# MQLLMEngine
-
pytest -v -s async_engine
# AsyncLLMEngine
-
NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
-
pytest -v -s test_inputs.py
-
pytest -v -s multimodal
-
pytest -v -s test_utils.py
# Utils
-
pytest -v -s worker
# Worker
-
label
:
Metrics, Tracing Test
fast_check
:
true
fast_check_only
:
true
commands
:
-
pytest -v -s metrics
# Metrics
-
"
pip
install
\
opentelemetry-sdk
\
opentelemetry-api
\
opentelemetry-exporter-otlp
\
opentelemetry-semantic-conventions-ai"
# Tracing
-
pytest -v -s tracing
-
label
:
Regression Test
mirror_hardwares
:
[
amd
]
fast_check
:
true
command
:
pytest -v -s test_regression.py
working_dir
:
"
/vllm-workspace/tests"
# optional
-
label
:
AsyncEngine Test
-
label
:
Basic Correctness Test
# 30min
#mirror_hardwares: [amd]
command
:
pytest -v -s async_engine
-
label
:
Basic Correctness Test
mirror_hardwares
:
[
amd
]
fast_check
:
true
source_file_dependencies
:
-
vllm/
-
tests/basic_correctness/test_basic_correctness
-
tests/basic_correctness/test_cpu_offload
-
tests/basic_correctness/test_preemption
commands
:
# This flashinfer installation will fail on AMD ROCm, so it is set as optional.
-
pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl ||
true
-
pytest -v -s basic_correctness/test_basic_correctness.py
-
pytest -v -s basic_correctness/test_cpu_offload.py
-
VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
-
label
:
Chunked Prefill Test
source_file_dependencies
:
-
vllm/
-
tests/basic_correctness/test_chunked_prefill
commands
:
-
VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
-
VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
-
VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
-
label
:
Core Test
-
label
:
Core Test
# 10min
mirror_hardwares
:
[
amd
]
fast_check
:
true
source_file_dependencies
:
-
vllm/core
-
vllm/distributed
-
tests/core
commands
:
-
pytest -v -s core
-
label
:
Distributed Comm Ops Test
#mirror_hardwares: [amd]
working_dir
:
"
/vllm-workspace/tests"
num_gpus
:
2
commands
:
-
pytest -v -s distributed/test_comm_ops.py
-
pytest -v -s distributed/test_shm_broadcast.py
-
label
:
2 Node Tests (4 GPUs in total)
-
label
:
Entrypoints Test
# 40min
working_dir
:
"
/vllm-workspace/tests"
num_gpus
:
2
num_nodes
:
2
commands
:
-
# the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
-
VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
-
VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
-
# the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
-
VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
-
label
:
Distributed Tests (2 GPUs)
fast_check
:
true
mirror_hardwares
:
[
amd
]
working_dir
:
"
/vllm-workspace/tests"
num_gpus
:
2
source_file_dependencies
:
-
vllm/
commands
:
-
VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
-
TARGET_TEST_SUITE=L4 pytest -v -s distributed/test_basic_distributed_correctness.py
-
pytest -v -s distributed/test_chunked_prefill_distributed.py
-
pytest -v -s distributed/test_multimodal_broadcast.py
-
pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
-
CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-
CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
-
label
:
Distributed Tests (4 GPUs)
#mirror_hardwares: [amd]
-
pip install -e ./plugins/vllm_add_dummy_model
-
pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
-
pytest -v -s entrypoints/llm/test_lazy_outlines.py
# it needs a clean process
-
pytest -v -s entrypoints/llm/test_generate.py
# it needs a clean process
-
pytest -v -s entrypoints/llm/test_generate_multiple_loras.py
# it needs a clean process
-
pytest -v -s entrypoints/llm/test_guided_generate.py
# it needs a clean process
-
pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
-
pytest -v -s entrypoints/openai/test_oot_registration.py
# it needs a clean process
-
pytest -v -s entrypoints/test_chat_utils.py
-
pytest -v -s entrypoints/offline_mode
# Needs to avoid interference with other tests
-
label
:
Distributed Tests (4 GPUs)
# 10min
working_dir
:
"
/vllm-workspace/tests"
num_gpus
:
4
fast_check
:
true
source_file_dependencies
:
-
vllm/distributed/
-
vllm/core/
-
tests/distributed
-
tests/spec_decode/e2e/test_integration_dist_tp4
-
tests/compile
commands
:
-
pytest -v -s compile/test_basic_correctness.py
-
pytest -v -s distributed/test_pynccl.py
-
pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
-
label
:
Pipeline Parallelism Test
working_dir
:
"
/vllm-workspace/tests"
num_gpus
:
4
-
label
:
Metrics, Tracing Test
# 10min
num_gpus
:
2
fast_check
:
true
source_file_dependencies
:
-
vllm/
-
tests/metrics
-
tests/tracing
commands
:
-
pytest -v -s distributed/test_pipeline_parallel.py
-
pytest -v -s metrics
-
"
pip
install
\
'opentelemetry-sdk>=1.26.0,<1.27.0'
\
'opentelemetry-api>=1.26.0,<1.27.0'
\
'opentelemetry-exporter-otlp>=1.26.0,<1.27.0'
\
'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
-
pytest -v -s tracing
-
label
:
Engine Test
##### fast check tests #####
##### 1 GPU test #####
-
label
:
Regression Test
# 5min
mirror_hardwares
:
[
amd
]
source_file_dependencies
:
-
vllm/
-
tests/test_regression
commands
:
-
pytest -v -s engine test_sequence.py test_config.py test_logger.py
# OOM in the CI unless we run this separately
-
pytest -v -s tokenization
-
pip install modelscope
-
pytest -v -s test_regression.py
working_dir
:
"
/vllm-workspace/tests"
# optional
-
label
:
Entrypoints Test
fast_check
:
true
-
label
:
Engine Test
# 10min
mirror_hardwares
:
[
amd
]
source_file_dependencies
:
-
vllm/
-
tests/engine
-
tests/tokenization
commands
:
-
pytest -v -s entrypoints/llm
-
pytest -v -s entrypoints/openai
-
pytest -v -s engine test_sequence.py test_config.py test_logger.py
# OOM in the CI unless we run this separately
-
pytest -v -s tokenization
-
label
:
Examples Test
-
label
:
Examples Test
# 15min
working_dir
:
"
/vllm-workspace/examples"
mirror_hardwares
:
[
amd
]
#mirror_hardwares: [amd]
source_file_dependencies
:
-
vllm/entrypoints
-
examples/
commands
:
# install tensorizer for tensorize_vllm_model.py
-
pip install awscli tensorizer
-
pip install awscli tensorizer
# for llava example and tensorizer test
-
python3 offline_inference.py
-
python3 cpu_offload.py
-
python3 offline_inference_chat.py
-
python3 offline_inference_with_prefix.py
-
python3 llm_engine_example.py
-
python3 offline_inference_vision_language.py
-
python3 offline_inference_vision_language_multi_image.py
-
python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-
python3 offline_inference_encoder_decoder.py
-
python3 offline_profile.py --model facebook/opt-125m
-
label
:
Inputs Test
-
label
:
Prefix Caching Test
# 9min
#mirror_hardwares: [amd]
source_file_dependencies
:
-
vllm/
-
tests/prefix_caching
commands
:
-
pytest -v -s test_inputs.py
-
pytest -v -s multimodal
-
pytest -v -s prefix_caching
# - label: Kernels Test %N
# #mirror_hardwares: [amd]
# commands:
# - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
# - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
# parallelism: 4
-
label
:
Samplers Test
# 36min
source_file_dependencies
:
-
vllm/model_executor/layers
-
vllm/sampling_metadata.py
-
tests/samplers
commands
:
-
pytest -v -s samplers
-
VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
-
label
:
Models Test
#mirror_hardwares: [amd]
-
label
:
LogitsProcessor Test
# 5min
mirror_hardwares
:
[
amd
]
source_file_dependencies
:
-
vllm/model_executor/layers
-
tests/test_logits_processor
command
:
pytest -v -s test_logits_processor.py
-
label
:
Speculative decoding tests
# 30min
source_file_dependencies
:
-
vllm/spec_decode
-
tests/spec_decode
commands
:
-
pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl
-
pytest -v -s models -m \"not vlm\"
-
pytest -v -s spec_decode/e2e/test_multistep_correctness.py
-
VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
-
label
:
Vision Language Models Test
-
label
:
LoRA Test %N
# 15min each
mirror_hardwares
:
[
amd
]
source_file_dependencies
:
-
vllm/lora
-
tests/lora
command
:
pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
parallelism
:
4
-
label
:
"
PyTorch
Fullgraph
Smoke
Test"
# 9min
fast_check
:
true
source_file_dependencies
:
-
vllm/
-
tests/compile
commands
:
-
pytest -v -s models -m vlm
-
pytest -v -s compile/test_basic_correctness.py
# TODO: re-write in comparison tests, and fix symbolic shape
# for quantization ops.
# - label: "PyTorch Fullgraph Test" # 18min
# source_file_dependencies:
# - vllm/
# - tests/compile
# commands:
# - pytest -v -s compile/test_full_graph.py
-
label
:
Prefix Caching Test
-
label
:
Kernels Test %N
# 1h each
mirror_hardwares
:
[
amd
]
source_file_dependencies
:
-
csrc/
-
vllm/attention
-
tests/kernels
commands
:
-
pytest -v -s prefix_caching
-
pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism
:
4
-
label
:
Samplers Test
#mirror_hardwares: [amd]
command
:
pytest -v -s samplers
-
label
:
Tensorizer Test
# 11min
mirror_hardwares
:
[
amd
]
soft_fail
:
true
source_file_dependencies
:
-
vllm/model_executor/model_loader
-
tests/tensorizer_loader
commands
:
-
apt-get update && apt-get install -y curl libsodium23
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
pytest -v -s tensorizer_loader
-
label
:
LogitsProcessor Test
-
label
:
Benchmarks
# 9min
working_dir
:
"
/vllm-workspace/.buildkite"
mirror_hardwares
:
[
amd
]
command
:
pytest -v -s test_logits_processor.py
source_file_dependencies
:
-
benchmarks/
commands
:
-
pip install aiohttp
-
bash run-benchmarks.sh
-
label
:
Utils Test
-
label
:
Quantization Test
# 33min
source_file_dependencies
:
-
csrc/
-
vllm/model_executor/layers/quantization
-
tests/quantization
command
:
VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
-
label
:
LM Eval Small Models
# 53min
working_dir
:
"
/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies
:
-
csrc/
-
vllm/model_executor/layers/quantization
commands
:
-
pytest -v -s test_utils.py
-
pytest -v -s test_embedded_commit.py
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
bash ./run-tests.sh -c configs/models-small.txt -t
1
-
label
:
Worker Test
mirror_hardwares
:
[
amd
]
command
:
pytest -v -s worker
-
label
:
Encoder Decoder tests
# 5min
source_file_dependencies
:
-
vllm/
-
tests/encoder_decoder
commands
:
-
pytest -v -s encoder_decoder
-
label
:
OpenAI-Compatible Tool Use
# 20 min
fast_check
:
false
mirror_hardwares
:
[
amd
]
source_file_dependencies
:
-
vllm/
-
tests/tool_use
commands
:
-
pytest -v -s tool_use
-
label
:
Speculative decoding tests
#mirror_hardwares: [amd]
##### models test #####
-
label
:
Basic Models Test
# 3min
source_file_dependencies
:
-
vllm/
-
tests/models
commands
:
# See https://github.com/vllm-project/vllm/issues/5152
-
export VLLM_ATTENTION_BACKEND=XFORMERS
-
pytest -v -s
spec_decode
-
pip install -e ./plugins/vllm_add_dummy_model
-
pytest -v -s models/test_oot_registration.py
# it needs a clean process
-
pytest -v -s
models/*.py --ignore=models/test_oot_registration.py
# - label: LoRA Test %N
# #mirror_hardwares: [amd]
# command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
# parallelism: 4
-
label
:
Decoder-only Language Models Test
# 1h36min
#mirror_hardwares: [amd]
source_file_dependencies
:
-
vllm/
-
tests/models/decoder_only/language
commands
:
-
pytest -v -s models/decoder_only/language
# - label: LoRA Long Context (Distributed)
# #mirror_hardwares: [amd]
# num_gpus: 4
# # This test runs llama 13B, so it is required to run on 4 GPUs.
# commands:
# # FIXIT: find out which code initialize cuda before running the test
# # before the fix, we need to use spawn to test it
# - export VLLM_WORKER_MULTIPROC_METHOD=spawn
# - pytest -v -s -x lora/test_long_context.py
-
label
:
Decoder-only Multi-Modal Models Test
# 1h31min
#mirror_hardwares: [amd]
source_file_dependencies
:
-
vllm/
-
tests/models/decoder_only/audio_language
-
tests/models/decoder_only/vision_language
commands
:
-
pytest -v -s models/decoder_only/audio_language
-
pytest -v -s
models/decoder_only/vision_language
-
label
:
Tensorizer Test
-
label
:
Other Models Test
# 6min
#mirror_hardwares: [amd]
fast_check
:
true
source_file_dependencies
:
-
vllm/
-
tests/models/embedding/language
-
tests/models/embedding/vision_language
-
tests/models/encoder_decoder/language
-
tests/models/encoder_decoder/vision_language
commands
:
-
apt-get install -y curl libsodium23
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
pytest -v -s tensorizer_loader
-
pytest -v -s models/embedding/language
-
pytest -v -s models/embedding/vision_language
-
pytest -v -s models/encoder_decoder/language
-
pytest -v -s models/encoder_decoder/vision_language
# This test is used only in PR development phase to test individual models and should never run on main
-
label
:
Custom Models Test
optional
:
true
commands
:
-
echo 'Testing custom models...'
# PR authors can temporarily add commands below to test individual models
# e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
# *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
-
label
:
Metrics Test
mirror_hardwares
:
[
amd
]
command
:
pytest -v -s metrics
##### 1 GPU test #####
##### multi gpus test #####
-
label
:
Distributed Comm Ops Test
# 7min
working_dir
:
"
/vllm-workspace/tests"
num_gpus
:
2
source_file_dependencies
:
-
vllm/distributed
-
tests/distributed
commands
:
-
pytest -v -s distributed/test_comm_ops.py
-
pytest -v -s distributed/test_shm_broadcast.py
-
label
:
2 Node Tests (4 GPUs in total)
# 16min
working_dir
:
"
/vllm-workspace/tests"
num_gpus
:
2
num_nodes
:
2
source_file_dependencies
:
-
vllm/distributed/
-
vllm/engine/
-
vllm/executor/
-
vllm/model_executor/models/
-
tests/distributed/
commands
:
-
# the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
-
VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
-
VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
-
VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
-
# the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
-
VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
-
label
:
Quantization Test
-
label
:
Distributed Tests (2 GPUs)
# 40min
#mirror_hardwares: [amd]
command
:
pytest -v -s quantization
working_dir
:
"
/vllm-workspace/tests"
num_gpus
:
2
source_file_dependencies
:
-
vllm/distributed/
-
vllm/engine/
-
vllm/executor/
-
vllm/model_executor/models/
-
tests/distributed/
-
vllm/compilation
commands
:
-
pytest -v -s ./compile/test_basic_correctness.py
-
pytest -v -s ./compile/test_wrapper.py
-
VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
-
TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
# Avoid importing model tests that cause CUDA reinitialization error
-
pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
-
pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
-
pytest models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
-
pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
-
pip install -e ./plugins/vllm_add_dummy_model
-
pytest -v -s distributed/test_distributed_oot.py
-
CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-
CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
-
label
:
Tracing Test
commands
:
-
"
pip
install
\
opentelemetry-sdk
\
opentelemetry-api
\
opentelemetry-exporter-otlp
\
opentelemetry-semantic-conventions-ai"
-
pytest -v -s tracing
-
label
:
Benchmarks
working_dir
:
"
/vllm-workspace/.buildkite"
mirror_hardwares
:
[
amd
]
-
label
:
Multi-step Tests (4 GPUs)
# 36min
working_dir
:
"
/vllm-workspace/tests"
num_gpus
:
4
source_file_dependencies
:
-
vllm/model_executor/layers/sampler.py
-
vllm/sequence.py
-
vllm/worker/worker_base.py
-
vllm/worker/worker.py
-
vllm/worker/multi_step_worker.py
-
vllm/worker/model_runner_base.py
-
vllm/worker/model_runner.py
-
vllm/worker/multi_step_model_runner.py
-
vllm/engine
-
tests/multi_step
commands
:
-
pip install aiohttp
-
bash run-benchmarks.sh
-
pytest -v -s multi_step/test_correctness_async_llm.py
-
pytest -v -s multi_step/test_correctness_llm.py
-
label
:
LM Eval Small Models
working_dir
:
"
/vllm-workspace/.buildkite/lm-eval-harness"
-
label
:
Pipeline Parallelism Test
# 45min
working_dir
:
"
/vllm-workspace/tests"
num_gpus
:
4
source_file_dependencies
:
-
vllm/distributed/
-
vllm/engine/
-
vllm/executor/
-
vllm/model_executor/models/
-
tests/distributed/
commands
:
-
pip install lm-eval
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
bash ./run-tests.sh -c configs/models-small.txt -t
1
-
pytest -v -s distributed/test_pp_cudagraph.py
-
pytest -v -s distributed/test_pipeline_parallel.py
-
label
:
LM Eval Large Models
gpu
:
a100
-
label
:
LoRA Long Context (Distributed)
# 11min
# This test runs llama 13B, so it is required to run on 4 GPUs.
num_gpus
:
4
working_dir
:
"
/vllm-workspace/.buildkite/lm-eval-harness"
soft_fail
:
true
source_file_dependencies
:
-
vllm/lora
-
tests/lora/test_long_context
commands
:
-
pip install lm-eval
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
bash ./run-tests.sh -c configs/models-large.txt -t
4
# FIXIT: find out which code initializes CUDA before running the test
# before the fix, we need to use spawn to test it
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
pytest -v -s -x lora/test_long_context.py
-
label
:
Documentation Build
working_dir
:
"
/vllm-workspace/test_docs/docs"
fast_check
:
true
no_gpu
:
True
-
label
:
Weight Loading Multiple GPU Test
# 33min
working_dir
:
"
/vllm-workspace/tests"
num_gpus
:
2
source_file_dependencies
:
-
vllm/
-
tests/weight_loading
commands
:
-
pip install -r requirements-docs.txt
-
SPHINXOPTS=\"-W\" make html
-
bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
-
label
:
Weight Loading Multiple GPU Test - Large Models
# optional
working_dir
:
"
/vllm-workspace/tests"
num_gpus
:
2
gpu
:
a100
optional
:
true
source_file_dependencies
:
-
vllm/
-
tests/weight_loading
commands
:
-
bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
##### multi gpus test #####
##### A100 test #####
-
label
:
Distributed Tests (A100)
-
label
:
Distributed Tests (A100)
# optional
gpu
:
a100
num_gpus
:
4
source_file_dependencies
:
-
vllm/
commands
:
# NOTE: don't test llama model here, it seems hf implementation is buggy
# see https://github.com/vllm-project/vllm/pull/5689 for details
-
pytest -v -s distributed/test_custom_all_reduce.py
-
pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl
-
TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py
-
TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus
-
pytest -v -s -x lora/test_mixtral.py
-
label
:
LM Eval Large Models
# optional
gpu
:
a100
num_gpus
:
4
working_dir
:
"
/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies
:
-
csrc/
-
vllm/model_executor/layers/quantization
commands
:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
bash ./run-tests.sh -c configs/models-large.txt -t
4
.dockerignore
View file @
ad385667
/.venv
/build
dist
vllm/*.so
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
.mypy_cache
# Distribution / packaging
.Python
/build/
cmake-build-*/
CMakeUserPresets.json
develop-eggs/
/dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
Prev
1
2
3
4
5
6
…
19
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment