Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
3be5b26a
Unverified
Commit
3be5b26a
authored
Nov 07, 2024
by
Russell Bryant
Committed by
GitHub
Nov 07, 2024
Browse files
[CI/Build] Add shell script linting using shellcheck (#7925)
Signed-off-by:
Russell Bryant
<
rbryant@redhat.com
>
parent
de0e61a3
Changes
28
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
157 additions
and
117 deletions
+157
-117
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+3
-3
.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+3
-3
.buildkite/lm-eval-harness/run-tests.sh
.buildkite/lm-eval-harness/run-tests.sh
+1
-1
.buildkite/nightly-benchmarks/scripts/launch-server.sh
.buildkite/nightly-benchmarks/scripts/launch-server.sh
+25
-38
.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+6
-6
.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
...kite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+14
-16
.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
.../nightly-benchmarks/scripts/run-performance-benchmarks.sh
+9
-10
.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+2
-2
.buildkite/run-amd-test.sh
.buildkite/run-amd-test.sh
+18
-16
.buildkite/run-benchmarks.sh
.buildkite/run-benchmarks.sh
+2
-0
.buildkite/run-cpu-test-ppc64le.sh
.buildkite/run-cpu-test-ppc64le.sh
+3
-1
.buildkite/run-cpu-test.sh
.buildkite/run-cpu-test.sh
+2
-0
.buildkite/run-multi-node-test.sh
.buildkite/run-multi-node-test.sh
+15
-12
.buildkite/run-neuron-test.sh
.buildkite/run-neuron-test.sh
+5
-3
.buildkite/run-openvino-test.sh
.buildkite/run-openvino-test.sh
+2
-0
.buildkite/run-tpu-test.sh
.buildkite/run-tpu-test.sh
+3
-1
.buildkite/run-xpu-test.sh
.buildkite/run-xpu-test.sh
+2
-0
.github/workflows/scripts/cuda-install.sh
.github/workflows/scripts/cuda-install.sh
+4
-4
.github/workflows/scripts/pytorch-install.sh
.github/workflows/scripts/pytorch-install.sh
+1
-1
.github/workflows/shellcheck.yml
.github/workflows/shellcheck.yml
+37
-0
No files found.
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
View file @
3be5b26a
...
...
@@ -41,6 +41,6 @@ while getopts "m:b:l:f:" OPT; do
done
lm_eval
--model
hf
\
--model_args
pretrained
=
$MODEL
,parallelize
=
True
\
--tasks
gsm8k
--num_fewshot
$FEWSHOT
--limit
$LIMIT
\
--batch_size
$BATCH_SIZE
--model_args
"
pretrained=
$MODEL
,parallelize=True
"
\
--tasks
gsm8k
--num_fewshot
"
$FEWSHOT
"
--limit
"
$LIMIT
"
\
--batch_size
"
$BATCH_SIZE
"
.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
View file @
3be5b26a
...
...
@@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
done
lm_eval
--model
vllm
\
--model_args
pretrained
=
$MODEL
,tensor_parallel_size
=
$TP_SIZE
,distributed_executor_backend
=
"
ray
"
,trust_remote_code
=
true
,max_model_len
=
4096
\
--tasks
gsm8k
--num_fewshot
$FEWSHOT
--limit
$LIMIT
\
--batch_size
$BATCH_SIZE
--model_args
"
pretrained=
$MODEL
,tensor_parallel_size=
$TP_SIZE
,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096
"
\
--tasks
gsm8k
--num_fewshot
"
$FEWSHOT
"
--limit
"
$LIMIT
"
\
--batch_size
"
$BATCH_SIZE
"
.buildkite/lm-eval-harness/run-tests.sh
View file @
3be5b26a
...
...
@@ -30,7 +30,7 @@ while getopts "c:t:" OPT; do
done
# Parse list of configs.
IFS
=
$'
\n
'
read
-d
''
-r
-a
MODEL_CONFIGS <
$CONFIG
IFS
=
$'
\n
'
read
-d
''
-r
-a
MODEL_CONFIGS <
"
$CONFIG
"
for
MODEL_CONFIG
in
"
${
MODEL_CONFIGS
[@]
}
"
do
...
...
.buildkite/nightly-benchmarks/scripts/launch-server.sh
View file @
3be5b26a
...
...
@@ -50,31 +50,30 @@ launch_trt_server() {
git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
git lfs
install
cd
tensorrtllm_backend
git checkout
$trt_llm_version
tensorrtllm_backend_dir
=
$(
pwd
)
git checkout
"
$trt_llm_version
"
git submodule update
--init
--recursive
# build trtllm engine
cd
/tensorrtllm_backend
cd
./tensorrt_llm/examples/
${
model_type
}
cd
"
./tensorrt_llm/examples/
${
model_type
}
"
python3 convert_checkpoint.py
\
--model_dir
${
model_path
}
\
--dtype
${
model_dtype
}
\
--tp_size
${
model_tp_size
}
\
--output_dir
${
trt_model_path
}
--model_dir
"
${
model_path
}
"
\
--dtype
"
${
model_dtype
}
"
\
--tp_size
"
${
model_tp_size
}
"
\
--output_dir
"
${
trt_model_path
}
"
trtllm-build
\
--checkpoint_dir
${
trt_model_path
}
\
--checkpoint_dir
"
${
trt_model_path
}
"
\
--use_fused_mlp
\
--reduce_fusion
disable
\
--workers
8
\
--gpt_attention_plugin
${
model_dtype
}
\
--gemm_plugin
${
model_dtype
}
\
--tp_size
${
model_tp_size
}
\
--max_batch_size
${
max_batch_size
}
\
--max_input_len
${
max_input_len
}
\
--max_seq_len
${
max_seq_len
}
\
--max_num_tokens
${
max_num_tokens
}
\
--output_dir
${
trt_engine_path
}
--gpt_attention_plugin
"
${
model_dtype
}
"
\
--gemm_plugin
"
${
model_dtype
}
"
\
--tp_size
"
${
model_tp_size
}
"
\
--max_batch_size
"
${
max_batch_size
}
"
\
--max_input_len
"
${
max_input_len
}
"
\
--max_seq_len
"
${
max_seq_len
}
"
\
--max_num_tokens
"
${
max_num_tokens
}
"
\
--output_dir
"
${
trt_engine_path
}
"
# handle triton protobuf files and launch triton server
cd
/tensorrtllm_backend
...
...
@@ -82,15 +81,15 @@ launch_trt_server() {
cp
-r
all_models/inflight_batcher_llm/
*
triton_model_repo/
cd
triton_model_repo
rm
-rf
./tensorrt_llm/1/
*
cp
-r
${
trt_engine_path
}
/
*
./tensorrt_llm/1
cp
-r
"
${
trt_engine_path
}
"
/
*
./tensorrt_llm/1
python3 ../tools/fill_template.py
-i
tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false
python3 ../tools/fill_template.py
-i
preprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:
$model_path
,preprocessing_instance_count:5
python3 ../tools/fill_template.py
-i
postprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:
$model_path
,postprocessing_instance_count:5,skip_special_tokens:false
python3 ../tools/fill_template.py
-i
ensemble/config.pbtxt triton_max_batch_size:
$max_batch_size
python3 ../tools/fill_template.py
-i
tensorrt_llm_bls/config.pbtxt triton_max_batch_size:
$max_batch_size
,decoupled_mode:true,accumulate_tokens:
"
False
"
,bls_instance_count:1
python3 ../tools/fill_template.py
-i
preprocessing/config.pbtxt
"
triton_max_batch_size:2048,tokenizer_dir:
$model_path
,preprocessing_instance_count:5
"
python3 ../tools/fill_template.py
-i
postprocessing/config.pbtxt
"
triton_max_batch_size:2048,tokenizer_dir:
$model_path
,postprocessing_instance_count:5,skip_special_tokens:false
"
python3 ../tools/fill_template.py
-i
ensemble/config.pbtxt triton_max_batch_size:
"
$max_batch_size
"
python3 ../tools/fill_template.py
-i
tensorrt_llm_bls/config.pbtxt
"
triton_max_batch_size:
$max_batch_size
,decoupled_mode:true,accumulate_tokens:False,bls_instance_count:1
"
cd
/tensorrtllm_backend
python3 scripts/launch_triton_server.py
\
--world_size
=
${
model_tp_size
}
\
--world_size
=
"
${
model_tp_size
}
"
\
--model_repo
=
/tensorrtllm_backend/triton_model_repo &
}
...
...
@@ -98,10 +97,7 @@ launch_trt_server() {
launch_tgi_server
()
{
model
=
$(
echo
"
$common_params
"
| jq
-r
'.model'
)
tp
=
$(
echo
"
$common_params
"
| jq
-r
'.tp'
)
dataset_name
=
$(
echo
"
$common_params
"
| jq
-r
'.dataset_name'
)
dataset_path
=
$(
echo
"
$common_params
"
| jq
-r
'.dataset_path'
)
port
=
$(
echo
"
$common_params
"
| jq
-r
'.port'
)
num_prompts
=
$(
echo
"
$common_params
"
| jq
-r
'.num_prompts'
)
server_args
=
$(
json2args
"
$server_params
"
)
if
echo
"
$common_params
"
| jq
-e
'has("fp8")'
>
/dev/null
;
then
...
...
@@ -129,10 +125,7 @@ launch_tgi_server() {
launch_lmdeploy_server
()
{
model
=
$(
echo
"
$common_params
"
| jq
-r
'.model'
)
tp
=
$(
echo
"
$common_params
"
| jq
-r
'.tp'
)
dataset_name
=
$(
echo
"
$common_params
"
| jq
-r
'.dataset_name'
)
dataset_path
=
$(
echo
"
$common_params
"
| jq
-r
'.dataset_path'
)
port
=
$(
echo
"
$common_params
"
| jq
-r
'.port'
)
num_prompts
=
$(
echo
"
$common_params
"
| jq
-r
'.num_prompts'
)
server_args
=
$(
json2args
"
$server_params
"
)
server_command
=
"lmdeploy serve api_server
$model
\
...
...
@@ -149,10 +142,7 @@ launch_sglang_server() {
model
=
$(
echo
"
$common_params
"
| jq
-r
'.model'
)
tp
=
$(
echo
"
$common_params
"
| jq
-r
'.tp'
)
dataset_name
=
$(
echo
"
$common_params
"
| jq
-r
'.dataset_name'
)
dataset_path
=
$(
echo
"
$common_params
"
| jq
-r
'.dataset_path'
)
port
=
$(
echo
"
$common_params
"
| jq
-r
'.port'
)
num_prompts
=
$(
echo
"
$common_params
"
| jq
-r
'.num_prompts'
)
server_args
=
$(
json2args
"
$server_params
"
)
if
echo
"
$common_params
"
| jq
-e
'has("fp8")'
>
/dev/null
;
then
...
...
@@ -185,10 +175,7 @@ launch_vllm_server() {
model
=
$(
echo
"
$common_params
"
| jq
-r
'.model'
)
tp
=
$(
echo
"
$common_params
"
| jq
-r
'.tp'
)
dataset_name
=
$(
echo
"
$common_params
"
| jq
-r
'.dataset_name'
)
dataset_path
=
$(
echo
"
$common_params
"
| jq
-r
'.dataset_path'
)
port
=
$(
echo
"
$common_params
"
| jq
-r
'.port'
)
num_prompts
=
$(
echo
"
$common_params
"
| jq
-r
'.num_prompts'
)
server_args
=
$(
json2args
"
$server_params
"
)
if
echo
"
$common_params
"
| jq
-e
'has("fp8")'
>
/dev/null
;
then
...
...
@@ -217,19 +204,19 @@ launch_vllm_server() {
main
()
{
if
[[
$CURRENT_LLM_SERVING_ENGINE
==
"trt"
]]
;
then
if
[[
"
$CURRENT_LLM_SERVING_ENGINE
"
==
"trt"
]]
;
then
launch_trt_server
fi
if
[[
$CURRENT_LLM_SERVING_ENGINE
==
"tgi"
]]
;
then
if
[[
"
$CURRENT_LLM_SERVING_ENGINE
"
==
"tgi"
]]
;
then
launch_tgi_server
fi
if
[[
$CURRENT_LLM_SERVING_ENGINE
==
"lmdeploy"
]]
;
then
if
[[
"
$CURRENT_LLM_SERVING_ENGINE
"
==
"lmdeploy"
]]
;
then
launch_lmdeploy_server
fi
if
[[
$CURRENT_LLM_SERVING_ENGINE
==
"sglang"
]]
;
then
if
[[
"
$CURRENT_LLM_SERVING_ENGINE
"
==
"sglang"
]]
;
then
launch_sglang_server
fi
...
...
.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
View file @
3be5b26a
...
...
@@ -16,10 +16,10 @@ main() {
fi
# initial annotation
description
=
"
$VLLM_SOURCE_CODE_LOC
/.buildkite/nightly-benchmarks/nightly-descriptions.md"
#
description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
# download results
cd
$VLLM_SOURCE_CODE_LOC
/benchmarks
cd
"
$VLLM_SOURCE_CODE_LOC
/benchmarks
"
mkdir
-p
results/
/workspace/buildkite-agent artifact download
'results/*nightly_results.json'
results/
ls
...
...
@@ -30,15 +30,15 @@ main() {
/workspace/buildkite-agent artifact upload
"results.zip"
# upload benchmarking scripts
cd
$VLLM_SOURCE_CODE_LOC
/
cd
"
$VLLM_SOURCE_CODE_LOC
/
"
zip
-r
nightly-benchmarks.zip .buildkite/ benchmarks/
/workspace/buildkite-agent artifact upload
"nightly-benchmarks.zip"
cd
$VLLM_SOURCE_CODE_LOC
/.buildkite/nightly-benchmarks/
cd
"
$VLLM_SOURCE_CODE_LOC
/.buildkite/nightly-benchmarks/
"
# upload benchmarking pipeline
/workspace/buildkite-agent artifact upload
"nightly-pipeline.yaml"
cd
$VLLM_SOURCE_CODE_LOC
/.buildkite/nightly-benchmarks/
cd
"
$VLLM_SOURCE_CODE_LOC
/.buildkite/nightly-benchmarks/
"
/workspace/buildkite-agent annotate
--style
"success"
--context
"nightly-benchmarks-results"
--append
< nightly-annotation.md
...
...
@@ -75,4 +75,4 @@ main() {
# /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
}
main
"
$@
"
\ No newline at end of file
main
"
$@
"
.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
View file @
3be5b26a
...
...
@@ -12,7 +12,7 @@ check_gpus() {
echo
"Need at least 1 GPU to run benchmarking."
exit
1
fi
declare
-g
gpu_type
=
$(
echo
$(
nvidia-smi
--query-gpu
=
name
--format
=
csv,noheader
)
|
awk
'{print $2}'
)
declare
-g
gpu_type
=
"
$(
nvidia-smi
--query-gpu
=
name
--format
=
csv,noheader |
awk
'{print $2}'
)
"
echo
"GPU type is
$gpu_type
"
}
...
...
@@ -102,7 +102,7 @@ kill_gpu_processes() {
pkill
-f
text-generation
pkill
-f
lmdeploy
while
[
$(
nvidia-smi
--query-gpu
=
memory.used
--format
=
csv,noheader,nounits |
head
-n
1
)
-ge
1000
]
;
do
while
[
"
$(
nvidia-smi
--query-gpu
=
memory.used
--format
=
csv,noheader,nounits |
head
-n
1
)
"
-ge
1000
]
;
do
sleep
1
done
}
...
...
@@ -119,8 +119,8 @@ wait_for_server() {
ensure_installed
()
{
# Ensure that the given command is available, installing it via apt-get if it is missing
local
cmd
=
$1
if
!
which
$cmd
>
/dev/null
;
then
apt-get update
&&
apt-get
install
-y
$cmd
if
!
which
"
$cmd
"
>
/dev/null
;
then
apt-get update
&&
apt-get
install
-y
"
$cmd
"
fi
}
...
...
@@ -173,13 +173,11 @@ run_serving_tests() {
echo
"Reuse previous server for test case
$test_name
"
else
kill_gpu_processes
bash
$VLLM_SOURCE_CODE_LOC
/.buildkite/nightly-benchmarks/scripts/launch-server.sh
\
bash
"
$VLLM_SOURCE_CODE_LOC
/.buildkite/nightly-benchmarks/scripts/launch-server.sh
"
\
"
$server_params
"
"
$common_params
"
fi
wait_for_server
if
[
$?
-eq
0
]
;
then
if
wait_for_server
;
then
echo
""
echo
"
$CURRENT_LLM_SERVING_ENGINE
server is up and running."
else
...
...
@@ -190,13 +188,13 @@ run_serving_tests() {
# prepare tokenizer
# this is required for lmdeploy.
cd
$VLLM_SOURCE_CODE_LOC
/benchmarks
cd
"
$VLLM_SOURCE_CODE_LOC
/benchmarks
"
rm
-rf
/tokenizer_cache
mkdir
/tokenizer_cache
python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
\
--model
"
$model
"
\
--cachedir
/tokenizer_cache
cd
$VLLM_SOURCE_CODE_LOC
/benchmarks
cd
"
$VLLM_SOURCE_CODE_LOC
/benchmarks
"
# change model name for lmdeploy (it will not follow standard hf name)
...
...
@@ -307,11 +305,11 @@ run_serving_tests() {
prepare_dataset
()
{
# download sharegpt dataset
cd
$VLLM_SOURCE_CODE_LOC
/benchmarks
cd
"
$VLLM_SOURCE_CODE_LOC
/benchmarks
"
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
# duplicate sonnet by 4x, to allow benchmarking with input length 2048
cd
$VLLM_SOURCE_CODE_LOC
/benchmarks
cd
"
$VLLM_SOURCE_CODE_LOC
/benchmarks
"
echo
""
>
sonnet_4x.txt
for
_
in
{
1..4
}
do
...
...
@@ -339,17 +337,17 @@ main() {
prepare_dataset
cd
$VLLM_SOURCE_CODE_LOC
/benchmarks
cd
"
$VLLM_SOURCE_CODE_LOC
/benchmarks
"
declare
-g
RESULTS_FOLDER
=
results/
mkdir
-p
$RESULTS_FOLDER
BENCHMARK_ROOT
=
$VLLM_SOURCE_CODE_LOC
/.buildkite/nightly-benchmarks/
BENCHMARK_ROOT
=
"
$VLLM_SOURCE_CODE_LOC
/.buildkite/nightly-benchmarks/
"
# run the test
run_serving_tests
$BENCHMARK_ROOT
/tests/nightly-tests.json
run_serving_tests
"
$BENCHMARK_ROOT
/tests/nightly-tests.json
"
# upload benchmark results to buildkite
python3
-m
pip
install
tabulate pandas
python3
$BENCHMARK_ROOT
/scripts/summary-nightly-results.py
python3
"
$BENCHMARK_ROOT
/scripts/summary-nightly-results.py
"
upload_to_buildkite
}
...
...
.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
View file @
3be5b26a
...
...
@@ -17,7 +17,7 @@ check_gpus() {
echo
"Need at least 1 GPU to run benchmarking."
exit
1
fi
declare
-g
gpu_type
=
$(
echo
$(
nvidia-smi
--query-gpu
=
name
--format
=
csv,noheader
)
|
awk
'{print $2}'
)
declare
-g
gpu_type
=
$(
nvidia-smi
--query-gpu
=
name
--format
=
csv,noheader |
awk
'{print $2}'
)
echo
"GPU type is
$gpu_type
"
}
...
...
@@ -93,7 +93,7 @@ kill_gpu_processes() {
# wait until GPU memory usage is smaller than 1GB
while
[
$(
nvidia-smi
--query-gpu
=
memory.used
--format
=
csv,noheader,nounits |
head
-n
1
)
-ge
1000
]
;
do
while
[
"
$(
nvidia-smi
--query-gpu
=
memory.used
--format
=
csv,noheader,nounits |
head
-n
1
)
"
-ge
1000
]
;
do
sleep
1
done
...
...
@@ -117,7 +117,7 @@ upload_to_buildkite() {
fi
# Use the determined command to annotate and upload artifacts
$BUILDKITE_AGENT_COMMAND
annotate
--style
"info"
--context
"
$BUILDKITE_LABEL
-benchmark-results"
<
$RESULTS_FOLDER
/benchmark_results.md
$BUILDKITE_AGENT_COMMAND
annotate
--style
"info"
--context
"
$BUILDKITE_LABEL
-benchmark-results"
<
"
$RESULTS_FOLDER
/benchmark_results.md
"
$BUILDKITE_AGENT_COMMAND
artifact upload
"
$RESULTS_FOLDER
/*"
}
...
...
@@ -150,7 +150,7 @@ run_latency_tests() {
# check if there are enough GPUs to run the test
tp
=
$(
echo
"
$latency_params
"
| jq
-r
'.tensor_parallel_size'
)
if
[[
$gpu_count
-lt
$tp
]]
;
then
echo
"Required tensor-parallel-size
$tp
but only
$gpu_count
GPU found. Skip testcase
$testname
."
echo
"Required tensor-parallel-size
$tp
but only
$gpu_count
GPU found. Skip testcase
$test
_
name
."
continue
fi
...
...
@@ -206,9 +206,9 @@ run_throughput_tests() {
throughput_args
=
$(
json2args
"
$throughput_params
"
)
# check if there are enough GPUs to run the test
tp
=
$(
echo
$throughput_params
| jq
-r
'.tensor_parallel_size'
)
tp
=
$(
echo
"
$throughput_params
"
| jq
-r
'.tensor_parallel_size'
)
if
[[
$gpu_count
-lt
$tp
]]
;
then
echo
"Required tensor-parallel-size
$tp
but only
$gpu_count
GPU found. Skip testcase
$testname
."
echo
"Required tensor-parallel-size
$tp
but only
$gpu_count
GPU found. Skip testcase
$test
_
name
."
continue
fi
...
...
@@ -270,7 +270,7 @@ run_serving_tests() {
# check if there are enough GPUs to run the test
tp
=
$(
echo
"
$server_params
"
| jq
-r
'.tensor_parallel_size'
)
if
[[
$gpu_count
-lt
$tp
]]
;
then
echo
"Required tensor-parallel-size
$tp
but only
$gpu_count
GPU found. Skip testcase
$testname
."
echo
"Required tensor-parallel-size
$tp
but only
$gpu_count
GPU found. Skip testcase
$test
_
name
."
continue
fi
...
...
@@ -278,7 +278,7 @@ run_serving_tests() {
server_model
=
$(
echo
"
$server_params
"
| jq
-r
'.model'
)
client_model
=
$(
echo
"
$client_params
"
| jq
-r
'.model'
)
if
[[
$server_model
!=
"
$client_model
"
]]
;
then
echo
"Server model and client model must be the same. Skip testcase
$testname
."
echo
"Server model and client model must be the same. Skip testcase
$test
_
name
."
continue
fi
...
...
@@ -293,8 +293,7 @@ run_serving_tests() {
server_pid
=
$!
# wait until the server is alive
wait_for_server
if
[
$?
-eq
0
]
;
then
if
wait_for_server
;
then
echo
""
echo
"vllm server is up and running."
else
...
...
.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
View file @
3be5b26a
...
...
@@ -6,7 +6,7 @@ TIMEOUT_SECONDS=10
retries
=
0
while
[
$retries
-lt
1000
]
;
do
if
[
$(
curl
-s
--max-time
$TIMEOUT_SECONDS
-L
-H
"Authorization: Bearer
$TOKEN
"
-o
/dev/null
-w
"%{http_code}"
$URL
)
-eq
200
]
;
then
if
[
"
$(
curl
-s
--max-time
"
$TIMEOUT_SECONDS
"
-L
-H
"Authorization: Bearer
$TOKEN
"
-o
/dev/null
-w
"%{http_code}"
"
$URL
"
)
"
-eq
200
]
;
then
exit
0
fi
...
...
@@ -16,4 +16,4 @@ while [ $retries -lt 1000 ]; do
sleep
5
done
exit
1
\ No newline at end of file
exit
1
.buildkite/run-amd-test.sh
View file @
3be5b26a
#!/bin/bash
# This script runs tests inside the corresponding ROCm docker container.
set
-o
pipefail
...
...
@@ -57,17 +59,17 @@ done
echo
"--- Pulling container"
image_name
=
"rocm/vllm-ci:
${
BUILDKITE_COMMIT
}
"
container_name
=
"rocm_
${
BUILDKITE_COMMIT
}
_
$(
tr
-dc
A-Za-z0-9 < /dev/urandom |
head
-c
10
;
echo
)
"
docker pull
${
image_name
}
docker pull
"
${
image_name
}
"
remove_docker_container
()
{
docker
rm
-f
${
container_name
}
||
docker image
rm
-f
${
image_name
}
||
true
docker
rm
-f
"
${
container_name
}
"
||
docker image
rm
-f
"
${
image_name
}
"
||
true
}
trap
remove_docker_container EXIT
echo
"--- Running container"
HF_CACHE
=
"
$(
realpath
~
)
/huggingface"
mkdir
-p
${
HF_CACHE
}
mkdir
-p
"
${
HF_CACHE
}
"
HF_MOUNT
=
"/root/.cache/huggingface"
commands
=
$@
...
...
@@ -118,25 +120,25 @@ if [[ $commands == *"--shard-id="* ]]; then
--network
host
\
--shm-size
=
16gb
\
--rm
\
-e
HIP_VISIBLE_DEVICES
=
${
GPU
}
\
-e
HIP_VISIBLE_DEVICES
=
"
${
GPU
}
"
\
-e
HF_TOKEN
\
-v
${
HF_CACHE
}
:
${
HF_MOUNT
}
\
-e
HF_HOME
=
${
HF_MOUNT
}
\
--name
${
container_name
}
_
${
GPU
}
\
${
image_name
}
\
-v
"
${
HF_CACHE
}
:
${
HF_MOUNT
}
"
\
-e
"
HF_HOME=
${
HF_MOUNT
}
"
\
--name
"
${
container_name
}
_
${
GPU
}
"
\
"
${
image_name
}
"
\
/bin/bash
-c
"
${
commands_gpu
}
"
\
|&
while
read
-r
line
;
do
echo
">>Shard
$GPU
:
$line
"
;
done
&
PIDS+
=(
$!
)
done
#wait for all processes to finish and collect exit codes
for
pid
in
${
PIDS
[@]
}
;
do
wait
${
pid
}
for
pid
in
"
${
PIDS
[@]
}
"
;
do
wait
"
${
pid
}
"
STATUS+
=(
$?
)
done
for
st
in
${
STATUS
[@]
}
;
do
for
st
in
"
${
STATUS
[@]
}
"
;
do
if
[[
${
st
}
-ne
0
]]
;
then
echo
"One of the processes failed with
$st
"
exit
${
st
}
exit
"
${
st
}
"
fi
done
else
...
...
@@ -147,9 +149,9 @@ else
--rm
\
-e
HIP_VISIBLE_DEVICES
=
0
\
-e
HF_TOKEN
\
-v
${
HF_CACHE
}
:
${
HF_MOUNT
}
\
-e
HF_HOME
=
${
HF_MOUNT
}
\
--name
${
container_name
}
\
${
image_name
}
\
-v
"
${
HF_CACHE
}
:
${
HF_MOUNT
}
"
\
-e
"
HF_HOME=
${
HF_MOUNT
}
"
\
--name
"
${
container_name
}
"
\
"
${
image_name
}
"
\
/bin/bash
-c
"
${
commands
}
"
fi
.buildkite/run-benchmarks.sh
View file @
3be5b26a
#!/bin/bash
# This script is run by buildkite to run the benchmarks and upload the results to buildkite
set
-ex
...
...
.buildkite/run-cpu-test-ppc64le.sh
View file @
3be5b26a
#!/bin/bash
# This script builds the CPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set
-ex
...
...
@@ -13,7 +15,7 @@ remove_docker_container
# Run the image, setting --shm-size=4g for tensor parallel.
source
/etc/environment
#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
docker run
-itd
--entrypoint
/bin/bash
-v
~/.cache/huggingface:/root/.cache/huggingface
--privileged
=
true
--network
host
-e
HF_TOKEN
=
$HF_TOKEN
--name
cpu-test cpu-test
docker run
-itd
--entrypoint
/bin/bash
-v
~/.cache/huggingface:/root/.cache/huggingface
--privileged
=
true
--network
host
-e
HF_TOKEN
=
"
$HF_TOKEN
"
--name
cpu-test cpu-test
# Run basic model test
docker
exec
cpu-test bash
-c
"
...
...
.buildkite/run-cpu-test.sh
View file @
3be5b26a
#!/bin/bash
# This script builds the CPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set
-ex
...
...
.buildkite/run-multi-node-test.sh
View file @
3be5b26a
...
...
@@ -14,7 +14,7 @@ DOCKER_IMAGE=$4
shift
4
COMMANDS
=(
"
$@
"
)
if
[
${#
COMMANDS
[@]
}
-ne
$NUM_NODES
]
;
then
if
[
${#
COMMANDS
[@]
}
-ne
"
$NUM_NODES
"
]
;
then
echo
"The number of commands must be equal to the number of nodes."
echo
"Number of nodes:
$NUM_NODES
"
echo
"Number of commands:
${#
COMMANDS
[@]
}
"
...
...
@@ -23,7 +23,7 @@ fi
echo
"List of commands"
for
command
in
"
${
COMMANDS
[@]
}
"
;
do
echo
$command
echo
"
$command
"
done
start_network
()
{
...
...
@@ -36,7 +36,7 @@ start_nodes() {
for
node_gpu
in
$(
seq
0
$((
$NUM_GPUS
-
1
))
)
;
do
DEVICE_NUM
=
$((
$node
*
$NUM_GPUS
+
$node_gpu
))
GPU_DEVICES+
=
$((
$DEVICE_NUM
))
if
[
$node_gpu
-lt
$((
$NUM_GPUS
-
1
))
]
;
then
if
[
"
$node_gpu
"
-lt
$((
$NUM_GPUS
-
1
))
]
;
then
GPU_DEVICES+
=
','
fi
done
...
...
@@ -49,17 +49,20 @@ start_nodes() {
# 3. map the huggingface cache directory to the container
# 4. assign IP addresses to the containers (head node: 192.168.10.10, worker nodes:
# starting from 192.168.10.11)
docker run
-d
--gpus
"
$GPU_DEVICES
"
--shm-size
=
10.24gb
-e
HF_TOKEN
-v
~/.cache/huggingface:/root/.cache/huggingface
--name
node
$node
--network
docker-net
--ip
192.168.10.
$((
10
+
$node
))
--rm
$DOCKER_IMAGE
/bin/bash
-c
"tail -f /dev/null"
docker run
-d
--gpus
"
$GPU_DEVICES
"
--shm-size
=
10.24gb
-e
HF_TOKEN
\
-v
~/.cache/huggingface:/root/.cache/huggingface
--name
"node
$node
"
\
--network
docker-net
--ip
192.168.10.
$((
10
+
$node
))
--rm
"
$DOCKER_IMAGE
"
\
/bin/bash
-c
"tail -f /dev/null"
# organize containers into a ray cluster
if
[
$node
-eq
0
]
;
then
if
[
"
$node
"
-eq
0
]
;
then
# start the ray head node
docker
exec
-d
node
$node
/bin/bash
-c
"ray start --head --port=6379 --block"
docker
exec
-d
"
node
$node
"
/bin/bash
-c
"ray start --head --port=6379 --block"
# wait for the head node to be ready
sleep
10
else
# start the ray worker nodes, and connect them to the head node
docker
exec
-d
node
$node
/bin/bash
-c
"ray start --address=192.168.10.10:6379 --block"
docker
exec
-d
"
node
$node
"
/bin/bash
-c
"ray start --address=192.168.10.10:6379 --block"
fi
done
...
...
@@ -79,22 +82,22 @@ run_nodes() {
for
node_gpu
in
$(
seq
0
$((
$NUM_GPUS
-
1
))
)
;
do
DEVICE_NUM
=
$((
$node
*
$NUM_GPUS
+
$node_gpu
))
GPU_DEVICES+
=
$((
$DEVICE_NUM
))
if
[
$node_gpu
-lt
$((
$NUM_GPUS
-
1
))
]
;
then
if
[
"
$node_gpu
"
-lt
$((
$NUM_GPUS
-
1
))
]
;
then
GPU_DEVICES+
=
','
fi
done
GPU_DEVICES+
=
'"'
echo
"Running node
$node
with GPU devices:
$GPU_DEVICES
"
if
[
$node
-ne
0
]
;
then
docker
exec
-d
node
$node
/bin/bash
-c
"cd
$WORKING_DIR
;
${
COMMANDS
[
$node
]
}
"
if
[
"
$node
"
-ne
0
]
;
then
docker
exec
-d
"
node
$node
"
/bin/bash
-c
"cd
$WORKING_DIR
;
${
COMMANDS
[
$node
]
}
"
else
docker
exec
node
$node
/bin/bash
-c
"cd
$WORKING_DIR
;
${
COMMANDS
[
$node
]
}
"
docker
exec
"
node
$node
"
/bin/bash
-c
"cd
$WORKING_DIR
;
${
COMMANDS
[
$node
]
}
"
fi
done
}
cleanup
()
{
for
node
in
$(
seq
0
$((
$NUM_NODES
-
1
))
)
;
do
docker stop node
$node
docker stop
"
node
$node
"
done
docker network
rm
docker-net
}
...
...
.buildkite/run-neuron-test.sh
View file @
3be5b26a
#!/bin/bash
# This script builds the Neuron docker image and runs the API server inside the container.
# It serves as a sanity check for compilation and basic model usage.
set
-e
...
...
@@ -12,10 +14,10 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
current_time
=
$(
date
+%s
)
if
[
$((
current_time
-
last_build
))
-gt
86400
]
;
then
docker system prune
-f
echo
$current_time
>
/tmp/neuron-docker-build-timestamp
echo
"
$current_time
"
>
/tmp/neuron-docker-build-timestamp
fi
else
echo
$(
date
+%s
)
>
/tmp/neuron-docker-build-timestamp
date
"
+%s
"
>
/tmp/neuron-docker-build-timestamp
fi
docker build
-t
neuron
-f
Dockerfile.neuron
.
...
...
@@ -34,7 +36,7 @@ wait_for_server_to_start() {
timeout
=
300
counter
=
0
while
[
"
$(
curl
-s
-o
/dev/null
-w
'
'
%
{
http_code
}
'
'
localhost:8000/health
)
"
!=
"200"
]
;
do
while
[
"
$(
curl
-s
-o
/dev/null
-w
'%{http_code}'
localhost:8000/health
)
"
!=
"200"
]
;
do
sleep
1
counter
=
$((
counter
+
1
))
if
[
$counter
-ge
$timeout
]
;
then
...
...
.buildkite/run-openvino-test.sh
View file @
3be5b26a
#!/bin/bash
# This script builds the OpenVINO docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set
-ex
...
...
.buildkite/run-tpu-test.sh
View file @
3be5b26a
#!/bin/bash
set
-e
# Build the docker image.
...
...
@@ -12,4 +14,4 @@ remove_docker_container
# For HF_TOKEN.
source
/etc/environment
# Run a simple end-to-end example.
docker run
--privileged
--net
host
--shm-size
=
16G
-it
-e
HF_TOKEN
=
$HF_TOKEN
--name
tpu-test vllm-tpu /bin/bash
-c
"python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
docker run
--privileged
--net
host
--shm-size
=
16G
-it
-e
"
HF_TOKEN=
$HF_TOKEN
"
--name
tpu-test vllm-tpu /bin/bash
-c
"python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
.buildkite/run-xpu-test.sh
View file @
3be5b26a
#!/bin/bash
# This script builds the XPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set
-ex
...
...
.github/workflows/scripts/cuda-install.sh
View file @
3be5b26a
#!/bin/bash
# Replace '.' with '-' ex: 11.8 -> 11-8
cuda_version
=
$(
echo
$1
|
tr
"."
"-"
)
cuda_version
=
$(
echo
"
$1
"
|
tr
"."
"-"
)
# Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004
OS
=
$(
echo
$2
|
tr
-d
".
\-
"
)
OS
=
$(
echo
"
$2
"
|
tr
-d
".
\-
"
)
# Installs CUDA
wget
-nv
https://developer.download.nvidia.com/compute/cuda/repos/
${
OS
}
/x86_64/cuda-keyring_1.1-1_all.deb
wget
-nv
"
https://developer.download.nvidia.com/compute/cuda/repos/
${
OS
}
/x86_64/cuda-keyring_1.1-1_all.deb
"
sudo
dpkg
-i
cuda-keyring_1.1-1_all.deb
rm
cuda-keyring_1.1-1_all.deb
sudo
apt
-qq
update
sudo
apt
-y
install
cuda-
${
cuda_version
}
cuda-nvcc-
${
cuda_version
}
cuda-libraries-dev-
${
cuda_version
}
sudo
apt
-y
install
"
cuda-
${
cuda_version
}
"
"
cuda-nvcc-
${
cuda_version
}
"
"
cuda-libraries-dev-
${
cuda_version
}
"
sudo
apt clean
# Test nvcc
...
...
.github/workflows/scripts/pytorch-install.sh
View file @
3be5b26a
...
...
@@ -6,7 +6,7 @@ cuda_version=$3
# Install torch
$python_executable
-m
pip
install
numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools
&&
conda clean
-ya
$python_executable
-m
pip
install
torch
==
${
pytorch_version
}
+cu
${
cuda_version
//./
}
--extra-index-url
https://download.pytorch.org/whl/cu
${
cuda_version
//./
}
$python_executable
-m
pip
install
torch
==
"
${
pytorch_version
}
+cu
${
cuda_version
//./
}
"
--extra-index-url
"
https://download.pytorch.org/whl/cu
${
cuda_version
//./
}
"
# Print version information
$python_executable
--version
...
...
.github/workflows/shellcheck.yml
0 → 100644
View file @
3be5b26a
name
:
Lint shell scripts
on
:
push
:
branches
:
-
"
main"
paths
:
-
'
**/*.sh'
-
'
.github/workflows/shellcheck.yml'
pull_request
:
branches
:
-
"
main"
paths
:
-
'
**/*.sh'
-
'
.github/workflows/shellcheck.yml'
env
:
LC_ALL
:
en_US.UTF-8
defaults
:
run
:
shell
:
bash
permissions
:
contents
:
read
jobs
:
shellcheck
:
runs-on
:
ubuntu-latest
steps
:
-
name
:
"
Checkout"
uses
:
actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332
# v4.1.7
with
:
fetch-depth
:
0
-
name
:
"
Check
shell
scripts"
run
:
|
tools/shellcheck.sh
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment