Commit ad385667 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge branch 'v0.6.3.post1-dev'

parents be0967c1 903593d3
...@@ -8,6 +8,7 @@ main() { ...@@ -8,6 +8,7 @@ main() {
(which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq) (which jq) || (apt-get update && apt-get -y install jq)
(which zip) || (apt-get install -y zip)
if [ ! -f /workspace/buildkite-agent ]; then if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip plotting the results." echo "buildkite-agent binary not found. Skip plotting the results."
...@@ -24,17 +25,54 @@ main() { ...@@ -24,17 +25,54 @@ main() {
ls ls
ls results/ ls results/
# generate figures # upload benchmark results
python3 -m pip install tabulate pandas matplotlib zip -r results.zip results/
python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ /workspace/buildkite-agent artifact upload "results.zip"
--description $description \
--results-folder results/ # upload benchmarking scripts
cd $VLLM_SOURCE_CODE_LOC/
zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
/workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"
cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/
# upload benchmarking pipeline
/workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"
cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/
/workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md
# The figures should be genereated by a separate process outside the CI/CD pipeline
# # generate figures
# python3 -m pip install tabulate pandas matplotlib
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
# --description $description \
# --results-folder results/
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
# --description $description \
# --results-folder results/ \
# --dataset sharegpt
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
# --description $description \
# --results-folder results/ \
# --dataset sonnet_2048_128
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
# --description $description \
# --results-folder results/ \
# --dataset sonnet_128_2048
# upload results and figures # # upload results and figures
/workspace/buildkite-agent artifact upload "nightly_results.png" # /workspace/buildkite-agent artifact upload "nightly_results*.png"
/workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
/workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
/workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
} }
main "$@" main "$@"
\ No newline at end of file
#!/bin/bash #!/bin/bash
set -o pipefail set -o pipefail
set -x
check_gpus() { check_gpus() {
# check the number of GPUs and GPU type. # check the number of GPUs and GPU type.
...@@ -15,15 +16,66 @@ check_gpus() { ...@@ -15,15 +16,66 @@ check_gpus() {
echo "GPU type is $gpu_type" echo "GPU type is $gpu_type"
} }
kill_gpu_processes() { check_hf_token() {
pkill lmdeploy || true # check if HF_TOKEN is available and valid
# waiting for GPU processes to be fully killed if [[ -z "$HF_TOKEN" ]]; then
sleep 10 echo "Error: HF_TOKEN is not set."
# Print the GPU memory usage exit 1
# so that we know if all GPU processes are killed. elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) echo "Error: HF_TOKEN does not start with 'hf_'."
# The memory usage should be 0 MB. exit 1
echo "GPU 0 Memory Usage: $gpu_memory_usage MB" else
echo "HF_TOKEN is set and valid."
fi
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
get_current_llm_serving_engine() {
if which lmdeploy >/dev/null; then
echo "Container: lmdeploy"
export CURRENT_LLM_SERVING_ENGINE=lmdeploy
return
fi
if [ -e /tgi-entrypoint.sh ]; then
echo "Container: tgi"
export CURRENT_LLM_SERVING_ENGINE=tgi
return
fi
if which trtllm-build >/dev/null; then
echo "Container: tensorrt-llm"
export CURRENT_LLM_SERVING_ENGINE=trt
return
fi
if [ -e /sgl-workspace ]; then
echo "Container: sglang"
export CURRENT_LLM_SERVING_ENGINE=sglang
return
fi
if [ -e /vllm-workspace ]; then
echo "Container: vllm"
# move to a completely irrelevant directory, to avoid import vllm from current folder
export CURRENT_LLM_SERVING_ENGINE=vllm
return
fi
} }
json2args() { json2args() {
...@@ -42,6 +94,19 @@ json2args() { ...@@ -42,6 +94,19 @@ json2args() {
echo "$args" echo "$args"
} }
kill_gpu_processes() {
pkill -f python
pkill -f python3
pkill -f tritonserver
pkill -f pt_main_thread
pkill -f text-generation
pkill -f lmdeploy
while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do
sleep 1
done
}
wait_for_server() { wait_for_server() {
# wait for vllm server to start # wait for vllm server to start
# return 1 if vllm server crashes # return 1 if vllm server crashes
...@@ -51,6 +116,14 @@ wait_for_server() { ...@@ -51,6 +116,14 @@ wait_for_server() {
done' && return 0 || return 1 done' && return 0 || return 1
} }
ensure_installed() {
# Ensure that the given command is installed by apt-get
local cmd=$1
if ! which $cmd >/dev/null; then
apt-get update && apt-get install -y $cmd
fi
}
run_serving_tests() { run_serving_tests() {
# run serving tests using `benchmark_serving.py` # run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases # $1: a json file specifying serving test cases
...@@ -68,10 +141,10 @@ run_serving_tests() { ...@@ -68,10 +141,10 @@ run_serving_tests() {
echo "Skip test case $test_name." echo "Skip test case $test_name."
continue continue
fi fi
# append lmdeploy to the test name # prepend the current serving engine to the test name
test_name=lmdeploy_$test_name test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
# get common parameters # get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters') common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model') model=$(echo "$common_params" | jq -r '.model')
...@@ -80,13 +153,11 @@ run_serving_tests() { ...@@ -80,13 +153,11 @@ run_serving_tests() {
dataset_path=$(echo "$common_params" | jq -r '.dataset_path') dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port') port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts') num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
# get client and server arguments # get client and server arguments
server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters') server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters') client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters")
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params") client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list') qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
...@@ -94,40 +165,44 @@ run_serving_tests() { ...@@ -94,40 +165,44 @@ run_serving_tests() {
# check if there is enough GPU to run the test # check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue continue
fi fi
# prepare tokenizer if [[ $reuse_server == "true" ]]; then
rm -rf /tokenizer_cache echo "Reuse previous server for test case $test_name"
mkdir /tokenizer_cache else
python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \ kill_gpu_processes
--model "$model" \ bash $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh \
--cachedir /tokenizer_cache "$server_params" "$common_params"
fi
server_command="lmdeploy serve api_server $model \
--tp $tp \
--server-port $port \
$server_args"
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
bash -c "$server_command" &
# wait until the server is alive
wait_for_server wait_for_server
if [ $? -eq 0 ]; then if [ $? -eq 0 ]; then
echo "" echo ""
echo "lmdeploy server is up and running." echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
else else
echo "" echo ""
echo "lmdeploy failed to start within the timeout period." echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
break break
fi fi
# get model name # prepare tokenizer
model_name=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py) # this is required for lmdeploy.
cd $VLLM_SOURCE_CODE_LOC/benchmarks
rm -rf /tokenizer_cache
mkdir /tokenizer_cache
python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
--model "$model" \
--cachedir /tokenizer_cache
cd $VLLM_SOURCE_CODE_LOC/benchmarks
# change model name for lmdeploy (it will not follow standard hf name)
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
fi
# iterate over different QPS # iterate over different QPS
for qps in $qps_list; do for qps in $qps_list; do
...@@ -140,31 +215,79 @@ run_serving_tests() { ...@@ -140,31 +215,79 @@ run_serving_tests() {
new_test_name=$test_name"_qps_"$qps new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \ backend=$CURRENT_LLM_SERVING_ENGINE
--backend lmdeploy \
--tokenizer /tokenizer_cache \ if [[ $backend = "trt" ]]; then
--dataset-name $dataset_name \ backend="tensorrt-llm"
--dataset-path $dataset_path \ fi
--num-prompts $num_prompts \
--port $port \ if [[ "$backend" == *"vllm"* ]]; then
--save-result \ backend="vllm"
--result-dir $RESULTS_FOLDER \ fi
--result-filename ${new_test_name}.json \
--request-rate $qps \ if [[ "$dataset_name" = "sharegpt" ]]; then
--model \"$model_name\" \
$client_args" client_command="python3 benchmark_serving.py \
--backend $backend \
--tokenizer /tokenizer_cache \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--ignore-eos \
$client_args"
elif [[ "$dataset_name" = "sonnet" ]]; then
sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len')
sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
client_command="python3 benchmark_serving.py \
--backend $backend \
--tokenizer /tokenizer_cache \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--sonnet-input-len $sonnet_input_len \
--sonnet-output-len $sonnet_output_len \
--sonnet-prefix-len $sonnet_prefix_len \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--ignore-eos \
$client_args"
else
echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
exit 1
fi
echo "Running test case $test_name with qps $qps" echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command" echo "Client command: $client_command"
eval "$client_command" eval "$client_command"
server_command="None"
# record the benchmarking commands # record the benchmarking commands
jq_output=$(jq -n \ jq_output=$(jq -n \
--arg server "$server_command" \ --arg server "$server_command" \
--arg client "$client_command" \ --arg client "$client_command" \
--arg gpu "$gpu_type" \ --arg gpu "$gpu_type" \
--arg engine "lmdeploy" \ --arg engine "$CURRENT_LLM_SERVING_ENGINE" \
'{ '{
server_command: $server, server_command: $server,
client_command: $client, client_command: $client,
...@@ -175,42 +298,58 @@ run_serving_tests() { ...@@ -175,42 +298,58 @@ run_serving_tests() {
done done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done done
kill_gpu_processes
} }
upload_to_buildkite() { prepare_dataset() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0 # download sharegpt dataset
if [ ! -f /workspace/buildkite-agent ]; then cd $VLLM_SOURCE_CODE_LOC/benchmarks
echo "buildkite-agent binary not found. Skip uploading the results." wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
# duplicate sonnet by 4x, to allow benchmarking with input length 2048
cd $VLLM_SOURCE_CODE_LOC/benchmarks
echo "" > sonnet_4x.txt
for _ in {1..4}
do
cat sonnet.txt >> sonnet_4x.txt
done
}
main() { main() {
# check if the environment variable is successfully injected from yaml
check_gpus check_gpus
# enter vllm directory check_hf_token
cd $VLLM_SOURCE_CODE_LOC/benchmarks get_current_llm_serving_engine
pip install -U transformers
# check storage
df -h
ensure_installed wget
ensure_installed curl
ensure_installed jq
prepare_dataset
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/ declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ BENCHMARK_ROOT=$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/
python -m pip install transformers==4.41.2
export CURRENT_LLM_SERVING_ENGINE=lmdeploy # run the test
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python -m pip install tabulate pandas
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py # upload benchmark results to buildkite
python3 -m pip install tabulate pandas
python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite upload_to_buildkite
} }
......
...@@ -37,9 +37,9 @@ check_hf_token() { ...@@ -37,9 +37,9 @@ check_hf_token() {
ensure_sharegpt_downloaded() { ensure_sharegpt_downloaded() {
local FILE=ShareGPT_V3_unfiltered_cleaned_split.json local FILE=ShareGPT_V3_unfiltered_cleaned_split.json
if [ ! -f "$FILE" ]; then if [ ! -f "$FILE" ]; then
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE
else else
echo "$FILE already exists." echo "$FILE already exists."
fi fi
} }
...@@ -68,35 +68,38 @@ wait_for_server() { ...@@ -68,35 +68,38 @@ wait_for_server() {
done' && return 0 || return 1 done' && return 0 || return 1
} }
kill_gpu_processes() { kill_processes_launched_by_current_bash() {
# kill all processes on GPU. # Kill all python processes launched from current bash script
pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader) current_shell_pid=$$
if [ -z "$pids" ]; then processes=$(ps -eo pid,ppid,command | awk -v ppid="$current_shell_pid" -v proc="$1" '$2 == ppid && $3 ~ proc {print $1}')
echo "No GPU processes found." if [ -n "$processes" ]; then
echo "Killing the following processes matching '$1':"
echo "$processes"
echo "$processes" | xargs kill -9
else else
for pid in $pids; do echo "No processes found matching '$1'."
kill -9 "$pid"
echo "Killed process with PID: $pid"
done
echo "All GPU processes have been killed."
fi fi
}
kill_gpu_processes() {
# waiting for GPU processes to be fully killed ps -aux
# loop while nvidia-smi returns any processes lsof -t -i:8000 | xargs -r kill -9
while [ -n "$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)" ]; do pkill -f pt_main_thread
# this line doesn't work now
# ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
pkill -f python3
pkill -f /usr/bin/python3
# wait until GPU memory usage smaller than 1GB
while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do
sleep 1 sleep 1
echo "Waiting for GPU processes to be killed"
done done
# remove vllm config file # remove vllm config file
rm -rf ~/.config/vllm rm -rf ~/.config/vllm
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
} }
upload_to_buildkite() { upload_to_buildkite() {
...@@ -114,7 +117,7 @@ upload_to_buildkite() { ...@@ -114,7 +117,7 @@ upload_to_buildkite() {
fi fi
# Use the determined command to annotate and upload artifacts # Use the determined command to annotate and upload artifacts
$BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < $RESULTS_FOLDER/benchmark_results.md $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" <$RESULTS_FOLDER/benchmark_results.md
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*" $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
} }
...@@ -166,7 +169,7 @@ run_latency_tests() { ...@@ -166,7 +169,7 @@ run_latency_tests() {
latency_command: $latency, latency_command: $latency,
gpu_type: $gpu gpu_type: $gpu
}') }')
echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands" echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
# run the benchmark # run the benchmark
eval "$latency_command" eval "$latency_command"
...@@ -176,7 +179,6 @@ run_latency_tests() { ...@@ -176,7 +179,6 @@ run_latency_tests() {
done done
} }
run_throughput_tests() { run_throughput_tests() {
# run throughput tests using `benchmark_throughput.py` # run throughput tests using `benchmark_throughput.py`
# $1: a json file specifying throughput test cases # $1: a json file specifying throughput test cases
...@@ -224,7 +226,7 @@ run_throughput_tests() { ...@@ -224,7 +226,7 @@ run_throughput_tests() {
throughput_command: $command, throughput_command: $command,
gpu_type: $gpu gpu_type: $gpu
}') }')
echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands" echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
# run the benchmark # run the benchmark
eval "$throughput_command" eval "$throughput_command"
...@@ -256,7 +258,6 @@ run_serving_tests() { ...@@ -256,7 +258,6 @@ run_serving_tests() {
continue continue
fi fi
# get client and server arguments # get client and server arguments
server_params=$(echo "$params" | jq -r '.server_parameters') server_params=$(echo "$params" | jq -r '.server_parameters')
client_params=$(echo "$params" | jq -r '.client_parameters') client_params=$(echo "$params" | jq -r '.client_parameters')
...@@ -334,7 +335,7 @@ run_serving_tests() { ...@@ -334,7 +335,7 @@ run_serving_tests() {
client_command: $client, client_command: $client,
gpu_type: $gpu gpu_type: $gpu
}') }')
echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands" echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done done
...@@ -351,6 +352,7 @@ main() { ...@@ -351,6 +352,7 @@ main() {
# dependencies # dependencies
(which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq) (which jq) || (apt-get update && apt-get -y install jq)
(which lsof) || (apt-get update && apt-get install -y lsof)
# get the current IP address, required by benchmark_serving.py # get the current IP address, required by benchmark_serving.py
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
...@@ -369,7 +371,6 @@ main() { ...@@ -369,7 +371,6 @@ main() {
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json
# postprocess benchmarking results # postprocess benchmarking results
pip install tabulate pandas pip install tabulate pandas
python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
......
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
pkill text-generation || true
# waiting for GPU processes to be fully killed
sleep 10
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
timeout 1200 bash -c '
until curl -s localhost:8000/generate_stream > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append tgi to the test name
test_name=tgi_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.tgi_server_parameters')
client_params=$(echo "$params" | jq -r '.tgi_client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
echo "Key 'fp8' exists in common params."
server_command="/tgi-entrypoint.sh \
--model-id $model \
--num-shard $tp \
--port $port \
--quantize fp8 \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="/tgi-entrypoint.sh \
--model-id $model \
--num-shard $tp \
--port $port \
$server_args"
fi
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
eval "$server_command" &
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "tgi server is up and running."
else
echo ""
echo "tgi failed to start within the timeout period."
break
fi
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend tgi \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "tgi" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
export CURRENT_LLM_SERVING_ENGINE=tgi
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python -m pip install tabulate pandas
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
pkill tritonserver || true
# waiting for GPU processes to be fully killed
sleep 20
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
timeout 1200 bash -c '
until curl -s localhost:8000/generate_stream > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append trt to the test name
test_name=trt_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.trt_server_parameters')
client_params=$(echo "$params" | jq -r '.trt_client_parameters')
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required model_tp_size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
cd $VLLM_SOURCE_CODE_LOC/benchmarks
echo "Running test case $test_name"
bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params"
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "trt server is up and running."
else
echo ""
echo "trt failed to start within the timeout period."
break
fi
# prepare tokenizer
cd $VLLM_SOURCE_CODE_LOC/benchmarks
rm -rf /tokenizer_cache
mkdir /tokenizer_cache
python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
--model "$model" \
--cachedir /tokenizer_cache
cd $VLLM_SOURCE_CODE_LOC/benchmarks
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend tensorrt-llm \
--tokenizer /tokenizer_cache \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
server_command=""
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "trt" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
# update transformers package, to make sure mixtral tokenizer is available
python -m pip install transformers -U
export CURRENT_LLM_SERVING_ENGINE=trt
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python -m pip install tabulate pandas
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
# kill all processes on GPU.
pkill pt_main_thread
sleep 10
# remove vllm config file
rm -rf ~/.config/vllm
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
timeout 1200 bash -c '
until curl -s localhost:8000/v1/completions > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append vllm to the test name
test_name=vllm_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.vllm_server_parameters')
client_params=$(echo "$params" | jq -r '.vllm_client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
fi
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
eval "$server_command" &
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "vllm server is up and running."
else
echo ""
echo "vllm failed to start within the timeout period."
break
fi
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "vllm" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
export CURRENT_LLM_SERVING_ENGINE=vllm
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python3 -m pip install tabulate pandas
python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"
...@@ -17,10 +17,17 @@ serving_column_mapping = { ...@@ -17,10 +17,17 @@ serving_column_mapping = {
"request_throughput": "Tput (req/s)", "request_throughput": "Tput (req/s)",
"mean_ttft_ms": "Mean TTFT (ms)", "mean_ttft_ms": "Mean TTFT (ms)",
"std_ttft_ms": "Std TTFT (ms)", "std_ttft_ms": "Std TTFT (ms)",
"median_ttft_ms": "Median TTFT (ms)",
"mean_itl_ms": "Mean ITL (ms)", "mean_itl_ms": "Mean ITL (ms)",
"std_itl_ms": "Std ITL (ms)", "std_itl_ms": "Std ITL (ms)",
"input_throughput": "Input Tput (tok/s)", "median_itl_ms": "Median ITL (ms)",
"mean_tpot_ms": "Mean TPOT (ms)",
"std_tpot_ms": "Std TPOT (ms)",
"median_tpot_ms": "Median TPOT (ms)",
"total_token_throughput": "Total Token Tput (tok/s)",
"output_throughput": "Output Tput (tok/s)", "output_throughput": "Output Tput (tok/s)",
"total_input_tokens": "Total input tokens",
"total_output_tokens": "Total output tokens",
"engine": "Engine", "engine": "Engine",
} }
......
...@@ -2,9 +2,11 @@ ...@@ -2,9 +2,11 @@
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token) TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT" URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
TIMEOUT_SECONDS=10
retries=0 retries=0
while [ $retries -lt 1000 ]; do while [ $retries -lt 1000 ]; do
if [ $(curl -s -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then if [ $(curl -s --max-time $TIMEOUT_SECONDS -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
exit 0 exit 0
fi fi
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
{ {
"test_name": "latency_llama8B_tp1", "test_name": "latency_llama8B_tp1",
"parameters": { "parameters": {
"model": "meta-llama/Meta-Llama-3-8B", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1, "tensor_parallel_size": 1,
"load_format": "dummy", "load_format": "dummy",
"num_iters_warmup": 5, "num_iters_warmup": 5,
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
{ {
"test_name": "latency_llama70B_tp4", "test_name": "latency_llama70B_tp4",
"parameters": { "parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4, "tensor_parallel_size": 4,
"load_format": "dummy", "load_format": "dummy",
"num-iters-warmup": 5, "num-iters-warmup": 5,
......
[ [
{ {
"test_name": "llama8B_tp1", "test_name": "llama8B_tp1_sharegpt",
"qps_list": [4], "qps_list": [4,8,16,32,"inf"],
"common_parameters": { "common_parameters": {
"model": "meta-llama/Meta-Llama-3-8B", "model": "meta-llama/Meta-Llama-3-8B-Instruct",
"tp": 1, "tp": 1,
"dataset_name": "sharegpt", "dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 500, "num_prompts": 500,
"port": 8000 "port": 8000,
"reuse_server": false
}, },
"lmdeploy_server_parameters": { "lmdeploy_server_parameters": {
"dtype": "bfloat16"
}, },
"lmdeploy_client_parameters": { "lmdeploy_client_parameters": {
}, },
...@@ -21,34 +23,158 @@ ...@@ -21,34 +23,158 @@
}, },
"trt_server_parameters": { "trt_server_parameters": {
"model_type": "llama", "model_type": "llama",
"model_dtype": "float16", "model_dtype": "bfloat16",
"max_batch_size": 256, "max_batch_size": 2048,
"max_input_len": 4096, "max_input_len": 4096,
"max_output_len": 4096, "max_seq_len": 6144,
"trt_llm_version": "r24.04" "max_num_tokens": 16384,
"trt_llm_version": "v0.11.0"
}, },
"trt_client_parameters": { "trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream" "endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
"vllm_client_parameters": {
},
"sglang_server_parameters": {
"disable_radix_cache": "",
"enable_torch_compile": "",
"dtype": "bfloat16"
},
"sglang_client_parameters": {
}
},
{
"test_name": "llama8B_tp1_sonnet_512_16",
"qps_list": [4,8,16,32,"inf"],
"common_parameters": {
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
"tp": 1,
"dataset_name": "sonnet",
"dataset_path": "./sonnet_4x.txt",
"num_prompts": 500,
"port": 8000,
"sonnet_input_len": 512,
"sonnet_output_len": 16,
"sonnet_prefix_len": 50,
"reuse_server": true
},
"lmdeploy_server_parameters": {
"dtype": "bfloat16"
},
"lmdeploy_client_parameters": {
},
"tgi_server_parameters": {
},
"tgi_client_parameters": {
"endpoint": "/generate_stream"
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "bfloat16",
"max_batch_size": 2048,
"max_input_len": 4096,
"max_seq_len": 6144,
"max_num_tokens": 16384,
"trt_llm_version": "v0.11.0"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
"vllm_client_parameters": {
},
"sglang_server_parameters": {
"disable_radix_cache": "",
"enable_torch_compile": "",
"dtype": "bfloat16"
},
"sglang_client_parameters": {
}
},
{
"test_name": "llama8B_tp1_sonnet_512_256",
"qps_list": [4,8,16,32,"inf"],
"common_parameters": {
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
"tp": 1,
"dataset_name": "sonnet",
"dataset_path": "./sonnet_4x.txt",
"num_prompts": 500,
"port": 8000,
"sonnet_input_len": 512,
"sonnet_output_len": 256,
"sonnet_prefix_len": 50,
"reuse_server": true
},
"lmdeploy_server_parameters": {
"dtype": "bfloat16"
},
"lmdeploy_client_parameters": {
},
"tgi_server_parameters": {
},
"tgi_client_parameters": {
"endpoint": "/generate_stream"
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "bfloat16",
"max_batch_size": 2048,
"max_input_len": 4096,
"max_seq_len": 6144,
"max_num_tokens": 16384,
"trt_llm_version": "v0.11.0"
}, },
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": { "vllm_server_parameters": {
"disable_log_stats": "", "disable_log_stats": "",
"disable_log_requests": "" "disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
}, },
"vllm_client_parameters": { "vllm_client_parameters": {
},
"sglang_server_parameters": {
"disable_radix_cache": "",
"enable_torch_compile": "",
"dtype": "bfloat16"
},
"sglang_client_parameters": {
} }
}, },
{ {
"test_name": "llama70B_tp4", "test_name": "llama70B_tp4_sharegpt",
"qps_list": [2], "qps_list": [4,8,16,32,"inf"],
"common_parameters": { "common_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct", "model": "meta-llama/Meta-Llama-3-70B-Instruct",
"tp": 4, "tp": 4,
"dataset_name": "sharegpt", "dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 500, "num_prompts": 500,
"port": 8000 "port": 8000,
"reuse_server": false
}, },
"lmdeploy_server_parameters": { "lmdeploy_server_parameters": {
"dtype": "bfloat16"
}, },
"lmdeploy_client_parameters": { "lmdeploy_client_parameters": {
}, },
...@@ -59,34 +185,50 @@ ...@@ -59,34 +185,50 @@
}, },
"trt_server_parameters": { "trt_server_parameters": {
"model_type": "llama", "model_type": "llama",
"model_dtype": "float16", "model_dtype": "bfloat16",
"max_batch_size": 256, "max_batch_size": 2048,
"max_input_len": 4096, "max_input_len": 4096,
"max_output_len": 4096, "max_seq_len": 6144,
"trt_llm_version": "r24.04" "max_num_tokens": 16384,
"trt_llm_version": "v0.11.0"
}, },
"trt_client_parameters": { "trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream" "endpoint": "/v2/models/ensemble/generate_stream"
}, },
"vllm_server_parameters": { "vllm_server_parameters": {
"disable_log_stats": "", "disable_log_stats": "",
"disable_log_requests": "" "disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
}, },
"vllm_client_parameters": { "vllm_client_parameters": {
},
"sglang_server_parameters": {
"disable_radix_cache": "",
"dtype": "bfloat16"
},
"sglang_client_parameters": {
} }
}, },
{ {
"test_name": "mixtral8x7B_tp2", "test_name": "llama70B_tp4_sonnet_512_16",
"qps_list": [2], "qps_list": [4,8,16,32,"inf"],
"common_parameters": { "common_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1", "model": "meta-llama/Meta-Llama-3-70B-Instruct",
"tp": 2, "tp": 4,
"dataset_name": "sharegpt", "dataset_name": "sonnet",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "dataset_path": "./sonnet_4x.txt",
"num_prompts": 500, "num_prompts": 500,
"port": 8000 "port": 8000,
"sonnet_input_len": 512,
"sonnet_output_len": 16,
"sonnet_prefix_len": 50,
"reuse_server": true
}, },
"lmdeploy_server_parameters": { "lmdeploy_server_parameters": {
"dtype": "bfloat16"
}, },
"lmdeploy_client_parameters": { "lmdeploy_client_parameters": {
}, },
...@@ -97,20 +239,85 @@ ...@@ -97,20 +239,85 @@
}, },
"trt_server_parameters": { "trt_server_parameters": {
"model_type": "llama", "model_type": "llama",
"model_dtype": "float16", "model_dtype": "bfloat16",
"max_batch_size": 256, "max_batch_size": 2048,
"max_input_len": 4096, "max_input_len": 4096,
"max_output_len": 4096, "max_seq_len": 6144,
"trt_llm_version": "r24.04" "max_num_tokens": 16384,
"trt_llm_version": "v0.11.0"
}, },
"trt_client_parameters": { "trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream" "endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
"vllm_client_parameters": {
}, },
"sglang_server_parameters": {
"disable_radix_cache": "",
"dtype": "bfloat16"
},
"sglang_client_parameters": {
}
},
{
"test_name": "llama70B_tp4_sonnet_512_256",
"qps_list": [4,8,16,32,"inf"],
"common_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"tp": 4,
"dataset_name": "sonnet",
"dataset_path": "./sonnet_4x.txt",
"num_prompts": 500,
"port": 8000,
"sonnet_input_len": 512,
"sonnet_output_len": 256,
"sonnet_prefix_len": 50,
"reuse_server": true
},
"lmdeploy_server_parameters": {
"dtype": "bfloat16"
},
"lmdeploy_client_parameters": {
},
"tgi_server_parameters": {
},
"tgi_client_parameters": {
"endpoint": "/generate_stream"
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "bfloat16",
"max_batch_size": 2048,
"max_input_len": 4096,
"max_seq_len": 6144,
"max_num_tokens": 16384,
"trt_llm_version": "v0.11.0"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": { "vllm_server_parameters": {
"disable_log_stats": "", "disable_log_stats": "",
"disable_log_requests": "" "disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
}, },
"vllm_client_parameters": { "vllm_client_parameters": {
},
"sglang_server_parameters": {
"disable_radix_cache": "",
"dtype": "bfloat16"
},
"sglang_client_parameters": {
} }
} }
] ]
\ No newline at end of file
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
"test_name": "serving_llama8B_tp1_sharegpt", "test_name": "serving_llama8B_tp1_sharegpt",
"qps_list": [1, 4, 16, "inf"], "qps_list": [1, 4, 16, "inf"],
"server_parameters": { "server_parameters": {
"model": "meta-llama/Meta-Llama-3-8B", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1, "tensor_parallel_size": 1,
"swap_space": 16, "swap_space": 16,
"disable_log_stats": "", "disable_log_stats": "",
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
"load_format": "dummy" "load_format": "dummy"
}, },
"client_parameters": { "client_parameters": {
"model": "meta-llama/Meta-Llama-3-8B", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"backend": "vllm", "backend": "vllm",
"dataset_name": "sharegpt", "dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
"test_name": "serving_llama70B_tp4_sharegpt", "test_name": "serving_llama70B_tp4_sharegpt",
"qps_list": [1, 4, 16, "inf"], "qps_list": [1, 4, 16, "inf"],
"server_parameters": { "server_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4, "tensor_parallel_size": 4,
"swap_space": 16, "swap_space": 16,
"disable_log_stats": "", "disable_log_stats": "",
...@@ -30,7 +30,7 @@ ...@@ -30,7 +30,7 @@
"load_format": "dummy" "load_format": "dummy"
}, },
"client_parameters": { "client_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"backend": "vllm", "backend": "vllm",
"dataset_name": "sharegpt", "dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
...@@ -60,7 +60,7 @@ ...@@ -60,7 +60,7 @@
"test_name": "serving_llama70B_tp4_sharegpt_specdecode", "test_name": "serving_llama70B_tp4_sharegpt_specdecode",
"qps_list": [2], "qps_list": [2],
"server_parameters": { "server_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"disable_log_requests": "", "disable_log_requests": "",
"tensor_parallel_size": 4, "tensor_parallel_size": 4,
"swap_space": 16, "swap_space": 16,
...@@ -70,7 +70,7 @@ ...@@ -70,7 +70,7 @@
"use_v2_block_manager": "" "use_v2_block_manager": ""
}, },
"client_parameters": { "client_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"backend": "vllm", "backend": "vllm",
"dataset_name": "sharegpt", "dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
{ {
"test_name": "throughput_llama8B_tp1", "test_name": "throughput_llama8B_tp1",
"parameters": { "parameters": {
"model": "meta-llama/Meta-Llama-3-8B", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1, "tensor_parallel_size": 1,
"load_format": "dummy", "load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
{ {
"test_name": "throughput_llama70B_tp4", "test_name": "throughput_llama70B_tp4",
"parameters": { "parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4, "tensor_parallel_size": 4,
"load_format": "dummy", "load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
......
steps: steps:
- label: "Build wheel - CUDA {{matrix.cuda_version}}" - label: "Build wheel - CUDA 12.1"
agents: agents:
queue: cpu_queue queue: cpu_queue
commands: commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --tag vllm-ci:build-image --target build --progress plain ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
# rename the files to change linux -> manylinux1
- "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
- "mv artifacts/dist/$(ls artifacts/dist) artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
- "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/$BUILDKITE_COMMIT/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
- "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
env:
DOCKER_BUILDKIT: "1"
- block: "Build CUDA 11.8 wheel"
key: block-build-cu118-wheel
- label: "Build wheel - CUDA 11.8"
depends_on: block-build-cu118-wheel
agents:
queue: cpu_queue
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
# rename the files to change linux -> manylinux1 # rename the files to change linux -> manylinux1
...@@ -12,8 +31,3 @@ steps: ...@@ -12,8 +31,3 @@ steps:
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/" - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
matrix:
setup:
cuda_version:
- "11.8.0"
- "12.1.0"
# This script runs test inside the corresponding ROCm docker container. # This script runs test inside the corresponding ROCm docker container.
set -ex set -o pipefail
# Print ROCm version # Print ROCm version
echo "--- Confirming Clean Initial State" echo "--- Confirming Clean Initial State"
...@@ -70,15 +70,85 @@ HF_CACHE="$(realpath ~)/huggingface" ...@@ -70,15 +70,85 @@ HF_CACHE="$(realpath ~)/huggingface"
mkdir -p ${HF_CACHE} mkdir -p ${HF_CACHE}
HF_MOUNT="/root/.cache/huggingface" HF_MOUNT="/root/.cache/huggingface"
docker run \ commands=$@
echo "Commands:$commands"
#ignore certain kernels tests
if [[ $commands == *" kernels "* ]]; then
commands="${commands} \
--ignore=kernels/test_attention.py \
--ignore=kernels/test_attention_selector.py \
--ignore=kernels/test_blocksparse_attention.py \
--ignore=kernels/test_causal_conv1d.py \
--ignore=kernels/test_cutlass.py \
--ignore=kernels/test_encoder_decoder_attn.py \
--ignore=kernels/test_flash_attn.py \
--ignore=kernels/test_flashinfer.py \
--ignore=kernels/test_gguf.py \
--ignore=kernels/test_int8_quant.py \
--ignore=kernels/test_machete_gemm.py \
--ignore=kernels/test_mamba_ssm.py \
--ignore=kernels/test_marlin_gemm.py \
--ignore=kernels/test_moe.py \
--ignore=kernels/test_prefix_prefill.py \
--ignore=kernels/test_rand.py \
--ignore=kernels/test_sampler.py"
fi
#ignore certain Entrypoints tests
if [[ $commands == *" entrypoints/openai "* ]]; then
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
--ignore=entrypoints/openai/test_accuracy.py \
--ignore=entrypoints/openai/test_audio.py \
--ignore=entrypoints/openai/test_encoder_decoder.py \
--ignore=entrypoints/openai/test_embedding.py \
--ignore=entrypoints/openai/test_oot_registration.py "}
fi
PARALLEL_JOB_COUNT=8
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
#replace shard arguments
commands=${commands//"--shard-id= "/"--shard-id=${GPU} "}
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
echo "Shard ${GPU} commands:$commands"
docker run \
--device /dev/kfd --device /dev/dri \ --device /dev/kfd --device /dev/dri \
--network host \ --network host \
--shm-size=16gb \ --shm-size=16gb \
--rm \ --rm \
-e HIP_VISIBLE_DEVICES=${GPU} \
-e HF_TOKEN \ -e HF_TOKEN \
-v ${HF_CACHE}:${HF_MOUNT} \ -v ${HF_CACHE}:${HF_MOUNT} \
-e HF_HOME=${HF_MOUNT} \ -e HF_HOME=${HF_MOUNT} \
--name ${container_name} \ --name ${container_name}_${GPU} \
${image_name} \ ${image_name} \
/bin/bash -c "${@}" /bin/bash -c "${commands}" \
|& while read -r line; do echo ">>Shard $GPU: $line"; done &
PIDS+=($!)
done
#wait for all processes to finish and collect exit codes
for pid in ${PIDS[@]}; do
wait ${pid}
STATUS+=($?)
done
for st in ${STATUS[@]}; do
if [[ ${st} -ne 0 ]]; then
echo "One of the processes failed with $st"
exit ${st}
fi
done
else
docker run \
--device /dev/kfd --device /dev/dri \
--network host \
--shm-size=16gb \
--rm \
-e HIP_VISIBLE_DEVICES=0 \
-e HF_TOKEN \
-v ${HF_CACHE}:${HF_MOUNT} \
-e HF_HOME=${HF_MOUNT} \
--name ${container_name} \
${image_name} \
/bin/bash -c "${commands}"
fi
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
# Try building the docker image
docker build -t cpu-test -f Dockerfile.ppc64le .
# Setup cleanup
remove_docker_container() { docker rm -f cpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image, setting --shm-size=4g for tensor parallel.
source /etc/environment
#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN=$HF_TOKEN --name cpu-test cpu-test
# Run basic model test
docker exec cpu-test bash -c "
pip install pytest matplotlib einops transformers_stream_generator
pytest -v -s tests/models -m \"not vlm\" \
--ignore=tests/models/test_embedding.py \
--ignore=tests/models/test_oot_registration.py \
--ignore=tests/models/test_registry.py \
--ignore=tests/models/test_jamba.py \
--ignore=tests/models/test_mamba.py \
--ignore=tests/models/test_danube3_4b.py" # Mamba kernels and Danube3-4B on CPU is not supported
# online inference
docker exec cpu-test bash -c "
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
--backend vllm \
--dataset-name random \
--model facebook/opt-125m \
--num-prompts 20 \
--endpoint /v1/completions \
--tokenizer facebook/opt-125m"
...@@ -22,8 +22,25 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" ...@@ -22,8 +22,25 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
# Run basic model test # Run basic model test
docker exec cpu-test bash -c " docker exec cpu-test bash -c "
pip install pytest Pillow protobuf pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported pytest -v -s tests/models/encoder_decoder/language
pytest -v -s tests/models/decoder_only/language \
--ignore=tests/models/test_fp8.py \
--ignore=tests/models/decoder_only/language/test_jamba.py \
--ignore=tests/models/decoder_only/language/test_mamba.py \
--ignore=tests/models/decoder_only/language/test_granitemoe.py \
--ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
# Run compressed-tensor test
docker exec cpu-test bash -c "
pytest -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
# Run AWQ test
docker exec cpu-test bash -c "
pytest -s -v \
tests/quantization/test_ipex_quant.py"
# online inference # online inference
docker exec cpu-test bash -c " docker exec cpu-test bash -c "
......
...@@ -12,5 +12,4 @@ remove_docker_container ...@@ -12,5 +12,4 @@ remove_docker_container
# For HF_TOKEN. # For HF_TOKEN.
source /etc/environment source /etc/environment
# Run a simple end-to-end example. # Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu \ docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
python3 /workspace/vllm/examples/offline_inference_tpu.py
...@@ -11,4 +11,4 @@ trap remove_docker_container EXIT ...@@ -11,4 +11,4 @@ trap remove_docker_container EXIT
remove_docker_container remove_docker_container
# Run the image and launch offline inference # Run the image and launch offline inference
docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py
...@@ -5,264 +5,498 @@ ...@@ -5,264 +5,498 @@
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2 # https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
# to generate the final pipeline yaml file. # to generate the final pipeline yaml file.
# Documentation
# label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
# fast_check_only(bool): run this test on fastcheck pipeline only
# optional(bool): never run this test by default (i.e. need to unblock manually)
# command(str): the single command to run for tests. incompatible with commands.
# commands(list): the list of commands to run for test. incompatbile with command.
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
# in this case, commands must be specified. the first command runs on first host, the second
# command runs on the second host.
# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
# source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run.
# When adding a test
# - If the test belong to an existing group, add it there
# - If the test is short, add to any existing step
# - If the test takes more than 10min, then it is okay to create a new step.
# Note that all steps execute in parallel.
steps: steps:
- label: Async Engine, Inputs, Utils, Worker Test ##### fast check tests #####
- label: Documentation Build # 2min
working_dir: "/vllm-workspace/test_docs/docs"
fast_check: true fast_check: true
fast_check_only: true no_gpu: True
commands: commands:
- pytest -v -s async_engine # Async Engine - pip install -r requirements-docs.txt
- SPHINXOPTS=\"-W\" make html
# Check API reference (if it fails, you may have missing mock imports)
- grep \"sig sig-object py\" build/html/dev/sampling_params.html
- label: Async Engine, Inputs, Utils, Worker Test # 24min
fast_check: true
source_file_dependencies:
- vllm/
- tests/mq_llm_engine
- tests/async_engine
- tests/test_inputs
- tests/multimodal
- tests/test_utils
- tests/worker
commands:
- pytest -v -s mq_llm_engine # MQLLMEngine
- pytest -v -s async_engine # AsyncLLMEngine
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
- pytest -v -s test_inputs.py - pytest -v -s test_inputs.py
- pytest -v -s multimodal - pytest -v -s multimodal
- pytest -v -s test_utils.py # Utils - pytest -v -s test_utils.py # Utils
- pytest -v -s worker # Worker - pytest -v -s worker # Worker
- label: Metrics, Tracing Test - label: Basic Correctness Test # 30min
fast_check: true
fast_check_only: true
commands:
- pytest -v -s metrics # Metrics
- "pip install \
opentelemetry-sdk \
opentelemetry-api \
opentelemetry-exporter-otlp \
opentelemetry-semantic-conventions-ai" # Tracing
- pytest -v -s tracing
- label: Regression Test
mirror_hardwares: [amd]
fast_check: true
command: pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
- label: AsyncEngine Test
#mirror_hardwares: [amd] #mirror_hardwares: [amd]
command: pytest -v -s async_engine
- label: Basic Correctness Test
mirror_hardwares: [amd]
fast_check: true fast_check: true
source_file_dependencies:
- vllm/
- tests/basic_correctness/test_basic_correctness
- tests/basic_correctness/test_cpu_offload
- tests/basic_correctness/test_preemption
commands: commands:
# This flashinfer installation will fail on AMD ROCm, so it is set as optional.
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl || true
- pytest -v -s basic_correctness/test_basic_correctness.py - pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py - pytest -v -s basic_correctness/test_cpu_offload.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Chunked Prefill Test
source_file_dependencies:
- vllm/
- tests/basic_correctness/test_chunked_prefill
commands:
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Core Test - label: Core Test # 10min
mirror_hardwares: [amd] mirror_hardwares: [amd]
fast_check: true fast_check: true
source_file_dependencies:
- vllm/core
- vllm/distributed
- tests/core
commands: commands:
- pytest -v -s core - pytest -v -s core
- label: Distributed Comm Ops Test - label: Entrypoints Test # 40min
#mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
commands:
- pytest -v -s distributed/test_comm_ops.py
- pytest -v -s distributed/test_shm_broadcast.py
- label: 2 Node Tests (4 GPUs in total)
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 fast_check: true
num_nodes: 2
commands:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
- label: Distributed Tests (2 GPUs)
mirror_hardwares: [amd] mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests" source_file_dependencies:
num_gpus: 2 - vllm/
commands: commands:
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py - pip install -e ./plugins/vllm_add_dummy_model
- TARGET_TEST_SUITE=L4 pytest -v -s distributed/test_basic_distributed_correctness.py - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
- pytest -v -s distributed/test_chunked_prefill_distributed.py - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
- pytest -v -s distributed/test_multimodal_broadcast.py - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- label: Distributed Tests (4 GPUs) - pytest -v -s entrypoints/test_chat_utils.py
#mirror_hardwares: [amd] - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Distributed Tests (4 GPUs) # 10min
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 4 num_gpus: 4
fast_check: true fast_check: true
source_file_dependencies:
- vllm/distributed/
- vllm/core/
- tests/distributed
- tests/spec_decode/e2e/test_integration_dist_tp4
- tests/compile
commands: commands:
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_pynccl.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
- label: Pipeline Parallelism Test - label: Metrics, Tracing Test # 10min
working_dir: "/vllm-workspace/tests" num_gpus: 2
num_gpus: 4 fast_check: true
source_file_dependencies:
- vllm/
- tests/metrics
- tests/tracing
commands: commands:
- pytest -v -s distributed/test_pipeline_parallel.py - pytest -v -s metrics
- "pip install \
'opentelemetry-sdk>=1.26.0,<1.27.0' \
'opentelemetry-api>=1.26.0,<1.27.0' \
'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
- pytest -v -s tracing
- label: Engine Test ##### fast check tests #####
##### 1 GPU test #####
- label: Regression Test # 5min
mirror_hardwares: [amd] mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/test_regression
commands: commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py - pip install modelscope
# OOM in the CI unless we run this separately - pytest -v -s test_regression.py
- pytest -v -s tokenization working_dir: "/vllm-workspace/tests" # optional
- label: Entrypoints Test - label: Engine Test # 10min
fast_check: true
mirror_hardwares: [amd] mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/engine
- tests/tokenization
commands: commands:
- pytest -v -s entrypoints/llm - pytest -v -s engine test_sequence.py test_config.py test_logger.py
- pytest -v -s entrypoints/openai # OOM in the CI unless we run this separately
- pytest -v -s tokenization
- label: Examples Test - label: Examples Test # 15min
working_dir: "/vllm-workspace/examples" working_dir: "/vllm-workspace/examples"
mirror_hardwares: [amd] #mirror_hardwares: [amd]
source_file_dependencies:
- vllm/entrypoints
- examples/
commands: commands:
# install tensorizer for tensorize_vllm_model.py - pip install awscli tensorizer # for llava example and tensorizer test
- pip install awscli tensorizer
- python3 offline_inference.py - python3 offline_inference.py
- python3 cpu_offload.py - python3 cpu_offload.py
- python3 offline_inference_chat.py
- python3 offline_inference_with_prefix.py - python3 offline_inference_with_prefix.py
- python3 llm_engine_example.py - python3 llm_engine_example.py
- python3 offline_inference_vision_language.py - python3 offline_inference_vision_language.py
- python3 offline_inference_vision_language_multi_image.py
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference_encoder_decoder.py
- python3 offline_profile.py --model facebook/opt-125m
- label: Inputs Test - label: Prefix Caching Test # 9min
#mirror_hardwares: [amd] #mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/prefix_caching
commands: commands:
- pytest -v -s test_inputs.py - pytest -v -s prefix_caching
- pytest -v -s multimodal
# - label: Kernels Test %N - label: Samplers Test # 36min
# #mirror_hardwares: [amd] source_file_dependencies:
# commands: - vllm/model_executor/layers
# - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl - vllm/sampling_metadata.py
# - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - tests/samplers
# parallelism: 4 commands:
- pytest -v -s samplers
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
- label: Models Test - label: LogitsProcessor Test # 5min
#mirror_hardwares: [amd] mirror_hardwares: [amd]
source_file_dependencies:
- vllm/model_executor/layers
- tests/test_logits_processor
command: pytest -v -s test_logits_processor.py
- label: Speculative decoding tests # 30min
source_file_dependencies:
- vllm/spec_decode
- tests/spec_decode
commands: commands:
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
- pytest -v -s models -m \"not vlm\" - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
- label: Vision Language Models Test - label: LoRA Test %N # 15min each
mirror_hardwares: [amd] mirror_hardwares: [amd]
source_file_dependencies:
- vllm/lora
- tests/lora
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
parallelism: 4
- label: "PyTorch Fullgraph Smoke Test" # 9min
fast_check: true
source_file_dependencies:
- vllm/
- tests/compile
commands: commands:
- pytest -v -s models -m vlm - pytest -v -s compile/test_basic_correctness.py
# TODO: re-write in comparison tests, and fix symbolic shape
# for quantization ops.
# - label: "PyTorch Fullgraph Test" # 18min
# source_file_dependencies:
# - vllm/
# - tests/compile
# commands:
# - pytest -v -s compile/test_full_graph.py
- label: Prefix Caching Test - label: Kernels Test %N # 1h each
mirror_hardwares: [amd] mirror_hardwares: [amd]
source_file_dependencies:
- csrc/
- vllm/attention
- tests/kernels
commands: commands:
- pytest -v -s prefix_caching - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 4
- label: Samplers Test - label: Tensorizer Test # 11min
#mirror_hardwares: [amd] mirror_hardwares: [amd]
command: pytest -v -s samplers soft_fail: true
source_file_dependencies:
- vllm/model_executor/model_loader
- tests/tensorizer_loader
commands:
- apt-get update && apt-get install -y curl libsodium23
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s tensorizer_loader
- label: LogitsProcessor Test - label: Benchmarks # 9min
working_dir: "/vllm-workspace/.buildkite"
mirror_hardwares: [amd] mirror_hardwares: [amd]
command: pytest -v -s test_logits_processor.py source_file_dependencies:
- benchmarks/
commands:
- pip install aiohttp
- bash run-benchmarks.sh
- label: Utils Test - label: Quantization Test # 33min
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
- tests/quantization
command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
- label: LM Eval Small Models # 53min
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands: commands:
- pytest -v -s test_utils.py - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s test_embedded_commit.py - bash ./run-tests.sh -c configs/models-small.txt -t 1
- label: Worker Test - label: Encoder Decoder tests # 5min
mirror_hardwares: [amd] source_file_dependencies:
command: pytest -v -s worker - vllm/
- tests/encoder_decoder
commands:
- pytest -v -s encoder_decoder
- label: OpenAI-Compatible Tool Use # 20 min
fast_check: false
mirror_hardwares: [ amd ]
source_file_dependencies:
- vllm/
- tests/tool_use
commands:
- pytest -v -s tool_use
- label: Speculative decoding tests ##### models test #####
#mirror_hardwares: [amd]
- label: Basic Models Test # 3min
source_file_dependencies:
- vllm/
- tests/models
commands: commands:
# See https://github.com/vllm-project/vllm/issues/5152 - pip install -e ./plugins/vllm_add_dummy_model
- export VLLM_ATTENTION_BACKEND=XFORMERS - pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s spec_decode - pytest -v -s models/*.py --ignore=models/test_oot_registration.py
# - label: LoRA Test %N - label: Decoder-only Language Models Test # 1h36min
# #mirror_hardwares: [amd] #mirror_hardwares: [amd]
# command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py source_file_dependencies:
# parallelism: 4 - vllm/
- tests/models/decoder_only/language
commands:
- pytest -v -s models/decoder_only/language
# - label: LoRA Long Context (Distributed) - label: Decoder-only Multi-Modal Models Test # 1h31min
# #mirror_hardwares: [amd] #mirror_hardwares: [amd]
# num_gpus: 4 source_file_dependencies:
# # This test runs llama 13B, so it is required to run on 4 GPUs. - vllm/
# commands: - tests/models/decoder_only/audio_language
# # FIXIT: find out which code initialize cuda before running the test - tests/models/decoder_only/vision_language
# # before the fix, we need to use spawn to test it commands:
# - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s models/decoder_only/audio_language
# - pytest -v -s -x lora/test_long_context.py - pytest -v -s models/decoder_only/vision_language
- label: Tensorizer Test - label: Other Models Test # 6min
#mirror_hardwares: [amd] #mirror_hardwares: [amd]
fast_check: true source_file_dependencies:
- vllm/
- tests/models/embedding/language
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/language
- tests/models/encoder_decoder/vision_language
commands: commands:
- apt-get install -y curl libsodium23 - pytest -v -s models/embedding/language
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s models/embedding/vision_language
- pytest -v -s tensorizer_loader - pytest -v -s models/encoder_decoder/language
- pytest -v -s models/encoder_decoder/vision_language
# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models Test
optional: true
commands:
- echo 'Testing custom models...'
# PR authors can temporarily add commands below to test individual models
# e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
# *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
- label: Metrics Test ##### 1 GPU test #####
mirror_hardwares: [amd] ##### multi gpus test #####
command: pytest -v -s metrics
- label: Distributed Comm Ops Test # 7min
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
- vllm/distributed
- tests/distributed
commands:
- pytest -v -s distributed/test_comm_ops.py
- pytest -v -s distributed/test_shm_broadcast.py
- label: 2 Node Tests (4 GPUs in total) # 16min
working_dir: "/vllm-workspace/tests"
num_gpus: 2
num_nodes: 2
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/model_executor/models/
- tests/distributed/
commands:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
- label: Quantization Test - label: Distributed Tests (2 GPUs) # 40min
#mirror_hardwares: [amd] #mirror_hardwares: [amd]
command: pytest -v -s quantization working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/model_executor/models/
- tests/distributed/
- vllm/compilation
commands:
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
- pytest models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s distributed/test_distributed_oot.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
- label: Tracing Test - label: Multi-step Tests (4 GPUs) # 36min
commands: working_dir: "/vllm-workspace/tests"
- "pip install \ num_gpus: 4
opentelemetry-sdk \ source_file_dependencies:
opentelemetry-api \ - vllm/model_executor/layers/sampler.py
opentelemetry-exporter-otlp \ - vllm/sequence.py
opentelemetry-semantic-conventions-ai" - vllm/worker/worker_base.py
- pytest -v -s tracing - vllm/worker/worker.py
- vllm/worker/multi_step_worker.py
- label: Benchmarks - vllm/worker/model_runner_base.py
working_dir: "/vllm-workspace/.buildkite" - vllm/worker/model_runner.py
mirror_hardwares: [amd] - vllm/worker/multi_step_model_runner.py
- vllm/engine
- tests/multi_step
commands: commands:
- pip install aiohttp - pytest -v -s multi_step/test_correctness_async_llm.py
- bash run-benchmarks.sh - pytest -v -s multi_step/test_correctness_llm.py
- label: LM Eval Small Models - label: Pipeline Parallelism Test # 45min
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/model_executor/models/
- tests/distributed/
commands: commands:
- pip install lm-eval - pytest -v -s distributed/test_pp_cudagraph.py
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s distributed/test_pipeline_parallel.py
- bash ./run-tests.sh -c configs/models-small.txt -t 1
- label: LM Eval Large Models - label: LoRA Long Context (Distributed) # 11min
gpu: a100 # This test runs llama 13B, so it is required to run on 4 GPUs.
num_gpus: 4 num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" soft_fail: true
source_file_dependencies:
- vllm/lora
- tests/lora/test_long_context
commands: commands:
- pip install lm-eval # FIXIT: find out which code initialize cuda before running the test
- export VLLM_WORKER_MULTIPROC_METHOD=spawn # before the fix, we need to use spawn to test it
- bash ./run-tests.sh -c configs/models-large.txt -t 4 - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s -x lora/test_long_context.py
- label: Documentation Build - label: Weight Loading Multiple GPU Test # 33min
working_dir: "/vllm-workspace/test_docs/docs" working_dir: "/vllm-workspace/tests"
fast_check: true num_gpus: 2
no_gpu: True source_file_dependencies:
- vllm/
- tests/weight_loading
commands: commands:
- pip install -r requirements-docs.txt - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
- SPHINXOPTS=\"-W\" make html
- label: Weight Loading Multiple GPU Test - Large Models # optional
working_dir: "/vllm-workspace/tests"
num_gpus: 2
gpu: a100
optional: true
source_file_dependencies:
- vllm/
- tests/weight_loading
commands:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
##### multi gpus test #####
##### A100 test #####
- label: Distributed Tests (A100) - label: Distributed Tests (A100) # optional
gpu: a100 gpu: a100
num_gpus: 4 num_gpus: 4
source_file_dependencies:
- vllm/
commands: commands:
# NOTE: don't test llama model here, it seems hf implementation is buggy # NOTE: don't test llama model here, it seems hf implementation is buggy
# see https://github.com/vllm-project/vllm/pull/5689 for details # see https://github.com/vllm-project/vllm/pull/5689 for details
- pytest -v -s distributed/test_custom_all_reduce.py - pytest -v -s distributed/test_custom_all_reduce.py
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus
- TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py
- pytest -v -s -x lora/test_mixtral.py - pytest -v -s -x lora/test_mixtral.py
- label: LM Eval Large Models # optional
gpu: a100
num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-large.txt -t 4
/.venv
/build
dist
vllm/*.so vllm/*.so
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
.mypy_cache
# Distribution / packaging
.Python
/build/
cmake-build-*/
CMakeUserPresets.json
develop-eggs/
/dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment