Commit ad385667 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge branch 'v0.6.3.post1-dev'

parents be0967c1 903593d3
...@@ -8,6 +8,7 @@ main() { ...@@ -8,6 +8,7 @@ main() {
(which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq) (which jq) || (apt-get update && apt-get -y install jq)
(which zip) || (apt-get install -y zip)
if [ ! -f /workspace/buildkite-agent ]; then if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip plotting the results." echo "buildkite-agent binary not found. Skip plotting the results."
...@@ -24,17 +25,54 @@ main() { ...@@ -24,17 +25,54 @@ main() {
ls ls
ls results/ ls results/
# generate figures # upload benchmark results
python3 -m pip install tabulate pandas matplotlib zip -r results.zip results/
python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ /workspace/buildkite-agent artifact upload "results.zip"
--description $description \
--results-folder results/ # upload benchmarking scripts
cd $VLLM_SOURCE_CODE_LOC/
# upload results and figures zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
/workspace/buildkite-agent artifact upload "nightly_results.png" /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"
/workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
/workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/
/workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md # upload benchmarking pipeline
/workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"
cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/
/workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md
# The figures should be genereated by a separate process outside the CI/CD pipeline
# # generate figures
# python3 -m pip install tabulate pandas matplotlib
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
# --description $description \
# --results-folder results/
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
# --description $description \
# --results-folder results/ \
# --dataset sharegpt
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
# --description $description \
# --results-folder results/ \
# --dataset sonnet_2048_128
# python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
# --description $description \
# --results-folder results/ \
# --dataset sonnet_128_2048
# # upload results and figures
# /workspace/buildkite-agent artifact upload "nightly_results*.png"
# /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
# /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
# /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
} }
main "$@" main "$@"
\ No newline at end of file
#!/bin/bash #!/bin/bash
set -o pipefail set -o pipefail
set -x
check_gpus() { check_gpus() {
# check the number of GPUs and GPU type. # check the number of GPUs and GPU type.
...@@ -15,15 +16,66 @@ check_gpus() { ...@@ -15,15 +16,66 @@ check_gpus() {
echo "GPU type is $gpu_type" echo "GPU type is $gpu_type"
} }
kill_gpu_processes() { check_hf_token() {
pkill lmdeploy || true # check if HF_TOKEN is available and valid
# waiting for GPU processes to be fully killed if [[ -z "$HF_TOKEN" ]]; then
sleep 10 echo "Error: HF_TOKEN is not set."
# Print the GPU memory usage exit 1
# so that we know if all GPU processes are killed. elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) echo "Error: HF_TOKEN does not start with 'hf_'."
# The memory usage should be 0 MB. exit 1
echo "GPU 0 Memory Usage: $gpu_memory_usage MB" else
echo "HF_TOKEN is set and valid."
fi
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
get_current_llm_serving_engine() {
if which lmdeploy >/dev/null; then
echo "Container: lmdeploy"
export CURRENT_LLM_SERVING_ENGINE=lmdeploy
return
fi
if [ -e /tgi-entrypoint.sh ]; then
echo "Container: tgi"
export CURRENT_LLM_SERVING_ENGINE=tgi
return
fi
if which trtllm-build >/dev/null; then
echo "Container: tensorrt-llm"
export CURRENT_LLM_SERVING_ENGINE=trt
return
fi
if [ -e /sgl-workspace ]; then
echo "Container: sglang"
export CURRENT_LLM_SERVING_ENGINE=sglang
return
fi
if [ -e /vllm-workspace ]; then
echo "Container: vllm"
# move to a completely irrelevant directory, to avoid import vllm from current folder
export CURRENT_LLM_SERVING_ENGINE=vllm
return
fi
} }
json2args() { json2args() {
...@@ -42,6 +94,19 @@ json2args() { ...@@ -42,6 +94,19 @@ json2args() {
echo "$args" echo "$args"
} }
kill_gpu_processes() {
pkill -f python
pkill -f python3
pkill -f tritonserver
pkill -f pt_main_thread
pkill -f text-generation
pkill -f lmdeploy
while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do
sleep 1
done
}
wait_for_server() { wait_for_server() {
# wait for vllm server to start # wait for vllm server to start
# return 1 if vllm server crashes # return 1 if vllm server crashes
...@@ -51,6 +116,14 @@ wait_for_server() { ...@@ -51,6 +116,14 @@ wait_for_server() {
done' && return 0 || return 1 done' && return 0 || return 1
} }
ensure_installed() {
# Ensure that the given command is installed by apt-get
local cmd=$1
if ! which $cmd >/dev/null; then
apt-get update && apt-get install -y $cmd
fi
}
run_serving_tests() { run_serving_tests() {
# run serving tests using `benchmark_serving.py` # run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases # $1: a json file specifying serving test cases
...@@ -69,8 +142,8 @@ run_serving_tests() { ...@@ -69,8 +142,8 @@ run_serving_tests() {
continue continue
fi fi
# append lmdeploy to the test name # prepend the current serving engine to the test name
test_name=lmdeploy_$test_name test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
# get common parameters # get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters') common_params=$(echo "$params" | jq -r '.common_parameters')
...@@ -80,13 +153,11 @@ run_serving_tests() { ...@@ -80,13 +153,11 @@ run_serving_tests() {
dataset_path=$(echo "$common_params" | jq -r '.dataset_path') dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port') port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts') num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
# get client and server arguments # get client and server arguments
server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters') server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters') client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters")
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params") client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list') qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
...@@ -94,40 +165,44 @@ run_serving_tests() { ...@@ -94,40 +165,44 @@ run_serving_tests() {
# check if there is enough GPU to run the test # check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue continue
fi fi
# prepare tokenizer if [[ $reuse_server == "true" ]]; then
rm -rf /tokenizer_cache echo "Reuse previous server for test case $test_name"
mkdir /tokenizer_cache else
python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \ kill_gpu_processes
--model "$model" \ bash $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh \
--cachedir /tokenizer_cache "$server_params" "$common_params"
fi
server_command="lmdeploy serve api_server $model \
--tp $tp \
--server-port $port \
$server_args"
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
bash -c "$server_command" &
# wait until the server is alive
wait_for_server wait_for_server
if [ $? -eq 0 ]; then if [ $? -eq 0 ]; then
echo "" echo ""
echo "lmdeploy server is up and running." echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
else else
echo "" echo ""
echo "lmdeploy failed to start within the timeout period." echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
break break
fi fi
# get model name # prepare tokenizer
model_name=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py) # this is required for lmdeploy.
cd $VLLM_SOURCE_CODE_LOC/benchmarks
rm -rf /tokenizer_cache
mkdir /tokenizer_cache
python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
--model "$model" \
--cachedir /tokenizer_cache
cd $VLLM_SOURCE_CODE_LOC/benchmarks
# change model name for lmdeploy (it will not follow standard hf name)
if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
fi
# iterate over different QPS # iterate over different QPS
for qps in $qps_list; do for qps in $qps_list; do
...@@ -140,31 +215,79 @@ run_serving_tests() { ...@@ -140,31 +215,79 @@ run_serving_tests() {
new_test_name=$test_name"_qps_"$qps new_test_name=$test_name"_qps_"$qps
backend=$CURRENT_LLM_SERVING_ENGINE
if [[ $backend = "trt" ]]; then
backend="tensorrt-llm"
fi
if [[ "$backend" == *"vllm"* ]]; then
backend="vllm"
fi
if [[ "$dataset_name" = "sharegpt" ]]; then
client_command="python3 benchmark_serving.py \
--backend $backend \
--tokenizer /tokenizer_cache \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--ignore-eos \
$client_args"
elif [[ "$dataset_name" = "sonnet" ]]; then
sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len')
sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
client_command="python3 benchmark_serving.py \ client_command="python3 benchmark_serving.py \
--backend lmdeploy \ --backend $backend \
--tokenizer /tokenizer_cache \ --tokenizer /tokenizer_cache \
--model $model \
--dataset-name $dataset_name \ --dataset-name $dataset_name \
--dataset-path $dataset_path \ --dataset-path $dataset_path \
--num-prompts $num_prompts \ --num-prompts $num_prompts \
--sonnet-input-len $sonnet_input_len \
--sonnet-output-len $sonnet_output_len \
--sonnet-prefix-len $sonnet_prefix_len \
--port $port \ --port $port \
--save-result \ --save-result \
--result-dir $RESULTS_FOLDER \ --result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \ --result-filename ${new_test_name}.json \
--request-rate $qps \ --request-rate $qps \
--model \"$model_name\" \ --ignore-eos \
$client_args" $client_args"
else
echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
exit 1
fi
echo "Running test case $test_name with qps $qps" echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command" echo "Client command: $client_command"
eval "$client_command" eval "$client_command"
server_command="None"
# record the benchmarking commands # record the benchmarking commands
jq_output=$(jq -n \ jq_output=$(jq -n \
--arg server "$server_command" \ --arg server "$server_command" \
--arg client "$client_command" \ --arg client "$client_command" \
--arg gpu "$gpu_type" \ --arg gpu "$gpu_type" \
--arg engine "lmdeploy" \ --arg engine "$CURRENT_LLM_SERVING_ENGINE" \
'{ '{
server_command: $server, server_command: $server,
client_command: $client, client_command: $client,
...@@ -175,42 +298,58 @@ run_serving_tests() { ...@@ -175,42 +298,58 @@ run_serving_tests() {
done done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done done
kill_gpu_processes
} }
upload_to_buildkite() { prepare_dataset() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0 # download sharegpt dataset
if [ ! -f /workspace/buildkite-agent ]; then cd $VLLM_SOURCE_CODE_LOC/benchmarks
echo "buildkite-agent binary not found. Skip uploading the results." wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
return 0
fi # duplicate sonnet by 4x, to allow benchmarking with input length 2048
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md cd $VLLM_SOURCE_CODE_LOC/benchmarks
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" echo "" > sonnet_4x.txt
} for _ in {1..4}
do
cat sonnet.txt >> sonnet_4x.txt
done
}
main() { main() {
# check if the environment variable is successfully injected from yaml
check_gpus check_gpus
# enter vllm directory check_hf_token
cd $VLLM_SOURCE_CODE_LOC/benchmarks get_current_llm_serving_engine
pip install -U transformers
# check storage
df -h
ensure_installed wget
ensure_installed curl
ensure_installed jq
prepare_dataset
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/ declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ BENCHMARK_ROOT=$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/
python -m pip install transformers==4.41.2 # run the test
export CURRENT_LLM_SERVING_ENGINE=lmdeploy
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python -m pip install tabulate pandas
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py # upload benchmark results to buildkite
python3 -m pip install tabulate pandas
python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite upload_to_buildkite
} }
......
...@@ -68,35 +68,38 @@ wait_for_server() { ...@@ -68,35 +68,38 @@ wait_for_server() {
done' && return 0 || return 1 done' && return 0 || return 1
} }
kill_gpu_processes() { kill_processes_launched_by_current_bash() {
# kill all processes on GPU. # Kill all python processes launched from current bash script
pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader) current_shell_pid=$$
if [ -z "$pids" ]; then processes=$(ps -eo pid,ppid,command | awk -v ppid="$current_shell_pid" -v proc="$1" '$2 == ppid && $3 ~ proc {print $1}')
echo "No GPU processes found." if [ -n "$processes" ]; then
echo "Killing the following processes matching '$1':"
echo "$processes"
echo "$processes" | xargs kill -9
else else
for pid in $pids; do echo "No processes found matching '$1'."
kill -9 "$pid"
echo "Killed process with PID: $pid"
done
echo "All GPU processes have been killed."
fi fi
}
kill_gpu_processes() {
# waiting for GPU processes to be fully killed ps -aux
# loop while nvidia-smi returns any processes lsof -t -i:8000 | xargs -r kill -9
while [ -n "$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)" ]; do pkill -f pt_main_thread
# this line doesn't work now
# ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
pkill -f python3
pkill -f /usr/bin/python3
# wait until GPU memory usage smaller than 1GB
while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do
sleep 1 sleep 1
echo "Waiting for GPU processes to be killed"
done done
# remove vllm config file # remove vllm config file
rm -rf ~/.config/vllm rm -rf ~/.config/vllm
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
} }
upload_to_buildkite() { upload_to_buildkite() {
...@@ -114,7 +117,7 @@ upload_to_buildkite() { ...@@ -114,7 +117,7 @@ upload_to_buildkite() {
fi fi
# Use the determined command to annotate and upload artifacts # Use the determined command to annotate and upload artifacts
$BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < $RESULTS_FOLDER/benchmark_results.md $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" <$RESULTS_FOLDER/benchmark_results.md
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*" $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
} }
...@@ -166,7 +169,7 @@ run_latency_tests() { ...@@ -166,7 +169,7 @@ run_latency_tests() {
latency_command: $latency, latency_command: $latency,
gpu_type: $gpu gpu_type: $gpu
}') }')
echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands" echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
# run the benchmark # run the benchmark
eval "$latency_command" eval "$latency_command"
...@@ -176,7 +179,6 @@ run_latency_tests() { ...@@ -176,7 +179,6 @@ run_latency_tests() {
done done
} }
run_throughput_tests() { run_throughput_tests() {
# run throughput tests using `benchmark_throughput.py` # run throughput tests using `benchmark_throughput.py`
# $1: a json file specifying throughput test cases # $1: a json file specifying throughput test cases
...@@ -224,7 +226,7 @@ run_throughput_tests() { ...@@ -224,7 +226,7 @@ run_throughput_tests() {
throughput_command: $command, throughput_command: $command,
gpu_type: $gpu gpu_type: $gpu
}') }')
echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands" echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands"
# run the benchmark # run the benchmark
eval "$throughput_command" eval "$throughput_command"
...@@ -256,7 +258,6 @@ run_serving_tests() { ...@@ -256,7 +258,6 @@ run_serving_tests() {
continue continue
fi fi
# get client and server arguments # get client and server arguments
server_params=$(echo "$params" | jq -r '.server_parameters') server_params=$(echo "$params" | jq -r '.server_parameters')
client_params=$(echo "$params" | jq -r '.client_parameters') client_params=$(echo "$params" | jq -r '.client_parameters')
...@@ -334,7 +335,7 @@ run_serving_tests() { ...@@ -334,7 +335,7 @@ run_serving_tests() {
client_command: $client, client_command: $client,
gpu_type: $gpu gpu_type: $gpu
}') }')
echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands" echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done done
...@@ -351,6 +352,7 @@ main() { ...@@ -351,6 +352,7 @@ main() {
# dependencies # dependencies
(which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq) (which jq) || (apt-get update && apt-get -y install jq)
(which lsof) || (apt-get update && apt-get install -y lsof)
# get the current IP address, required by benchmark_serving.py # get the current IP address, required by benchmark_serving.py
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
...@@ -369,7 +371,6 @@ main() { ...@@ -369,7 +371,6 @@ main() {
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json
# postprocess benchmarking results # postprocess benchmarking results
pip install tabulate pandas pip install tabulate pandas
python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
......
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
pkill text-generation || true
# waiting for GPU processes to be fully killed
sleep 10
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
timeout 1200 bash -c '
until curl -s localhost:8000/generate_stream > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append tgi to the test name
test_name=tgi_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.tgi_server_parameters')
client_params=$(echo "$params" | jq -r '.tgi_client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
echo "Key 'fp8' exists in common params."
server_command="/tgi-entrypoint.sh \
--model-id $model \
--num-shard $tp \
--port $port \
--quantize fp8 \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="/tgi-entrypoint.sh \
--model-id $model \
--num-shard $tp \
--port $port \
$server_args"
fi
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
eval "$server_command" &
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "tgi server is up and running."
else
echo ""
echo "tgi failed to start within the timeout period."
break
fi
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend tgi \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "tgi" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
export CURRENT_LLM_SERVING_ENGINE=tgi
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python -m pip install tabulate pandas
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
pkill tritonserver || true
# waiting for GPU processes to be fully killed
sleep 20
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
timeout 1200 bash -c '
until curl -s localhost:8000/generate_stream > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append trt to the test name
test_name=trt_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.trt_server_parameters')
client_params=$(echo "$params" | jq -r '.trt_client_parameters')
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required model_tp_size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
cd $VLLM_SOURCE_CODE_LOC/benchmarks
echo "Running test case $test_name"
bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params"
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "trt server is up and running."
else
echo ""
echo "trt failed to start within the timeout period."
break
fi
# prepare tokenizer
cd $VLLM_SOURCE_CODE_LOC/benchmarks
rm -rf /tokenizer_cache
mkdir /tokenizer_cache
python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
--model "$model" \
--cachedir /tokenizer_cache
cd $VLLM_SOURCE_CODE_LOC/benchmarks
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend tensorrt-llm \
--tokenizer /tokenizer_cache \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
server_command=""
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "trt" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
# update transformers package, to make sure mixtral tokenizer is available
python -m pip install transformers -U
export CURRENT_LLM_SERVING_ENGINE=trt
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python -m pip install tabulate pandas
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
# kill all processes on GPU.
pkill pt_main_thread
sleep 10
# remove vllm config file
rm -rf ~/.config/vllm
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
timeout 1200 bash -c '
until curl -s localhost:8000/v1/completions > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append vllm to the test name
test_name=vllm_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.vllm_server_parameters')
client_params=$(echo "$params" | jq -r '.vllm_client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
fi
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
eval "$server_command" &
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "vllm server is up and running."
else
echo ""
echo "vllm failed to start within the timeout period."
break
fi
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "vllm" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
export CURRENT_LLM_SERVING_ENGINE=vllm
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python3 -m pip install tabulate pandas
python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"
...@@ -17,10 +17,17 @@ serving_column_mapping = { ...@@ -17,10 +17,17 @@ serving_column_mapping = {
"request_throughput": "Tput (req/s)", "request_throughput": "Tput (req/s)",
"mean_ttft_ms": "Mean TTFT (ms)", "mean_ttft_ms": "Mean TTFT (ms)",
"std_ttft_ms": "Std TTFT (ms)", "std_ttft_ms": "Std TTFT (ms)",
"median_ttft_ms": "Median TTFT (ms)",
"mean_itl_ms": "Mean ITL (ms)", "mean_itl_ms": "Mean ITL (ms)",
"std_itl_ms": "Std ITL (ms)", "std_itl_ms": "Std ITL (ms)",
"input_throughput": "Input Tput (tok/s)", "median_itl_ms": "Median ITL (ms)",
"mean_tpot_ms": "Mean TPOT (ms)",
"std_tpot_ms": "Std TPOT (ms)",
"median_tpot_ms": "Median TPOT (ms)",
"total_token_throughput": "Total Token Tput (tok/s)",
"output_throughput": "Output Tput (tok/s)", "output_throughput": "Output Tput (tok/s)",
"total_input_tokens": "Total input tokens",
"total_output_tokens": "Total output tokens",
"engine": "Engine", "engine": "Engine",
} }
......
...@@ -2,9 +2,11 @@ ...@@ -2,9 +2,11 @@
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token) TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT" URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
TIMEOUT_SECONDS=10
retries=0 retries=0
while [ $retries -lt 1000 ]; do while [ $retries -lt 1000 ]; do
if [ $(curl -s -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then if [ $(curl -s --max-time $TIMEOUT_SECONDS -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
exit 0 exit 0
fi fi
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
{ {
"test_name": "latency_llama8B_tp1", "test_name": "latency_llama8B_tp1",
"parameters": { "parameters": {
"model": "meta-llama/Meta-Llama-3-8B", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1, "tensor_parallel_size": 1,
"load_format": "dummy", "load_format": "dummy",
"num_iters_warmup": 5, "num_iters_warmup": 5,
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
{ {
"test_name": "latency_llama70B_tp4", "test_name": "latency_llama70B_tp4",
"parameters": { "parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4, "tensor_parallel_size": 4,
"load_format": "dummy", "load_format": "dummy",
"num-iters-warmup": 5, "num-iters-warmup": 5,
......
[ [
{ {
"test_name": "llama8B_tp1", "test_name": "llama8B_tp1_sharegpt",
"qps_list": [4], "qps_list": [4,8,16,32,"inf"],
"common_parameters": { "common_parameters": {
"model": "meta-llama/Meta-Llama-3-8B", "model": "meta-llama/Meta-Llama-3-8B-Instruct",
"tp": 1, "tp": 1,
"dataset_name": "sharegpt", "dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 500, "num_prompts": 500,
"port": 8000 "port": 8000,
"reuse_server": false
}, },
"lmdeploy_server_parameters": { "lmdeploy_server_parameters": {
"dtype": "bfloat16"
}, },
"lmdeploy_client_parameters": { "lmdeploy_client_parameters": {
}, },
...@@ -21,34 +23,158 @@ ...@@ -21,34 +23,158 @@
}, },
"trt_server_parameters": { "trt_server_parameters": {
"model_type": "llama", "model_type": "llama",
"model_dtype": "float16", "model_dtype": "bfloat16",
"max_batch_size": 256, "max_batch_size": 2048,
"max_input_len": 4096, "max_input_len": 4096,
"max_output_len": 4096, "max_seq_len": 6144,
"trt_llm_version": "r24.04" "max_num_tokens": 16384,
"trt_llm_version": "v0.11.0"
}, },
"trt_client_parameters": { "trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream" "endpoint": "/v2/models/ensemble/generate_stream"
}, },
"vllm_server_parameters": { "vllm_server_parameters": {
"disable_log_stats": "", "disable_log_stats": "",
"disable_log_requests": "" "disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
}, },
"vllm_client_parameters": { "vllm_client_parameters": {
},
"sglang_server_parameters": {
"disable_radix_cache": "",
"enable_torch_compile": "",
"dtype": "bfloat16"
},
"sglang_client_parameters": {
}
},
{
"test_name": "llama8B_tp1_sonnet_512_16",
"qps_list": [4,8,16,32,"inf"],
"common_parameters": {
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
"tp": 1,
"dataset_name": "sonnet",
"dataset_path": "./sonnet_4x.txt",
"num_prompts": 500,
"port": 8000,
"sonnet_input_len": 512,
"sonnet_output_len": 16,
"sonnet_prefix_len": 50,
"reuse_server": true
},
"lmdeploy_server_parameters": {
"dtype": "bfloat16"
},
"lmdeploy_client_parameters": {
},
"tgi_server_parameters": {
},
"tgi_client_parameters": {
"endpoint": "/generate_stream"
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "bfloat16",
"max_batch_size": 2048,
"max_input_len": 4096,
"max_seq_len": 6144,
"max_num_tokens": 16384,
"trt_llm_version": "v0.11.0"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
"vllm_client_parameters": {
},
"sglang_server_parameters": {
"disable_radix_cache": "",
"enable_torch_compile": "",
"dtype": "bfloat16"
},
"sglang_client_parameters": {
}
},
{
"test_name": "llama8B_tp1_sonnet_512_256",
"qps_list": [4,8,16,32,"inf"],
"common_parameters": {
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
"tp": 1,
"dataset_name": "sonnet",
"dataset_path": "./sonnet_4x.txt",
"num_prompts": 500,
"port": 8000,
"sonnet_input_len": 512,
"sonnet_output_len": 256,
"sonnet_prefix_len": 50,
"reuse_server": true
},
"lmdeploy_server_parameters": {
"dtype": "bfloat16"
},
"lmdeploy_client_parameters": {
},
"tgi_server_parameters": {
},
"tgi_client_parameters": {
"endpoint": "/generate_stream"
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "bfloat16",
"max_batch_size": 2048,
"max_input_len": 4096,
"max_seq_len": 6144,
"max_num_tokens": 16384,
"trt_llm_version": "v0.11.0"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
"vllm_client_parameters": {
},
"sglang_server_parameters": {
"disable_radix_cache": "",
"enable_torch_compile": "",
"dtype": "bfloat16"
},
"sglang_client_parameters": {
} }
}, },
{ {
"test_name": "llama70B_tp4", "test_name": "llama70B_tp4_sharegpt",
"qps_list": [2], "qps_list": [4,8,16,32,"inf"],
"common_parameters": { "common_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct", "model": "meta-llama/Meta-Llama-3-70B-Instruct",
"tp": 4, "tp": 4,
"dataset_name": "sharegpt", "dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 500, "num_prompts": 500,
"port": 8000 "port": 8000,
"reuse_server": false
}, },
"lmdeploy_server_parameters": { "lmdeploy_server_parameters": {
"dtype": "bfloat16"
}, },
"lmdeploy_client_parameters": { "lmdeploy_client_parameters": {
}, },
...@@ -59,34 +185,50 @@ ...@@ -59,34 +185,50 @@
}, },
"trt_server_parameters": { "trt_server_parameters": {
"model_type": "llama", "model_type": "llama",
"model_dtype": "float16", "model_dtype": "bfloat16",
"max_batch_size": 256, "max_batch_size": 2048,
"max_input_len": 4096, "max_input_len": 4096,
"max_output_len": 4096, "max_seq_len": 6144,
"trt_llm_version": "r24.04" "max_num_tokens": 16384,
"trt_llm_version": "v0.11.0"
}, },
"trt_client_parameters": { "trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream" "endpoint": "/v2/models/ensemble/generate_stream"
}, },
"vllm_server_parameters": { "vllm_server_parameters": {
"disable_log_stats": "", "disable_log_stats": "",
"disable_log_requests": "" "disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
}, },
"vllm_client_parameters": { "vllm_client_parameters": {
},
"sglang_server_parameters": {
"disable_radix_cache": "",
"dtype": "bfloat16"
},
"sglang_client_parameters": {
} }
}, },
{ {
"test_name": "mixtral8x7B_tp2", "test_name": "llama70B_tp4_sonnet_512_16",
"qps_list": [2], "qps_list": [4,8,16,32,"inf"],
"common_parameters": { "common_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1", "model": "meta-llama/Meta-Llama-3-70B-Instruct",
"tp": 2, "tp": 4,
"dataset_name": "sharegpt", "dataset_name": "sonnet",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "dataset_path": "./sonnet_4x.txt",
"num_prompts": 500, "num_prompts": 500,
"port": 8000 "port": 8000,
"sonnet_input_len": 512,
"sonnet_output_len": 16,
"sonnet_prefix_len": 50,
"reuse_server": true
}, },
"lmdeploy_server_parameters": { "lmdeploy_server_parameters": {
"dtype": "bfloat16"
}, },
"lmdeploy_client_parameters": { "lmdeploy_client_parameters": {
}, },
...@@ -97,20 +239,85 @@ ...@@ -97,20 +239,85 @@
}, },
"trt_server_parameters": { "trt_server_parameters": {
"model_type": "llama", "model_type": "llama",
"model_dtype": "float16", "model_dtype": "bfloat16",
"max_batch_size": 256, "max_batch_size": 2048,
"max_input_len": 4096, "max_input_len": 4096,
"max_output_len": 4096, "max_seq_len": 6144,
"trt_llm_version": "r24.04" "max_num_tokens": 16384,
"trt_llm_version": "v0.11.0"
}, },
"trt_client_parameters": { "trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream" "endpoint": "/v2/models/ensemble/generate_stream"
}, },
"vllm_server_parameters": { "vllm_server_parameters": {
"disable_log_stats": "", "disable_log_stats": "",
"disable_log_requests": "" "disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
}, },
"vllm_client_parameters": { "vllm_client_parameters": {
},
"sglang_server_parameters": {
"disable_radix_cache": "",
"dtype": "bfloat16"
},
"sglang_client_parameters": {
}
},
{
"test_name": "llama70B_tp4_sonnet_512_256",
"qps_list": [4,8,16,32,"inf"],
"common_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"tp": 4,
"dataset_name": "sonnet",
"dataset_path": "./sonnet_4x.txt",
"num_prompts": 500,
"port": 8000,
"sonnet_input_len": 512,
"sonnet_output_len": 256,
"sonnet_prefix_len": 50,
"reuse_server": true
},
"lmdeploy_server_parameters": {
"dtype": "bfloat16"
},
"lmdeploy_client_parameters": {
},
"tgi_server_parameters": {
},
"tgi_client_parameters": {
"endpoint": "/generate_stream"
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "bfloat16",
"max_batch_size": 2048,
"max_input_len": 4096,
"max_seq_len": 6144,
"max_num_tokens": 16384,
"trt_llm_version": "v0.11.0"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
"vllm_client_parameters": {
},
"sglang_server_parameters": {
"disable_radix_cache": "",
"dtype": "bfloat16"
},
"sglang_client_parameters": {
} }
} }
] ]
\ No newline at end of file
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
"test_name": "serving_llama8B_tp1_sharegpt", "test_name": "serving_llama8B_tp1_sharegpt",
"qps_list": [1, 4, 16, "inf"], "qps_list": [1, 4, 16, "inf"],
"server_parameters": { "server_parameters": {
"model": "meta-llama/Meta-Llama-3-8B", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1, "tensor_parallel_size": 1,
"swap_space": 16, "swap_space": 16,
"disable_log_stats": "", "disable_log_stats": "",
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
"load_format": "dummy" "load_format": "dummy"
}, },
"client_parameters": { "client_parameters": {
"model": "meta-llama/Meta-Llama-3-8B", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"backend": "vllm", "backend": "vllm",
"dataset_name": "sharegpt", "dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
"test_name": "serving_llama70B_tp4_sharegpt", "test_name": "serving_llama70B_tp4_sharegpt",
"qps_list": [1, 4, 16, "inf"], "qps_list": [1, 4, 16, "inf"],
"server_parameters": { "server_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4, "tensor_parallel_size": 4,
"swap_space": 16, "swap_space": 16,
"disable_log_stats": "", "disable_log_stats": "",
...@@ -30,7 +30,7 @@ ...@@ -30,7 +30,7 @@
"load_format": "dummy" "load_format": "dummy"
}, },
"client_parameters": { "client_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"backend": "vllm", "backend": "vllm",
"dataset_name": "sharegpt", "dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
...@@ -60,7 +60,7 @@ ...@@ -60,7 +60,7 @@
"test_name": "serving_llama70B_tp4_sharegpt_specdecode", "test_name": "serving_llama70B_tp4_sharegpt_specdecode",
"qps_list": [2], "qps_list": [2],
"server_parameters": { "server_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"disable_log_requests": "", "disable_log_requests": "",
"tensor_parallel_size": 4, "tensor_parallel_size": 4,
"swap_space": 16, "swap_space": 16,
...@@ -70,7 +70,7 @@ ...@@ -70,7 +70,7 @@
"use_v2_block_manager": "" "use_v2_block_manager": ""
}, },
"client_parameters": { "client_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"backend": "vllm", "backend": "vllm",
"dataset_name": "sharegpt", "dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
{ {
"test_name": "throughput_llama8B_tp1", "test_name": "throughput_llama8B_tp1",
"parameters": { "parameters": {
"model": "meta-llama/Meta-Llama-3-8B", "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1, "tensor_parallel_size": 1,
"load_format": "dummy", "load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
{ {
"test_name": "throughput_llama70B_tp4", "test_name": "throughput_llama70B_tp4",
"parameters": { "parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct", "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"tensor_parallel_size": 4, "tensor_parallel_size": 4,
"load_format": "dummy", "load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
......
steps: steps:
- label: "Build wheel - CUDA {{matrix.cuda_version}}" - label: "Build wheel - CUDA 12.1"
agents: agents:
queue: cpu_queue queue: cpu_queue
commands: commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --tag vllm-ci:build-image --target build --progress plain ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
# rename the files to change linux -> manylinux1
- "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
- "mv artifacts/dist/$(ls artifacts/dist) artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
- "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/$BUILDKITE_COMMIT/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
- "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
env:
DOCKER_BUILDKIT: "1"
- block: "Build CUDA 11.8 wheel"
key: block-build-cu118-wheel
- label: "Build wheel - CUDA 11.8"
depends_on: block-build-cu118-wheel
agents:
queue: cpu_queue
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
# rename the files to change linux -> manylinux1 # rename the files to change linux -> manylinux1
...@@ -12,8 +31,3 @@ steps: ...@@ -12,8 +31,3 @@ steps:
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/" - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
matrix:
setup:
cuda_version:
- "11.8.0"
- "12.1.0"
# This script runs test inside the corresponding ROCm docker container. # This script runs test inside the corresponding ROCm docker container.
set -ex set -o pipefail
# Print ROCm version # Print ROCm version
echo "--- Confirming Clean Initial State" echo "--- Confirming Clean Initial State"
...@@ -70,15 +70,85 @@ HF_CACHE="$(realpath ~)/huggingface" ...@@ -70,15 +70,85 @@ HF_CACHE="$(realpath ~)/huggingface"
mkdir -p ${HF_CACHE} mkdir -p ${HF_CACHE}
HF_MOUNT="/root/.cache/huggingface" HF_MOUNT="/root/.cache/huggingface"
docker run \ commands=$@
echo "Commands:$commands"
#ignore certain kernels tests
if [[ $commands == *" kernels "* ]]; then
commands="${commands} \
--ignore=kernels/test_attention.py \
--ignore=kernels/test_attention_selector.py \
--ignore=kernels/test_blocksparse_attention.py \
--ignore=kernels/test_causal_conv1d.py \
--ignore=kernels/test_cutlass.py \
--ignore=kernels/test_encoder_decoder_attn.py \
--ignore=kernels/test_flash_attn.py \
--ignore=kernels/test_flashinfer.py \
--ignore=kernels/test_gguf.py \
--ignore=kernels/test_int8_quant.py \
--ignore=kernels/test_machete_gemm.py \
--ignore=kernels/test_mamba_ssm.py \
--ignore=kernels/test_marlin_gemm.py \
--ignore=kernels/test_moe.py \
--ignore=kernels/test_prefix_prefill.py \
--ignore=kernels/test_rand.py \
--ignore=kernels/test_sampler.py"
fi
#ignore certain Entrypoints tests
if [[ $commands == *" entrypoints/openai "* ]]; then
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
--ignore=entrypoints/openai/test_accuracy.py \
--ignore=entrypoints/openai/test_audio.py \
--ignore=entrypoints/openai/test_encoder_decoder.py \
--ignore=entrypoints/openai/test_embedding.py \
--ignore=entrypoints/openai/test_oot_registration.py "}
fi
PARALLEL_JOB_COUNT=8
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
#replace shard arguments
commands=${commands//"--shard-id= "/"--shard-id=${GPU} "}
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
echo "Shard ${GPU} commands:$commands"
docker run \
--device /dev/kfd --device /dev/dri \
--network host \
--shm-size=16gb \
--rm \
-e HIP_VISIBLE_DEVICES=${GPU} \
-e HF_TOKEN \
-v ${HF_CACHE}:${HF_MOUNT} \
-e HF_HOME=${HF_MOUNT} \
--name ${container_name}_${GPU} \
${image_name} \
/bin/bash -c "${commands}" \
|& while read -r line; do echo ">>Shard $GPU: $line"; done &
PIDS+=($!)
done
#wait for all processes to finish and collect exit codes
for pid in ${PIDS[@]}; do
wait ${pid}
STATUS+=($?)
done
for st in ${STATUS[@]}; do
if [[ ${st} -ne 0 ]]; then
echo "One of the processes failed with $st"
exit ${st}
fi
done
else
docker run \
--device /dev/kfd --device /dev/dri \ --device /dev/kfd --device /dev/dri \
--network host \ --network host \
--shm-size=16gb \ --shm-size=16gb \
--rm \ --rm \
-e HIP_VISIBLE_DEVICES=0 \
-e HF_TOKEN \ -e HF_TOKEN \
-v ${HF_CACHE}:${HF_MOUNT} \ -v ${HF_CACHE}:${HF_MOUNT} \
-e HF_HOME=${HF_MOUNT} \ -e HF_HOME=${HF_MOUNT} \
--name ${container_name} \ --name ${container_name} \
${image_name} \ ${image_name} \
/bin/bash -c "${@}" /bin/bash -c "${commands}"
fi
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
# Try building the docker image
docker build -t cpu-test -f Dockerfile.ppc64le .
# Setup cleanup
remove_docker_container() { docker rm -f cpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image, setting --shm-size=4g for tensor parallel.
source /etc/environment
#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN=$HF_TOKEN --name cpu-test cpu-test
# Run basic model test
docker exec cpu-test bash -c "
pip install pytest matplotlib einops transformers_stream_generator
pytest -v -s tests/models -m \"not vlm\" \
--ignore=tests/models/test_embedding.py \
--ignore=tests/models/test_oot_registration.py \
--ignore=tests/models/test_registry.py \
--ignore=tests/models/test_jamba.py \
--ignore=tests/models/test_mamba.py \
--ignore=tests/models/test_danube3_4b.py" # Mamba kernels and Danube3-4B on CPU is not supported
# online inference
docker exec cpu-test bash -c "
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
--backend vllm \
--dataset-name random \
--model facebook/opt-125m \
--num-prompts 20 \
--endpoint /v1/completions \
--tokenizer facebook/opt-125m"
...@@ -22,8 +22,25 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" ...@@ -22,8 +22,25 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
# Run basic model test # Run basic model test
docker exec cpu-test bash -c " docker exec cpu-test bash -c "
pip install pytest Pillow protobuf pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported pytest -v -s tests/models/encoder_decoder/language
pytest -v -s tests/models/decoder_only/language \
--ignore=tests/models/test_fp8.py \
--ignore=tests/models/decoder_only/language/test_jamba.py \
--ignore=tests/models/decoder_only/language/test_mamba.py \
--ignore=tests/models/decoder_only/language/test_granitemoe.py \
--ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
# Run compressed-tensor test
docker exec cpu-test bash -c "
pytest -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
# Run AWQ test
docker exec cpu-test bash -c "
pytest -s -v \
tests/quantization/test_ipex_quant.py"
# online inference # online inference
docker exec cpu-test bash -c " docker exec cpu-test bash -c "
......
...@@ -12,5 +12,4 @@ remove_docker_container ...@@ -12,5 +12,4 @@ remove_docker_container
# For HF_TOKEN. # For HF_TOKEN.
source /etc/environment source /etc/environment
# Run a simple end-to-end example. # Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu \ docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
python3 /workspace/vllm/examples/offline_inference_tpu.py
...@@ -11,4 +11,4 @@ trap remove_docker_container EXIT ...@@ -11,4 +11,4 @@ trap remove_docker_container EXIT
remove_docker_container remove_docker_container
# Run the image and launch offline inference # Run the image and launch offline inference
docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py
This diff is collapsed.
/.venv
/build
dist
vllm/*.so vllm/*.so
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
.mypy_cache
# Distribution / packaging
.Python
/build/
cmake-build-*/
CMakeUserPresets.json
develop-eggs/
/dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment