Unverified commit 441b65d8, authored by Chenyaaang, committed by GitHub
Browse files

[Misc][Tools][Benchmark] Fix and improve auto tune script (#19163)


Signed-off-by: Chenyaaang <chenyangli@google.com>
parent 46ecc579
...@@ -10,11 +10,15 @@ ...@@ -10,11 +10,15 @@
# 3. Set variables (ALL REQUIRED) # 3. Set variables (ALL REQUIRED)
# BASE: your directory for vllm repo # BASE: your directory for vllm repo
# MODEL: the model served by vllm # MODEL: the model served by vllm
# TP: ways of tensor parallelism
# DOWNLOAD_DIR: directory to download and load model weights. # DOWNLOAD_DIR: directory to download and load model weights.
# INPUT_LEN: request input len # INPUT_LEN: request input len
# OUTPUT_LEN: request output len # OUTPUT_LEN: request output len
# MIN_CACHE_HIT_PCT: prefix cache rate # MIN_CACHE_HIT_PCT: prefix cache rate
# MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000 # MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000
# NUM_SEQS_LIST: a list of `max-num-seqs` you want to loop with.
# NUM_BATCHED_TOKENS_LIST: a list of `max-num-batched-tokens` you want to loop with.
# Note that the default NUM_SEQS_LIST and NUM_BATCHED_TOKENS_LIST are set for medium size input/output len, for extra short context (such as 20:20), you might need to include larger numbers in NUM_SEQS_LIST.
# 4. Run the script, it might take a long time, you can use tmux to avoid the script stop if disconnection happens. # 4. Run the script, it might take a long time, you can use tmux to avoid the script stop if disconnection happens.
# 5. The final result will be saved in RESULT file. # 5. The final result will be saved in RESULT file.
...@@ -30,31 +34,27 @@ ...@@ -30,31 +34,27 @@
TAG=$(date +"%Y_%m_%d_%H_%M") TAG=$(date +"%Y_%m_%d_%H_%M")
BASE="" BASE=""
MODEL="meta-llama/Llama-3.1-8B-Instruct" MODEL="meta-llama/Llama-3.1-8B-Instruct"
TP=1
DOWNLOAD_DIR="" DOWNLOAD_DIR=""
INPUT_LEN=4000 INPUT_LEN=4000
OUTPUT_LEN=16 OUTPUT_LEN=16
MIN_CACHE_HIT_PCT_PCT=0 MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=100000000000 MAX_LATENCY_ALLOWED_MS=100000000000
NUM_SEQS_LIST="128 256"
NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
LOG_FOLDER="$BASE/auto-benchmark/$TAG" LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt" RESULT="$LOG_FOLDER/result.txt"
echo "result file$ $RESULT" echo "result file: $RESULT"
echo "model: $MODEL" echo "model: $MODEL"
echo
rm -rf $LOG_FOLDER rm -rf $LOG_FOLDER
mkdir -p $LOG_FOLDER mkdir -p $LOG_FOLDER
cd "$BASE/vllm" cd "$BASE/vllm"
# create sonnet-4x.txt so that we can sample 2048 tokens for input
echo "" > benchmarks/sonnet_4x.txt
for _ in {1..4}
do
cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
done
pip install datasets pip install -q datasets
current_hash=$(git rev-parse HEAD) current_hash=$(git rev-parse HEAD)
echo "hash:$current_hash" >> "$RESULT" echo "hash:$current_hash" >> "$RESULT"
...@@ -64,53 +64,69 @@ best_throughput=0 ...@@ -64,53 +64,69 @@ best_throughput=0
best_max_num_seqs=0 best_max_num_seqs=0
best_num_batched_tokens=0 best_num_batched_tokens=0
best_goodput=0 best_goodput=0
run_benchmark() {
local max_num_seqs=$1
local max_num_batched_tokens=$2
echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
echo "vllm_log: $vllm_log"
echo
rm -f $vllm_log
# start the server start_server() {
local gpu_memory_utilization=$1
local max_num_seqs=$2
local max_num_batched_tokens=$3
local vllm_log=$4
pkill -f vllm
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \ VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
--disable-log-requests \ --disable-log-requests \
--port 8004 \ --port 8004 \
--gpu-memory-utilization 0.98 \ --gpu-memory-utilization $gpu_memory_utilization \
--max-num-seqs $max_num_seqs \ --max-num-seqs $max_num_seqs \
--max-num-batched-tokens $max_num_batched_tokens \ --max-num-batched-tokens $max_num_batched_tokens \
--tensor-parallel-size 1 \ --tensor-parallel-size $TP \
--enable-prefix-caching \ --enable-prefix-caching \
--load-format dummy \ --load-format dummy \
--download-dir $DOWNLOAD_DIR \ --download-dir "$DOWNLOAD_DIR" \
--max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 & --max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
echo "wait for 10 minutes.."
echo
# wait for 10 minutes... # wait for 10 minutes...
server_started=0 server_started=0
for i in {1..60}; do for i in {1..60}; do
if grep -Fq "Application startup complete" "$vllm_log"; then RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
echo "Application started" STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
if [[ "$STATUS_CODE" -eq 200 ]]; then
server_started=1 server_started=1
break break
else else
# echo "wait for 10 seconds..."
sleep 10 sleep 10
fi fi
done done
if (( ! server_started )); then if (( ! server_started )); then
echo "server did not start within 10 minutes, terminate the benchmarking. Please check server log at $vllm_log" echo "server did not start within 10 minutes. Please check server log at $vllm_log".
echo "pkill -f vllm"
echo
pkill vllm
sleep 10
return 1 return 1
else
return 0
fi fi
}
echo "run benchmark test..." run_benchmark() {
local max_num_seqs=$1
local max_num_batched_tokens=$2
local gpu_memory_utilization=$3
echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
echo "vllm_log: $vllm_log"
echo echo
rm -f $vllm_log
pkill -f vllm
echo "starting server..."
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log
result=$?
if [[ "$result" -eq 1 ]]; then
echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
else
echo "server started."
fi
echo
echo "run benchmark test..."
meet_latency_requirement=0 meet_latency_requirement=0
# get a basic qps by using request-rate inf # get a basic qps by using request-rate inf
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt" bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
...@@ -118,29 +134,29 @@ run_benchmark() { ...@@ -118,29 +134,29 @@ run_benchmark() {
python benchmarks/benchmark_serving.py \ python benchmarks/benchmark_serving.py \
--backend vllm \ --backend vllm \
--model $MODEL \ --model $MODEL \
--dataset-name sonnet \ --dataset-name random \
--dataset-path benchmarks/sonnet_4x.txt \ --random-input-len $INPUT_LEN \
--sonnet-input-len $INPUT_LEN \ --random-output-len $OUTPUT_LEN \
--sonnet-output-len $OUTPUT_LEN \
--ignore-eos \ --ignore-eos \
--disable-tqdm \ --disable-tqdm \
--request-rate inf \ --request-rate inf \
--percentile-metrics ttft,tpot,itl,e2el \ --percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \ --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \ --num-prompts 1000 \
--sonnet-prefix-len $prefix_len \ --random-prefix-len $prefix_len \
--port 8004 > "$bm_log" --port 8004 &> "$bm_log"
through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
meet_latency_requirement=1 meet_latency_requirement=1
request_rate=inf
fi fi
if (( ! meet_latency_requirement )); then if (( ! meet_latency_requirement )); then
# start from request-rate as int(through_put) + 1 # start from request-rate as int(throughput) + 1
request_rate=$((${through_put%.*} + 1)) request_rate=$((${throughput%.*} + 1))
while ((request_rate > 0)); do while ((request_rate > 0)); do
# clear prefix cache # clear prefix cache
curl -X POST http://0.0.0.0:8004/reset_prefix_cache curl -X POST http://0.0.0.0:8004/reset_prefix_cache
...@@ -149,19 +165,18 @@ run_benchmark() { ...@@ -149,19 +165,18 @@ run_benchmark() {
python benchmarks/benchmark_serving.py \ python benchmarks/benchmark_serving.py \
--backend vllm \ --backend vllm \
--model $MODEL \ --model $MODEL \
--dataset-name sonnet \ --dataset-name random \
--dataset-path benchmarks/sonnet_4x.txt \ --random-input-len $INPUT_LEN \
--sonnet-input-len $INPUT_LEN \ --random-output-len $OUTPUT_LEN \
--sonnet-output-len $OUTPUT_LEN \ --ignore-eos \
--ignore_eos \
--disable-tqdm \ --disable-tqdm \
--request-rate $request_rate \ --request-rate $request_rate \
--percentile-metrics ttft,tpot,itl,e2el \ --percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \ --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \ --num-prompts 100 \
--sonnet-prefix-len $prefix_len \ --random-prefix-len $prefix_len \
--port 8004 > "$bm_log" --port 8004 &> "$bm_log"
through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}') e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g') goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
...@@ -173,10 +188,10 @@ run_benchmark() { ...@@ -173,10 +188,10 @@ run_benchmark() {
fi fi
# write the results and update the best result. # write the results and update the best result.
if ((meet_latency_requirement)); then if ((meet_latency_requirement)); then
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput" echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput"
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput" >> "$RESULT" echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" >> "$RESULT"
if (( $(echo "$through_put > $best_throughput" | bc -l) )); then if (( $(echo "$throughput > $best_throughput" | bc -l) )); then
best_throughput=$through_put best_throughput=$throughput
best_max_num_seqs=$max_num_seqs best_max_num_seqs=$max_num_seqs
best_num_batched_tokens=$max_num_batched_tokens best_num_batched_tokens=$max_num_batched_tokens
best_goodput=$goodput best_goodput=$goodput
...@@ -188,22 +203,39 @@ run_benchmark() { ...@@ -188,22 +203,39 @@ run_benchmark() {
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
echo "pkill -f vllm"
echo
pkill vllm pkill vllm
sleep 10 sleep 10
rm -f $vllm_log
printf '=%.0s' $(seq 1 20) printf '=%.0s' $(seq 1 20)
return 0 return 0
} }
read -r -a num_seqs_list <<< "$NUM_SEQS_LIST"
read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST"
# first find out the max gpu-memory-utilization without HBM OOM.
gpu_memory_utilization=0.98
find_gpu_memory_utilization=0
while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log"
result=$?
if [[ "$result" -eq 0 ]]; then
find_gpu_memory_utilization=1
break
else
gpu_memory_utilization=$(echo "$gpu_memory_utilization - 0.01" | bc)
fi
done
if [[ "$find_gpu_memory_utilization" -eq 1 ]]; then
echo "Using gpu_memory_utilization=$gpu_memory_utilization to serve model."
else
echo "Cannot find a proper gpu_memory_utilization over 0.9 to serve the model, please check logs in $LOG_FOLDER."
exit 1
fi
num_seqs_list="128 256" for num_seqs in "${num_seqs_list[@]}"; do
num_batched_tokens_list="512 1024 2048 4096" for num_batched_tokens in "${num_batched_tokens_list[@]}"; do
for num_seqs in $num_seqs_list; do run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization
for num_batched_tokens in $num_batched_tokens_list; do
run_benchmark $num_seqs $num_batched_tokens
exit 0
done done
done done
echo "finish permutations" echo "finish permutations"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment