#!/bin/bash

# This script aims to tune the best server parameter combinations to maximize
# throughput for a given requirement.
# See details in README (benchmarks/auto_tune/README.md).

# ---- User configuration -----------------------------------------------------
# Timestamp used to give every run its own log folder.
TAG=$(date +"%Y_%m_%d_%H_%M")
# Parent directory: the vllm checkout is expected at "$BASE/vllm" and logs are
# written below "$BASE/auto-benchmark".
BASE=""
# Model passed to `vllm serve` and to the benchmark client.
MODEL="meta-llama/Llama-3.1-8B-Instruct"
# "TPU" or "GPU"; selects where the profiler output is looked up.
SYSTEM="TPU"
# Value for --tensor-parallel-size.
TP=1
# Value for --download-dir.
DOWNLOAD_DIR=""
# Random-dataset prompt / completion lengths; their sum is --max-model-len.
INPUT_LEN=4000
OUTPUT_LEN=16
# Percentage of INPUT_LEN used as --random-prefix-len.
MIN_CACHE_HIT_PCT=0
# Upper bound for the P99 end-to-end latency (ms) a configuration may have.
MAX_LATENCY_ALLOWED_MS=100000000000
# Space-separated candidates for --max-num-seqs / --max-num-batched-tokens.
NUM_SEQS_LIST="128 256"
NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"

# ---- Derived paths ----------------------------------------------------------
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt"
PROFILE_PATH="$LOG_FOLDER/profile"

echo "result file: $RESULT"
echo "model: $MODEL"

# Start from an empty log folder for this TAG.
rm -rf "$LOG_FOLDER"
rm -rf "$PROFILE_PATH"
mkdir -p "$LOG_FOLDER"
mkdir -p "$PROFILE_PATH"

# Everything below uses paths relative to the vllm checkout, so a failed cd
# must stop the script instead of running git/benchmarks somewhere else.
cd "$BASE/vllm" || { echo "cannot cd to $BASE/vllm" >&2; exit 1; }

pip install -q datasets

# Record which commit the results belong to.
current_hash=$(git rev-parse HEAD)
echo "hash:$current_hash" >> "$RESULT"
echo "current_hash: $current_hash"

# Best configuration found so far; updated by run_benchmark.
best_throughput=0
best_max_num_seqs=0
best_num_batched_tokens=0
best_goodput=0

#######################################
# Launch `vllm serve` in the background and wait until it reports healthy.
# Globals (read):
#   MODEL, TP, DOWNLOAD_DIR, INPUT_LEN, OUTPUT_LEN
# Arguments:
#   $1 - value for --gpu-memory-utilization
#   $2 - value for --max-num-seqs
#   $3 - value for --max-num-batched-tokens
#   $4 - file that receives the server's stdout and stderr
#   $5 - directory exported as VLLM_TORCH_PROFILER_DIR (may be empty)
# Returns:
#   0 once GET /health on port 8004 answers 200;
#   1 if the server process exits first, or if 60 polls spaced 10 seconds
#     apart pass without a 200.
#######################################
start_server() {
    local gpu_memory_utilization=$1
    local max_num_seqs=$2
    local max_num_batched_tokens=$3
    local vllm_log=$4
    local profile_dir=$5
    local server_pid
    local status_code
    local attempt

    # Stop whatever is left from a previous run. Note that -f matches every
    # process whose command line contains "vllm".
    pkill -f vllm

    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR="$profile_dir" vllm serve "$MODEL" \
        --disable-log-requests \
        --port 8004 \
        --gpu-memory-utilization "$gpu_memory_utilization" \
        --max-num-seqs "$max_num_seqs" \
        --max-num-batched-tokens "$max_num_batched_tokens" \
        --tensor-parallel-size "$TP" \
        --enable-prefix-caching \
        --load-format dummy \
        --download-dir "$DOWNLOAD_DIR" \
        --max-model-len $(( INPUT_LEN + OUTPUT_LEN )) > "$vllm_log" 2>&1 &
    server_pid=$!

    # wait for up to 10 minutes (60 polls, 10 seconds apart)...
    for attempt in {1..60}; do
        # A server that already exited (e.g. out of memory at startup) will
        # never become healthy, so fail right away instead of polling on.
        if ! kill -0 "$server_pid" 2> /dev/null; then
            echo "server process exited before becoming healthy. Please check server log at $vllm_log."
            return 1
        fi
        # Only the status code matters; the response body is discarded.
        status_code=$(curl -s -o /dev/null -w "%{http_code}" -X GET "http://0.0.0.0:8004/health")
        if [[ "$status_code" == "200" ]]; then
            return 0
        fi
        sleep 10
    done

    echo "server did not start within 10 minutes. Please check server log at $vllm_log".
    return 1
}

#######################################
# Replace the contents of $PROFILE_PATH with the profile of one benchmark run.
# Globals (read):
#   SYSTEM       - "TPU": the selected entry is a directory and its
#                  *.xplane.pb files are copied;
#                  "GPU": the selected entry itself is copied.
#   PROFILE_PATH - destination directory.
# Arguments:
#   $1 - directory holding one entry per profiled run
#   $2 - zero-based position of the wanted entry in the sorted listing of $1
# Returns:
#   0 when the profile was copied; non-zero when nothing could be selected
#   (in that case $PROFILE_PATH is left untouched) or when cp fails.
#######################################
update_best_profile() {
    local profile_dir=$1
    local profile_index=$2
    local -a sorted_paths=()
    local -a selected_files=()
    local entry
    local candidate

    # An empty destination would turn the cleanup below into `rm -f /*`.
    if [[ -z "$PROFILE_PATH" ]]; then
        echo "PROFILE_PATH is not set; profile not saved." >&2
        return 1
    fi
    if [[ ! -d "$profile_dir" ]]; then
        echo "profile directory $profile_dir does not exist; profile not saved." >&2
        return 1
    fi

    # Direct children of profile_dir, sorted by name.
    mapfile -t sorted_paths < <(find "$profile_dir" -mindepth 1 -maxdepth 1 | sort)

    if [[ ! "$profile_index" =~ ^[0-9]+$ ]] || (( profile_index >= ${#sorted_paths[@]} )); then
        echo "no profile with index $profile_index in $profile_dir; profile not saved." >&2
        return 1
    fi
    entry=${sorted_paths[profile_index]}

    case "$SYSTEM" in
        TPU)
            for candidate in "$entry"/*.xplane.pb; do
                # An unmatched glob stays literal; keep real files only.
                if [[ -f "$candidate" ]]; then
                    selected_files+=("$candidate")
                fi
            done
            ;;
        GPU)
            if [[ -f "$entry" ]]; then
                selected_files+=("$entry")
            fi
            ;;
    esac

    if (( ${#selected_files[@]} == 0 )); then
        echo "no profile file found for SYSTEM=$SYSTEM at $entry; profile not saved." >&2
        return 1
    fi

    # Drop the previous best only now that a replacement is known to exist.
    rm -f -- "$PROFILE_PATH"/*
    cp -- "${selected_files[@]}" "$PROFILE_PATH"
}

#######################################
# Run benchmarks/benchmark_serving.py once against the server on port 8004
# and parse the metrics out of its log.
# Globals (read):
#   MODEL, INPUT_LEN, OUTPUT_LEN, MAX_LATENCY_ALLOWED_MS
# Arguments:
#   $1   - value for --request-rate
#   $2   - value for --num-prompts
#   $3   - value for --random-prefix-len
#   $4   - file that receives the benchmark's stdout and stderr
#   $5.. - extra flags appended to the benchmark command line
# Outputs:
#   Assigns throughput, e2el and goodput in the caller's scope; each is empty
#   when the corresponding line is missing from the log.
#######################################
_run_benchmark_once() {
    local rate=$1
    local num_prompts=$2
    local random_prefix_len=$3
    local log_file=$4
    shift 4

    python benchmarks/benchmark_serving.py \
        --backend vllm \
        --model "$MODEL" \
        --dataset-name random \
        --random-input-len "$INPUT_LEN" \
        --random-output-len "$OUTPUT_LEN" \
        --ignore-eos \
        --disable-tqdm \
        --request-rate "$rate" \
        --percentile-metrics ttft,tpot,itl,e2el \
        --goodput "e2el:$MAX_LATENCY_ALLOWED_MS" \
        --num-prompts "$num_prompts" \
        --random-prefix-len "$random_prefix_len" \
        --port 8004 \
        "$@" &> "$log_file"

    throughput=$(grep "Request throughput (req/s):" "$log_file" | sed 's/[^0-9.]//g')
    e2el=$(grep "P99 E2EL (ms):" "$log_file" | awk '{print $NF}')
    goodput=$(grep "Request goodput (req/s):" "$log_file" | sed 's/[^0-9.]//g')
}

#######################################
# Succeed when $1 is a plain decimal number not above MAX_LATENCY_ALLOWED_MS.
# A missing or malformed value (failed benchmark) counts as "not met" and is
# never handed to bc.
#######################################
_meets_latency() {
    local latency=$1
    [[ "$latency" =~ ^[0-9]+([.][0-9]+)?$ ]] || return 1
    (( $(echo "$latency <= $MAX_LATENCY_ALLOWED_MS" | bc -l) ))
}

#######################################
# Benchmark one (max_num_seqs, max_num_batched_tokens) combination.
#
# Starts a server, measures with request rate "inf" first and, if the P99
# end-to-end latency is above MAX_LATENCY_ALLOWED_MS, retries with integer
# request rates counting down from int(throughput) + 1 until the latency
# requirement is met or the rate reaches 0. The outcome is appended to
# $RESULT and the best_* globals are updated when the throughput improves.
# Globals:
#   LOG_FOLDER, RESULT, SYSTEM, INPUT_LEN, MIN_CACHE_HIT_PCT,
#   MAX_LATENCY_ALLOWED_MS (read)
#   best_throughput, best_max_num_seqs, best_num_batched_tokens,
#   best_goodput (read and written)
# Arguments:
#   $1 - max_num_seqs
#   $2 - max_num_batched_tokens
#   $3 - gpu_memory_utilization
# Returns:
#   0 always; a server that fails to start is recorded in $RESULT and the
#   combination is skipped.
#######################################
run_benchmark() {
    local max_num_seqs=$1
    local max_num_batched_tokens=$2
    local gpu_memory_utilization=$3
    local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
    local profile_dir="$LOG_FOLDER/profile_${max_num_seqs}_${max_num_batched_tokens}"
    local bm_log_prefix="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate"
    local prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
    local profile_index=0
    local meet_latency_requirement=0
    local request_rate=inf
    local rate_base
    local summary
    local throughput e2el goodput

    echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
    echo "vllm_log: $vllm_log"
    echo
    rm -f "$vllm_log"
    mkdir -p "$profile_dir"
    pkill -f vllm

    echo "starting server..."
    if start_server "$gpu_memory_utilization" "$max_num_seqs" "$max_num_batched_tokens" "$vllm_log" "$profile_dir"; then
        echo "server started."
        echo

        echo "run benchmark test..."
        # get a basic qps by using request-rate inf
        _run_benchmark_once inf 1000 "$prefix_len" "${bm_log_prefix}_inf.txt" --profile
        if _meets_latency "$e2el"; then
            meet_latency_requirement=1
        else
            # start from request-rate as int(throughput) + 1; an unparsable
            # throughput counts as 0 so the search starts at 1.
            rate_base=${throughput%.*}
            [[ "$rate_base" =~ ^[0-9]+$ ]] || rate_base=0
            request_rate=$(( 10#$rate_base + 1 ))
            while (( request_rate > 0 )); do
                # NOTE(review): only the request-rate-inf run above passes
                # --profile, yet the index advances for every later run;
                # confirm a profile entry really exists at this index.
                profile_index=$(( profile_index + 1 ))
                # clear prefix cache
                curl -X POST http://0.0.0.0:8004/reset_prefix_cache
                sleep 5
                _run_benchmark_once "$request_rate" 100 "$prefix_len" "${bm_log_prefix}_${request_rate}.txt"
                if _meets_latency "$e2el"; then
                    meet_latency_requirement=1
                    break
                fi
                request_rate=$(( request_rate - 1 ))
            done
        fi

        # write the results and update the best result.
        if (( meet_latency_requirement )); then
            summary="max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput"
            echo "$summary"
            echo "$summary" >> "$RESULT"
            if [[ "$throughput" =~ ^[0-9]*[.]?[0-9]+$ ]] \
                && (( $(echo "$throughput > $best_throughput" | bc -l) )); then
                best_throughput=$throughput
                best_max_num_seqs=$max_num_seqs
                best_num_batched_tokens=$max_num_batched_tokens
                best_goodput=$goodput
                if [[ "$SYSTEM" == "TPU" ]]; then
                    update_best_profile "$profile_dir/plugins/profile" "$profile_index"
                fi
                if [[ "$SYSTEM" == "GPU" ]]; then
                    update_best_profile "$profile_dir" "$profile_index"
                fi
            fi
        else
            summary="max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
            echo "$summary"
            echo "$summary" >> "$RESULT"
        fi
    else
        # Benchmarking against a server that never came up only produces
        # empty logs, so record the failure and skip this combination.
        summary="server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
        echo "$summary"
        echo "$summary" >> "$RESULT"
        echo
    fi

    echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"

    pkill vllm
    sleep 10
    # separator between two configurations (20 '=' characters, no newline)
    printf '=%.0s' {1..20}
    return 0
}

# Split the space-separated candidate lists into arrays.
read -r -a num_seqs_list <<< "$NUM_SEQS_LIST"
read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST"

# first find out the max gpu-memory-utilization without HBM OOM.
# The probe uses the last entry of each list and walks down from 0.98 in
# steps of 0.01, giving up below 0.9.
gpu_memory_utilization=0.98
find_gpu_memory_utilization=0
while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
    if start_server "$gpu_memory_utilization" "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log"; then
        find_gpu_memory_utilization=1
        break
    fi
    gpu_memory_utilization=$(echo "$gpu_memory_utilization - 0.01" | bc)
    # bc prints values below 1 without the leading zero (".97"); restore it
    # so log file names and messages keep the "0.97" form.
    if [[ "$gpu_memory_utilization" == .* ]]; then
        gpu_memory_utilization="0$gpu_memory_utilization"
    fi
done

if [[ "$find_gpu_memory_utilization" -eq 1 ]]; then
    echo "Using gpu_memory_utilization=$gpu_memory_utilization to serve model."
else
    echo "Cannot find a proper gpu_memory_utilization over 0.9 to serve the model, please check logs in $LOG_FOLDER."
    exit 1
fi

# Benchmark every (max_num_seqs, max_num_batched_tokens) combination.
for num_seqs in "${num_seqs_list[@]}"; do
    for num_batched_tokens in "${num_batched_tokens_list[@]}"; do
        run_benchmark "$num_seqs" "$num_batched_tokens" "$gpu_memory_utilization"
    done
done
echo "finish permutations"

# Report the winner on the console and in the result file.
final_summary="best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
echo "$final_summary"
echo "$final_summary" >> "$RESULT"