auto_tune.sh 12.2 KB
Newer Older
1
2
#!/bin/bash

3
# This script aims to tune the best server parameter combinations to maximize throughput for given requirement.
4
# See details in README (benchmarks/auto_tune/README.md).
5
6

TAG=$(date +"%Y_%m_%d_%H_%M")
7
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
8
9
10
11
12
13
14
15
16
17
18
19
20
VLLM_LOGGING_LEVEL=${VLLM_LOGGING_LEVEL:-INFO}
BASE=${BASE:-"$SCRIPT_DIR/../../.."}
MODEL=${MODEL:-"meta-llama/Llama-3.1-8B-Instruct"}
SYSTEM=${SYSTEM:-"TPU"}
TP=${TP:-1}
DOWNLOAD_DIR=${DOWNLOAD_DIR:-""}
INPUT_LEN=${INPUT_LEN:-4000}
OUTPUT_LEN=${OUTPUT_LEN:-16}
MAX_MODEL_LEN=${MAX_MODEL_LEN:-4096}
MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}
21
22
23

LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt"
24
PROFILE_PATH="$LOG_FOLDER/profile"
25

26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
echo "====================== AUTO TUNE PARAMETERS ===================="
echo "SCRIPT_DIR=$SCRIPT_DIR"
echo "BASE=$BASE"
echo "MODEL=$MODEL"
echo "SYSTEM=$SYSTEM"
echo "TP=$TP"
echo "DOWNLOAD_DIR=$DOWNLOAD_DIR"
echo "INPUT_LEN=$INPUT_LEN"
echo "OUTPUT_LEN=$OUTPUT_LEN"
echo "MAX_MODEL_LEN=$MAX_MODEL_LEN"
echo "MIN_CACHE_HIT_PCT=$MIN_CACHE_HIT_PCT"
echo "MAX_LATENCY_ALLOWED_MS=$MAX_LATENCY_ALLOWED_MS"
echo "NUM_SEQS_LIST=$NUM_SEQS_LIST"
echo "NUM_BATCHED_TOKENS_LIST=$NUM_BATCHED_TOKENS_LIST"
echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL"
echo "RESULT_FILE=$RESULT"
echo "====================== AUTO TUNEPARAMETERS ===================="
43
44

rm -rf $LOG_FOLDER
45
rm -rf $PROFILE_PATH
46
mkdir -p $LOG_FOLDER
47
mkdir -p $PROFILE_PATH
48
49
50

cd "$BASE/vllm"

51
pip install -q datasets
52
53
54
55
56

current_hash=$(git rev-parse HEAD)
echo "hash:$current_hash" >> "$RESULT"
echo "current_hash: $current_hash"

57
58
59
60
61
62
63
TOTAL_LEN=$((INPUT_LEN + OUTPUT_LEN))
RED='\033[0;31m'
if (( TOTAL_LEN > MAX_MODEL_LEN )); then
    echo -e "${RED}FAILED: INPUT_LEN($INPUT_LEN) + OUTPUT_LEN($OUTPUT_LEN) = $TOTAL_LEN, which is > MAX_MODEL_LEN = $MAX_MODEL_LEN.\033[0m" >&2
    exit 1
fi

64
65
66
67
best_throughput=0
best_max_num_seqs=0
best_num_batched_tokens=0
best_goodput=0
68
best_request_rate=0
69

70
71
72
73
74
start_server() {
    local gpu_memory_utilization=$1
    local max_num_seqs=$2
    local max_num_batched_tokens=$3
    local vllm_log=$4
75
    local profile_dir=$5
76

77
    pkill -if "vllm serve" || true
78

79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
    # Define the common arguments as a bash array.
    # Each argument and its value are separate elements.
    local common_args_array=(
        "$MODEL"
        "--disable-log-requests"
        "--port" "8004"
        "--gpu-memory-utilization" "$gpu_memory_utilization"
        "--max-num-seqs" "$max_num_seqs"
        "--max-num-batched-tokens" "$max_num_batched_tokens"
        "--tensor-parallel-size" "$TP"
        "--enable-prefix-caching"
        "--load-format" "dummy"
        "--download-dir" "$DOWNLOAD_DIR"
        "--max-model-len" "$MAX_MODEL_LEN"
    )

    # Use the array expansion "${common_args_array[@]}"
    # This correctly passes each element as a separate argument.
    if [[ -n "$profile_dir" ]]; then
        # Start server with profiling enabled
99
100
101
        local profile_config_json="{\"profiler\": \"torch\", \"torch_profiler_dir\": \"$profile_dir\"}"
        VLLM_SERVER_DEV_MODE=1 \
            vllm serve --profiler-config "$profile_config_json" "${common_args_array[@]}" > "$vllm_log" 2>&1 &
102
103
    else
        # Start server without profiling
104
        VLLM_SERVER_DEV_MODE=1 \
105
106
            vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
    fi
107
    local server_pid=$!
108

109
110
    # wait for 10 minutes...
    server_started=0
111
    for i in {1..60}; do
112
113
114
115
        # This line checks whether the server is still alive or not,
        # since that we should always have permission to send signal to the server process.
        kill -0 $server_pid 2> /dev/null || break

116
        RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
117
        STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
118
        if [[ "$STATUS_CODE" -eq 200 ]]; then
119
120
121
122
123
124
            server_started=1
            break
        else
            sleep 10
        fi
    done
125

126
    if (( ! server_started )); then
127
        echo "server did not start within 10 minutes or crashed. Please check server log at $vllm_log".
128
        return 1
129
130
    else
        return 0
131
    fi
132
133
134
135
136
137
138
139
140
141
142
}

run_benchmark() {
    local max_num_seqs=$1
    local max_num_batched_tokens=$2
    local gpu_memory_utilization=$3
    echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
    local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
    echo "vllm_log: $vllm_log"
    echo
    rm -f $vllm_log
143
    pkill -if "vllm serve" || true
144
145

    echo "starting server..."
146
147
    # Call start_server without a profile_dir to avoid profiling overhead
    start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log ""
148
149
150
151
152
153
154
    result=$?
    if [[ "$result" -eq 1 ]]; then
        echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
    else
        echo "server started."
    fi
    echo
155

156
157
158
159
160
    echo "run benchmark test..."
    meet_latency_requirement=0
    # get a basic qps by using request-rate inf
    bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
    prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
161
162
    adjusted_input_len=$(( INPUT_LEN - prefix_len ))
    # --profile flag is removed from this call
163
    vllm bench serve \
164
165
        --backend vllm \
        --model $MODEL  \
166
        --dataset-name random \
167
        --random-input-len $adjusted_input_len \
168
        --random-output-len $OUTPUT_LEN \
169
170
171
172
173
        --ignore-eos \
        --disable-tqdm \
        --request-rate inf \
        --percentile-metrics ttft,tpot,itl,e2el \
        --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
174
175
        --num-prompts 1000 \
        --random-prefix-len $prefix_len \
176
        --port 8004 &> "$bm_log"
177
    throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
178
179
180
181
182
    e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
    goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')

    if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
        meet_latency_requirement=1
183
        request_rate=inf
184
185
186
    fi

    if (( ! meet_latency_requirement )); then
187
188
    # start from request-rate as int(throughput) + 1
        request_rate=$((${throughput%.*} + 1))
189
190
191
192
193
        while ((request_rate > 0)); do
            # clear prefix cache
            curl -X POST http://0.0.0.0:8004/reset_prefix_cache
            sleep 5
            bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
194
            vllm bench serve \
195
196
                --backend vllm \
                --model $MODEL  \
197
                --dataset-name random \
198
                --random-input-len $adjusted_input_len \
199
200
                --random-output-len $OUTPUT_LEN \
                --ignore-eos \
201
202
203
204
205
                --disable-tqdm \
                --request-rate $request_rate \
                --percentile-metrics ttft,tpot,itl,e2el \
                --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
                --num-prompts 100 \
206
207
208
                --random-prefix-len $prefix_len \
                --port 8004 &> "$bm_log"
            throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
209
210
211
212
213
214
215
216
217
218
219
            e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
            goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
            if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
                meet_latency_requirement=1
                break
            fi
            request_rate=$((request_rate-1))
        done
    fi
    # write the results and update the best result.
    if ((meet_latency_requirement)); then
220
221
222
223
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput"
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" >> "$RESULT"
        if (( $(echo "$throughput > $best_throughput" | bc -l) )); then
            best_throughput=$throughput
224
225
226
            best_max_num_seqs=$max_num_seqs
            best_num_batched_tokens=$max_num_batched_tokens
            best_goodput=$goodput
227
            best_request_rate=$request_rate
228
229
230
231
232
233
234
235
        fi
    else
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" >> "$RESULT"
    fi

    echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"

236
    pkill -if "vllm serve" || true
237
    sleep 10
238
    echo "===================="
239
240
241
    return 0
}

242
243
244
245
246
247
248
read -r -a num_seqs_list <<< "$NUM_SEQS_LIST"
read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST"

# first find out the max gpu-memory-utilization without HBM OOM.
gpu_memory_utilization=0.98
find_gpu_memory_utilization=0
while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
249
250
    # Pass empty string for profile_dir argument
    start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" ""
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
    result=$?
    if [[ "$result" -eq 0 ]]; then
        find_gpu_memory_utilization=1
        break
    else
        gpu_memory_utilization=$(echo "$gpu_memory_utilization - 0.01" | bc)
    fi
done

if [[ "$find_gpu_memory_utilization" -eq 1 ]]; then
    echo "Using gpu_memory_utilization=$gpu_memory_utilization to serve model."
else
    echo "Cannot find a proper gpu_memory_utilization over 0.9 to serve the model, please check logs in $LOG_FOLDER."
    exit 1
fi
266

267
268
269
for num_seqs in "${num_seqs_list[@]}"; do
    for num_batched_tokens in "${num_batched_tokens_list[@]}"; do
        run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization
270
271
272
    done
done
echo "finish permutations"
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311

# =================================================================================
# FINAL PROFILING RUN FOR THE BEST CONFIGURATION
# =================================================================================
if (( $(echo "$best_throughput > 0" | bc -l) )); then
    echo
    echo "Benchmark tuning finished. Now running profiling on the best configuration found..."
    echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput"
    echo

    vllm_log="$LOG_FOLDER/vllm_log_BEST_PROFILE.txt"
    bm_log="$LOG_FOLDER/bm_log_BEST_PROFILE.txt"

    # Start server with the best params and profiling ENABLED
    echo "Starting server for profiling..."
    start_server $gpu_memory_utilization $best_max_num_seqs $best_num_batched_tokens "$vllm_log" "$PROFILE_PATH"

    # Run benchmark with the best params and the --profile flag
    echo "Running benchmark with profiling..."
    prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
    adjusted_input_len=$(( INPUT_LEN - prefix_len ))
    vllm bench serve \
        --backend vllm \
        --model $MODEL \
        --dataset-name random \
        --random-input-len $adjusted_input_len \
        --random-output-len $OUTPUT_LEN \
        --ignore-eos \
        --disable-tqdm \
        --request-rate $best_request_rate \
        --percentile-metrics ttft,tpot,itl,e2el \
        --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
        --num-prompts 100 \
        --random-prefix-len $prefix_len \
        --port 8004 \
        --profile &> "$bm_log"
else
    echo "No configuration met the latency requirements. Skipping final profiling run."
fi
312
pkill -if "vllm serve" || true
313
314
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"