Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
309c1bb8
Unverified
Commit
309c1bb8
authored
Aug 04, 2025
by
ericehanley
Committed by
GitHub
Aug 04, 2025
Browse files
[Bug] Update auto_tune.sh to separate benchmarking and profiling. (#21629)
Signed-off-by:
Eric Hanley
<
ericehanley@google.com
>
parent
9af654cc
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
80 additions
and
43 deletions
+80
-43
benchmarks/auto_tune/auto_tune.sh
benchmarks/auto_tune/auto_tune.sh
+80
-43
No files found.
benchmarks/auto_tune/auto_tune.sh
View file @
309c1bb8
...
...
@@ -49,6 +49,7 @@ best_throughput=0
best_max_num_seqs
=
0
best_num_batched_tokens
=
0
best_goodput
=
0
best_request_rate
=
0
start_server
()
{
local
gpu_memory_utilization
=
$1
...
...
@@ -57,18 +58,35 @@ start_server() {
local
vllm_log
=
$4
local
profile_dir
=
$5
pkill
-f
vllm
pkill
-
i
f
vllm
VLLM_USE_V1
=
1
VLLM_SERVER_DEV_MODE
=
1
VLLM_TORCH_PROFILER_DIR
=
$profile_dir
vllm serve
$MODEL
\
--port
8004
\
--gpu-memory-utilization
$gpu_memory_utilization
\
--max-num-seqs
$max_num_seqs
\
--max-num-batched-tokens
$max_num_batched_tokens
\
--tensor-parallel-size
$TP
\
--enable-prefix-caching
\
--load-format
dummy
\
--download-dir
"
$DOWNLOAD_DIR
"
\
--max-model-len
$MAX_MODEL_LEN
>
"
$vllm_log
"
2>&1 &
# Define the common arguments as a bash array.
# Each argument and its value are separate elements.
local
common_args_array
=(
"
$MODEL
"
"--disable-log-requests"
"--port"
"8004"
"--gpu-memory-utilization"
"
$gpu_memory_utilization
"
"--max-num-seqs"
"
$max_num_seqs
"
"--max-num-batched-tokens"
"
$max_num_batched_tokens
"
"--tensor-parallel-size"
"
$TP
"
"--enable-prefix-caching"
"--load-format"
"dummy"
"--download-dir"
"
$DOWNLOAD_DIR
"
"--max-model-len"
"
$MAX_MODEL_LEN
"
)
# Use the array expansion "${common_args_array[@]}"
# This correctly passes each element as a separate argument.
if
[[
-n
"
$profile_dir
"
]]
;
then
# Start server with profiling enabled
VLLM_USE_V1
=
1
VLLM_SERVER_DEV_MODE
=
1
VLLM_TORCH_PROFILER_DIR
=
$profile_dir
\
vllm serve
"
${
common_args_array
[@]
}
"
>
"
$vllm_log
"
2>&1 &
else
# Start server without profiling
VLLM_USE_V1
=
1
VLLM_SERVER_DEV_MODE
=
1
\
vllm serve
"
${
common_args_array
[@]
}
"
>
"
$vllm_log
"
2>&1 &
fi
# wait for 10 minutes...
server_started
=
0
...
...
@@ -82,6 +100,7 @@ start_server() {
sleep
10
fi
done
if
((
!
server_started
))
;
then
echo
"server did not start within 10 minutes. Please check server log at
$vllm_log
"
.
return
1
...
...
@@ -90,37 +109,20 @@ start_server() {
fi
}
update_best_profile
()
{
local
profile_dir
=
$1
local
profile_index
=
$2
sorted_paths
=(
$(
find
"
$profile_dir
"
-maxdepth
1
-not
-path
"
$profile_dir
"
|
sort
)
)
selected_profile_file
=
if
[[
"
$SYSTEM
"
==
"TPU"
]]
;
then
selected_profile_file
=
"
${
sorted_paths
[
$profile_index
]
}
/*.xplane.pb"
fi
if
[[
"
$SYSTEM
"
==
"GPU"
]]
;
then
selected_profile_file
=
"
${
sorted_paths
[
$profile_index
]
}
"
fi
rm
-f
$PROFILE_PATH
/
*
cp
$selected_profile_file
$PROFILE_PATH
}
run_benchmark
()
{
local
max_num_seqs
=
$1
local
max_num_batched_tokens
=
$2
local
gpu_memory_utilization
=
$3
echo
"max_num_seq:
$max_num_seqs
, max_num_batched_tokens:
$max_num_batched_tokens
"
local
vllm_log
=
"
$LOG_FOLDER
/vllm_log_
${
max_num_seqs
}
_
${
max_num_batched_tokens
}
.txt"
local
profile_dir
=
"
$LOG_FOLDER
/profile_
${
max_num_seqs
}
_
${
max_num_batched_tokens
}
"
echo
"vllm_log:
$vllm_log
"
echo
rm
-f
$vllm_log
mkdir
-p
$profile_dir
pkill
-f
vllm
local
profile_index
=
0
pkill
-if
vllm
echo
"starting server..."
start_server
$gpu_memory_utilization
$max_num_seqs
$max_num_batched_tokens
$vllm_log
$profile_dir
# Call start_server without a profile_dir to avoid profiling overhead
start_server
$gpu_memory_utilization
$max_num_seqs
$max_num_batched_tokens
$vllm_log
""
result
=
$?
if
[[
"
$result
"
-eq
1
]]
;
then
echo
"server failed to start. gpu_memory_utilization:
$gpu_memory_utilization
, max_num_seqs:
$max_num_seqs
, max_num_batched_tokens:
$max_num_batched_tokens
"
...
...
@@ -134,7 +136,8 @@ run_benchmark() {
# get a basic qps by using request-rate inf
bm_log
=
"
$LOG_FOLDER
/bm_log_
${
max_num_seqs
}
_
${
max_num_batched_tokens
}
_requestrate_inf.txt"
prefix_len
=
$((
INPUT_LEN
*
MIN_CACHE_HIT_PCT
/
100
))
adjusted_input_len
=
$((
INPUT_LEN
-
prefix_len
))
adjusted_input_len
=
$((
INPUT_LEN
-
prefix_len
))
# --profile flag is removed from this call
vllm bench serve
\
--backend
vllm
\
--model
$MODEL
\
...
...
@@ -148,8 +151,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
--goodput
e2el:
$MAX_LATENCY_ALLOWED_MS
\
--num-prompts
1000
\
--random-prefix-len
$prefix_len
\
--port
8004
\
--profile
&>
"
$bm_log
"
--port
8004 &>
"
$bm_log
"
throughput
=
$(
grep
"Request throughput (req/s):"
"
$bm_log
"
|
sed
's/[^0-9.]//g'
)
e2el
=
$(
grep
"P99 E2EL (ms):"
"
$bm_log
"
|
awk
'{print $NF}'
)
goodput
=
$(
grep
"Request goodput (req/s):"
"
$bm_log
"
|
sed
's/[^0-9.]//g'
)
...
...
@@ -163,7 +165,6 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
# start from request-rate as int(throughput) + 1
request_rate
=
$((${
throughput
%.*
}
+
1
))
while
((
request_rate
>
0
))
;
do
profile_index
=
$((
profile_index+1
))
# clear prefix cache
curl
-X
POST http://0.0.0.0:8004/reset_prefix_cache
sleep
5
...
...
@@ -201,12 +202,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
best_max_num_seqs
=
$max_num_seqs
best_num_batched_tokens
=
$max_num_batched_tokens
best_goodput
=
$goodput
if
[[
"
$SYSTEM
"
==
"TPU"
]]
;
then
update_best_profile
"
$profile_dir
/plugins/profile"
$profile_index
fi
if
[[
"
$SYSTEM
"
==
"GPU"
]]
;
then
update_best_profile
"
$profile_dir
"
$profile_index
fi
best_request_rate
=
$request_rate
fi
else
echo
"max_num_seqs:
$max_num_seqs
, max_num_batched_tokens:
$max_num_batched_tokens
does not meet latency requirement
${
MAX_LATENCY_ALLOWED_MS
}
"
...
...
@@ -215,7 +211,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
echo
"best_max_num_seqs:
$best_max_num_seqs
, best_num_batched_tokens:
$best_num_batched_tokens
, best_throughput:
$best_throughput
"
pkill vllm
pkill
-if
vllm
sleep
10
printf
'=%.0s'
$(
seq
1 20
)
return
0
...
...
@@ -228,7 +224,8 @@ read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST"
gpu_memory_utilization
=
0.98
find_gpu_memory_utilization
=
0
while
((
$(
echo
"
$gpu_memory_utilization
>= 0.9"
| bc
-l
)
))
;
do
start_server
$gpu_memory_utilization
"
${
num_seqs_list
[-1]
}
"
"
${
num_batched_tokens_list
[-1]
}
"
"
$LOG_FOLDER
/vllm_log_gpu_memory_utilization_
$gpu_memory_utilization
.log"
# Pass empty string for profile_dir argument
start_server
$gpu_memory_utilization
"
${
num_seqs_list
[-1]
}
"
"
${
num_batched_tokens_list
[-1]
}
"
"
$LOG_FOLDER
/vllm_log_gpu_memory_utilization_
$gpu_memory_utilization
.log"
""
result
=
$?
if
[[
"
$result
"
-eq
0
]]
;
then
find_gpu_memory_utilization
=
1
...
...
@@ -251,5 +248,45 @@ for num_seqs in "${num_seqs_list[@]}"; do
done
done
echo
"finish permutations"
# =================================================================================
# FINAL PROFILING RUN FOR THE BEST CONFIGURATION
# =================================================================================
if
((
$(
echo
"
$best_throughput
> 0"
| bc
-l
)
))
;
then
echo
echo
"Benchmark tuning finished. Now running profiling on the best configuration found..."
echo
"Best config: max_num_seqs:
$best_max_num_seqs
, max_num_batched_tokens:
$best_num_batched_tokens
, throughput:
$best_throughput
"
echo
vllm_log
=
"
$LOG_FOLDER
/vllm_log_BEST_PROFILE.txt"
bm_log
=
"
$LOG_FOLDER
/bm_log_BEST_PROFILE.txt"
# Start server with the best params and profiling ENABLED
echo
"Starting server for profiling..."
start_server
$gpu_memory_utilization
$best_max_num_seqs
$best_num_batched_tokens
"
$vllm_log
"
"
$PROFILE_PATH
"
# Run benchmark with the best params and the --profile flag
echo
"Running benchmark with profiling..."
prefix_len
=
$((
INPUT_LEN
*
MIN_CACHE_HIT_PCT
/
100
))
adjusted_input_len
=
$((
INPUT_LEN
-
prefix_len
))
vllm bench serve
\
--backend
vllm
\
--model
$MODEL
\
--dataset-name
random
\
--random-input-len
$adjusted_input_len
\
--random-output-len
$OUTPUT_LEN
\
--ignore-eos
\
--disable-tqdm
\
--request-rate
$best_request_rate
\
--percentile-metrics
ttft,tpot,itl,e2el
\
--goodput
e2el:
$MAX_LATENCY_ALLOWED_MS
\
--num-prompts
100
\
--random-prefix-len
$prefix_len
\
--port
8004
\
--profile
&>
"
$bm_log
"
else
echo
"No configuration met the latency requirements. Skipping final profiling run."
fi
pkill
-if
vllm
echo
"best_max_num_seqs:
$best_max_num_seqs
, best_num_batched_tokens:
$best_num_batched_tokens
, best_throughput:
$best_throughput
, profile saved in:
$PROFILE_PATH
"
echo
"best_max_num_seqs:
$best_max_num_seqs
, best_num_batched_tokens:
$best_num_batched_tokens
, best_throughput:
$best_throughput
, profile saved in:
$PROFILE_PATH
"
>>
"
$RESULT
"
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment