Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ee5ad8d2
Unverified
Commit
ee5ad8d2
authored
Jun 23, 2025
by
Chenyaaang
Committed by
GitHub
Jun 24, 2025
Browse files
[Misc][Tools][Benchmark] Add profile to autotune script (#19711)
Signed-off-by:
Chenyaaang
<
chenyangli@google.com
>
parent
a738dbb2
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
37 additions
and
5 deletions
+37
-5
benchmarks/auto_tune.sh
benchmarks/auto_tune.sh
+37
-5
No files found.
benchmarks/auto_tune.sh
View file @
ee5ad8d2
...
@@ -10,6 +10,7 @@
...
@@ -10,6 +10,7 @@
# 3. Set variables (ALL REQUIRED)
# 3. Set variables (ALL REQUIRED)
# BASE: your directory for vllm repo
# BASE: your directory for vllm repo
# MODEL: the model served by vllm
# MODEL: the model served by vllm
# SYSTEM: the hardware, either TPU or GPU; on other systems, "get best profile" might not be supported.
# TP: ways of tensor parallelism
# TP: ways of tensor parallelism
# DOWNLOAD_DIR: directory to download and load model weights.
# DOWNLOAD_DIR: directory to download and load model weights.
# INPUT_LEN: request input len
# INPUT_LEN: request input len
...
@@ -34,6 +35,7 @@
...
@@ -34,6 +35,7 @@
TAG
=
$(
date
+
"%Y_%m_%d_%H_%M"
)
TAG
=
$(
date
+
"%Y_%m_%d_%H_%M"
)
BASE
=
""
BASE
=
""
MODEL
=
"meta-llama/Llama-3.1-8B-Instruct"
MODEL
=
"meta-llama/Llama-3.1-8B-Instruct"
SYSTEM
=
"TPU"
TP
=
1
TP
=
1
DOWNLOAD_DIR
=
""
DOWNLOAD_DIR
=
""
INPUT_LEN
=
4000
INPUT_LEN
=
4000
...
@@ -45,12 +47,15 @@ NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
...
@@ -45,12 +47,15 @@ NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
LOG_FOLDER
=
"
$BASE
/auto-benchmark/
$TAG
"
LOG_FOLDER
=
"
$BASE
/auto-benchmark/
$TAG
"
RESULT
=
"
$LOG_FOLDER
/result.txt"
RESULT
=
"
$LOG_FOLDER
/result.txt"
PROFILE_PATH
=
"
$LOG_FOLDER
/profile"
echo
"result file:
$RESULT
"
echo
"result file:
$RESULT
"
echo
"model:
$MODEL
"
echo
"model:
$MODEL
"
rm
-rf
$LOG_FOLDER
rm
-rf
$LOG_FOLDER
rm
-rf
$PROFILE_PATH
mkdir
-p
$LOG_FOLDER
mkdir
-p
$LOG_FOLDER
mkdir
-p
$PROFILE_PATH
cd
"
$BASE
/vllm"
cd
"
$BASE
/vllm"
...
@@ -70,10 +75,11 @@ start_server() {
...
@@ -70,10 +75,11 @@ start_server() {
local
max_num_seqs
=
$2
local
max_num_seqs
=
$2
local
max_num_batched_tokens
=
$3
local
max_num_batched_tokens
=
$3
local
vllm_log
=
$4
local
vllm_log
=
$4
local
profile_dir
=
$5
pkill
-f
vllm
pkill
-f
vllm
VLLM_USE_V1
=
1
VLLM_SERVER_DEV_MODE
=
1 vllm serve
$MODEL
\
VLLM_USE_V1
=
1
VLLM_SERVER_DEV_MODE
=
1
VLLM_TORCH_PROFILER_DIR
=
$profile_dir
vllm serve
$MODEL
\
--disable-log-requests
\
--disable-log-requests
\
--port
8004
\
--port
8004
\
--gpu-memory-utilization
$gpu_memory_utilization
\
--gpu-memory-utilization
$gpu_memory_utilization
\
...
@@ -105,19 +111,37 @@ start_server() {
...
@@ -105,19 +111,37 @@ start_server() {
fi
fi
}
}
#######################################
# Replace the contents of $PROFILE_PATH with the profile captured by one
# benchmark run.
#
# The direct children of the profile directory are listed and sorted; the
# entry at position `profile_index` (0-based) is taken as the profile to keep.
# NOTE(review): this presumes the sorted order matches the order in which the
# profiles were captured - confirm against the profiler's naming scheme.
#
# Globals:
#   SYSTEM        (read) "TPU" or "GPU"; any other value is rejected.
#   PROFILE_PATH  (read) directory whose files are replaced by the selection.
# Arguments:
#   $1 - directory whose direct children are the captured profiles
#   $2 - 0-based index into the sorted list of those children
# Outputs:
#   Diagnostics on stderr when nothing can be selected.
# Returns:
#   0 on success; non-zero if the selection is invalid (in which case the
#   existing contents of $PROFILE_PATH are left untouched) or if cp fails.
#######################################
update_best_profile() {
    local profile_dir=$1
    local profile_index=$2
    local -a sorted_paths=()
    local -a selected_files=()
    local entry selected_path

    # Guard the destructive `rm` below: an empty PROFILE_PATH would otherwise
    # turn the glob into `/*`.
    if [[ -z "${PROFILE_PATH:-}" ]]; then
        echo "update_best_profile: PROFILE_PATH is not set" >&2
        return 1
    fi
    if [[ ! "$profile_index" =~ ^[0-9]+$ ]]; then
        echo "update_best_profile: invalid profile index '$profile_index'" >&2
        return 1
    fi
    # Force base 10 so an index with leading zeros is not parsed as octal.
    profile_index=$((10#$profile_index))

    # Read one path per line so names containing spaces stay intact
    # (names containing newlines are still not supported).
    while IFS= read -r entry; do
        sorted_paths+=("$entry")
    done < <(find "$profile_dir" -mindepth 1 -maxdepth 1 | sort)

    if (( profile_index >= ${#sorted_paths[@]} )); then
        echo "update_best_profile: no profile at index $profile_index in '$profile_dir' (found ${#sorted_paths[@]})" >&2
        return 1
    fi
    selected_path=${sorted_paths[profile_index]}

    case "${SYSTEM:-}" in
        TPU)
            # On TPU the selected entry is a directory; keep its *.xplane.pb files.
            selected_files=("$selected_path"/*.xplane.pb)
            ;;
        GPU)
            # On GPU the selected entry itself is copied.
            selected_files=("$selected_path")
            ;;
        *)
            echo "update_best_profile: unsupported SYSTEM '${SYSTEM:-}'" >&2
            return 1
            ;;
    esac

    # An unmatched glob stays literal (or expands to nothing under nullglob);
    # either way there is nothing to copy, so keep the previous best profile.
    if [[ ! -e "${selected_files[0]:-}" ]]; then
        echo "update_best_profile: no profile files found under '$selected_path'" >&2
        return 1
    fi

    # Only now that the replacement is known to exist, drop the old profile.
    rm -f -- "$PROFILE_PATH"/*
    cp -- "${selected_files[@]}" "$PROFILE_PATH"
}
run_benchmark
()
{
run_benchmark
()
{
local
max_num_seqs
=
$1
local
max_num_seqs
=
$1
local
max_num_batched_tokens
=
$2
local
max_num_batched_tokens
=
$2
local
gpu_memory_utilization
=
$3
local
gpu_memory_utilization
=
$3
echo
"max_num_seq:
$max_num_seqs
, max_num_batched_tokens:
$max_num_batched_tokens
"
echo
"max_num_seq:
$max_num_seqs
, max_num_batched_tokens:
$max_num_batched_tokens
"
local
vllm_log
=
"
$LOG_FOLDER
/vllm_log_
${
max_num_seqs
}
_
${
max_num_batched_tokens
}
.txt"
local
vllm_log
=
"
$LOG_FOLDER
/vllm_log_
${
max_num_seqs
}
_
${
max_num_batched_tokens
}
.txt"
local
profile_dir
=
"
$LOG_FOLDER
/profile_
${
max_num_seqs
}
_
${
max_num_batched_tokens
}
"
echo
"vllm_log:
$vllm_log
"
echo
"vllm_log:
$vllm_log
"
echo
echo
rm
-f
$vllm_log
rm
-f
$vllm_log
mkdir
-p
$profile_dir
pkill
-f
vllm
pkill
-f
vllm
local
profile_index
=
0
echo
"starting server..."
echo
"starting server..."
start_server
$gpu_memory_utilization
$max_num_seqs
$max_num_batched_tokens
$vllm_log
start_server
$gpu_memory_utilization
$max_num_seqs
$max_num_batched_tokens
$vllm_log
$profile_dir
result
=
$?
result
=
$?
if
[[
"
$result
"
-eq
1
]]
;
then
if
[[
"
$result
"
-eq
1
]]
;
then
echo
"server failed to start. gpu_memory_utilization:
$gpu_memory_utilization
, max_num_seqs:
$max_num_seqs
, max_num_batched_tokens:
$max_num_batched_tokens
"
echo
"server failed to start. gpu_memory_utilization:
$gpu_memory_utilization
, max_num_seqs:
$max_num_seqs
, max_num_batched_tokens:
$max_num_batched_tokens
"
...
@@ -144,7 +168,8 @@ run_benchmark() {
...
@@ -144,7 +168,8 @@ run_benchmark() {
--goodput
e2el:
$MAX_LATENCY_ALLOWED_MS
\
--goodput
e2el:
$MAX_LATENCY_ALLOWED_MS
\
--num-prompts
1000
\
--num-prompts
1000
\
--random-prefix-len
$prefix_len
\
--random-prefix-len
$prefix_len
\
--port
8004 &>
"
$bm_log
"
--port
8004
\
--profile
&>
"
$bm_log
"
throughput
=
$(
grep
"Request throughput (req/s):"
"
$bm_log
"
|
sed
's/[^0-9.]//g'
)
throughput
=
$(
grep
"Request throughput (req/s):"
"
$bm_log
"
|
sed
's/[^0-9.]//g'
)
e2el
=
$(
grep
"P99 E2EL (ms):"
"
$bm_log
"
|
awk
'{print $NF}'
)
e2el
=
$(
grep
"P99 E2EL (ms):"
"
$bm_log
"
|
awk
'{print $NF}'
)
goodput
=
$(
grep
"Request goodput (req/s):"
"
$bm_log
"
|
sed
's/[^0-9.]//g'
)
goodput
=
$(
grep
"Request goodput (req/s):"
"
$bm_log
"
|
sed
's/[^0-9.]//g'
)
...
@@ -158,6 +183,7 @@ run_benchmark() {
...
@@ -158,6 +183,7 @@ run_benchmark() {
# start from request-rate as int(throughput) + 1
# start from request-rate as int(throughput) + 1
request_rate
=
$((${
throughput
%.*
}
+
1
))
request_rate
=
$((${
throughput
%.*
}
+
1
))
while
((
request_rate
>
0
))
;
do
while
((
request_rate
>
0
))
;
do
profile_index
=
$((
profile_index+1
))
# clear prefix cache
# clear prefix cache
curl
-X
POST http://0.0.0.0:8004/reset_prefix_cache
curl
-X
POST http://0.0.0.0:8004/reset_prefix_cache
sleep
5
sleep
5
...
@@ -195,6 +221,12 @@ run_benchmark() {
...
@@ -195,6 +221,12 @@ run_benchmark() {
best_max_num_seqs
=
$max_num_seqs
best_max_num_seqs
=
$max_num_seqs
best_num_batched_tokens
=
$max_num_batched_tokens
best_num_batched_tokens
=
$max_num_batched_tokens
best_goodput
=
$goodput
best_goodput
=
$goodput
if
[[
"
$SYSTEM
"
==
"TPU"
]]
;
then
update_best_profile
"
$profile_dir
/plugins/profile"
$profile_index
fi
if
[[
"
$SYSTEM
"
==
"GPU"
]]
;
then
update_best_profile
"
$profile_dir
"
$profile_index
fi
fi
fi
else
else
echo
"max_num_seqs:
$max_num_seqs
, max_num_batched_tokens:
$max_num_batched_tokens
does not meet latency requirement
${
MAX_LATENCY_ALLOWED_MS
}
"
echo
"max_num_seqs:
$max_num_seqs
, max_num_batched_tokens:
$max_num_batched_tokens
does not meet latency requirement
${
MAX_LATENCY_ALLOWED_MS
}
"
...
@@ -239,6 +271,6 @@ for num_seqs in "${num_seqs_list[@]}"; do
...
@@ -239,6 +271,6 @@ for num_seqs in "${num_seqs_list[@]}"; do
done
done
done
done
echo
"finish permutations"
echo
"finish permutations"
echo
"best_max_num_seqs:
$best_max_num_seqs
, best_num_batched_tokens:
$best_num_batched_tokens
, best_throughput:
$best_throughput
"
echo
"best_max_num_seqs:
$best_max_num_seqs
, best_num_batched_tokens:
$best_num_batched_tokens
, best_throughput:
$best_throughput
, profile saved in:
$PROFILE_PATH
"
echo
"best_max_num_seqs:
$best_max_num_seqs
, best_num_batched_tokens:
$best_num_batched_tokens
, best_throughput:
$best_throughput
"
>>
"
$RESULT
"
echo
"best_max_num_seqs:
$best_max_num_seqs
, best_num_batched_tokens:
$best_num_batched_tokens
, best_throughput:
$best_throughput
, profile saved in:
$PROFILE_PATH
"
>>
"
$RESULT
"
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment