Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
zhaoyu6
sglang
Commits
9ce89bc1
"vscode:/vscode.git/clone" did not exist on "55121d1635f63d041e9645f119dbf524c3ef7c5a"
Unverified
Commit
9ce89bc1
authored
Jun 28, 2024
by
Ying Sheng
Committed by
GitHub
Jun 28, 2024
Browse files
Update benchmark script (#571)
parent
badf3fa0
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
16 additions
and
5 deletions
+16
-5
python/sglang/bench_latency.py
python/sglang/bench_latency.py
+16
-5
No files found.
python/sglang/bench_latency.py
View file @
9ce89bc1
...
@@ -81,6 +81,7 @@ def load_model(server_args, tp_rank):
...
@@ -81,6 +81,7 @@ def load_model(server_args, tp_rank):
nccl_port
=
28888
,
nccl_port
=
28888
,
server_args
=
server_args
,
server_args
=
server_args
,
)
)
print
(
f
"max_total_num_tokens=
{
model_runner
.
max_total_num_tokens
}
"
)
tokenizer
=
get_tokenizer
(
tokenizer
=
get_tokenizer
(
server_args
.
tokenizer_path
,
server_args
.
tokenizer_path
,
tokenizer_mode
=
server_args
.
tokenizer_mode
,
tokenizer_mode
=
server_args
.
tokenizer_mode
,
...
@@ -209,6 +210,7 @@ def latency_test(
...
@@ -209,6 +210,7 @@ def latency_test(
# Load the model
# Load the model
model_runner
,
tokenizer
=
load_model
(
server_args
,
tp_rank
)
model_runner
,
tokenizer
=
load_model
(
server_args
,
tp_rank
)
print
(
f
"max_batch_size=
{
model_runner
.
max_total_num_tokens
//
(
bench_args
.
input_len
+
bench_args
.
output_len
)
}
"
)
# Prepare inputs
# Prepare inputs
reqs
=
prepare_synthetic_inputs
(
bench_args
,
tokenizer
)
reqs
=
prepare_synthetic_inputs
(
bench_args
,
tokenizer
)
...
@@ -221,22 +223,31 @@ def latency_test(
...
@@ -221,22 +223,31 @@ def latency_test(
def run_once(output_len):
    """Run one full benchmark pass (prefill + `output_len` decode steps).

    Prints per-phase latency and throughput via the closure's `rank_print`.
    Relies on enclosing-scope variables: `reqs`, `model_runner`, `bench_args`,
    and the helpers `extend` / `decode`.

    NOTE(review): measured with `time.time()` differences, so all latencies
    are in seconds (earlier log labels said "ms", which was incorrect).
    """
    # Prefill
    torch.cuda.synchronize()
    tot_latency = 0
    tic = time.time()
    next_token_ids, _, batch = extend(reqs, model_runner)
    torch.cuda.synchronize()
    prefill_latency = time.time() - tic
    tot_latency += prefill_latency
    throughput = bench_args.input_len * bench_args.batch_size / prefill_latency
    rank_print(
        f"Prefill. latency: {prefill_latency:6.5f} s, throughput: {throughput:9.2f} token/s"
    )

    # Decode
    for i in range(output_len):
        torch.cuda.synchronize()
        tic = time.time()
        next_token_ids, _ = decode(next_token_ids, batch, model_runner)
        torch.cuda.synchronize()
        latency = time.time() - tic
        tot_latency += latency
        throughput = bench_args.batch_size / latency
        # Only print the first few decode steps to keep the output readable;
        # the aggregate is reported below.
        if i < 5:
            rank_print(
                f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
            )

    # Average decode statistics exclude the prefill phase.
    avg_decode_latency = (tot_latency - prefill_latency) / output_len
    avg_decode_throughput = bench_args.batch_size / avg_decode_latency
    rank_print(
        f"Decode. avg latency: {avg_decode_latency:6.5f} s, avg throughput: {avg_decode_throughput:9.2f} token/s"
    )

    # Total throughput counts both input and output tokens over the whole pass.
    throughput = (
        (bench_args.input_len + bench_args.output_len)
        * bench_args.batch_size
        / tot_latency
    )
    rank_print(
        f"Total. latency: {tot_latency:6.3f} s, throughput: {throughput:9.2f} token/s"
    )
# Warm up
# Warm up
run_once
(
4
)
run_once
(
4
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment