sglang / Commits / afd411d0

Unverified commit afd411d0, authored Aug 04, 2024 by min-xu-et; committed by GitHub, Aug 04, 2024.
enhance latency test - part 2 (#915)
Parent: e1eae1fd

Showing 1 changed file with 75 additions and 62 deletions: python/sglang/bench_latency.py (+75, -62)
@@ -220,6 +220,68 @@ def correctness_test(
         rank_print(tokenizer.decode(output_ids[i]))


+@torch.inference_mode()
+def latency_test_run_once(
+    model_runner, rank_print, reqs, batch_size, input_len, output_len
+):
+    # Clear the pools.
+    model_runner.req_to_token_pool.clear()
+    model_runner.token_to_kv_pool.clear()
+
+    measurement_results = {
+        "run_name": "before",
+        "batch_size": batch_size,
+        "input_len": input_len,
+        "output_len": output_len,
+    }
+
+    tot_latency = 0
+
+    # Prefill
+    torch.cuda.synchronize()
+    tic = time.time()
+    next_token_ids, _, batch = extend(reqs, model_runner)
+    torch.cuda.synchronize()
+    prefill_latency = time.time() - tic
+    tot_latency += prefill_latency
+    throughput = input_len * batch_size / prefill_latency
+    rank_print(
+        f"Prefill. latency: {prefill_latency:6.5f} s, throughput: {throughput:9.2f} token/s"
+    )
+    measurement_results["prefill_latency"] = prefill_latency
+    measurement_results["prefill_throughput"] = throughput
+
+    # Decode
+    for i in range(output_len):
+        torch.cuda.synchronize()
+        tic = time.time()
+        next_token_ids, _ = decode(next_token_ids, batch, model_runner)
+        torch.cuda.synchronize()
+        latency = time.time() - tic
+        tot_latency += latency
+        throughput = batch_size / latency
+        if i < 5:
+            rank_print(
+                f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
+            )
+    avg_decode_latency = (tot_latency - prefill_latency) / output_len
+    avg_decode_throughput = batch_size / avg_decode_latency
+    rank_print(
+        f"Decode. avg latency: {avg_decode_latency:6.5f} s, avg throughput: {avg_decode_throughput:9.2f} token/s"
+    )
+    measurement_results["avg_decode_latency"] = avg_decode_latency
+    measurement_results["avg_decode_throughput"] = avg_decode_throughput
+
+    throughput = (input_len + output_len) * batch_size / tot_latency
+    rank_print(
+        f"Total. latency: {tot_latency:6.3f} s, throughput: {throughput:9.2f} token/s"
+    )
+    measurement_results["total_latency"] = tot_latency
+    measurement_results["total_throughput"] = throughput
+    return measurement_results
+
+
 def latency_test(
     server_args,
     bench_args,
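The new `latency_test_run_once` brackets each model call with `torch.cuda.synchronize()` before reading the clock, because CUDA kernel launches return before the GPU finishes, so an unsynchronized `time.time()` would measure only launch overhead. A minimal standalone sketch of that timing pattern (the helper name `time_gpu_op` and the matmul workload are illustrative assumptions, not part of this commit):

```python
import time

import torch


def time_gpu_op(fn, *args):
    # Hypothetical helper, not from bench_latency.py.
    # Synchronize before the first timestamp so earlier queued work is
    # drained, and after the call so the interval covers the kernel
    # itself rather than just its asynchronous launch.
    torch.cuda.synchronize()
    tic = time.time()
    out = fn(*args)
    torch.cuda.synchronize()
    return out, time.time() - tic


if __name__ == "__main__":
    x = torch.randn(4096, 4096, device="cuda")
    _, latency = time_gpu_op(torch.matmul, x, x)
    print(f"latency: {latency:6.5f} s")
```

The same idea yields the commit's throughput figures: tokens processed divided by synchronized wall time, e.g. `input_len * batch_size / prefill_latency` for prefill and `batch_size / latency` per decode step.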
@@ -241,72 +303,23 @@ def latency_test(
         bench_args.batch_size, bench_args.input_len
     )

-    def clear():
-        model_runner.req_to_token_pool.clear()
-        model_runner.token_to_kv_pool.clear()
-
-    @torch.inference_mode()
-    def run_once(output_len):
-        measurement_results = {
-            "batch_size": bench_args.batch_size,
-            "output_len": output_len,
-        }
-
-        # Prefill
-        torch.cuda.synchronize()
-        tot_latency = 0
-        tic = time.time()
-        next_token_ids, _, batch = extend(reqs, model_runner)
-        torch.cuda.synchronize()
-        prefill_latency = time.time() - tic
-        tot_latency += prefill_latency
-        throughput = bench_args.input_len * bench_args.batch_size / prefill_latency
-        rank_print(
-            f"Prefill. latency: {prefill_latency:6.5f} s, throughput: {throughput:9.2f} token/s"
-        )
-        measurement_results["prefill_latency"] = prefill_latency
-        measurement_results["prefill_throughput"] = throughput
-
-        # Decode
-        for i in range(output_len):
-            torch.cuda.synchronize()
-            tic = time.time()
-            next_token_ids, _ = decode(next_token_ids, batch, model_runner)
-            torch.cuda.synchronize()
-            latency = time.time() - tic
-            tot_latency += latency
-            throughput = bench_args.batch_size / latency
-            if i < 5:
-                rank_print(
-                    f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
-                )
-        avg_decode_latency = (tot_latency - prefill_latency) / output_len
-        avg_decode_throughput = bench_args.batch_size / avg_decode_latency
-        rank_print(
-            f"Decode. avg latency: {avg_decode_latency:6.5f} s, avg throughput: {avg_decode_throughput:9.2f} token/s"
-        )
-        measurement_results["avg_decode_latency"] = avg_decode_latency
-        measurement_results["avg_decode_throughput"] = avg_decode_throughput
-
-        throughput = (
-            (bench_args.input_len + bench_args.output_len)
-            * bench_args.batch_size
-            / tot_latency
-        )
-        rank_print(
-            f"Total. latency: {tot_latency:6.3f} s, throughput: {throughput:9.2f} token/s"
-        )
-        measurement_results["total_latency"] = tot_latency
-        measurement_results["total_throughput"] = throughput
-        return measurement_results
-
     # Warm up
-    run_once(4)
-    clear()
+    latency_test_run_once(
+        model_runner, rank_print, reqs, bench_args.batch_size, bench_args.input_len, 4
+    )

     # Run again
     result_list = []
-    result_list.append(run_once(bench_args.output_len))
+    result_list.append(
+        latency_test_run_once(
+            model_runner,
+            rank_print,
+            reqs,
+            bench_args.batch_size,
+            bench_args.input_len,
+            bench_args.output_len,
+        )
+    )

     # Write results in jsonlines format.
     if bench_args.result_filename:
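The trailing context shows `result_list` being persisted when `bench_args.result_filename` is set; the body of that branch is elided from this hunk. For reference, jsonlines output of such measurement dicts could look like the sketch below (`write_results_jsonl` is a hypothetical name, not the commit's actual code):

```python
import json


def write_results_jsonl(result_list, filename):
    # Hypothetical sketch, not from bench_latency.py.
    # jsonlines: one JSON object per line, so successive runs can be
    # appended and parsed back line by line.
    with open(filename, "a") as fout:
        for result in result_list:
            fout.write(json.dumps(result) + "\n")


# e.g. write_results_jsonl(result_list, bench_args.result_filename)
```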