Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinilm
Commits
e6df4acf
Commit
e6df4acf
authored
Feb 27, 2026
by
PanZezhong
Browse files
issue/239 adjust benchmark
parent
e0e5827f
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
12 additions
and
7 deletions
+12
-7
test/bench/bench_vllm.py
test/bench/bench_vllm.py
+12
-7
No files found.
test/bench/bench_vllm.py
View file @
e6df4acf
...
@@ -87,16 +87,19 @@ def run_one_case(
...
@@ -87,16 +87,19 @@ def run_one_case(
request_ids
.
append
(
rid
)
request_ids
.
append
(
rid
)
# ------------------------------------------------------------
# ------------------------------------------------------------
# 2. Run until first decode token appears (prefill timing)
# 2. Run until first decode token appears for all requests (prefill timing)
# ------------------------------------------------------------
# ------------------------------------------------------------
t0
=
time
.
perf_counter
()
t0
=
time
.
perf_counter
()
first_token_seen
=
False
pre_decode
=
0
# some decode tokens can be mixed with prefill batch
pending
=
set
(
f
"req_
{
i
}
"
for
i
in
range
(
batch_size
))
while
not
first_token_seen
:
while
pending
:
outputs
=
engine
.
step
()
outputs
=
engine
.
step
()
for
out
in
outputs
:
for
out
in
outputs
:
if
out
.
outputs
and
len
(
out
.
outputs
[
0
].
token_ids
)
>
0
:
if
len
(
out
.
outputs
[
0
].
token_ids
)
>
0
:
first_token_seen
=
True
if
out
.
request_id
in
pending
:
pending
.
remove
(
out
.
request_id
)
else
:
pre_decode
+=
1
torch
.
cuda
.
synchronize
()
torch
.
cuda
.
synchronize
()
t1
=
time
.
perf_counter
()
t1
=
time
.
perf_counter
()
...
@@ -115,7 +118,9 @@ def run_one_case(
...
@@ -115,7 +118,9 @@ def run_one_case(
decode_end
=
time
.
perf_counter
()
decode_end
=
time
.
perf_counter
()
decode_time
=
decode_end
-
decode_start
decode_time
=
decode_end
-
decode_start
decode_tokens
=
batch_size
*
(
output_len
-
1
)
decode_tokens
=
(
batch_size
*
(
output_len
-
1
)
-
pre_decode
)
# exclude prefill-mixed tokens
return
{
return
{
"batch_size"
:
batch_size
,
"batch_size"
:
batch_size
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment