Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
e6bbec9e
Commit
e6bbec9e
authored
Mar 28, 2025
by
lizhigong
Browse files
fix pending error in zero overhead
parent
2825dacd
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
12 additions
and
7 deletions
+12
-7
benchmarks/benchmark_serving.py
benchmarks/benchmark_serving.py
+2
-0
vllm/sequence.py
vllm/sequence.py
+10
-7
No files found.
benchmarks/benchmark_serving.py
View file @
e6bbec9e
...
...
@@ -570,6 +570,8 @@ async def benchmark(
else
:
print
(
"Initial test run completed. Starting main benchmark run..."
)
time
.
sleep
(
0.1
)
# ZERO_OVERHEAD : sleep and wait the last step in warmup
if
profile
:
print
(
"Starting profiler..."
)
profile_input
=
RequestFuncInput
(
model
=
model_id
,
...
...
vllm/sequence.py
View file @
e6bbec9e
...
...
@@ -315,7 +315,8 @@ class SequenceData(msgspec.Struct,
effect_offset
=
self
.
_effective_length
-
len
(
self
.
output_token_ids
)
if
effect_offset
<
0
:
self
.
_output_token_ids
[
effect_offset
]
=
token_id
self
.
_new_appended_tokens
[
effect_offset
]
=
token_id
if
len
(
self
.
_new_appended_tokens
)
>=
effect_offset
*
-
1
:
self
.
_new_appended_tokens
[
effect_offset
]
=
token_id
self
.
_cached_all_token_ids
[
effect_offset
]
=
token_id
self
.
_effective_length
+=
1
...
...
@@ -848,17 +849,19 @@ class SequenceGroup:
def
set_last_token_time
(
self
,
now
:
float
)
->
None
:
"""Sets the last token time for Request level timings."""
# If still in prefill phase, assertion fails.
assert
not
self
.
is_prefill
(),
(
"seq_group.set_last_token_time() should not be called "
"if the seq_group is in prefill phase."
)
if
not
self
.
seqs
[
0
].
zero_overhead
:
assert
not
self
.
is_prefill
(),
(
"seq_group.set_last_token_time() should not be called "
"if the seq_group is in prefill phase."
)
self
.
last_token_latency
=
now
-
self
.
metrics
.
last_token_time
self
.
metrics
.
last_token_time
=
now
def
get_last_token_latency
(
self
)
->
float
:
"""Returns the latency of the last token."""
assert
not
self
.
is_prefill
(),
(
"seq_group.get_last_token_latency() should not be called "
"if the seq_group is in prefill phase."
)
if
not
self
.
seqs
[
0
].
zero_overhead
:
assert
not
self
.
is_prefill
(),
(
"seq_group.get_last_token_latency() should not be called "
"if the seq_group is in prefill phase."
)
return
self
.
last_token_latency
def
maybe_set_first_token_time
(
self
,
time
:
float
)
->
None
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment