Commit e6bbec9e authored by lizhigong
Browse files

fix pending error in zero overhead

parent 2825dacd
......@@ -570,6 +570,8 @@ async def benchmark(
else:
print("Initial test run completed. Starting main benchmark run...")
time.sleep(0.1) # ZERO_OVERHEAD : sleep and wait the last step in warmup
if profile:
print("Starting profiler...")
profile_input = RequestFuncInput(model=model_id,
......
......@@ -315,6 +315,7 @@ class SequenceData(msgspec.Struct,
effect_offset = self._effective_length - len(self.output_token_ids)
if effect_offset < 0:
self._output_token_ids[effect_offset] = token_id
if len(self._new_appended_tokens) >= effect_offset * -1:
self._new_appended_tokens[effect_offset] = token_id
self._cached_all_token_ids[effect_offset] = token_id
self._effective_length += 1
......@@ -848,6 +849,7 @@ class SequenceGroup:
def set_last_token_time(self, now: float) -> None:
"""Sets the last token time for Request level timings."""
# If still in prefill phase, assertion fails.
if not self.seqs[0].zero_overhead:
assert not self.is_prefill(), (
"seq_group.set_last_token_time() should not be called "
"if the seq_group is in prefill phase.")
......@@ -856,6 +858,7 @@ class SequenceGroup:
def get_last_token_latency(self) -> float:
"""Returns the latency of the last token."""
if not self.seqs[0].zero_overhead:
assert not self.is_prefill(), (
"seq_group.get_last_token_latency() should not be called "
"if the seq_group is in prefill phase.")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment