Commit e6bbec9e authored by lizhigong
Browse files

fix pending error in zero overhead

parent 2825dacd
...@@ -570,6 +570,8 @@ async def benchmark( ...@@ -570,6 +570,8 @@ async def benchmark(
else: else:
print("Initial test run completed. Starting main benchmark run...") print("Initial test run completed. Starting main benchmark run...")
time.sleep(0.1) # ZERO_OVERHEAD : sleep and wait the last step in warmup
if profile: if profile:
print("Starting profiler...") print("Starting profiler...")
profile_input = RequestFuncInput(model=model_id, profile_input = RequestFuncInput(model=model_id,
......
...@@ -315,7 +315,8 @@ class SequenceData(msgspec.Struct, ...@@ -315,7 +315,8 @@ class SequenceData(msgspec.Struct,
effect_offset = self._effective_length - len(self.output_token_ids) effect_offset = self._effective_length - len(self.output_token_ids)
if effect_offset < 0: if effect_offset < 0:
self._output_token_ids[effect_offset] = token_id self._output_token_ids[effect_offset] = token_id
self._new_appended_tokens[effect_offset] = token_id if len(self._new_appended_tokens) >= effect_offset * -1:
self._new_appended_tokens[effect_offset] = token_id
self._cached_all_token_ids[effect_offset] = token_id self._cached_all_token_ids[effect_offset] = token_id
self._effective_length += 1 self._effective_length += 1
...@@ -848,17 +849,19 @@ class SequenceGroup: ...@@ -848,17 +849,19 @@ class SequenceGroup:
def set_last_token_time(self, now: float) -> None: def set_last_token_time(self, now: float) -> None:
"""Sets the last token time for Request level timings.""" """Sets the last token time for Request level timings."""
# If still in prefill phase, assertion fails. # If still in prefill phase, assertion fails.
assert not self.is_prefill(), ( if not self.seqs[0].zero_overhead:
"seq_group.set_last_token_time() should not be called " assert not self.is_prefill(), (
"if the seq_group is in prefill phase.") "seq_group.set_last_token_time() should not be called "
"if the seq_group is in prefill phase.")
self.last_token_latency = now - self.metrics.last_token_time self.last_token_latency = now - self.metrics.last_token_time
self.metrics.last_token_time = now self.metrics.last_token_time = now
def get_last_token_latency(self) -> float: def get_last_token_latency(self) -> float:
"""Returns the latency of the last token.""" """Returns the latency of the last token."""
assert not self.is_prefill(), ( if not self.seqs[0].zero_overhead:
"seq_group.get_last_token_latency() should not be called " assert not self.is_prefill(), (
"if the seq_group is in prefill phase.") "seq_group.get_last_token_latency() should not be called "
"if the seq_group is in prefill phase.")
return self.last_token_latency return self.last_token_latency
def maybe_set_first_token_time(self, time: float) -> None: def maybe_set_first_token_time(self, time: float) -> None:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment