fix pending error in zero overhead

e6bbec9e · lizhigong · 2825dacd · e6bbec9e · e6bbec9e
Commit e6bbec9e authored Mar 28, 2025 by lizhigong
Hide whitespace changes
Inline Side-by-side

Showing with 12 additions and 7 deletions

benchmarks/benchmark_serving.py +2 -0

vllm/sequence.py +10 -7

No files found.
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -570,6 +570,8 @@ async def benchmark(
    else:
        print("Initial test run completed. Starting main benchmark run...")

+    time.sleep(0.1) # ZERO_OVERHEAD : sleep and wait the last step in warmup
+    
    if profile:
        print("Starting profiler...")
        profile_input = RequestFuncInput(model=model_id,

--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -315,7 +315,8 @@ class SequenceData(msgspec.Struct,
        effect_offset = self._effective_length - len(self.output_token_ids)
        if effect_offset < 0:
            self._output_token_ids[effect_offset] = token_id
-            self._new_appended_tokens[effect_offset] = token_id
+            if len(self._new_appended_tokens) >= effect_offset * -1:
+                self._new_appended_tokens[effect_offset] = token_id
            self._cached_all_token_ids[effect_offset] = token_id
            self._effective_length += 1

@@ -848,17 +849,19 @@ class SequenceGroup:
    def set_last_token_time(self, now: float) -> None:
        """Sets the last token time for Request level timings."""
        # If still in prefill phase, assertion fails.
-        assert not self.is_prefill(), (
-            "seq_group.set_last_token_time() should not be called "
-            "if the seq_group is in prefill phase.")
+        if not self.seqs[0].zero_overhead:
+            assert not self.is_prefill(), (
+                "seq_group.set_last_token_time() should not be called "
+                "if the seq_group is in prefill phase.")
        self.last_token_latency = now - self.metrics.last_token_time
        self.metrics.last_token_time = now

    def get_last_token_latency(self) -> float:
        """Returns the latency of the last token."""
-        assert not self.is_prefill(), (
-            "seq_group.get_last_token_latency() should not be called "
-            "if the seq_group is in prefill phase.")
+        if not self.seqs[0].zero_overhead:
+            assert not self.is_prefill(), (
+                "seq_group.get_last_token_latency() should not be called "
+                "if the seq_group is in prefill phase.")
        return self.last_token_latency

    def maybe_set_first_token_time(self, time: float) -> None: