Fix zero-overhead bug when the KV cache is OOM and sequences are rescheduled

bd1e64d6 · lizhigong · aa906d98 · bd1e64d6
Commit bd1e64d6 authored Jun 17, 2025 by lizhigong
Hide whitespace changes
Inline Side-by-side

Showing with 10 additions and 7 deletions

vllm/sequence.py vllm/sequence.py +10 -7

No files found.
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -14,6 +14,7 @@ from typing import Any, Callable, Optional, Union
 import msgspec
 import torch

+from vllm import envs
 from vllm.inputs import SingletonInputs
 from vllm.lora.request import LoRARequest
 from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict
@@ -809,18 +810,20 @@ class SequenceGroup:

    def set_last_token_time(self, now: float) -> None:
        """Sets the last token time for Request level timings."""
-        # If still in prefill phase, assertion fails.
-        assert not self.is_prefill(), (
-            "seq_group.set_last_token_time() should not be called "
-            "if the seq_group is in prefill phase.")
+        if not envs.VLLM_ZERO_OVERHEAD:
+            # If still in prefill phase, assertion fails.
+            assert not self.is_prefill(), (
+                "seq_group.set_last_token_time() should not be called "
+                "if the seq_group is in prefill phase.")
        self.last_token_latency = now - self.metrics.last_token_time
        self.metrics.last_token_time = now

    def get_last_token_latency(self) -> float:
        """Returns the latency of the last token."""
-        assert not self.is_prefill(), (
-            "seq_group.get_last_token_latency() should not be called "
-            "if the seq_group is in prefill phase.")
+        if not envs.VLLM_ZERO_OVERHEAD:
+            assert not self.is_prefill(), (
+                "seq_group.get_last_token_latency() should not be called "
+                "if the seq_group is in prefill phase.")
        return self.last_token_latency

    def maybe_set_first_token_time(self, time: float) -> None: