Fix zero-overhead bug triggered by KV cache OOM and scheduling

bd1e64d6 · lizhigong · aa906d98 · bd1e64d6
Commit bd1e64d6 authored Jun 17, 2025 by lizhigong
Show whitespace changes
Inline Side-by-side

Showing with 10 additions and 7 deletions

vllm/sequence.py vllm/sequence.py +10 -7

No files found.
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -14,6 +14,7 @@ from typing import Any, Callable, Optional, Union
 import msgspec
 import torch
+from vllm import envs
 from vllm.inputs import SingletonInputs
 from vllm.lora.request import LoRARequest
 from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict
@@ -809,6 +810,7 @@ class SequenceGroup:
    def set_last_token_time(self, now: float) -> None:
        """Sets the last token time for Request level timings."""
+        if not envs.VLLM_ZERO_OVERHEAD:
            # If still in prefill phase, assertion fails.
            assert not self.is_prefill(), (
                "seq_group.set_last_token_time() should not be called "
@@ -818,6 +820,7 @@ class SequenceGroup:
    def get_last_token_latency(self) -> float:
        """Returns the latency of the last token."""
+        if not envs.VLLM_ZERO_OVERHEAD:
            assert not self.is_prefill(), (
                "seq_group.get_last_token_latency() should not be called "
                "if the seq_group is in prefill phase.")