Commit bd1e64d6 authored by lizhigong
Browse files

Fix zero-overhead bug when the KV cache hits OOM during scheduling

parent aa906d98
...@@ -14,6 +14,7 @@ from typing import Any, Callable, Optional, Union ...@@ -14,6 +14,7 @@ from typing import Any, Callable, Optional, Union
import msgspec import msgspec
import torch import torch
from vllm import envs
from vllm.inputs import SingletonInputs from vllm.inputs import SingletonInputs
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict
...@@ -809,6 +810,7 @@ class SequenceGroup: ...@@ -809,6 +810,7 @@ class SequenceGroup:
def set_last_token_time(self, now: float) -> None: def set_last_token_time(self, now: float) -> None:
"""Sets the last token time for Request level timings.""" """Sets the last token time for Request level timings."""
if not envs.VLLM_ZERO_OVERHEAD:
# If still in prefill phase, assertion fails. # If still in prefill phase, assertion fails.
assert not self.is_prefill(), ( assert not self.is_prefill(), (
"seq_group.set_last_token_time() should not be called " "seq_group.set_last_token_time() should not be called "
...@@ -818,6 +820,7 @@ class SequenceGroup: ...@@ -818,6 +820,7 @@ class SequenceGroup:
def get_last_token_latency(self) -> float: def get_last_token_latency(self) -> float:
"""Returns the latency of the last token.""" """Returns the latency of the last token."""
if not envs.VLLM_ZERO_OVERHEAD:
assert not self.is_prefill(), ( assert not self.is_prefill(), (
"seq_group.get_last_token_latency() should not be called " "seq_group.get_last_token_latency() should not be called "
"if the seq_group is in prefill phase.") "if the seq_group is in prefill phase.")
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment