Commit bd1e64d6 authored by lizhigong
Browse files

Fix zero-overhead bug when KV cache OOM occurs and sequences are rescheduled

parent aa906d98
...@@ -14,6 +14,7 @@ from typing import Any, Callable, Optional, Union ...@@ -14,6 +14,7 @@ from typing import Any, Callable, Optional, Union
import msgspec import msgspec
import torch import torch
from vllm import envs
from vllm.inputs import SingletonInputs from vllm.inputs import SingletonInputs
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict
...@@ -809,18 +810,20 @@ class SequenceGroup: ...@@ -809,18 +810,20 @@ class SequenceGroup:
def set_last_token_time(self, now: float) -> None: def set_last_token_time(self, now: float) -> None:
"""Sets the last token time for Request level timings.""" """Sets the last token time for Request level timings."""
# If still in prefill phase, assertion fails. if not envs.VLLM_ZERO_OVERHEAD:
assert not self.is_prefill(), ( # If still in prefill phase, assertion fails.
"seq_group.set_last_token_time() should not be called " assert not self.is_prefill(), (
"if the seq_group is in prefill phase.") "seq_group.set_last_token_time() should not be called "
"if the seq_group is in prefill phase.")
self.last_token_latency = now - self.metrics.last_token_time self.last_token_latency = now - self.metrics.last_token_time
self.metrics.last_token_time = now self.metrics.last_token_time = now
def get_last_token_latency(self) -> float: def get_last_token_latency(self) -> float:
"""Returns the latency of the last token.""" """Returns the latency of the last token."""
assert not self.is_prefill(), ( if not envs.VLLM_ZERO_OVERHEAD:
"seq_group.get_last_token_latency() should not be called " assert not self.is_prefill(), (
"if the seq_group is in prefill phase.") "seq_group.get_last_token_latency() should not be called "
"if the seq_group is in prefill phase.")
return self.last_token_latency return self.last_token_latency
def maybe_set_first_token_time(self, time: float) -> None: def maybe_set_first_token_time(self, time: float) -> None:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment