Commit bd1e64d6 authored by lizhigong
Browse files

fix zero overhead bug when kvcache oom and schedule

parent aa906d98
......@@ -14,6 +14,7 @@ from typing import Any, Callable, Optional, Union
import msgspec
import torch
from vllm import envs
from vllm.inputs import SingletonInputs
from vllm.lora.request import LoRARequest
from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict
......@@ -809,18 +810,20 @@ class SequenceGroup:
def set_last_token_time(self, now: float) -> None:
    """Sets the last token time for Request level timings.

    Stores the time elapsed since the previously recorded last-token time
    in ``self.last_token_latency`` and then records ``now`` as the new
    ``self.metrics.last_token_time``.

    Args:
        now: Current timestamp. It is subtracted from
            ``self.metrics.last_token_time``, so both must come from the
            same clock.

    Raises:
        AssertionError: If the sequence group is still in the prefill
            phase and ``envs.VLLM_ZERO_OVERHEAD`` is not set.
    """
    # The prefill check must be skipped entirely in zero-overhead mode; an
    # unconditional assertion ahead of this guard would defeat it.
    # NOTE(review): per the commit message, zero-overhead scheduling can
    # reach this path after a KV-cache OOM -- confirm with the scheduler.
    if not envs.VLLM_ZERO_OVERHEAD:
        # If still in prefill phase, assertion fails.
        assert not self.is_prefill(), (
            "seq_group.set_last_token_time() should not be called "
            "if the seq_group is in prefill phase.")
    self.last_token_latency = now - self.metrics.last_token_time
    self.metrics.last_token_time = now
def get_last_token_latency(self) -> float:
    """Returns the latency of the last token.

    Returns:
        The value of ``self.last_token_latency``.

    Raises:
        AssertionError: If the sequence group is still in the prefill
            phase and ``envs.VLLM_ZERO_OVERHEAD`` is not set.
    """
    # Only enforce the prefill check outside zero-overhead mode; an
    # unconditional assertion ahead of this guard would defeat it.
    if not envs.VLLM_ZERO_OVERHEAD:
        assert not self.is_prefill(), (
            "seq_group.get_last_token_latency() should not be called "
            "if the seq_group is in prefill phase.")
    return self.last_token_latency
def maybe_set_first_token_time(self, time: float) -> None:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment