stats.py 2.58 KB
Newer Older
1
import time
2
from dataclasses import dataclass
3
from typing import TYPE_CHECKING, List
4
5

if TYPE_CHECKING:
6
    from vllm.outputs import RequestOutput
7
    from vllm.v1.engine import EngineCoreOutput
8
9
10
11
12
13
14
15
16
17
18


@dataclass
class SchedulerStats:
    """Scheduler-level counters.

    Both counters default to zero, so a bare ``SchedulerStats()`` is a
    valid "nothing scheduled" value.
    """

    # Request counts. Going by the names these are the requests currently
    # running and currently waiting -- confirm against the scheduler.
    num_running_reqs: int = 0
    num_waiting_reqs: int = 0

    # Cache metrics are present only as disabled placeholders:
    # gpu_cache_usage: float = 0.0
    # gpu_prefix_cache_hit_rate: float = 0.0
19
20


21
22
23
24
25
@dataclass
class RequestStateStats:
    """Stats that need to be tracked across delta updates.

    One instance is carried per request and mutated in place by
    ``IterationStats.update_from_output`` each time new tokens arrive.
    """

    # Running total of generated tokens for this request.
    num_generation_tokens: int = 0
    # Timestamp of the most recent update, stored from ``time.time()`` by
    # ``IterationStats.update_from_output``. The 0.0 default is only a
    # placeholder.
    # NOTE(review): presumably the owner seeds this with a real timestamp
    # before the first output, otherwise the first latency sample would be
    # measured from the epoch -- confirm against the caller.
    last_token_time: float = 0.0
27
28
29
30
31
32
33
34
35
36


@dataclass
class FinishedRequestStats:
    """Token totals recorded for a request once it has finished.

    Instances are created by
    ``IterationStats.update_from_finished_request``.
    """

    # Length of the finished request's prompt, in tokens.
    num_prompt_tokens: int = 0
    # Number of tokens generated for the request over its lifetime.
    num_generation_tokens: int = 0


37
38
39
40
41
42
43
class IterationStats:
    """Stats associated with a single set of EngineCoreOutputs.

    Accumulates token counts, latency samples and finished-request
    records. When ``log_stats`` is False, ``update_from_output`` is a
    no-op.
    """

    def __init__(self, log_stats: bool) -> None:
        """Create an empty accumulator.

        Args:
            log_stats: When False, ``update_from_output`` returns
                immediately without recording or mutating anything.
        """
        self.log_stats = log_stats
        self.num_generation_tokens = 0
        self.num_prompt_tokens = 0
        self.finished_requests: List[FinishedRequestStats] = []
        # Latency samples, each the difference of two time.time()
        # readings (seconds).
        self.time_to_first_tokens_iter: List[float] = []
        self.time_per_output_tokens_iter: List[float] = []

    def update_from_output(self, output: "EngineCoreOutput",
                           is_prefilling: bool, prompt_len: int,
                           request_state_stats: RequestStateStats) -> None:
        """Fold one output into the iteration and per-request stats.

        Args:
            output: Output whose ``new_token_ids`` are counted as newly
                generated tokens.
            is_prefilling: True when this output completes the request's
                prefill. The latency sample is then recorded as
                time-to-first-token and ``prompt_len`` is added to the
                prompt token count; otherwise the sample is recorded as
                time-per-output-token.
            prompt_len: Prompt length in tokens; only used when
                ``is_prefilling`` is True.
            request_state_stats: Per-request state, updated in place
                (generated-token total and last-token timestamp).

        Raises:
            AssertionError: If ``is_prefilling`` is True but the output
                carries no new tokens.
        """
        if not self.log_stats:
            return

        num_new_generation_tokens = len(output.new_token_ids)
        # Wall-clock time is used so the difference below is taken against
        # a timestamp from the same clock.
        # NOTE(review): last_token_time is presumably seeded by the caller
        # with a time.time() value before the first output -- confirm.
        now = time.time()
        last_token_latency = now - request_state_stats.last_token_time

        self.num_generation_tokens += num_new_generation_tokens
        if is_prefilling:
            # This relies on the invariant that EngineCore does
            # not stream outputs for partially completed prefills
            # (scheduler.update_from_output makes EngineCoreOutput
            # iff num_computed_tokens == num_tokens).
            assert num_new_generation_tokens > 0, (
                "prefill output must carry at least one new token")
            self.num_prompt_tokens += prompt_len
            self.time_to_first_tokens_iter.append(last_token_latency)
        else:
            # One sample per output, regardless of how many tokens the
            # output carries.
            self.time_per_output_tokens_iter.append(last_token_latency)

        request_state_stats.num_generation_tokens += num_new_generation_tokens
        request_state_stats.last_token_time = now

    def update_from_finished_request(
            self, request_output: "RequestOutput",
            request_state_stats: RequestStateStats) -> None:
        """Record the token totals of a request that has finished.

        Args:
            request_output: Final output of the request; the length of
                its ``prompt_token_ids`` is stored as the prompt token
                count.
            request_state_stats: Per-request state supplying the total
                number of generated tokens.
        """
        # NOTE(review): unlike update_from_output, this does not consult
        # self.log_stats -- confirm that is intended.
        self.finished_requests.append(
            FinishedRequestStats(len(request_output.prompt_token_ids),
                                 request_state_stats.num_generation_tokens))