# SPDX-License-Identifier: Apache-2.0

3
import time
4
from dataclasses import dataclass, field
5
from typing import TYPE_CHECKING, List
6
7

if TYPE_CHECKING:
8
    from vllm.outputs import RequestOutput
9
    from vllm.v1.engine import EngineCoreOutput, FinishReason
10
11


12
13
14
15
16
17
18
19
20
21
22
23
24
25
@dataclass
class PrefixCacheStats:
    """Prefix cache hit counters gathered for a single stats update."""

    # True when reset_prefix_cache was invoked.
    reset: bool = False

    # How many requests this update covers.
    requests: int = 0

    # How many cache queries those requests issued. A "query" is counted
    # per block looked up in the cache, not per request.
    queries: int = 0

    # How many of those queries were hits.
    hits: int = 0


26
27
28
29
30
31
32
@dataclass
class SchedulerStats:
    """Stats associated with the scheduler."""

    # Counts of running and waiting requests; both default to zero.
    num_running_reqs: int = 0
    num_waiting_reqs: int = 0

    # GPU cache usage; defaults to 0.0.
    # NOTE(review): presumably a fraction in [0, 1] -- confirm against the
    # code that populates this field.
    gpu_cache_usage: float = 0.0

    # default_factory builds a fresh PrefixCacheStats per instance, so
    # instances never share one mutable default object.
    prefix_cache_stats: PrefixCacheStats = field(
        default_factory=PrefixCacheStats)
37
38


39
40
41
42
43
@dataclass
class RequestStateStats:
    """Stats that need to be tracked across delta updates."""

    # Running total of generated tokens; IterationStats.update_from_output
    # adds the number of new tokens from each output to it.
    num_generation_tokens: int = 0

    # Timestamp of the most recent update; IterationStats.update_from_output
    # overwrites it with time.time() and uses the previous value to compute
    # the latency of the current output.
    last_token_time: float = 0.0
45
46
47
48
49
50


@dataclass
class FinishedRequestStats:
    """Stats associated with a finished request."""

    # Why the request finished. Required: this is the only field without a
    # default, so it must be supplied at construction.
    finish_reason: "FinishReason"

    # Token counts for the request; both default to zero.
    num_prompt_tokens: int = 0
    num_generation_tokens: int = 0


56
57
58
59
60
61
62
class IterationStats:
    """Stats associated with a single set of EngineCoreOutputs."""

    def __init__(self, log_stats: bool):
        """Start an empty set of per-iteration counters.

        Args:
            log_stats: When False, update_from_output() returns immediately
                without recording anything.
        """
        self.log_stats = log_stats
        self.num_generation_tokens = 0
        self.num_prompt_tokens = 0
        self.finished_requests: List[FinishedRequestStats] = []
        self.time_to_first_tokens_iter: List[float] = []
        self.time_per_output_tokens_iter: List[float] = []

    def update_from_output(self, output: "EngineCoreOutput",
                           is_prefilling: bool, prompt_len: int,
                           request_state_stats: RequestStateStats) -> None:
        """Fold one engine-core output into the iteration and request stats.

        Args:
            output: The output to account for; only ``new_token_ids`` is
                read.
            is_prefilling: Whether the request is still prefilling. Selects
                between recording a time-to-first-token sample and a
                time-per-output-token sample.
            prompt_len: Prompt length, added to ``num_prompt_tokens`` when a
                prefilling output carries new tokens.
            request_state_stats: Per-request state. Its
                ``num_generation_tokens`` and ``last_token_time`` are
                updated in place.
        """
        if not self.log_stats:
            return

        num_new_generation_tokens = len(output.new_token_ids)
        now = time.time()
        # Latency is measured against the previous value of last_token_time,
        # which is overwritten with `now` at the end of this method.
        last_token_latency = now - request_state_stats.last_token_time

        self.num_generation_tokens += num_new_generation_tokens
        if is_prefilling:
            # TODO(andy): we used to assert that num_new_generation_tokens
            # > 0 with an invariant that EngineCore does not stream outputs
            # for partially completed prefills (scheduler.update_from_output
            # makes EngineCoreOutput iff num_computed_tokens == num_tokens).
            # When prompt logprobs are enabled, we currently stream out the
            # partially completed prompt.
            # This will be reverted in a follow up PR and we should re-enable
            # this assertion / invariant.
            if num_new_generation_tokens > 0:
                self.num_prompt_tokens += prompt_len
                self.time_to_first_tokens_iter.append(last_token_latency)
        else:
            self.time_per_output_tokens_iter.append(last_token_latency)

        # The per-request state is updated on every logged output, including
        # a prefilling output that carried no new tokens.
        request_state_stats.num_generation_tokens += num_new_generation_tokens
        request_state_stats.last_token_time = now

    def update_from_finished_request(
            self, finish_reason: "FinishReason",
            request_output: "RequestOutput",
            request_state_stats: RequestStateStats) -> None:
        """Record a FinishedRequestStats entry for a completed request.

        Unlike update_from_output(), this method does not check
        ``log_stats``; an entry is appended on every call.

        Args:
            finish_reason: Why the request finished.
            request_output: The request's output; the length of its
                ``prompt_token_ids`` becomes the prompt token count.
            request_state_stats: Per-request state supplying the
                generation token count.
        """
        self.finished_requests.append(
            FinishedRequestStats(finish_reason,
                                 len(request_output.prompt_token_ids),
                                 request_state_stats.num_generation_tokens))