metrics.py 6.48 KB
Newer Older
1
2
# SPDX-License-Identifier: Apache-2.0

3
4
from dataclasses import dataclass, field
from typing import Optional
5
6

import numpy as np
7
import prometheus_client
8

9
from vllm.config import SpeculativeConfig
10
11
12
13
14
15
16
from vllm.logger import init_logger

logger = init_logger(__name__)


@dataclass
class SpecDecodingStats:
    """Per-step speculative decoding stats from the scheduler.

    Each scheduler step, statistics on spec decoding performance are
    aggregated across requests by the scheduler and returned to the
    frontend in EngineCoreOutputs->SchedulerStats.

    Prefer :meth:`new` over the bare constructor: it pre-sizes
    ``num_accepted_tokens_per_pos`` so that :meth:`observe_draft` can
    index into it. The plain dataclass default is an empty list.
    """

    # Upper bound on accepted tokens per draft (checked in observe_draft).
    num_spec_tokens: int
    # Number of drafts observed so far.
    num_drafts: int = 0
    # Total number of tokens proposed across all observed drafts.
    num_draft_tokens: int = 0
    # Total number of proposed tokens that were accepted.
    num_accepted_tokens: int = 0
    # Entry i counts the drafts in which at least i + 1 tokens were accepted.
    num_accepted_tokens_per_pos: list[int] = field(default_factory=list)

    @classmethod
    def new(cls, num_spec_tokens: int) -> "SpecDecodingStats":
        """Create zeroed stats with one per-position slot per spec token."""
        return cls(num_spec_tokens=num_spec_tokens,
                   num_accepted_tokens_per_pos=[0] * num_spec_tokens)

    def observe_draft(self, num_draft_tokens: int,
                      num_accepted_tokens: int) -> None:
        """Record the outcome of a single draft.

        Args:
            num_draft_tokens: Number of tokens proposed in this draft.
            num_accepted_tokens: Number of those tokens that were accepted;
                must not exceed ``num_spec_tokens``.

        Raises:
            AssertionError: If ``num_accepted_tokens`` exceeds
                ``num_spec_tokens``.
        """
        # Validate before mutating so that a failed check cannot leave the
        # counters partially updated.
        assert num_accepted_tokens <= self.num_spec_tokens

        self.num_drafts += 1
        self.num_draft_tokens += num_draft_tokens
        self.num_accepted_tokens += num_accepted_tokens
        # Every position up to the accepted count gets credit for this draft.
        for pos in range(num_accepted_tokens):
            self.num_accepted_tokens_per_pos[pos] += 1

43

44
45
class SpecDecodingLogging:
    """Aggregate and log spec decoding metrics.

    Per-iteration stats are accumulated with observe() over a logging
    interval; log() then emits a single summary line and resets the
    accumulated state.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Discard everything accumulated since the last log() call."""
        self.num_drafts: list[int] = []
        self.num_draft_tokens: list[int] = []
        self.num_accepted_tokens: list[int] = []
        self.accepted_tokens_per_pos_lists: list[list[int]] = []

    def observe(self, spec_decoding_stats: "SpecDecodingStats"):
        """Append one step's stats to the current logging window.

        Args:
            spec_decoding_stats: Stats for one step; its ``num_drafts``,
                ``num_draft_tokens``, ``num_accepted_tokens`` and
                ``num_accepted_tokens_per_pos`` attributes are read.
        """
        self.num_drafts.append(spec_decoding_stats.num_drafts)
        self.num_draft_tokens.append(spec_decoding_stats.num_draft_tokens)
        self.num_accepted_tokens.append(
            spec_decoding_stats.num_accepted_tokens)
        self.accepted_tokens_per_pos_lists.append(
            spec_decoding_stats.num_accepted_tokens_per_pos)

    def log(self, log_fn=logger.info):
        """Log a summary of the current window, then reset.

        Does nothing (and does not reset) when nothing has been observed.

        Args:
            log_fn: Callable invoked with a %-style format string followed
                by its arguments.
        """
        if not self.num_drafts:
            return

        num_drafts = np.sum(self.num_drafts)
        num_draft_tokens = np.sum(self.num_draft_tokens)
        num_accepted_tokens = np.sum(self.num_accepted_tokens)
        nan = float("nan")

        draft_acceptance_rate = (num_accepted_tokens / num_draft_tokens *
                                 100 if num_draft_tokens > 0 else nan)

        # One row per observed step, one column per draft position.
        pos_matrix = np.array(self.accepted_tokens_per_pos_lists)
        accepted_per_pos = np.sum(pos_matrix, axis=0)

        if num_drafts > 0:
            # Conventionally, mean acceptance length includes the bonus token
            mean_acceptance_length = 1 + (num_accepted_tokens / num_drafts)
            acceptance_rates = accepted_per_pos / num_drafts
        else:
            # Steps were observed but none contained a draft: report NaN
            # explicitly rather than dividing by zero, which would emit
            # NumPy RuntimeWarnings.
            mean_acceptance_length = nan
            acceptance_rates = np.full(np.shape(accepted_per_pos), nan)
        rates_str = ", ".join(f"{p:.3f}" for p in acceptance_rates)

        log_fn(
            "SpecDecoding metrics: "
            "Draft acceptance rate: %.1f%%, "
            "Mean acceptance length: %.2f, "
            "Accepted: %d tokens, "
            "Drafted: %d tokens, "
            "Per-position acceptance rate: %s",
            draft_acceptance_rate,
            mean_acceptance_length,
            num_accepted_tokens,
            num_draft_tokens,
            rates_str,
        )
        self.reset()
100
101
102
103
104
105
106
107
108
109


class SpecDecodingProm:
    """Record spec decoding metrics in Prometheus.

    The Python Prometheus client exposes counter samples with a
    ``_total`` suffix, so every counter below is queried under a name
    ending in ``_total``.

    The acceptance rate can be calculated using a PromQL query:

      rate(vllm:spec_decode_num_accepted_tokens_total[$interval]) /
      rate(vllm:spec_decode_num_draft_tokens_total[$interval])

    The mean acceptance length (conventionally including bonus tokens)
    can be calculated using:

      1 + (
      rate(vllm:spec_decode_num_accepted_tokens_total[$interval]) /
      rate(vllm:spec_decode_num_drafts_total[$interval]))

    A per-position acceptance rate vector can be computed using

      rate(vllm:spec_decode_num_accepted_tokens_per_pos_total[$interval])
      / ignoring(position) group_left
      rate(vllm:spec_decode_num_drafts_total[$interval])
    """

    def __init__(self, speculative_config: Optional["SpeculativeConfig"],
                 labelnames: list[str], labelvalues: list[str]):
        """Create the counters, or nothing when spec decoding is disabled.

        Args:
            speculative_config: Speculative decoding configuration, or None
                when spec decoding is disabled. Only its
                ``num_speculative_tokens`` attribute is read.
            labelnames: Label names attached to every counter.
            labelvalues: Values for ``labelnames``, in the same order.
        """
        self.spec_decoding_enabled = speculative_config is not None
        if speculative_config is None:
            # Disabled: no counters are registered and observe() is a no-op.
            return

        self.counter_spec_decode_num_drafts = \
            prometheus_client.Counter(
                name="vllm:spec_decode_num_drafts_total",
                documentation="Number of spec decoding drafts.",
                labelnames=labelnames).labels(*labelvalues)
        self.counter_spec_decode_num_draft_tokens = \
            prometheus_client.Counter(
                name="vllm:spec_decode_num_draft_tokens_total",
                documentation="Number of draft tokens.",
                labelnames=labelnames).labels(*labelvalues)
        self.counter_spec_decode_num_accepted_tokens = \
            prometheus_client.Counter(
                name="vllm:spec_decode_num_accepted_tokens_total",
                documentation="Number of accepted tokens.",
                labelnames=labelnames).labels(*labelvalues)

        # The per-position counter carries one extra "position" label; a
        # labelled child is created up front for each draft position.
        num_spec_tokens = speculative_config.num_speculative_tokens
        pos_labelnames = labelnames + ["position"]
        base_counter = prometheus_client.Counter(
            name="vllm:spec_decode_num_accepted_tokens_per_pos",
            documentation="Accepted tokens per draft position.",
            labelnames=pos_labelnames)
        self.counter_spec_decode_num_accepted_tokens_per_pos: \
            list[prometheus_client.Counter] = []
        for pos in range(num_spec_tokens):
            pos_labelvalues = labelvalues + [str(pos)]
            self.counter_spec_decode_num_accepted_tokens_per_pos.append(
                base_counter.labels(*pos_labelvalues))

    def observe(self, spec_decoding_stats: "SpecDecodingStats"):
        """Add one step's stats to the counters.

        Does nothing when spec decoding is disabled.

        Args:
            spec_decoding_stats: Stats for one step; its ``num_drafts``,
                ``num_draft_tokens``, ``num_accepted_tokens`` and
                ``num_accepted_tokens_per_pos`` attributes are read.
        """
        if not self.spec_decoding_enabled:
            return
        self.counter_spec_decode_num_drafts.inc(spec_decoding_stats.num_drafts)
        self.counter_spec_decode_num_draft_tokens.inc(
            spec_decoding_stats.num_draft_tokens)
        self.counter_spec_decode_num_accepted_tokens.inc(
            spec_decoding_stats.num_accepted_tokens)
        # Indexed by position so that a per-position list shorter than the
        # configured number of spec tokens fails loudly instead of being
        # silently truncated.
        for pos, counter in enumerate(
                self.counter_spec_decode_num_accepted_tokens_per_pos):
            counter.inc(spec_decoding_stats.num_accepted_tokens_per_pos[pos])