"examples/deployments/router_standalone_trtllm/__init__.py" did not exist on "ba3ac23560cb4a986b0e26c87162b68a778da286"
metrics.py 6.51 KB
Newer Older
1
2
# SPDX-License-Identifier: Apache-2.0

3
4
from dataclasses import dataclass, field
from typing import Optional
5
6

import numpy as np
7
import prometheus_client
8

9
from vllm.config import SpeculativeConfig
10
11
12
13
14
15
16
from vllm.logger import init_logger

# Module-level logger; its info() method is the default sink used by
# SpecDecodingLogging.log().
logger = init_logger(__name__)


@dataclass
class SpecDecodingStats:
    """Per-step iteration decoding stats from scheduler.

    Each scheduler step, statistics on spec decoding performance are
    aggregated across requests by the scheduler and returned to the
    frontend in EngineCoreOutputs->SchedulerStats.

    Prefer :meth:`new` over the raw constructor: it sizes the
    per-position counters to ``num_spec_tokens``, which
    :meth:`observe_draft` relies on when indexing them.
    """

    # Upper bound on the accepted tokens a single draft may report.
    num_spec_tokens: int
    # Number of drafts recorded via observe_draft().
    num_drafts: int = 0
    # Running totals over all recorded drafts.
    num_draft_tokens: int = 0
    num_accepted_tokens: int = 0
    # Entry i counts the recorded drafts that accepted more than i tokens.
    num_accepted_tokens_per_pos: list[int] = field(default_factory=list)

    @classmethod
    def new(cls, num_spec_tokens: int) -> "SpecDecodingStats":
        """Return zeroed stats with one per-position counter per token."""
        return cls(num_spec_tokens=num_spec_tokens,
                   num_accepted_tokens_per_pos=[0] * num_spec_tokens)

    def observe_draft(self, num_draft_tokens: int,
                      num_accepted_tokens: int) -> None:
        """Record the outcome of one draft.

        Args:
            num_draft_tokens: Number of tokens proposed in the draft.
            num_accepted_tokens: Number of those tokens that were
                accepted; must not exceed ``num_spec_tokens``.

        Raises:
            AssertionError: If ``num_accepted_tokens`` exceeds
                ``num_spec_tokens``. No counter is modified in that case.
        """
        # Check the invariant before touching any counter so that a
        # violation cannot leave the stats partially updated.
        assert num_accepted_tokens <= self.num_spec_tokens

        self.num_drafts += 1
        self.num_draft_tokens += num_draft_tokens
        self.num_accepted_tokens += num_accepted_tokens
        # Every position below the accepted count was accepted.
        for pos in range(num_accepted_tokens):
            self.num_accepted_tokens_per_pos[pos] += 1

43

44
45
class SpecDecodingLogging:
    """Aggregate and log spec decoding metrics.

    LoggingStatLogger aggregates per-iteration metrics over a set
    time interval using observe() and then logs them using log()
    before resetting to zero.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Drop every observation accumulated so far."""
        # One entry per observe() call in each of these lists.
        self.num_drafts: list[int] = []
        self.num_draft_tokens: list[int] = []
        self.num_accepted_tokens: list[int] = []
        self.accepted_tokens_per_pos_lists: list[list[int]] = []

    def observe(self, spec_decoding_stats: SpecDecodingStats):
        """Append one step's stats to the pending observations."""
        self.num_drafts.append(spec_decoding_stats.num_drafts)
        self.num_draft_tokens.append(spec_decoding_stats.num_draft_tokens)
        self.num_accepted_tokens.append(
            spec_decoding_stats.num_accepted_tokens)
        self.accepted_tokens_per_pos_lists.append(
            spec_decoding_stats.num_accepted_tokens_per_pos)

    def log(self, log_fn=logger.info):
        """Log a summary of the pending observations, then reset.

        Does nothing (and does not reset) when nothing has been observed.

        Args:
            log_fn: Callable taking a %-style format string followed by
                its arguments.
        """
        if not self.num_drafts:
            return

        # Convert to plain ints so the zero checks below decide the
        # result instead of NumPy's warn-and-return-nan/inf division.
        num_drafts = int(np.sum(self.num_drafts))
        num_draft_tokens = int(np.sum(self.num_draft_tokens))
        num_accepted_tokens = int(np.sum(self.num_accepted_tokens))

        draft_acceptance_rate = (num_accepted_tokens / num_draft_tokens *
                                 100 if num_draft_tokens > 0 else float("nan"))

        pos_matrix = np.array(self.accepted_tokens_per_pos_lists)
        accepted_per_pos = np.sum(pos_matrix, axis=0)

        if num_drafts > 0:
            # Conventionally, mean acceptance length includes the bonus
            # token
            mean_acceptance_length = 1 + (num_accepted_tokens / num_drafts)
            acceptance_rates = accepted_per_pos / num_drafts
        else:
            # Observations exist but none contained a draft: the ratios
            # are undefined, so report NaN rather than divide by zero.
            mean_acceptance_length = float("nan")
            acceptance_rates = np.full_like(accepted_per_pos,
                                            np.nan,
                                            dtype=float)
        rates_str = ", ".join(f"{p:.3f}" for p in acceptance_rates)

        log_fn(
            "SpecDecoding metrics: "
            "Draft acceptance rate: %.1f%%, "
            "Mean acceptance length: %.2f, "
            "Accepted: %d tokens, "
            "Drafted: %d tokens, "
            "Per-position acceptance rate: %s",
            draft_acceptance_rate,
            mean_acceptance_length,
            num_accepted_tokens,
            num_draft_tokens,
            rates_str,
        )
        self.reset()
100
101
102
103
104
105
106
107
108
109


class SpecDecodingProm:
    """Record spec decoding metrics in Prometheus.

    prometheus_client exports counter samples with a ``_total`` suffix,
    which the queries below include.

    The acceptance rate can be calculated using a PromQL query:

      rate(vllm:spec_decode_num_accepted_tokens_total[$interval]) /
      rate(vllm:spec_decode_num_draft_tokens_total[$interval])

    The mean acceptance length (conventionally including bonus tokens)
    can be calculated using:

      1 + (
      rate(vllm:spec_decode_num_accepted_tokens_total[$interval]) /
      rate(vllm:spec_decode_num_drafts_total[$interval]))

    A per-position acceptance rate vector can be computed using

      rate(vllm:spec_decode_num_accepted_tokens_per_pos_total[$interval])
      / ignoring(position) group_left
      rate(vllm:spec_decode_num_drafts_total[$interval])
    """

    # Counter factory; a class attribute so it can be swapped out.
    _counter_cls = prometheus_client.Counter

    def __init__(
        self,
        speculative_config: Optional[SpeculativeConfig],
        labelnames: list[str],
        labelvalues: list[str],
    ):
        """Create the counters, or nothing when spec decoding is off.

        Args:
            speculative_config: Speculative decoding configuration, or
                None when speculative decoding is disabled. Its
                ``num_speculative_tokens`` sets how many per-position
                counters are created.
            labelnames: Label names attached to every counter.
            labelvalues: Values for ``labelnames``, in the same order.
        """
        self.spec_decoding_enabled = speculative_config is not None
        # Checking the argument itself (not the flag) also narrows its
        # Optional type for the attribute access below.
        if speculative_config is None:
            return

        self.counter_spec_decode_num_drafts = self._counter_cls(
            name="vllm:spec_decode_num_drafts",
            documentation="Number of spec decoding drafts.",
            labelnames=labelnames,
        ).labels(*labelvalues)
        self.counter_spec_decode_num_draft_tokens = self._counter_cls(
            name="vllm:spec_decode_num_draft_tokens",
            documentation="Number of draft tokens.",
            labelnames=labelnames,
        ).labels(*labelvalues)
        self.counter_spec_decode_num_accepted_tokens = self._counter_cls(
            name="vllm:spec_decode_num_accepted_tokens",
            documentation="Number of accepted tokens.",
            labelnames=labelnames,
        ).labels(*labelvalues)

        num_spec_tokens = speculative_config.num_speculative_tokens
        # Per-position counters share one metric and are told apart by
        # an extra "position" label.
        pos_labelnames = labelnames + ["position"]
        base_counter = self._counter_cls(
            name="vllm:spec_decode_num_accepted_tokens_per_pos",
            documentation="Accepted tokens per draft position.",
            labelnames=pos_labelnames,
        )
        self.counter_spec_decode_num_accepted_tokens_per_pos: list[
            prometheus_client.Counter] = []
        for pos in range(num_spec_tokens):
            pos_labelvalues = labelvalues + [str(pos)]
            self.counter_spec_decode_num_accepted_tokens_per_pos.append(
                base_counter.labels(*pos_labelvalues))

    def observe(self, spec_decoding_stats: SpecDecodingStats):
        """Add one step's stats to the counters.

        Does nothing when spec decoding is disabled.
        """
        if not self.spec_decoding_enabled:
            return
        self.counter_spec_decode_num_drafts.inc(spec_decoding_stats.num_drafts)
        self.counter_spec_decode_num_draft_tokens.inc(
            spec_decoding_stats.num_draft_tokens)
        self.counter_spec_decode_num_accepted_tokens.inc(
            spec_decoding_stats.num_accepted_tokens)
        # Indexing (rather than zip) keeps a too-short stats list an
        # error instead of silently skipping positions.
        for pos, counter in enumerate(
                self.counter_spec_decode_num_accepted_tokens_per_pos):
            counter.inc(spec_decoding_stats.num_accepted_tokens_per_pos[pos])