# Copyright 2023-2024 SGLang Team
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for Prometheus Metrics Collection."""

import time
from dataclasses import dataclass
from typing import Dict, Union


@dataclass
class SchedulerStats:
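    """A snapshot of scheduler state, exported on each metrics logging tick."""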
    num_running_reqs: int = 0
    num_used_tokens: int = 0
    token_usage: float = 0.0
    gen_throughput: float = 0.0
    num_queue_reqs: int = 0
    cache_hit_rate: float = 0.0
    spec_accept_length: float = 0.0


class SchedulerMetricsCollector:
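    """Exports the fields of SchedulerStats as labeled Prometheus gauges."""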

    def __init__(self, labels: Dict[str, str]) -> None:
        # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
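        # A hypothetical launcher-side setup (not part of this module) would do:
        #     os.environ["PROMETHEUS_MULTIPROC_DIR"] = "/tmp/sglang_prometheus"
        # before any prometheus_client import, so that gauges from all worker
        # processes are aggregated from files in that directory.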
        from prometheus_client import Gauge

        self.labels = labels
        self.last_log_time = time.time()

        self.num_running_reqs = Gauge(
            name="sglang:num_running_reqs",
            documentation="The number of running requests.",
            labelnames=labels.keys(),
            multiprocess_mode="mostrecent",
        )

        self.num_used_tokens = Gauge(
            name="sglang:num_used_tokens",
            documentation="The number of used tokens.",
            labelnames=labels.keys(),
            multiprocess_mode="mostrecent",
        )

        self.token_usage = Gauge(
            name="sglang:token_usage",
            documentation="The token usage.",
            labelnames=labels.keys(),
            multiprocess_mode="mostrecent",
        )

        self.gen_throughput = Gauge(
            name="sglang:gen_throughput",
            documentation="The generation throughput (token/s).",
            labelnames=labels.keys(),
            multiprocess_mode="mostrecent",
        )

        self.num_queue_reqs = Gauge(
            name="sglang:num_queue_reqs",
            documentation="The number of requests in the waiting queue.",
            labelnames=labels.keys(),
            multiprocess_mode="mostrecent",
        )

        self.cache_hit_rate = Gauge(
            name="sglang:cache_hit_rate",
            documentation="The prefix cache hit rate.",
            labelnames=labels.keys(),
            multiprocess_mode="mostrecent",
        )

        self.spec_accept_length = Gauge(
            name="sglang:spec_accept_length",
            documentation="The average acceptance length of speculative decoding.",
            labelnames=labels.keys(),
            multiprocess_mode="mostrecent",
        )

    def _log_gauge(self, gauge, data: Union[int, float]) -> None:
        # Convenience function for setting a gauge with this collector's labels.
        gauge.labels(**self.labels).set(data)

    def log_stats(self, stats: SchedulerStats) -> None:
        self._log_gauge(self.num_running_reqs, stats.num_running_reqs)
        self._log_gauge(self.num_used_tokens, stats.num_used_tokens)
        self._log_gauge(self.token_usage, stats.token_usage)
        self._log_gauge(self.gen_throughput, stats.gen_throughput)
        self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
        self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
        self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
        self.last_log_time = time.time()


class TokenizerMetricsCollector:
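    """Exports request-level token counters and latency histograms."""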
    def __init__(self, labels: Dict[str, str]) -> None:
        # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
        from prometheus_client import Counter, Histogram

        self.labels = labels

        self.prompt_tokens_total = Counter(
            name="sglang:prompt_tokens_total",
            documentation="Number of prefill tokens processed.",
            labelnames=labels.keys(),
        )

        self.generation_tokens_total = Counter(
            name="sglang:generation_tokens_total",
            documentation="Number of generation tokens processed.",
            labelnames=labels.keys(),
        )

        self.cached_tokens_total = Counter(
            name="sglang:cached_tokens_total",
            documentation="Number of cached prompt tokens.",
            labelnames=labels.keys(),
        )

        self.num_requests_total = Counter(
            name="sglang:num_requests_total",
            documentation="Number of requests processed.",
            labelnames=labels.keys(),
        )

        self.histogram_time_to_first_token = Histogram(
            name="sglang:time_to_first_token_seconds",
            documentation="Histogram of time to first token in seconds.",
            labelnames=labels.keys(),
            buckets=[
                0.1,
                0.2,
                0.4,
                0.6,
                0.8,
                1,
                2,
                4,
                6,
                8,
                10,
                20,
                40,
                60,
                80,
                100,
                200,
                400,
            ],
        )

        self.histogram_inter_token_latency_seconds = Histogram(
            name="sglang:inter_token_latency_seconds",
            documentation="Histogram of inter-token latency in seconds.",
            labelnames=labels.keys(),
            buckets=[
                0.002,
                0.004,
                0.006,
                0.008,
                0.010,
                0.015,
                0.020,
                0.025,
                0.030,
                0.035,
                0.040,
                0.060,
                0.080,
                0.100,
                0.200,
                0.400,
                0.600,
                0.800,
                1.000,
                2.000,
                4.000,
                6.000,
                8.000,
            ],
        )

        self.histogram_e2e_request_latency = Histogram(
            name="sglang:e2e_request_latency_seconds",
            documentation="Histogram of End-to-end request latency in seconds",
            labelnames=labels.keys(),
            buckets=[
                0.1,
                0.2,
                0.4,
                0.6,
                0.8,
                1,
                2,
                4,
                6,
                8,
                10,
                20,
                40,
                60,
                80,
                100,
                200,
                400,
                800,
            ],
        )

    def _log_histogram(self, histogram, data: Union[int, float]) -> None:
        histogram.labels(**self.labels).observe(data)

    def observe_one_finished_request(
        self,
        prompt_tokens: int,
        generation_tokens: int,
        cached_tokens: int,
        e2e_latency: float,
    ):
        self.prompt_tokens_total.labels(**self.labels).inc(prompt_tokens)
        self.generation_tokens_total.labels(**self.labels).inc(generation_tokens)
        if cached_tokens > 0:
            self.cached_tokens_total.labels(**self.labels).inc(cached_tokens)
        self.num_requests_total.labels(**self.labels).inc(1)
        self._log_histogram(self.histogram_e2e_request_latency, e2e_latency)

    def observe_time_to_first_token(self, value: float):
        self.histogram_time_to_first_token.labels(**self.labels).observe(value)

    def observe_inter_token_latency(self, interval: float, num_new_tokens: int):
        adjusted_interval = interval / num_new_tokens

        # A faster version of Histogram.observe() that records multiple values
        # at once: the total interval is added to the sum, and all
        # num_new_tokens observations land in the single bucket matching their
        # mean latency.
        # Reference: https://github.com/prometheus/client_python/blob/v0.21.1/prometheus_client/metrics.py#L639
        his = self.histogram_inter_token_latency_seconds.labels(**self.labels)
        his._sum.inc(interval)

        for i, bound in enumerate(his._upper_bounds):
            if adjusted_interval <= bound:
                his._buckets[i].inc(num_new_tokens)
                break
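

# A minimal usage sketch (hypothetical label values, not part of the original
# module); assumes any PROMETHEUS_MULTIPROC_DIR setup happens in the launcher.
if __name__ == "__main__":
    scheduler_collector = SchedulerMetricsCollector(labels={"model_name": "test-model"})
    scheduler_collector.log_stats(SchedulerStats(num_running_reqs=4, gen_throughput=1024.0))

    tokenizer_collector = TokenizerMetricsCollector(labels={"model_name": "test-model"})
    tokenizer_collector.observe_time_to_first_token(0.35)
    # 32 new tokens over 0.8 s -> a mean inter-token latency of 25 ms.
    tokenizer_collector.observe_inter_token_latency(interval=0.8, num_new_tokens=32)
    tokenizer_collector.observe_one_finished_request(
        prompt_tokens=128, generation_tokens=32, cached_tokens=64, e2e_latency=1.2
    )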