loggers.py 19.6 KB
Newer Older
1
2
# SPDX-License-Identifier: Apache-2.0

3
4
import time
from abc import ABC, abstractmethod
5
from typing import Optional
6

7
import numpy as np
8
import prometheus_client
9

10
from vllm.config import SupportsMetricsInfo, VllmConfig
11
from vllm.logger import init_logger
12
from vllm.v1.core.kv_cache_utils import PrefixCachingMetrics
13
from vllm.v1.engine import FinishReason
14
from vllm.v1.metrics.stats import IterationStats, SchedulerStats
15
from vllm.v1.spec_decode.metrics import SpecDecodingMetrics
16
17
18
19
20
21
22
23
24

logger = init_logger(__name__)

_LOCAL_LOGGING_INTERVAL_SEC = 5.0


class StatLoggerBase(ABC):

    @abstractmethod
25
    def record(self, scheduler_stats: SchedulerStats,
26
               iteration_stats: Optional[IterationStats]):
27
28
        ...

29
30
31
    def log(self):  # noqa
        pass

32
33
34

class LoggingStatLogger(StatLoggerBase):

35
36
    def __init__(self, engine_index: int = 0):
        self.engine_index = engine_index
37
        self._reset(time.monotonic())
38
        self.last_scheduler_stats = SchedulerStats()
39
40
41
        # Prefix cache metrics. This cannot be reset.
        # TODO: Make the interval configurable.
        self.prefix_caching_metrics = PrefixCachingMetrics()
42
        self.spec_decoding_metrics = SpecDecodingMetrics()
43
44
        self.last_prompt_throughput: float = 0.0
        self.last_generation_throughput: float = 0.0
45

46
47
48
49
    def _reset(self, now):
        self.last_log_time = now

        # Tracked stats over current local logging interval.
50
51
        self.num_prompt_tokens: list[int] = []
        self.num_generation_tokens: list[int] = []
52

53
54
55
56
57
58
    def _track_iteration_stats(self, iteration_stats: IterationStats):
        # Save tracked stats for token counters.
        self.num_prompt_tokens.append(iteration_stats.num_prompt_tokens)
        self.num_generation_tokens.append(
            iteration_stats.num_generation_tokens)

59
    def _get_throughput(self, tracked_stats: list[int], now: float) -> float:
60
61
62
        # Compute summary metrics for tracked stats
        return float(np.sum(tracked_stats) / (now - self.last_log_time))

63
    def record(self, scheduler_stats: SchedulerStats,
64
               iteration_stats: Optional[IterationStats]):
65
66
        """Log Stats to standard output."""

67
68
        if iteration_stats:
            self._track_iteration_stats(iteration_stats)
69

70
71
        self.prefix_caching_metrics.observe(scheduler_stats.prefix_cache_stats)

72
73
74
75
        if scheduler_stats.spec_decoding_stats is not None:
            self.spec_decoding_metrics.observe(
                scheduler_stats.spec_decoding_stats)

76
        self.last_scheduler_stats = scheduler_stats
77

78
79
    def log(self):
        now = time.monotonic()
80
81
82
83
84
        prompt_throughput = self._get_throughput(self.num_prompt_tokens, now)
        generation_throughput = self._get_throughput(
            self.num_generation_tokens, now)

        self._reset(now)
85

86
87
        scheduler_stats = self.last_scheduler_stats

88
89
90
91
92
93
94
95
96
        log_fn = logger.info
        if not any(
            (prompt_throughput, generation_throughput,
             self.last_prompt_throughput, self.last_generation_throughput)):
            # Avoid log noise on an idle production system
            log_fn = logger.debug
        self.last_generation_throughput = generation_throughput
        self.last_prompt_throughput = prompt_throughput

97
        # Format and print output.
98
        log_fn(
99
            "Engine %03d: "
100
101
            "Avg prompt throughput: %.1f tokens/s, "
            "Avg generation throughput: %.1f tokens/s, "
102
103
104
            "Running: %d reqs, Waiting: %d reqs, "
            "GPU KV cache usage: %.1f%%, "
            "Prefix cache hit rate: %.1f%%",
105
            self.engine_index,
106
107
            prompt_throughput,
            generation_throughput,
108
109
            scheduler_stats.num_running_reqs,
            scheduler_stats.num_waiting_reqs,
110
            scheduler_stats.gpu_cache_usage * 100,
111
            self.prefix_caching_metrics.hit_rate * 100,
112
        )
113

114
        if scheduler_stats.spec_decoding_stats is not None:
115
            self.spec_decoding_metrics.log(log_fn=log_fn)
116

117
118
119

class PrometheusStatLogger(StatLoggerBase):

120
    def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
121
        self._unregister_vllm_metrics()
122

123
124
125
126
127
        # Use this flag to hide metrics that were deprecated in
        # a previous release and which will be removed future
        self.show_hidden_metrics = \
            vllm_config.observability_config.show_hidden_metrics

128
129
130
131
132
        labelnames = ["model_name", "engine"]
        labelvalues = [
            vllm_config.model_config.served_model_name,
            str(engine_index)
        ]
133

134
        max_model_len = vllm_config.model_config.max_model_len
135

136
137
138
        #
        # Scheduler state
        #
139
140
141
142
143
144
145
146
147
148
        self.gauge_scheduler_running = prometheus_client.Gauge(
            name="vllm:num_requests_running",
            documentation="Number of requests in model execution batches.",
            labelnames=labelnames).labels(*labelvalues)

        self.gauge_scheduler_waiting = prometheus_client.Gauge(
            name="vllm:num_requests_waiting",
            documentation="Number of requests waiting to be processed.",
            labelnames=labelnames).labels(*labelvalues)

149
150
151
        #
        # GPU cache
        #
152
153
154
155
156
        self.gauge_gpu_cache_usage = prometheus_client.Gauge(
            name="vllm:gpu_cache_usage_perc",
            documentation="GPU KV-cache usage. 1 means 100 percent usage.",
            labelnames=labelnames).labels(*labelvalues)

157
158
159
160
161
162
163
164
165
166
167
168
        self.counter_gpu_prefix_cache_queries = prometheus_client.Counter(
            name="vllm:gpu_prefix_cache_queries",
            documentation=
            "GPU prefix cache queries, in terms of number of queried blocks.",
            labelnames=labelnames).labels(*labelvalues)

        self.counter_gpu_prefix_cache_hits = prometheus_client.Counter(
            name="vllm:gpu_prefix_cache_hits",
            documentation=
            "GPU prefix cache hits, in terms of number of cached blocks.",
            labelnames=labelnames).labels(*labelvalues)

169
170
171
        #
        # Counters
        #
172
173
174
175
176
        self.counter_num_preempted_reqs = prometheus_client.Counter(
            name="vllm:num_preemptions_total",
            documentation="Cumulative number of preemption from the engine.",
            labelnames=labelnames).labels(*labelvalues)

177
178
179
180
181
182
183
184
185
186
        self.counter_prompt_tokens = prometheus_client.Counter(
            name="vllm:prompt_tokens_total",
            documentation="Number of prefill tokens processed.",
            labelnames=labelnames).labels(*labelvalues)

        self.counter_generation_tokens = prometheus_client.Counter(
            name="vllm:generation_tokens_total",
            documentation="Number of generation tokens processed.",
            labelnames=labelnames).labels(*labelvalues)

187
        self.counter_request_success: dict[FinishReason,
188
189
190
191
192
                                           prometheus_client.Counter] = {}
        counter_request_success_base = prometheus_client.Counter(
            name="vllm:request_success_total",
            documentation="Count of successfully processed requests.",
            labelnames=labelnames + ["finished_reason"])
193
        for reason in FinishReason:
194
195
196
197
            self.counter_request_success[
                reason] = counter_request_success_base.labels(*(labelvalues +
                                                                [str(reason)]))

198
199
200
        #
        # Histograms of counts
        #
201
202
203
204
205
206
207
208
209
210
211
212
213
214
        self.histogram_num_prompt_tokens_request = \
            prometheus_client.Histogram(
                name="vllm:request_prompt_tokens",
                documentation="Number of prefill tokens processed.",
                buckets=build_1_2_5_buckets(max_model_len),
                labelnames=labelnames).labels(*labelvalues)

        self.histogram_num_generation_tokens_request = \
            prometheus_client.Histogram(
                name="vllm:request_generation_tokens",
                documentation="Number of generation tokens processed.",
                buckets=build_1_2_5_buckets(max_model_len),
                labelnames=labelnames).labels(*labelvalues)

215
216
217
218
219
220
221
        self.histogram_iteration_tokens = \
            prometheus_client.Histogram(
                name="vllm:iteration_tokens_total",
                documentation="Histogram of number of tokens per engine_step.",
                buckets=build_cudagraph_buckets(vllm_config),
                labelnames=labelnames).labels(*labelvalues)

222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
        self.histogram_max_num_generation_tokens_request = \
            prometheus_client.Histogram(
                name="vllm:request_max_num_generation_tokens",
                documentation=
                "Histogram of maximum number of requested generation tokens.",
                buckets=build_1_2_5_buckets(max_model_len),
                labelnames=labelnames).labels(*labelvalues)

        self.histogram_n_request = \
            prometheus_client.Histogram(
                name="vllm:request_params_n",
                documentation="Histogram of the n request parameter.",
                buckets=[1, 2, 5, 10, 20],
                labelnames=labelnames).labels(*labelvalues)

        self.histogram_max_tokens_request = \
            prometheus_client.Histogram(
                name="vllm:request_params_max_tokens",
                documentation="Histogram of the max_tokens request parameter.",
                buckets=build_1_2_5_buckets(max_model_len),
                labelnames=labelnames).labels(*labelvalues)

        #
        # Histogram of timing intervals
        #
247
248
249
250
251
252
        self.histogram_time_to_first_token = \
            prometheus_client.Histogram(
                name="vllm:time_to_first_token_seconds",
                documentation="Histogram of time to first token in seconds.",
                buckets=[
                    0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
253
254
                    0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0,
                    640.0, 2560.0
255
256
257
258
259
260
261
262
263
                ],
                labelnames=labelnames).labels(*labelvalues)

        self.histogram_time_per_output_token = \
            prometheus_client.Histogram(
                name="vllm:time_per_output_token_seconds",
                documentation="Histogram of time per output token in seconds.",
                buckets=[
                    0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5,
264
                    0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
265
266
267
                ],
                labelnames=labelnames).labels(*labelvalues)

268
269
        request_latency_buckets = [
            0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
270
            40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
        ]
        self.histogram_e2e_time_request = \
            prometheus_client.Histogram(
                name="vllm:e2e_request_latency_seconds",
                documentation="Histogram of e2e request latency in seconds.",
                buckets=request_latency_buckets,
                labelnames=labelnames).labels(*labelvalues)
        self.histogram_queue_time_request = \
            prometheus_client.Histogram(
                name="vllm:request_queue_time_seconds",
                documentation=
                "Histogram of time spent in WAITING phase for request.",
                buckets=request_latency_buckets,
                labelnames=labelnames).labels(*labelvalues)
        self.histogram_inference_time_request = \
            prometheus_client.Histogram(
                name="vllm:request_inference_time_seconds",
                documentation=
                "Histogram of time spent in RUNNING phase for request.",
                buckets=request_latency_buckets,
                labelnames=labelnames).labels(*labelvalues)
        self.histogram_prefill_time_request = \
            prometheus_client.Histogram(
                name="vllm:request_prefill_time_seconds",
                documentation=
                "Histogram of time spent in PREFILL phase for request.",
                buckets=request_latency_buckets,
                labelnames=labelnames).labels(*labelvalues)
        self.histogram_decode_time_request = \
            prometheus_client.Histogram(
                name="vllm:request_decode_time_seconds",
                documentation=
                "Histogram of time spent in DECODE phase for request.",
                buckets=request_latency_buckets,
                labelnames=labelnames).labels(*labelvalues)

307
308
309
        #
        # LoRA metrics
        #
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
        self.gauge_lora_info: Optional[prometheus_client.Gauge] = None
        if vllm_config.lora_config is not None:
            self.labelname_max_lora = "max_lora"
            self.labelname_waiting_lora_adapters = "waiting_lora_adapters"
            self.labelname_running_lora_adapters = "running_lora_adapters"
            self.max_lora = vllm_config.lora_config.max_loras
            self.gauge_lora_info = \
                prometheus_client.Gauge(
                    name="vllm:lora_requests_info",
                    documentation="Running stats on lora requests.",
                    labelnames=[
                        self.labelname_max_lora,
                        self.labelname_waiting_lora_adapters,
                        self.labelname_running_lora_adapters,
                    ])

326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
        #
        # Speculative Decoding metrics
        # The acceptance rate can be calculated using a PromQL query:
        #
        #   rate(vllm:spec_decode_num_accepted_tokens_total[$interval]) /
        #   rate(vllm:spec_decode_num_draft_tokens_total[$interval])
        #
        self.counter_spec_decode_num_draft_tokens = \
            prometheus_client.Counter(
                name="vllm:spec_decode_num_draft_tokens_total",
                documentation="Number of draft tokens.",
                labelnames=labelnames).labels(*labelvalues)
        self.counter_spec_decode_num_accepted_tokens = \
            prometheus_client.Counter(
                name="vllm:spec_decode_num_accepted_tokens_total",
                documentation="Number of accepted tokens.",
                labelnames=labelnames).labels(*labelvalues)

344
345
346
        #
        # Cache config info metric
        #
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
        self.log_metrics_info("cache_config", vllm_config.cache_config)

    def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo):
        metrics_info = config_obj.metrics_info()

        name, documentation = None, None
        if type == "cache_config":
            name = "vllm:cache_config_info"
            documentation = "Information of the LLMEngine CacheConfig"
        assert name is not None, f"Unknown metrics info type {type}"

        # Info type metrics are syntactic sugar for a gauge permanently set to 1
        # Since prometheus multiprocessing mode does not support Info, emulate
        # info here with a gauge.
        info_gauge = prometheus_client.Gauge(
            name=name,
            documentation=documentation,
            labelnames=metrics_info.keys()).labels(**metrics_info)
        info_gauge.set(1)

367
    def record(self, scheduler_stats: SchedulerStats,
368
               iteration_stats: Optional[IterationStats]):
369
370
371
372
        """Log to prometheus."""
        self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs)
        self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs)

373
374
        self.gauge_gpu_cache_usage.set(scheduler_stats.gpu_cache_usage)

375
376
377
378
379
        self.counter_gpu_prefix_cache_queries.inc(
            scheduler_stats.prefix_cache_stats.queries)
        self.counter_gpu_prefix_cache_hits.inc(
            scheduler_stats.prefix_cache_stats.hits)

380
381
382
383
384
385
        if scheduler_stats.spec_decoding_stats is not None:
            self.counter_spec_decode_num_draft_tokens.inc(
                scheduler_stats.spec_decoding_stats.num_draft_tokens)
            self.counter_spec_decode_num_accepted_tokens.inc(
                scheduler_stats.spec_decoding_stats.num_accepted_tokens)

386
387
388
        if iteration_stats is None:
            return

389
        self.counter_num_preempted_reqs.inc(iteration_stats.num_preempted_reqs)
390
391
392
        self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens)
        self.counter_generation_tokens.inc(
            iteration_stats.num_generation_tokens)
393
394
395
        self.histogram_iteration_tokens.observe(
            iteration_stats.num_prompt_tokens + \
            iteration_stats.num_generation_tokens)
396

397
398
399
400
401
        for max_gen_tokens in iteration_stats.max_num_generation_tokens_iter:
            self.histogram_max_num_generation_tokens_request.observe(
                max_gen_tokens)
        for n_param in iteration_stats.n_params_iter:
            self.histogram_n_request.observe(n_param)
402
403
404
405
406
        for ttft in iteration_stats.time_to_first_tokens_iter:
            self.histogram_time_to_first_token.observe(ttft)
        for tpot in iteration_stats.time_per_output_tokens_iter:
            self.histogram_time_per_output_token.observe(tpot)

407
        for finished_request in iteration_stats.finished_requests:
408
            self.counter_request_success[finished_request.finish_reason].inc()
409
410
            self.histogram_e2e_time_request.observe(
                finished_request.e2e_latency)
411
412
413
414
            self.histogram_queue_time_request.observe(
                finished_request.queued_time)
            self.histogram_prefill_time_request.observe(
                finished_request.prefill_time)
415
416
417
418
            self.histogram_inference_time_request.observe(
                finished_request.inference_time)
            self.histogram_decode_time_request.observe(
                finished_request.decode_time)
419
420
421
422
            self.histogram_num_prompt_tokens_request.observe(
                finished_request.num_prompt_tokens)
            self.histogram_num_generation_tokens_request.observe(
                finished_request.num_generation_tokens)
423
424
            self.histogram_max_tokens_request.observe(
                finished_request.max_tokens_param)
425

426
427
428
429
430
431
432
433
434
435
436
437
438
        if self.gauge_lora_info is not None:
            running_lora_adapters = \
                ",".join(iteration_stats.running_lora_adapters.keys())
            waiting_lora_adapters = \
                ",".join(iteration_stats.waiting_lora_adapters.keys())
            lora_info_labels = {
                self.labelname_running_lora_adapters: running_lora_adapters,
                self.labelname_waiting_lora_adapters: waiting_lora_adapters,
                self.labelname_max_lora: self.max_lora,
            }
            self.gauge_lora_info.labels(**lora_info_labels)\
                                .set_to_current_time()

439
440
441
442
443
444
    @staticmethod
    def _unregister_vllm_metrics():
        # Unregister any existing vLLM collectors (for CI/CD
        for collector in list(prometheus_client.REGISTRY._collector_to_names):
            if hasattr(collector, "_name") and "vllm" in collector._name:
                prometheus_client.REGISTRY.unregister(collector)
445
446


447
def build_buckets(mantissa_lst: list[int], max_value: int) -> list[int]:
448
449
450
451
452
453
    """
    Builds a list of buckets with increasing powers of 10 multiplied by
    mantissa values until the value exceeds the specified maximum.

    """
    exponent = 0
454
    buckets: list[int] = []
455
456
457
458
459
460
461
462
463
464
    while True:
        for m in mantissa_lst:
            value = m * 10**exponent
            if value <= max_value:
                buckets.append(value)
            else:
                return buckets
        exponent += 1


465
def build_1_2_5_buckets(max_value: int) -> list[int]:
466
467
468
469
470
471
    """
    Example:
    >>> build_1_2_5_buckets(100)
    [1, 2, 5, 10, 20, 50, 100]
    """
    return build_buckets([1, 2, 5], max_value)
472
473


474
def build_cudagraph_buckets(vllm_config: VllmConfig) -> list[int]:
475
476
477
478
479
480
481
    if not vllm_config.model_config.enforce_eager:
        buckets = vllm_config.compilation_config.\
            cudagraph_capture_sizes.copy()
        buckets.sort()
        return buckets
    else:
        return [1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096]