loggers.py 43.5 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
import logging
5
6
import time
from abc import ABC, abstractmethod
7
8
from collections.abc import Callable
from typing import TypeAlias
9

10
from prometheus_client import Counter, Gauge, Histogram
11

12
from vllm.config import SupportsMetricsInfo, VllmConfig
13
14
15
16
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
    KVConnectorLogging,
    KVConnectorPrometheus,
)
17
from vllm.logger import init_logger
18
from vllm.plugins import load_plugins_by_group
19
from vllm.v1.engine import FinishReason
20
from vllm.v1.metrics.prometheus import unregister_vllm_metrics
21
22
23
24
25
26
from vllm.v1.metrics.stats import (
    CachingMetrics,
    IterationStats,
    MultiModalCacheStats,
    SchedulerStats,
)
27
from vllm.v1.spec_decode.metrics import SpecDecodingLogging, SpecDecodingProm
28
29
30

logger = init_logger(__name__)

31
32
33
PerEngineStatLoggerFactory = Callable[[VllmConfig, int], "StatLoggerBase"]
AggregateStatLoggerFactory = type["AggregateStatLoggerBase"]
StatLoggerFactory = AggregateStatLoggerFactory | PerEngineStatLoggerFactory
34

35
36

class StatLoggerBase(ABC):
37
38
39
40
41
42
43
44
    """Interface for logging metrics.

    API users may define custom loggers that implement this interface.
    However, note that the `SchedulerStats` and `IterationStats` classes
    are not considered stable interfaces and may change in future versions.
    """

    @abstractmethod
45
    def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): ...
46
47

    @abstractmethod
48
49
    def record(
        self,
50
51
52
        scheduler_stats: SchedulerStats | None,
        iteration_stats: IterationStats | None,
        mm_cache_stats: MultiModalCacheStats | None = None,
53
54
        engine_idx: int = 0,
    ): ...
55

56
    @abstractmethod
57
    def log_engine_initialized(self): ...
58

59
60
61
    def log(self):  # noqa
        pass

62
63
64
    def record_sleep_state(self, is_awake: int, level: int):  # noqa
        pass

65

66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
def load_stat_logger_plugin_factories() -> list[StatLoggerFactory]:
    factories: list[StatLoggerFactory] = []

    for name, plugin_class in load_plugins_by_group("vllm.stat_logger_plugins").items():
        if not isinstance(plugin_class, type) or not issubclass(
            plugin_class, StatLoggerBase
        ):
            raise TypeError(
                f"Stat logger plugin {name!r} must be a subclass of "
                f"StatLoggerBase (got {plugin_class!r})."
            )

        factories.append(plugin_class)

    return factories


83
84
85
86
87
88
89
90
class AggregateStatLoggerBase(StatLoggerBase):
    """Abstract base class for loggers that
    aggregate across multiple DP engines."""

    @abstractmethod
    def __init__(self, vllm_config: VllmConfig, engine_indexes: list[int]): ...


91
class LoggingStatLogger(StatLoggerBase):
92
    def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
93
        self.engine_index = engine_index
94
        self.vllm_config = vllm_config
95
        self._reset(time.monotonic())
96

97
        self.last_scheduler_stats = SchedulerStats()
98
99

        # Caching metrics. This cannot be reset.
100
        # TODO: Make the interval configurable.
101
        self.prefix_caching_metrics = CachingMetrics()
102
        self.connector_prefix_caching_metrics = CachingMetrics()
103
104
        self.mm_caching_metrics = CachingMetrics()

105
        self.spec_decoding_logging = SpecDecodingLogging()
106
        kv_tranfer_config = self.vllm_config.kv_transfer_config
107
        self.kv_connector_logging = KVConnectorLogging(kv_tranfer_config)
108
109
        self.last_prompt_throughput: float = 0.0
        self.last_generation_throughput: float = 0.0
110
111
        self.engine_is_idle = False
        self.aggregated = False
112

113
114
115
116
    def _reset(self, now):
        self.last_log_time = now

        # Tracked stats over current local logging interval.
117
118
        self.num_prompt_tokens: int = 0
        self.num_generation_tokens: int = 0
119

120
121
    def _track_iteration_stats(self, iteration_stats: IterationStats):
        # Save tracked stats for token counters.
122
123
        self.num_prompt_tokens += iteration_stats.num_prompt_tokens
        self.num_generation_tokens += iteration_stats.num_generation_tokens
124

125
    def _get_throughput(self, tracked_stats: int, now: float) -> float:
126
        # Compute summary metrics for tracked stats
127
128
129
130
        delta_time = now - self.last_log_time
        if delta_time <= 0.0:
            return 0.0
        return float(tracked_stats / delta_time)
131

132
133
134
135
    @property
    def log_prefix(self):
        return "Engine {:03d}: ".format(self.engine_index)

136
137
    def record(
        self,
138
139
140
        scheduler_stats: SchedulerStats | None,
        iteration_stats: IterationStats | None,
        mm_cache_stats: MultiModalCacheStats | None = None,
141
142
        engine_idx: int = 0,
    ):
143
        """Log Stats to standard output."""
144
145
        if iteration_stats:
            self._track_iteration_stats(iteration_stats)
146

147
        if scheduler_stats is not None:
148
            self.prefix_caching_metrics.observe(scheduler_stats.prefix_cache_stats)
149

150
151
152
153
154
            if scheduler_stats.connector_prefix_cache_stats is not None:
                self.connector_prefix_caching_metrics.observe(
                    scheduler_stats.connector_prefix_cache_stats
                )

155
            if scheduler_stats.spec_decoding_stats is not None:
156
                self.spec_decoding_logging.observe(scheduler_stats.spec_decoding_stats)
157
            if kv_connector_stats := scheduler_stats.kv_connector_stats:
158
                self.kv_connector_logging.observe(kv_connector_stats)
159
160
            if not self.aggregated:
                self.last_scheduler_stats = scheduler_stats
161
162
163
        if mm_cache_stats:
            self.mm_caching_metrics.observe(mm_cache_stats)

164
    def _update_stats(self):
165
        now = time.monotonic()
166
        prompt_throughput = self._get_throughput(self.num_prompt_tokens, now)
167
        generation_throughput = self._get_throughput(self.num_generation_tokens, now)
168
169

        self._reset(now)
170
        self.engine_is_idle = not any(
171
172
173
174
175
176
            (
                prompt_throughput,
                generation_throughput,
                self.last_prompt_throughput,
                self.last_generation_throughput,
            )
177
        )
178
179
180
        self.last_generation_throughput = generation_throughput
        self.last_prompt_throughput = prompt_throughput

181
182
183
184
185
186
187
188
189
    def aggregate_scheduler_stats(self):
        # noop for per engine loggers
        return

    def log(self):
        self._update_stats()
        self.aggregate_scheduler_stats()
        # Avoid log noise on an idle production system
        log_fn = logger.debug if self.engine_is_idle else logger.info
190
        # Format and print output.
191
192
193
194
195
196
        log_parts = [
            "Avg prompt throughput: %.1f tokens/s",
            "Avg generation throughput: %.1f tokens/s",
            "Running: %d reqs",
            "Waiting: %d reqs",
            "GPU KV cache usage: %.1f%%",
197
            "Prefix cache hit rate: %.1f%%",
198
199
        ]
        log_args = [
200
201
202
203
204
            self.last_prompt_throughput,
            self.last_generation_throughput,
            self.last_scheduler_stats.num_running_reqs,
            self.last_scheduler_stats.num_waiting_reqs,
            self.last_scheduler_stats.kv_cache_usage * 100,
205
            self.prefix_caching_metrics.hit_rate * 100,
206
        ]
207
208
209
        if not self.connector_prefix_caching_metrics.empty:
            log_parts.append("External prefix cache hit rate: %.1f%%")
            log_args.append(self.connector_prefix_caching_metrics.hit_rate * 100)
210
        if not self.mm_caching_metrics.empty:
211
212
213
214
            log_parts.append("MM cache hit rate: %.1f%%")
            log_args.append(self.mm_caching_metrics.hit_rate * 100)

        log_fn(
215
            self.log_prefix + ", ".join(log_parts),
216
            *log_args,
217
        )
218

219
        self.spec_decoding_logging.log(log_fn=log_fn)
220
        self.kv_connector_logging.log(log_fn=log_fn)
221

222
    def log_engine_initialized(self):
223
        if self.vllm_config.cache_config.num_gpu_blocks:
224
            logger.debug(
225
                "Engine %03d: vllm cache_config_info with initialization "
226
227
228
229
                "after num_gpu_blocks is: %d",
                self.engine_index,
                self.vllm_config.cache_config.num_gpu_blocks,
            )
230

231

232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
class AggregatedLoggingStatLogger(LoggingStatLogger, AggregateStatLoggerBase):
    def __init__(
        self,
        vllm_config: VllmConfig,
        engine_indexes: list[int],
    ):
        self.engine_indexes = engine_indexes
        self.last_scheduler_stats_dict: dict[int, SchedulerStats] = {
            idx: SchedulerStats() for idx in self.engine_indexes
        }
        LoggingStatLogger.__init__(self, vllm_config, engine_index=-1)
        self.aggregated = True

    @property
    def log_prefix(self):
        return "{} Engines Aggregated: ".format(len(self.engine_indexes))

    def record(
        self,
        scheduler_stats: SchedulerStats | None,
        iteration_stats: IterationStats | None,
        mm_cache_stats: MultiModalCacheStats | None = None,
        engine_idx: int = 0,
    ):
        if engine_idx not in self.engine_indexes:
            logger.warning("Unexpected engine_idx: %d", engine_idx)
            return
        LoggingStatLogger.record(
            self,
            scheduler_stats,
            iteration_stats,
            mm_cache_stats=mm_cache_stats,
            engine_idx=engine_idx,
        )
        if scheduler_stats is not None:
            self.last_scheduler_stats_dict[engine_idx] = scheduler_stats

    def aggregate_scheduler_stats(self):
        self.last_scheduler_stats = SchedulerStats()
        for last_scheduler_stats in self.last_scheduler_stats_dict.values():
            self.last_scheduler_stats.num_waiting_reqs += (
                last_scheduler_stats.num_waiting_reqs
            )
            self.last_scheduler_stats.num_running_reqs += (
                last_scheduler_stats.num_running_reqs
            )
            self.last_scheduler_stats.num_corrupted_reqs += (
                last_scheduler_stats.num_corrupted_reqs
            )
            self.last_scheduler_stats.kv_cache_usage += (
                last_scheduler_stats.kv_cache_usage
            )
        self.last_scheduler_stats.kv_cache_usage /= len(self.last_scheduler_stats_dict)

    def log(self):
        LoggingStatLogger.log(self)

    def log_engine_initialized(self):
        if self.vllm_config.cache_config.num_gpu_blocks:
            logger.info(
                "%d Engines: vllm cache_config_info with initialization "
                "after num_gpu_blocks is: %d",
                len(self.engine_indexes),
                self.vllm_config.cache_config.num_gpu_blocks,
            )


class PerEngineStatLoggerAdapter(AggregateStatLoggerBase):
    def __init__(
        self,
        vllm_config: VllmConfig,
        engine_indexes: list[int],
        per_engine_stat_logger_factory: PerEngineStatLoggerFactory,
    ) -> None:
        self.per_engine_stat_loggers = {}
        self.engine_indexes = engine_indexes
        for engine_index in engine_indexes:
            self.per_engine_stat_loggers[engine_index] = per_engine_stat_logger_factory(
                vllm_config, engine_index
            )

    def record(
        self,
        scheduler_stats: SchedulerStats | None,
        iteration_stats: IterationStats | None,
        mm_cache_stats: MultiModalCacheStats | None = None,
        engine_idx: int = 0,
    ):
        if engine_idx not in self.per_engine_stat_loggers:
            logger.warning("Unexpected engine_idx: %d", engine_idx)
            return
        self.per_engine_stat_loggers[engine_idx].record(
            scheduler_stats,
            iteration_stats,
            mm_cache_stats=mm_cache_stats,
            engine_idx=engine_idx,
        )

    def log(self):
        for per_engine_stat_logger in self.per_engine_stat_loggers.values():
            per_engine_stat_logger.log()

    def log_engine_initialized(self):
        for per_engine_stat_logger in self.per_engine_stat_loggers.values():
            per_engine_stat_logger.log_engine_initialized()


class PrometheusStatLogger(AggregateStatLoggerBase):
340
341
342
    _gauge_cls = Gauge
    _counter_cls = Counter
    _histogram_cls = Histogram
343
    _spec_decoding_cls = SpecDecodingProm
344
    _kv_connector_cls = KVConnectorPrometheus
345

346
    def __init__(
347
        self, vllm_config: VllmConfig, engine_indexes: list[int] | None = None
348
    ):
349
350
        if engine_indexes is None:
            engine_indexes = [0]
351

352
        self.engine_indexes = engine_indexes
353
354

        unregister_vllm_metrics()
355
        self.vllm_config = vllm_config
356
357
        # Use this flag to hide metrics that were deprecated in
        # a previous release and which will be removed future
358
        self.show_hidden_metrics = vllm_config.observability_config.show_hidden_metrics
359

360
        labelnames = ["model_name", "engine"]
361
        model_name = vllm_config.model_config.served_model_name
362
        max_model_len = vllm_config.model_config.max_model_len
363

364
        per_engine_labelvalues: dict[int, list[str]] = {
365
            idx: [model_name, str(idx)] for idx in engine_indexes
366
367
        }

368
        self.spec_decoding_prom = self._spec_decoding_cls(
369
370
371
372
            vllm_config.speculative_config, labelnames, per_engine_labelvalues
        )
        self.kv_connector_prom = self._kv_connector_cls(
            vllm_config, labelnames, per_engine_labelvalues
373
        )
374

375
376
377
        #
        # Scheduler state
        #
378
        gauge_scheduler_running = self._gauge_cls(
379
380
            name="vllm:num_requests_running",
            documentation="Number of requests in model execution batches.",
381
            multiprocess_mode="mostrecent",
382
383
384
385
386
            labelnames=labelnames,
        )
        self.gauge_scheduler_running = make_per_engine(
            gauge_scheduler_running, engine_indexes, model_name
        )
387

388
        gauge_scheduler_waiting = self._gauge_cls(
389
390
            name="vllm:num_requests_waiting",
            documentation="Number of requests waiting to be processed.",
391
            multiprocess_mode="mostrecent",
392
393
394
395
396
            labelnames=labelnames,
        )
        self.gauge_scheduler_waiting = make_per_engine(
            gauge_scheduler_waiting, engine_indexes, model_name
        )
397

398
399
400
401
402
403
404
405
406
407
408
409
410
411
        gauge_engine_sleep_state = self._gauge_cls(
            name="vllm:engine_sleep_state",
            documentation=(
                "Engine sleep state; awake = 0 means engine is sleeping; "
                "awake = 1 means engine is awake; "
                "weights_offloaded = 1 means sleep level 1; "
                "discard_all = 1 means sleep level 2."
            ),
            labelnames=labelnames + ["sleep_state"],
            multiprocess_mode="mostrecent",
        )

        self.gauge_engine_sleep_state = {}
        sleep_state = ["awake", "weights_offloaded", "discard_all"]
412

413
414
415
416
417
418
419
        for s in sleep_state:
            self.gauge_engine_sleep_state[s] = {
                idx: gauge_engine_sleep_state.labels(
                    engine=idx, model_name=model_name, sleep_state=s
                )
                for idx in engine_indexes
            }
420

421
422
        # Setting default values
        self.record_sleep_state()
423

424
425
        # GPU cache
        #
426
427
428
429
430
431
432
433
        # Deprecated in 0.9.2 - Renamed as vllm:kv_cache_usage_perc
        # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
        # TODO: remove in 0.12.0
        if self.show_hidden_metrics:
            gauge_gpu_cache_usage = self._gauge_cls(
                name="vllm:gpu_cache_usage_perc",
                documentation=(
                    "GPU KV-cache usage. 1 means 100 percent usage."
434
435
                    "DEPRECATED: Use vllm:kv_cache_usage_perc instead."
                ),
436
                multiprocess_mode="mostrecent",
437
438
                labelnames=labelnames,
            )
439
            self.gauge_gpu_cache_usage = make_per_engine(
440
441
                gauge_gpu_cache_usage, engine_indexes, model_name
            )
442
443
444
445
446
447
448
449
450
451
452

        # Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_queries
        # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
        # TODO: remove in 0.12.0
        if self.show_hidden_metrics:
            counter_gpu_prefix_cache_queries = self._counter_cls(
                name="vllm:gpu_prefix_cache_queries",
                documentation=(
                    "GPU prefix cache queries, in terms of number of queried"
                    "tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."
                ),
453
454
                labelnames=labelnames,
            )
455
            self.counter_gpu_prefix_cache_queries = make_per_engine(
456
457
                counter_gpu_prefix_cache_queries, engine_indexes, model_name
            )
458
459
460
461
462
463
464
465
466

        # Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_hits
        # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
        # TODO: remove in 0.12.0
        if self.show_hidden_metrics:
            counter_gpu_prefix_cache_hits = self._counter_cls(
                name="vllm:gpu_prefix_cache_hits",
                documentation=(
                    "GPU prefix cache hits, in terms of number of cached "
467
468
469
470
                    "tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."
                ),
                labelnames=labelnames,
            )
471
            self.counter_gpu_prefix_cache_hits = make_per_engine(
472
473
                counter_gpu_prefix_cache_hits, engine_indexes, model_name
            )
474

475
        gauge_kv_cache_usage = self._gauge_cls(
476
477
            name="vllm:kv_cache_usage_perc",
            documentation="KV-cache usage. 1 means 100 percent usage.",
478
479
480
481
482
            labelnames=labelnames,
        )
        self.gauge_kv_cache_usage = make_per_engine(
            gauge_kv_cache_usage, engine_indexes, model_name
        )
483

484
        counter_prefix_cache_queries = self._counter_cls(
485
486
            name="vllm:prefix_cache_queries",
            documentation=(
487
488
489
490
                "Prefix cache queries, in terms of number of queried tokens."
            ),
            labelnames=labelnames,
        )
491
        self.counter_prefix_cache_queries = make_per_engine(
492
493
            counter_prefix_cache_queries, engine_indexes, model_name
        )
494

495
        counter_prefix_cache_hits = self._counter_cls(
496
            name="vllm:prefix_cache_hits",
497
498
499
            documentation=("Prefix cache hits, in terms of number of cached tokens."),
            labelnames=labelnames,
        )
500
        self.counter_prefix_cache_hits = make_per_engine(
501
502
            counter_prefix_cache_hits, engine_indexes, model_name
        )
503

504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
        #
        # External - KV connector prefix cache
        #

        counter_connector_prefix_cache_queries = self._counter_cls(
            name="vllm:external_prefix_cache_queries",
            documentation=(
                "External prefix cache queries from KV connector "
                "cross-instance cache sharing, in terms of number of queried tokens."
            ),
            labelnames=labelnames,
        )
        self.counter_connector_prefix_cache_queries = make_per_engine(
            counter_connector_prefix_cache_queries, engine_indexes, model_name
        )

        counter_connector_prefix_cache_hits = self._counter_cls(
            name="vllm:external_prefix_cache_hits",
            documentation=(
                "External prefix cache hits from KV connector "
                "cross-instance cache sharing, in terms of number of cached tokens."
            ),
            labelnames=labelnames,
        )
        self.counter_connector_prefix_cache_hits = make_per_engine(
            counter_connector_prefix_cache_hits, engine_indexes, model_name
        )

532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
        #
        # Multi-modal cache
        #

        counter_mm_cache_queries = self._counter_cls(
            name="vllm:mm_cache_queries",
            documentation=(
                "Multi-modal cache queries, in terms of number of queried items."
            ),
            labelnames=labelnames,
        )
        self.counter_mm_cache_queries = make_per_engine(
            counter_mm_cache_queries, engine_indexes, model_name
        )

        counter_mm_cache_hits = self._counter_cls(
            name="vllm:mm_cache_hits",
            documentation=(
                "Multi-modal cache hits, in terms of number of cached items."
            ),
            labelnames=labelnames,
        )
        self.counter_mm_cache_hits = make_per_engine(
            counter_mm_cache_hits, engine_indexes, model_name
        )

558
559
560
        #
        # Counters
        #
561
        counter_num_preempted_reqs = self._counter_cls(
562
            name="vllm:num_preemptions",
563
            documentation="Cumulative number of preemption from the engine.",
564
565
            labelnames=labelnames,
        )
566
        self.counter_num_preempted_reqs = make_per_engine(
567
568
            counter_num_preempted_reqs, engine_indexes, model_name
        )
569

570
        counter_prompt_tokens = self._counter_cls(
571
            name="vllm:prompt_tokens",
572
            documentation="Number of prefill tokens processed.",
573
574
575
576
577
            labelnames=labelnames,
        )
        self.counter_prompt_tokens = make_per_engine(
            counter_prompt_tokens, engine_indexes, model_name
        )
578

579
        counter_generation_tokens = self._counter_cls(
580
            name="vllm:generation_tokens",
581
            documentation="Number of generation tokens processed.",
582
583
            labelnames=labelnames,
        )
584
        self.counter_generation_tokens = make_per_engine(
585
586
            counter_generation_tokens, engine_indexes, model_name
        )
587

588
        self.counter_request_success: dict[FinishReason, dict[int, Counter]] = {}
589
        counter_request_success_base = self._counter_cls(
590
            name="vllm:request_success",
591
            documentation="Count of successfully processed requests.",
592
593
            labelnames=labelnames + ["finished_reason"],
        )
594
        for reason in FinishReason:
595
            self.counter_request_success[reason] = {
596
597
598
                idx: counter_request_success_base.labels(
                    model_name, str(idx), str(reason)
                )
599
600
                for idx in engine_indexes
            }
601

602
603
604
        #
        # Histograms of counts
        #
605
606
607
608
        histogram_num_prompt_tokens_request = self._histogram_cls(
            name="vllm:request_prompt_tokens",
            documentation="Number of prefill tokens processed.",
            buckets=build_1_2_5_buckets(max_model_len),
609
610
            labelnames=labelnames,
        )
611
        self.histogram_num_prompt_tokens_request = make_per_engine(
612
613
            histogram_num_prompt_tokens_request, engine_indexes, model_name
        )
614
615
616
617
618

        histogram_num_generation_tokens_request = self._histogram_cls(
            name="vllm:request_generation_tokens",
            documentation="Number of generation tokens processed.",
            buckets=build_1_2_5_buckets(max_model_len),
619
620
            labelnames=labelnames,
        )
621
        self.histogram_num_generation_tokens_request = make_per_engine(
622
623
            histogram_num_generation_tokens_request, engine_indexes, model_name
        )
624

625
626
627
        # TODO: This metric might be incorrect in case of using multiple
        # api_server counts which uses prometheus mp.
        # See: https://github.com/vllm-project/vllm/pull/18053
628
629
630
        histogram_iteration_tokens = self._histogram_cls(
            name="vllm:iteration_tokens_total",
            documentation="Histogram of number of tokens per engine_step.",
631
632
633
            buckets=[1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
            labelnames=labelnames,
        )
634
        self.histogram_iteration_tokens = make_per_engine(
635
636
            histogram_iteration_tokens, engine_indexes, model_name
        )
637
638
639

        histogram_max_num_generation_tokens_request = self._histogram_cls(
            name="vllm:request_max_num_generation_tokens",
640
            documentation="Histogram of maximum number of requested generation tokens.",
641
            buckets=build_1_2_5_buckets(max_model_len),
642
643
            labelnames=labelnames,
        )
644
        self.histogram_max_num_generation_tokens_request = make_per_engine(
645
646
            histogram_max_num_generation_tokens_request, engine_indexes, model_name
        )
647
648
649
650
651

        histogram_n_request = self._histogram_cls(
            name="vllm:request_params_n",
            documentation="Histogram of the n request parameter.",
            buckets=[1, 2, 5, 10, 20],
652
653
654
655
656
            labelnames=labelnames,
        )
        self.histogram_n_request = make_per_engine(
            histogram_n_request, engine_indexes, model_name
        )
657
658
659
660
661

        histogram_max_tokens_request = self._histogram_cls(
            name="vllm:request_params_max_tokens",
            documentation="Histogram of the max_tokens request parameter.",
            buckets=build_1_2_5_buckets(max_model_len),
662
663
            labelnames=labelnames,
        )
664
        self.histogram_max_tokens_request = make_per_engine(
665
666
            histogram_max_tokens_request, engine_indexes, model_name
        )
667
668
669
670

        #
        # Histogram of timing intervals
        #
671
672
673
674
        histogram_time_to_first_token = self._histogram_cls(
            name="vllm:time_to_first_token_seconds",
            documentation="Histogram of time to first token in seconds.",
            buckets=[
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
                0.001,
                0.005,
                0.01,
                0.02,
                0.04,
                0.06,
                0.08,
                0.1,
                0.25,
                0.5,
                0.75,
                1.0,
                2.5,
                5.0,
                7.5,
                10.0,
                20.0,
                40.0,
                80.0,
                160.0,
                640.0,
                2560.0,
697
            ],
698
699
            labelnames=labelnames,
        )
700
        self.histogram_time_to_first_token = make_per_engine(
701
702
            histogram_time_to_first_token, engine_indexes, model_name
        )
703

704
705
        # Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds
        # TODO: in 0.12, only enable if show_hidden_metrics=True
706
707
        histogram_time_per_output_token = self._histogram_cls(
            name="vllm:time_per_output_token_seconds",
708
709
            documentation=(
                "Histogram of time per output token in seconds."
710
711
                "DEPRECATED: Use vllm:inter_token_latency_seconds instead."
            ),
712
            buckets=[
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
                0.01,
                0.025,
                0.05,
                0.075,
                0.1,
                0.15,
                0.2,
                0.3,
                0.4,
                0.5,
                0.75,
                1.0,
                2.5,
                5.0,
                7.5,
                10.0,
                20.0,
                40.0,
                80.0,
732
            ],
733
734
            labelnames=labelnames,
        )
735
        self.histogram_time_per_output_token = make_per_engine(
736
737
            histogram_time_per_output_token, engine_indexes, model_name
        )
738

739
740
741
742
        histogram_inter_token_latency = self._histogram_cls(
            name="vllm:inter_token_latency_seconds",
            documentation="Histogram of inter-token latency in seconds.",
            buckets=[
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
                0.01,
                0.025,
                0.05,
                0.075,
                0.1,
                0.15,
                0.2,
                0.3,
                0.4,
                0.5,
                0.75,
                1.0,
                2.5,
                5.0,
                7.5,
                10.0,
                20.0,
                40.0,
                80.0,
762
            ],
763
764
            labelnames=labelnames,
        )
765
        self.histogram_inter_token_latency = make_per_engine(
766
767
            histogram_inter_token_latency, engine_indexes, model_name
        )
768

769
770
        histogram_request_time_per_output_token = self._histogram_cls(
            name="vllm:request_time_per_output_token_seconds",
771
            documentation="Histogram of time_per_output_token_seconds per request.",
772
            buckets=[
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
                0.01,
                0.025,
                0.05,
                0.075,
                0.1,
                0.15,
                0.2,
                0.3,
                0.4,
                0.5,
                0.75,
                1.0,
                2.5,
                5.0,
                7.5,
                10.0,
                20.0,
                40.0,
                80.0,
792
            ],
793
794
            labelnames=labelnames,
        )
795
        self.histogram_request_time_per_output_token = make_per_engine(
796
797
            histogram_request_time_per_output_token, engine_indexes, model_name
        )
798

799
        request_latency_buckets = [
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
            0.3,
            0.5,
            0.8,
            1.0,
            1.5,
            2.0,
            2.5,
            5.0,
            10.0,
            15.0,
            20.0,
            30.0,
            40.0,
            50.0,
            60.0,
            120.0,
            240.0,
            480.0,
            960.0,
            1920.0,
            7680.0,
821
        ]
822
823
824
825
        histogram_e2e_time_request = self._histogram_cls(
            name="vllm:e2e_request_latency_seconds",
            documentation="Histogram of e2e request latency in seconds.",
            buckets=request_latency_buckets,
826
827
            labelnames=labelnames,
        )
828
        self.histogram_e2e_time_request = make_per_engine(
829
830
            histogram_e2e_time_request, engine_indexes, model_name
        )
831
832
833

        histogram_queue_time_request = self._histogram_cls(
            name="vllm:request_queue_time_seconds",
834
            documentation="Histogram of time spent in WAITING phase for request.",
835
            buckets=request_latency_buckets,
836
837
            labelnames=labelnames,
        )
838
        self.histogram_queue_time_request = make_per_engine(
839
840
            histogram_queue_time_request, engine_indexes, model_name
        )
841
842
843

        histogram_inference_time_request = self._histogram_cls(
            name="vllm:request_inference_time_seconds",
844
            documentation="Histogram of time spent in RUNNING phase for request.",
845
            buckets=request_latency_buckets,
846
847
            labelnames=labelnames,
        )
848
        self.histogram_inference_time_request = make_per_engine(
849
850
            histogram_inference_time_request, engine_indexes, model_name
        )
851
852
853

        histogram_prefill_time_request = self._histogram_cls(
            name="vllm:request_prefill_time_seconds",
854
            documentation="Histogram of time spent in PREFILL phase for request.",
855
            buckets=request_latency_buckets,
856
857
            labelnames=labelnames,
        )
858
        self.histogram_prefill_time_request = make_per_engine(
859
860
            histogram_prefill_time_request, engine_indexes, model_name
        )
861
862
863

        histogram_decode_time_request = self._histogram_cls(
            name="vllm:request_decode_time_seconds",
864
            documentation="Histogram of time spent in DECODE phase for request.",
865
            buckets=request_latency_buckets,
866
867
            labelnames=labelnames,
        )
868
        self.histogram_decode_time_request = make_per_engine(
869
870
            histogram_decode_time_request, engine_indexes, model_name
        )
871

872
873
874
        #
        # LoRA metrics
        #
875
876
877

        # TODO: This metric might be incorrect in case of using multiple
        # api_server counts which uses prometheus mp.
878
        self.gauge_lora_info: Gauge | None = None
879
        if vllm_config.lora_config is not None:
880
            if len(self.engine_indexes) > 1:
881
                raise NotImplementedError("LoRA in DP mode is not supported yet.")
882
883
884
885
            self.labelname_max_lora = "max_lora"
            self.labelname_waiting_lora_adapters = "waiting_lora_adapters"
            self.labelname_running_lora_adapters = "running_lora_adapters"
            self.max_lora = vllm_config.lora_config.max_loras
886
887
888
889
890
891
892
893
894
895
            self.gauge_lora_info = self._gauge_cls(
                name="vllm:lora_requests_info",
                documentation="Running stats on lora requests.",
                multiprocess_mode="sum",
                labelnames=[
                    self.labelname_max_lora,
                    self.labelname_waiting_lora_adapters,
                    self.labelname_running_lora_adapters,
                ],
            )
896

897
898
    def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo):
        metrics_info = config_obj.metrics_info()
899
        metrics_info["engine"] = ""
900
901
902
903
904
905
906
907
908
909

        name, documentation = None, None
        if type == "cache_config":
            name = "vllm:cache_config_info"
            documentation = "Information of the LLMEngine CacheConfig"
        assert name is not None, f"Unknown metrics info type {type}"

        # Info type metrics are syntactic sugar for a gauge permanently set to 1
        # Since prometheus multiprocessing mode does not support Info, emulate
        # info here with a gauge.
910
        info_gauge = self._gauge_cls(
911
912
            name=name,
            documentation=documentation,
913
914
            multiprocess_mode="mostrecent",
            labelnames=metrics_info.keys(),
915
916
917
918
919
920
        )
        for engine_index in self.engine_indexes:
            metrics_info = config_obj.metrics_info()
            metrics_info["engine"] = str(engine_index)
            info_gauge.labels(**metrics_info).set(1)

921
922
    def record(
        self,
923
924
925
        scheduler_stats: SchedulerStats | None,
        iteration_stats: IterationStats | None,
        mm_cache_stats: MultiModalCacheStats | None = None,
926
927
        engine_idx: int = 0,
    ):
928
        """Log to prometheus."""
929
        if scheduler_stats is not None:
930
            self.gauge_scheduler_running[engine_idx].set(
931
932
                scheduler_stats.num_running_reqs
            )
933
            self.gauge_scheduler_waiting[engine_idx].set(
934
935
                scheduler_stats.num_waiting_reqs
            )
936

937
938
            if self.show_hidden_metrics:
                self.gauge_gpu_cache_usage[engine_idx].set(
939
940
941
                    scheduler_stats.kv_cache_usage
                )
            self.gauge_kv_cache_usage[engine_idx].set(scheduler_stats.kv_cache_usage)
942

943
944
            if self.show_hidden_metrics:
                self.counter_gpu_prefix_cache_queries[engine_idx].inc(
945
946
                    scheduler_stats.prefix_cache_stats.queries
                )
947
                self.counter_gpu_prefix_cache_hits[engine_idx].inc(
948
949
                    scheduler_stats.prefix_cache_stats.hits
                )
950

951
            self.counter_prefix_cache_queries[engine_idx].inc(
952
953
                scheduler_stats.prefix_cache_stats.queries
            )
954
            self.counter_prefix_cache_hits[engine_idx].inc(
955
956
                scheduler_stats.prefix_cache_stats.hits
            )
957

958
959
960
961
962
963
964
965
            if scheduler_stats.connector_prefix_cache_stats is not None:
                self.counter_connector_prefix_cache_queries[engine_idx].inc(
                    scheduler_stats.connector_prefix_cache_stats.queries
                )
                self.counter_connector_prefix_cache_hits[engine_idx].inc(
                    scheduler_stats.connector_prefix_cache_stats.hits
                )

966
967
            if scheduler_stats.spec_decoding_stats is not None:
                self.spec_decoding_prom.observe(
968
969
                    scheduler_stats.spec_decoding_stats, engine_idx
                )
970

971
972
973
974
975
            if scheduler_stats.kv_connector_stats is not None:
                self.kv_connector_prom.observe(
                    scheduler_stats.kv_connector_stats, engine_idx
                )

976
977
978
979
        if mm_cache_stats is not None:
            self.counter_mm_cache_queries[engine_idx].inc(mm_cache_stats.queries)
            self.counter_mm_cache_hits[engine_idx].inc(mm_cache_stats.hits)

980
981
982
        if iteration_stats is None:
            return

983
        self.counter_num_preempted_reqs[engine_idx].inc(
984
985
986
            iteration_stats.num_preempted_reqs
        )
        self.counter_prompt_tokens[engine_idx].inc(iteration_stats.num_prompt_tokens)
987
        self.counter_generation_tokens[engine_idx].inc(
988
989
            iteration_stats.num_generation_tokens
        )
990
        self.histogram_iteration_tokens[engine_idx].observe(
991
992
            iteration_stats.num_prompt_tokens + iteration_stats.num_generation_tokens
        )
993

994
        for max_gen_tokens in iteration_stats.max_num_generation_tokens_iter:
995
996
997
            self.histogram_max_num_generation_tokens_request[engine_idx].observe(
                max_gen_tokens
            )
998
        for n_param in iteration_stats.n_params_iter:
999
            self.histogram_n_request[engine_idx].observe(n_param)
1000
        for ttft in iteration_stats.time_to_first_tokens_iter:
1001
            self.histogram_time_to_first_token[engine_idx].observe(ttft)
1002
1003
1004
        for itl in iteration_stats.inter_token_latencies_iter:
            self.histogram_inter_token_latency[engine_idx].observe(itl)
            self.histogram_time_per_output_token[engine_idx].observe(itl)
1005

1006
        for finished_request in iteration_stats.finished_requests:
1007
1008
1009
            self.counter_request_success[finished_request.finish_reason][
                engine_idx
            ].inc()
1010
            self.histogram_e2e_time_request[engine_idx].observe(
1011
1012
                finished_request.e2e_latency
            )
1013
            self.histogram_queue_time_request[engine_idx].observe(
1014
1015
                finished_request.queued_time
            )
1016
            self.histogram_prefill_time_request[engine_idx].observe(
1017
1018
                finished_request.prefill_time
            )
1019
            self.histogram_inference_time_request[engine_idx].observe(
1020
1021
                finished_request.inference_time
            )
1022
            self.histogram_decode_time_request[engine_idx].observe(
1023
1024
                finished_request.decode_time
            )
1025
            self.histogram_num_prompt_tokens_request[engine_idx].observe(
1026
1027
                finished_request.num_prompt_tokens
            )
1028
            self.histogram_num_generation_tokens_request[engine_idx].observe(
1029
1030
                finished_request.num_generation_tokens
            )
1031
            self.histogram_request_time_per_output_token[engine_idx].observe(
1032
1033
                finished_request.mean_time_per_output_token
            )
1034
            if finished_request.max_tokens_param:
1035
                self.histogram_max_tokens_request[engine_idx].observe(
1036
1037
                    finished_request.max_tokens_param
                )
1038

1039
        if self.gauge_lora_info is not None:
1040
1041
1042
1043
1044
1045
            running_lora_adapters = ",".join(
                iteration_stats.running_lora_adapters.keys()
            )
            waiting_lora_adapters = ",".join(
                iteration_stats.waiting_lora_adapters.keys()
            )
1046
1047
1048
1049
1050
            lora_info_labels = {
                self.labelname_running_lora_adapters: running_lora_adapters,
                self.labelname_waiting_lora_adapters: waiting_lora_adapters,
                self.labelname_max_lora: self.max_lora,
            }
1051
            self.gauge_lora_info.labels(**lora_info_labels).set_to_current_time()
1052

1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
    def record_sleep_state(self, sleep: int = 0, level: int = 0):
        awake = 1
        discard_all = 0
        weights_offloaded = 0

        if sleep == 1:
            awake = 0
            if level == 1:
                weights_offloaded = 1
            elif level == 2:
                discard_all = 1

        for engine_idx in self.engine_indexes:
            self.gauge_engine_sleep_state["discard_all"][engine_idx].set(discard_all)
            self.gauge_engine_sleep_state["weights_offloaded"][engine_idx].set(
                weights_offloaded
            )
            self.gauge_engine_sleep_state["awake"][engine_idx].set(awake)

1072
1073
1074
    def log_engine_initialized(self):
        self.log_metrics_info("cache_config", self.vllm_config.cache_config)

1075

1076
PromMetric: TypeAlias = Gauge | Counter | Histogram
1077
1078


1079
1080
1081
def make_per_engine(
    metric: PromMetric, engine_idxs: list[int], model_name: str
) -> dict[int, PromMetric]:
1082
1083
1084
    return {idx: metric.labels(model_name, str(idx)) for idx in engine_idxs}


1085
def build_buckets(mantissa_lst: list[int], max_value: int) -> list[int]:
1086
1087
1088
1089
1090
1091
    """
    Builds a list of buckets with increasing powers of 10 multiplied by
    mantissa values until the value exceeds the specified maximum.

    """
    exponent = 0
1092
    buckets: list[int] = []
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
    while True:
        for m in mantissa_lst:
            value = m * 10**exponent
            if value <= max_value:
                buckets.append(value)
            else:
                return buckets
        exponent += 1


1103
def build_1_2_5_buckets(max_value: int) -> list[int]:
1104
1105
1106
1107
1108
1109
    """
    Example:
    >>> build_1_2_5_buckets(100)
    [1, 2, 5, 10, 20, 50, 100]
    """
    return build_buckets([1, 2, 5], max_value)
1110
1111


1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
class StatLoggerManager:
    """
    StatLoggerManager:
        Logging happens at the level of the EngineCore (per scheduler).
         * DP: >1 EngineCore per AsyncLLM - loggers for each EngineCore.
         * With Local Logger, just make N copies for N EngineCores.
         * With Prometheus, we need a single logger with N "labels"

        This class abstracts away this implementation detail from
        the AsyncLLM, allowing the AsyncLLM to just call .record()
        and .log() to a simple interface.
    """

    def __init__(
        self,
        vllm_config: VllmConfig,
1128
1129
        engine_idxs: list[int] | None = None,
        custom_stat_loggers: list[StatLoggerFactory] | None = None,
1130
        enable_default_loggers: bool = True,
1131
        aggregate_engine_logging: bool = False,
1132
        client_count: int = 1,
1133
    ):
1134
1135
1136
        self.engine_indexes = engine_idxs if engine_idxs else [0]
        self.stat_loggers: list[AggregateStatLoggerBase] = []
        stat_logger_factories: list[StatLoggerFactory] = []
1137
        if custom_stat_loggers is not None:
1138
            stat_logger_factories.extend(custom_stat_loggers)
1139
        if enable_default_loggers and logger.isEnabledFor(logging.INFO):
1140
1141
1142
            if client_count > 1:
                logger.warning(
                    "AsyncLLM created with api_server_count more than 1; "
1143
1144
                    "disabling stats logging to avoid incomplete stats."
                )
1145
            else:
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
                default_logger_factory = (
                    AggregatedLoggingStatLogger
                    if aggregate_engine_logging
                    else LoggingStatLogger
                )
                stat_logger_factories.append(default_logger_factory)
        custom_prometheus_logger: bool = False
        for stat_logger_factory in stat_logger_factories:
            if isinstance(stat_logger_factory, type) and issubclass(
                stat_logger_factory, AggregateStatLoggerBase
            ):
                global_stat_logger = stat_logger_factory(
                    vllm_config=vllm_config,
                    engine_indexes=self.engine_indexes,
                )
                if isinstance(global_stat_logger, PrometheusStatLogger):
                    custom_prometheus_logger = True
            else:
                # per engine logger
                global_stat_logger = PerEngineStatLoggerAdapter(
                    vllm_config=vllm_config,
                    engine_indexes=self.engine_indexes,
                    per_engine_stat_logger_factory=stat_logger_factory,  # type: ignore[arg-type]
                )
            self.stat_loggers.append(global_stat_logger)
        if not custom_prometheus_logger:
            self.stat_loggers.append(
                PrometheusStatLogger(vllm_config, self.engine_indexes)
            )
1175
1176
1177

    def record(
        self,
1178
1179
1180
1181
        scheduler_stats: SchedulerStats | None,
        iteration_stats: IterationStats | None,
        mm_cache_stats: MultiModalCacheStats | None = None,
        engine_idx: int | None = None,
1182
1183
1184
    ):
        if engine_idx is None:
            engine_idx = 0
1185
        for logger in self.stat_loggers:
1186
1187
1188
1189
1190
1191
            logger.record(
                scheduler_stats,
                iteration_stats,
                mm_cache_stats=mm_cache_stats,
                engine_idx=engine_idx,
            )
1192

1193
1194
1195
1196
    def record_sleep_state(self, sleep: int = 0, level: int = 0):
        for logger in self.stat_loggers:
            logger.record_sleep_state(sleep, level)

1197
    def log(self):
1198
1199
        for logger in self.stat_loggers:
            logger.log()
1200
1201

    def log_engine_initialized(self):
1202
1203
        for agg_logger in self.stat_loggers:
            agg_logger.log_engine_initialized()