loggers.py 42.4 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
import logging
5
6
import time
from abc import ABC, abstractmethod
7
8
from collections.abc import Callable
from typing import TypeAlias
9

10
from prometheus_client import Counter, Gauge, Histogram
11

12
import vllm.envs as envs
13
from vllm.config import SupportsMetricsInfo, VllmConfig
14
15
16
17
from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
    KVConnectorLogging,
    KVConnectorPrometheus,
)
18
from vllm.logger import init_logger
wangxiyuan's avatar
wangxiyuan committed
19
from vllm.plugins import STAT_LOGGER_PLUGINS_GROUP, load_plugins_by_group
20
from vllm.v1.engine import FinishReason
21
from vllm.v1.metrics.prometheus import unregister_vllm_metrics
22
23
24
25
26
27
from vllm.v1.metrics.stats import (
    CachingMetrics,
    IterationStats,
    MultiModalCacheStats,
    SchedulerStats,
)
28
from vllm.v1.spec_decode.metrics import SpecDecodingLogging, SpecDecodingProm
29
30
31

logger = init_logger(__name__)

32
33
34
PerEngineStatLoggerFactory = Callable[[VllmConfig, int], "StatLoggerBase"]
AggregateStatLoggerFactory = type["AggregateStatLoggerBase"]
StatLoggerFactory = AggregateStatLoggerFactory | PerEngineStatLoggerFactory
35

36
37

class StatLoggerBase(ABC):
38
39
40
41
42
43
44
45
    """Interface for logging metrics.

    API users may define custom loggers that implement this interface.
    However, note that the `SchedulerStats` and `IterationStats` classes
    are not considered stable interfaces and may change in future versions.
    """

    @abstractmethod
46
    def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): ...
47
48

    @abstractmethod
49
50
    def record(
        self,
51
52
53
        scheduler_stats: SchedulerStats | None,
        iteration_stats: IterationStats | None,
        mm_cache_stats: MultiModalCacheStats | None = None,
54
55
        engine_idx: int = 0,
    ): ...
56

57
    @abstractmethod
58
    def log_engine_initialized(self): ...
59

60
61
62
    def log(self):  # noqa
        pass

63
64
65
    def record_sleep_state(self, is_awake: int, level: int):  # noqa
        pass

66

67
68
69
def load_stat_logger_plugin_factories() -> list[StatLoggerFactory]:
    factories: list[StatLoggerFactory] = []

wangxiyuan's avatar
wangxiyuan committed
70
    for name, plugin_class in load_plugins_by_group(STAT_LOGGER_PLUGINS_GROUP).items():
71
72
73
74
75
76
77
78
79
80
81
82
83
        if not isinstance(plugin_class, type) or not issubclass(
            plugin_class, StatLoggerBase
        ):
            raise TypeError(
                f"Stat logger plugin {name!r} must be a subclass of "
                f"StatLoggerBase (got {plugin_class!r})."
            )

        factories.append(plugin_class)

    return factories


84
85
86
87
88
89
90
91
class AggregateStatLoggerBase(StatLoggerBase):
    """Abstract base class for loggers that
    aggregate across multiple DP engines."""

    @abstractmethod
    def __init__(self, vllm_config: VllmConfig, engine_indexes: list[int]): ...


92
class LoggingStatLogger(StatLoggerBase):
93
    def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
94
        self.engine_index = engine_index
95
        self.vllm_config = vllm_config
96
        self._reset(time.monotonic())
97

98
        self.last_scheduler_stats = SchedulerStats()
99
100

        # Caching metrics. This cannot be reset.
101
        # TODO: Make the interval configurable.
102
        self.prefix_caching_metrics = CachingMetrics()
103
        self.connector_prefix_caching_metrics = CachingMetrics()
104
105
        self.mm_caching_metrics = CachingMetrics()

106
        self.spec_decoding_logging = SpecDecodingLogging()
107
108
        kv_transfer_config = self.vllm_config.kv_transfer_config
        self.kv_connector_logging = KVConnectorLogging(kv_transfer_config)
109
110
        self.last_prompt_throughput: float = 0.0
        self.last_generation_throughput: float = 0.0
111
112
        self.engine_is_idle = False
        self.aggregated = False
113

114
115
116
117
    def _reset(self, now):
        self.last_log_time = now

        # Tracked stats over current local logging interval.
118
119
        self.num_prompt_tokens: int = 0
        self.num_generation_tokens: int = 0
120
        self.num_corrupted_reqs: int = 0
121
        self.num_preemptions: int = 0
122

123
124
    def _track_iteration_stats(self, iteration_stats: IterationStats):
        # Save tracked stats for token counters.
125
126
        self.num_prompt_tokens += iteration_stats.num_prompt_tokens
        self.num_generation_tokens += iteration_stats.num_generation_tokens
127
        self.num_corrupted_reqs += iteration_stats.num_corrupted_reqs
128
        self.num_preemptions += iteration_stats.num_preempted_reqs
129

130
    def _get_throughput(self, tracked_stats: int, now: float) -> float:
131
        # Compute summary metrics for tracked stats
132
133
134
135
        delta_time = now - self.last_log_time
        if delta_time <= 0.0:
            return 0.0
        return float(tracked_stats / delta_time)
136

137
138
139
140
    @property
    def log_prefix(self):
        return "Engine {:03d}: ".format(self.engine_index)

141
142
    def record(
        self,
143
144
145
        scheduler_stats: SchedulerStats | None,
        iteration_stats: IterationStats | None,
        mm_cache_stats: MultiModalCacheStats | None = None,
146
147
        engine_idx: int = 0,
    ):
148
        """Log Stats to standard output."""
149
150
        if iteration_stats:
            self._track_iteration_stats(iteration_stats)
151

152
        if scheduler_stats is not None:
153
            self.prefix_caching_metrics.observe(scheduler_stats.prefix_cache_stats)
154

155
156
157
158
159
            if scheduler_stats.connector_prefix_cache_stats is not None:
                self.connector_prefix_caching_metrics.observe(
                    scheduler_stats.connector_prefix_cache_stats
                )

160
            if scheduler_stats.spec_decoding_stats is not None:
161
                self.spec_decoding_logging.observe(scheduler_stats.spec_decoding_stats)
162
            if kv_connector_stats := scheduler_stats.kv_connector_stats:
163
                self.kv_connector_logging.observe(kv_connector_stats)
164
165
            if not self.aggregated:
                self.last_scheduler_stats = scheduler_stats
166
167
168
        if mm_cache_stats:
            self.mm_caching_metrics.observe(mm_cache_stats)

169
    def _update_stats(self):
170
        now = time.monotonic()
171
        prompt_throughput = self._get_throughput(self.num_prompt_tokens, now)
172
        generation_throughput = self._get_throughput(self.num_generation_tokens, now)
173
174

        self._reset(now)
175
        self.engine_is_idle = not any(
176
177
178
179
180
181
            (
                prompt_throughput,
                generation_throughput,
                self.last_prompt_throughput,
                self.last_generation_throughput,
            )
182
        )
183
184
185
        self.last_generation_throughput = generation_throughput
        self.last_prompt_throughput = prompt_throughput

186
187
188
189
190
191
192
193
194
    def aggregate_scheduler_stats(self):
        # noop for per engine loggers
        return

    def log(self):
        self._update_stats()
        self.aggregate_scheduler_stats()
        # Avoid log noise on an idle production system
        log_fn = logger.debug if self.engine_is_idle else logger.info
195
        # Format and print output.
196
197
198
199
200
201
202
        log_parts = [
            "Avg prompt throughput: %.1f tokens/s",
            "Avg generation throughput: %.1f tokens/s",
            "Running: %d reqs",
            "Waiting: %d reqs",
        ]
        log_args = [
203
204
205
206
            self.last_prompt_throughput,
            self.last_generation_throughput,
            self.last_scheduler_stats.num_running_reqs,
            self.last_scheduler_stats.num_waiting_reqs,
207
        ]
208

209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
        if self.num_preemptions > 0:
            log_parts.append("Preemptions: %d")
            log_args.append(self.num_preemptions)

        log_parts.extend(
            [
                "GPU KV cache usage: %.1f%%",
                "Prefix cache hit rate: %.1f%%",
            ]
        )
        log_args.extend(
            [
                self.last_scheduler_stats.kv_cache_usage * 100,
                self.prefix_caching_metrics.hit_rate * 100,
            ]
        )

226
227
228
        if envs.VLLM_COMPUTE_NANS_IN_LOGITS:
            log_parts.append("Corrupted: %d reqs")
            log_args.append(self.num_corrupted_reqs)
229
230
231
        if not self.connector_prefix_caching_metrics.empty:
            log_parts.append("External prefix cache hit rate: %.1f%%")
            log_args.append(self.connector_prefix_caching_metrics.hit_rate * 100)
232
        if not self.mm_caching_metrics.empty:
233
234
235
236
            log_parts.append("MM cache hit rate: %.1f%%")
            log_args.append(self.mm_caching_metrics.hit_rate * 100)

        log_fn(
237
            self.log_prefix + ", ".join(log_parts),
238
            *log_args,
239
        )
240

241
        self.spec_decoding_logging.log(log_fn=log_fn)
242
        self.kv_connector_logging.log(log_fn=log_fn)
243

244
    def log_engine_initialized(self):
245
        if self.vllm_config.cache_config.num_gpu_blocks:
246
            logger.debug(
247
                "Engine %03d: vllm cache_config_info with initialization "
248
249
250
251
                "after num_gpu_blocks is: %d",
                self.engine_index,
                self.vllm_config.cache_config.num_gpu_blocks,
            )
252

253

254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
class AggregatedLoggingStatLogger(LoggingStatLogger, AggregateStatLoggerBase):
    def __init__(
        self,
        vllm_config: VllmConfig,
        engine_indexes: list[int],
    ):
        self.engine_indexes = engine_indexes
        self.last_scheduler_stats_dict: dict[int, SchedulerStats] = {
            idx: SchedulerStats() for idx in self.engine_indexes
        }
        LoggingStatLogger.__init__(self, vllm_config, engine_index=-1)
        self.aggregated = True

    @property
    def log_prefix(self):
        return "{} Engines Aggregated: ".format(len(self.engine_indexes))

    def record(
        self,
        scheduler_stats: SchedulerStats | None,
        iteration_stats: IterationStats | None,
        mm_cache_stats: MultiModalCacheStats | None = None,
        engine_idx: int = 0,
    ):
        if engine_idx not in self.engine_indexes:
            logger.warning("Unexpected engine_idx: %d", engine_idx)
            return
        LoggingStatLogger.record(
            self,
            scheduler_stats,
            iteration_stats,
            mm_cache_stats=mm_cache_stats,
            engine_idx=engine_idx,
        )
        if scheduler_stats is not None:
            self.last_scheduler_stats_dict[engine_idx] = scheduler_stats

    def aggregate_scheduler_stats(self):
        self.last_scheduler_stats = SchedulerStats()
        for last_scheduler_stats in self.last_scheduler_stats_dict.values():
            self.last_scheduler_stats.num_waiting_reqs += (
                last_scheduler_stats.num_waiting_reqs
            )
            self.last_scheduler_stats.num_running_reqs += (
                last_scheduler_stats.num_running_reqs
            )
            self.last_scheduler_stats.kv_cache_usage += (
                last_scheduler_stats.kv_cache_usage
            )
        self.last_scheduler_stats.kv_cache_usage /= len(self.last_scheduler_stats_dict)

    def log(self):
        LoggingStatLogger.log(self)

    def log_engine_initialized(self):
        if self.vllm_config.cache_config.num_gpu_blocks:
            logger.info(
                "%d Engines: vllm cache_config_info with initialization "
                "after num_gpu_blocks is: %d",
                len(self.engine_indexes),
                self.vllm_config.cache_config.num_gpu_blocks,
            )


class PerEngineStatLoggerAdapter(AggregateStatLoggerBase):
    def __init__(
        self,
        vllm_config: VllmConfig,
        engine_indexes: list[int],
        per_engine_stat_logger_factory: PerEngineStatLoggerFactory,
    ) -> None:
        self.per_engine_stat_loggers = {}
        self.engine_indexes = engine_indexes
        for engine_index in engine_indexes:
            self.per_engine_stat_loggers[engine_index] = per_engine_stat_logger_factory(
                vllm_config, engine_index
            )

    def record(
        self,
        scheduler_stats: SchedulerStats | None,
        iteration_stats: IterationStats | None,
        mm_cache_stats: MultiModalCacheStats | None = None,
        engine_idx: int = 0,
    ):
        if engine_idx not in self.per_engine_stat_loggers:
            logger.warning("Unexpected engine_idx: %d", engine_idx)
            return
        self.per_engine_stat_loggers[engine_idx].record(
            scheduler_stats,
            iteration_stats,
            mm_cache_stats=mm_cache_stats,
            engine_idx=engine_idx,
        )

    def log(self):
        for per_engine_stat_logger in self.per_engine_stat_loggers.values():
            per_engine_stat_logger.log()

    def log_engine_initialized(self):
        for per_engine_stat_logger in self.per_engine_stat_loggers.values():
            per_engine_stat_logger.log_engine_initialized()


class PrometheusStatLogger(AggregateStatLoggerBase):
359
360
361
    _gauge_cls = Gauge
    _counter_cls = Counter
    _histogram_cls = Histogram
362
    _spec_decoding_cls = SpecDecodingProm
363
    _kv_connector_cls = KVConnectorPrometheus
364

365
    def __init__(
366
        self, vllm_config: VllmConfig, engine_indexes: list[int] | None = None
367
    ):
368
369
        if engine_indexes is None:
            engine_indexes = [0]
370

371
        self.engine_indexes = engine_indexes
372
373

        unregister_vllm_metrics()
374
        self.vllm_config = vllm_config
375
376
        # Use this flag to hide metrics that were deprecated in
        # a previous release and which will be removed future
377
        self.show_hidden_metrics = vllm_config.observability_config.show_hidden_metrics
378

379
        labelnames = ["model_name", "engine"]
380
        model_name = vllm_config.model_config.served_model_name
381
        max_model_len = vllm_config.model_config.max_model_len
382

383
        per_engine_labelvalues: dict[int, list[object]] = {
384
            idx: [model_name, str(idx)] for idx in engine_indexes
385
386
        }

387
        self.spec_decoding_prom = self._spec_decoding_cls(
388
389
390
391
            vllm_config.speculative_config, labelnames, per_engine_labelvalues
        )
        self.kv_connector_prom = self._kv_connector_cls(
            vllm_config, labelnames, per_engine_labelvalues
392
        )
393

394
395
396
        #
        # Scheduler state
        #
397
        gauge_scheduler_running = self._gauge_cls(
398
399
            name="vllm:num_requests_running",
            documentation="Number of requests in model execution batches.",
400
            multiprocess_mode="mostrecent",
401
402
403
404
405
            labelnames=labelnames,
        )
        self.gauge_scheduler_running = make_per_engine(
            gauge_scheduler_running, engine_indexes, model_name
        )
406

407
        gauge_scheduler_waiting = self._gauge_cls(
408
409
            name="vllm:num_requests_waiting",
            documentation="Number of requests waiting to be processed.",
410
            multiprocess_mode="mostrecent",
411
412
413
414
415
            labelnames=labelnames,
        )
        self.gauge_scheduler_waiting = make_per_engine(
            gauge_scheduler_waiting, engine_indexes, model_name
        )
416

417
418
419
420
421
422
423
424
425
426
427
428
429
430
        gauge_engine_sleep_state = self._gauge_cls(
            name="vllm:engine_sleep_state",
            documentation=(
                "Engine sleep state; awake = 0 means engine is sleeping; "
                "awake = 1 means engine is awake; "
                "weights_offloaded = 1 means sleep level 1; "
                "discard_all = 1 means sleep level 2."
            ),
            labelnames=labelnames + ["sleep_state"],
            multiprocess_mode="mostrecent",
        )

        self.gauge_engine_sleep_state = {}
        sleep_state = ["awake", "weights_offloaded", "discard_all"]
431

432
433
434
435
436
437
438
        for s in sleep_state:
            self.gauge_engine_sleep_state[s] = {
                idx: gauge_engine_sleep_state.labels(
                    engine=idx, model_name=model_name, sleep_state=s
                )
                for idx in engine_indexes
            }
439

440
441
        # Setting default values
        self.record_sleep_state()
442

443
        gauge_kv_cache_usage = self._gauge_cls(
444
445
            name="vllm:kv_cache_usage_perc",
            documentation="KV-cache usage. 1 means 100 percent usage.",
446
            multiprocess_mode="mostrecent",
447
448
449
450
451
            labelnames=labelnames,
        )
        self.gauge_kv_cache_usage = make_per_engine(
            gauge_kv_cache_usage, engine_indexes, model_name
        )
452

453
454
455
456
457
458
459
460
461
462
463
464
465
        if envs.VLLM_COMPUTE_NANS_IN_LOGITS:
            counter_corrupted_requests = self._counter_cls(
                name="vllm:corrupted_requests",
                documentation=(
                    "Corrupted requests, in terms of total number of requests "
                    "with NaNs in logits."
                ),
                labelnames=labelnames,
            )
            self.counter_corrupted_requests = make_per_engine(
                counter_corrupted_requests, engine_indexes, model_name
            )

466
        counter_prefix_cache_queries = self._counter_cls(
467
468
            name="vllm:prefix_cache_queries",
            documentation=(
469
470
471
472
                "Prefix cache queries, in terms of number of queried tokens."
            ),
            labelnames=labelnames,
        )
473
        self.counter_prefix_cache_queries = make_per_engine(
474
475
            counter_prefix_cache_queries, engine_indexes, model_name
        )
476

477
        counter_prefix_cache_hits = self._counter_cls(
478
            name="vllm:prefix_cache_hits",
479
480
481
            documentation=("Prefix cache hits, in terms of number of cached tokens."),
            labelnames=labelnames,
        )
482
        self.counter_prefix_cache_hits = make_per_engine(
483
484
            counter_prefix_cache_hits, engine_indexes, model_name
        )
485

486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
        #
        # External - KV connector prefix cache
        #

        counter_connector_prefix_cache_queries = self._counter_cls(
            name="vllm:external_prefix_cache_queries",
            documentation=(
                "External prefix cache queries from KV connector "
                "cross-instance cache sharing, in terms of number of queried tokens."
            ),
            labelnames=labelnames,
        )
        self.counter_connector_prefix_cache_queries = make_per_engine(
            counter_connector_prefix_cache_queries, engine_indexes, model_name
        )

        counter_connector_prefix_cache_hits = self._counter_cls(
            name="vllm:external_prefix_cache_hits",
            documentation=(
                "External prefix cache hits from KV connector "
                "cross-instance cache sharing, in terms of number of cached tokens."
            ),
            labelnames=labelnames,
        )
        self.counter_connector_prefix_cache_hits = make_per_engine(
            counter_connector_prefix_cache_hits, engine_indexes, model_name
        )

514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
        #
        # Multi-modal cache
        #

        counter_mm_cache_queries = self._counter_cls(
            name="vllm:mm_cache_queries",
            documentation=(
                "Multi-modal cache queries, in terms of number of queried items."
            ),
            labelnames=labelnames,
        )
        self.counter_mm_cache_queries = make_per_engine(
            counter_mm_cache_queries, engine_indexes, model_name
        )

        counter_mm_cache_hits = self._counter_cls(
            name="vllm:mm_cache_hits",
            documentation=(
                "Multi-modal cache hits, in terms of number of cached items."
            ),
            labelnames=labelnames,
        )
        self.counter_mm_cache_hits = make_per_engine(
            counter_mm_cache_hits, engine_indexes, model_name
        )

540
541
542
        #
        # Counters
        #
543
        counter_num_preempted_reqs = self._counter_cls(
544
            name="vllm:num_preemptions",
545
            documentation="Cumulative number of preemption from the engine.",
546
547
            labelnames=labelnames,
        )
548
        self.counter_num_preempted_reqs = make_per_engine(
549
550
            counter_num_preempted_reqs, engine_indexes, model_name
        )
551

552
        counter_prompt_tokens = self._counter_cls(
553
            name="vllm:prompt_tokens",
554
            documentation="Number of prefill tokens processed.",
555
556
557
558
559
            labelnames=labelnames,
        )
        self.counter_prompt_tokens = make_per_engine(
            counter_prompt_tokens, engine_indexes, model_name
        )
560

561
        counter_generation_tokens = self._counter_cls(
562
            name="vllm:generation_tokens",
563
            documentation="Number of generation tokens processed.",
564
565
            labelnames=labelnames,
        )
566
        self.counter_generation_tokens = make_per_engine(
567
568
            counter_generation_tokens, engine_indexes, model_name
        )
569

570
        self.counter_request_success: dict[FinishReason, dict[int, Counter]] = {}
571
        counter_request_success_base = self._counter_cls(
572
            name="vllm:request_success",
573
            documentation="Count of successfully processed requests.",
574
575
            labelnames=labelnames + ["finished_reason"],
        )
576
        for reason in FinishReason:
577
            self.counter_request_success[reason] = {
578
579
580
                idx: counter_request_success_base.labels(
                    model_name, str(idx), str(reason)
                )
581
582
                for idx in engine_indexes
            }
583

584
585
586
        #
        # Histograms of counts
        #
587
588
589
590
        histogram_num_prompt_tokens_request = self._histogram_cls(
            name="vllm:request_prompt_tokens",
            documentation="Number of prefill tokens processed.",
            buckets=build_1_2_5_buckets(max_model_len),
591
592
            labelnames=labelnames,
        )
593
        self.histogram_num_prompt_tokens_request = make_per_engine(
594
595
            histogram_num_prompt_tokens_request, engine_indexes, model_name
        )
596
597
598
599
600

        histogram_num_generation_tokens_request = self._histogram_cls(
            name="vllm:request_generation_tokens",
            documentation="Number of generation tokens processed.",
            buckets=build_1_2_5_buckets(max_model_len),
601
602
            labelnames=labelnames,
        )
603
        self.histogram_num_generation_tokens_request = make_per_engine(
604
605
            histogram_num_generation_tokens_request, engine_indexes, model_name
        )
606

607
608
609
        # TODO: This metric might be incorrect in case of using multiple
        # api_server counts which uses prometheus mp.
        # See: https://github.com/vllm-project/vllm/pull/18053
610
611
612
        histogram_iteration_tokens = self._histogram_cls(
            name="vllm:iteration_tokens_total",
            documentation="Histogram of number of tokens per engine_step.",
613
614
615
            buckets=[1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
            labelnames=labelnames,
        )
616
        self.histogram_iteration_tokens = make_per_engine(
617
618
            histogram_iteration_tokens, engine_indexes, model_name
        )
619
620
621

        histogram_max_num_generation_tokens_request = self._histogram_cls(
            name="vllm:request_max_num_generation_tokens",
622
            documentation="Histogram of maximum number of requested generation tokens.",
623
            buckets=build_1_2_5_buckets(max_model_len),
624
625
            labelnames=labelnames,
        )
626
        self.histogram_max_num_generation_tokens_request = make_per_engine(
627
628
            histogram_max_num_generation_tokens_request, engine_indexes, model_name
        )
629
630
631
632
633

        histogram_n_request = self._histogram_cls(
            name="vllm:request_params_n",
            documentation="Histogram of the n request parameter.",
            buckets=[1, 2, 5, 10, 20],
634
635
636
637
638
            labelnames=labelnames,
        )
        self.histogram_n_request = make_per_engine(
            histogram_n_request, engine_indexes, model_name
        )
639
640
641
642
643

        histogram_max_tokens_request = self._histogram_cls(
            name="vllm:request_params_max_tokens",
            documentation="Histogram of the max_tokens request parameter.",
            buckets=build_1_2_5_buckets(max_model_len),
644
645
            labelnames=labelnames,
        )
646
        self.histogram_max_tokens_request = make_per_engine(
647
648
            histogram_max_tokens_request, engine_indexes, model_name
        )
649
650
651
652

        #
        # Histogram of timing intervals
        #
653
654
655
656
        histogram_time_to_first_token = self._histogram_cls(
            name="vllm:time_to_first_token_seconds",
            documentation="Histogram of time to first token in seconds.",
            buckets=[
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
                0.001,
                0.005,
                0.01,
                0.02,
                0.04,
                0.06,
                0.08,
                0.1,
                0.25,
                0.5,
                0.75,
                1.0,
                2.5,
                5.0,
                7.5,
                10.0,
                20.0,
                40.0,
                80.0,
                160.0,
                640.0,
                2560.0,
679
            ],
680
681
            labelnames=labelnames,
        )
682
        self.histogram_time_to_first_token = make_per_engine(
683
684
            histogram_time_to_first_token, engine_indexes, model_name
        )
685

686
        # Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
        # With 0.12.x you can enable with --show-hidden-metrics-for-version=0.11
        # TODO: remove in 0.13.0
        if self.show_hidden_metrics:
            histogram_time_per_output_token = self._histogram_cls(
                name="vllm:time_per_output_token_seconds",
                documentation=(
                    "Histogram of time per output token in seconds."
                    "DEPRECATED: Use vllm:inter_token_latency_seconds instead."
                ),
                buckets=[
                    0.01,
                    0.025,
                    0.05,
                    0.075,
                    0.1,
                    0.15,
                    0.2,
                    0.3,
                    0.4,
                    0.5,
                    0.75,
                    1.0,
                    2.5,
                    5.0,
                    7.5,
                    10.0,
                    20.0,
                    40.0,
                    80.0,
                ],
                labelnames=labelnames,
            )
            self.histogram_time_per_output_token = make_per_engine(
                histogram_time_per_output_token, engine_indexes, model_name
            )
722

723
724
725
726
        histogram_inter_token_latency = self._histogram_cls(
            name="vllm:inter_token_latency_seconds",
            documentation="Histogram of inter-token latency in seconds.",
            buckets=[
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
                0.01,
                0.025,
                0.05,
                0.075,
                0.1,
                0.15,
                0.2,
                0.3,
                0.4,
                0.5,
                0.75,
                1.0,
                2.5,
                5.0,
                7.5,
                10.0,
                20.0,
                40.0,
                80.0,
746
            ],
747
748
            labelnames=labelnames,
        )
749
        self.histogram_inter_token_latency = make_per_engine(
750
751
            histogram_inter_token_latency, engine_indexes, model_name
        )
752

753
754
        histogram_request_time_per_output_token = self._histogram_cls(
            name="vllm:request_time_per_output_token_seconds",
755
            documentation="Histogram of time_per_output_token_seconds per request.",
756
            buckets=[
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
                0.01,
                0.025,
                0.05,
                0.075,
                0.1,
                0.15,
                0.2,
                0.3,
                0.4,
                0.5,
                0.75,
                1.0,
                2.5,
                5.0,
                7.5,
                10.0,
                20.0,
                40.0,
                80.0,
776
            ],
777
778
            labelnames=labelnames,
        )
779
        self.histogram_request_time_per_output_token = make_per_engine(
780
781
            histogram_request_time_per_output_token, engine_indexes, model_name
        )
782

783
        request_latency_buckets = [
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
            0.3,
            0.5,
            0.8,
            1.0,
            1.5,
            2.0,
            2.5,
            5.0,
            10.0,
            15.0,
            20.0,
            30.0,
            40.0,
            50.0,
            60.0,
            120.0,
            240.0,
            480.0,
            960.0,
            1920.0,
            7680.0,
805
        ]
806
807
808
809
        histogram_e2e_time_request = self._histogram_cls(
            name="vllm:e2e_request_latency_seconds",
            documentation="Histogram of e2e request latency in seconds.",
            buckets=request_latency_buckets,
810
811
            labelnames=labelnames,
        )
812
        self.histogram_e2e_time_request = make_per_engine(
813
814
            histogram_e2e_time_request, engine_indexes, model_name
        )
815
816
817

        histogram_queue_time_request = self._histogram_cls(
            name="vllm:request_queue_time_seconds",
818
            documentation="Histogram of time spent in WAITING phase for request.",
819
            buckets=request_latency_buckets,
820
821
            labelnames=labelnames,
        )
822
        self.histogram_queue_time_request = make_per_engine(
823
824
            histogram_queue_time_request, engine_indexes, model_name
        )
825
826
827

        histogram_inference_time_request = self._histogram_cls(
            name="vllm:request_inference_time_seconds",
828
            documentation="Histogram of time spent in RUNNING phase for request.",
829
            buckets=request_latency_buckets,
830
831
            labelnames=labelnames,
        )
832
        self.histogram_inference_time_request = make_per_engine(
833
834
            histogram_inference_time_request, engine_indexes, model_name
        )
835
836
837

        histogram_prefill_time_request = self._histogram_cls(
            name="vllm:request_prefill_time_seconds",
838
            documentation="Histogram of time spent in PREFILL phase for request.",
839
            buckets=request_latency_buckets,
840
841
            labelnames=labelnames,
        )
842
        self.histogram_prefill_time_request = make_per_engine(
843
844
            histogram_prefill_time_request, engine_indexes, model_name
        )
845
846
847

        histogram_decode_time_request = self._histogram_cls(
            name="vllm:request_decode_time_seconds",
848
            documentation="Histogram of time spent in DECODE phase for request.",
849
            buckets=request_latency_buckets,
850
851
            labelnames=labelnames,
        )
852
        self.histogram_decode_time_request = make_per_engine(
853
854
            histogram_decode_time_request, engine_indexes, model_name
        )
855

856
857
858
        #
        # LoRA metrics
        #
859
860
861

        # TODO: This metric might be incorrect in case of using multiple
        # api_server counts which uses prometheus mp.
862
        self.gauge_lora_info: Gauge | None = None
863
        if vllm_config.lora_config is not None:
864
            if len(self.engine_indexes) > 1:
865
                raise NotImplementedError("LoRA in DP mode is not supported yet.")
866
867
868
869
            self.labelname_max_lora = "max_lora"
            self.labelname_waiting_lora_adapters = "waiting_lora_adapters"
            self.labelname_running_lora_adapters = "running_lora_adapters"
            self.max_lora = vllm_config.lora_config.max_loras
870
871
872
873
874
875
876
877
878
879
            self.gauge_lora_info = self._gauge_cls(
                name="vllm:lora_requests_info",
                documentation="Running stats on lora requests.",
                multiprocess_mode="sum",
                labelnames=[
                    self.labelname_max_lora,
                    self.labelname_waiting_lora_adapters,
                    self.labelname_running_lora_adapters,
                ],
            )
880

881
882
    def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo):
        metrics_info = config_obj.metrics_info()
883
        metrics_info["engine"] = ""
884
885
886
887
888
889
890
891
892
893

        name, documentation = None, None
        if type == "cache_config":
            name = "vllm:cache_config_info"
            documentation = "Information of the LLMEngine CacheConfig"
        assert name is not None, f"Unknown metrics info type {type}"

        # Info type metrics are syntactic sugar for a gauge permanently set to 1
        # Since prometheus multiprocessing mode does not support Info, emulate
        # info here with a gauge.
894
        info_gauge = self._gauge_cls(
895
896
            name=name,
            documentation=documentation,
897
898
            multiprocess_mode="mostrecent",
            labelnames=metrics_info.keys(),
899
900
901
902
903
904
        )
        for engine_index in self.engine_indexes:
            metrics_info = config_obj.metrics_info()
            metrics_info["engine"] = str(engine_index)
            info_gauge.labels(**metrics_info).set(1)

905
906
    def record(
        self,
907
908
909
        scheduler_stats: SchedulerStats | None,
        iteration_stats: IterationStats | None,
        mm_cache_stats: MultiModalCacheStats | None = None,
910
911
        engine_idx: int = 0,
    ):
912
        """Log to prometheus."""
913
        if scheduler_stats is not None:
914
            self.gauge_scheduler_running[engine_idx].set(
915
916
                scheduler_stats.num_running_reqs
            )
917
            self.gauge_scheduler_waiting[engine_idx].set(
918
919
920
                scheduler_stats.num_waiting_reqs
            )
            self.gauge_kv_cache_usage[engine_idx].set(scheduler_stats.kv_cache_usage)
921

922
            self.counter_prefix_cache_queries[engine_idx].inc(
923
924
                scheduler_stats.prefix_cache_stats.queries
            )
925
            self.counter_prefix_cache_hits[engine_idx].inc(
926
927
                scheduler_stats.prefix_cache_stats.hits
            )
928

929
930
931
932
933
934
935
936
            if scheduler_stats.connector_prefix_cache_stats is not None:
                self.counter_connector_prefix_cache_queries[engine_idx].inc(
                    scheduler_stats.connector_prefix_cache_stats.queries
                )
                self.counter_connector_prefix_cache_hits[engine_idx].inc(
                    scheduler_stats.connector_prefix_cache_stats.hits
                )

937
938
            if scheduler_stats.spec_decoding_stats is not None:
                self.spec_decoding_prom.observe(
939
940
                    scheduler_stats.spec_decoding_stats, engine_idx
                )
941

942
943
944
945
946
            if scheduler_stats.kv_connector_stats is not None:
                self.kv_connector_prom.observe(
                    scheduler_stats.kv_connector_stats, engine_idx
                )

947
948
949
950
951
952
953
954
955
956
957
958
959
960
            if self.gauge_lora_info is not None:
                running_lora_adapters = ",".join(
                    scheduler_stats.running_lora_adapters.keys()
                )
                waiting_lora_adapters = ",".join(
                    scheduler_stats.waiting_lora_adapters.keys()
                )
                lora_info_labels = {
                    self.labelname_running_lora_adapters: running_lora_adapters,
                    self.labelname_waiting_lora_adapters: waiting_lora_adapters,
                    self.labelname_max_lora: self.max_lora,
                }
                self.gauge_lora_info.labels(**lora_info_labels).set_to_current_time()

961
962
963
964
        if mm_cache_stats is not None:
            self.counter_mm_cache_queries[engine_idx].inc(mm_cache_stats.queries)
            self.counter_mm_cache_hits[engine_idx].inc(mm_cache_stats.hits)

965
966
        if iteration_stats is None:
            return
967
968
969
970
        if envs.VLLM_COMPUTE_NANS_IN_LOGITS:
            self.counter_corrupted_requests[engine_idx].inc(
                iteration_stats.num_corrupted_reqs
            )
971
        self.counter_num_preempted_reqs[engine_idx].inc(
972
973
974
            iteration_stats.num_preempted_reqs
        )
        self.counter_prompt_tokens[engine_idx].inc(iteration_stats.num_prompt_tokens)
975
        self.counter_generation_tokens[engine_idx].inc(
976
977
            iteration_stats.num_generation_tokens
        )
978
        self.histogram_iteration_tokens[engine_idx].observe(
979
980
            iteration_stats.num_prompt_tokens + iteration_stats.num_generation_tokens
        )
981

982
        for max_gen_tokens in iteration_stats.max_num_generation_tokens_iter:
983
984
985
            self.histogram_max_num_generation_tokens_request[engine_idx].observe(
                max_gen_tokens
            )
986
        for n_param in iteration_stats.n_params_iter:
987
            self.histogram_n_request[engine_idx].observe(n_param)
988
        for ttft in iteration_stats.time_to_first_tokens_iter:
989
            self.histogram_time_to_first_token[engine_idx].observe(ttft)
990
991
        for itl in iteration_stats.inter_token_latencies_iter:
            self.histogram_inter_token_latency[engine_idx].observe(itl)
992
993
            if self.show_hidden_metrics:
                self.histogram_time_per_output_token[engine_idx].observe(itl)
994

995
        for finished_request in iteration_stats.finished_requests:
996
997
998
            self.counter_request_success[finished_request.finish_reason][
                engine_idx
            ].inc()
999
            self.histogram_e2e_time_request[engine_idx].observe(
1000
1001
                finished_request.e2e_latency
            )
1002
            self.histogram_queue_time_request[engine_idx].observe(
1003
1004
                finished_request.queued_time
            )
1005
            self.histogram_prefill_time_request[engine_idx].observe(
1006
1007
                finished_request.prefill_time
            )
1008
            self.histogram_inference_time_request[engine_idx].observe(
1009
1010
                finished_request.inference_time
            )
1011
            self.histogram_decode_time_request[engine_idx].observe(
1012
1013
                finished_request.decode_time
            )
1014
            self.histogram_num_prompt_tokens_request[engine_idx].observe(
1015
1016
                finished_request.num_prompt_tokens
            )
1017
            self.histogram_num_generation_tokens_request[engine_idx].observe(
1018
1019
                finished_request.num_generation_tokens
            )
1020
            self.histogram_request_time_per_output_token[engine_idx].observe(
1021
1022
                finished_request.mean_time_per_output_token
            )
1023
            if finished_request.max_tokens_param:
1024
                self.histogram_max_tokens_request[engine_idx].observe(
1025
1026
                    finished_request.max_tokens_param
                )
1027

1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
    def record_sleep_state(self, sleep: int = 0, level: int = 0):
        awake = 1
        discard_all = 0
        weights_offloaded = 0

        if sleep == 1:
            awake = 0
            if level == 1:
                weights_offloaded = 1
            elif level == 2:
                discard_all = 1

        for engine_idx in self.engine_indexes:
            self.gauge_engine_sleep_state["discard_all"][engine_idx].set(discard_all)
            self.gauge_engine_sleep_state["weights_offloaded"][engine_idx].set(
                weights_offloaded
            )
            self.gauge_engine_sleep_state["awake"][engine_idx].set(awake)

1047
1048
1049
    def log_engine_initialized(self):
        self.log_metrics_info("cache_config", self.vllm_config.cache_config)

1050

1051
PromMetric: TypeAlias = Gauge | Counter | Histogram
1052
1053


1054
def make_per_engine(
1055
    metric: PromMetric, engine_idxs: list[int], model_name: object
1056
) -> dict[int, PromMetric]:
1057
1058
1059
    return {idx: metric.labels(model_name, str(idx)) for idx in engine_idxs}


1060
def build_buckets(mantissa_lst: list[int], max_value: int) -> list[int]:
1061
1062
1063
1064
1065
1066
    """
    Builds a list of buckets with increasing powers of 10 multiplied by
    mantissa values until the value exceeds the specified maximum.

    """
    exponent = 0
1067
    buckets: list[int] = []
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
    while True:
        for m in mantissa_lst:
            value = m * 10**exponent
            if value <= max_value:
                buckets.append(value)
            else:
                return buckets
        exponent += 1


1078
def build_1_2_5_buckets(max_value: int) -> list[int]:
1079
1080
1081
1082
1083
1084
    """
    Example:
    >>> build_1_2_5_buckets(100)
    [1, 2, 5, 10, 20, 50, 100]
    """
    return build_buckets([1, 2, 5], max_value)
1085
1086


1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
class StatLoggerManager:
    """
    StatLoggerManager:
        Logging happens at the level of the EngineCore (per scheduler).
         * DP: >1 EngineCore per AsyncLLM - loggers for each EngineCore.
         * With Local Logger, just make N copies for N EngineCores.
         * With Prometheus, we need a single logger with N "labels"

        This class abstracts away this implementation detail from
        the AsyncLLM, allowing the AsyncLLM to just call .record()
        and .log() to a simple interface.
    """

    def __init__(
        self,
        vllm_config: VllmConfig,
1103
1104
        engine_idxs: list[int] | None = None,
        custom_stat_loggers: list[StatLoggerFactory] | None = None,
1105
        enable_default_loggers: bool = True,
1106
        aggregate_engine_logging: bool = False,
1107
        client_count: int = 1,
1108
    ):
1109
1110
1111
        self.engine_indexes = engine_idxs if engine_idxs else [0]
        self.stat_loggers: list[AggregateStatLoggerBase] = []
        stat_logger_factories: list[StatLoggerFactory] = []
1112
        if custom_stat_loggers is not None:
1113
            stat_logger_factories.extend(custom_stat_loggers)
1114
        if enable_default_loggers and logger.isEnabledFor(logging.INFO):
1115
1116
1117
            if client_count > 1:
                logger.warning(
                    "AsyncLLM created with api_server_count more than 1; "
1118
1119
                    "disabling stats logging to avoid incomplete stats."
                )
1120
            else:
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
                default_logger_factory = (
                    AggregatedLoggingStatLogger
                    if aggregate_engine_logging
                    else LoggingStatLogger
                )
                stat_logger_factories.append(default_logger_factory)
        custom_prometheus_logger: bool = False
        for stat_logger_factory in stat_logger_factories:
            if isinstance(stat_logger_factory, type) and issubclass(
                stat_logger_factory, AggregateStatLoggerBase
            ):
                global_stat_logger = stat_logger_factory(
                    vllm_config=vllm_config,
                    engine_indexes=self.engine_indexes,
                )
                if isinstance(global_stat_logger, PrometheusStatLogger):
                    custom_prometheus_logger = True
            else:
                # per engine logger
                global_stat_logger = PerEngineStatLoggerAdapter(
                    vllm_config=vllm_config,
                    engine_indexes=self.engine_indexes,
                    per_engine_stat_logger_factory=stat_logger_factory,  # type: ignore[arg-type]
                )
            self.stat_loggers.append(global_stat_logger)
        if not custom_prometheus_logger:
            self.stat_loggers.append(
                PrometheusStatLogger(vllm_config, self.engine_indexes)
            )
1150
1151
1152

    def record(
        self,
1153
1154
1155
1156
        scheduler_stats: SchedulerStats | None,
        iteration_stats: IterationStats | None,
        mm_cache_stats: MultiModalCacheStats | None = None,
        engine_idx: int | None = None,
1157
1158
1159
    ):
        if engine_idx is None:
            engine_idx = 0
1160
        for logger in self.stat_loggers:
1161
1162
1163
1164
1165
1166
            logger.record(
                scheduler_stats,
                iteration_stats,
                mm_cache_stats=mm_cache_stats,
                engine_idx=engine_idx,
            )
1167

1168
1169
1170
1171
    def record_sleep_state(self, sleep: int = 0, level: int = 0):
        for logger in self.stat_loggers:
            logger.record_sleep_state(sleep, level)

1172
    def log(self):
1173
1174
        for logger in self.stat_loggers:
            logger.log()
1175
1176

    def log_engine_initialized(self):
1177
1178
        for agg_logger in self.stat_loggers:
            agg_logger.log_engine_initialized()