[Metrics] Add bucket for `request_latency`, `time_to_first_token` and...

[Metrics] Add bucket for `request_latency`, `time_to_first_token` and `time_per_output_token` (#15202) Signed-off-by: Kay Yan <kay.yan@daocloud.io>

[Metrics] Add bucket for `request_latency`, `time_to_first_token` and...
[Metrics] Add bucket for `request_latency`, `time_to_first_token` and `time_per_output_token` (#15202) Signed-off-by: Kay Yan <kay.yan@daocloud.io>
86fc2321 · Kay Yan · GitHub · 2549c0df · 86fc2321 · 86fc2321
Unverified Commit 86fc2321 authored Apr 07, 2025 by Kay Yan Committed by GitHub Apr 06, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 8 additions and 6 deletions

vllm/engine/metrics.py vllm/engine/metrics.py +4 -3

vllm/v1/metrics/loggers.py vllm/v1/metrics/loggers.py +4 -3

No files found.
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -156,7 +156,8 @@ class Metrics:
            labelnames=labelnames,
            buckets=[
                0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
-                0.75, 1.0, 2.5, 5.0, 7.5, 10.0
+                0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0,
+                2560.0
            ])
        self.histogram_time_per_output_token = self._histogram_cls(
            name="vllm:time_per_output_token_seconds",
@@ -164,14 +165,14 @@ class Metrics:
            labelnames=labelnames,
            buckets=[
                0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
-                1.0, 2.5
+                1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
            ])
        # Request stats
        #   Latency
        request_latency_buckets = [
            0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
-            40.0, 50.0, 60.0
+            40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0
        ]
        self.histogram_e2e_time_request = self._histogram_cls(
            name="vllm:e2e_request_latency_seconds",

--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -239,7 +239,8 @@ class PrometheusStatLogger(StatLoggerBase):
                documentation="Histogram of time to first token in seconds.",
                buckets=[
                    0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
-                    0.75, 1.0, 2.5, 5.0, 7.5, 10.0
+                    0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0,
+                    640.0, 2560.0
                ],
                labelnames=labelnames).labels(*labelvalues)
@@ -249,13 +250,13 @@ class PrometheusStatLogger(StatLoggerBase):
                documentation="Histogram of time per output token in seconds.",
                buckets=[
                    0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5,
-                    0.75, 1.0, 2.5
+                    0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
                ],
                labelnames=labelnames).labels(*labelvalues)
        request_latency_buckets = [
            0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
-            40.0, 50.0, 60.0
+            40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0
        ]
        self.histogram_e2e_time_request = \
            prometheus_client.Histogram(