[Metrics] Complete removal of deprecated vllm:time_per_output_token_seconds metric (#32661)

This PR completes the removal of the deprecated `vllm:time_per_output_token_seconds` metric, which has been superseded by `vllm:inter_token_latency_seconds`. The old metric was deprecated in v0.11, hidden in v0.12, and scheduled for removal in v0.13, but its removal was delayed until v0.15.

Signed-off-by: carlory <baofa.fan@daocloud.io>
Co-authored-by: Claude Haiku 4.5 <noreply@anthropic.com>

[Metrics] Complete removal of deprecated vllm:time_per_output_token_seconds metric (#32661)
This PR completes the removal of the deprecated `vllm:time_per_output_token_seconds` metric, which has been superseded by `vllm:inter_token_latency_seconds`. The old metric was deprecated in v0.11, hidden in v0.12, and scheduled for removal in v0.13, but its removal was delayed until v0.15.

Signed-off-by: carlory <baofa.fan@daocloud.io>
Co-authored-by: Claude Haiku 4.5 <noreply@anthropic.com>
bb917203 · 杨朱 · Kiki · GitHub · c4e5bdf6 · bb917203 · bb917203
Unverified Commit bb917203 authored Jan 20, 2026 by 杨朱 · Kiki Committed by GitHub Jan 20, 2026
5 changed files
--- a/docs/design/metrics.md
+++ b/docs/design/metrics.md
@@ -49,7 +49,7 @@ The subset of metrics exposed in the Grafana dashboard gives us an indication of
 - `vllm:e2e_request_latency_seconds_bucket` - End to end request latency measured in seconds.
 - `vllm:prompt_tokens` - Prompt tokens.
 - `vllm:generation_tokens` - Generation tokens.
- `vllm:time_per_output_token_seconds` - Inter-token latency (Time Per Output Token, TPOT) in seconds.
+- `vllm:inter_token_latency_seconds` - Inter-token latency (Time Per Output Token, TPOT) in seconds.
 - `vllm:time_to_first_token_seconds` - Time to First Token (TTFT) latency in seconds.
 - `vllm:num_requests_running` (also, `_swapped` and `_waiting`) - Number of requests in the RUNNING, WAITING, and SWAPPED states.
 - `vllm:kv_cache_usage_perc` - Percentage of used cache blocks by vLLM.

--- a/examples/online_serving/dashboards/grafana/performance_statistics.json
+++ b/examples/online_serving/dashboards/grafana/performance_statistics.json
@@ -884,7 +884,7 @@
      "targets": [
        {
          "editorMode": "code",
-          "expr": "rate(vllm:time_per_output_token_seconds_sum[$__interval]) / rate(vllm:time_per_output_token_seconds_count[$__interval])",
+          "expr": "rate(vllm:inter_token_latency_seconds_sum[$__interval]) / rate(vllm:inter_token_latency_seconds_count[$__interval])",
          "legendFormat": "ITL (Avg)",
          "range": true,
          "refId": "A"
@@ -895,7 +895,7 @@
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
-          "expr": "histogram_quantile(0.50, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__interval])))",
+          "expr": "histogram_quantile(0.50, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket[$__interval])))",
          "hide": false,
          "instant": false,
          "legendFormat": "ITL (p50)",
@@ -908,7 +908,7 @@
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
-          "expr": "histogram_quantile(0.90, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__interval])))",
+          "expr": "histogram_quantile(0.90, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket[$__interval])))",
          "hide": false,
          "instant": false,
          "legendFormat": "ITL (p90)",
@@ -921,7 +921,7 @@
            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
-          "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__interval])))",
+          "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket[$__interval])))",
          "hide": false,
          "instant": false,
          "legendFormat": "ITL (p99)",
@@ -990,7 +990,7 @@
      "targets": [
        {
          "editorMode": "code",
-          "expr": "histogram_quantile(0.90, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__range])))",
+          "expr": "histogram_quantile(0.90, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket[$__range])))",
          "legendFormat": "__auto",
          "range": true,
          "refId": "A"
@@ -1057,7 +1057,7 @@
      "targets": [
        {
          "editorMode": "code",
-          "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__range])))",
+          "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket[$__range])))",
          "legendFormat": "__auto",
          "range": true,
          "refId": "A"
@@ -1124,7 +1124,7 @@
      "targets": [
        {
          "editorMode": "code",
-          "expr": "(sum(increase(vllm:time_per_output_token_seconds_sum[$__range])) / sum(increase(vllm:time_per_output_token_seconds_count[$__range])))",
+          "expr": "(sum(increase(vllm:inter_token_latency_seconds_sum[$__range])) / sum(increase(vllm:inter_token_latency_seconds_count[$__range])))",
          "legendFormat": "__auto",
          "range": true,
          "refId": "A"
@@ -1191,7 +1191,7 @@
      "targets": [
        {
          "editorMode": "code",
-          "expr": "histogram_quantile(0.50, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__range])))",
+          "expr": "histogram_quantile(0.50, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket[$__range])))",
          "legendFormat": "__auto",
          "range": true,
          "refId": "A"

--- a/examples/online_serving/dashboards/perses/performance_statistics.yaml
+++ b/examples/online_serving/dashboards/perses/performance_statistics.yaml
@@ -309,9 +309,9 @@ spec:
                    kind: PrometheusDatasource
                    name: accelerators-thanos-querier-datasource
                  query: >
-                    sum by (model_name) (rate(vllm:time_per_output_token_seconds_sum{model_name=~"$Deployment_id"}[$__interval]))
+                    sum by (model_name) (rate(vllm:inter_token_latency_seconds_sum{model_name=~"$Deployment_id"}[$__interval]))
                    /
-                    sum by (model_name) (rate(vllm:time_per_output_token_seconds_count{model_name=~"$Deployment_id"}[$__interval]))
+                    sum by (model_name) (rate(vllm:inter_token_latency_seconds_count{model_name=~"$Deployment_id"}[$__interval]))
                  seriesNameFormat: '{{model_name}}'
          - kind: TimeSeriesQuery
            spec:
@@ -325,7 +325,7 @@ spec:
                    histogram_quantile(
                      0.50,
                      sum by (le, model_name) (
-                        rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
+                        rate(vllm:inter_token_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
                      )
                    )
                  seriesNameFormat: '{{model_name}} p50'
@@ -341,7 +341,7 @@ spec:
                    histogram_quantile(
                      0.90,
                      sum by (le, model_name) (
-                        rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
+                        rate(vllm:inter_token_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
                      )
                    )
                  seriesNameFormat: '{{model_name}} p90'
@@ -357,7 +357,7 @@ spec:
                    histogram_quantile(
                      0.99,
                      sum by (le, model_name) (
-                        rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
+                        rate(vllm:inter_token_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
                      )
                    )
                  seriesNameFormat: '{{model_name}} p99'
@@ -381,9 +381,9 @@ spec:
                    kind: PrometheusDatasource
                    name: accelerators-thanos-querier-datasource
                  query: >
-                    (sum by (model_name) (increase(vllm:time_per_output_token_seconds_sum{model_name=~"$Deployment_id"}[$__range])))
+                    (sum by (model_name) (increase(vllm:inter_token_latency_seconds_sum{model_name=~"$Deployment_id"}[$__range])))
                    /
-                    (sum by (model_name) (increase(vllm:time_per_output_token_seconds_count{model_name=~"$Deployment_id"}[$__range])))
+                    (sum by (model_name) (increase(vllm:inter_token_latency_seconds_count{model_name=~"$Deployment_id"}[$__range])))

    "13":
      kind: Panel
@@ -407,7 +407,7 @@ spec:
                    histogram_quantile(
                      0.50,
                      sum by (le, model_name) (
-                        rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
+                        rate(vllm:inter_token_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
                      )
                    )

@@ -433,7 +433,7 @@ spec:
                    histogram_quantile(
                      0.90,
                      sum by (le, model_name) (
-                        rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
+                        rate(vllm:inter_token_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
                      )
                    )

@@ -459,7 +459,7 @@ spec:
                    histogram_quantile(
                      0.99,
                      sum by (le, model_name) (
-                        rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
+                        rate(vllm:inter_token_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
                      )
                    )


--- a/tests/entrypoints/instrumentator/test_metrics.py
+++ b/tests/entrypoints/instrumentator/test_metrics.py
@@ -77,7 +77,7 @@ def _get_expected_values(num_requests: int, prompt_ids: list[int], max_tokens: i
    # {metric_family: [(suffix, expected_value)]}
    return {
        "vllm:time_to_first_token_seconds": [("_count", num_requests)],
-        "vllm:time_per_output_token_seconds": [
+        "vllm:inter_token_latency_seconds": [
            ("_count", num_requests * (max_tokens - 1))
        ],
        "vllm:e2e_request_latency_seconds": [("_count", num_requests)],
@@ -203,9 +203,6 @@ EXPECTED_METRICS_V1 = [
    "vllm:request_params_max_tokens_sum",
    "vllm:request_params_max_tokens_bucket",
    "vllm:request_params_max_tokens_count",
-    "vllm:time_per_output_token_seconds_sum",
-    "vllm:time_per_output_token_seconds_bucket",
-    "vllm:time_per_output_token_seconds_count",
    "vllm:time_to_first_token_seconds_sum",
    "vllm:time_to_first_token_seconds_bucket",
    "vllm:time_to_first_token_seconds_count",
@@ -238,9 +235,6 @@ HIDDEN_DEPRECATED_METRICS: list[str] = [
    "vllm:gpu_cache_usage_perc",
    "vllm:gpu_prefix_cache_queries",
    "vllm:gpu_prefix_cache_hits",
-    "vllm:time_per_output_token_seconds_sum",
-    "vllm:time_per_output_token_seconds_bucket",
-    "vllm:time_per_output_token_seconds_count",
 ]



--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -715,43 +715,6 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
            histogram_time_to_first_token, engine_indexes, model_name
        )

-        # Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds
-        # With 0.12.x you can enable with --show-hidden-metrics-for-version=0.11
-        # TODO: remove in 0.13.0
-        if self.show_hidden_metrics:
-            histogram_time_per_output_token = self._histogram_cls(
-                name="vllm:time_per_output_token_seconds",
-                documentation=(
-                    "Histogram of time per output token in seconds."
-                    "DEPRECATED: Use vllm:inter_token_latency_seconds instead."
-                ),
-                buckets=[
-                    0.01,
-                    0.025,
-                    0.05,
-                    0.075,
-                    0.1,
-                    0.15,
-                    0.2,
-                    0.3,
-                    0.4,
-                    0.5,
-                    0.75,
-                    1.0,
-                    2.5,
-                    5.0,
-                    7.5,
-                    10.0,
-                    20.0,
-                    40.0,
-                    80.0,
-                ],
-                labelnames=labelnames,
-            )
-            self.histogram_time_per_output_token = make_per_engine(
-                histogram_time_per_output_token, engine_indexes, model_name
-            )
-
        histogram_inter_token_latency = self._histogram_cls(
            name="vllm:inter_token_latency_seconds",
            documentation="Histogram of inter-token latency in seconds.",
@@ -1124,8 +1087,6 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
            self.histogram_time_to_first_token[engine_idx].observe(ttft)
        for itl in iteration_stats.inter_token_latencies_iter:
            self.histogram_inter_token_latency[engine_idx].observe(itl)
-            if self.show_hidden_metrics:
-                self.histogram_time_per_output_token[engine_idx].observe(itl)

        for finished_request in iteration_stats.finished_requests:
            self.counter_request_success[finished_request.finish_reason][