Unverified commit bb917203, authored by 杨朱 · Kiki and committed by GitHub
Browse files

[Metrics] Complete removal of deprecated vllm:time_per_output_token_seconds metric (#32661)



This PR completes the removal of the vllm:time_per_output_token_seconds metric,
which was deprecated in v0.11 (renamed to vllm:inter_token_latency_seconds), hidden
in v0.12, and originally scheduled for removal in v0.13; the removal was delayed until v0.15.
Signed-off-by: carlory <baofa.fan@daocloud.io>
Co-authored-by: Claude Haiku 4.5 <noreply@anthropic.com>
parent c4e5bdf6
......@@ -49,7 +49,7 @@ The subset of metrics exposed in the Grafana dashboard gives us an indication of
- `vllm:e2e_request_latency_seconds_bucket` - End to end request latency measured in seconds.
- `vllm:prompt_tokens` - Prompt tokens.
- `vllm:generation_tokens` - Generation tokens.
- `vllm:time_per_output_token_seconds` - Inter-token latency (Time Per Output Token, TPOT) in seconds.
- `vllm:inter_token_latency_seconds` - Inter-token latency (Time Per Output Token, TPOT) in seconds.
- `vllm:time_to_first_token_seconds` - Time to First Token (TTFT) latency in seconds.
- `vllm:num_requests_running` (also, `_swapped` and `_waiting`) - Number of requests in the RUNNING, WAITING, and SWAPPED states.
- `vllm:kv_cache_usage_perc` - Percentage of used cache blocks by vLLM.
......
......@@ -884,7 +884,7 @@
"targets": [
{
"editorMode": "code",
"expr": "rate(vllm:time_per_output_token_seconds_sum[$__interval]) / rate(vllm:time_per_output_token_seconds_count[$__interval])",
"expr": "rate(vllm:inter_token_latency_seconds_sum[$__interval]) / rate(vllm:inter_token_latency_seconds_count[$__interval])",
"legendFormat": "ITL (Avg)",
"range": true,
"refId": "A"
......@@ -895,7 +895,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.50, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__interval])))",
"expr": "histogram_quantile(0.50, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket[$__interval])))",
"hide": false,
"instant": false,
"legendFormat": "ITL (p50)",
......@@ -908,7 +908,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.90, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__interval])))",
"expr": "histogram_quantile(0.90, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket[$__interval])))",
"hide": false,
"instant": false,
"legendFormat": "ITL (p90)",
......@@ -921,7 +921,7 @@
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__interval])))",
"expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket[$__interval])))",
"hide": false,
"instant": false,
"legendFormat": "ITL (p99)",
......@@ -990,7 +990,7 @@
"targets": [
{
"editorMode": "code",
"expr": "histogram_quantile(0.90, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__range])))",
"expr": "histogram_quantile(0.90, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket[$__range])))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
......@@ -1057,7 +1057,7 @@
"targets": [
{
"editorMode": "code",
"expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__range])))",
"expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket[$__range])))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
......@@ -1124,7 +1124,7 @@
"targets": [
{
"editorMode": "code",
"expr": "(sum(increase(vllm:time_per_output_token_seconds_sum[$__range])) / sum(increase(vllm:time_per_output_token_seconds_count[$__range])))",
"expr": "(sum(increase(vllm:inter_token_latency_seconds_sum[$__range])) / sum(increase(vllm:inter_token_latency_seconds_count[$__range])))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
......@@ -1191,7 +1191,7 @@
"targets": [
{
"editorMode": "code",
"expr": "histogram_quantile(0.50, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__range])))",
"expr": "histogram_quantile(0.50, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket[$__range])))",
"legendFormat": "__auto",
"range": true,
"refId": "A"
......
......@@ -309,9 +309,9 @@ spec:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
sum by (model_name) (rate(vllm:time_per_output_token_seconds_sum{model_name=~"$Deployment_id"}[$__interval]))
sum by (model_name) (rate(vllm:inter_token_latency_seconds_sum{model_name=~"$Deployment_id"}[$__interval]))
/
sum by (model_name) (rate(vllm:time_per_output_token_seconds_count{model_name=~"$Deployment_id"}[$__interval]))
sum by (model_name) (rate(vllm:inter_token_latency_seconds_count{model_name=~"$Deployment_id"}[$__interval]))
seriesNameFormat: '{{model_name}}'
- kind: TimeSeriesQuery
spec:
......@@ -325,7 +325,7 @@ spec:
histogram_quantile(
0.50,
sum by (le, model_name) (
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
rate(vllm:inter_token_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
seriesNameFormat: '{{model_name}} p50'
......@@ -341,7 +341,7 @@ spec:
histogram_quantile(
0.90,
sum by (le, model_name) (
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
rate(vllm:inter_token_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
seriesNameFormat: '{{model_name}} p90'
......@@ -357,7 +357,7 @@ spec:
histogram_quantile(
0.99,
sum by (le, model_name) (
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
rate(vllm:inter_token_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
seriesNameFormat: '{{model_name}} p99'
......@@ -381,9 +381,9 @@ spec:
kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource
query: >
(sum by (model_name) (increase(vllm:time_per_output_token_seconds_sum{model_name=~"$Deployment_id"}[$__range])))
(sum by (model_name) (increase(vllm:inter_token_latency_seconds_sum{model_name=~"$Deployment_id"}[$__range])))
/
(sum by (model_name) (increase(vllm:time_per_output_token_seconds_count{model_name=~"$Deployment_id"}[$__range])))
(sum by (model_name) (increase(vllm:inter_token_latency_seconds_count{model_name=~"$Deployment_id"}[$__range])))
"13":
kind: Panel
......@@ -407,7 +407,7 @@ spec:
histogram_quantile(
0.50,
sum by (le, model_name) (
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
rate(vllm:inter_token_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
......@@ -433,7 +433,7 @@ spec:
histogram_quantile(
0.90,
sum by (le, model_name) (
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
rate(vllm:inter_token_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
......@@ -459,7 +459,7 @@ spec:
histogram_quantile(
0.99,
sum by (le, model_name) (
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
rate(vllm:inter_token_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
)
)
......
......@@ -77,7 +77,7 @@ def _get_expected_values(num_requests: int, prompt_ids: list[int], max_tokens: i
# {metric_family: [(suffix, expected_value)]}
return {
"vllm:time_to_first_token_seconds": [("_count", num_requests)],
"vllm:time_per_output_token_seconds": [
"vllm:inter_token_latency_seconds": [
("_count", num_requests * (max_tokens - 1))
],
"vllm:e2e_request_latency_seconds": [("_count", num_requests)],
......@@ -203,9 +203,6 @@ EXPECTED_METRICS_V1 = [
"vllm:request_params_max_tokens_sum",
"vllm:request_params_max_tokens_bucket",
"vllm:request_params_max_tokens_count",
"vllm:time_per_output_token_seconds_sum",
"vllm:time_per_output_token_seconds_bucket",
"vllm:time_per_output_token_seconds_count",
"vllm:time_to_first_token_seconds_sum",
"vllm:time_to_first_token_seconds_bucket",
"vllm:time_to_first_token_seconds_count",
......@@ -238,9 +235,6 @@ HIDDEN_DEPRECATED_METRICS: list[str] = [
"vllm:gpu_cache_usage_perc",
"vllm:gpu_prefix_cache_queries",
"vllm:gpu_prefix_cache_hits",
"vllm:time_per_output_token_seconds_sum",
"vllm:time_per_output_token_seconds_bucket",
"vllm:time_per_output_token_seconds_count",
]
......
......@@ -715,43 +715,6 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
histogram_time_to_first_token, engine_indexes, model_name
)
# Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds
# With 0.12.x you can enable with --show-hidden-metrics-for-version=0.11
# TODO: remove in 0.13.0
if self.show_hidden_metrics:
histogram_time_per_output_token = self._histogram_cls(
name="vllm:time_per_output_token_seconds",
documentation=(
"Histogram of time per output token in seconds."
"DEPRECATED: Use vllm:inter_token_latency_seconds instead."
),
buckets=[
0.01,
0.025,
0.05,
0.075,
0.1,
0.15,
0.2,
0.3,
0.4,
0.5,
0.75,
1.0,
2.5,
5.0,
7.5,
10.0,
20.0,
40.0,
80.0,
],
labelnames=labelnames,
)
self.histogram_time_per_output_token = make_per_engine(
histogram_time_per_output_token, engine_indexes, model_name
)
histogram_inter_token_latency = self._histogram_cls(
name="vllm:inter_token_latency_seconds",
documentation="Histogram of inter-token latency in seconds.",
......@@ -1124,8 +1087,6 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
self.histogram_time_to_first_token[engine_idx].observe(ttft)
for itl in iteration_stats.inter_token_latencies_iter:
self.histogram_inter_token_latency[engine_idx].observe(itl)
if self.show_hidden_metrics:
self.histogram_time_per_output_token[engine_idx].observe(itl)
for finished_request in iteration_stats.finished_requests:
self.counter_request_success[finished_request.finish_reason][
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment