Unverified commit bb917203, authored by 杨朱 · Kiki, committed by GitHub
Browse files

[Metrics] Complete removal of deprecated vllm:time_per_output_token_seconds metric (#32661)



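This PR completes the removal of the vllm:time_per_output_token_seconds metric,
which was deprecated in v0.11, hidden in v0.12, and originally scheduled for removal
in v0.13; the removal was delayed until v0.15. Documentation, dashboards, and tests
now reference its replacement, vllm:inter_token_latency_seconds.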
Signed-off-by: carlory <baofa.fan@daocloud.io>
Co-authored-by: Claude Haiku 4.5 <noreply@anthropic.com>
parent c4e5bdf6
...@@ -49,7 +49,7 @@ The subset of metrics exposed in the Grafana dashboard gives us an indication of ...@@ -49,7 +49,7 @@ The subset of metrics exposed in the Grafana dashboard gives us an indication of
- `vllm:e2e_request_latency_seconds_bucket` - End to end request latency measured in seconds. - `vllm:e2e_request_latency_seconds_bucket` - End to end request latency measured in seconds.
- `vllm:prompt_tokens` - Prompt tokens. - `vllm:prompt_tokens` - Prompt tokens.
- `vllm:generation_tokens` - Generation tokens. - `vllm:generation_tokens` - Generation tokens.
- `vllm:time_per_output_token_seconds` - Inter-token latency (Time Per Output Token, TPOT) in seconds. - `vllm:inter_token_latency_seconds` - Inter-token latency (Time Per Output Token, TPOT) in seconds.
- `vllm:time_to_first_token_seconds` - Time to First Token (TTFT) latency in seconds. - `vllm:time_to_first_token_seconds` - Time to First Token (TTFT) latency in seconds.
- `vllm:num_requests_running` (also, `_swapped` and `_waiting`) - Number of requests in the RUNNING, WAITING, and SWAPPED states. - `vllm:num_requests_running` (also, `_swapped` and `_waiting`) - Number of requests in the RUNNING, WAITING, and SWAPPED states.
- `vllm:kv_cache_usage_perc` - Percentage of used cache blocks by vLLM. - `vllm:kv_cache_usage_perc` - Percentage of used cache blocks by vLLM.
......
...@@ -884,7 +884,7 @@ ...@@ -884,7 +884,7 @@
"targets": [ "targets": [
{ {
"editorMode": "code", "editorMode": "code",
"expr": "rate(vllm:time_per_output_token_seconds_sum[$__interval]) / rate(vllm:time_per_output_token_seconds_count[$__interval])", "expr": "rate(vllm:inter_token_latency_seconds_sum[$__interval]) / rate(vllm:inter_token_latency_seconds_count[$__interval])",
"legendFormat": "ITL (Avg)", "legendFormat": "ITL (Avg)",
"range": true, "range": true,
"refId": "A" "refId": "A"
...@@ -895,7 +895,7 @@ ...@@ -895,7 +895,7 @@
"uid": "${DS_PROMETHEUS}" "uid": "${DS_PROMETHEUS}"
}, },
"editorMode": "code", "editorMode": "code",
"expr": "histogram_quantile(0.50, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__interval])))", "expr": "histogram_quantile(0.50, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket[$__interval])))",
"hide": false, "hide": false,
"instant": false, "instant": false,
"legendFormat": "ITL (p50)", "legendFormat": "ITL (p50)",
...@@ -908,7 +908,7 @@ ...@@ -908,7 +908,7 @@
"uid": "${DS_PROMETHEUS}" "uid": "${DS_PROMETHEUS}"
}, },
"editorMode": "code", "editorMode": "code",
"expr": "histogram_quantile(0.90, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__interval])))", "expr": "histogram_quantile(0.90, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket[$__interval])))",
"hide": false, "hide": false,
"instant": false, "instant": false,
"legendFormat": "ITL (p90)", "legendFormat": "ITL (p90)",
...@@ -921,7 +921,7 @@ ...@@ -921,7 +921,7 @@
"uid": "${DS_PROMETHEUS}" "uid": "${DS_PROMETHEUS}"
}, },
"editorMode": "code", "editorMode": "code",
"expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__interval])))", "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket[$__interval])))",
"hide": false, "hide": false,
"instant": false, "instant": false,
"legendFormat": "ITL (p99)", "legendFormat": "ITL (p99)",
...@@ -990,7 +990,7 @@ ...@@ -990,7 +990,7 @@
"targets": [ "targets": [
{ {
"editorMode": "code", "editorMode": "code",
"expr": "histogram_quantile(0.90, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__range])))", "expr": "histogram_quantile(0.90, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket[$__range])))",
"legendFormat": "__auto", "legendFormat": "__auto",
"range": true, "range": true,
"refId": "A" "refId": "A"
...@@ -1057,7 +1057,7 @@ ...@@ -1057,7 +1057,7 @@
"targets": [ "targets": [
{ {
"editorMode": "code", "editorMode": "code",
"expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__range])))", "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket[$__range])))",
"legendFormat": "__auto", "legendFormat": "__auto",
"range": true, "range": true,
"refId": "A" "refId": "A"
...@@ -1124,7 +1124,7 @@ ...@@ -1124,7 +1124,7 @@
"targets": [ "targets": [
{ {
"editorMode": "code", "editorMode": "code",
"expr": "(sum(increase(vllm:time_per_output_token_seconds_sum[$__range])) / sum(increase(vllm:time_per_output_token_seconds_count[$__range])))", "expr": "(sum(increase(vllm:inter_token_latency_seconds_sum[$__range])) / sum(increase(vllm:inter_token_latency_seconds_count[$__range])))",
"legendFormat": "__auto", "legendFormat": "__auto",
"range": true, "range": true,
"refId": "A" "refId": "A"
...@@ -1191,7 +1191,7 @@ ...@@ -1191,7 +1191,7 @@
"targets": [ "targets": [
{ {
"editorMode": "code", "editorMode": "code",
"expr": "histogram_quantile(0.50, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__range])))", "expr": "histogram_quantile(0.50, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket[$__range])))",
"legendFormat": "__auto", "legendFormat": "__auto",
"range": true, "range": true,
"refId": "A" "refId": "A"
......
...@@ -309,9 +309,9 @@ spec: ...@@ -309,9 +309,9 @@ spec:
kind: PrometheusDatasource kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource name: accelerators-thanos-querier-datasource
query: > query: >
sum by (model_name) (rate(vllm:time_per_output_token_seconds_sum{model_name=~"$Deployment_id"}[$__interval])) sum by (model_name) (rate(vllm:inter_token_latency_seconds_sum{model_name=~"$Deployment_id"}[$__interval]))
/ /
sum by (model_name) (rate(vllm:time_per_output_token_seconds_count{model_name=~"$Deployment_id"}[$__interval])) sum by (model_name) (rate(vllm:inter_token_latency_seconds_count{model_name=~"$Deployment_id"}[$__interval]))
seriesNameFormat: '{{model_name}}' seriesNameFormat: '{{model_name}}'
- kind: TimeSeriesQuery - kind: TimeSeriesQuery
spec: spec:
...@@ -325,7 +325,7 @@ spec: ...@@ -325,7 +325,7 @@ spec:
histogram_quantile( histogram_quantile(
0.50, 0.50,
sum by (le, model_name) ( sum by (le, model_name) (
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval]) rate(vllm:inter_token_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
) )
) )
seriesNameFormat: '{{model_name}} p50' seriesNameFormat: '{{model_name}} p50'
...@@ -341,7 +341,7 @@ spec: ...@@ -341,7 +341,7 @@ spec:
histogram_quantile( histogram_quantile(
0.90, 0.90,
sum by (le, model_name) ( sum by (le, model_name) (
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval]) rate(vllm:inter_token_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
) )
) )
seriesNameFormat: '{{model_name}} p90' seriesNameFormat: '{{model_name}} p90'
...@@ -357,7 +357,7 @@ spec: ...@@ -357,7 +357,7 @@ spec:
histogram_quantile( histogram_quantile(
0.99, 0.99,
sum by (le, model_name) ( sum by (le, model_name) (
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval]) rate(vllm:inter_token_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
) )
) )
seriesNameFormat: '{{model_name}} p99' seriesNameFormat: '{{model_name}} p99'
...@@ -381,9 +381,9 @@ spec: ...@@ -381,9 +381,9 @@ spec:
kind: PrometheusDatasource kind: PrometheusDatasource
name: accelerators-thanos-querier-datasource name: accelerators-thanos-querier-datasource
query: > query: >
(sum by (model_name) (increase(vllm:time_per_output_token_seconds_sum{model_name=~"$Deployment_id"}[$__range]))) (sum by (model_name) (increase(vllm:inter_token_latency_seconds_sum{model_name=~"$Deployment_id"}[$__range])))
/ /
(sum by (model_name) (increase(vllm:time_per_output_token_seconds_count{model_name=~"$Deployment_id"}[$__range]))) (sum by (model_name) (increase(vllm:inter_token_latency_seconds_count{model_name=~"$Deployment_id"}[$__range])))
"13": "13":
kind: Panel kind: Panel
...@@ -407,7 +407,7 @@ spec: ...@@ -407,7 +407,7 @@ spec:
histogram_quantile( histogram_quantile(
0.50, 0.50,
sum by (le, model_name) ( sum by (le, model_name) (
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval]) rate(vllm:inter_token_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
) )
) )
...@@ -433,7 +433,7 @@ spec: ...@@ -433,7 +433,7 @@ spec:
histogram_quantile( histogram_quantile(
0.90, 0.90,
sum by (le, model_name) ( sum by (le, model_name) (
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval]) rate(vllm:inter_token_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
) )
) )
...@@ -459,7 +459,7 @@ spec: ...@@ -459,7 +459,7 @@ spec:
histogram_quantile( histogram_quantile(
0.99, 0.99,
sum by (le, model_name) ( sum by (le, model_name) (
rate(vllm:time_per_output_token_seconds_bucket{model_name=~"$Deployment_id"}[$__interval]) rate(vllm:inter_token_latency_seconds_bucket{model_name=~"$Deployment_id"}[$__interval])
) )
) )
......
...@@ -77,7 +77,7 @@ def _get_expected_values(num_requests: int, prompt_ids: list[int], max_tokens: i ...@@ -77,7 +77,7 @@ def _get_expected_values(num_requests: int, prompt_ids: list[int], max_tokens: i
# {metric_family: [(suffix, expected_value)]} # {metric_family: [(suffix, expected_value)]}
return { return {
"vllm:time_to_first_token_seconds": [("_count", num_requests)], "vllm:time_to_first_token_seconds": [("_count", num_requests)],
"vllm:time_per_output_token_seconds": [ "vllm:inter_token_latency_seconds": [
("_count", num_requests * (max_tokens - 1)) ("_count", num_requests * (max_tokens - 1))
], ],
"vllm:e2e_request_latency_seconds": [("_count", num_requests)], "vllm:e2e_request_latency_seconds": [("_count", num_requests)],
...@@ -203,9 +203,6 @@ EXPECTED_METRICS_V1 = [ ...@@ -203,9 +203,6 @@ EXPECTED_METRICS_V1 = [
"vllm:request_params_max_tokens_sum", "vllm:request_params_max_tokens_sum",
"vllm:request_params_max_tokens_bucket", "vllm:request_params_max_tokens_bucket",
"vllm:request_params_max_tokens_count", "vllm:request_params_max_tokens_count",
"vllm:time_per_output_token_seconds_sum",
"vllm:time_per_output_token_seconds_bucket",
"vllm:time_per_output_token_seconds_count",
"vllm:time_to_first_token_seconds_sum", "vllm:time_to_first_token_seconds_sum",
"vllm:time_to_first_token_seconds_bucket", "vllm:time_to_first_token_seconds_bucket",
"vllm:time_to_first_token_seconds_count", "vllm:time_to_first_token_seconds_count",
...@@ -238,9 +235,6 @@ HIDDEN_DEPRECATED_METRICS: list[str] = [ ...@@ -238,9 +235,6 @@ HIDDEN_DEPRECATED_METRICS: list[str] = [
"vllm:gpu_cache_usage_perc", "vllm:gpu_cache_usage_perc",
"vllm:gpu_prefix_cache_queries", "vllm:gpu_prefix_cache_queries",
"vllm:gpu_prefix_cache_hits", "vllm:gpu_prefix_cache_hits",
"vllm:time_per_output_token_seconds_sum",
"vllm:time_per_output_token_seconds_bucket",
"vllm:time_per_output_token_seconds_count",
] ]
......
...@@ -715,43 +715,6 @@ class PrometheusStatLogger(AggregateStatLoggerBase): ...@@ -715,43 +715,6 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
histogram_time_to_first_token, engine_indexes, model_name histogram_time_to_first_token, engine_indexes, model_name
) )
# Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds
# With 0.12.x you can enable with --show-hidden-metrics-for-version=0.11
# TODO: remove in 0.13.0
if self.show_hidden_metrics:
histogram_time_per_output_token = self._histogram_cls(
name="vllm:time_per_output_token_seconds",
documentation=(
"Histogram of time per output token in seconds."
"DEPRECATED: Use vllm:inter_token_latency_seconds instead."
),
buckets=[
0.01,
0.025,
0.05,
0.075,
0.1,
0.15,
0.2,
0.3,
0.4,
0.5,
0.75,
1.0,
2.5,
5.0,
7.5,
10.0,
20.0,
40.0,
80.0,
],
labelnames=labelnames,
)
self.histogram_time_per_output_token = make_per_engine(
histogram_time_per_output_token, engine_indexes, model_name
)
histogram_inter_token_latency = self._histogram_cls( histogram_inter_token_latency = self._histogram_cls(
name="vllm:inter_token_latency_seconds", name="vllm:inter_token_latency_seconds",
documentation="Histogram of inter-token latency in seconds.", documentation="Histogram of inter-token latency in seconds.",
...@@ -1124,8 +1087,6 @@ class PrometheusStatLogger(AggregateStatLoggerBase): ...@@ -1124,8 +1087,6 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
self.histogram_time_to_first_token[engine_idx].observe(ttft) self.histogram_time_to_first_token[engine_idx].observe(ttft)
for itl in iteration_stats.inter_token_latencies_iter: for itl in iteration_stats.inter_token_latencies_iter:
self.histogram_inter_token_latency[engine_idx].observe(itl) self.histogram_inter_token_latency[engine_idx].observe(itl)
if self.show_hidden_metrics:
self.histogram_time_per_output_token[engine_idx].observe(itl)
for finished_request in iteration_stats.finished_requests: for finished_request in iteration_stats.finished_requests:
self.counter_request_success[finished_request.finish_reason][ self.counter_request_success[finished_request.finish_reason][
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment