Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
24177984
Unverified
Commit
24177984
authored
Sep 02, 2025
by
Mark McLoughlin
Committed by
GitHub
Sep 02, 2025
Browse files
[Metrics] Deprecate TPOT in favor of ITL (#24110)
Signed-off-by:
Mark McLoughlin
<
markmc@redhat.com
>
parent
9480ae24
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
64 additions
and
26 deletions
+64
-26
examples/online_serving/prometheus_grafana/grafana.json
examples/online_serving/prometheus_grafana/grafana.json
+6
-6
tests/entrypoints/openai/test_metrics.py
tests/entrypoints/openai/test_metrics.py
+15
-7
vllm/engine/llm_engine.py
vllm/engine/llm_engine.py
+4
-4
vllm/engine/metrics.py
vllm/engine/metrics.py
+16
-2
vllm/engine/metrics_types.py
vllm/engine/metrics_types.py
+1
-1
vllm/v1/metrics/loggers.py
vllm/v1/metrics/loggers.py
+19
-3
vllm/v1/metrics/stats.py
vllm/v1/metrics/stats.py
+3
-3
No files found.
examples/online_serving/prometheus_grafana/grafana.json
View file @
24177984
...
...
@@ -402,7 +402,7 @@
},
"disableTextWrap"
:
false
,
"editorMode"
:
"builder"
,
"expr"
:
"histogram_quantile(0.99, sum by(le) (rate(vllm:
time_per_output_token
_seconds_bucket{model_name=
\"
$model_name
\"
}[$__rate_interval])))"
,
"expr"
:
"histogram_quantile(0.99, sum by(le) (rate(vllm:
inter_token_latency
_seconds_bucket{model_name=
\"
$model_name
\"
}[$__rate_interval])))"
,
"fullMetaSearch"
:
false
,
"includeNullMetadata"
:
false
,
"instant"
:
false
,
...
...
@@ -418,7 +418,7 @@
},
"disableTextWrap"
:
false
,
"editorMode"
:
"builder"
,
"expr"
:
"histogram_quantile(0.95, sum by(le) (rate(vllm:
time_per_output_token
_seconds_bucket{model_name=
\"
$model_name
\"
}[$__rate_interval])))"
,
"expr"
:
"histogram_quantile(0.95, sum by(le) (rate(vllm:
inter_token_latency
_seconds_bucket{model_name=
\"
$model_name
\"
}[$__rate_interval])))"
,
"fullMetaSearch"
:
false
,
"hide"
:
false
,
"includeNullMetadata"
:
false
,
...
...
@@ -435,7 +435,7 @@
},
"disableTextWrap"
:
false
,
"editorMode"
:
"builder"
,
"expr"
:
"histogram_quantile(0.9, sum by(le) (rate(vllm:
time_per_output_token
_seconds_bucket{model_name=
\"
$model_name
\"
}[$__rate_interval])))"
,
"expr"
:
"histogram_quantile(0.9, sum by(le) (rate(vllm:
inter_token_latency
_seconds_bucket{model_name=
\"
$model_name
\"
}[$__rate_interval])))"
,
"fullMetaSearch"
:
false
,
"hide"
:
false
,
"includeNullMetadata"
:
false
,
...
...
@@ -452,7 +452,7 @@
},
"disableTextWrap"
:
false
,
"editorMode"
:
"builder"
,
"expr"
:
"histogram_quantile(0.5, sum by(le) (rate(vllm:
time_per_output_token
_seconds_bucket{model_name=
\"
$model_name
\"
}[$__rate_interval])))"
,
"expr"
:
"histogram_quantile(0.5, sum by(le) (rate(vllm:
inter_token_latency
_seconds_bucket{model_name=
\"
$model_name
\"
}[$__rate_interval])))"
,
"fullMetaSearch"
:
false
,
"hide"
:
false
,
"includeNullMetadata"
:
false
,
...
...
@@ -468,7 +468,7 @@
"uid"
:
"${DS_PROMETHEUS}"
},
"editorMode"
:
"code"
,
"expr"
:
"rate(vllm:
time_per_output_token
_seconds_sum{model_name=
\"
$model_name
\"
}[$__rate_interval])
\n
/
\n
rate(vllm:
time_per_output_token
_seconds_count{model_name=
\"
$model_name
\"
}[$__rate_interval])"
,
"expr"
:
"rate(vllm:
inter_token_latency
_seconds_sum{model_name=
\"
$model_name
\"
}[$__rate_interval])
\n
/
\n
rate(vllm:
inter_token_latency
_seconds_count{model_name=
\"
$model_name
\"
}[$__rate_interval])"
,
"hide"
:
false
,
"instant"
:
false
,
"legendFormat"
:
"Mean"
,
...
...
@@ -476,7 +476,7 @@
"refId"
:
"E"
}
],
"title"
:
"
Time Per Output
Token Latency"
,
"title"
:
"
Inter
Token Latency"
,
"type"
:
"timeseries"
},
{
...
...
tests/entrypoints/openai/test_metrics.py
View file @
24177984
...
...
@@ -250,12 +250,15 @@ EXPECTED_METRICS_V1 = [
"vllm:request_params_max_tokens_sum"
,
"vllm:request_params_max_tokens_bucket"
,
"vllm:request_params_max_tokens_count"
,
"vllm:time_to_first_token_seconds_sum"
,
"vllm:time_to_first_token_seconds_bucket"
,
"vllm:time_to_first_token_seconds_count"
,
"vllm:time_per_output_token_seconds_sum"
,
"vllm:time_per_output_token_seconds_bucket"
,
"vllm:time_per_output_token_seconds_count"
,
"vllm:time_to_first_token_seconds_sum"
,
"vllm:time_to_first_token_seconds_bucket"
,
"vllm:time_to_first_token_seconds_count"
,
"vllm:inter_token_latency_seconds_sum"
,
"vllm:inter_token_latency_seconds_bucket"
,
"vllm:inter_token_latency_seconds_count"
,
"vllm:e2e_request_latency_seconds_sum"
,
"vllm:e2e_request_latency_seconds_bucket"
,
"vllm:e2e_request_latency_seconds_count"
,
...
...
@@ -273,7 +276,11 @@ EXPECTED_METRICS_V1 = [
"vllm:request_decode_time_seconds_count"
,
]
HIDDEN_DEPRECATED_METRICS
:
list
[
str
]
=
[]
HIDDEN_DEPRECATED_METRICS
:
list
[
str
]
=
[
"vllm:time_per_output_token_seconds_sum"
,
"vllm:time_per_output_token_seconds_bucket"
,
"vllm:time_per_output_token_seconds_count"
,
]
@
pytest
.
mark
.
asyncio
...
...
@@ -289,9 +296,10 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
assert
response
.
status_code
==
HTTPStatus
.
OK
for
metric
in
(
EXPECTED_METRICS_V1
if
use_v1
else
EXPECTED_METRICS
):
if
(
not
server
.
show_hidden_metrics
and
metric
not
in
HIDDEN_DEPRECATED_METRICS
):
assert
metric
in
response
.
text
if
(
metric
in
HIDDEN_DEPRECATED_METRICS
and
not
server
.
show_hidden_metrics
):
continue
assert
metric
in
response
.
text
@
pytest
.
mark
.
asyncio
...
...
vllm/engine/llm_engine.py
View file @
24177984
...
...
@@ -1414,7 +1414,7 @@ class LLMEngine:
num_generation_tokens_iter
=
0
num_tokens_iter
=
0
time_to_first_tokens_iter
:
List
[
float
]
=
[]
time_per_output_token
s_iter
:
List
[
float
]
=
[]
inter_token_latencie
s_iter
:
List
[
float
]
=
[]
num_preemption_iter
=
(
0
if
scheduler_outputs
is
None
else
scheduler_outputs
.
preempted
)
...
...
@@ -1498,9 +1498,9 @@ class LLMEngine:
num_generation_tokens_from_prefill_groups
+=
(
seq_group
.
num_seqs
())
else
:
#
TPOTs.
#
ITLs
latency
=
seq_group
.
get_last_token_latency
()
time_per_output_token
s_iter
.
append
(
latency
)
inter_token_latencie
s_iter
.
append
(
latency
)
if
seq_group
.
state
.
current_step
==
0
:
# For async_output_proc, the do_log_stats()
# is called following init_multi_step(), which
...
...
@@ -1582,7 +1582,7 @@ class LLMEngine:
num_generation_tokens_iter
=
num_generation_tokens_iter
,
num_tokens_iter
=
num_tokens_iter
,
time_to_first_tokens_iter
=
time_to_first_tokens_iter
,
time_per_output_tokens_iter
=
time_per_output_token
s_iter
,
inter_token_latencies_iter
=
inter_token_latencie
s_iter
,
num_preemption_iter
=
num_preemption_iter
,
# Request stats
...
...
vllm/engine/metrics.py
View file @
24177984
...
...
@@ -113,9 +113,21 @@ class Metrics:
0.75
,
1.0
,
2.5
,
5.0
,
7.5
,
10.0
,
20.0
,
40.0
,
80.0
,
160.0
,
640.0
,
2560.0
])
# Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds
# TODO: in 0.12, only enable if show_hidden_metrics=True
self
.
histogram_time_per_output_token
=
self
.
_histogram_cls
(
name
=
"vllm:time_per_output_token_seconds"
,
documentation
=
"Histogram of time per output token in seconds."
,
documentation
=
(
"Histogram of time per output token in seconds."
"DEPRECATED: Use vllm:inter_token_latency_seconds instead."
),
labelnames
=
labelnames
,
buckets
=
[
0.01
,
0.025
,
0.05
,
0.075
,
0.1
,
0.15
,
0.2
,
0.3
,
0.4
,
0.5
,
0.75
,
1.0
,
2.5
,
5.0
,
7.5
,
10.0
,
20.0
,
40.0
,
80.0
])
self
.
histogram_inter_token_latency
=
self
.
_histogram_cls
(
name
=
"vllm:inter_token_latency_seconds"
,
documentation
=
"Histogram of inter token latency in seconds."
,
labelnames
=
labelnames
,
buckets
=
[
0.01
,
0.025
,
0.05
,
0.075
,
0.1
,
0.15
,
0.2
,
0.3
,
0.4
,
0.5
,
0.75
,
...
...
@@ -491,7 +503,9 @@ class PrometheusStatLogger(StatLoggerBase):
self
.
_log_histogram
(
self
.
metrics
.
histogram_time_to_first_token
,
stats
.
time_to_first_tokens_iter
)
self
.
_log_histogram
(
self
.
metrics
.
histogram_time_per_output_token
,
stats
.
time_per_output_tokens_iter
)
stats
.
inter_token_latencies_iter
)
self
.
_log_histogram
(
self
.
metrics
.
histogram_inter_token_latency
,
stats
.
inter_token_latencies_iter
)
# Request level data
# Latency
...
...
vllm/engine/metrics_types.py
View file @
24177984
...
...
@@ -43,7 +43,7 @@ class Stats:
num_generation_tokens_iter
:
int
num_tokens_iter
:
int
time_to_first_tokens_iter
:
List
[
float
]
time_per_output_token
s_iter
:
List
[
float
]
inter_token_latencie
s_iter
:
List
[
float
]
num_preemption_iter
:
int
# Request stats (should have _requests suffix)
...
...
vllm/v1/metrics/loggers.py
View file @
24177984
...
...
@@ -377,9 +377,13 @@ class PrometheusStatLogger(StatLoggerBase):
self
.
histogram_time_to_first_token
=
make_per_engine
(
histogram_time_to_first_token
,
engine_indexes
,
model_name
)
# Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds
# TODO: in 0.12, only enable if show_hidden_metrics=True
histogram_time_per_output_token
=
self
.
_histogram_cls
(
name
=
"vllm:time_per_output_token_seconds"
,
documentation
=
"Histogram of time per output token in seconds."
,
documentation
=
(
"Histogram of time per output token in seconds."
"DEPRECATED: Use vllm:inter_token_latency_seconds instead."
),
buckets
=
[
0.01
,
0.025
,
0.05
,
0.075
,
0.1
,
0.15
,
0.2
,
0.3
,
0.4
,
0.5
,
0.75
,
1.0
,
2.5
,
5.0
,
7.5
,
10.0
,
20.0
,
40.0
,
80.0
...
...
@@ -388,6 +392,17 @@ class PrometheusStatLogger(StatLoggerBase):
self
.
histogram_time_per_output_token
=
make_per_engine
(
histogram_time_per_output_token
,
engine_indexes
,
model_name
)
histogram_inter_token_latency
=
self
.
_histogram_cls
(
name
=
"vllm:inter_token_latency_seconds"
,
documentation
=
"Histogram of inter-token latency in seconds."
,
buckets
=
[
0.01
,
0.025
,
0.05
,
0.075
,
0.1
,
0.15
,
0.2
,
0.3
,
0.4
,
0.5
,
0.75
,
1.0
,
2.5
,
5.0
,
7.5
,
10.0
,
20.0
,
40.0
,
80.0
],
labelnames
=
labelnames
)
self
.
histogram_inter_token_latency
=
make_per_engine
(
histogram_inter_token_latency
,
engine_indexes
,
model_name
)
request_latency_buckets
=
[
0.3
,
0.5
,
0.8
,
1.0
,
1.5
,
2.0
,
2.5
,
5.0
,
10.0
,
15.0
,
20.0
,
30.0
,
40.0
,
50.0
,
60.0
,
120.0
,
240.0
,
480.0
,
960.0
,
1920.0
,
7680.0
...
...
@@ -537,8 +552,9 @@ class PrometheusStatLogger(StatLoggerBase):
self
.
histogram_n_request
[
engine_idx
].
observe
(
n_param
)
for
ttft
in
iteration_stats
.
time_to_first_tokens_iter
:
self
.
histogram_time_to_first_token
[
engine_idx
].
observe
(
ttft
)
for
tpot
in
iteration_stats
.
time_per_output_tokens_iter
:
self
.
histogram_time_per_output_token
[
engine_idx
].
observe
(
tpot
)
for
itl
in
iteration_stats
.
inter_token_latencies_iter
:
self
.
histogram_inter_token_latency
[
engine_idx
].
observe
(
itl
)
self
.
histogram_time_per_output_token
[
engine_idx
].
observe
(
itl
)
for
finished_request
in
iteration_stats
.
finished_requests
:
self
.
counter_request_success
[
...
...
vllm/v1/metrics/stats.py
View file @
24177984
...
...
@@ -96,7 +96,7 @@ class IterationStats:
self
.
max_num_generation_tokens_iter
:
list
[
int
]
=
[]
self
.
n_params_iter
:
list
[
int
]
=
[]
self
.
time_to_first_tokens_iter
:
list
[
float
]
=
[]
self
.
time_per_output_token
s_iter
:
list
[
float
]
=
[]
self
.
inter_token_latencie
s_iter
:
list
[
float
]
=
[]
self
.
waiting_lora_adapters
:
dict
[
str
,
int
]
=
{}
self
.
running_lora_adapters
:
dict
[
str
,
int
]
=
{}
...
...
@@ -128,8 +128,8 @@ class IterationStats:
if
is_prefilling
:
req_stats
.
first_token_ts
=
engine_core_timestamp
else
:
tpot
=
engine_core_timestamp
-
req_stats
.
last_token_ts
self
.
time_per_output_token
s_iter
.
append
(
tpot
)
itl
=
engine_core_timestamp
-
req_stats
.
last_token_ts
self
.
inter_token_latencie
s_iter
.
append
(
itl
)
req_stats
.
last_token_ts
=
engine_core_timestamp
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment