Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
9cf4edae
Unverified
Commit
9cf4edae
authored
Nov 25, 2025
by
Mark McLoughlin
Committed by
GitHub
Nov 25, 2025
Browse files
[Metrics] Scheduled removal of deprecated metrics (#29330)
Signed-off-by:
Mark McLoughlin
<
markmc@redhat.com
>
parent
7012d8b4
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
37 additions
and
100 deletions
+37
-100
tests/entrypoints/openai/test_metrics.py
tests/entrypoints/openai/test_metrics.py
+0
-3
vllm/v1/metrics/loggers.py
vllm/v1/metrics/loggers.py
+37
-97
No files found.
tests/entrypoints/openai/test_metrics.py
View file @
9cf4edae
...
@@ -183,9 +183,6 @@ async def test_metrics_counts(
...
@@ -183,9 +183,6 @@ async def test_metrics_counts(
EXPECTED_METRICS_V1
=
[
EXPECTED_METRICS_V1
=
[
"vllm:num_requests_running"
,
"vllm:num_requests_running"
,
"vllm:num_requests_waiting"
,
"vllm:num_requests_waiting"
,
"vllm:gpu_cache_usage_perc"
,
"vllm:gpu_prefix_cache_queries"
,
"vllm:gpu_prefix_cache_hits"
,
"vllm:kv_cache_usage_perc"
,
"vllm:kv_cache_usage_perc"
,
"vllm:prefix_cache_queries"
,
"vllm:prefix_cache_queries"
,
"vllm:prefix_cache_hits"
,
"vllm:prefix_cache_hits"
,
...
...
vllm/v1/metrics/loggers.py
View file @
9cf4edae
...
@@ -440,57 +440,6 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
...
@@ -440,57 +440,6 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
# Setting default values
# Setting default values
self
.
record_sleep_state
()
self
.
record_sleep_state
()
# GPU cache
#
# Deprecated in 0.9.2 - Renamed as vllm:kv_cache_usage_perc
# With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
# TODO: remove in 0.12.0
if
self
.
show_hidden_metrics
:
gauge_gpu_cache_usage
=
self
.
_gauge_cls
(
name
=
"vllm:gpu_cache_usage_perc"
,
documentation
=
(
"GPU KV-cache usage. 1 means 100 percent usage."
"DEPRECATED: Use vllm:kv_cache_usage_perc instead."
),
multiprocess_mode
=
"mostrecent"
,
labelnames
=
labelnames
,
)
self
.
gauge_gpu_cache_usage
=
make_per_engine
(
gauge_gpu_cache_usage
,
engine_indexes
,
model_name
)
# Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_queries
# With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
# TODO: remove in 0.12.0
if
self
.
show_hidden_metrics
:
counter_gpu_prefix_cache_queries
=
self
.
_counter_cls
(
name
=
"vllm:gpu_prefix_cache_queries"
,
documentation
=
(
"GPU prefix cache queries, in terms of number of queried"
"tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."
),
labelnames
=
labelnames
,
)
self
.
counter_gpu_prefix_cache_queries
=
make_per_engine
(
counter_gpu_prefix_cache_queries
,
engine_indexes
,
model_name
)
# Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_hits
# With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
# TODO: remove in 0.12.0
if
self
.
show_hidden_metrics
:
counter_gpu_prefix_cache_hits
=
self
.
_counter_cls
(
name
=
"vllm:gpu_prefix_cache_hits"
,
documentation
=
(
"GPU prefix cache hits, in terms of number of cached "
"tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."
),
labelnames
=
labelnames
,
)
self
.
counter_gpu_prefix_cache_hits
=
make_per_engine
(
counter_gpu_prefix_cache_hits
,
engine_indexes
,
model_name
)
gauge_kv_cache_usage
=
self
.
_gauge_cls
(
gauge_kv_cache_usage
=
self
.
_gauge_cls
(
name
=
"vllm:kv_cache_usage_perc"
,
name
=
"vllm:kv_cache_usage_perc"
,
documentation
=
"KV-cache usage. 1 means 100 percent usage."
,
documentation
=
"KV-cache usage. 1 means 100 percent usage."
,
...
@@ -735,7 +684,9 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
...
@@ -735,7 +684,9 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
)
)
# Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds
# Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds
# TODO: in 0.12, only enable if show_hidden_metrics=True
# With 0.12.x you can enable with --show-hidden-metrics-for-version=0.11
# TODO: remove in 0.13.0
if
self
.
show_hidden_metrics
:
histogram_time_per_output_token
=
self
.
_histogram_cls
(
histogram_time_per_output_token
=
self
.
_histogram_cls
(
name
=
"vllm:time_per_output_token_seconds"
,
name
=
"vllm:time_per_output_token_seconds"
,
documentation
=
(
documentation
=
(
...
@@ -966,20 +917,8 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
...
@@ -966,20 +917,8 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
self
.
gauge_scheduler_waiting
[
engine_idx
].
set
(
self
.
gauge_scheduler_waiting
[
engine_idx
].
set
(
scheduler_stats
.
num_waiting_reqs
scheduler_stats
.
num_waiting_reqs
)
)
if
self
.
show_hidden_metrics
:
self
.
gauge_gpu_cache_usage
[
engine_idx
].
set
(
scheduler_stats
.
kv_cache_usage
)
self
.
gauge_kv_cache_usage
[
engine_idx
].
set
(
scheduler_stats
.
kv_cache_usage
)
self
.
gauge_kv_cache_usage
[
engine_idx
].
set
(
scheduler_stats
.
kv_cache_usage
)
if
self
.
show_hidden_metrics
:
self
.
counter_gpu_prefix_cache_queries
[
engine_idx
].
inc
(
scheduler_stats
.
prefix_cache_stats
.
queries
)
self
.
counter_gpu_prefix_cache_hits
[
engine_idx
].
inc
(
scheduler_stats
.
prefix_cache_stats
.
hits
)
self
.
counter_prefix_cache_queries
[
engine_idx
].
inc
(
self
.
counter_prefix_cache_queries
[
engine_idx
].
inc
(
scheduler_stats
.
prefix_cache_stats
.
queries
scheduler_stats
.
prefix_cache_stats
.
queries
)
)
...
@@ -1050,6 +989,7 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
...
@@ -1050,6 +989,7 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
self
.
histogram_time_to_first_token
[
engine_idx
].
observe
(
ttft
)
self
.
histogram_time_to_first_token
[
engine_idx
].
observe
(
ttft
)
for
itl
in
iteration_stats
.
inter_token_latencies_iter
:
for
itl
in
iteration_stats
.
inter_token_latencies_iter
:
self
.
histogram_inter_token_latency
[
engine_idx
].
observe
(
itl
)
self
.
histogram_inter_token_latency
[
engine_idx
].
observe
(
itl
)
if
self
.
show_hidden_metrics
:
self
.
histogram_time_per_output_token
[
engine_idx
].
observe
(
itl
)
self
.
histogram_time_per_output_token
[
engine_idx
].
observe
(
itl
)
for
finished_request
in
iteration_stats
.
finished_requests
:
for
finished_request
in
iteration_stats
.
finished_requests
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment