Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
2942970d
Unverified
Commit
2942970d
authored
Sep 16, 2025
by
Mark McLoughlin
Committed by
GitHub
Sep 15, 2025
Browse files
[Metrics] Hide deprecated metrics with gpu_ prefix (#24245)
Signed-off-by:
Mark McLoughlin
<
markmc@redhat.com
>
parent
3c96e7b8
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
63 additions
and
46 deletions
+63
-46
tests/entrypoints/openai/test_metrics.py
tests/entrypoints/openai/test_metrics.py
+15
-6
vllm/v1/metrics/loggers.py
vllm/v1/metrics/loggers.py
+48
-40
No files found.
tests/entrypoints/openai/test_metrics.py
View file @
2942970d
...
...
@@ -232,6 +232,9 @@ EXPECTED_METRICS_V1 = [
"vllm:gpu_cache_usage_perc"
,
"vllm:gpu_prefix_cache_queries"
,
"vllm:gpu_prefix_cache_hits"
,
"vllm:kv_cache_usage_perc"
,
"vllm:prefix_cache_queries"
,
"vllm:prefix_cache_hits"
,
"vllm:num_preemptions_total"
,
"vllm:prompt_tokens_total"
,
"vllm:generation_tokens_total"
,
...
...
@@ -277,6 +280,9 @@ EXPECTED_METRICS_V1 = [
]
HIDDEN_DEPRECATED_METRICS
:
list
[
str
]
=
[
"vllm:gpu_cache_usage_perc"
,
"vllm:gpu_prefix_cache_queries"
,
"vllm:gpu_prefix_cache_hits"
,
"vllm:time_per_output_token_seconds_sum"
,
"vllm:time_per_output_token_seconds_bucket"
,
"vllm:time_per_output_token_seconds_count"
,
...
...
@@ -307,7 +313,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
client
:
openai
.
AsyncClient
,
use_v1
:
bool
):
running_requests
,
waiting_requests
,
kv_cache_usage
=
(
_get_running_metrics_from_api
(
server
))
_get_running_metrics_from_api
(
server
,
use_v1
))
# Expect no running requests or kvcache usage
assert
running_requests
==
0
...
...
@@ -330,7 +336,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
# Check that we have running requests
running_requests
,
waiting_requests
,
kv_cache_usage
=
(
_get_running_metrics_from_api
(
server
))
_get_running_metrics_from_api
(
server
,
use_v1
))
# Expect running requests and kvcache usage
assert
running_requests
>
0
...
...
@@ -349,7 +355,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
# Verify running and waiting requests counts and KV cache usage are zero
running_requests_after
,
waiting_requests_after
,
kv_cache_usage_after
=
(
_get_running_metrics_from_api
(
server
))
_get_running_metrics_from_api
(
server
,
use_v1
))
assert
running_requests_after
==
0
,
\
(
f
"Expected 0 running requests after abort, got "
...
...
@@ -362,7 +368,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
f
"
{
kv_cache_usage_after
}
"
)
def
_get_running_metrics_from_api
(
server
:
RemoteOpenAIServer
):
def
_get_running_metrics_from_api
(
server
:
RemoteOpenAIServer
,
use_v1
:
bool
):
"""Return (running_count, waiting_count, kv_cache_usage)"""
response
=
requests
.
get
(
server
.
url_for
(
"metrics"
))
...
...
@@ -371,6 +377,9 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer):
# Verify running and waiting requests counts and KV cache usage are zero
running_requests
,
waiting_requests
,
kv_cache_usage
=
None
,
None
,
None
kv_cache_usage_metric
=
(
"vllm:kv_cache_usage_perc"
if
use_v1
else
"vllm:gpu_cache_usage_perc"
)
for
family
in
text_string_to_metric_families
(
response
.
text
):
if
family
.
name
==
"vllm:num_requests_running"
:
for
sample
in
family
.
samples
:
...
...
@@ -382,9 +391,9 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer):
if
sample
.
name
==
"vllm:num_requests_waiting"
:
waiting_requests
=
sample
.
value
break
elif
family
.
name
==
"vllm:gpu
_cache_usage_
perc"
:
elif
family
.
name
==
kv
_cache_usage_
metric
:
for
sample
in
family
.
samples
:
if
sample
.
name
==
"vllm:gpu
_cache_usage_
perc"
:
if
sample
.
name
==
kv
_cache_usage_
metric
:
kv_cache_usage
=
sample
.
value
break
...
...
vllm/v1/metrics/loggers.py
View file @
2942970d
...
...
@@ -202,8 +202,10 @@ class PrometheusStatLogger(StatLoggerBase):
#
# GPU cache
#
# Deprecated in 0.9 - Renamed as vllm:kv_cache_usage_perc
# TODO: in 0.10, only enable if show_hidden_metrics=True
# Deprecated in 0.9.2 - Renamed as vllm:kv_cache_usage_perc
# With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
# TODO: remove in 0.12.0
if
self
.
show_hidden_metrics
:
gauge_gpu_cache_usage
=
self
.
_gauge_cls
(
name
=
"vllm:gpu_cache_usage_perc"
,
documentation
=
(
...
...
@@ -211,23 +213,27 @@ class PrometheusStatLogger(StatLoggerBase):
"DEPRECATED: Use vllm:kv_cache_usage_perc instead."
),
multiprocess_mode
=
"mostrecent"
,
labelnames
=
labelnames
)
self
.
gauge_gpu_cache_usage
=
make_per_engine
(
gauge_gpu_cache_usage
,
engine_indexes
,
model_name
)
self
.
gauge_gpu_cache_usage
=
make_per_engine
(
gauge_gpu_cache_usage
,
engine_indexes
,
model_name
)
# Deprecated in 0.9 - Renamed as vllm:prefix_cache_queries
# TODO: in 0.10, only enable if show_hidden_metrics=True
# Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_queries
# With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
# TODO: remove in 0.12.0
if
self
.
show_hidden_metrics
:
counter_gpu_prefix_cache_queries
=
self
.
_counter_cls
(
name
=
"vllm:gpu_prefix_cache_queries"
,
documentation
=
(
"GPU prefix cache queries, in terms of number of queried"
"tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."
),
"tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."
),
labelnames
=
labelnames
)
self
.
counter_gpu_prefix_cache_queries
=
make_per_engine
(
counter_gpu_prefix_cache_queries
,
engine_indexes
,
model_name
)
# Deprecated in 0.9 - Renamed as vllm:prefix_cache_hits
# TODO: in 0.10, only enable if show_hidden_metrics=True
# Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_hits
# With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
# TODO: remove in 0.12.0
if
self
.
show_hidden_metrics
:
counter_gpu_prefix_cache_hits
=
self
.
_counter_cls
(
name
=
"vllm:gpu_prefix_cache_hits"
,
documentation
=
(
...
...
@@ -509,11 +515,13 @@ class PrometheusStatLogger(StatLoggerBase):
self
.
gauge_scheduler_waiting
[
engine_idx
].
set
(
scheduler_stats
.
num_waiting_reqs
)
if
self
.
show_hidden_metrics
:
self
.
gauge_gpu_cache_usage
[
engine_idx
].
set
(
scheduler_stats
.
kv_cache_usage
)
self
.
gauge_kv_cache_usage
[
engine_idx
].
set
(
scheduler_stats
.
kv_cache_usage
)
if
self
.
show_hidden_metrics
:
self
.
counter_gpu_prefix_cache_queries
[
engine_idx
].
inc
(
scheduler_stats
.
prefix_cache_stats
.
queries
)
self
.
counter_gpu_prefix_cache_hits
[
engine_idx
].
inc
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment