Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
2942970d
Unverified
Commit
2942970d
authored
Sep 16, 2025
by
Mark McLoughlin
Committed by
GitHub
Sep 15, 2025
Browse files
[Metrics] Hide deprecated metrics with gpu_ prefix (#24245)
Signed-off-by:
Mark McLoughlin
<
markmc@redhat.com
>
parent
3c96e7b8
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
63 additions
and
46 deletions
+63
-46
tests/entrypoints/openai/test_metrics.py
tests/entrypoints/openai/test_metrics.py
+15
-6
vllm/v1/metrics/loggers.py
vllm/v1/metrics/loggers.py
+48
-40
No files found.
tests/entrypoints/openai/test_metrics.py
View file @
2942970d
...
@@ -232,6 +232,9 @@ EXPECTED_METRICS_V1 = [
...
@@ -232,6 +232,9 @@ EXPECTED_METRICS_V1 = [
"vllm:gpu_cache_usage_perc"
,
"vllm:gpu_cache_usage_perc"
,
"vllm:gpu_prefix_cache_queries"
,
"vllm:gpu_prefix_cache_queries"
,
"vllm:gpu_prefix_cache_hits"
,
"vllm:gpu_prefix_cache_hits"
,
"vllm:kv_cache_usage_perc"
,
"vllm:prefix_cache_queries"
,
"vllm:prefix_cache_hits"
,
"vllm:num_preemptions_total"
,
"vllm:num_preemptions_total"
,
"vllm:prompt_tokens_total"
,
"vllm:prompt_tokens_total"
,
"vllm:generation_tokens_total"
,
"vllm:generation_tokens_total"
,
...
@@ -277,6 +280,9 @@ EXPECTED_METRICS_V1 = [
...
@@ -277,6 +280,9 @@ EXPECTED_METRICS_V1 = [
]
]
HIDDEN_DEPRECATED_METRICS
:
list
[
str
]
=
[
HIDDEN_DEPRECATED_METRICS
:
list
[
str
]
=
[
"vllm:gpu_cache_usage_perc"
,
"vllm:gpu_prefix_cache_queries"
,
"vllm:gpu_prefix_cache_hits"
,
"vllm:time_per_output_token_seconds_sum"
,
"vllm:time_per_output_token_seconds_sum"
,
"vllm:time_per_output_token_seconds_bucket"
,
"vllm:time_per_output_token_seconds_bucket"
,
"vllm:time_per_output_token_seconds_count"
,
"vllm:time_per_output_token_seconds_count"
,
...
@@ -307,7 +313,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
...
@@ -307,7 +313,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
client
:
openai
.
AsyncClient
,
use_v1
:
bool
):
client
:
openai
.
AsyncClient
,
use_v1
:
bool
):
running_requests
,
waiting_requests
,
kv_cache_usage
=
(
running_requests
,
waiting_requests
,
kv_cache_usage
=
(
_get_running_metrics_from_api
(
server
))
_get_running_metrics_from_api
(
server
,
use_v1
))
# Expect no running requests or kvcache usage
# Expect no running requests or kvcache usage
assert
running_requests
==
0
assert
running_requests
==
0
...
@@ -330,7 +336,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
...
@@ -330,7 +336,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
# Check that we have running requests
# Check that we have running requests
running_requests
,
waiting_requests
,
kv_cache_usage
=
(
running_requests
,
waiting_requests
,
kv_cache_usage
=
(
_get_running_metrics_from_api
(
server
))
_get_running_metrics_from_api
(
server
,
use_v1
))
# Expect running requests and kvcache usage
# Expect running requests and kvcache usage
assert
running_requests
>
0
assert
running_requests
>
0
...
@@ -349,7 +355,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
...
@@ -349,7 +355,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
# Verify running and waiting requests counts and KV cache usage are zero
# Verify running and waiting requests counts and KV cache usage are zero
running_requests_after
,
waiting_requests_after
,
kv_cache_usage_after
=
(
running_requests_after
,
waiting_requests_after
,
kv_cache_usage_after
=
(
_get_running_metrics_from_api
(
server
))
_get_running_metrics_from_api
(
server
,
use_v1
))
assert
running_requests_after
==
0
,
\
assert
running_requests_after
==
0
,
\
(
f
"Expected 0 running requests after abort, got "
(
f
"Expected 0 running requests after abort, got "
...
@@ -362,7 +368,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
...
@@ -362,7 +368,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
f
"
{
kv_cache_usage_after
}
"
)
f
"
{
kv_cache_usage_after
}
"
)
def
_get_running_metrics_from_api
(
server
:
RemoteOpenAIServer
):
def
_get_running_metrics_from_api
(
server
:
RemoteOpenAIServer
,
use_v1
:
bool
):
"""Return (running_count, waiting_count, kv_cache_usage)"""
"""Return (running_count, waiting_count, kv_cache_usage)"""
response
=
requests
.
get
(
server
.
url_for
(
"metrics"
))
response
=
requests
.
get
(
server
.
url_for
(
"metrics"
))
...
@@ -371,6 +377,9 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer):
...
@@ -371,6 +377,9 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer):
# Verify running and waiting requests counts and KV cache usage are zero
# Verify running and waiting requests counts and KV cache usage are zero
running_requests
,
waiting_requests
,
kv_cache_usage
=
None
,
None
,
None
running_requests
,
waiting_requests
,
kv_cache_usage
=
None
,
None
,
None
kv_cache_usage_metric
=
(
"vllm:kv_cache_usage_perc"
if
use_v1
else
"vllm:gpu_cache_usage_perc"
)
for
family
in
text_string_to_metric_families
(
response
.
text
):
for
family
in
text_string_to_metric_families
(
response
.
text
):
if
family
.
name
==
"vllm:num_requests_running"
:
if
family
.
name
==
"vllm:num_requests_running"
:
for
sample
in
family
.
samples
:
for
sample
in
family
.
samples
:
...
@@ -382,9 +391,9 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer):
...
@@ -382,9 +391,9 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer):
if
sample
.
name
==
"vllm:num_requests_waiting"
:
if
sample
.
name
==
"vllm:num_requests_waiting"
:
waiting_requests
=
sample
.
value
waiting_requests
=
sample
.
value
break
break
elif
family
.
name
==
"vllm:gpu
_cache_usage_
perc"
:
elif
family
.
name
==
kv
_cache_usage_
metric
:
for
sample
in
family
.
samples
:
for
sample
in
family
.
samples
:
if
sample
.
name
==
"vllm:gpu
_cache_usage_
perc"
:
if
sample
.
name
==
kv
_cache_usage_
metric
:
kv_cache_usage
=
sample
.
value
kv_cache_usage
=
sample
.
value
break
break
...
...
vllm/v1/metrics/loggers.py
View file @
2942970d
...
@@ -202,40 +202,46 @@ class PrometheusStatLogger(StatLoggerBase):
...
@@ -202,40 +202,46 @@ class PrometheusStatLogger(StatLoggerBase):
#
#
# GPU cache
# GPU cache
#
#
# Deprecated in 0.9 - Renamed as vllm:kv_cache_usage_perc
# Deprecated in 0.9.2 - Renamed as vllm:kv_cache_usage_perc
# TODO: in 0.10, only enable if show_hidden_metrics=True
# With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
gauge_gpu_cache_usage
=
self
.
_gauge_cls
(
# TODO: remove in 0.12.0
name
=
"vllm:gpu_cache_usage_perc"
,
if
self
.
show_hidden_metrics
:
documentation
=
(
gauge_gpu_cache_usage
=
self
.
_gauge_cls
(
"GPU KV-cache usage. 1 means 100 percent usage."
name
=
"vllm:gpu_cache_usage_perc"
,
"DEPRECATED: Use vllm:kv_cache_usage_perc instead."
),
documentation
=
(
multiprocess_mode
=
"mostrecent"
,
"GPU KV-cache usage. 1 means 100 percent usage."
labelnames
=
labelnames
)
"DEPRECATED: Use vllm:kv_cache_usage_perc instead."
),
self
.
gauge_gpu_cache_usage
=
make_per_engine
(
gauge_gpu_cache_usage
,
multiprocess_mode
=
"mostrecent"
,
engine_indexes
,
labelnames
=
labelnames
)
model_name
)
self
.
gauge_gpu_cache_usage
=
make_per_engine
(
gauge_gpu_cache_usage
,
engine_indexes
,
model_name
)
# Deprecated in 0.9 - Renamed as vllm:prefix_cache_queries
# TODO: in 0.10, only enable if show_hidden_metrics=True
# Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_queries
counter_gpu_prefix_cache_queries
=
self
.
_counter_cls
(
# With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
name
=
"vllm:gpu_prefix_cache_queries"
,
# TODO: remove in 0.12.0
documentation
=
(
if
self
.
show_hidden_metrics
:
"GPU prefix cache queries, in terms of number of queried"
counter_gpu_prefix_cache_queries
=
self
.
_counter_cls
(
"tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."
),
name
=
"vllm:gpu_prefix_cache_queries"
,
labelnames
=
labelnames
)
documentation
=
(
self
.
counter_gpu_prefix_cache_queries
=
make_per_engine
(
"GPU prefix cache queries, in terms of number of queried"
counter_gpu_prefix_cache_queries
,
engine_indexes
,
model_name
)
"tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."
),
# Deprecated in 0.9 - Renamed as vllm:prefix_cache_hits
labelnames
=
labelnames
)
# TODO: in 0.10, only enable if show_hidden_metrics=True
self
.
counter_gpu_prefix_cache_queries
=
make_per_engine
(
counter_gpu_prefix_cache_hits
=
self
.
_counter_cls
(
counter_gpu_prefix_cache_queries
,
engine_indexes
,
model_name
)
name
=
"vllm:gpu_prefix_cache_hits"
,
documentation
=
(
# Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_hits
"GPU prefix cache hits, in terms of number of cached "
# With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
"tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."
),
# TODO: remove in 0.12.0
labelnames
=
labelnames
)
if
self
.
show_hidden_metrics
:
self
.
counter_gpu_prefix_cache_hits
=
make_per_engine
(
counter_gpu_prefix_cache_hits
=
self
.
_counter_cls
(
counter_gpu_prefix_cache_hits
,
engine_indexes
,
model_name
)
name
=
"vllm:gpu_prefix_cache_hits"
,
documentation
=
(
"GPU prefix cache hits, in terms of number of cached "
"tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."
),
labelnames
=
labelnames
)
self
.
counter_gpu_prefix_cache_hits
=
make_per_engine
(
counter_gpu_prefix_cache_hits
,
engine_indexes
,
model_name
)
gauge_kv_cache_usage
=
self
.
_gauge_cls
(
gauge_kv_cache_usage
=
self
.
_gauge_cls
(
name
=
"vllm:kv_cache_usage_perc"
,
name
=
"vllm:kv_cache_usage_perc"
,
...
@@ -509,15 +515,17 @@ class PrometheusStatLogger(StatLoggerBase):
...
@@ -509,15 +515,17 @@ class PrometheusStatLogger(StatLoggerBase):
self
.
gauge_scheduler_waiting
[
engine_idx
].
set
(
self
.
gauge_scheduler_waiting
[
engine_idx
].
set
(
scheduler_stats
.
num_waiting_reqs
)
scheduler_stats
.
num_waiting_reqs
)
self
.
gauge_gpu_cache_usage
[
engine_idx
].
set
(
if
self
.
show_hidden_metrics
:
scheduler_stats
.
kv_cache_usage
)
self
.
gauge_gpu_cache_usage
[
engine_idx
].
set
(
scheduler_stats
.
kv_cache_usage
)
self
.
gauge_kv_cache_usage
[
engine_idx
].
set
(
self
.
gauge_kv_cache_usage
[
engine_idx
].
set
(
scheduler_stats
.
kv_cache_usage
)
scheduler_stats
.
kv_cache_usage
)
self
.
counter_gpu_prefix_cache_queries
[
engine_idx
].
inc
(
if
self
.
show_hidden_metrics
:
scheduler_stats
.
prefix_cache_stats
.
queries
)
self
.
counter_gpu_prefix_cache_queries
[
engine_idx
].
inc
(
self
.
counter_gpu_prefix_cache_hits
[
engine_idx
].
inc
(
scheduler_stats
.
prefix_cache_stats
.
queries
)
scheduler_stats
.
prefix_cache_stats
.
hits
)
self
.
counter_gpu_prefix_cache_hits
[
engine_idx
].
inc
(
scheduler_stats
.
prefix_cache_stats
.
hits
)
self
.
counter_prefix_cache_queries
[
engine_idx
].
inc
(
self
.
counter_prefix_cache_queries
[
engine_idx
].
inc
(
scheduler_stats
.
prefix_cache_stats
.
queries
)
scheduler_stats
.
prefix_cache_stats
.
queries
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment