Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
2dfdfed8
Unverified
Commit
2dfdfed8
authored
Mar 03, 2025
by
Mark McLoughlin
Committed by
GitHub
Mar 03, 2025
Browse files
[V0][Metrics] Deprecate some KV/prefix cache metrics (#14136)
Signed-off-by:
Mark McLoughlin
<
markmc@redhat.com
>
parent
c41d2715
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
25 additions
and
5 deletions
+25
-5
vllm/engine/metrics.py
vllm/engine/metrics.py
+25
-5
No files found.
vllm/engine/metrics.py
View file @
2dfdfed8
...
@@ -74,31 +74,51 @@ class Metrics:
...
@@ -74,31 +74,51 @@ class Metrics:
],
],
multiprocess_mode
=
"livemostrecent"
,
multiprocess_mode
=
"livemostrecent"
,
)
)
# Deprecated in 0.8 - KV cache offloading is not used in V1
# TODO: in 0.9, only enable if show_hidden_metrics=True
self
.
gauge_scheduler_swapped
=
self
.
_gauge_cls
(
self
.
gauge_scheduler_swapped
=
self
.
_gauge_cls
(
name
=
"vllm:num_requests_swapped"
,
name
=
"vllm:num_requests_swapped"
,
documentation
=
"Number of requests swapped to CPU."
,
documentation
=
(
"Number of requests swapped to CPU. "
"DEPRECATED: KV cache offloading is not used in V1"
),
labelnames
=
labelnames
,
labelnames
=
labelnames
,
multiprocess_mode
=
"sum"
)
multiprocess_mode
=
"sum"
)
# KV Cache Usage in %
# KV Cache Usage in %
self
.
gauge_gpu_cache_usage
=
self
.
_gauge_cls
(
self
.
gauge_gpu_cache_usage
=
self
.
_gauge_cls
(
name
=
"vllm:gpu_cache_usage_perc"
,
name
=
"vllm:gpu_cache_usage_perc"
,
documentation
=
"GPU KV-cache usage. 1 means 100 percent usage."
,
documentation
=
"GPU KV-cache usage. 1 means 100 percent usage."
,
labelnames
=
labelnames
,
labelnames
=
labelnames
,
multiprocess_mode
=
"sum"
)
multiprocess_mode
=
"sum"
)
# Deprecated in 0.8 - KV cache offloading is not used in V1
# TODO: in 0.9, only enable if show_hidden_metrics=True
self
.
gauge_cpu_cache_usage
=
self
.
_gauge_cls
(
self
.
gauge_cpu_cache_usage
=
self
.
_gauge_cls
(
name
=
"vllm:cpu_cache_usage_perc"
,
name
=
"vllm:cpu_cache_usage_perc"
,
documentation
=
"CPU KV-cache usage. 1 means 100 percent usage."
,
documentation
=
(
"CPU KV-cache usage. 1 means 100 percent usage. "
"DEPRECATED: KV cache offloading is not used in V1"
),
labelnames
=
labelnames
,
labelnames
=
labelnames
,
multiprocess_mode
=
"sum"
)
multiprocess_mode
=
"sum"
)
# Prefix caching block hit rate
# Deprecated in 0.8 - KV cache offloading is not used in V1
# TODO: in 0.9, only enable if show_hidden_metrics=True
self
.
gauge_cpu_prefix_cache_hit_rate
=
self
.
_gauge_cls
(
self
.
gauge_cpu_prefix_cache_hit_rate
=
self
.
_gauge_cls
(
name
=
"vllm:cpu_prefix_cache_hit_rate"
,
name
=
"vllm:cpu_prefix_cache_hit_rate"
,
documentation
=
"CPU prefix cache block hit rate."
,
documentation
=
(
"CPU prefix cache block hit rate. "
"DEPRECATED: KV cache offloading is not used in V1"
),
labelnames
=
labelnames
,
labelnames
=
labelnames
,
multiprocess_mode
=
"sum"
)
multiprocess_mode
=
"sum"
)
# Deprecated in 0.8 - replaced by queries+hits counters in V1
# TODO: in 0.9, only enable if show_hidden_metrics=True
self
.
gauge_gpu_prefix_cache_hit_rate
=
self
.
_gauge_cls
(
self
.
gauge_gpu_prefix_cache_hit_rate
=
self
.
_gauge_cls
(
name
=
"vllm:gpu_prefix_cache_hit_rate"
,
name
=
"vllm:gpu_prefix_cache_hit_rate"
,
documentation
=
"GPU prefix cache block hit rate."
,
documentation
=
(
"GPU prefix cache block hit rate. "
"DEPRECATED: use vllm:gpu_prefix_cache_queries and "
"vllm:gpu_prefix_cache_hits in V1"
),
labelnames
=
labelnames
,
labelnames
=
labelnames
,
multiprocess_mode
=
"sum"
)
multiprocess_mode
=
"sum"
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment