Unverified Commit 2dfdfed8 authored by Mark McLoughlin's avatar Mark McLoughlin Committed by GitHub
Browse files

[V0][Metrics] Deprecate some KV/prefix cache metrics (#14136)


Signed-off-by: default avatarMark McLoughlin <markmc@redhat.com>
parent c41d2715
...@@ -74,31 +74,51 @@ class Metrics: ...@@ -74,31 +74,51 @@ class Metrics:
], ],
multiprocess_mode="livemostrecent", multiprocess_mode="livemostrecent",
) )
# Deprecated in 0.8 - KV cache offloading is not used in V1
# TODO: in 0.9, only enable if show_hidden_metrics=True
self.gauge_scheduler_swapped = self._gauge_cls( self.gauge_scheduler_swapped = self._gauge_cls(
name="vllm:num_requests_swapped", name="vllm:num_requests_swapped",
documentation="Number of requests swapped to CPU.", documentation=(
"Number of requests swapped to CPU. "
"DEPRECATED: KV cache offloading is not used in V1"),
labelnames=labelnames, labelnames=labelnames,
multiprocess_mode="sum") multiprocess_mode="sum")
# KV Cache Usage in % # KV Cache Usage in %
self.gauge_gpu_cache_usage = self._gauge_cls( self.gauge_gpu_cache_usage = self._gauge_cls(
name="vllm:gpu_cache_usage_perc", name="vllm:gpu_cache_usage_perc",
documentation="GPU KV-cache usage. 1 means 100 percent usage.", documentation="GPU KV-cache usage. 1 means 100 percent usage.",
labelnames=labelnames, labelnames=labelnames,
multiprocess_mode="sum") multiprocess_mode="sum")
# Deprecated in 0.8 - KV cache offloading is not used in V1
# TODO: in 0.9, only enable if show_hidden_metrics=True
self.gauge_cpu_cache_usage = self._gauge_cls( self.gauge_cpu_cache_usage = self._gauge_cls(
name="vllm:cpu_cache_usage_perc", name="vllm:cpu_cache_usage_perc",
documentation="CPU KV-cache usage. 1 means 100 percent usage.", documentation=(
"CPU KV-cache usage. 1 means 100 percent usage. "
"DEPRECATED: KV cache offloading is not used in V1"),
labelnames=labelnames, labelnames=labelnames,
multiprocess_mode="sum") multiprocess_mode="sum")
# Prefix caching block hit rate
# Deprecated in 0.8 - KV cache offloading is not used in V1
# TODO: in 0.9, only enable if show_hidden_metrics=True
self.gauge_cpu_prefix_cache_hit_rate = self._gauge_cls( self.gauge_cpu_prefix_cache_hit_rate = self._gauge_cls(
name="vllm:cpu_prefix_cache_hit_rate", name="vllm:cpu_prefix_cache_hit_rate",
documentation="CPU prefix cache block hit rate.", documentation=(
"CPU prefix cache block hit rate. "
"DEPRECATED: KV cache offloading is not used in V1"),
labelnames=labelnames, labelnames=labelnames,
multiprocess_mode="sum") multiprocess_mode="sum")
# Deprecated in 0.8 - replaced by queries+hits counters in V1
# TODO: in 0.9, only enable if show_hidden_metrics=True
self.gauge_gpu_prefix_cache_hit_rate = self._gauge_cls( self.gauge_gpu_prefix_cache_hit_rate = self._gauge_cls(
name="vllm:gpu_prefix_cache_hit_rate", name="vllm:gpu_prefix_cache_hit_rate",
documentation="GPU prefix cache block hit rate.", documentation=("GPU prefix cache block hit rate. "
"DEPRECATED: use vllm:gpu_prefix_cache_queries and "
"vllm:gpu_prefix_cache_queries in V1"),
labelnames=labelnames, labelnames=labelnames,
multiprocess_mode="sum") multiprocess_mode="sum")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment