Commit 9cf4edae (unverified), authored by Mark McLoughlin, committed by GitHub
Browse files

[Metrics] Scheduled removal of deprecated metrics (#29330)


Signed-off-by: Mark McLoughlin <markmc@redhat.com>
parent 7012d8b4
...@@ -183,9 +183,6 @@ async def test_metrics_counts( ...@@ -183,9 +183,6 @@ async def test_metrics_counts(
EXPECTED_METRICS_V1 = [ EXPECTED_METRICS_V1 = [
"vllm:num_requests_running", "vllm:num_requests_running",
"vllm:num_requests_waiting", "vllm:num_requests_waiting",
"vllm:gpu_cache_usage_perc",
"vllm:gpu_prefix_cache_queries",
"vllm:gpu_prefix_cache_hits",
"vllm:kv_cache_usage_perc", "vllm:kv_cache_usage_perc",
"vllm:prefix_cache_queries", "vllm:prefix_cache_queries",
"vllm:prefix_cache_hits", "vllm:prefix_cache_hits",
......
...@@ -440,57 +440,6 @@ class PrometheusStatLogger(AggregateStatLoggerBase): ...@@ -440,57 +440,6 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
# Setting default values # Setting default values
self.record_sleep_state() self.record_sleep_state()
# GPU cache
#
# Deprecated in 0.9.2 - Renamed as vllm:kv_cache_usage_perc
# With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
# TODO: remove in 0.12.0
if self.show_hidden_metrics:
gauge_gpu_cache_usage = self._gauge_cls(
name="vllm:gpu_cache_usage_perc",
documentation=(
"GPU KV-cache usage. 1 means 100 percent usage."
"DEPRECATED: Use vllm:kv_cache_usage_perc instead."
),
multiprocess_mode="mostrecent",
labelnames=labelnames,
)
self.gauge_gpu_cache_usage = make_per_engine(
gauge_gpu_cache_usage, engine_indexes, model_name
)
# Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_queries
# With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
# TODO: remove in 0.12.0
if self.show_hidden_metrics:
counter_gpu_prefix_cache_queries = self._counter_cls(
name="vllm:gpu_prefix_cache_queries",
documentation=(
"GPU prefix cache queries, in terms of number of queried"
"tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."
),
labelnames=labelnames,
)
self.counter_gpu_prefix_cache_queries = make_per_engine(
counter_gpu_prefix_cache_queries, engine_indexes, model_name
)
# Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_hits
# With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
# TODO: remove in 0.12.0
if self.show_hidden_metrics:
counter_gpu_prefix_cache_hits = self._counter_cls(
name="vllm:gpu_prefix_cache_hits",
documentation=(
"GPU prefix cache hits, in terms of number of cached "
"tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."
),
labelnames=labelnames,
)
self.counter_gpu_prefix_cache_hits = make_per_engine(
counter_gpu_prefix_cache_hits, engine_indexes, model_name
)
gauge_kv_cache_usage = self._gauge_cls( gauge_kv_cache_usage = self._gauge_cls(
name="vllm:kv_cache_usage_perc", name="vllm:kv_cache_usage_perc",
documentation="KV-cache usage. 1 means 100 percent usage.", documentation="KV-cache usage. 1 means 100 percent usage.",
...@@ -735,7 +684,9 @@ class PrometheusStatLogger(AggregateStatLoggerBase): ...@@ -735,7 +684,9 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
) )
# Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds # Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds
# TODO: in 0.12, only enable if show_hidden_metrics=True # With 0.12.x you can enable with --show-hidden-metrics-for-version=0.11
# TODO: remove in 0.13.0
if self.show_hidden_metrics:
histogram_time_per_output_token = self._histogram_cls( histogram_time_per_output_token = self._histogram_cls(
name="vllm:time_per_output_token_seconds", name="vllm:time_per_output_token_seconds",
documentation=( documentation=(
...@@ -966,20 +917,8 @@ class PrometheusStatLogger(AggregateStatLoggerBase): ...@@ -966,20 +917,8 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
self.gauge_scheduler_waiting[engine_idx].set( self.gauge_scheduler_waiting[engine_idx].set(
scheduler_stats.num_waiting_reqs scheduler_stats.num_waiting_reqs
) )
if self.show_hidden_metrics:
self.gauge_gpu_cache_usage[engine_idx].set(
scheduler_stats.kv_cache_usage
)
self.gauge_kv_cache_usage[engine_idx].set(scheduler_stats.kv_cache_usage) self.gauge_kv_cache_usage[engine_idx].set(scheduler_stats.kv_cache_usage)
if self.show_hidden_metrics:
self.counter_gpu_prefix_cache_queries[engine_idx].inc(
scheduler_stats.prefix_cache_stats.queries
)
self.counter_gpu_prefix_cache_hits[engine_idx].inc(
scheduler_stats.prefix_cache_stats.hits
)
self.counter_prefix_cache_queries[engine_idx].inc( self.counter_prefix_cache_queries[engine_idx].inc(
scheduler_stats.prefix_cache_stats.queries scheduler_stats.prefix_cache_stats.queries
) )
...@@ -1050,6 +989,7 @@ class PrometheusStatLogger(AggregateStatLoggerBase): ...@@ -1050,6 +989,7 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
self.histogram_time_to_first_token[engine_idx].observe(ttft) self.histogram_time_to_first_token[engine_idx].observe(ttft)
for itl in iteration_stats.inter_token_latencies_iter: for itl in iteration_stats.inter_token_latencies_iter:
self.histogram_inter_token_latency[engine_idx].observe(itl) self.histogram_inter_token_latency[engine_idx].observe(itl)
if self.show_hidden_metrics:
self.histogram_time_per_output_token[engine_idx].observe(itl) self.histogram_time_per_output_token[engine_idx].observe(itl)
for finished_request in iteration_stats.finished_requests: for finished_request in iteration_stats.finished_requests:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment