Unverified commit 2942970d, authored by Mark McLoughlin, committed by GitHub
Browse files

[Metrics] Hide deprecated metrics with gpu_ prefix (#24245)


Signed-off-by: Mark McLoughlin <markmc@redhat.com>
parent 3c96e7b8
...@@ -232,6 +232,9 @@ EXPECTED_METRICS_V1 = [ ...@@ -232,6 +232,9 @@ EXPECTED_METRICS_V1 = [
"vllm:gpu_cache_usage_perc", "vllm:gpu_cache_usage_perc",
"vllm:gpu_prefix_cache_queries", "vllm:gpu_prefix_cache_queries",
"vllm:gpu_prefix_cache_hits", "vllm:gpu_prefix_cache_hits",
"vllm:kv_cache_usage_perc",
"vllm:prefix_cache_queries",
"vllm:prefix_cache_hits",
"vllm:num_preemptions_total", "vllm:num_preemptions_total",
"vllm:prompt_tokens_total", "vllm:prompt_tokens_total",
"vllm:generation_tokens_total", "vllm:generation_tokens_total",
...@@ -277,6 +280,9 @@ EXPECTED_METRICS_V1 = [ ...@@ -277,6 +280,9 @@ EXPECTED_METRICS_V1 = [
] ]
HIDDEN_DEPRECATED_METRICS: list[str] = [ HIDDEN_DEPRECATED_METRICS: list[str] = [
"vllm:gpu_cache_usage_perc",
"vllm:gpu_prefix_cache_queries",
"vllm:gpu_prefix_cache_hits",
"vllm:time_per_output_token_seconds_sum", "vllm:time_per_output_token_seconds_sum",
"vllm:time_per_output_token_seconds_bucket", "vllm:time_per_output_token_seconds_bucket",
"vllm:time_per_output_token_seconds_count", "vllm:time_per_output_token_seconds_count",
...@@ -307,7 +313,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer, ...@@ -307,7 +313,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
client: openai.AsyncClient, use_v1: bool): client: openai.AsyncClient, use_v1: bool):
running_requests, waiting_requests, kv_cache_usage = ( running_requests, waiting_requests, kv_cache_usage = (
_get_running_metrics_from_api(server)) _get_running_metrics_from_api(server, use_v1))
# Expect no running requests or kvcache usage # Expect no running requests or kvcache usage
assert running_requests == 0 assert running_requests == 0
...@@ -330,7 +336,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer, ...@@ -330,7 +336,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
# Check that we have running requests # Check that we have running requests
running_requests, waiting_requests, kv_cache_usage = ( running_requests, waiting_requests, kv_cache_usage = (
_get_running_metrics_from_api(server)) _get_running_metrics_from_api(server, use_v1))
# Expect running requests and kvcache usage # Expect running requests and kvcache usage
assert running_requests > 0 assert running_requests > 0
...@@ -349,7 +355,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer, ...@@ -349,7 +355,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
# Verify running and waiting requests counts and KV cache usage are zero # Verify running and waiting requests counts and KV cache usage are zero
running_requests_after, waiting_requests_after, kv_cache_usage_after = ( running_requests_after, waiting_requests_after, kv_cache_usage_after = (
_get_running_metrics_from_api(server)) _get_running_metrics_from_api(server, use_v1))
assert running_requests_after == 0,\ assert running_requests_after == 0,\
(f"Expected 0 running requests after abort, got " (f"Expected 0 running requests after abort, got "
...@@ -362,7 +368,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer, ...@@ -362,7 +368,7 @@ async def test_abort_metrics_reset(server: RemoteOpenAIServer,
f"{kv_cache_usage_after}") f"{kv_cache_usage_after}")
def _get_running_metrics_from_api(server: RemoteOpenAIServer): def _get_running_metrics_from_api(server: RemoteOpenAIServer, use_v1: bool):
"""Return (running_count, waiting_count, kv_cache_usage)""" """Return (running_count, waiting_count, kv_cache_usage)"""
response = requests.get(server.url_for("metrics")) response = requests.get(server.url_for("metrics"))
...@@ -371,6 +377,9 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer): ...@@ -371,6 +377,9 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer):
# Verify running and waiting requests counts and KV cache usage are zero # Verify running and waiting requests counts and KV cache usage are zero
running_requests, waiting_requests, kv_cache_usage = None, None, None running_requests, waiting_requests, kv_cache_usage = None, None, None
kv_cache_usage_metric = ("vllm:kv_cache_usage_perc"
if use_v1 else "vllm:gpu_cache_usage_perc")
for family in text_string_to_metric_families(response.text): for family in text_string_to_metric_families(response.text):
if family.name == "vllm:num_requests_running": if family.name == "vllm:num_requests_running":
for sample in family.samples: for sample in family.samples:
...@@ -382,9 +391,9 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer): ...@@ -382,9 +391,9 @@ def _get_running_metrics_from_api(server: RemoteOpenAIServer):
if sample.name == "vllm:num_requests_waiting": if sample.name == "vllm:num_requests_waiting":
waiting_requests = sample.value waiting_requests = sample.value
break break
elif family.name == "vllm:gpu_cache_usage_perc": elif family.name == kv_cache_usage_metric:
for sample in family.samples: for sample in family.samples:
if sample.name == "vllm:gpu_cache_usage_perc": if sample.name == kv_cache_usage_metric:
kv_cache_usage = sample.value kv_cache_usage = sample.value
break break
......
...@@ -202,40 +202,46 @@ class PrometheusStatLogger(StatLoggerBase): ...@@ -202,40 +202,46 @@ class PrometheusStatLogger(StatLoggerBase):
# #
# GPU cache # GPU cache
# #
# Deprecated in 0.9 - Renamed as vllm:kv_cache_usage_perc # Deprecated in 0.9.2 - Renamed as vllm:kv_cache_usage_perc
# TODO: in 0.10, only enable if show_hidden_metrics=True # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
gauge_gpu_cache_usage = self._gauge_cls( # TODO: remove in 0.12.0
name="vllm:gpu_cache_usage_perc", if self.show_hidden_metrics:
documentation=( gauge_gpu_cache_usage = self._gauge_cls(
"GPU KV-cache usage. 1 means 100 percent usage." name="vllm:gpu_cache_usage_perc",
"DEPRECATED: Use vllm:kv_cache_usage_perc instead."), documentation=(
multiprocess_mode="mostrecent", "GPU KV-cache usage. 1 means 100 percent usage."
labelnames=labelnames) "DEPRECATED: Use vllm:kv_cache_usage_perc instead."),
self.gauge_gpu_cache_usage = make_per_engine(gauge_gpu_cache_usage, multiprocess_mode="mostrecent",
engine_indexes, labelnames=labelnames)
model_name) self.gauge_gpu_cache_usage = make_per_engine(
gauge_gpu_cache_usage, engine_indexes, model_name)
# Deprecated in 0.9 - Renamed as vllm:prefix_cache_queries
# TODO: in 0.10, only enable if show_hidden_metrics=True # Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_queries
counter_gpu_prefix_cache_queries = self._counter_cls( # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
name="vllm:gpu_prefix_cache_queries", # TODO: remove in 0.12.0
documentation=( if self.show_hidden_metrics:
"GPU prefix cache queries, in terms of number of queried" counter_gpu_prefix_cache_queries = self._counter_cls(
"tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."), name="vllm:gpu_prefix_cache_queries",
labelnames=labelnames) documentation=(
self.counter_gpu_prefix_cache_queries = make_per_engine( "GPU prefix cache queries, in terms of number of queried"
counter_gpu_prefix_cache_queries, engine_indexes, model_name) "tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."
),
# Deprecated in 0.9 - Renamed as vllm:prefix_cache_hits labelnames=labelnames)
# TODO: in 0.10, only enable if show_hidden_metrics=True self.counter_gpu_prefix_cache_queries = make_per_engine(
counter_gpu_prefix_cache_hits = self._counter_cls( counter_gpu_prefix_cache_queries, engine_indexes, model_name)
name="vllm:gpu_prefix_cache_hits",
documentation=( # Deprecated in 0.9.2 - Renamed as vllm:prefix_cache_hits
"GPU prefix cache hits, in terms of number of cached " # With 0.11.x you can enable with --show-hidden-metrics-for-version=0.10
"tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."), # TODO: remove in 0.12.0
labelnames=labelnames) if self.show_hidden_metrics:
self.counter_gpu_prefix_cache_hits = make_per_engine( counter_gpu_prefix_cache_hits = self._counter_cls(
counter_gpu_prefix_cache_hits, engine_indexes, model_name) name="vllm:gpu_prefix_cache_hits",
documentation=(
"GPU prefix cache hits, in terms of number of cached "
"tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."),
labelnames=labelnames)
self.counter_gpu_prefix_cache_hits = make_per_engine(
counter_gpu_prefix_cache_hits, engine_indexes, model_name)
gauge_kv_cache_usage = self._gauge_cls( gauge_kv_cache_usage = self._gauge_cls(
name="vllm:kv_cache_usage_perc", name="vllm:kv_cache_usage_perc",
...@@ -509,15 +515,17 @@ class PrometheusStatLogger(StatLoggerBase): ...@@ -509,15 +515,17 @@ class PrometheusStatLogger(StatLoggerBase):
self.gauge_scheduler_waiting[engine_idx].set( self.gauge_scheduler_waiting[engine_idx].set(
scheduler_stats.num_waiting_reqs) scheduler_stats.num_waiting_reqs)
self.gauge_gpu_cache_usage[engine_idx].set( if self.show_hidden_metrics:
scheduler_stats.kv_cache_usage) self.gauge_gpu_cache_usage[engine_idx].set(
scheduler_stats.kv_cache_usage)
self.gauge_kv_cache_usage[engine_idx].set( self.gauge_kv_cache_usage[engine_idx].set(
scheduler_stats.kv_cache_usage) scheduler_stats.kv_cache_usage)
self.counter_gpu_prefix_cache_queries[engine_idx].inc( if self.show_hidden_metrics:
scheduler_stats.prefix_cache_stats.queries) self.counter_gpu_prefix_cache_queries[engine_idx].inc(
self.counter_gpu_prefix_cache_hits[engine_idx].inc( scheduler_stats.prefix_cache_stats.queries)
scheduler_stats.prefix_cache_stats.hits) self.counter_gpu_prefix_cache_hits[engine_idx].inc(
scheduler_stats.prefix_cache_stats.hits)
self.counter_prefix_cache_queries[engine_idx].inc( self.counter_prefix_cache_queries[engine_idx].inc(
scheduler_stats.prefix_cache_stats.queries) scheduler_stats.prefix_cache_stats.queries)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment