Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
98d7367b
Unverified
Commit
98d7367b
authored
Apr 02, 2025
by
Mark McLoughlin
Committed by
GitHub
Apr 02, 2025
Browse files
[Metrics] Hide deprecated metrics (#15458)
Signed-off-by:
Mark McLoughlin
<
markmc@redhat.com
>
parent
594a8b90
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
114 additions
and
82 deletions
+114
-82
tests/entrypoints/openai/test_metrics.py
tests/entrypoints/openai/test_metrics.py
+17
-4
tests/utils.py
tests/utils.py
+3
-0
vllm/engine/metrics.py
vllm/engine/metrics.py
+85
-78
vllm/version.py
vllm/version.py
+9
-0
No files found.
tests/entrypoints/openai/test_metrics.py
View file @
98d7367b
...
...
@@ -13,9 +13,12 @@ import requests
from
prometheus_client.parser
import
text_string_to_metric_families
from
transformers
import
AutoTokenizer
from
vllm
import
version
from
...utils
import
RemoteOpenAIServer
MODEL_NAME
=
"TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Version string returned by version._prev_minor_version(); it is interpolated
# into the "--show-hidden-metrics-for-version=" server argument further down.
PREV_MINOR_VERSION
=
version
.
_prev_minor_version
()
@
pytest
.
fixture
(
scope
=
"module"
,
params
=
[
True
,
False
])
...
...
@@ -55,6 +58,7 @@ def default_server_args():
""
,
"--enable-chunked-prefill"
,
"--disable-frontend-multiprocessing"
,
f
"--show-hidden-metrics-for-version=
{
PREV_MINOR_VERSION
}
"
,
])
def
server
(
use_v1
,
default_server_args
,
request
):
if
request
.
param
:
...
...
@@ -129,7 +133,9 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
# Loop over all expected metric_families
for
metric_family
,
suffix_values_list
in
EXPECTED_VALUES
.
items
():
if
use_v1
and
metric_family
not
in
EXPECTED_METRICS_V1
:
if
((
use_v1
and
metric_family
not
in
EXPECTED_METRICS_V1
)
or
(
not
server
.
show_hidden_metrics
and
metric_family
in
HIDDEN_DEPRECATED_METRICS
)):
continue
found_metric
=
False
...
...
@@ -165,10 +171,10 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
EXPECTED_METRICS
=
[
"vllm:num_requests_running"
,
"vllm:num_requests_swapped"
,
"vllm:num_requests_swapped"
,
# deprecated
"vllm:num_requests_waiting"
,
"vllm:gpu_cache_usage_perc"
,
"vllm:cpu_cache_usage_perc"
,
"vllm:cpu_cache_usage_perc"
,
# deprecated
"vllm:time_to_first_token_seconds_sum"
,
"vllm:time_to_first_token_seconds_bucket"
,
"vllm:time_to_first_token_seconds_count"
,
...
...
@@ -268,6 +274,11 @@ EXPECTED_METRICS_V1 = [
"vllm:request_decode_time_seconds_count"
,
]
# Deprecated metric names that the tests in this file only consider when
# server.show_hidden_metrics is true; test_metrics_counts skips these
# families otherwise.
HIDDEN_DEPRECATED_METRICS
=
[
"vllm:num_requests_swapped"
,
"vllm:cpu_cache_usage_perc"
,
]
@
pytest
.
mark
.
asyncio
async
def
test_metrics_exist
(
server
:
RemoteOpenAIServer
,
...
...
@@ -282,7 +293,9 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
assert
response
.
status_code
==
HTTPStatus
.
OK
for
metric
in
(
EXPECTED_METRICS_V1
if
use_v1
else
EXPECTED_METRICS
):
assert
metric
in
response
.
text
if
(
not
server
.
show_hidden_metrics
and
metric
not
in
HIDDEN_DEPRECATED_METRICS
):
assert
metric
in
response
.
text
def
test_metrics_exist_run_batch
(
use_v1
:
bool
):
...
...
tests/utils.py
View file @
98d7367b
...
...
@@ -104,6 +104,9 @@ class RemoteOpenAIServer:
self
.
host
=
str
(
args
.
host
or
'localhost'
)
self
.
port
=
int
(
args
.
port
)
self
.
show_hidden_metrics
=
\
args
.
show_hidden_metrics_for_version
is
not
None
# download the model before starting the server to avoid timeout
is_local
=
os
.
path
.
isdir
(
model
)
if
not
is_local
:
...
...
vllm/engine/metrics.py
View file @
98d7367b
...
...
@@ -52,6 +52,11 @@ class Metrics:
max_model_len
=
vllm_config
.
model_config
.
max_model_len
# Use this flag to hide metrics that were deprecated in
# a previous release and which will be removed in a future release
self
.
show_hidden_metrics
=
\
vllm_config
.
observability_config
.
show_hidden_metrics
# System stats
# Scheduler State
self
.
gauge_scheduler_running
=
self
.
_gauge_cls
(
...
...
@@ -76,14 +81,15 @@ class Metrics:
)
# Deprecated in 0.8 - KV cache offloading is not used in V1
# TODO: in 0.9, only enable if show_hidden_metrics=True
self
.
gauge_scheduler_swapped
=
self
.
_gauge_cls
(
name
=
"vllm:num_requests_swapped"
,
documentation
=
(
"Number of requests swapped to CPU. "
"DEPRECATED: KV cache offloading is not used in V1"
),
labelnames
=
labelnames
,
multiprocess_mode
=
"sum"
)
# Hidden in 0.9, due to be removed in 0.10
if
self
.
show_hidden_metrics
:
self
.
gauge_scheduler_swapped
=
self
.
_gauge_cls
(
name
=
"vllm:num_requests_swapped"
,
documentation
=
(
"Number of requests swapped to CPU. "
"DEPRECATED: KV cache offloading is not used in V1"
),
labelnames
=
labelnames
,
multiprocess_mode
=
"sum"
)
# KV Cache Usage in %
self
.
gauge_gpu_cache_usage
=
self
.
_gauge_cls
(
...
...
@@ -93,34 +99,33 @@ class Metrics:
multiprocess_mode
=
"sum"
)
# Deprecated in 0.8 - KV cache offloading is not used in V1
# TODO: in 0.9, only enable if show_hidden_metrics=True
self
.
gauge_cpu_cache_usage
=
self
.
_gauge_cls
(
name
=
"vllm:cpu_cache_usage_perc"
,
documentation
=
(
"CPU KV-cache usage. 1 means 100 percent usage. "
"DEPRECATED: KV cache offloading is not used in V1"
),
labelnames
=
labelnames
,
multiprocess_mode
=
"sum"
)
# Deprecated in 0.8 - KV cache offloading is not used in V1
# TODO: in 0.9, only enable if show_hidden_metrics=True
self
.
gauge_cpu_prefix_cache_hit_rate
=
self
.
_gauge_cls
(
name
=
"vllm:cpu_prefix_cache_hit_rate"
,
documentation
=
(
"CPU prefix cache block hit rate. "
"DEPRECATED: KV cache offloading is not used in V1"
),
labelnames
=
labelnames
,
multiprocess_mode
=
"sum"
)
# Hidden in 0.9, due to be removed in 0.10
if
self
.
show_hidden_metrics
:
self
.
gauge_cpu_cache_usage
=
self
.
_gauge_cls
(
name
=
"vllm:cpu_cache_usage_perc"
,
documentation
=
(
"CPU KV-cache usage. 1 means 100 percent usage. "
"DEPRECATED: KV cache offloading is not used in V1"
),
labelnames
=
labelnames
,
multiprocess_mode
=
"sum"
)
self
.
gauge_cpu_prefix_cache_hit_rate
=
self
.
_gauge_cls
(
name
=
"vllm:cpu_prefix_cache_hit_rate"
,
documentation
=
(
"CPU prefix cache block hit rate. "
"DEPRECATED: KV cache offloading is not used in V1"
),
labelnames
=
labelnames
,
multiprocess_mode
=
"sum"
)
# Deprecated in 0.8 - replaced by queries+hits counters in V1
# TODO: in 0.9, only enable if show_hidden_metrics=True
self
.
gauge_gpu_prefix_cache_hit_rate
=
self
.
_gauge_cls
(
name
=
"vllm:gpu_prefix_cache_hit_rate"
,
documentation
=
(
"GPU prefix cache block hit rate. "
"DEPRECATED: use vllm:gpu_prefix_cache_queries and "
"vllm:gpu_prefix_cache_queries in V1"
),
labelnames
=
labelnames
,
multiprocess_mode
=
"sum"
)
# Hidden in 0.9, due to be removed in 0.10
if
self
.
show_hidden_metrics
:
self
.
gauge_gpu_prefix_cache_hit_rate
=
self
.
_gauge_cls
(
name
=
"vllm:gpu_prefix_cache_hit_rate"
,
documentation
=
(
"GPU prefix cache block hit rate. "
"DEPRECATED: use vllm:gpu_prefix_cache_queries "
"and vllm:gpu_prefix_cache_queries in V1"
),
labelnames
=
labelnames
,
multiprocess_mode
=
"sum"
)
# Iteration stats
self
.
counter_num_preemption
=
self
.
_counter_cls
(
...
...
@@ -198,33 +203,35 @@ class Metrics:
labelnames
=
labelnames
,
buckets
=
request_latency_buckets
)
# Deprecated in 0.8 - duplicates vllm:request_queue_time_seconds:
# TODO: in 0.9, only enable if show_hidden_metrics=True
self
.
histogram_time_in_queue_request
=
self
.
_histogram_cls
(
name
=
"vllm:time_in_queue_requests"
,
documentation
=
(
"Histogram of time the request spent in the queue in seconds. "
"DEPRECATED: use vllm:request_queue_time_seconds instead."
),
labelnames
=
labelnames
,
buckets
=
request_latency_buckets
)
# Hidden in 0.9, due to be removed in 0.10
if
self
.
show_hidden_metrics
:
self
.
histogram_time_in_queue_request
=
self
.
_histogram_cls
(
name
=
"vllm:time_in_queue_requests"
,
documentation
=
(
"Histogram of time the request spent in the queue in seconds. "
"DEPRECATED: use vllm:request_queue_time_seconds instead."
),
labelnames
=
labelnames
,
buckets
=
request_latency_buckets
)
# Deprecated in 0.8 - use prefill/decode/inference time metrics
# TODO: in 0.9, only enable if show_hidden_metrics=True
self
.
histogram_model_forward_time_request
=
self
.
_histogram_cls
(
name
=
"vllm:model_forward_time_milliseconds"
,
documentation
=
(
"Histogram of time spent in the model forward pass in ms. "
"DEPRECATED: use prefill/decode/inference time metrics instead."
),
labelnames
=
labelnames
,
buckets
=
build_1_2_3_5_8_buckets
(
3000
))
self
.
histogram_model_execute_time_request
=
self
.
_histogram_cls
(
name
=
"vllm:model_execute_time_milliseconds"
,
documentation
=
(
"Histogram of time spent in the model execute function in ms."
"DEPRECATED: use prefill/decode/inference time metrics instead."
),
labelnames
=
labelnames
,
buckets
=
build_1_2_3_5_8_buckets
(
3000
))
# Hidden in 0.9, due to be removed in 0.10
if
self
.
show_hidden_metrics
:
self
.
histogram_model_forward_time_request
=
self
.
_histogram_cls
(
name
=
"vllm:model_forward_time_milliseconds"
,
documentation
=
(
"Histogram of time spent in the model forward pass in ms. "
"DEPRECATED: use prefill/decode/inference time metrics instead"
),
labelnames
=
labelnames
,
buckets
=
build_1_2_3_5_8_buckets
(
3000
))
self
.
histogram_model_execute_time_request
=
self
.
_histogram_cls
(
name
=
"vllm:model_execute_time_milliseconds"
,
documentation
=
(
"Histogram of time spent in the model execute function in ms."
"DEPRECATED: use prefill/decode/inference time metrics instead"
),
labelnames
=
labelnames
,
buckets
=
build_1_2_3_5_8_buckets
(
3000
))
# Metadata
self
.
histogram_num_prompt_tokens_request
=
self
.
_histogram_cls
(
...
...
@@ -543,11 +550,6 @@ class PrometheusStatLogger(StatLoggerBase):
self
.
metrics
=
self
.
_metrics_cls
(
labelnames
=
list
(
labels
.
keys
()),
vllm_config
=
vllm_config
)
# Use this flag to hide metrics that were deprecated in
# a previous release and which will be removed in a future release
self
.
show_hidden_metrics
=
\
vllm_config
.
observability_config
.
show_hidden_metrics
def
_log_gauge
(
self
,
gauge
,
data
:
Union
[
int
,
float
])
->
None
:
# Convenience function for logging to gauge.
gauge
.
labels
(
**
self
.
labels
).
set
(
data
)
...
...
@@ -580,18 +582,20 @@ class PrometheusStatLogger(StatLoggerBase):
# System state data
self
.
_log_gauge
(
self
.
metrics
.
gauge_scheduler_running
,
stats
.
num_running_sys
)
self
.
_log_gauge
(
self
.
metrics
.
gauge_scheduler_swapped
,
stats
.
num_swapped_sys
)
if
self
.
metrics
.
show_hidden_metrics
:
self
.
_log_gauge
(
self
.
metrics
.
gauge_scheduler_swapped
,
stats
.
num_swapped_sys
)
self
.
_log_gauge
(
self
.
metrics
.
gauge_scheduler_waiting
,
stats
.
num_waiting_sys
)
self
.
_log_gauge
(
self
.
metrics
.
gauge_gpu_cache_usage
,
stats
.
gpu_cache_usage_sys
)
self
.
_log_gauge
(
self
.
metrics
.
gauge_cpu_cache_usage
,
stats
.
cpu_cache_usage_sys
)
self
.
_log_gauge
(
self
.
metrics
.
gauge_cpu_prefix_cache_hit_rate
,
stats
.
cpu_prefix_cache_hit_rate
)
self
.
_log_gauge
(
self
.
metrics
.
gauge_gpu_prefix_cache_hit_rate
,
stats
.
gpu_prefix_cache_hit_rate
)
if
self
.
metrics
.
show_hidden_metrics
:
self
.
_log_gauge
(
self
.
metrics
.
gauge_cpu_cache_usage
,
stats
.
cpu_cache_usage_sys
)
self
.
_log_gauge
(
self
.
metrics
.
gauge_cpu_prefix_cache_hit_rate
,
stats
.
cpu_prefix_cache_hit_rate
)
self
.
_log_gauge
(
self
.
metrics
.
gauge_gpu_prefix_cache_hit_rate
,
stats
.
gpu_prefix_cache_hit_rate
)
# Including max-lora in metric, in future this property of lora
# config maybe extended to be dynamic.
lora_info
=
{
...
...
@@ -629,12 +633,15 @@ class PrometheusStatLogger(StatLoggerBase):
stats
.
time_prefill_requests
)
self
.
_log_histogram
(
self
.
metrics
.
histogram_decode_time_request
,
stats
.
time_decode_requests
)
self
.
_log_histogram
(
self
.
metrics
.
histogram_time_in_queue_request
,
stats
.
time_in_queue_requests
)
self
.
_log_histogram
(
self
.
metrics
.
histogram_model_forward_time_request
,
stats
.
model_forward_time_requests
)
self
.
_log_histogram
(
self
.
metrics
.
histogram_model_execute_time_request
,
stats
.
model_execute_time_requests
)
if
self
.
metrics
.
show_hidden_metrics
:
self
.
_log_histogram
(
self
.
metrics
.
histogram_time_in_queue_request
,
stats
.
time_in_queue_requests
)
self
.
_log_histogram
(
self
.
metrics
.
histogram_model_forward_time_request
,
stats
.
model_forward_time_requests
)
self
.
_log_histogram
(
self
.
metrics
.
histogram_model_execute_time_request
,
stats
.
model_execute_time_requests
)
# Metadata
finished_reason_counter
=
CollectionsCounter
(
stats
.
finished_reason_requests
)
...
...
vllm/version.py
View file @
98d7367b
...
...
@@ -28,4 +28,13 @@ def _prev_minor_version_was(version_str):
return
True
# Note - this won't do the right thing when we release 1.0!
assert
__version_tuple__
[
0
]
==
0
assert
isinstance
(
__version_tuple__
[
1
],
int
)
return
version_str
==
f
"
{
__version_tuple__
[
0
]
}
.
{
__version_tuple__
[
1
]
-
1
}
"
def _prev_minor_version():
    """Build the version string of the minor release preceding this one.

    Intended for use in tests only. The result has the form
    "<major>.<minor - 1>", computed from ``__version_tuple__``.
    """
    minor = __version_tuple__[1]
    # In a dev tree this yields "0.-1", which is still fine for testing.
    assert isinstance(minor, int)
    major = __version_tuple__[0]
    return f"{major}.{minor - 1}"
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment