Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0e98964e
Unverified
Commit
0e98964e
authored
May 28, 2025
by
Mark McLoughlin
Committed by
GitHub
May 28, 2025
Browse files
[V1][Metrics] Remove metrics that were deprecated in 0.8 (#18837)
Signed-off-by:
Mark McLoughlin
<
markmc@redhat.com
>
parent
c68b5c63
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
1 addition
and
156 deletions
+1
-156
docs/usage/metrics.md
docs/usage/metrics.md
+0
-13
examples/online_serving/prometheus_grafana/grafana.json
examples/online_serving/prometheus_grafana/grafana.json
+0
-30
tests/entrypoints/openai/test_metrics.py
tests/entrypoints/openai/test_metrics.py
+1
-6
vllm/engine/llm_engine.py
vllm/engine/llm_engine.py
+0
-15
vllm/engine/metrics.py
vllm/engine/metrics.py
+0
-89
vllm/engine/metrics_types.py
vllm/engine/metrics_types.py
+0
-3
No files found.
docs/usage/metrics.md
View file @
0e98964e
...
@@ -35,19 +35,6 @@ The following metrics are exposed:
...
@@ -35,19 +35,6 @@ The following metrics are exposed:
--
8
<--
"vllm/engine/metrics.py:metrics-definitions"
--
8
<--
"vllm/engine/metrics.py:metrics-definitions"
```
```
The following metrics are deprecated and due to be removed in a future version:
-
`vllm:num_requests_swapped`
,
`vllm:cpu_cache_usage_perc`
, and
`vllm:cpu_prefix_cache_hit_rate`
because KV cache offloading is not
used in V1.
-
`vllm:gpu_prefix_cache_hit_rate`
is replaced by queries+hits
counters in V1.
-
`vllm:time_in_queue_requests`
because it duplicates
`vllm:request_queue_time_seconds`
.
-
`vllm:model_forward_time_milliseconds`
and
`vllm:model_execute_time_milliseconds`
because
prefill/decode/inference time metrics should be used instead.
Note: when metrics are deprecated in version
`X.Y`
, they are hidden in version
`X.Y+1`
Note: when metrics are deprecated in version
`X.Y`
, they are hidden in version
`X.Y+1`
but can be re-enabled using the
`--show-hidden-metrics-for-version=X.Y`
escape hatch,
but can be re-enabled using the
`--show-hidden-metrics-for-version=X.Y`
escape hatch,
and are then removed in version
`X.Y+2`
.
and are then removed in version
`X.Y+2`
.
examples/online_serving/prometheus_grafana/grafana.json
View file @
0e98964e
...
@@ -577,23 +577,6 @@
...
@@ -577,23 +577,6 @@
"refId"
:
"A"
,
"refId"
:
"A"
,
"useBackend"
:
false
"useBackend"
:
false
},
},
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"${DS_PROMETHEUS}"
},
"disableTextWrap"
:
false
,
"editorMode"
:
"builder"
,
"expr"
:
"vllm:num_requests_swapped{model_name=
\"
$model_name
\"
}"
,
"fullMetaSearch"
:
false
,
"hide"
:
false
,
"includeNullMetadata"
:
true
,
"instant"
:
false
,
"legendFormat"
:
"Num Swapped"
,
"range"
:
true
,
"refId"
:
"B"
,
"useBackend"
:
false
},
{
{
"datasource"
:
{
"datasource"
:
{
"type"
:
"prometheus"
,
"type"
:
"prometheus"
,
...
@@ -874,19 +857,6 @@
...
@@ -874,19 +857,6 @@
"legendFormat"
:
"GPU Cache Usage"
,
"legendFormat"
:
"GPU Cache Usage"
,
"range"
:
true
,
"range"
:
true
,
"refId"
:
"A"
"refId"
:
"A"
},
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"${DS_PROMETHEUS}"
},
"editorMode"
:
"code"
,
"expr"
:
"vllm:cpu_cache_usage_perc{model_name=
\"
$model_name
\"
}"
,
"hide"
:
false
,
"instant"
:
false
,
"legendFormat"
:
"CPU Cache Usage"
,
"range"
:
true
,
"refId"
:
"B"
}
}
],
],
"title"
:
"Cache Utilization"
,
"title"
:
"Cache Utilization"
,
...
...
tests/entrypoints/openai/test_metrics.py
View file @
0e98964e
...
@@ -171,10 +171,8 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
...
@@ -171,10 +171,8 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
EXPECTED_METRICS
=
[
EXPECTED_METRICS
=
[
"vllm:num_requests_running"
,
"vllm:num_requests_running"
,
"vllm:num_requests_swapped"
,
# deprecated
"vllm:num_requests_waiting"
,
"vllm:num_requests_waiting"
,
"vllm:gpu_cache_usage_perc"
,
"vllm:gpu_cache_usage_perc"
,
"vllm:cpu_cache_usage_perc"
,
# deprecated
"vllm:time_to_first_token_seconds_sum"
,
"vllm:time_to_first_token_seconds_sum"
,
"vllm:time_to_first_token_seconds_bucket"
,
"vllm:time_to_first_token_seconds_bucket"
,
"vllm:time_to_first_token_seconds_count"
,
"vllm:time_to_first_token_seconds_count"
,
...
@@ -274,10 +272,7 @@ EXPECTED_METRICS_V1 = [
...
@@ -274,10 +272,7 @@ EXPECTED_METRICS_V1 = [
"vllm:request_decode_time_seconds_count"
,
"vllm:request_decode_time_seconds_count"
,
]
]
HIDDEN_DEPRECATED_METRICS
=
[
HIDDEN_DEPRECATED_METRICS
:
list
[
str
]
=
[]
"vllm:num_requests_swapped"
,
"vllm:cpu_cache_usage_perc"
,
]
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
...
...
vllm/engine/llm_engine.py
View file @
0e98964e
...
@@ -1680,9 +1680,6 @@ class LLMEngine:
...
@@ -1680,9 +1680,6 @@ class LLMEngine:
time_inference_requests
:
List
[
float
]
=
[]
time_inference_requests
:
List
[
float
]
=
[]
time_prefill_requests
:
List
[
float
]
=
[]
time_prefill_requests
:
List
[
float
]
=
[]
time_decode_requests
:
List
[
float
]
=
[]
time_decode_requests
:
List
[
float
]
=
[]
time_in_queue_requests
:
List
[
float
]
=
[]
model_forward_time_requests
:
List
[
float
]
=
[]
model_execute_time_requests
:
List
[
float
]
=
[]
# Metadata
# Metadata
num_prompt_tokens_requests
:
List
[
int
]
=
[]
num_prompt_tokens_requests
:
List
[
int
]
=
[]
num_generation_tokens_requests
:
List
[
int
]
=
[]
num_generation_tokens_requests
:
List
[
int
]
=
[]
...
@@ -1790,15 +1787,6 @@ class LLMEngine:
...
@@ -1790,15 +1787,6 @@ class LLMEngine:
now
-
seq_group
.
metrics
.
first_token_time
)
now
-
seq_group
.
metrics
.
first_token_time
)
time_inference_requests
.
append
(
time_inference_requests
.
append
(
now
-
seq_group
.
metrics
.
first_scheduled_time
)
now
-
seq_group
.
metrics
.
first_scheduled_time
)
if
seq_group
.
metrics
.
time_in_queue
is
not
None
:
time_in_queue_requests
.
append
(
seq_group
.
metrics
.
time_in_queue
)
if
seq_group
.
metrics
.
model_forward_time
is
not
None
:
model_forward_time_requests
.
append
(
seq_group
.
metrics
.
model_forward_time
)
if
seq_group
.
metrics
.
model_execute_time
is
not
None
:
model_execute_time_requests
.
append
(
seq_group
.
metrics
.
model_execute_time
*
1000
)
# Metadata
# Metadata
num_prompt_tokens_requests
.
append
(
num_prompt_tokens_requests
.
append
(
len
(
seq_group
.
prompt_token_ids
))
len
(
seq_group
.
prompt_token_ids
))
...
@@ -1867,9 +1855,6 @@ class LLMEngine:
...
@@ -1867,9 +1855,6 @@ class LLMEngine:
time_inference_requests
=
time_inference_requests
,
time_inference_requests
=
time_inference_requests
,
time_prefill_requests
=
time_prefill_requests
,
time_prefill_requests
=
time_prefill_requests
,
time_decode_requests
=
time_decode_requests
,
time_decode_requests
=
time_decode_requests
,
time_in_queue_requests
=
time_in_queue_requests
,
model_forward_time_requests
=
model_forward_time_requests
,
model_execute_time_requests
=
model_execute_time_requests
,
# Metadata
# Metadata
num_prompt_tokens_requests
=
num_prompt_tokens_requests
,
num_prompt_tokens_requests
=
num_prompt_tokens_requests
,
num_generation_tokens_requests
=
num_generation_tokens_requests
,
num_generation_tokens_requests
=
num_generation_tokens_requests
,
...
...
vllm/engine/metrics.py
View file @
0e98964e
...
@@ -80,17 +80,6 @@ class Metrics:
...
@@ -80,17 +80,6 @@ class Metrics:
multiprocess_mode
=
"livemostrecent"
,
multiprocess_mode
=
"livemostrecent"
,
)
)
# Deprecated in 0.8 - KV cache offloading is not used in V1
# Hidden in 0.9, due to be removed in 0.10
if
self
.
show_hidden_metrics
:
self
.
gauge_scheduler_swapped
=
self
.
_gauge_cls
(
name
=
"vllm:num_requests_swapped"
,
documentation
=
(
"Number of requests swapped to CPU. "
"DEPRECATED: KV cache offloading is not used in V1"
),
labelnames
=
labelnames
,
multiprocess_mode
=
"sum"
)
# KV Cache Usage in %
# KV Cache Usage in %
self
.
gauge_gpu_cache_usage
=
self
.
_gauge_cls
(
self
.
gauge_gpu_cache_usage
=
self
.
_gauge_cls
(
name
=
"vllm:gpu_cache_usage_perc"
,
name
=
"vllm:gpu_cache_usage_perc"
,
...
@@ -98,35 +87,6 @@ class Metrics:
...
@@ -98,35 +87,6 @@ class Metrics:
labelnames
=
labelnames
,
labelnames
=
labelnames
,
multiprocess_mode
=
"sum"
)
multiprocess_mode
=
"sum"
)
# Deprecated in 0.8 - KV cache offloading is not used in V1
# Hidden in 0.9, due to be removed in 0.10
if
self
.
show_hidden_metrics
:
self
.
gauge_cpu_cache_usage
=
self
.
_gauge_cls
(
name
=
"vllm:cpu_cache_usage_perc"
,
documentation
=
(
"CPU KV-cache usage. 1 means 100 percent usage. "
"DEPRECATED: KV cache offloading is not used in V1"
),
labelnames
=
labelnames
,
multiprocess_mode
=
"sum"
)
self
.
gauge_cpu_prefix_cache_hit_rate
=
self
.
_gauge_cls
(
name
=
"vllm:cpu_prefix_cache_hit_rate"
,
documentation
=
(
"CPU prefix cache block hit rate. "
"DEPRECATED: KV cache offloading is not used in V1"
),
labelnames
=
labelnames
,
multiprocess_mode
=
"sum"
)
# Deprecated in 0.8 - replaced by queries+hits counters in V1
# Hidden in 0.9, due to be removed in 0.10
if
self
.
show_hidden_metrics
:
self
.
gauge_gpu_prefix_cache_hit_rate
=
self
.
_gauge_cls
(
name
=
"vllm:gpu_prefix_cache_hit_rate"
,
documentation
=
(
"GPU prefix cache block hit rate. "
"DEPRECATED: use vllm:gpu_prefix_cache_queries "
"and vllm:gpu_prefix_cache_queries in V1"
),
labelnames
=
labelnames
,
multiprocess_mode
=
"sum"
)
# Iteration stats
# Iteration stats
self
.
counter_num_preemption
=
self
.
_counter_cls
(
self
.
counter_num_preemption
=
self
.
_counter_cls
(
name
=
"vllm:num_preemptions_total"
,
name
=
"vllm:num_preemptions_total"
,
...
@@ -200,36 +160,6 @@ class Metrics:
...
@@ -200,36 +160,6 @@ class Metrics:
"Histogram of time spent in DECODE phase for request."
,
"Histogram of time spent in DECODE phase for request."
,
labelnames
=
labelnames
,
labelnames
=
labelnames
,
buckets
=
request_latency_buckets
)
buckets
=
request_latency_buckets
)
# Deprecated in 0.8 - duplicates vllm:request_queue_time_seconds:
# Hidden in 0.9, due to be removed in 0.10
if
self
.
show_hidden_metrics
:
self
.
histogram_time_in_queue_request
=
self
.
_histogram_cls
(
name
=
"vllm:time_in_queue_requests"
,
documentation
=
(
"Histogram of time the request spent in the queue in seconds. "
"DEPRECATED: use vllm:request_queue_time_seconds instead."
),
labelnames
=
labelnames
,
buckets
=
request_latency_buckets
)
# Deprecated in 0.8 - use prefill/decode/inference time metrics
# Hidden in 0.9, due to be removed in 0.10
if
self
.
show_hidden_metrics
:
self
.
histogram_model_forward_time_request
=
self
.
_histogram_cls
(
name
=
"vllm:model_forward_time_milliseconds"
,
documentation
=
(
"Histogram of time spent in the model forward pass in ms. "
"DEPRECATED: use prefill/decode/inference time metrics instead"
),
labelnames
=
labelnames
,
buckets
=
build_1_2_3_5_8_buckets
(
3000
))
self
.
histogram_model_execute_time_request
=
self
.
_histogram_cls
(
name
=
"vllm:model_execute_time_milliseconds"
,
documentation
=
(
"Histogram of time spent in the model execute function in ms."
"DEPRECATED: use prefill/decode/inference time metrics instead"
),
labelnames
=
labelnames
,
buckets
=
build_1_2_3_5_8_buckets
(
3000
))
# Metadata
# Metadata
self
.
histogram_num_prompt_tokens_request
=
self
.
_histogram_cls
(
self
.
histogram_num_prompt_tokens_request
=
self
.
_histogram_cls
(
...
@@ -580,20 +510,10 @@ class PrometheusStatLogger(StatLoggerBase):
...
@@ -580,20 +510,10 @@ class PrometheusStatLogger(StatLoggerBase):
# System state data
# System state data
self
.
_log_gauge
(
self
.
metrics
.
gauge_scheduler_running
,
self
.
_log_gauge
(
self
.
metrics
.
gauge_scheduler_running
,
stats
.
num_running_sys
)
stats
.
num_running_sys
)
if
self
.
metrics
.
show_hidden_metrics
:
self
.
_log_gauge
(
self
.
metrics
.
gauge_scheduler_swapped
,
stats
.
num_swapped_sys
)
self
.
_log_gauge
(
self
.
metrics
.
gauge_scheduler_waiting
,
self
.
_log_gauge
(
self
.
metrics
.
gauge_scheduler_waiting
,
stats
.
num_waiting_sys
)
stats
.
num_waiting_sys
)
self
.
_log_gauge
(
self
.
metrics
.
gauge_gpu_cache_usage
,
self
.
_log_gauge
(
self
.
metrics
.
gauge_gpu_cache_usage
,
stats
.
gpu_cache_usage_sys
)
stats
.
gpu_cache_usage_sys
)
if
self
.
metrics
.
show_hidden_metrics
:
self
.
_log_gauge
(
self
.
metrics
.
gauge_cpu_cache_usage
,
stats
.
cpu_cache_usage_sys
)
self
.
_log_gauge
(
self
.
metrics
.
gauge_cpu_prefix_cache_hit_rate
,
stats
.
cpu_prefix_cache_hit_rate
)
self
.
_log_gauge
(
self
.
metrics
.
gauge_gpu_prefix_cache_hit_rate
,
stats
.
gpu_prefix_cache_hit_rate
)
# Including max-lora in metric, in future this property of lora
# Including max-lora in metric, in future this property of lora
# config maybe extended to be dynamic.
# config maybe extended to be dynamic.
lora_info
=
{
lora_info
=
{
...
@@ -631,15 +551,6 @@ class PrometheusStatLogger(StatLoggerBase):
...
@@ -631,15 +551,6 @@ class PrometheusStatLogger(StatLoggerBase):
stats
.
time_prefill_requests
)
stats
.
time_prefill_requests
)
self
.
_log_histogram
(
self
.
metrics
.
histogram_decode_time_request
,
self
.
_log_histogram
(
self
.
metrics
.
histogram_decode_time_request
,
stats
.
time_decode_requests
)
stats
.
time_decode_requests
)
if
self
.
metrics
.
show_hidden_metrics
:
self
.
_log_histogram
(
self
.
metrics
.
histogram_time_in_queue_request
,
stats
.
time_in_queue_requests
)
self
.
_log_histogram
(
self
.
metrics
.
histogram_model_forward_time_request
,
stats
.
model_forward_time_requests
)
self
.
_log_histogram
(
self
.
metrics
.
histogram_model_execute_time_request
,
stats
.
model_execute_time_requests
)
# Metadata
# Metadata
finished_reason_counter
=
CollectionsCounter
(
finished_reason_counter
=
CollectionsCounter
(
stats
.
finished_reason_requests
)
stats
.
finished_reason_requests
)
...
...
vllm/engine/metrics_types.py
View file @
0e98964e
...
@@ -53,9 +53,6 @@ class Stats:
...
@@ -53,9 +53,6 @@ class Stats:
time_inference_requests
:
List
[
float
]
time_inference_requests
:
List
[
float
]
time_prefill_requests
:
List
[
float
]
time_prefill_requests
:
List
[
float
]
time_decode_requests
:
List
[
float
]
time_decode_requests
:
List
[
float
]
time_in_queue_requests
:
List
[
float
]
model_forward_time_requests
:
List
[
float
]
model_execute_time_requests
:
List
[
float
]
# Metadata
# Metadata
num_prompt_tokens_requests
:
List
[
int
]
num_prompt_tokens_requests
:
List
[
int
]
num_generation_tokens_requests
:
List
[
int
]
num_generation_tokens_requests
:
List
[
int
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment