Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f17f1d46
Unverified
Commit
f17f1d46
authored
Jan 30, 2025
by
Mark McLoughlin
Committed by
GitHub
Jan 29, 2025
Browse files
[V1][Metrics] Add GPU cache usage % gauge (#12561)
Signed-off-by:
Mark McLoughlin
<
markmc@redhat.com
>
parent
1c1bb0bb
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
18 additions
and
2 deletions
+18
-2
tests/entrypoints/openai/test_metrics.py
tests/entrypoints/openai/test_metrics.py
+1
-0
vllm/v1/core/kv_cache_manager.py
vllm/v1/core/kv_cache_manager.py
+5
-0
vllm/v1/core/scheduler.py
vllm/v1/core/scheduler.py
+1
-0
vllm/v1/metrics/loggers.py
vllm/v1/metrics/loggers.py
+10
-1
vllm/v1/metrics/stats.py
vllm/v1/metrics/stats.py
+1
-1
No files found.
tests/entrypoints/openai/test_metrics.py
View file @
f17f1d46
...
...
@@ -200,6 +200,7 @@ EXPECTED_METRICS = [
EXPECTED_METRICS_V1
=
[
"vllm:num_requests_running"
,
"vllm:num_requests_waiting"
,
"vllm:gpu_cache_usage_perc"
,
"vllm:prompt_tokens_total"
,
"vllm:generation_tokens_total"
,
"vllm:request_prompt_tokens_sum"
,
...
...
vllm/v1/core/kv_cache_manager.py
View file @
f17f1d46
...
...
@@ -69,6 +69,11 @@ class KVCacheManager:
# is finished.
self
.
req_to_blocks
:
Dict
[
str
,
List
[
KVCacheBlock
]]
=
{}
@property
def usage(self) -> float:
    """Return the fraction of GPU KV-cache blocks currently in use.

    Computed as one minus the ratio of free blocks
    (``self.free_block_queue.num_free_blocks``) to the total number of
    blocks (``self.num_gpu_blocks``).

    Returns:
        The usage ratio as a float. ``0.0`` is returned when
        ``num_gpu_blocks`` is zero.
    """
    total_blocks = self.num_gpu_blocks
    if not total_blocks:
        # With no blocks there is nothing to be "in use"; report zero
        # rather than raising ZeroDivisionError from a metrics accessor.
        return 0.0
    return 1.0 - (self.free_block_queue.num_free_blocks / total_blocks)
def
get_computed_blocks
(
self
,
request
:
Request
)
->
Tuple
[
List
[
KVCacheBlock
],
int
]:
"""Get the computed (cached) blocks for the request.
...
...
vllm/v1/core/scheduler.py
View file @
f17f1d46
...
...
@@ -544,6 +544,7 @@ class Scheduler:
return
SchedulerStats
(
num_running_reqs
=
len
(
self
.
running
),
num_waiting_reqs
=
len
(
self
.
waiting
),
gpu_cache_usage
=
self
.
kv_cache_manager
.
usage
,
)
...
...
vllm/v1/metrics/loggers.py
View file @
f17f1d46
...
...
@@ -69,11 +69,13 @@ class LoggingStatLogger(StatLoggerBase):
logger
.
info
(
"Avg prompt throughput: %.1f tokens/s, "
"Avg generation throughput: %.1f tokens/s, "
"Running: %d reqs, Waiting: %d reqs "
,
"Running: %d reqs, Waiting: %d reqs "
"GPU KV cache usage: %.1f%%."
,
prompt_throughput
,
generation_throughput
,
scheduler_stats
.
num_running_reqs
,
scheduler_stats
.
num_waiting_reqs
,
scheduler_stats
.
gpu_cache_usage
*
100
,
)
...
...
@@ -97,6 +99,11 @@ class PrometheusStatLogger(StatLoggerBase):
documentation
=
"Number of requests waiting to be processed."
,
labelnames
=
labelnames
).
labels
(
*
labelvalues
)
self
.
gauge_gpu_cache_usage
=
prometheus_client
.
Gauge
(
name
=
"vllm:gpu_cache_usage_perc"
,
documentation
=
"GPU KV-cache usage. 1 means 100 percent usage."
,
labelnames
=
labelnames
).
labels
(
*
labelvalues
)
self
.
counter_prompt_tokens
=
prometheus_client
.
Counter
(
name
=
"vllm:prompt_tokens_total"
,
documentation
=
"Number of prefill tokens processed."
,
...
...
@@ -147,6 +154,8 @@ class PrometheusStatLogger(StatLoggerBase):
self
.
gauge_scheduler_running
.
set
(
scheduler_stats
.
num_running_reqs
)
self
.
gauge_scheduler_waiting
.
set
(
scheduler_stats
.
num_waiting_reqs
)
self
.
gauge_gpu_cache_usage
.
set
(
scheduler_stats
.
gpu_cache_usage
)
self
.
counter_prompt_tokens
.
inc
(
iteration_stats
.
num_prompt_tokens
)
self
.
counter_generation_tokens
.
inc
(
iteration_stats
.
num_generation_tokens
)
...
...
vllm/v1/metrics/stats.py
View file @
f17f1d46
...
...
@@ -14,7 +14,7 @@ class SchedulerStats:
num_running_reqs
:
int
=
0
num_waiting_reqs
:
int
=
0
#
gpu_cache_usage: float = 0.0
gpu_cache_usage
:
float
=
0.0
# gpu_prefix_cache_hit_rate: float = 0.0
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment