Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
c386c43c
Unverified
Commit
c386c43c
authored
Jan 28, 2025
by
Mark McLoughlin
Committed by
GitHub
Jan 28, 2025
Browse files
[V1][Metrics] Add per-request prompt/generation_tokens histograms (#12516)
Signed-off-by:
Mark McLoughlin
<
markmc@redhat.com
>
parent
f26d7907
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
102 additions
and
14 deletions
+102
-14
tests/entrypoints/openai/test_metrics.py
tests/entrypoints/openai/test_metrics.py
+6
-0
vllm/v1/engine/async_llm.py
vllm/v1/engine/async_llm.py
+1
-2
vllm/v1/engine/output_processor.py
vllm/v1/engine/output_processor.py
+9
-2
vllm/v1/metrics/loggers.py
vllm/v1/metrics/loggers.py
+54
-6
vllm/v1/metrics/stats.py
vllm/v1/metrics/stats.py
+32
-4
No files found.
tests/entrypoints/openai/test_metrics.py
View file @
c386c43c
...
...
@@ -202,6 +202,12 @@ EXPECTED_METRICS_V1 = [
"vllm:num_requests_waiting"
,
"vllm:prompt_tokens_total"
,
"vllm:generation_tokens_total"
,
"vllm:request_prompt_tokens_sum"
,
"vllm:request_prompt_tokens_bucket"
,
"vllm:request_prompt_tokens_count"
,
"vllm:request_generation_tokens_sum"
,
"vllm:request_generation_tokens_bucket"
,
"vllm:request_generation_tokens_count"
,
]
...
...
vllm/v1/engine/async_llm.py
View file @
c386c43c
...
...
@@ -53,8 +53,7 @@ class AsyncLLM(EngineClient):
self
.
log_stats
=
log_stats
self
.
stat_loggers
:
List
[
StatLoggerBase
]
=
[
LoggingStatLogger
(),
PrometheusStatLogger
(
labels
=
dict
(
model_name
=
self
.
model_config
.
served_model_name
)),
PrometheusStatLogger
(
vllm_config
.
model_config
),
]
# Tokenizer (+ ensure liveness if running in another process).
...
...
vllm/v1/engine/output_processor.py
View file @
c386c43c
...
...
@@ -8,7 +8,7 @@ from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
from
vllm.v1.engine
import
EngineCoreOutput
,
EngineCoreRequest
from
vllm.v1.engine.detokenizer
import
(
DetokenizerOutput
,
IncrementalDetokenizer
)
from
vllm.v1.metrics.stats
import
IterationStats
from
vllm.v1.metrics.stats
import
IterationStats
,
RequestStateStats
@
dataclass
...
...
@@ -37,6 +37,8 @@ class RequestState:
self
.
is_prefilling
=
True
self
.
queue
=
queue
self
.
stats
=
RequestStateStats
()
@
classmethod
def
from_new_request
(
cls
,
...
...
@@ -146,7 +148,8 @@ class OutputProcessor:
# 1) Compute stats for this iteration.
iteration_stats
.
update_from_output
(
engine_core_output
,
req_state
.
is_prefilling
,
req_state
.
prompt_len
)
req_state
.
prompt_len
,
req_state
.
stats
)
req_state
.
is_prefilling
=
False
# 2) Detokenize the token ids into text.
...
...
@@ -171,6 +174,10 @@ class OutputProcessor:
# detected stop string, abort needed in EngineCore.
reqs_to_abort
.
append
(
req_id
)
# Track per-request stats
iteration_stats
.
update_from_finished_request
(
request_output
,
req_state
.
stats
)
return
OutputProcessorOutput
(
request_outputs
=
request_outputs
,
reqs_to_abort
=
reqs_to_abort
,
...
...
vllm/v1/metrics/loggers.py
View file @
c386c43c
import
time
from
abc
import
ABC
,
abstractmethod
from
typing
import
Dict
,
List
from
typing
import
List
import
numpy
as
np
import
prometheus_client
from
vllm.config
import
ModelConfig
from
vllm.logger
import
init_logger
from
vllm.v1.metrics.stats
import
IterationStats
,
SchedulerStats
...
...
@@ -78,13 +79,13 @@ class LoggingStatLogger(StatLoggerBase):
class
PrometheusStatLogger
(
StatLoggerBase
):
def
__init__
(
self
,
labels
:
Dict
[
str
,
str
]
):
self
.
labels
=
labels
def
__init__
(
self
,
model_config
:
ModelConfig
):
self
.
_unregister_vllm_metrics
()
labelnames
=
self
.
labels
.
keys
()
labelvalues
=
self
.
labels
.
values
()
labelnames
=
[
"model_name"
]
labelvalues
=
[
model_config
.
served_model_name
]
self
.
_unregister_vllm_metrics
()
max_model_len
=
model_config
.
max_model_len
self
.
gauge_scheduler_running
=
prometheus_client
.
Gauge
(
name
=
"vllm:num_requests_running"
,
...
...
@@ -106,6 +107,20 @@ class PrometheusStatLogger(StatLoggerBase):
documentation
=
"Number of generation tokens processed."
,
labelnames
=
labelnames
).
labels
(
*
labelvalues
)
self
.
histogram_num_prompt_tokens_request
=
\
prometheus_client
.
Histogram
(
name
=
"vllm:request_prompt_tokens"
,
documentation
=
"Number of prefill tokens processed."
,
buckets
=
build_1_2_5_buckets
(
max_model_len
),
labelnames
=
labelnames
).
labels
(
*
labelvalues
)
self
.
histogram_num_generation_tokens_request
=
\
prometheus_client
.
Histogram
(
name
=
"vllm:request_generation_tokens"
,
documentation
=
"Number of generation tokens processed."
,
buckets
=
build_1_2_5_buckets
(
max_model_len
),
labelnames
=
labelnames
).
labels
(
*
labelvalues
)
def
log
(
self
,
scheduler_stats
:
SchedulerStats
,
iteration_stats
:
IterationStats
):
"""Log to prometheus."""
...
...
@@ -116,9 +131,42 @@ class PrometheusStatLogger(StatLoggerBase):
self
.
counter_generation_tokens
.
inc
(
iteration_stats
.
num_generation_tokens
)
for
finished_request
in
iteration_stats
.
finished_requests
:
self
.
histogram_num_prompt_tokens_request
.
observe
(
finished_request
.
num_prompt_tokens
)
self
.
histogram_num_generation_tokens_request
.
observe
(
finished_request
.
num_generation_tokens
)
@
staticmethod
def
_unregister_vllm_metrics
():
# Unregister any existing vLLM collectors (for CI/CD
for
collector
in
list
(
prometheus_client
.
REGISTRY
.
_collector_to_names
):
if
hasattr
(
collector
,
"_name"
)
and
"vllm"
in
collector
.
_name
:
prometheus_client
.
REGISTRY
.
unregister
(
collector
)
def
build_buckets
(
mantissa_lst
:
List
[
int
],
max_value
:
int
)
->
List
[
int
]:
"""
Builds a list of buckets with increasing powers of 10 multiplied by
mantissa values until the value exceeds the specified maximum.
"""
exponent
=
0
buckets
:
List
[
int
]
=
[]
while
True
:
for
m
in
mantissa_lst
:
value
=
m
*
10
**
exponent
if
value
<=
max_value
:
buckets
.
append
(
value
)
else
:
return
buckets
exponent
+=
1
def
build_1_2_5_buckets
(
max_value
:
int
)
->
List
[
int
]:
"""
Example:
>>> build_1_2_5_buckets(100)
[1, 2, 5, 10, 20, 50, 100]
"""
return
build_buckets
([
1
,
2
,
5
],
max_value
)
vllm/v1/metrics/stats.py
View file @
c386c43c
from
dataclasses
import
dataclass
from
typing
import
TYPE_CHECKING
from
typing
import
TYPE_CHECKING
,
List
if
TYPE_CHECKING
:
from
vllm.outputs
import
RequestOutput
from
vllm.v1.engine
import
EngineCoreOutput
...
...
@@ -16,6 +17,21 @@ class SchedulerStats:
# gpu_prefix_cache_hit_rate: float = 0.0
@
dataclass
class
RequestStateStats
:
"""Stats that need to be tracked across delta updates."""
num_generation_tokens
:
int
=
0
@
dataclass
class
FinishedRequestStats
:
"""Stats associated with a finished request."""
num_prompt_tokens
:
int
=
0
num_generation_tokens
:
int
=
0
class
IterationStats
:
"""Stats associated with a single set of EngineCoreOutputs."""
...
...
@@ -23,17 +39,29 @@ class IterationStats:
self
.
log_stats
=
log_stats
self
.
num_generation_tokens
=
0
self
.
num_prompt_tokens
=
0
self
.
finished_requests
:
List
[
FinishedRequestStats
]
=
[]
def
update_from_output
(
self
,
output
:
"EngineCoreOutput"
,
is_prefilling
:
bool
,
prompt_len
:
int
):
is_prefilling
:
bool
,
prompt_len
:
int
,
request_state_stats
:
RequestStateStats
):
if
not
self
.
log_stats
:
return
self
.
num_generation_tokens
+=
len
(
output
.
new_token_ids
)
num_new_generation_tokens
=
len
(
output
.
new_token_ids
)
self
.
num_generation_tokens
+=
num_new_generation_tokens
if
is_prefilling
:
# This relies on the invariant that EngineCore does
# not stream outputs for partially completed prefills
# (scheduler.update_from_output makes EngineCoreOutput
# iff num_computed_tokens == num_tokens).
assert
(
len
(
output
.
new_token_ids
)
>
0
)
assert
(
num_new_generation_tokens
>
0
)
self
.
num_prompt_tokens
+=
prompt_len
request_state_stats
.
num_generation_tokens
+=
num_new_generation_tokens
def
update_from_finished_request
(
self
,
request_output
:
"RequestOutput"
,
request_state_stats
:
RequestStateStats
):
self
.
finished_requests
.
append
(
FinishedRequestStats
(
len
(
request_output
.
prompt_token_ids
),
request_state_stats
.
num_generation_tokens
))
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment