Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
46fb0567
Unverified
Commit
46fb0567
authored
Jan 29, 2025
by
Mark McLoughlin
Committed by
GitHub
Jan 29, 2025
Browse files
[V1][Metrics] Add TTFT and TPOT histograms (#12530)
Signed-off-by:
Mark McLoughlin
<
markmc@redhat.com
>
parent
dd6a3a02
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
45 additions
and
1 deletion
+45
-1
tests/entrypoints/openai/test_metrics.py
tests/entrypoints/openai/test_metrics.py
+6
-0
vllm/v1/engine/output_processor.py
vllm/v1/engine/output_processor.py
+3
-1
vllm/v1/metrics/loggers.py
vllm/v1/metrics/loggers.py
+25
-0
vllm/v1/metrics/stats.py
vllm/v1/metrics/stats.py
+11
-0
No files found.
tests/entrypoints/openai/test_metrics.py
View file @
46fb0567
...
@@ -208,6 +208,12 @@ EXPECTED_METRICS_V1 = [
...
@@ -208,6 +208,12 @@ EXPECTED_METRICS_V1 = [
"vllm:request_generation_tokens_sum"
,
"vllm:request_generation_tokens_sum"
,
"vllm:request_generation_tokens_bucket"
,
"vllm:request_generation_tokens_bucket"
,
"vllm:request_generation_tokens_count"
,
"vllm:request_generation_tokens_count"
,
"vllm:time_to_first_token_seconds_sum"
,
"vllm:time_to_first_token_seconds_bucket"
,
"vllm:time_to_first_token_seconds_count"
,
"vllm:time_per_output_token_seconds_sum"
,
"vllm:time_per_output_token_seconds_bucket"
,
"vllm:time_per_output_token_seconds_count"
,
]
]
...
...
vllm/v1/engine/output_processor.py
View file @
46fb0567
...
@@ -27,6 +27,7 @@ class RequestState:
...
@@ -27,6 +27,7 @@ class RequestState:
prompt
:
Optional
[
str
],
prompt
:
Optional
[
str
],
prompt_token_ids
:
List
[
int
],
prompt_token_ids
:
List
[
int
],
detokenizer
:
IncrementalDetokenizer
,
detokenizer
:
IncrementalDetokenizer
,
arrival_time
:
float
,
queue
:
Optional
[
asyncio
.
Queue
[
RequestOutput
]],
queue
:
Optional
[
asyncio
.
Queue
[
RequestOutput
]],
):
):
self
.
request_id
=
request_id
self
.
request_id
=
request_id
...
@@ -37,7 +38,7 @@ class RequestState:
...
@@ -37,7 +38,7 @@ class RequestState:
self
.
is_prefilling
=
True
self
.
is_prefilling
=
True
self
.
queue
=
queue
self
.
queue
=
queue
self
.
stats
=
RequestStateStats
()
self
.
stats
=
RequestStateStats
(
last_token_time
=
arrival_time
)
@
classmethod
@
classmethod
def
from_new_request
(
def
from_new_request
(
...
@@ -54,6 +55,7 @@ class RequestState:
...
@@ -54,6 +55,7 @@ class RequestState:
tokenizer
=
tokenizer
,
tokenizer
=
tokenizer
,
request
=
request
,
request
=
request
,
),
),
arrival_time
=
request
.
arrival_time
,
queue
=
queue
,
queue
=
queue
,
)
)
...
...
vllm/v1/metrics/loggers.py
View file @
46fb0567
...
@@ -121,6 +121,26 @@ class PrometheusStatLogger(StatLoggerBase):
...
@@ -121,6 +121,26 @@ class PrometheusStatLogger(StatLoggerBase):
buckets
=
build_1_2_5_buckets
(
max_model_len
),
buckets
=
build_1_2_5_buckets
(
max_model_len
),
labelnames
=
labelnames
).
labels
(
*
labelvalues
)
labelnames
=
labelnames
).
labels
(
*
labelvalues
)
self
.
histogram_time_to_first_token
=
\
prometheus_client
.
Histogram
(
name
=
"vllm:time_to_first_token_seconds"
,
documentation
=
"Histogram of time to first token in seconds."
,
buckets
=
[
0.001
,
0.005
,
0.01
,
0.02
,
0.04
,
0.06
,
0.08
,
0.1
,
0.25
,
0.5
,
0.75
,
1.0
,
2.5
,
5.0
,
7.5
,
10.0
],
labelnames
=
labelnames
).
labels
(
*
labelvalues
)
self
.
histogram_time_per_output_token
=
\
prometheus_client
.
Histogram
(
name
=
"vllm:time_per_output_token_seconds"
,
documentation
=
"Histogram of time per output token in seconds."
,
buckets
=
[
0.01
,
0.025
,
0.05
,
0.075
,
0.1
,
0.15
,
0.2
,
0.3
,
0.4
,
0.5
,
0.75
,
1.0
,
2.5
],
labelnames
=
labelnames
).
labels
(
*
labelvalues
)
def
log
(
self
,
scheduler_stats
:
SchedulerStats
,
def
log
(
self
,
scheduler_stats
:
SchedulerStats
,
iteration_stats
:
IterationStats
):
iteration_stats
:
IterationStats
):
"""Log to prometheus."""
"""Log to prometheus."""
...
@@ -137,6 +157,11 @@ class PrometheusStatLogger(StatLoggerBase):
...
@@ -137,6 +157,11 @@ class PrometheusStatLogger(StatLoggerBase):
self
.
histogram_num_generation_tokens_request
.
observe
(
self
.
histogram_num_generation_tokens_request
.
observe
(
finished_request
.
num_generation_tokens
)
finished_request
.
num_generation_tokens
)
for
ttft
in
iteration_stats
.
time_to_first_tokens_iter
:
self
.
histogram_time_to_first_token
.
observe
(
ttft
)
for
tpot
in
iteration_stats
.
time_per_output_tokens_iter
:
self
.
histogram_time_per_output_token
.
observe
(
tpot
)
@
staticmethod
@
staticmethod
def
_unregister_vllm_metrics
():
def
_unregister_vllm_metrics
():
# Unregister any existing vLLM collectors (for CI/CD
# Unregister any existing vLLM collectors (for CI/CD
...
...
vllm/v1/metrics/stats.py
View file @
46fb0567
import
time
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
typing
import
TYPE_CHECKING
,
List
from
typing
import
TYPE_CHECKING
,
List
...
@@ -22,6 +23,7 @@ class RequestStateStats:
...
@@ -22,6 +23,7 @@ class RequestStateStats:
"""Stats that need to be tracked across delta updates."""
"""Stats that need to be tracked across delta updates."""
num_generation_tokens
:
int
=
0
num_generation_tokens
:
int
=
0
last_token_time
:
float
=
0.0
@
dataclass
@
dataclass
...
@@ -40,6 +42,8 @@ class IterationStats:
...
@@ -40,6 +42,8 @@ class IterationStats:
self
.
num_generation_tokens
=
0
self
.
num_generation_tokens
=
0
self
.
num_prompt_tokens
=
0
self
.
num_prompt_tokens
=
0
self
.
finished_requests
:
List
[
FinishedRequestStats
]
=
[]
self
.
finished_requests
:
List
[
FinishedRequestStats
]
=
[]
self
.
time_to_first_tokens_iter
:
List
[
float
]
=
[]
self
.
time_per_output_tokens_iter
:
List
[
float
]
=
[]
def
update_from_output
(
self
,
output
:
"EngineCoreOutput"
,
def
update_from_output
(
self
,
output
:
"EngineCoreOutput"
,
is_prefilling
:
bool
,
prompt_len
:
int
,
is_prefilling
:
bool
,
prompt_len
:
int
,
...
@@ -48,6 +52,8 @@ class IterationStats:
...
@@ -48,6 +52,8 @@ class IterationStats:
return
return
num_new_generation_tokens
=
len
(
output
.
new_token_ids
)
num_new_generation_tokens
=
len
(
output
.
new_token_ids
)
now
=
time
.
time
()
last_token_latency
=
now
-
request_state_stats
.
last_token_time
self
.
num_generation_tokens
+=
num_new_generation_tokens
self
.
num_generation_tokens
+=
num_new_generation_tokens
if
is_prefilling
:
if
is_prefilling
:
...
@@ -58,7 +64,12 @@ class IterationStats:
...
@@ -58,7 +64,12 @@ class IterationStats:
assert
(
num_new_generation_tokens
>
0
)
assert
(
num_new_generation_tokens
>
0
)
self
.
num_prompt_tokens
+=
prompt_len
self
.
num_prompt_tokens
+=
prompt_len
self
.
time_to_first_tokens_iter
.
append
(
last_token_latency
)
else
:
self
.
time_per_output_tokens_iter
.
append
(
last_token_latency
)
request_state_stats
.
num_generation_tokens
+=
num_new_generation_tokens
request_state_stats
.
num_generation_tokens
+=
num_new_generation_tokens
request_state_stats
.
last_token_time
=
now
def
update_from_finished_request
(
self
,
request_output
:
"RequestOutput"
,
def
update_from_finished_request
(
self
,
request_output
:
"RequestOutput"
,
request_state_stats
:
RequestStateStats
):
request_state_stats
:
RequestStateStats
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment