Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
74fc2d77
Unverified
Commit
74fc2d77
authored
Oct 30, 2024
by
科英
Committed by
GitHub
Oct 29, 2024
Browse files
[Misc] Add metrics for request queue time, forward time, and execute time (#9659)
parent
622b7ab9
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
70 additions
and
15 deletions
+70
-15
vllm/config.py
vllm/config.py
+0
-7
vllm/engine/llm_engine.py
vllm/engine/llm_engine.py
+15
-0
vllm/engine/metrics.py
vllm/engine/metrics.py
+52
-8
vllm/engine/metrics_types.py
vllm/engine/metrics_types.py
+3
-0
No files found.
vllm/config.py
View file @
74fc2d77
...
...
@@ -1892,13 +1892,6 @@ class ObservabilityConfig:
"'otlp_traces_endpoint'. Ensure OpenTelemetry packages are "
f
"installed. Original error:
\n
{
otel_import_error_traceback
}
"
)
if
((
self
.
collect_model_forward_time
or
self
.
collect_model_execute_time
)
and
self
.
otlp_traces_endpoint
is
None
):
raise
ValueError
(
"collect_model_forward_time or collect_model_execute_time "
"requires --otlp-traces-endpoint to be set."
)
@
dataclass
(
frozen
=
True
)
class
EngineConfig
:
...
...
vllm/engine/llm_engine.py
View file @
74fc2d77
...
...
@@ -1645,6 +1645,9 @@ class LLMEngine:
# Request stats
# Latency
time_e2e_requests
:
List
[
float
]
=
[]
time_in_queue_requests
:
List
[
float
]
=
[]
model_forward_time_requests
:
List
[
float
]
=
[]
model_execute_time_requests
:
List
[
float
]
=
[]
# Metadata
num_prompt_tokens_requests
:
List
[
int
]
=
[]
num_generation_tokens_requests
:
List
[
int
]
=
[]
...
...
@@ -1738,6 +1741,15 @@ class LLMEngine:
# Latency timings
time_e2e_requests
.
append
(
now
-
seq_group
.
metrics
.
arrival_time
)
if
seq_group
.
metrics
.
time_in_queue
is
not
None
:
time_in_queue_requests
.
append
(
seq_group
.
metrics
.
time_in_queue
)
if
seq_group
.
metrics
.
model_forward_time
is
not
None
:
model_forward_time_requests
.
append
(
seq_group
.
metrics
.
model_forward_time
)
if
seq_group
.
metrics
.
model_execute_time
is
not
None
:
model_execute_time_requests
.
append
(
seq_group
.
metrics
.
model_execute_time
*
1000
)
# Metadata
num_prompt_tokens_requests
.
append
(
len
(
seq_group
.
prompt_token_ids
))
...
...
@@ -1795,6 +1807,9 @@ class LLMEngine:
# Request stats
# Latency
time_e2e_requests
=
time_e2e_requests
,
time_in_queue_requests
=
time_in_queue_requests
,
model_forward_time_requests
=
model_forward_time_requests
,
model_execute_time_requests
=
model_execute_time_requests
,
# Metadata
num_prompt_tokens_requests
=
num_prompt_tokens_requests
,
num_generation_tokens_requests
=
num_generation_tokens_requests
,
...
...
vllm/engine/metrics.py
View file @
74fc2d77
...
...
@@ -133,7 +133,31 @@ class Metrics:
name
=
"vllm:e2e_request_latency_seconds"
,
documentation
=
"Histogram of end to end request latency in seconds."
,
labelnames
=
labelnames
,
buckets
=
[
1.0
,
2.5
,
5.0
,
10.0
,
15.0
,
20.0
,
30.0
,
40.0
,
50.0
,
60.0
])
buckets
=
[
0.3
,
0.5
,
0.8
,
1.0
,
1.5
,
2.0
,
2.5
,
5.0
,
10.0
,
15.0
,
20.0
,
30.0
,
40.0
,
50.0
,
60.0
])
self
.
histogram_time_in_queue_request
=
self
.
_histogram_cls
(
name
=
"vllm:time_in_queue_requests"
,
documentation
=
"Histogram of time the request spent in the queue in seconds."
,
labelnames
=
labelnames
,
buckets
=
[
0.3
,
0.5
,
0.8
,
1.0
,
1.5
,
2.0
,
2.5
,
5.0
,
10.0
,
15.0
,
20.0
,
30.0
,
40.0
,
50.0
,
60.0
])
self
.
histogram_model_forward_time_request
=
self
.
_histogram_cls
(
name
=
"vllm:model_forward_time_milliseconds"
,
documentation
=
"Histogram of time spent in the model forward pass in ms."
,
labelnames
=
labelnames
,
buckets
=
build_1_2_3_5_8_buckets
(
3000
))
self
.
histogram_model_execute_time_request
=
self
.
_histogram_cls
(
name
=
"vllm:model_execute_time_milliseconds"
,
documentation
=
"Histogram of time spent in the model execute function in ms."
,
labelnames
=
labelnames
,
buckets
=
build_1_2_3_5_8_buckets
(
3000
))
# Metadata
self
.
histogram_num_prompt_tokens_request
=
self
.
_histogram_cls
(
name
=
"vllm:request_prompt_tokens"
,
...
...
@@ -299,16 +323,12 @@ class RayMetrics(Metrics):
pass
def
build_
1_2_5_
buckets
(
max_value
:
int
)
->
List
[
int
]:
def
build_buckets
(
mantissa_lst
:
List
[
int
],
max_value
:
int
)
->
List
[
int
]:
"""
Builds a list of buckets with increasing powers of 10 multiplied by
mantissa values
(1, 2, 5)
until the value exceeds the specified maximum.
Builds a list of buckets with increasing powers of 10 multiplied by
mantissa values until the value exceeds the specified maximum.
Example:
>>> build_1_2_5_buckets(100)
[1, 2, 5, 10, 20, 50, 100]
"""
mantissa_lst
=
[
1
,
2
,
5
]
exponent
=
0
buckets
:
List
[
int
]
=
[]
while
True
:
...
...
@@ -321,6 +341,24 @@ def build_1_2_5_buckets(max_value: int) -> List[int]:
exponent
+=
1
def
build_1_2_5_buckets
(
max_value
:
int
)
->
List
[
int
]:
"""
Example:
>>> build_1_2_5_buckets(100)
[1, 2, 5, 10, 20, 50, 100]
"""
return
build_buckets
([
1
,
2
,
5
],
max_value
)
def
build_1_2_3_5_8_buckets
(
max_value
:
int
)
->
List
[
int
]:
"""
Example:
>>> build_1_2_3_5_8_buckets(100)
[1, 2, 3, 5, 8, 10, 20, 30, 50, 80, 100]
"""
return
build_buckets
([
1
,
2
,
3
,
5
,
8
],
max_value
)
def
local_interval_elapsed
(
now
:
float
,
last_log
:
float
,
local_interval
:
float
)
->
bool
:
elapsed_time
=
now
-
last_log
...
...
@@ -486,6 +524,12 @@ class PrometheusStatLogger(StatLoggerBase):
# Latency
self
.
_log_histogram
(
self
.
metrics
.
histogram_e2e_time_request
,
stats
.
time_e2e_requests
)
self
.
_log_histogram
(
self
.
metrics
.
histogram_time_in_queue_request
,
stats
.
time_in_queue_requests
)
self
.
_log_histogram
(
self
.
metrics
.
histogram_model_forward_time_request
,
stats
.
model_forward_time_requests
)
self
.
_log_histogram
(
self
.
metrics
.
histogram_model_execute_time_request
,
stats
.
model_execute_time_requests
)
# Metadata
finished_reason_counter
=
CollectionsCounter
(
stats
.
finished_reason_requests
)
...
...
vllm/engine/metrics_types.py
View file @
74fc2d77
...
...
@@ -46,6 +46,9 @@ class Stats:
# Request stats (should have _requests suffix)
# Latency
time_e2e_requests
:
List
[
float
]
time_in_queue_requests
:
List
[
float
]
model_forward_time_requests
:
List
[
float
]
model_execute_time_requests
:
List
[
float
]
# Metadata
num_prompt_tokens_requests
:
List
[
int
]
num_generation_tokens_requests
:
List
[
int
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment