Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
74fc2d77
Unverified
Commit
74fc2d77
authored
Oct 30, 2024
by
科英
Committed by
GitHub
Oct 29, 2024
Browse files
[Misc] Add metrics for request queue time, forward time, and execute time (#9659)
parent
622b7ab9
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
70 additions
and
15 deletions
+70
-15
vllm/config.py
vllm/config.py
+0
-7
vllm/engine/llm_engine.py
vllm/engine/llm_engine.py
+15
-0
vllm/engine/metrics.py
vllm/engine/metrics.py
+52
-8
vllm/engine/metrics_types.py
vllm/engine/metrics_types.py
+3
-0
No files found.
vllm/config.py
View file @
74fc2d77
...
@@ -1892,13 +1892,6 @@ class ObservabilityConfig:
...
@@ -1892,13 +1892,6 @@ class ObservabilityConfig:
"'otlp_traces_endpoint'. Ensure OpenTelemetry packages are "
"'otlp_traces_endpoint'. Ensure OpenTelemetry packages are "
f
"installed. Original error:
\n
{
otel_import_error_traceback
}
"
)
f
"installed. Original error:
\n
{
otel_import_error_traceback
}
"
)
if
((
self
.
collect_model_forward_time
or
self
.
collect_model_execute_time
)
and
self
.
otlp_traces_endpoint
is
None
):
raise
ValueError
(
"collect_model_forward_time or collect_model_execute_time "
"requires --otlp-traces-endpoint to be set."
)
@
dataclass
(
frozen
=
True
)
@
dataclass
(
frozen
=
True
)
class
EngineConfig
:
class
EngineConfig
:
...
...
vllm/engine/llm_engine.py
View file @
74fc2d77
...
@@ -1645,6 +1645,9 @@ class LLMEngine:
...
@@ -1645,6 +1645,9 @@ class LLMEngine:
# Request stats
# Request stats
# Latency
# Latency
time_e2e_requests
:
List
[
float
]
=
[]
time_e2e_requests
:
List
[
float
]
=
[]
time_in_queue_requests
:
List
[
float
]
=
[]
model_forward_time_requests
:
List
[
float
]
=
[]
model_execute_time_requests
:
List
[
float
]
=
[]
# Metadata
# Metadata
num_prompt_tokens_requests
:
List
[
int
]
=
[]
num_prompt_tokens_requests
:
List
[
int
]
=
[]
num_generation_tokens_requests
:
List
[
int
]
=
[]
num_generation_tokens_requests
:
List
[
int
]
=
[]
...
@@ -1738,6 +1741,15 @@ class LLMEngine:
...
@@ -1738,6 +1741,15 @@ class LLMEngine:
# Latency timings
# Latency timings
time_e2e_requests
.
append
(
now
-
time_e2e_requests
.
append
(
now
-
seq_group
.
metrics
.
arrival_time
)
seq_group
.
metrics
.
arrival_time
)
if
seq_group
.
metrics
.
time_in_queue
is
not
None
:
time_in_queue_requests
.
append
(
seq_group
.
metrics
.
time_in_queue
)
if
seq_group
.
metrics
.
model_forward_time
is
not
None
:
model_forward_time_requests
.
append
(
seq_group
.
metrics
.
model_forward_time
)
if
seq_group
.
metrics
.
model_execute_time
is
not
None
:
model_execute_time_requests
.
append
(
seq_group
.
metrics
.
model_execute_time
*
1000
)
# Metadata
# Metadata
num_prompt_tokens_requests
.
append
(
num_prompt_tokens_requests
.
append
(
len
(
seq_group
.
prompt_token_ids
))
len
(
seq_group
.
prompt_token_ids
))
...
@@ -1795,6 +1807,9 @@ class LLMEngine:
...
@@ -1795,6 +1807,9 @@ class LLMEngine:
# Request stats
# Request stats
# Latency
# Latency
time_e2e_requests
=
time_e2e_requests
,
time_e2e_requests
=
time_e2e_requests
,
time_in_queue_requests
=
time_in_queue_requests
,
model_forward_time_requests
=
model_forward_time_requests
,
model_execute_time_requests
=
model_execute_time_requests
,
# Metadata
# Metadata
num_prompt_tokens_requests
=
num_prompt_tokens_requests
,
num_prompt_tokens_requests
=
num_prompt_tokens_requests
,
num_generation_tokens_requests
=
num_generation_tokens_requests
,
num_generation_tokens_requests
=
num_generation_tokens_requests
,
...
...
vllm/engine/metrics.py
View file @
74fc2d77
...
@@ -133,7 +133,31 @@ class Metrics:
...
@@ -133,7 +133,31 @@ class Metrics:
name
=
"vllm:e2e_request_latency_seconds"
,
name
=
"vllm:e2e_request_latency_seconds"
,
documentation
=
"Histogram of end to end request latency in seconds."
,
documentation
=
"Histogram of end to end request latency in seconds."
,
labelnames
=
labelnames
,
labelnames
=
labelnames
,
buckets
=
[
1.0
,
2.5
,
5.0
,
10.0
,
15.0
,
20.0
,
30.0
,
40.0
,
50.0
,
60.0
])
buckets
=
[
0.3
,
0.5
,
0.8
,
1.0
,
1.5
,
2.0
,
2.5
,
5.0
,
10.0
,
15.0
,
20.0
,
30.0
,
40.0
,
50.0
,
60.0
])
self
.
histogram_time_in_queue_request
=
self
.
_histogram_cls
(
name
=
"vllm:time_in_queue_requests"
,
documentation
=
"Histogram of time the request spent in the queue in seconds."
,
labelnames
=
labelnames
,
buckets
=
[
0.3
,
0.5
,
0.8
,
1.0
,
1.5
,
2.0
,
2.5
,
5.0
,
10.0
,
15.0
,
20.0
,
30.0
,
40.0
,
50.0
,
60.0
])
self
.
histogram_model_forward_time_request
=
self
.
_histogram_cls
(
name
=
"vllm:model_forward_time_milliseconds"
,
documentation
=
"Histogram of time spent in the model forward pass in ms."
,
labelnames
=
labelnames
,
buckets
=
build_1_2_3_5_8_buckets
(
3000
))
self
.
histogram_model_execute_time_request
=
self
.
_histogram_cls
(
name
=
"vllm:model_execute_time_milliseconds"
,
documentation
=
"Histogram of time spent in the model execute function in ms."
,
labelnames
=
labelnames
,
buckets
=
build_1_2_3_5_8_buckets
(
3000
))
# Metadata
# Metadata
self
.
histogram_num_prompt_tokens_request
=
self
.
_histogram_cls
(
self
.
histogram_num_prompt_tokens_request
=
self
.
_histogram_cls
(
name
=
"vllm:request_prompt_tokens"
,
name
=
"vllm:request_prompt_tokens"
,
...
@@ -299,16 +323,12 @@ class RayMetrics(Metrics):
...
@@ -299,16 +323,12 @@ class RayMetrics(Metrics):
pass
pass
def
build_
1_2_5_
buckets
(
max_value
:
int
)
->
List
[
int
]:
def
build_buckets
(
mantissa_lst
:
List
[
int
],
max_value
:
int
)
->
List
[
int
]:
"""
"""
Builds a list of buckets with increasing powers of 10 multiplied by
Builds a list of buckets with increasing powers of 10 multiplied by
mantissa values
(1, 2, 5)
until the value exceeds the specified maximum.
mantissa values until the value exceeds the specified maximum.
Example:
>>> build_1_2_5_buckets(100)
[1, 2, 5, 10, 20, 50, 100]
"""
"""
mantissa_lst
=
[
1
,
2
,
5
]
exponent
=
0
exponent
=
0
buckets
:
List
[
int
]
=
[]
buckets
:
List
[
int
]
=
[]
while
True
:
while
True
:
...
@@ -321,6 +341,24 @@ def build_1_2_5_buckets(max_value: int) -> List[int]:
...
@@ -321,6 +341,24 @@ def build_1_2_5_buckets(max_value: int) -> List[int]:
exponent
+=
1
exponent
+=
1
def build_1_2_5_buckets(max_value: int) -> List[int]:
    """
    Build histogram buckets from the 1-2-5 mantissa series.

    Thin wrapper that forwards the fixed mantissa list ``[1, 2, 5]`` and
    ``max_value`` to ``build_buckets``.

    Example:
    >>> build_1_2_5_buckets(100)
    [1, 2, 5, 10, 20, 50, 100]
    """
    mantissas = [1, 2, 5]
    return build_buckets(mantissas, max_value)
def build_1_2_3_5_8_buckets(max_value: int) -> List[int]:
    """
    Build histogram buckets from the 1-2-3-5-8 mantissa series.

    Thin wrapper that forwards the fixed mantissa list ``[1, 2, 3, 5, 8]``
    and ``max_value`` to ``build_buckets``.

    Example:
    >>> build_1_2_3_5_8_buckets(100)
    [1, 2, 3, 5, 8, 10, 20, 30, 50, 80, 100]
    """
    mantissas = [1, 2, 3, 5, 8]
    return build_buckets(mantissas, max_value)
def
local_interval_elapsed
(
now
:
float
,
last_log
:
float
,
def
local_interval_elapsed
(
now
:
float
,
last_log
:
float
,
local_interval
:
float
)
->
bool
:
local_interval
:
float
)
->
bool
:
elapsed_time
=
now
-
last_log
elapsed_time
=
now
-
last_log
...
@@ -486,6 +524,12 @@ class PrometheusStatLogger(StatLoggerBase):
...
@@ -486,6 +524,12 @@ class PrometheusStatLogger(StatLoggerBase):
# Latency
# Latency
self
.
_log_histogram
(
self
.
metrics
.
histogram_e2e_time_request
,
self
.
_log_histogram
(
self
.
metrics
.
histogram_e2e_time_request
,
stats
.
time_e2e_requests
)
stats
.
time_e2e_requests
)
self
.
_log_histogram
(
self
.
metrics
.
histogram_time_in_queue_request
,
stats
.
time_in_queue_requests
)
self
.
_log_histogram
(
self
.
metrics
.
histogram_model_forward_time_request
,
stats
.
model_forward_time_requests
)
self
.
_log_histogram
(
self
.
metrics
.
histogram_model_execute_time_request
,
stats
.
model_execute_time_requests
)
# Metadata
# Metadata
finished_reason_counter
=
CollectionsCounter
(
finished_reason_counter
=
CollectionsCounter
(
stats
.
finished_reason_requests
)
stats
.
finished_reason_requests
)
...
...
vllm/engine/metrics_types.py
View file @
74fc2d77
...
@@ -46,6 +46,9 @@ class Stats:
...
@@ -46,6 +46,9 @@ class Stats:
# Request stats (should have _requests suffix)
# Request stats (should have _requests suffix)
# Latency
# Latency
time_e2e_requests
:
List
[
float
]
time_e2e_requests
:
List
[
float
]
time_in_queue_requests
:
List
[
float
]
model_forward_time_requests
:
List
[
float
]
model_execute_time_requests
:
List
[
float
]
# Metadata
# Metadata
num_prompt_tokens_requests
:
List
[
int
]
num_prompt_tokens_requests
:
List
[
int
]
num_generation_tokens_requests
:
List
[
int
]
num_generation_tokens_requests
:
List
[
int
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment