Unverified Commit 74fc2d77 authored by 科英's avatar 科英 Committed by GitHub
Browse files

[Misc] Add metrics for request queue time, forward time, and execute time (#9659)

parent 622b7ab9
...@@ -1892,13 +1892,6 @@ class ObservabilityConfig: ...@@ -1892,13 +1892,6 @@ class ObservabilityConfig:
"'otlp_traces_endpoint'. Ensure OpenTelemetry packages are " "'otlp_traces_endpoint'. Ensure OpenTelemetry packages are "
f"installed. Original error:\n{otel_import_error_traceback}") f"installed. Original error:\n{otel_import_error_traceback}")
if ((self.collect_model_forward_time
or self.collect_model_execute_time)
and self.otlp_traces_endpoint is None):
raise ValueError(
"collect_model_forward_time or collect_model_execute_time "
"requires --otlp-traces-endpoint to be set.")
@dataclass(frozen=True) @dataclass(frozen=True)
class EngineConfig: class EngineConfig:
......
...@@ -1645,6 +1645,9 @@ class LLMEngine: ...@@ -1645,6 +1645,9 @@ class LLMEngine:
# Request stats # Request stats
# Latency # Latency
time_e2e_requests: List[float] = [] time_e2e_requests: List[float] = []
time_in_queue_requests: List[float] = []
model_forward_time_requests: List[float] = []
model_execute_time_requests: List[float] = []
# Metadata # Metadata
num_prompt_tokens_requests: List[int] = [] num_prompt_tokens_requests: List[int] = []
num_generation_tokens_requests: List[int] = [] num_generation_tokens_requests: List[int] = []
...@@ -1738,6 +1741,15 @@ class LLMEngine: ...@@ -1738,6 +1741,15 @@ class LLMEngine:
# Latency timings # Latency timings
time_e2e_requests.append(now - time_e2e_requests.append(now -
seq_group.metrics.arrival_time) seq_group.metrics.arrival_time)
if seq_group.metrics.time_in_queue is not None:
time_in_queue_requests.append(
seq_group.metrics.time_in_queue)
if seq_group.metrics.model_forward_time is not None:
model_forward_time_requests.append(
seq_group.metrics.model_forward_time)
if seq_group.metrics.model_execute_time is not None:
model_execute_time_requests.append(
seq_group.metrics.model_execute_time * 1000)
# Metadata # Metadata
num_prompt_tokens_requests.append( num_prompt_tokens_requests.append(
len(seq_group.prompt_token_ids)) len(seq_group.prompt_token_ids))
...@@ -1795,6 +1807,9 @@ class LLMEngine: ...@@ -1795,6 +1807,9 @@ class LLMEngine:
# Request stats # Request stats
# Latency # Latency
time_e2e_requests=time_e2e_requests, time_e2e_requests=time_e2e_requests,
time_in_queue_requests=time_in_queue_requests,
model_forward_time_requests=model_forward_time_requests,
model_execute_time_requests=model_execute_time_requests,
# Metadata # Metadata
num_prompt_tokens_requests=num_prompt_tokens_requests, num_prompt_tokens_requests=num_prompt_tokens_requests,
num_generation_tokens_requests=num_generation_tokens_requests, num_generation_tokens_requests=num_generation_tokens_requests,
......
...@@ -133,7 +133,31 @@ class Metrics: ...@@ -133,7 +133,31 @@ class Metrics:
name="vllm:e2e_request_latency_seconds", name="vllm:e2e_request_latency_seconds",
documentation="Histogram of end to end request latency in seconds.", documentation="Histogram of end to end request latency in seconds.",
labelnames=labelnames, labelnames=labelnames,
buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0]) buckets=[
0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
40.0, 50.0, 60.0
])
self.histogram_time_in_queue_request = self._histogram_cls(
name="vllm:time_in_queue_requests",
documentation=
"Histogram of time the request spent in the queue in seconds.",
labelnames=labelnames,
buckets=[
0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
40.0, 50.0, 60.0
])
self.histogram_model_forward_time_request = self._histogram_cls(
name="vllm:model_forward_time_milliseconds",
documentation=
"Histogram of time spent in the model forward pass in ms.",
labelnames=labelnames,
buckets=build_1_2_3_5_8_buckets(3000))
self.histogram_model_execute_time_request = self._histogram_cls(
name="vllm:model_execute_time_milliseconds",
documentation=
"Histogram of time spent in the model execute function in ms.",
labelnames=labelnames,
buckets=build_1_2_3_5_8_buckets(3000))
# Metadata # Metadata
self.histogram_num_prompt_tokens_request = self._histogram_cls( self.histogram_num_prompt_tokens_request = self._histogram_cls(
name="vllm:request_prompt_tokens", name="vllm:request_prompt_tokens",
...@@ -299,16 +323,12 @@ class RayMetrics(Metrics): ...@@ -299,16 +323,12 @@ class RayMetrics(Metrics):
pass pass
def build_1_2_5_buckets(max_value: int) -> List[int]: def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]:
""" """
Builds a list of buckets with increasing powers of 10 multiplied by Builds a list of buckets with increasing powers of 10 multiplied by
mantissa values (1, 2, 5) until the value exceeds the specified maximum. mantissa values until the value exceeds the specified maximum.
Example:
>>> build_1_2_5_buckets(100)
[1, 2, 5, 10, 20, 50, 100]
""" """
mantissa_lst = [1, 2, 5]
exponent = 0 exponent = 0
buckets: List[int] = [] buckets: List[int] = []
while True: while True:
...@@ -321,6 +341,24 @@ def build_1_2_5_buckets(max_value: int) -> List[int]: ...@@ -321,6 +341,24 @@ def build_1_2_5_buckets(max_value: int) -> List[int]:
exponent += 1 exponent += 1
def build_1_2_5_buckets(max_value: int) -> List[int]:
"""
Example:
>>> build_1_2_5_buckets(100)
[1, 2, 5, 10, 20, 50, 100]
"""
return build_buckets([1, 2, 5], max_value)
def build_1_2_3_5_8_buckets(max_value: int) -> List[int]:
"""
Example:
>>> build_1_2_3_5_8_buckets(100)
[1, 2, 3, 5, 8, 10, 20, 30, 50, 80, 100]
"""
return build_buckets([1, 2, 3, 5, 8], max_value)
def local_interval_elapsed(now: float, last_log: float, def local_interval_elapsed(now: float, last_log: float,
local_interval: float) -> bool: local_interval: float) -> bool:
elapsed_time = now - last_log elapsed_time = now - last_log
...@@ -486,6 +524,12 @@ class PrometheusStatLogger(StatLoggerBase): ...@@ -486,6 +524,12 @@ class PrometheusStatLogger(StatLoggerBase):
# Latency # Latency
self._log_histogram(self.metrics.histogram_e2e_time_request, self._log_histogram(self.metrics.histogram_e2e_time_request,
stats.time_e2e_requests) stats.time_e2e_requests)
self._log_histogram(self.metrics.histogram_time_in_queue_request,
stats.time_in_queue_requests)
self._log_histogram(self.metrics.histogram_model_forward_time_request,
stats.model_forward_time_requests)
self._log_histogram(self.metrics.histogram_model_execute_time_request,
stats.model_execute_time_requests)
# Metadata # Metadata
finished_reason_counter = CollectionsCounter( finished_reason_counter = CollectionsCounter(
stats.finished_reason_requests) stats.finished_reason_requests)
......
...@@ -46,6 +46,9 @@ class Stats: ...@@ -46,6 +46,9 @@ class Stats:
# Request stats (should have _requests suffix) # Request stats (should have _requests suffix)
# Latency # Latency
time_e2e_requests: List[float] time_e2e_requests: List[float]
time_in_queue_requests: List[float]
model_forward_time_requests: List[float]
model_execute_time_requests: List[float]
# Metadata # Metadata
num_prompt_tokens_requests: List[int] num_prompt_tokens_requests: List[int]
num_generation_tokens_requests: List[int] num_generation_tokens_requests: List[int]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment