Unverified Commit 24177984 authored by Mark McLoughlin's avatar Mark McLoughlin Committed by GitHub
Browse files

[Metrics] Deprecate TPOT in favor of ITL (#24110)


Signed-off-by: default avatarMark McLoughlin <markmc@redhat.com>
parent 9480ae24
...@@ -402,7 +402,7 @@ ...@@ -402,7 +402,7 @@
}, },
"disableTextWrap": false, "disableTextWrap": false,
"editorMode": "builder", "editorMode": "builder",
"expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
"fullMetaSearch": false, "fullMetaSearch": false,
"includeNullMetadata": false, "includeNullMetadata": false,
"instant": false, "instant": false,
...@@ -418,7 +418,7 @@ ...@@ -418,7 +418,7 @@
}, },
"disableTextWrap": false, "disableTextWrap": false,
"editorMode": "builder", "editorMode": "builder",
"expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
"fullMetaSearch": false, "fullMetaSearch": false,
"hide": false, "hide": false,
"includeNullMetadata": false, "includeNullMetadata": false,
...@@ -435,7 +435,7 @@ ...@@ -435,7 +435,7 @@
}, },
"disableTextWrap": false, "disableTextWrap": false,
"editorMode": "builder", "editorMode": "builder",
"expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
"fullMetaSearch": false, "fullMetaSearch": false,
"hide": false, "hide": false,
"includeNullMetadata": false, "includeNullMetadata": false,
...@@ -452,7 +452,7 @@ ...@@ -452,7 +452,7 @@
}, },
"disableTextWrap": false, "disableTextWrap": false,
"editorMode": "builder", "editorMode": "builder",
"expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
"fullMetaSearch": false, "fullMetaSearch": false,
"hide": false, "hide": false,
"includeNullMetadata": false, "includeNullMetadata": false,
...@@ -468,7 +468,7 @@ ...@@ -468,7 +468,7 @@
"uid": "${DS_PROMETHEUS}" "uid": "${DS_PROMETHEUS}"
}, },
"editorMode": "code", "editorMode": "code",
"expr": "rate(vllm:time_per_output_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_per_output_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])", "expr": "rate(vllm:inter_token_latency_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:inter_token_latency_seconds_count{model_name=\"$model_name\"}[$__rate_interval])",
"hide": false, "hide": false,
"instant": false, "instant": false,
"legendFormat": "Mean", "legendFormat": "Mean",
...@@ -476,7 +476,7 @@ ...@@ -476,7 +476,7 @@
"refId": "E" "refId": "E"
} }
], ],
"title": "Time Per Output Token Latency", "title": "Inter Token Latency",
"type": "timeseries" "type": "timeseries"
}, },
{ {
......
...@@ -250,12 +250,15 @@ EXPECTED_METRICS_V1 = [ ...@@ -250,12 +250,15 @@ EXPECTED_METRICS_V1 = [
"vllm:request_params_max_tokens_sum", "vllm:request_params_max_tokens_sum",
"vllm:request_params_max_tokens_bucket", "vllm:request_params_max_tokens_bucket",
"vllm:request_params_max_tokens_count", "vllm:request_params_max_tokens_count",
"vllm:time_to_first_token_seconds_sum",
"vllm:time_to_first_token_seconds_bucket",
"vllm:time_to_first_token_seconds_count",
"vllm:time_per_output_token_seconds_sum", "vllm:time_per_output_token_seconds_sum",
"vllm:time_per_output_token_seconds_bucket", "vllm:time_per_output_token_seconds_bucket",
"vllm:time_per_output_token_seconds_count", "vllm:time_per_output_token_seconds_count",
"vllm:time_to_first_token_seconds_sum",
"vllm:time_to_first_token_seconds_bucket",
"vllm:time_to_first_token_seconds_count",
"vllm:inter_token_latency_seconds_sum",
"vllm:inter_token_latency_seconds_bucket",
"vllm:inter_token_latency_seconds_count",
"vllm:e2e_request_latency_seconds_sum", "vllm:e2e_request_latency_seconds_sum",
"vllm:e2e_request_latency_seconds_bucket", "vllm:e2e_request_latency_seconds_bucket",
"vllm:e2e_request_latency_seconds_count", "vllm:e2e_request_latency_seconds_count",
...@@ -273,7 +276,11 @@ EXPECTED_METRICS_V1 = [ ...@@ -273,7 +276,11 @@ EXPECTED_METRICS_V1 = [
"vllm:request_decode_time_seconds_count", "vllm:request_decode_time_seconds_count",
] ]
HIDDEN_DEPRECATED_METRICS: list[str] = [] HIDDEN_DEPRECATED_METRICS: list[str] = [
"vllm:time_per_output_token_seconds_sum",
"vllm:time_per_output_token_seconds_bucket",
"vllm:time_per_output_token_seconds_count",
]
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -289,9 +296,10 @@ async def test_metrics_exist(server: RemoteOpenAIServer, ...@@ -289,9 +296,10 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
assert response.status_code == HTTPStatus.OK assert response.status_code == HTTPStatus.OK
for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS): for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS):
if (not server.show_hidden_metrics if (metric in HIDDEN_DEPRECATED_METRICS
and metric not in HIDDEN_DEPRECATED_METRICS): and not server.show_hidden_metrics):
assert metric in response.text continue
assert metric in response.text
@pytest.mark.asyncio @pytest.mark.asyncio
......
...@@ -1414,7 +1414,7 @@ class LLMEngine: ...@@ -1414,7 +1414,7 @@ class LLMEngine:
num_generation_tokens_iter = 0 num_generation_tokens_iter = 0
num_tokens_iter = 0 num_tokens_iter = 0
time_to_first_tokens_iter: List[float] = [] time_to_first_tokens_iter: List[float] = []
time_per_output_tokens_iter: List[float] = [] inter_token_latencies_iter: List[float] = []
num_preemption_iter = (0 if scheduler_outputs is None else num_preemption_iter = (0 if scheduler_outputs is None else
scheduler_outputs.preempted) scheduler_outputs.preempted)
...@@ -1498,9 +1498,9 @@ class LLMEngine: ...@@ -1498,9 +1498,9 @@ class LLMEngine:
num_generation_tokens_from_prefill_groups += ( num_generation_tokens_from_prefill_groups += (
seq_group.num_seqs()) seq_group.num_seqs())
else: else:
# TPOTs. # ITLs
latency = seq_group.get_last_token_latency() latency = seq_group.get_last_token_latency()
time_per_output_tokens_iter.append(latency) inter_token_latencies_iter.append(latency)
if seq_group.state.current_step == 0: if seq_group.state.current_step == 0:
# For async_output_proc, the do_log_stats() # For async_output_proc, the do_log_stats()
# is called following init_multi_step(), which # is called following init_multi_step(), which
...@@ -1582,7 +1582,7 @@ class LLMEngine: ...@@ -1582,7 +1582,7 @@ class LLMEngine:
num_generation_tokens_iter=num_generation_tokens_iter, num_generation_tokens_iter=num_generation_tokens_iter,
num_tokens_iter=num_tokens_iter, num_tokens_iter=num_tokens_iter,
time_to_first_tokens_iter=time_to_first_tokens_iter, time_to_first_tokens_iter=time_to_first_tokens_iter,
time_per_output_tokens_iter=time_per_output_tokens_iter, inter_token_latencies_iter=inter_token_latencies_iter,
num_preemption_iter=num_preemption_iter, num_preemption_iter=num_preemption_iter,
# Request stats # Request stats
......
...@@ -113,9 +113,21 @@ class Metrics: ...@@ -113,9 +113,21 @@ class Metrics:
0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0, 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0,
2560.0 2560.0
]) ])
# Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds
# TODO: in 0.12, only enable if show_hidden_metrics=True
self.histogram_time_per_output_token = self._histogram_cls( self.histogram_time_per_output_token = self._histogram_cls(
name="vllm:time_per_output_token_seconds", name="vllm:time_per_output_token_seconds",
documentation="Histogram of time per output token in seconds.", documentation=(
"Histogram of time per output token in seconds."
"DEPRECATED: Use vllm:inter_token_latency_seconds instead."),
labelnames=labelnames,
buckets=[
0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
])
self.histogram_inter_token_latency = self._histogram_cls(
name="vllm:inter_token_latency_seconds",
documentation="Histogram of inter token latency in seconds.",
labelnames=labelnames, labelnames=labelnames,
buckets=[ buckets=[
0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
...@@ -491,7 +503,9 @@ class PrometheusStatLogger(StatLoggerBase): ...@@ -491,7 +503,9 @@ class PrometheusStatLogger(StatLoggerBase):
self._log_histogram(self.metrics.histogram_time_to_first_token, self._log_histogram(self.metrics.histogram_time_to_first_token,
stats.time_to_first_tokens_iter) stats.time_to_first_tokens_iter)
self._log_histogram(self.metrics.histogram_time_per_output_token, self._log_histogram(self.metrics.histogram_time_per_output_token,
stats.time_per_output_tokens_iter) stats.inter_token_latencies_iter)
self._log_histogram(self.metrics.histogram_inter_token_latency,
stats.inter_token_latencies_iter)
# Request level data # Request level data
# Latency # Latency
......
...@@ -43,7 +43,7 @@ class Stats: ...@@ -43,7 +43,7 @@ class Stats:
num_generation_tokens_iter: int num_generation_tokens_iter: int
num_tokens_iter: int num_tokens_iter: int
time_to_first_tokens_iter: List[float] time_to_first_tokens_iter: List[float]
time_per_output_tokens_iter: List[float] inter_token_latencies_iter: List[float]
num_preemption_iter: int num_preemption_iter: int
# Request stats (should have _requests suffix) # Request stats (should have _requests suffix)
......
...@@ -377,9 +377,13 @@ class PrometheusStatLogger(StatLoggerBase): ...@@ -377,9 +377,13 @@ class PrometheusStatLogger(StatLoggerBase):
self.histogram_time_to_first_token = make_per_engine( self.histogram_time_to_first_token = make_per_engine(
histogram_time_to_first_token, engine_indexes, model_name) histogram_time_to_first_token, engine_indexes, model_name)
# Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds
# TODO: in 0.12, only enable if show_hidden_metrics=True
histogram_time_per_output_token = self._histogram_cls( histogram_time_per_output_token = self._histogram_cls(
name="vllm:time_per_output_token_seconds", name="vllm:time_per_output_token_seconds",
documentation="Histogram of time per output token in seconds.", documentation=(
"Histogram of time per output token in seconds."
"DEPRECATED: Use vllm:inter_token_latency_seconds instead."),
buckets=[ buckets=[
0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
...@@ -388,6 +392,17 @@ class PrometheusStatLogger(StatLoggerBase): ...@@ -388,6 +392,17 @@ class PrometheusStatLogger(StatLoggerBase):
self.histogram_time_per_output_token = make_per_engine( self.histogram_time_per_output_token = make_per_engine(
histogram_time_per_output_token, engine_indexes, model_name) histogram_time_per_output_token, engine_indexes, model_name)
histogram_inter_token_latency = self._histogram_cls(
name="vllm:inter_token_latency_seconds",
documentation="Histogram of inter-token latency in seconds.",
buckets=[
0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
],
labelnames=labelnames)
self.histogram_inter_token_latency = make_per_engine(
histogram_inter_token_latency, engine_indexes, model_name)
request_latency_buckets = [ request_latency_buckets = [
0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0
...@@ -537,8 +552,9 @@ class PrometheusStatLogger(StatLoggerBase): ...@@ -537,8 +552,9 @@ class PrometheusStatLogger(StatLoggerBase):
self.histogram_n_request[engine_idx].observe(n_param) self.histogram_n_request[engine_idx].observe(n_param)
for ttft in iteration_stats.time_to_first_tokens_iter: for ttft in iteration_stats.time_to_first_tokens_iter:
self.histogram_time_to_first_token[engine_idx].observe(ttft) self.histogram_time_to_first_token[engine_idx].observe(ttft)
for tpot in iteration_stats.time_per_output_tokens_iter: for itl in iteration_stats.inter_token_latencies_iter:
self.histogram_time_per_output_token[engine_idx].observe(tpot) self.histogram_inter_token_latency[engine_idx].observe(itl)
self.histogram_time_per_output_token[engine_idx].observe(itl)
for finished_request in iteration_stats.finished_requests: for finished_request in iteration_stats.finished_requests:
self.counter_request_success[ self.counter_request_success[
......
...@@ -96,7 +96,7 @@ class IterationStats: ...@@ -96,7 +96,7 @@ class IterationStats:
self.max_num_generation_tokens_iter: list[int] = [] self.max_num_generation_tokens_iter: list[int] = []
self.n_params_iter: list[int] = [] self.n_params_iter: list[int] = []
self.time_to_first_tokens_iter: list[float] = [] self.time_to_first_tokens_iter: list[float] = []
self.time_per_output_tokens_iter: list[float] = [] self.inter_token_latencies_iter: list[float] = []
self.waiting_lora_adapters: dict[str, int] = {} self.waiting_lora_adapters: dict[str, int] = {}
self.running_lora_adapters: dict[str, int] = {} self.running_lora_adapters: dict[str, int] = {}
...@@ -128,8 +128,8 @@ class IterationStats: ...@@ -128,8 +128,8 @@ class IterationStats:
if is_prefilling: if is_prefilling:
req_stats.first_token_ts = engine_core_timestamp req_stats.first_token_ts = engine_core_timestamp
else: else:
tpot = engine_core_timestamp - req_stats.last_token_ts itl = engine_core_timestamp - req_stats.last_token_ts
self.time_per_output_tokens_iter.append(tpot) self.inter_token_latencies_iter.append(itl)
req_stats.last_token_ts = engine_core_timestamp req_stats.last_token_ts = engine_core_timestamp
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment