Unverified commit 0e98964e, authored by Mark McLoughlin, committed by GitHub
Browse files

[V1][Metrics] Remove metrics that were deprecated in 0.8 (#18837)


Signed-off-by: Mark McLoughlin <markmc@redhat.com>
parent c68b5c63
...@@ -35,19 +35,6 @@ The following metrics are exposed: ...@@ -35,19 +35,6 @@ The following metrics are exposed:
--8<-- "vllm/engine/metrics.py:metrics-definitions" --8<-- "vllm/engine/metrics.py:metrics-definitions"
``` ```
The following metrics are deprecated and due to be removed in a future version:
- `vllm:num_requests_swapped`, `vllm:cpu_cache_usage_perc`, and
`vllm:cpu_prefix_cache_hit_rate` because KV cache offloading is not
used in V1.
- `vllm:gpu_prefix_cache_hit_rate` is replaced by queries+hits
counters in V1.
- `vllm:time_in_queue_requests` because it duplicates
`vllm:request_queue_time_seconds`.
- `vllm:model_forward_time_milliseconds` and
`vllm:model_execute_time_milliseconds` because
prefill/decode/inference time metrics should be used instead.
Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1` Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1`
but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch, but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch,
and are then removed in version `X.Y+2`. and are then removed in version `X.Y+2`.
...@@ -577,23 +577,6 @@ ...@@ -577,23 +577,6 @@
"refId": "A", "refId": "A",
"useBackend": false "useBackend": false
}, },
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "vllm:num_requests_swapped{model_name=\"$model_name\"}",
"fullMetaSearch": false,
"hide": false,
"includeNullMetadata": true,
"instant": false,
"legendFormat": "Num Swapped",
"range": true,
"refId": "B",
"useBackend": false
},
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
...@@ -874,19 +857,6 @@ ...@@ -874,19 +857,6 @@
"legendFormat": "GPU Cache Usage", "legendFormat": "GPU Cache Usage",
"range": true, "range": true,
"refId": "A" "refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "vllm:cpu_cache_usage_perc{model_name=\"$model_name\"}",
"hide": false,
"instant": false,
"legendFormat": "CPU Cache Usage",
"range": true,
"refId": "B"
} }
], ],
"title": "Cache Utilization", "title": "Cache Utilization",
......
...@@ -171,10 +171,8 @@ async def test_metrics_counts(server: RemoteOpenAIServer, ...@@ -171,10 +171,8 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
EXPECTED_METRICS = [ EXPECTED_METRICS = [
"vllm:num_requests_running", "vllm:num_requests_running",
"vllm:num_requests_swapped", # deprecated
"vllm:num_requests_waiting", "vllm:num_requests_waiting",
"vllm:gpu_cache_usage_perc", "vllm:gpu_cache_usage_perc",
"vllm:cpu_cache_usage_perc", # deprecated
"vllm:time_to_first_token_seconds_sum", "vllm:time_to_first_token_seconds_sum",
"vllm:time_to_first_token_seconds_bucket", "vllm:time_to_first_token_seconds_bucket",
"vllm:time_to_first_token_seconds_count", "vllm:time_to_first_token_seconds_count",
...@@ -274,10 +272,7 @@ EXPECTED_METRICS_V1 = [ ...@@ -274,10 +272,7 @@ EXPECTED_METRICS_V1 = [
"vllm:request_decode_time_seconds_count", "vllm:request_decode_time_seconds_count",
] ]
HIDDEN_DEPRECATED_METRICS = [ HIDDEN_DEPRECATED_METRICS: list[str] = []
"vllm:num_requests_swapped",
"vllm:cpu_cache_usage_perc",
]
@pytest.mark.asyncio @pytest.mark.asyncio
......
...@@ -1680,9 +1680,6 @@ class LLMEngine: ...@@ -1680,9 +1680,6 @@ class LLMEngine:
time_inference_requests: List[float] = [] time_inference_requests: List[float] = []
time_prefill_requests: List[float] = [] time_prefill_requests: List[float] = []
time_decode_requests: List[float] = [] time_decode_requests: List[float] = []
time_in_queue_requests: List[float] = []
model_forward_time_requests: List[float] = []
model_execute_time_requests: List[float] = []
# Metadata # Metadata
num_prompt_tokens_requests: List[int] = [] num_prompt_tokens_requests: List[int] = []
num_generation_tokens_requests: List[int] = [] num_generation_tokens_requests: List[int] = []
...@@ -1790,15 +1787,6 @@ class LLMEngine: ...@@ -1790,15 +1787,6 @@ class LLMEngine:
now - seq_group.metrics.first_token_time) now - seq_group.metrics.first_token_time)
time_inference_requests.append( time_inference_requests.append(
now - seq_group.metrics.first_scheduled_time) now - seq_group.metrics.first_scheduled_time)
if seq_group.metrics.time_in_queue is not None:
time_in_queue_requests.append(
seq_group.metrics.time_in_queue)
if seq_group.metrics.model_forward_time is not None:
model_forward_time_requests.append(
seq_group.metrics.model_forward_time)
if seq_group.metrics.model_execute_time is not None:
model_execute_time_requests.append(
seq_group.metrics.model_execute_time * 1000)
# Metadata # Metadata
num_prompt_tokens_requests.append( num_prompt_tokens_requests.append(
len(seq_group.prompt_token_ids)) len(seq_group.prompt_token_ids))
...@@ -1867,9 +1855,6 @@ class LLMEngine: ...@@ -1867,9 +1855,6 @@ class LLMEngine:
time_inference_requests=time_inference_requests, time_inference_requests=time_inference_requests,
time_prefill_requests=time_prefill_requests, time_prefill_requests=time_prefill_requests,
time_decode_requests=time_decode_requests, time_decode_requests=time_decode_requests,
time_in_queue_requests=time_in_queue_requests,
model_forward_time_requests=model_forward_time_requests,
model_execute_time_requests=model_execute_time_requests,
# Metadata # Metadata
num_prompt_tokens_requests=num_prompt_tokens_requests, num_prompt_tokens_requests=num_prompt_tokens_requests,
num_generation_tokens_requests=num_generation_tokens_requests, num_generation_tokens_requests=num_generation_tokens_requests,
......
...@@ -80,17 +80,6 @@ class Metrics: ...@@ -80,17 +80,6 @@ class Metrics:
multiprocess_mode="livemostrecent", multiprocess_mode="livemostrecent",
) )
# Deprecated in 0.8 - KV cache offloading is not used in V1
# Hidden in 0.9, due to be removed in 0.10
if self.show_hidden_metrics:
self.gauge_scheduler_swapped = self._gauge_cls(
name="vllm:num_requests_swapped",
documentation=(
"Number of requests swapped to CPU. "
"DEPRECATED: KV cache offloading is not used in V1"),
labelnames=labelnames,
multiprocess_mode="sum")
# KV Cache Usage in % # KV Cache Usage in %
self.gauge_gpu_cache_usage = self._gauge_cls( self.gauge_gpu_cache_usage = self._gauge_cls(
name="vllm:gpu_cache_usage_perc", name="vllm:gpu_cache_usage_perc",
...@@ -98,35 +87,6 @@ class Metrics: ...@@ -98,35 +87,6 @@ class Metrics:
labelnames=labelnames, labelnames=labelnames,
multiprocess_mode="sum") multiprocess_mode="sum")
# Deprecated in 0.8 - KV cache offloading is not used in V1
# Hidden in 0.9, due to be removed in 0.10
if self.show_hidden_metrics:
self.gauge_cpu_cache_usage = self._gauge_cls(
name="vllm:cpu_cache_usage_perc",
documentation=(
"CPU KV-cache usage. 1 means 100 percent usage. "
"DEPRECATED: KV cache offloading is not used in V1"),
labelnames=labelnames,
multiprocess_mode="sum")
self.gauge_cpu_prefix_cache_hit_rate = self._gauge_cls(
name="vllm:cpu_prefix_cache_hit_rate",
documentation=(
"CPU prefix cache block hit rate. "
"DEPRECATED: KV cache offloading is not used in V1"),
labelnames=labelnames,
multiprocess_mode="sum")
# Deprecated in 0.8 - replaced by queries+hits counters in V1
# Hidden in 0.9, due to be removed in 0.10
if self.show_hidden_metrics:
self.gauge_gpu_prefix_cache_hit_rate = self._gauge_cls(
name="vllm:gpu_prefix_cache_hit_rate",
documentation=("GPU prefix cache block hit rate. "
"DEPRECATED: use vllm:gpu_prefix_cache_queries "
"and vllm:gpu_prefix_cache_queries in V1"),
labelnames=labelnames,
multiprocess_mode="sum")
# Iteration stats # Iteration stats
self.counter_num_preemption = self._counter_cls( self.counter_num_preemption = self._counter_cls(
name="vllm:num_preemptions_total", name="vllm:num_preemptions_total",
...@@ -200,36 +160,6 @@ class Metrics: ...@@ -200,36 +160,6 @@ class Metrics:
"Histogram of time spent in DECODE phase for request.", "Histogram of time spent in DECODE phase for request.",
labelnames=labelnames, labelnames=labelnames,
buckets=request_latency_buckets) buckets=request_latency_buckets)
# Deprecated in 0.8 - duplicates vllm:request_queue_time_seconds:
# Hidden in 0.9, due to be removed in 0.10
if self.show_hidden_metrics:
self.histogram_time_in_queue_request = self._histogram_cls(
name="vllm:time_in_queue_requests",
documentation=
("Histogram of time the request spent in the queue in seconds. "
"DEPRECATED: use vllm:request_queue_time_seconds instead."),
labelnames=labelnames,
buckets=request_latency_buckets)
# Deprecated in 0.8 - use prefill/decode/inference time metrics
# Hidden in 0.9, due to be removed in 0.10
if self.show_hidden_metrics:
self.histogram_model_forward_time_request = self._histogram_cls(
name="vllm:model_forward_time_milliseconds",
documentation=
("Histogram of time spent in the model forward pass in ms. "
"DEPRECATED: use prefill/decode/inference time metrics instead"
),
labelnames=labelnames,
buckets=build_1_2_3_5_8_buckets(3000))
self.histogram_model_execute_time_request = self._histogram_cls(
name="vllm:model_execute_time_milliseconds",
documentation=
("Histogram of time spent in the model execute function in ms."
"DEPRECATED: use prefill/decode/inference time metrics instead"
),
labelnames=labelnames,
buckets=build_1_2_3_5_8_buckets(3000))
# Metadata # Metadata
self.histogram_num_prompt_tokens_request = self._histogram_cls( self.histogram_num_prompt_tokens_request = self._histogram_cls(
...@@ -580,20 +510,10 @@ class PrometheusStatLogger(StatLoggerBase): ...@@ -580,20 +510,10 @@ class PrometheusStatLogger(StatLoggerBase):
# System state data # System state data
self._log_gauge(self.metrics.gauge_scheduler_running, self._log_gauge(self.metrics.gauge_scheduler_running,
stats.num_running_sys) stats.num_running_sys)
if self.metrics.show_hidden_metrics:
self._log_gauge(self.metrics.gauge_scheduler_swapped,
stats.num_swapped_sys)
self._log_gauge(self.metrics.gauge_scheduler_waiting, self._log_gauge(self.metrics.gauge_scheduler_waiting,
stats.num_waiting_sys) stats.num_waiting_sys)
self._log_gauge(self.metrics.gauge_gpu_cache_usage, self._log_gauge(self.metrics.gauge_gpu_cache_usage,
stats.gpu_cache_usage_sys) stats.gpu_cache_usage_sys)
if self.metrics.show_hidden_metrics:
self._log_gauge(self.metrics.gauge_cpu_cache_usage,
stats.cpu_cache_usage_sys)
self._log_gauge(self.metrics.gauge_cpu_prefix_cache_hit_rate,
stats.cpu_prefix_cache_hit_rate)
self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
stats.gpu_prefix_cache_hit_rate)
# Including max-lora in metric, in future this property of lora # Including max-lora in metric, in future this property of lora
# config maybe extended to be dynamic. # config maybe extended to be dynamic.
lora_info = { lora_info = {
...@@ -631,15 +551,6 @@ class PrometheusStatLogger(StatLoggerBase): ...@@ -631,15 +551,6 @@ class PrometheusStatLogger(StatLoggerBase):
stats.time_prefill_requests) stats.time_prefill_requests)
self._log_histogram(self.metrics.histogram_decode_time_request, self._log_histogram(self.metrics.histogram_decode_time_request,
stats.time_decode_requests) stats.time_decode_requests)
if self.metrics.show_hidden_metrics:
self._log_histogram(self.metrics.histogram_time_in_queue_request,
stats.time_in_queue_requests)
self._log_histogram(
self.metrics.histogram_model_forward_time_request,
stats.model_forward_time_requests)
self._log_histogram(
self.metrics.histogram_model_execute_time_request,
stats.model_execute_time_requests)
# Metadata # Metadata
finished_reason_counter = CollectionsCounter( finished_reason_counter = CollectionsCounter(
stats.finished_reason_requests) stats.finished_reason_requests)
......
...@@ -53,9 +53,6 @@ class Stats: ...@@ -53,9 +53,6 @@ class Stats:
time_inference_requests: List[float] time_inference_requests: List[float]
time_prefill_requests: List[float] time_prefill_requests: List[float]
time_decode_requests: List[float] time_decode_requests: List[float]
time_in_queue_requests: List[float]
model_forward_time_requests: List[float]
model_execute_time_requests: List[float]
# Metadata # Metadata
num_prompt_tokens_requests: List[int] num_prompt_tokens_requests: List[int]
num_generation_tokens_requests: List[int] num_generation_tokens_requests: List[int]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment