Unverified commit 9bb10a7d, authored by Kunjan and committed by GitHub
Browse files

[MISC] Add lora requests to metrics (#9477)


Co-authored-by: Kunjan Patel <kunjanp_google_com@vllm.us-central1-a.c.kunjanp-gke-dev-2.internal>
parent 3921a2f2
import time import time
from collections import Counter as collectionsCounter
from collections import deque from collections import deque
from contextlib import contextmanager from contextlib import contextmanager
from dataclasses import dataclass from dataclasses import dataclass
...@@ -1617,6 +1618,25 @@ class LLMEngine: ...@@ -1617,6 +1618,25 @@ class LLMEngine:
n_requests: List[int] = [] n_requests: List[int] = []
finished_reason_requests: List[str] = [] finished_reason_requests: List[str] = []
# Lora requests
running_lora_adapters = dict(
collectionsCounter([
running_request.lora_request.lora_name
for scheduler in self.scheduler
for running_request in scheduler.running
if running_request.lora_request
]))
waiting_lora_adapters = dict(
collectionsCounter([
waiting_request.lora_request.lora_name
for scheduler in self.scheduler
for waiting_request in scheduler.waiting
if waiting_request.lora_request
]))
max_lora_stat = "0"
if self.lora_config:
max_lora_stat = str(self.lora_config.max_loras)
# NOTE: This loop assumes prefill seq_groups are before # NOTE: This loop assumes prefill seq_groups are before
# decode seq_groups in scheduled_seq_groups. # decode seq_groups in scheduled_seq_groups.
if scheduler_outputs is not None: if scheduler_outputs is not None:
...@@ -1738,7 +1758,9 @@ class LLMEngine: ...@@ -1738,7 +1758,9 @@ class LLMEngine:
num_generation_tokens_requests=num_generation_tokens_requests, num_generation_tokens_requests=num_generation_tokens_requests,
n_requests=n_requests, n_requests=n_requests,
finished_reason_requests=finished_reason_requests, finished_reason_requests=finished_reason_requests,
) max_lora=str(max_lora_stat),
waiting_lora_adapters=list(waiting_lora_adapters.keys()),
running_lora_adapters=list(running_lora_adapters.keys()))
def add_lora(self, lora_request: LoRARequest) -> bool: def add_lora(self, lora_request: LoRARequest) -> bool:
return self.model_executor.add_lora(lora_request) return self.model_executor.add_lora(lora_request)
......
...@@ -34,7 +34,11 @@ class Metrics: ...@@ -34,7 +34,11 @@ class Metrics:
See https://prometheus.github.io/client_python/multiprocess/ for more See https://prometheus.github.io/client_python/multiprocess/ for more
details on limitations. details on limitations.
""" """
labelname_finish_reason = "finished_reason" labelname_finish_reason = "finished_reason"
labelname_waiting_lora_adapters = "waiting_lora_adapters"
labelname_running_lora_adapters = "running_lora_adapters"
labelname_max_lora = "max_lora"
_gauge_cls = prometheus_client.Gauge _gauge_cls = prometheus_client.Gauge
_counter_cls = prometheus_client.Counter _counter_cls = prometheus_client.Counter
_histogram_cls = prometheus_client.Histogram _histogram_cls = prometheus_client.Histogram
...@@ -55,6 +59,16 @@ class Metrics: ...@@ -55,6 +59,16 @@ class Metrics:
documentation="Number of requests waiting to be processed.", documentation="Number of requests waiting to be processed.",
labelnames=labelnames, labelnames=labelnames,
multiprocess_mode="sum") multiprocess_mode="sum")
self.gauge_lora_info = self._gauge_cls(
name="vllm:lora_requests_info",
documentation="Running stats on lora requests.",
labelnames=[
self.labelname_running_lora_adapters,
self.labelname_max_lora,
self.labelname_waiting_lora_adapters,
],
multiprocess_mode="livemostrecent",
)
self.gauge_scheduler_swapped = self._gauge_cls( self.gauge_scheduler_swapped = self._gauge_cls(
name="vllm:num_requests_swapped", name="vllm:num_requests_swapped",
documentation="Number of requests swapped to CPU.", documentation="Number of requests swapped to CPU.",
...@@ -426,6 +440,9 @@ class PrometheusStatLogger(StatLoggerBase): ...@@ -426,6 +440,9 @@ class PrometheusStatLogger(StatLoggerBase):
for datum in data: for datum in data:
histogram.labels(**self.labels).observe(datum) histogram.labels(**self.labels).observe(datum)
def _log_gauge_string(self, gauge, data: Dict[str, str]) -> None:
    """Publish string-valued stats on ``gauge`` by encoding them as labels.

    Args:
        gauge: Object exposing ``labels(**kwargs)``; the object it returns
            must expose ``set(value)``.
        data: Mapping of label name to label value. It is expanded into
            keyword arguments for ``gauge.labels``.
    """
    # The numeric sample is a constant 1; the payload is carried entirely
    # by the label values selected here.
    labelled_child = gauge.labels(**data)
    labelled_child.set(1)
def _log_prometheus(self, stats: Stats) -> None: def _log_prometheus(self, stats: Stats) -> None:
# System state data # System state data
self._log_gauge(self.metrics.gauge_scheduler_running, self._log_gauge(self.metrics.gauge_scheduler_running,
...@@ -442,7 +459,17 @@ class PrometheusStatLogger(StatLoggerBase): ...@@ -442,7 +459,17 @@ class PrometheusStatLogger(StatLoggerBase):
stats.cpu_prefix_cache_hit_rate) stats.cpu_prefix_cache_hit_rate)
self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate, self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate,
stats.gpu_prefix_cache_hit_rate) stats.gpu_prefix_cache_hit_rate)
# Include max_lora in the metric; in the future this property of the
# LoRA config may be extended to be dynamic.
lora_info = {
self.metrics.labelname_running_lora_adapters:
",".join(stats.running_lora_adapters),
self.metrics.labelname_waiting_lora_adapters:
",".join(stats.waiting_lora_adapters),
self.metrics.labelname_max_lora:
stats.max_lora,
}
self._log_gauge_string(self.metrics.gauge_lora_info, lora_info)
# Iteration level data # Iteration level data
self._log_counter(self.metrics.counter_num_preemption, self._log_counter(self.metrics.counter_num_preemption,
stats.num_preemption_iter) stats.num_preemption_iter)
......
...@@ -51,6 +51,9 @@ class Stats: ...@@ -51,6 +51,9 @@ class Stats:
num_generation_tokens_requests: List[int] num_generation_tokens_requests: List[int]
n_requests: List[int] n_requests: List[int]
finished_reason_requests: List[str] finished_reason_requests: List[str]
waiting_lora_adapters: List[str]
running_lora_adapters: List[str]
max_lora: str
spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment