Unverified Commit 3fd1fb63 authored by Mark McLoughlin's avatar Mark McLoughlin Committed by GitHub
Browse files

[V1][Metrics] Hook up IterationStats for Prometheus metrics (#12478)


Signed-off-by: default avatarMark McLoughlin <markmc@redhat.com>
parent 925d2f19
...@@ -105,8 +105,6 @@ EXPECTED_VALUES = { ...@@ -105,8 +105,6 @@ EXPECTED_VALUES = {
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_metrics_counts(server: RemoteOpenAIServer, async def test_metrics_counts(server: RemoteOpenAIServer,
client: openai.AsyncClient, use_v1: bool): client: openai.AsyncClient, use_v1: bool):
if use_v1:
pytest.skip("Skipping test on vllm V1")
for _ in range(_NUM_REQUESTS): for _ in range(_NUM_REQUESTS):
# sending a request triggers the metrics to be logged. # sending a request triggers the metrics to be logged.
await client.completions.create( await client.completions.create(
...@@ -120,6 +118,9 @@ async def test_metrics_counts(server: RemoteOpenAIServer, ...@@ -120,6 +118,9 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
# Loop over all expected metric_families # Loop over all expected metric_families
for metric_family, suffix_values_list in EXPECTED_VALUES.items(): for metric_family, suffix_values_list in EXPECTED_VALUES.items():
if use_v1 and metric_family not in EXPECTED_METRICS_V1:
continue
found_metric = False found_metric = False
# Check to see if the metric_family is found in the prom endpoint. # Check to see if the metric_family is found in the prom endpoint.
...@@ -199,6 +200,8 @@ EXPECTED_METRICS = [ ...@@ -199,6 +200,8 @@ EXPECTED_METRICS = [
EXPECTED_METRICS_V1 = [ EXPECTED_METRICS_V1 = [
"vllm:num_requests_running", "vllm:num_requests_running",
"vllm:num_requests_waiting", "vllm:num_requests_waiting",
"vllm:prompt_tokens_total",
"vllm:generation_tokens_total",
] ]
......
...@@ -305,7 +305,8 @@ class AsyncLLM(EngineClient): ...@@ -305,7 +305,8 @@ class AsyncLLM(EngineClient):
return return
for logger in self.stat_loggers: for logger in self.stat_loggers:
logger.log(scheduler_stats=scheduler_stats) logger.log(scheduler_stats=scheduler_stats,
iteration_stats=iteration_stats)
def encode( def encode(
self, self,
......
import time import time
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Dict from typing import Dict, List
import numpy as np
import prometheus_client import prometheus_client
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.v1.metrics.stats import SchedulerStats from vllm.v1.metrics.stats import IterationStats, SchedulerStats
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -15,27 +16,61 @@ _LOCAL_LOGGING_INTERVAL_SEC = 5.0 ...@@ -15,27 +16,61 @@ _LOCAL_LOGGING_INTERVAL_SEC = 5.0
class StatLoggerBase(ABC): class StatLoggerBase(ABC):
@abstractmethod @abstractmethod
def log(self, scheduler_stats: SchedulerStats): def log(self, scheduler_stats: SchedulerStats,
iteration_stats: IterationStats):
... ...
class LoggingStatLogger(StatLoggerBase): class LoggingStatLogger(StatLoggerBase):
def __init__(self): def __init__(self):
self.last_log_time = time.monotonic() self._reset(time.monotonic())
def log(self, scheduler_stats: SchedulerStats): def _reset(self, now):
"""Log Stats to standard output.""" self.last_log_time = now
# Tracked stats over current local logging interval.
self.num_prompt_tokens: List[int] = []
self.num_generation_tokens: List[int] = []
def _local_interval_elapsed(self, now: float) -> bool:
# Log every _LOCAL_LOGGING_INTERVAL_SEC. # Log every _LOCAL_LOGGING_INTERVAL_SEC.
elapsed_time = now - self.last_log_time
return elapsed_time > _LOCAL_LOGGING_INTERVAL_SEC
def _track_iteration_stats(self, iteration_stats: IterationStats):
# Save tracked stats for token counters.
self.num_prompt_tokens.append(iteration_stats.num_prompt_tokens)
self.num_generation_tokens.append(
iteration_stats.num_generation_tokens)
def _get_throughput(self, tracked_stats: List[int], now: float) -> float:
# Compute summary metrics for tracked stats
return float(np.sum(tracked_stats) / (now - self.last_log_time))
def log(self, scheduler_stats: SchedulerStats,
iteration_stats: IterationStats):
"""Log Stats to standard output."""
self._track_iteration_stats(iteration_stats)
now = time.monotonic() now = time.monotonic()
if now - self.last_log_time < _LOCAL_LOGGING_INTERVAL_SEC: if not self._local_interval_elapsed(now):
return return
self.last_log_time = now
prompt_throughput = self._get_throughput(self.num_prompt_tokens, now)
generation_throughput = self._get_throughput(
self.num_generation_tokens, now)
self._reset(now)
# Format and print output. # Format and print output.
logger.info( logger.info(
"Avg prompt throughput: %.1f tokens/s, "
"Avg generation throughput: %.1f tokens/s, "
"Running: %d reqs, Waiting: %d reqs ", "Running: %d reqs, Waiting: %d reqs ",
prompt_throughput,
generation_throughput,
scheduler_stats.num_running_reqs, scheduler_stats.num_running_reqs,
scheduler_stats.num_waiting_reqs, scheduler_stats.num_waiting_reqs,
) )
...@@ -61,11 +96,26 @@ class PrometheusStatLogger(StatLoggerBase): ...@@ -61,11 +96,26 @@ class PrometheusStatLogger(StatLoggerBase):
documentation="Number of requests waiting to be processed.", documentation="Number of requests waiting to be processed.",
labelnames=labelnames).labels(*labelvalues) labelnames=labelnames).labels(*labelvalues)
def log(self, scheduler_stats: SchedulerStats): self.counter_prompt_tokens = prometheus_client.Counter(
name="vllm:prompt_tokens_total",
documentation="Number of prefill tokens processed.",
labelnames=labelnames).labels(*labelvalues)
self.counter_generation_tokens = prometheus_client.Counter(
name="vllm:generation_tokens_total",
documentation="Number of generation tokens processed.",
labelnames=labelnames).labels(*labelvalues)
def log(self, scheduler_stats: SchedulerStats,
iteration_stats: IterationStats):
"""Log to prometheus.""" """Log to prometheus."""
self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs) self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs)
self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs) self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs)
self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens)
self.counter_generation_tokens.inc(
iteration_stats.num_generation_tokens)
@staticmethod @staticmethod
def _unregister_vllm_metrics(): def _unregister_vllm_metrics():
# Unregister any existing vLLM collectors (for CI/CD # Unregister any existing vLLM collectors (for CI/CD
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment