"...git@developer.sourcefind.cn:OpenDAS/lmdeploy.git" did not exist on "858087a625c1dc431ab8b174331dfc95210f6e3a"
Unverified Commit 93b38bea authored by Robert Shaw, committed by GitHub

Refactor Prometheus and Add Request Level Metrics (#2316)

parent d0d93b92
# vLLM + Prometheus/Grafana
This is a simple example that shows how to connect vLLM metric logging to the Prometheus/Grafana stack. For this example, we launch Prometheus and Grafana via Docker; see the [Prometheus](https://prometheus.io/) and [Grafana](https://grafana.com/) websites for other deployment options.
Install:
- [`docker`](https://docs.docker.com/engine/install/)
- [`docker compose`](https://docs.docker.com/compose/install/linux/#install-using-the-repository)
### Launch
Prometheus metric logging is enabled by default in the OpenAI-compatible server. Launch via the entrypoint:
```bash
python3 -m vllm.entrypoints.openai.api_server \
--model mistralai/Mistral-7B-v0.1 \
--max-model-len 2048 \
--disable-log-requests
```
Launch Prometheus and Grafana servers with `docker compose`:
```bash
docker compose up
```
Submit some sample requests to the server:
```bash
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 ../../benchmarks/benchmark_serving.py \
--model mistralai/Mistral-7B-v0.1 \
--tokenizer mistralai/Mistral-7B-v0.1 \
--endpoint /v1/completions \
--dataset ShareGPT_V3_unfiltered_cleaned_split.json \
--request-rate 3.0
```
Navigating to [`http://localhost:8000/metrics`](http://localhost:8000/metrics) will show the raw Prometheus metrics being exposed by vLLM.
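If you prefer a programmatic check, the short sketch below (using only the Python standard library, and assuming the server above is still listening on the default port 8000) fetches the endpoint and prints just the vLLM series:
```python
# Minimal sketch: scrape the /metrics endpoint and show only the vLLM series.
# Assumes the OpenAI-compatible server above is running on localhost:8000.
from urllib.request import urlopen

with urlopen("http://localhost:8000/metrics") as response:
    text = response.read().decode()

for line in text.splitlines():
    # Metric samples start with the metric name; HELP/TYPE lines start with '#'.
    if line.startswith("vllm:"):
        print(line)
```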
### Grafana Dashboard
Navigate to [`http://localhost:3000`](http://localhost:3000). Log in with the default username (`admin`) and password (`admin`).
#### Add Prometheus Data Source
Navigate to [`http://localhost:3000/connections/datasources/new`](http://localhost:3000/connections/datasources/new) and select Prometheus.
On the Prometheus configuration page, add the `Prometheus Server URL` under `Connection`. In this setup, Grafana and Prometheus run in separate containers, but Docker creates a DNS name for each container, so you can simply use `http://prometheus:9090`.
Click `Save & Test`. You should get a green check saying "Successfully queried the Prometheus API."
#### Import Dashboard
Navigate to [`http://localhost:3000/dashboard/import`](http://localhost:3000/dashboard/import), upload `grafana.json`, and select the `prometheus` datasource. You should see a screen that looks like the following:
![Grafana Dashboard Image](https://i.imgur.com/R2vH9VW.png)
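The counter metrics added in this change (`vllm:prompt_tokens_total`, `vllm:generation_tokens_total`) are meant to be turned into throughputs with `rate()` on the Prometheus/Grafana side. As a rough illustration (not part of the dashboard), the sketch below queries Prometheus's HTTP API directly, assuming Prometheus is published on `localhost:9090` as in the `docker-compose` setup below:
```python
# Rough sketch: compute throughput from the raw token counters via PromQL,
# using Prometheus's HTTP API. Assumes Prometheus is on localhost:9090.
import json
from urllib.parse import urlencode
from urllib.request import urlopen

PROMETHEUS_URL = "http://localhost:9090"


def instant_query(promql: str) -> list:
    """Run a PromQL instant query and return the raw result list."""
    url = f"{PROMETHEUS_URL}/api/v1/query?" + urlencode({"query": promql})
    with urlopen(url) as response:
        return json.load(response)["data"]["result"]


# Prefill and generation throughput (tokens/s) over the last minute,
# derived from the counters rather than the legacy vLLM-side gauges.
print(instant_query("rate(vllm:prompt_tokens_total[1m])"))
print(instant_query("rate(vllm:generation_tokens_total[1m])"))
```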
# docker-compose.yaml
version: "3"
services:
prometheus:
image: prom/prometheus:latest
extra_hosts:
- "host.docker.internal:host-gateway" # allow a direct connection from container to the local machine
ports:
- "9090:9090" # the default port used by Prometheus
volumes:
- ${PWD}/prometheus.yaml:/etc/prometheus/prometheus.yml # mount Prometheus config file
grafana:
image: grafana/grafana:latest
depends_on:
- prometheus
ports:
- "3000:3000" # the default port used by Grafana
# prometheus.yaml
global:
  scrape_interval: 5s
  evaluation_interval: 30s

scrape_configs:
  - job_name: vllm
    static_configs:
      - targets:
          - 'host.docker.internal:8000'
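To confirm that Prometheus is actually scraping the vLLM server defined above, one option (a sketch, assuming the compose stack is up and Prometheus is published on `localhost:9090`) is to ask Prometheus for its active targets:
```python
# Sketch: list Prometheus's active scrape targets and their health.
# Assumes the docker-compose stack above is running.
import json
from urllib.request import urlopen

with urlopen("http://localhost:9090/api/v1/targets") as response:
    targets = json.load(response)["data"]["activeTargets"]

for target in targets:
    print(target["labels"].get("job"), target["scrapeUrl"], target["health"])
```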
# vllm/engine/llm_engine.py
@@ -10,7 +10,7 @@ from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
                          SchedulerConfig, LoRAConfig)
 from vllm.core.scheduler import Scheduler, SchedulerOutputs
 from vllm.engine.arg_utils import EngineArgs
-from vllm.engine.metrics import record_metrics
+from vllm.engine.metrics import StatLogger, Stats
 from vllm.engine.ray_utils import RayWorkerVllm, initialize_cluster, ray
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
@@ -28,8 +28,7 @@ if TYPE_CHECKING:
     from ray.util.placement_group import PlacementGroup
 
 logger = init_logger(__name__)
 
-_LOGGING_INTERVAL_SEC = 5
+_LOCAL_LOGGING_INTERVAL_SEC = 5
 
 
 class LLMEngine:
@@ -116,12 +115,10 @@ class LLMEngine:
         # Create the scheduler.
         self.scheduler = Scheduler(scheduler_config, cache_config, lora_config)
 
-        # Logging.
-        self.last_logging_time = 0.0
-        # List of (timestamp, num_tokens)
-        self.num_prompt_tokens: List[Tuple[float, int]] = []
-        # List of (timestamp, num_tokens)
-        self.num_generation_tokens: List[Tuple[float, int]] = []
+        # Metric Logging.
+        if self.log_stats:
+            self.stat_logger = StatLogger(
+                local_interval=_LOCAL_LOGGING_INTERVAL_SEC)
 
     def get_tokenizer_for_seq(self, sequence: Sequence):
         return self.tokenizer.get_lora_tokenizer(sequence.lora_request)
@@ -537,6 +534,7 @@ class LLMEngine:
     def _process_sequence_group_outputs(self, seq_group: SequenceGroup,
                                         outputs: SequenceGroupOutput) -> None:
+
         # Process prompt logprobs
         prompt_logprobs = outputs.prompt_logprobs
         if prompt_logprobs is not None:
@@ -732,10 +730,10 @@ class LLMEngine:
                 and not seq_group.prefix.computed):
             seq_group.prefix.computed = True
 
+        # Log stats.
         if self.log_stats:
-            # Log the system stats.
-            self._log_system_stats(scheduler_outputs.prompt_run,
-                                   scheduler_outputs.num_batched_tokens)
+            self.stat_logger.log(self._get_stats(scheduler_outputs))
+
         return request_outputs
 
     def step(self) -> List[RequestOutput]:
@@ -810,81 +808,73 @@ class LLMEngine:
         return self._process_model_outputs(output, scheduler_outputs)
 
     def do_log_stats(self) -> None:
-        self._log_system_stats(False, 0)
+        """Forced log when no requests active."""
+        if self.log_stats:
+            self.stat_logger.log(self._get_stats(scheduler_outputs=None))
 
-    def _log_system_stats(
-        self,
-        prompt_run: bool,
-        num_batched_tokens: int,
-    ) -> None:
+    def _get_stats(self,
+                   scheduler_outputs: Optional[SchedulerOutputs]) -> Stats:
+        """Get Stats to be Logged to Prometheus."""
         now = time.monotonic()
-        # Log the number of batched input tokens.
-        if prompt_run:
-            self.num_prompt_tokens.append((now, num_batched_tokens))
-        else:
-            self.num_generation_tokens.append((now, num_batched_tokens))
 
-        should_log = now - self.last_logging_time >= _LOGGING_INTERVAL_SEC
-        if not should_log:
-            return
+        # KV Cache Usage in %.
+        num_total_gpu = self.cache_config.num_gpu_blocks
+        num_free_gpu = self.scheduler.block_manager.get_num_free_gpu_blocks()
+        gpu_cache_usage = 1.0 - (num_free_gpu / num_total_gpu)
 
-        # Discard the old stats.
-        self.num_prompt_tokens = [(t, n) for t, n in self.num_prompt_tokens
-                                  if now - t < _LOGGING_INTERVAL_SEC]
-        self.num_generation_tokens = [(t, n)
-                                      for t, n in self.num_generation_tokens
-                                      if now - t < _LOGGING_INTERVAL_SEC]
+        num_total_cpu = self.cache_config.num_cpu_blocks
+        cpu_cache_usage = 0.
+        if num_total_cpu > 0:
+            num_free_cpu = self.scheduler.block_manager.get_num_free_cpu_blocks(
+            )
+            cpu_cache_usage = 1.0 - (num_free_cpu / num_total_cpu)
 
-        if len(self.num_prompt_tokens) > 1:
-            total_num_tokens = sum(n for _, n in self.num_prompt_tokens[:-1])
-            window = now - self.num_prompt_tokens[0][0]
-            avg_prompt_throughput = total_num_tokens / window
-        else:
-            avg_prompt_throughput = 0.0
-        if len(self.num_generation_tokens) > 1:
-            total_num_tokens = sum(n
-                                   for _, n in self.num_generation_tokens[:-1])
-            window = now - self.num_generation_tokens[0][0]
-            avg_generation_throughput = total_num_tokens / window
-        else:
-            avg_generation_throughput = 0.0
+        # Scheduler State
+        num_running = len(self.scheduler.running)
+        num_swapped = len(self.scheduler.swapped)
+        num_waiting = len(self.scheduler.waiting)
 
-        total_num_gpu_blocks = self.cache_config.num_gpu_blocks
-        num_free_gpu_blocks = (
-            self.scheduler.block_manager.get_num_free_gpu_blocks())
-        num_used_gpu_blocks = total_num_gpu_blocks - num_free_gpu_blocks
-        gpu_cache_usage = num_used_gpu_blocks / total_num_gpu_blocks
+        # Iteration stats if we have scheduler output.
+        num_prompt_tokens = 0
+        num_generation_tokens = 0
+        time_to_first_tokens = []
+        time_per_output_tokens = []
+        time_e2e_requests = []
+        if scheduler_outputs is not None:
+            prompt_run = scheduler_outputs.prompt_run
 
-        total_num_cpu_blocks = self.cache_config.num_cpu_blocks
-        if total_num_cpu_blocks > 0:
-            num_free_cpu_blocks = (
-                self.scheduler.block_manager.get_num_free_cpu_blocks())
-            num_used_cpu_blocks = total_num_cpu_blocks - num_free_cpu_blocks
-            cpu_cache_usage = num_used_cpu_blocks / total_num_cpu_blocks
-        else:
-            cpu_cache_usage = 0.0
+            # Number of Tokens.
+            if prompt_run:
+                num_prompt_tokens = scheduler_outputs.num_batched_tokens
+            else:
+                num_generation_tokens = scheduler_outputs.num_batched_tokens
 
-        record_metrics(
-            avg_prompt_throughput=avg_prompt_throughput,
-            avg_generation_throughput=avg_generation_throughput,
-            scheduler_running=len(self.scheduler.running),
-            scheduler_swapped=len(self.scheduler.swapped),
-            scheduler_waiting=len(self.scheduler.waiting),
-            gpu_cache_usage=gpu_cache_usage,
-            cpu_cache_usage=cpu_cache_usage,
-        )
+            # Latency Timings.
+            time_last_iters = []
+            for seq_group in scheduler_outputs.scheduled_seq_groups:
+                # Time since last token. (n.b. updates seq_group.last_token_time)
+                time_last_iters.append(seq_group.get_last_latency(now))
+                # Time since arrival for all finished requests.
+                if seq_group.is_finished():
+                    time_e2e_requests.append(now - seq_group.arrival_time)
 
-        logger.info("Avg prompt throughput: "
-                    f"{avg_prompt_throughput:.1f} tokens/s, "
-                    "Avg generation throughput: "
-                    f"{avg_generation_throughput:.1f} tokens/s, "
-                    f"Running: {len(self.scheduler.running)} reqs, "
-                    f"Swapped: {len(self.scheduler.swapped)} reqs, "
-                    f"Pending: {len(self.scheduler.waiting)} reqs, "
-                    f"GPU KV cache usage: {gpu_cache_usage * 100:.1f}%, "
-                    f"CPU KV cache usage: {cpu_cache_usage * 100:.1f}%")
-        self.last_logging_time = now
+            time_to_first_tokens = time_last_iters if prompt_run else []
+            time_per_output_tokens = [] if prompt_run else time_last_iters
+
+        return Stats(
+            now=now,
+            num_running=num_running,
+            num_swapped=num_swapped,
+            num_waiting=num_waiting,
+            gpu_cache_usage=gpu_cache_usage,
+            cpu_cache_usage=cpu_cache_usage,
+            num_prompt_tokens=num_prompt_tokens,
+            num_generation_tokens=num_generation_tokens,
+            time_to_first_tokens=time_to_first_tokens,
+            time_per_output_tokens=time_per_output_tokens,
+            time_e2e_requests=time_e2e_requests,
+        )
 
     def _decode_sequence(self, seq: Sequence, prms: SamplingParams) -> None:
         """Decodes the new token for a sequence."""
         (new_tokens, new_output_text, prefix_offset,
...
# vllm/engine/metrics.py
-from aioprometheus import Gauge
+from vllm.logger import init_logger
+
+from aioprometheus import Counter, Gauge, Histogram
+
+import time
+import numpy as np
+from typing import List
+from dataclasses import dataclass
+
+logger = init_logger(__name__)
+
+labels = {}
+
+
+def add_global_metrics_labels(**kwargs):
+    labels.update(kwargs)
+
 
 # The begin-* and end* here are used by the documentation generator
 # to extract the metrics definitions.
@@ -9,12 +24,16 @@ gauge_avg_prompt_throughput = Gauge("vllm:avg_prompt_throughput_toks_per_s",
 gauge_avg_generation_throughput = Gauge(
     "vllm:avg_generation_throughput_toks_per_s",
     "Average generation throughput in tokens/s.")
+counter_prompt_tokens = Counter("vllm:prompt_tokens_total",
+                                "Number of prefill tokens processed.")
+counter_generation_tokens = Counter("vllm:generation_tokens_total",
+                                    "Number of generation tokens processed.")
 
 gauge_scheduler_running = Gauge(
     "vllm:num_requests_running",
-    "Number of requests that is currently running for inference.")
+    "Number of requests currently running on GPU.")
 gauge_scheduler_swapped = Gauge("vllm:num_requests_swapped",
-                                "Number requests swapped to CPU.")
+                                "Number of requests swapped to CPU.")
 gauge_scheduler_waiting = Gauge("vllm:num_requests_waiting",
                                 "Number of requests waiting to be processed.")
@@ -24,28 +43,131 @@ gauge_gpu_cache_usage = Gauge(
 gauge_cpu_cache_usage = Gauge(
     "vllm:cpu_cache_usage_perc",
     "CPU KV-cache usage. 1 means 100 percent usage.")
+
+histogram_time_to_first_token = Histogram(
+    "vllm:time_to_first_token_seconds",
+    "Histogram of time to first token in seconds.",
+    buckets=[
+        0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, 0.75, 1.0,
+        2.5, 5.0, 7.5, 10.0
+    ])
+histogram_time_per_output_tokens = Histogram(
+    "vllm:time_per_output_token_seconds",
+    "Histogram of time per output token in seconds.",
+    buckets=[
+        0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.5
+    ])
+histogram_e2e_request_latency = Histogram(
+    "vllm:e2e_request_latency_seconds",
+    "Histogram of end to end request latency in seconds.",
+    buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0])
 # end-metrics-definitions
 
-labels = {}
-
-
-def add_global_metrics_labels(**kwargs):
-    labels.update(kwargs)
-
-
-def record_metrics(
-    avg_prompt_throughput: float,
-    avg_generation_throughput: float,
-    scheduler_running: int,
-    scheduler_swapped: int,
-    scheduler_waiting: int,
-    gpu_cache_usage: float,
-    cpu_cache_usage: float,
-):
-    gauge_avg_prompt_throughput.set(labels, avg_prompt_throughput)
-    gauge_avg_generation_throughput.set(labels, avg_generation_throughput)
-    gauge_scheduler_running.set(labels, scheduler_running)
-    gauge_scheduler_swapped.set(labels, scheduler_swapped)
-    gauge_scheduler_waiting.set(labels, scheduler_waiting)
-    gauge_gpu_cache_usage.set(labels, gpu_cache_usage)
-    gauge_cpu_cache_usage.set(labels, cpu_cache_usage)
+
+@dataclass
+class Stats:
+    """Created by LLMEngine for use by StatLogger."""
+    now: float
+
+    # System stats.
+    num_running: int
+    num_waiting: int
+    num_swapped: int
+    gpu_cache_usage: float
+    cpu_cache_usage: float
+
+    # Raw stats from last model iteration.
+    num_prompt_tokens: int
+    num_generation_tokens: int
+    time_to_first_tokens: List[float]
+    time_per_output_tokens: List[float]
+    time_e2e_requests: List[float]
+
+
+class StatLogger:
+    """StatLogger is used by LLMEngine to log to Prometheus and stdout."""
+
+    def __init__(self, local_interval: float) -> None:
+        # Metadata for logging locally.
+        self.last_local_log = time.monotonic()
+        self.local_interval = local_interval
+
+        # Tracked stats over current local logging interval.
+        self.num_prompt_tokens: List[int] = []
+        self.num_generation_tokens: List[int] = []
+
+    def _get_throughput(self, tracked_stats: List[int], now: float) -> float:
+        return float(np.sum(tracked_stats) / (now - self.last_local_log))
+
+    def _local_interval_elapsed(self, now: float) -> bool:
+        elapsed_time = now - self.last_local_log
+        return elapsed_time > self.local_interval
+
+    def _log_prometheus(self, stats: Stats) -> None:
+        # Set system stat gauges.
+        gauge_scheduler_running.set(labels, stats.num_running)
+        gauge_scheduler_swapped.set(labels, stats.num_swapped)
+        gauge_scheduler_waiting.set(labels, stats.num_waiting)
+        gauge_gpu_cache_usage.set(labels, stats.gpu_cache_usage)
+        gauge_cpu_cache_usage.set(labels, stats.cpu_cache_usage)
+
+        # Add to token counters.
+        counter_prompt_tokens.add(labels, stats.num_prompt_tokens)
+        counter_generation_tokens.add(labels, stats.num_generation_tokens)
+
+        # Observe request level latencies in histograms.
+        for ttft in stats.time_to_first_tokens:
+            histogram_time_to_first_token.observe(labels, ttft)
+        for tpot in stats.time_per_output_tokens:
+            histogram_time_per_output_tokens.observe(labels, tpot)
+        for e2e in stats.time_e2e_requests:
+            histogram_e2e_request_latency.observe(labels, e2e)
+
+    def _log_prometheus_interval(self, prompt_throughput: float,
+                                 generation_throughput: float) -> None:
+        # Logs metrics to prometheus that are computed every logging_interval.
+        # Support legacy gauge metrics that make throughput calculations on the vLLM side.
+        # Moving forward, we should use counters like counter_prompt_tokens and counter_generation_tokens,
+        # which log raw data and calculate summaries using rate() on the Grafana/Prometheus side.
+        # See https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666
+        gauge_avg_prompt_throughput.set(labels, prompt_throughput)
+        gauge_avg_generation_throughput.set(labels, generation_throughput)
+
+    def log(self, stats: Stats) -> None:
+        """Called by LLMEngine.
+        Logs to Prometheus and tracked stats every iteration.
+        Logs to stdout every self.local_interval seconds."""
+
+        # Log to prometheus.
+        self._log_prometheus(stats)
+
+        # Save tracked stats for token counters.
+        self.num_prompt_tokens.append(stats.num_prompt_tokens)
+        self.num_generation_tokens.append(stats.num_generation_tokens)
+
+        # Log locally every local_interval seconds.
+        if self._local_interval_elapsed(stats.now):
+            # Compute summary metrics for tracked stats (and log them to prometheus if applicable).
+            prompt_throughput = self._get_throughput(self.num_prompt_tokens,
+                                                     now=stats.now)
+            generation_throughput = self._get_throughput(
+                self.num_generation_tokens, now=stats.now)
+            self._log_prometheus_interval(
+                prompt_throughput=prompt_throughput,
+                generation_throughput=generation_throughput)
+
+            # Log to stdout.
+            logger.info(
+                f"Avg prompt throughput: {prompt_throughput:.1f} tokens/s, "
+                f"Avg generation throughput: {generation_throughput:.1f} tokens/s, "
+                f"Running: {stats.num_running} reqs, "
+                f"Swapped: {stats.num_swapped} reqs, "
+                f"Pending: {stats.num_waiting} reqs, "
+                f"GPU KV cache usage: {stats.gpu_cache_usage * 100:.1f}%, "
+                f"CPU KV cache usage: {stats.cpu_cache_usage * 100:.1f}%")
+
+            # Reset tracked stats for next interval.
+            self.num_prompt_tokens = []
+            self.num_generation_tokens = []
+            self.last_local_log = stats.now
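To make the new interface concrete, here is a minimal sketch (not vLLM code; dummy values, assuming vLLM and its `aioprometheus` dependency are installed) of the call pattern `LLMEngine` now follows on each engine step:
```python
# Sketch of how LLMEngine drives StatLogger; all values below are dummies.
import time

from vllm.engine.metrics import StatLogger, Stats

stat_logger = StatLogger(local_interval=5)  # mirrors _LOCAL_LOGGING_INTERVAL_SEC

# One prefill iteration's worth of stats, as _get_stats() would assemble them.
stats = Stats(
    now=time.monotonic(),
    num_running=1,
    num_waiting=0,
    num_swapped=0,
    gpu_cache_usage=0.25,
    cpu_cache_usage=0.0,
    num_prompt_tokens=128,
    num_generation_tokens=0,
    time_to_first_tokens=[0.042],   # seconds
    time_per_output_tokens=[],
    time_e2e_requests=[],
)

# Updates the Prometheus gauges/counters/histograms on every call and writes
# the throughput summary to stdout once local_interval seconds have elapsed.
stat_logger.log(stats)
```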
# vllm/sequence.py
@@ -52,7 +52,6 @@ class SequenceStatus(enum.Enum):
 class SequenceData:
     """Data associated with a sequence.
-
     Args:
         prompt_token_ids: The token IDs of the prompt.
@@ -254,6 +253,7 @@ class SequenceGroup:
         self.seqs_dict = {seq.seq_id: seq for seq in seqs}
         self.sampling_params = sampling_params
         self.arrival_time = arrival_time
+        self.last_token_time = arrival_time
         self.lora_request = lora_request
         self.prefix: Optional[Prefix] = prefix
         self.prompt_logprobs: Optional[PromptLogprobs] = None
@@ -274,6 +274,12 @@ class SequenceGroup:
     def lora_int_id(self) -> int:
         return self.lora_request.lora_int_id if self.lora_request else 0
 
+    def get_last_latency(self, now: float) -> float:
+        """Gets last token latency for Request level timings."""
+        latency = now - self.last_token_time
+        self.last_token_time = now
+        return latency
+
     def get_max_num_running_seqs(self) -> int:
         """The maximum number of sequences running in parallel in the remaining
         lifetime of the request."""
...
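The timing side of the change is easiest to see in isolation. The toy below (a hypothetical stand-in class, not `SequenceGroup` itself) reproduces the `get_last_latency` logic: the first call after arrival measures time to first token, and each later call measures time per output token:
```python
# Toy stand-in reproducing the get_last_latency() timing logic.
import time


class TimedRequest:
    """Hypothetical stand-in for SequenceGroup's new timing fields."""

    def __init__(self) -> None:
        self.arrival_time = time.monotonic()
        self.last_token_time = self.arrival_time

    def get_last_latency(self, now: float) -> float:
        latency = now - self.last_token_time
        self.last_token_time = now
        return latency


req = TimedRequest()
time.sleep(0.05)   # pretend the prefill iteration took 50 ms
print("TTFT:", req.get_last_latency(time.monotonic()))
time.sleep(0.02)   # pretend one decode iteration took 20 ms
print("TPOT:", req.get_last_latency(time.monotonic()))
```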