Unverified Commit 5313c2cb authored by Simon Mo, committed by GitHub

Add Production Metrics in Prometheus format (#1890)

parent 5f09cbdb
......@@ -67,6 +67,7 @@ Documentation
serving/deploying_with_triton
serving/deploying_with_docker
serving/serving_with_langchain
serving/metrics
.. toctree::
:maxdepth: 1
......
Production Metrics
==================
vLLM exposes a number of metrics that can be used to monitor the health of the
system. These metrics are exposed via the `/metrics` endpoint on the vLLM
OpenAI-compatible API server.
The following metrics are exposed:
.. literalinclude:: ../../../vllm/engine/metrics.py
:language: python
:start-after: begin-metrics-definitions
:end-before: end-metrics-definitions
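
As a quick sanity check (not part of the documentation page itself), the endpoint can be scraped with plain HTTP once the OpenAI-compatible server is running. The host and port below are assumptions (the server defaults), and the snippet uses the third-party requests package:

import requests

# Hypothetical check of the /metrics endpoint; host/port are assumed defaults, not taken from this commit.
resp = requests.get("http://localhost:8000/metrics")
resp.raise_for_status()

# Print only the vLLM gauges defined in vllm/engine/metrics.py (they all share the "vllm:" prefix).
for line in resp.text.splitlines():
    if line.startswith("vllm:"):
        print(line)
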
......@@ -12,3 +12,4 @@ xformers >= 0.0.22.post7 # Required for CUDA 12.1.
fastapi
uvicorn[standard]
pydantic == 1.10.13 # Required for OpenAI server.
aioprometheus[starlette]
......@@ -7,6 +7,7 @@ from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
SchedulerConfig)
from vllm.core.scheduler import Scheduler, SchedulerOutputs
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.metrics import record_metrics
from vllm.engine.ray_utils import RayWorkerVllm, initialize_cluster, ray
from vllm.logger import init_logger
from vllm.outputs import RequestOutput
......@@ -591,8 +592,8 @@ class LLMEngine:
else:
self.num_generation_tokens.append((now, num_batched_tokens))
elapsed_time = now - self.last_logging_time
if elapsed_time < _LOGGING_INTERVAL_SEC:
should_log = now - self.last_logging_time >= _LOGGING_INTERVAL_SEC
if not should_log:
return
# Discard the old stats.
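
The logging refactor above replaces the inverted interval check with an explicit should_log flag. A standalone sketch of the equivalent guard, with an assumed value for _LOGGING_INTERVAL_SEC (the real constant is defined elsewhere in llm_engine.py and is not shown in this hunk):

import time

_LOGGING_INTERVAL_SEC = 5.0  # assumed value for illustration; the actual constant lives in llm_engine.py

def should_log(last_logging_time: float) -> bool:
    # True once at least _LOGGING_INTERVAL_SEC seconds have passed since the last stats update.
    return time.monotonic() - last_logging_time >= _LOGGING_INTERVAL_SEC
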
......@@ -631,6 +632,16 @@ class LLMEngine:
else:
cpu_cache_usage = 0.0
record_metrics(
avg_prompt_throughput=avg_prompt_throughput,
avg_generation_throughput=avg_generation_throughput,
scheduler_running=len(self.scheduler.running),
scheduler_swapped=len(self.scheduler.swapped),
scheduler_waiting=len(self.scheduler.waiting),
gpu_cache_usage=gpu_cache_usage,
cpu_cache_usage=cpu_cache_usage,
)
logger.info("Avg prompt throughput: "
f"{avg_prompt_throughput:.1f} tokens/s, "
"Avg generation throughput: "
......
from aioprometheus import Gauge
# The begin-* and end-* markers here are used by the documentation generator
# to extract the metrics definitions.
# begin-metrics-definitions
gauge_avg_prompt_throughput = Gauge("vllm:avg_prompt_throughput_toks_per_s",
"Average prefill throughput in tokens/s.")
gauge_avg_generation_throughput = Gauge(
"vllm:avg_generation_throughput_toks_per_s",
"Average generation throughput in tokens/s.")
gauge_scheduler_running = Gauge(
"vllm:num_requests_running",
"Number of requests that is currently running for inference.")
gauge_scheduler_swapped = Gauge("vllm:num_requests_swapped",
"Number requests swapped to CPU.")
gauge_scheduler_waiting = Gauge("vllm:num_requests_waiting",
"Number of requests waiting to be processed.")
gauge_gpu_cache_usage = Gauge(
"vllm:gpu_cache_usage_perc",
"GPU KV-cache usage. 1 means 100 percent usage.")
gauge_cpu_cache_usage = Gauge(
"vllm:cpu_cache_usage_perc",
"CPU KV-cache usage. 1 means 100 percent usage.")
# end-metrics-definitions
labels = {}
def add_global_metrics_labels(**kwargs):
labels.update(kwargs)
def record_metrics(
avg_prompt_throughput: float,
avg_generation_throughput: float,
scheduler_running: int,
scheduler_swapped: int,
scheduler_waiting: int,
gpu_cache_usage: float,
cpu_cache_usage: float,
):
gauge_avg_prompt_throughput.set(labels, avg_prompt_throughput)
gauge_avg_generation_throughput.set(labels, avg_generation_throughput)
gauge_scheduler_running.set(labels, scheduler_running)
gauge_scheduler_swapped.set(labels, scheduler_swapped)
gauge_scheduler_waiting.set(labels, scheduler_waiting)
gauge_gpu_cache_usage.set(labels, gpu_cache_usage)
gauge_cpu_cache_usage.set(labels, cpu_cache_usage)
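
For clarity, a minimal sketch (not part of this commit) of how the two helpers are meant to be used together, mirroring the calls the rest of the diff adds to api_server.py and llm_engine.py; the model name and metric values here are placeholders:

from vllm.engine.metrics import add_global_metrics_labels, record_metrics

# Register labels once at startup; they are attached to every gauge sample.
add_global_metrics_labels(model_name="facebook/opt-125m")  # placeholder model name

# Periodically push the current stats into the gauges (the engine does this on its logging interval).
record_metrics(
    avg_prompt_throughput=512.0,      # placeholder values
    avg_generation_throughput=128.0,
    scheduler_running=2,
    scheduler_swapped=0,
    scheduler_waiting=1,
    gpu_cache_usage=0.35,
    cpu_cache_usage=0.0,
)
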
......@@ -9,6 +9,8 @@ import time
from http import HTTPStatus
from typing import AsyncGenerator, Dict, List, Optional, Tuple, Union
from aioprometheus import MetricsMiddleware
from aioprometheus.asgi.starlette import metrics
import fastapi
import uvicorn
from fastapi import Request
......@@ -18,6 +20,7 @@ from fastapi.responses import JSONResponse, StreamingResponse, Response
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.metrics import add_global_metrics_labels
from vllm.entrypoints.openai.protocol import (
CompletionRequest, CompletionResponse, CompletionResponseChoice,
CompletionResponseStreamChoice, CompletionStreamResponse,
......@@ -82,6 +85,10 @@ def parse_args():
return parser.parse_args()
app.add_middleware(MetricsMiddleware) # Trace HTTP server metrics
app.add_route("/metrics", metrics) # Exposes HTTP metrics
def create_error_response(status_code: HTTPStatus,
message: str) -> JSONResponse:
return JSONResponse(ErrorResponse(message=message,
......@@ -722,6 +729,9 @@ if __name__ == "__main__":
trust_remote_code=engine_model_config.trust_remote_code)
load_chat_template(args, tokenizer)
# Register labels for metrics
add_global_metrics_labels(model_name=engine_args.model)
uvicorn.run(app,
host=args.host,
port=args.port,
......
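
Taken together, the server-side wiring is small. The following self-contained sketch (an assumption-laden example outside vLLM, not this commit's code) shows the same aioprometheus pattern used above on a bare FastAPI app, with MetricsMiddleware tracking per-request HTTP metrics and the metrics handler serving everything in the default registry:

import fastapi
import uvicorn
from aioprometheus import MetricsMiddleware
from aioprometheus.asgi.starlette import metrics

app = fastapi.FastAPI()
app.add_middleware(MetricsMiddleware)  # records HTTP request metrics per route
app.add_route("/metrics", metrics)     # serves all registered metrics in Prometheus text format

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)  # assumed host/port for the sketch
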