Unverified Commit be001a58 authored by Keiven C's avatar Keiven C Committed by GitHub
Browse files

chore: Remove deprecated components/metrics and references (#3475)


Signed-off-by: default avatarKeiven Chang <keivenchang@users.noreply.github.com>
parent f712653e
...@@ -4551,25 +4551,6 @@ dependencies = [ ...@@ -4551,25 +4551,6 @@ dependencies = [
"paste", "paste",
] ]
[[package]]
name = "metrics"
version = "0.5.1"
dependencies = [
"axum 0.8.4",
"clap 4.5.48",
"dynamo-llm",
"dynamo-runtime",
"futures",
"prometheus",
"rand 0.9.2",
"reqwest 0.12.23",
"serde",
"serde_json",
"thiserror 2.0.16",
"tokio",
"tracing",
]
[[package]] [[package]]
name = "mime" name = "mime"
version = "0.3.17" version = "0.3.17"
......
...@@ -3,7 +3,6 @@ ...@@ -3,7 +3,6 @@
[workspace] [workspace]
members = [ members = [
"components/metrics",
"launch/dynamo-run", "launch/dynamo-run",
"lib/llm", "lib/llm",
"lib/runtime", "lib/runtime",
...@@ -18,7 +17,6 @@ members = [ ...@@ -18,7 +17,6 @@ members = [
# - launch/dynamo-run # - launch/dynamo-run
# - lib/engines/* # - lib/engines/*
default-members = [ default-members = [
"components/metrics",
"lib/llm", "lib/llm",
"lib/runtime", "lib/runtime",
"lib/tokens", "lib/tokens",
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[package]
name = "metrics"
version.workspace = true
edition.workspace = true
authors.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
[dependencies]
dynamo-llm = { workspace = true }
dynamo-runtime = { workspace = true }
futures = { workspace = true }
prometheus = { workspace = true }
rand = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
axum = { version = "0.8" }
clap = { version = "4.5", features = ["derive", "env"] }
reqwest = { version = "0.12.22", default-features = false, features = ["json", "rustls-tls"] }
# Metrics
⚠️ **DEPRECATION NOTICE** ⚠️
**This `metrics` component is unmaintained and being deprecated.**
The deprecated `metrics` component is being replaced by the **`MetricsRegistry`** built-in functionality that is now available directly in the `DistributedRuntime` framework. The `MetricsRegistry` provides:
**For new projects and existing deployments, please migrate to using `MetricsRegistry` instead of this component.**
This component may be migrated to the MetricsRegistry in the future.
**📖 See the [Dynamo MetricsRegistry Guide](../../docs/guides/metrics.md) for detailed information on using the new metrics system.**
---
The deprecated `metrics` component is a utility for collecting, aggregating, and publishing metrics from a Dynamo deployment, but it is unmaintained and being deprecated in favor of `MetricsRegistry`.
**Note**: This is a demo implementation. The deprecated `metrics` component is no longer under active development.
- In this demo the metrics names use the prefix "llm", but in production they will be prefixed with "dynamo" (e.g., the HTTP `/metrics` endpoint will serve metrics with "dynamo" prefixes)
<div align="center">
<img src="images/dynamo_metrics_grafana.png" alt="Dynamo Metrics Dashboard"/>
</div>
## Quickstart
To start the deprecated `metrics` component, simply point it at the `namespace/component/endpoint`
trio for the Dynamo workers that you're interested in monitoring metrics on.
This will:
1. Collect statistics from workers associated with that `namespace/component/endpoint`
2. Postprocess and aggregate those statistics across the workers
3. Publish them on a Prometheus-compatible metrics endpoint
For example:
```bash
# Default namespace is "dynamo", but can be configured with --namespace
# For more detailed output, try setting the env var: DYN_LOG=debug
metrics --component MyComponent --endpoint my_endpoint
# 2025-03-17T00:07:05.202558Z INFO metrics: Scraping endpoint dynamo/MyComponent/my_endpoint for stats
# 2025-03-17T00:07:05.202955Z INFO metrics: Prometheus metrics server started at 0.0.0.0:9091/metrics
# ...
```
With no matching endpoints running to collect stats from, you should see warnings in the logs:
```bash
2025-03-17T00:07:06.204756Z WARN metrics: No endpoints found matching dynamo/MyComponent/my_endpoint
```
After a worker with a matching endpoint gets started, the endpoint
will get automatically discovered and the warnings will stop.
## Workers
The deprecated `metrics` component needs running workers to gather metrics from,
so below are some examples of workers and how they can be monitored.
### Mock Worker
To try out how the deprecated `metrics` component works, there is a demo Rust-based
[mock worker](src/bin/mock_worker.rs) that provides sample data through two mechanisms:
1. Exposes a stats handler at `dynamo/MyComponent/my_endpoint` that responds to polling requests (from the deprecated `metrics` component) with randomly generated `ForwardPassMetrics` data
2. Publishes mock `KVHitRateEvent` data every second to demonstrate event-based metrics
Step 1: Launch a mock worker via the following command (if already built):
```bash
# or build/run from source: DYN_LOG=DEBUG cargo run --bin mock_worker
mock_worker
# 2025-03-16T23:49:28.101668Z INFO mock_worker: Starting Mock Worker on Endpoint: dynamo/MyComponent/my_endpoint
```
Step 2: Start the deprecated `metrics` component to monitor the mock worker. It serves its own
Prometheus endpoint at `/metrics` on port 9091 (the default when `--port` is not specified):
```bash
metrics --component MyComponent --endpoint my_endpoint
```
### Real Worker
To run a more realistic deployment to gather metrics:
```bash
python -m dynamo.frontend &
python -m dynamo.vllm --model-path <your-model-checkout>
```
Then, to monitor the metrics of these VllmWorkers, run:
```bash
metrics --component backend --endpoint load_metrics
```
**NOTE**: `load_metrics` is currently a
[hard-coded](https://github.com/ai-dynamo/dynamo/blob/d5220c7b1151372ba3d2a061c7d0a7ed72724789/lib/llm/src/kv_router/publisher.rs#L108)
endpoint name used for python-based workers that register a `WorkerMetricsPublisher`.
## Visualization
To visualize the metrics being exposed on the Prometheus endpoint,
see the Prometheus and Grafana configurations in
[deploy/metrics](../../deploy/metrics):
```bash
docker compose -f deploy/docker-compose.yml --profile metrics up -d
```
## Metrics Collection Modes
The deprecated `metrics` component supports two modes for exposing metrics in a Prometheus format:
### Pull Mode (Default)
When running in pull mode (the default), the deprecated `metrics` component will expose a
Prometheus metrics endpoint on the specified host and port that a
Prometheus server or curl client can pull from:
```bash
# Start metrics server on default host (0.0.0.0) and port (9091)
metrics --component MyComponent --endpoint my_endpoint
# Or specify a custom port
metrics --component MyComponent --endpoint my_endpoint --port 9092
```
In pull mode:
- The `--host` parameter must be a valid IPv4 or IPv6 address (e.g., "0.0.0.0", "127.0.0.1")
- The `--port` parameter specifies which port the HTTP server will listen on
You can then query the metrics using:
```bash
curl localhost:9091/metrics
# # HELP llm_kv_blocks_active Active KV cache blocks
# # TYPE llm_kv_blocks_active gauge
# llm_kv_blocks_active{component="MyComponent",endpoint="my_endpoint",worker_id="7587884888253033398"} 40
# llm_kv_blocks_active{component="MyComponent",endpoint="my_endpoint",worker_id="7587884888253033401"} 2
# # HELP llm_kv_blocks_total Total KV cache blocks
# # TYPE llm_kv_blocks_total gauge
# llm_kv_blocks_total{component="MyComponent",endpoint="my_endpoint",worker_id="7587884888253033398"} 100
# llm_kv_blocks_total{component="MyComponent",endpoint="my_endpoint",worker_id="7587884888253033401"} 100
```
### Push Mode
For ephemeral or batch jobs, or when metrics need to be pushed through a firewall,
you can use Push mode. In this mode, the deprecated `metrics` component will periodically push
metrics to an externally hosted
[Prometheus PushGateway](https://prometheus.io/docs/instrumenting/pushing/).
Start a prometheus push gateway service via docker:
```bash
docker run --rm -d -p 9091:9091 --name pushgateway prom/pushgateway
```
Start the deprecated `metrics` component in `--push` mode, specifying the host and port of your PushGateway:
```bash
# Push metrics to a Prometheus PushGateway every --push-interval seconds
metrics \
--component MyComponent \
--endpoint my_endpoint \
--host 127.0.0.1 \
--port 9091 \
--push
```
When using Push mode:
- The `--host` parameter must be a valid IPv4 or IPv6 address (e.g., "0.0.0.0", "127.0.0.1")
that the Prometheus PushGateway is running on
- The `--port` parameter specifies the port of the Prometheus PushGateway
- The push interval can be configured with `--push-interval` (default: 2 seconds)
- A default job name of "dynamo_metrics" is used for the Prometheus job label
- Metrics persist in the PushGateway until explicitly deleted
- Prometheus should be configured to scrape the PushGateway with `honor_labels: true`
To view the metrics hosted on the PushGateway:
```bash
# View all metrics
# curl http://<pushgateway_ip>:<pushgateway_port>/metrics
curl 127.0.0.1:9091/metrics
```
## Building/Running from Source
For easy iteration while making edits to the deprecated `metrics` component, you can use `cargo run`
to build and run with your local changes:
```bash
cargo run --bin metrics -- --component MyComponent --endpoint my_endpoint
```
File suppressed by a .gitattributes entry or the file's encoding is unsupported.
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use dynamo_llm::kv_router::{
KV_HIT_RATE_SUBJECT,
protocols::{ForwardPassMetrics, KvStats, WorkerStats},
scheduler::KVHitRateEvent,
};
use dynamo_runtime::{
DistributedRuntime, Result, Runtime, Worker,
component::{Namespace, service::EndpointStats},
logging,
pipeline::{
AsyncEngine, AsyncEngineContextProvider, Error, ManyOut, ResponseStream, SingleIn,
async_trait, network::Ingress,
},
protocols::annotated::Annotated,
stream,
traits::events::EventPublisher,
};
use rand::Rng;
use std::sync::Arc;
use tokio::time::{Duration, interval};
/// Process entry point for the mock worker: initializes logging, builds a `Worker`
/// from settings, and hands `app` to it for execution.
fn main() -> Result<()> {
    logging::init();
    Worker::from_settings()?.execute(app)
}
/// Async body run by the worker: builds a `DistributedRuntime` from settings on top of
/// the given runtime, then runs the mock `backend` on it.
async fn app(runtime: Runtime) -> Result<()> {
    let drt = DistributedRuntime::from_settings(runtime.clone()).await?;
    backend(drt).await
}
/// Stateless request handler used by the mock worker.
struct MockRequestHandler {}

impl MockRequestHandler {
    /// Creates a handler that is already wrapped in an `Arc`.
    fn new() -> Arc<Self> {
        let handler = MockRequestHandler {};
        Arc::new(handler)
    }
}
#[async_trait]
impl AsyncEngine<SingleIn<String>, ManyOut<Annotated<String>>, Error> for MockRequestHandler {
    /// Echoes the request text back as a response stream, emitting one `Annotated`
    /// item per character of the input string.
    async fn generate(&self, input: SingleIn<String>) -> Result<ManyOut<Annotated<String>>> {
        let (text, ctx) = input.into_parts();

        // Materialize every character as its own single-character response item.
        let mut items = Vec::new();
        for ch in text.chars() {
            items.push(Annotated::from_data(ch.to_string()));
        }

        let responses = stream::iter(items);
        Ok(ResponseStream::new(Box::pin(responses), ctx.context()))
    }
}
// FIXME: These events are just for testing and may not currently be used.
/// Publishes a mock `KVHitRateEvent` on `KV_HIT_RATE_SUBJECT` once per second, forever.
///
/// The function never returns; callers are expected to run it as a background task.
/// A publish failure is logged as a warning and the loop keeps going.
async fn mock_event_publisher(namespace: Namespace) {
    // NOTE: These events are just for testing, and shouldn't be interpreted
    // in correlation with the stats handler's data:
    // 1. The worker ID associated with the events here won't match the
    //    worker ID of the endpoint's service stats handler.
    // 2. These events aren't coming through the KV Router, so the metrics won't
    //    be reflective of the KV Router's performance.
    // 3. The data in these events aren't in sync with the stats handler's
    //    ForwardPassMetrics data, so they may not correlate well.
    let worker_id = rand::rng().random_range(1..=1000);

    let mut interval = interval(Duration::from_secs(1));
    loop {
        interval.tick().await;

        // Generate a random KV hit rate event; overlap never exceeds the ISL block count.
        let isl_blocks = rand::rng().random_range(0..=100);
        let overlap_blocks = rand::rng().random_range(0..=isl_blocks);

        let event = KVHitRateEvent {
            worker_id,
            isl_blocks,
            overlap_blocks: overlap_blocks as u32,
        };

        if let Err(e) = namespace.publish(KV_HIT_RATE_SUBJECT, &event).await {
            tracing::warn!("Failed to publish KV hit rate event: {e}");
        } else {
            // `isl_blocks` may legitimately be 0; report a 0% hit rate in that case
            // instead of logging the NaN that 0.0 / 0.0 would produce.
            let hit_rate = if isl_blocks == 0 {
                0.0
            } else {
                (overlap_blocks as f64 / isl_blocks as f64) * 100.0
            };
            tracing::debug!(
                "Published KV hit rate event: worker_id={worker_id}, isl_blocks={isl_blocks}, overlap_blocks={overlap_blocks}, hit_rate={:.2}%",
                hit_rate
            );
        }
    }
}
/// Stats-handler callback for the mock worker.
///
/// Ignores the incoming endpoint stats and returns randomly generated
/// `ForwardPassMetrics`, serialized to a JSON value.
fn mock_stats_handler(_stats: EndpointStats) -> serde_json::Value {
    let mut rng = rand::rng();

    // Capacities are fixed; the "active" counterparts are drawn within them.
    let request_total_slots = 100;
    let request_active_slots = rng.random_range(0..=request_total_slots);
    let kv_total_blocks = 100;
    let kv_active_blocks = rng.random_range(0..=kv_total_blocks);
    let num_requests_waiting = rng.random_range(0..=100);
    let gpu_cache_usage_perc = rng.random_range(0.0..=1.0);
    let gpu_prefix_cache_hit_rate = rng.random_range(0.0..=1.0);

    let stats = ForwardPassMetrics {
        worker_stats: WorkerStats {
            data_parallel_rank: None, // Default for backwards compatibility
            request_active_slots,
            request_total_slots,
            num_requests_waiting,
        },
        kv_stats: KvStats {
            kv_active_blocks,
            kv_total_blocks,
            gpu_cache_usage_perc,
            gpu_prefix_cache_hit_rate,
        },
        spec_decode_stats: None,
    };

    tracing::info!("Stats: {stats:?}");
    serde_json::to_value(stats).unwrap()
}
/// Brings up the mock worker: creates the `MyComponent` service in the `dynamo`
/// namespace, starts the background KV hit rate event publisher, and serves
/// `my_endpoint` with the mock request handler and mock stats handler attached.
async fn backend(runtime: DistributedRuntime) -> Result<()> {
    let namespace = runtime.namespace("dynamo")?;

    // A service has to be created first; one or more endpoints can then be attached to it.
    let my_component = namespace.component("MyComponent")?;
    let component = my_component.service_builder().create().await?;

    let endpoint = component.endpoint("my_endpoint");
    tracing::info!("Starting Mock Worker on Endpoint: {}", endpoint.path());

    // Publish mock KV hit rate events from a detached background task.
    tokio::spawn(mock_event_publisher(namespace.clone()));

    // Wrap the mock engine in an ingress so the endpoint can serve requests with it.
    let ingress = Ingress::for_engine(MockRequestHandler::new())?;

    // Make the ingress discoverable via a component service. The stats handler is a
    // dummy that demonstrates how to attach a custom one.
    endpoint
        .endpoint_builder()
        .stats_handler(mock_stats_handler)
        .handler(ingress)
        .start()
        .await
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Library functions for the metrics application.
//!
//! This library provides functionality to expose Prometheus metrics either through a local HTTP server
//! or by pushing to a Prometheus PushGateway.
//!
//! # Examples
//!
//! ## Using the metrics pull mode
//! ```no_run
//! use metrics::{PrometheusMetricsCollector, MetricsMode};
//!
//! #[tokio::main]
//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
//! let mut collector = PrometheusMetricsCollector::new()?;
//!
//! // Start a metrics server with default values
//! collector.start(MetricsMode::default())?;
//!
//! // Or explicitly specify values
//! collector.start(MetricsMode::Pull {
//! host: "127.0.0.1".to_string(),
//! port: 9090,
//! })?;
//!
//! // Or use the convenience constructor
//! collector.start(MetricsMode::new_pull())?;
//!
//! // Your application code here
//! tokio::signal::ctrl_c().await?;
//!
//! // Stop the metrics server gracefully
//! collector.stop();
//! Ok(())
//! }
//! ```
//!
//! ## Using the Push mode
//! ```no_run
//! use metrics::{PrometheusMetricsCollector, MetricsMode};
//!
//! #[tokio::main]
//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
//! let mut collector = PrometheusMetricsCollector::new()?;
//!
//! // Start pushing metrics to a Prometheus PushGateway with default values
//! collector.start(MetricsMode::new_push())?;
//!
//! // Or explicitly specify values
//! collector.start(MetricsMode::Push {
//! host: "127.0.0.1".to_string(),
//! port: 9091,
//! job: "custom_job".to_string(),
//! interval: 5, // Push every 5 seconds
//! })?;
//!
//! // Your application code here
//! tokio::signal::ctrl_c().await?;
//!
//! // Stop pushing metrics gracefully
//! collector.stop();
//! Ok(())
//! }
//! ```
use axum::{Router, routing::get};
use prometheus::{Encoder, TextEncoder, register_counter_vec, register_gauge_vec};
use reqwest::Client;
use serde::{Deserialize, Serialize};
use std::net::SocketAddr;
use std::time::Duration as StdDuration;
use dynamo_llm::kv_router::protocols::{ForwardPassMetrics, LoadMetrics};
use dynamo_llm::kv_router::scoring::Endpoint;
use dynamo_llm::kv_router::scoring::ProcessedEndpoints;
use dynamo_runtime::{
Result, distributed::Component, error, service::EndpointInfo, utils::Duration,
};
/// Configuration for metrics collection mode.
///
/// `Pull` hosts an HTTP `/metrics` endpoint in-process; `Push` periodically posts the
/// gathered metrics to a Prometheus PushGateway.
#[derive(Debug, Clone)]
pub enum MetricsMode {
/// Host a Prometheus metrics server for pull-based collection
Pull {
/// IP address to listen on (e.g. "0.0.0.0"); it is parsed as an IP address when
/// the server is started, so hostnames are rejected
host: String,
/// Port to listen on (e.g. 9091)
port: u16,
},
/// Push to a Prometheus PushGateway
Push {
/// PushGateway host without a URL scheme (e.g. "127.0.0.1"); "http://" is
/// prepended when the push URL is built
host: String,
/// PushGateway port (e.g. 9091)
port: u16,
/// Job name for the metrics; used as the last path segment of the push URL
job: String,
/// Push interval in seconds
interval: u64,
},
}
impl Default for MetricsMode {
fn default() -> Self {
Self::new_pull()
}
}
impl MetricsMode {
    /// Pull mode listening on `0.0.0.0:9091`.
    pub fn new_pull() -> Self {
        let host = String::from("0.0.0.0");
        let port = 9091;
        Self::Pull { host, port }
    }

    /// Push mode targeting a PushGateway at `127.0.0.1:9091`, using the job name
    /// `dynamo_metrics` and a 2-second push interval.
    pub fn new_push() -> Self {
        let host = String::from("127.0.0.1");
        let job = String::from("dynamo_metrics");
        Self::Push {
            host,
            port: 9091,
            job,
            interval: 2,
        }
    }
}
/// Configuration for LLM worker load capacity metrics.
///
/// Identifies the component/endpoint pair whose workers are being monitored. The
/// component and endpoint names are used as the `component` and `endpoint` label
/// values on the Prometheus metrics recorded by this crate.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LLMWorkerLoadCapacityConfig {
/// Name of the monitored component (value of the `component` metric label)
pub component_name: String,
/// Name of the monitored endpoint (value of the `endpoint` metric label)
pub endpoint_name: String,
/// Optional model name for the target component
// NOTE(review): not read by the metric-recording code in this file — confirm whether it is still needed.
pub model_name: Option<String>,
}
/// Metrics collector for exposing metrics to prometheus/grafana.
///
/// Owns the registered Prometheus metrics and the shutdown handle of whichever
/// background task (pull server or push loop) was started most recently.
pub struct PrometheusMetricsCollector {
// The registered gauges/counters that `update` and `update_kv_hit_rate` write to.
metrics: PrometheusMetrics,
// The mode most recently passed to `start`; `None` until `start` is called.
mode: Option<MetricsMode>,
// Sender half of the shutdown channel for the running background task;
// taken and fired by `stop`.
shutdown_tx: Option<tokio::sync::oneshot::Sender<()>>,
}
impl PrometheusMetricsCollector {
pub fn new() -> Result<Self> {
Ok(Self {
metrics: PrometheusMetrics::new()?,
mode: None,
shutdown_tx: None,
})
}
/// Start metrics collection with the specified mode
pub fn start(&mut self, mode: MetricsMode) -> Result<()> {
// Store the mode
self.mode = Some(mode.clone());
match mode {
MetricsMode::Pull { host, port } => self.start_pull_mode(host, port),
MetricsMode::Push {
host,
port,
job,
interval,
} => self.start_push_mode(host, port, job, interval),
}
}
/// Stop metrics collection
pub fn stop(&mut self) {
if let Some(tx) = self.shutdown_tx.take() {
let _ = tx.send(());
}
}
/// Start a metrics server for pull-based collection on the specified port
fn start_pull_mode(&mut self, host: String, port: u16) -> Result<()> {
// Create an axum router with a metrics endpoint
let app = Router::new().route(
"/metrics",
get(|| async {
// Gather and encode metrics
let encoder = TextEncoder::new();
let mut buffer = Vec::new();
encoder.encode(&prometheus::gather(), &mut buffer).unwrap();
String::from_utf8(buffer).unwrap()
}),
);
// Create a socket address to listen on
let ip_addr = host.parse().map_err(|e| {
error!("Failed to parse host '{}' as IP address: {}. Use a valid IPv4 or IPv6 address (e.g. '0.0.0.0' or '127.0.0.1')", host, e)
})?;
let addr = SocketAddr::new(ip_addr, port);
// Create shutdown channel
let (tx, rx) = tokio::sync::oneshot::channel();
self.shutdown_tx = Some(tx);
// Spawn the server in a background task
tokio::spawn(async move {
let listener = tokio::net::TcpListener::bind(addr)
.await
.unwrap_or_else(|_| panic!("could not bind to address: {addr}"));
let server = axum::serve(listener, app);
// Create a future that completes when shutdown signal is received
let shutdown_future = async {
rx.await.ok();
};
// Run the server with graceful shutdown
tokio::select! {
result = server => {
if let Err(e) = result {
tracing::error!("Metrics server error: {}", e);
}
},
_ = shutdown_future => {
tracing::info!("Metrics server shutting down gracefully");
},
}
});
tracing::info!("Prometheus metrics server started at {addr}/metrics");
Ok(())
}
/// Start pushing metrics to a Prometheus PushGateway
fn start_push_mode(
&mut self,
host: String,
port: u16,
job: String,
interval: u64,
) -> Result<()> {
// Create shutdown channel
let (tx, mut rx) = tokio::sync::oneshot::channel();
self.shutdown_tx = Some(tx);
// Create HTTP client
let client = Client::new();
let url = format!("http://{host}:{port}/metrics/job/{job}");
let url_clone = url.clone();
let interval_duration = StdDuration::from_secs(interval);
// Spawn background task to periodically push metrics
tokio::spawn(async move {
let mut interval = tokio::time::interval(interval_duration);
loop {
tokio::select! {
_ = interval.tick() => {
// Gather and encode metrics
let encoder = TextEncoder::new();
let mut buffer = Vec::new();
if let Err(e) = encoder.encode(&prometheus::gather(), &mut buffer) {
tracing::error!("Failed to encode metrics: {}", e);
continue;
}
// Push metrics to the gateway
match client.post(&url)
.header("Content-Type", encoder.format_type())
.body(buffer)
.send()
.await
{
Ok(response) => {
if response.status().is_success() {
tracing::debug!("Successfully pushed metrics to PushGateway");
} else {
tracing::error!(
"Failed to push metrics to PushGateway. Status: {}, Error: {:?}",
response.status(),
response.text().await
);
}
}
Err(e) => {
tracing::error!("Failed to push metrics to PushGateway: {}", e);
}
}
}
_ = &mut rx => {
tracing::info!("Stopping metrics push task");
break;
}
}
}
});
tracing::info!(
"Started pushing metrics to PushGateway at '{url_clone}' with job name '{job}'"
);
Ok(())
}
/// Update metrics with current values
pub fn update(&mut self, config: &LLMWorkerLoadCapacityConfig, processed: &ProcessedEndpoints) {
self.metrics.update(config, processed);
}
/// Update KV hit rate metrics
pub fn update_kv_hit_rate(
&mut self,
config: &LLMWorkerLoadCapacityConfig,
worker_id: i64,
isl_blocks: usize,
overlap_blocks: usize,
) {
self.metrics
.update_kv_hit_rate(config, worker_id, isl_blocks, overlap_blocks);
}
}
/// Prometheus metrics collection.
///
/// Holds the gauge and counter vectors registered by `PrometheusMetrics::new`.
/// Per-worker metrics are labelled with `component`, `endpoint` and `worker_id`;
/// the load aggregates are labelled with `component` and `endpoint` only.
pub struct PrometheusMetrics {
// Per-worker gauge `llm_kv_blocks_active`
kv_blocks_active: prometheus::GaugeVec,
// Per-worker gauge `llm_kv_blocks_total`
kv_blocks_total: prometheus::GaugeVec,
// Per-worker gauge `llm_requests_active_slots`
requests_active: prometheus::GaugeVec,
// Per-worker gauge `llm_requests_total_slots`
requests_total: prometheus::GaugeVec,
// Aggregate gauge `llm_load_avg` (component/endpoint labels only)
load_avg: prometheus::GaugeVec,
// Aggregate gauge `llm_load_std` (component/endpoint labels only)
load_std: prometheus::GaugeVec,
// KV hit rate metrics
// Per-worker gauge `llm_kv_hit_rate_percent`
kv_hit_rate_percent: prometheus::GaugeVec,
// FIXME: These are currently unused outside of mock_worker
// Per-worker counters `llm_kv_hit_rate_isl_blocks` / `llm_kv_hit_rate_overlap_blocks`
kv_hit_rate_isl_blocks: prometheus::CounterVec,
kv_hit_rate_overlap_blocks: prometheus::CounterVec,
}
impl PrometheusMetrics {
    /// Initialize all metrics by registering them with the default Prometheus registry.
    ///
    /// Returns an error if any registration fails.
    fn new() -> Result<Self> {
        Ok(Self {
            kv_blocks_active: register_gauge_vec!(
                "llm_kv_blocks_active",
                "Active KV cache blocks",
                &["component", "endpoint", "worker_id"]
            )?,
            kv_blocks_total: register_gauge_vec!(
                "llm_kv_blocks_total",
                "Total KV cache blocks",
                &["component", "endpoint", "worker_id"]
            )?,
            requests_active: register_gauge_vec!(
                "llm_requests_active_slots",
                "Active request slots",
                &["component", "endpoint", "worker_id"]
            )?,
            requests_total: register_gauge_vec!(
                "llm_requests_total_slots",
                "Total request slots",
                &["component", "endpoint", "worker_id"]
            )?,
            load_avg: register_gauge_vec!(
                "llm_load_avg",
                "Average load across workers",
                &["component", "endpoint"]
            )?,
            load_std: register_gauge_vec!(
                "llm_load_std",
                "Load standard deviation across workers",
                &["component", "endpoint"]
            )?,
            // KV hit rate (ForwardPassMetrics)
            kv_hit_rate_percent: register_gauge_vec!(
                "llm_kv_hit_rate_percent",
                "KV hit rate percentage per worker",
                &["component", "endpoint", "worker_id"]
            )?,
            // FIXME: Cleanup/remove event based metrics after finalizing
            //        metrics collection approach with vllm/trtllm workers.
            // Event-based KV hit rate metrics (not currently used outside mock worker)
            kv_hit_rate_isl_blocks: register_counter_vec!(
                "llm_kv_hit_rate_isl_blocks",
                "Cumulative count of ISL blocks in KV hit rate events",
                &["component", "endpoint", "worker_id"]
            )?,
            kv_hit_rate_overlap_blocks: register_counter_vec!(
                "llm_kv_hit_rate_overlap_blocks",
                "Cumulative count of overlapping blocks in KV hit rate events",
                &["component", "endpoint", "worker_id"]
            )?,
        })
    }

    /// Helper method to set a gauge with worker-specific labels (3 labels)
    fn set_worker_gauge(
        &self,
        gauge: &prometheus::GaugeVec,
        config: &LLMWorkerLoadCapacityConfig,
        worker_id: &String,
        value: f64,
    ) {
        gauge
            .with_label_values(&[&config.component_name, &config.endpoint_name, worker_id])
            .set(value);
    }

    /// Helper method to increment a counter with worker-specific labels (3 labels)
    fn increment_worker_counter(
        &self,
        counter: &prometheus::CounterVec,
        config: &LLMWorkerLoadCapacityConfig,
        worker_id: &String,
        value: f64,
    ) {
        counter
            .with_label_values(&[&config.component_name, &config.endpoint_name, worker_id])
            .inc_by(value);
    }

    /// Helper method to set a gauge with component/endpoint labels only (2 labels)
    fn set_endpoint_gauge(
        &self,
        gauge: &prometheus::GaugeVec,
        config: &LLMWorkerLoadCapacityConfig,
        value: f64,
    ) {
        gauge
            .with_label_values(&[&config.component_name, &config.endpoint_name])
            .set(value);
    }

    /// Update metrics with current values.
    ///
    /// Sets the per-worker gauges for every endpoint carrying `EngineLoadMetrics`, then
    /// the aggregate load gauges. Endpoints carrying any other kind of load metrics are
    /// skipped with a warning rather than aborting the whole update.
    fn update(&self, config: &LLMWorkerLoadCapacityConfig, processed: &ProcessedEndpoints) {
        // Update per-worker metrics
        for (worker_id, endpoint) in processed.endpoints.iter() {
            let worker_id = worker_id.to_string();

            // Borrow the payload instead of cloning it; only a few numeric fields are read.
            let LoadMetrics::EngineLoadMetrics(metrics) = &endpoint.data else {
                tracing::warn!(
                    "Skipping worker {worker_id}: can only update with ForwardPassMetrics"
                );
                continue;
            };

            self.set_worker_gauge(
                &self.kv_blocks_active,
                config,
                &worker_id,
                metrics.kv_stats.kv_active_blocks as f64,
            );
            self.set_worker_gauge(
                &self.kv_blocks_total,
                config,
                &worker_id,
                metrics.kv_stats.kv_total_blocks as f64,
            );
            self.set_worker_gauge(
                &self.requests_active,
                config,
                &worker_id,
                metrics.worker_stats.request_active_slots as f64,
            );
            self.set_worker_gauge(
                &self.requests_total,
                config,
                &worker_id,
                metrics.worker_stats.request_total_slots as f64,
            );
            // NOTE(review): the gauge is named/described as a percentage, but the value is
            // written unscaled; the mock worker generates it in 0.0..=1.0. Confirm the
            // intended unit before relying on this metric.
            self.set_worker_gauge(
                &self.kv_hit_rate_percent,
                config,
                &worker_id,
                metrics.kv_stats.gpu_prefix_cache_hit_rate as f64,
            );
        }

        // Update aggregate metrics
        self.set_endpoint_gauge(&self.load_avg, config, processed.load_avg);
        self.set_endpoint_gauge(&self.load_std, config, processed.load_std);
    }

    /// Update KV hit rate metrics.
    ///
    /// Adds `isl_blocks` and `overlap_blocks` to the worker's cumulative counters and
    /// logs (at debug level) the cumulative hit rate derived from them.
    pub fn update_kv_hit_rate(
        &self,
        config: &LLMWorkerLoadCapacityConfig,
        worker_id: i64,
        isl_blocks: usize,
        overlap_blocks: usize,
    ) {
        let worker_id_str = worker_id.to_string();

        // Increment the ISL blocks and overlap blocks counters
        self.increment_worker_counter(
            &self.kv_hit_rate_isl_blocks,
            config,
            &worker_id_str,
            isl_blocks as f64,
        );
        self.increment_worker_counter(
            &self.kv_hit_rate_overlap_blocks,
            config,
            &worker_id_str,
            overlap_blocks as f64,
        );

        // TODO: The cumulative hit rate percentage can probably be computed by consumers
        // of Prometheus metrics like Grafana instead, but we'll compute it here for now
        // for convenient debugging/logging.
        // Read back the cumulative counter values for this worker
        let cumulative_isl = self
            .kv_hit_rate_isl_blocks
            .with_label_values(&[
                &config.component_name,
                &config.endpoint_name,
                &worker_id_str,
            ])
            .get();

        let cumulative_overlap = self
            .kv_hit_rate_overlap_blocks
            .with_label_values(&[
                &config.component_name,
                &config.endpoint_name,
                &worker_id_str,
            ])
            .get();

        // Guard against dividing by zero before any ISL blocks have been counted.
        if cumulative_isl > 0.0 {
            let cumulative_hit_rate = (cumulative_overlap / cumulative_isl) * 100.0;
            tracing::debug!(
                "Estimated Cumulative KV hit rate: {cumulative_hit_rate:.2}% (Overlap: {cumulative_overlap} / ISL: {cumulative_isl})"
            );
        }
    }
}
/// Scrapes stats from `component` (waiting up to `timeout`) and returns the endpoints
/// whose subject starts with `subject`.
///
/// Returns an error if the stats scrape itself fails.
pub async fn collect_endpoints(
    component: &Component,
    subject: &str,
    timeout: Duration,
) -> Result<Vec<EndpointInfo>> {
    // Ask every backend of the component for its stats.
    let scraped = component.scrape_stats(timeout).await?;

    // Keep only the endpoints that belong to the requested service subject.
    let mut endpoints = Vec::new();
    for info in scraped.into_endpoints() {
        if info.subject.starts_with(subject) {
            endpoints.push(info);
        }
    }

    tracing::debug!("Endpoints: {endpoints:?}");
    Ok(endpoints)
}
/// Decodes the `ForwardPassMetrics` payload of each endpoint.
///
/// Endpoints without data are skipped silently; endpoints whose data fails to decode
/// are skipped after logging an error. The result can therefore be shorter than
/// `endpoints`.
pub fn extract_metrics(endpoints: &[EndpointInfo]) -> Vec<ForwardPassMetrics> {
    let mut metrics: Vec<ForwardPassMetrics> = Vec::new();

    for info in endpoints {
        // Nothing to decode for endpoints that reported no data.
        let Some(raw) = info.data.as_ref() else {
            continue;
        };

        match raw.clone().decode::<ForwardPassMetrics>() {
            Ok(stats) => metrics.push(stats),
            Err(err) => {
                tracing::error!(
                    "Failed to decode ForwardPassMetrics data: {}. Raw data: {:?}",
                    err,
                    raw
                );
            }
        }
    }

    tracing::debug!("Metrics: {metrics:?}");
    metrics
}
/// Create ProcessedEndpoints from metrics and endpoints
pub fn postprocess_metrics(
metrics: &[ForwardPassMetrics],
endpoints: &[EndpointInfo],
) -> ProcessedEndpoints {
let processed_endpoints: Vec<Endpoint> = metrics
.iter()
.zip(endpoints.iter())
.filter_map(|(m, e)| {
e.id().ok().map(|id| Endpoint {
name: format!("worker-{id}"),
subject: e.subject.clone(),
data: LoadMetrics::EngineLoadMetrics(m.clone()),
})
})
.collect();
ProcessedEndpoints::new(processed_endpoints)
}
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Metrics is a metrics aggregator designed to operate within a namespace and collect
//! metrics from all workers.
//!
//! For now, Metrics collects:
//!
//! - LLM Worker Load:Capacity
//! - These metrics will be scraped by the LLM NATS Service API's stats request
//! - Request Slots: [Active, Total]
//! - KV Cache Blocks: [Active, Total]
//! - KV Hit Rate:
//! - These metrics will be collected from KV hit rate events published by the KV router
//! - ISL Blocks: Cumulative count of total blocks in all KV hit rate events
//! - Overlap Blocks: Cumulative count of blocks that were already in the KV cache
use clap::Parser;
use dynamo_llm::kv_router::KV_HIT_RATE_SUBJECT;
use dynamo_llm::kv_router::scheduler::KVHitRateEvent;
use dynamo_runtime::{
DistributedRuntime, ErrorContext, Result, Runtime, Worker, error, logging,
traits::events::{EventPublisher, EventSubscriber},
utils::{Duration, Instant},
};
use futures::stream::StreamExt;
use std::sync::Arc;
// Import from our library
use metrics::{
LLMWorkerLoadCapacityConfig, MetricsMode, PrometheusMetricsCollector, collect_endpoints,
extract_metrics, postprocess_metrics,
};
// NOTE: with clap's derive API the `///` doc comments in this struct are also used as the
// user-visible `--help` text, so editing them changes the program's output. Maintainer
// notes therefore use plain `//` comments.
/// CLI arguments for the metrics application
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
/// Namespace to operate in and subscribe to events on
#[arg(long, env = "DYN_NAMESPACE", default_value = "dynamo")]
namespace: String,
/// Component to scrape metrics from
// Required; `get_config` additionally rejects an empty string.
#[arg(long)]
component: String,
/// Endpoint to scrape metrics from
// Required; `get_config` additionally rejects an empty string.
#[arg(long)]
endpoint: String,
/// Model name for the target component (optional)
#[arg(long)]
model_name: Option<String>,
/// Polling interval in seconds for scraping dynamo endpoint stats (minimum 1 second)
// The minimum is enforced in `get_config`, not by clap.
#[arg(long, default_value = "1")]
poll_interval: u64,
/// Host for serving or pushing prometheus metrics (default: 0.0.0.0)
#[arg(
long,
default_value = "0.0.0.0",
help_heading = "Prometheus Metrics Config"
)]
host: String,
/// Port to run the Prometheus metrics server on (default: 9091)
#[arg(
long,
default_value = "9091",
help_heading = "Prometheus Metrics Config"
)]
port: u16,
/// Push metrics to an external Prometheus Pushgateway instead of hosting them in-process
#[arg(long, help_heading = "Prometheus Metrics Config")]
push: bool,
/// Push interval in seconds, when using push mode (minimum 1 second, default: 2)
// The minimum is enforced in `get_config`, and only when `--push` is set.
#[arg(long, default_value = "2", help_heading = "Prometheus Metrics Config")]
push_interval: u64,
}
fn get_config(args: &Args) -> Result<LLMWorkerLoadCapacityConfig> {
if args.component.is_empty() {
return Err(error!("Component name cannot be empty"));
}
if args.endpoint.is_empty() {
return Err(error!("Endpoint name cannot be empty"));
}
if args.poll_interval < 1 {
return Err(error!("Polling interval must be at least 1 second"));
}
if args.push && args.push_interval < 1 {
return Err(error!("Push interval must be at least 1 second"));
}
Ok(LLMWorkerLoadCapacityConfig {
component_name: args.component.clone(),
endpoint_name: args.endpoint.clone(),
model_name: args.model_name.clone(),
})
}
async fn app(runtime: Runtime) -> Result<()> {
let args = Args::parse();
let config = get_config(&args)?;
tracing::debug!("Config: {config:?}");
let drt = DistributedRuntime::from_settings(runtime.clone()).await?;
let namespace = drt.namespace(args.namespace)?;
let component = namespace.component("count")?;
// Create unique instance of Count
let key = format!("{}/instance", component.etcd_root());
tracing::debug!("Creating unique instance of Count at {key}");
drt.etcd_client()
.expect("Unreachable because of DistributedRuntime::from_settings above")
.kv_create(&key, serde_json::to_vec_pretty(&config)?, None)
.await
.context("Unable to create unique instance of Count; possibly one already exists")?;
let target_component = namespace.component(&config.component_name)?;
let target_endpoint = target_component.endpoint(&config.endpoint_name);
let service_path = target_endpoint.path();
let service_subject = target_endpoint.subject();
tracing::info!("Scraping endpoint {service_path} for stats");
// Safety: DistributedRuntime::from_settings ensures this is Some
let token = drt.primary_lease().unwrap().child_token();
let event_name = format!("l2c.{}.{}", config.component_name, config.endpoint_name);
// Initialize Prometheus metrics with the selected mode
let metrics_collector = PrometheusMetricsCollector::new()?;
let metrics_collector = Arc::new(tokio::sync::Mutex::new(metrics_collector));
// Start metrics collection in the selected mode
let metrics_mode = if args.push {
MetricsMode::Push {
host: args.host,
port: args.port,
job: "dynamo_push_metrics".to_string(),
interval: args.push_interval,
}
} else {
MetricsMode::Pull {
host: args.host,
port: args.port,
}
};
metrics_collector.lock().await.start(metrics_mode)?;
// TODO: Consider removing event subscription until metrics are more standardized
// Subscribe to KV hit rate events
let kv_hit_rate_subject = KV_HIT_RATE_SUBJECT;
tracing::debug!("Subscribing to KV hit rate events on subject: {kv_hit_rate_subject}");
// Clone fields for the event subscription task
let config_clone = config.clone();
let namespace_clone = namespace.clone();
let metrics_collector_clone = metrics_collector.clone();
// Note: Subscribing to KVHitRateEvent for illustration purposes. They're not used in production.
// Spawn a task to handle KV hit rate events
tokio::spawn(async move {
match namespace_clone.subscribe(kv_hit_rate_subject).await {
Ok(mut subscriber) => {
tracing::debug!("Successfully subscribed to KV hit rate events");
while let Some(msg) = subscriber.next().await {
match serde_json::from_slice::<KVHitRateEvent>(&msg.payload) {
Ok(event) => {
// TODO: Lower to debug
let cache_hit_pct =
(event.overlap_blocks as f64 / event.isl_blocks as f64) * 100.0;
tracing::debug!(
"Received KV hit rate event: worker_id={}, isl_blocks={}, overlap_blocks={}, cache_hit_pct={:.2}%",
event.worker_id,
event.isl_blocks,
event.overlap_blocks,
cache_hit_pct
);
// Update metrics with the event data
let mut metrics = metrics_collector_clone.lock().await;
metrics.update_kv_hit_rate(
&config_clone,
event.worker_id,
event.isl_blocks,
event.overlap_blocks as usize,
);
}
Err(e) => {
tracing::warn!("Failed to deserialize KV hit rate event: {e}");
}
}
}
tracing::warn!("KV hit rate event subscription stream ended");
}
Err(e) => {
tracing::error!("Failed to subscribe to KV hit rate events: {:?}", e);
}
}
});
loop {
let next = Instant::now() + Duration::from_secs(args.poll_interval);
// Collect and process metrics
let scrape_timeout = Duration::from_secs(1);
let endpoints =
collect_endpoints(&target_component, &service_subject, scrape_timeout).await?;
if endpoints.is_empty() {
tracing::warn!("No endpoints found matching {service_path}");
continue;
}
let metrics = extract_metrics(&endpoints);
let processed = postprocess_metrics(&metrics, &endpoints);
if processed.endpoints.is_empty() {
tracing::warn!("No metrics found matching {service_path}");
} else {
tracing::info!("Aggregated metrics: {processed:?}");
}
// Update Prometheus metrics
metrics_collector.lock().await.update(&config, &processed);
// TODO: Enable KV Routers to subscribe to metrics events published here
// for a single view of the aggregated metrics, as opposed to the current
// approach where each KV Router computes and published its own metrics.
// Publish metrics event
namespace.publish(&event_name, &processed).await?;
// Wait until cancelled or the next tick
match tokio::time::timeout_at(next, token.cancelled()).await {
Ok(_) => break,
Err(_) => continue,
}
}
Ok(())
}
/// Process entry point: initialises logging, builds a `Worker` from settings
/// and hands `app` to it, returning whatever result the worker produces.
fn main() -> Result<()> {
    logging::init();
    Worker::from_settings()?.execute(app)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// With no `--namespace` flag on the command line, the namespace is taken
    /// from the `DYN_NAMESPACE` environment variable.
    #[test]
    fn test_namespace_from_env() {
        const EXPECTED_NAMESPACE: &str = "test-namespace";

        // NOTE(review): mutating the process environment is `unsafe` here;
        // presumably no other test reads DYN_NAMESPACE concurrently — confirm.
        unsafe { std::env::set_var("DYN_NAMESPACE", EXPECTED_NAMESPACE) };

        let argv = ["count", "--component", "comp", "--endpoint", "end"];
        let parsed = Args::parse_from(argv);

        assert_eq!(parsed.namespace, EXPECTED_NAMESPACE);
    }
}
...@@ -174,7 +174,7 @@ The following configuration files should be present in this directory: ...@@ -174,7 +174,7 @@ The following configuration files should be present in this directory:
- [grafana_dashboards/grafana-dashboard-providers.yml](./grafana_dashboards/grafana-dashboard-providers.yml): Contains Grafana dashboard provider configuration - [grafana_dashboards/grafana-dashboard-providers.yml](./grafana_dashboards/grafana-dashboard-providers.yml): Contains Grafana dashboard provider configuration
- [grafana_dashboards/grafana-dynamo-dashboard.json](./grafana_dashboards/grafana-dynamo-dashboard.json): A general Dynamo Dashboard for both SW and HW metrics. - [grafana_dashboards/grafana-dynamo-dashboard.json](./grafana_dashboards/grafana-dynamo-dashboard.json): A general Dynamo Dashboard for both SW and HW metrics.
- [grafana_dashboards/grafana-dcgm-metrics.json](./grafana_dashboards/grafana-dcgm-metrics.json): Contains Grafana dashboard configuration for DCGM GPU metrics - [grafana_dashboards/grafana-dcgm-metrics.json](./grafana_dashboards/grafana-dcgm-metrics.json): Contains Grafana dashboard configuration for DCGM GPU metrics
- [grafana_dashboards/grafana-llm-metrics.json](./grafana_dashboards/grafana-llm-metrics.json): This file, which is being phased out, contains the Grafana dashboard configuration for LLM-specific metrics. It requires an additional `metrics` component to operate concurrently. A new version is under development. - [grafana_dashboards/grafana-kvbm-dashboard.json](./grafana_dashboards/grafana-kvbm-dashboard.json): Contains Grafana dashboard configuration for KVBM metrics
### Metric Name Constants ### Metric Name Constants
...@@ -237,8 +237,6 @@ This centralized approach ensures all Dynamo components use consistent, valid Pr ...@@ -237,8 +237,6 @@ This centralized approach ensures all Dynamo components use consistent, valid Pr
- DCGM Exporter: `http://localhost:9401/metrics` - DCGM Exporter: `http://localhost:9401/metrics`
- Start the [components/metrics](../../components/metrics/README.md) application to begin monitoring for metric events from dynamo workers and aggregating them on a Prometheus metrics endpoint: `http://localhost:9091/metrics`.
- Uncomment the appropriate lines in prometheus.yml to poll port 9091.
- Start worker(s) that publish KV Cache metrics: [lib/runtime/examples/service_metrics/README.md](../../lib/runtime/examples/service_metrics/README.md) can populate dummy KV Cache metrics. - Start worker(s) that publish KV Cache metrics: [lib/runtime/examples/service_metrics/README.md](../../lib/runtime/examples/service_metrics/README.md) can populate dummy KV Cache metrics.
### Configuration ### Configuration
...@@ -275,7 +273,7 @@ Grafana is pre-configured with: ...@@ -275,7 +273,7 @@ Grafana is pre-configured with:
docker compose logs grafana docker compose logs grafana
``` ```
3. For issues with the legacy metrics component (being phased out), see [components/metrics/README.md](../../components/metrics/README.md) for details on the exposed metrics and troubleshooting steps. 3. Check Prometheus targets at `http://localhost:9090/targets` to verify metric collection.
## Developer Guide ## Developer Guide
...@@ -477,21 +475,6 @@ let requests_total = namespace.create_counter( ...@@ -477,21 +475,6 @@ let requests_total = namespace.create_counter(
)?; )?;
``` ```
## Running the deprecated `components/metrics` program
⚠️ **DEPRECATION NOTICE** ⚠️
When you run the example [components/metrics](../../components/metrics/README.md) program, it exposes a Prometheus /metrics endpoint with the following metrics (defined in [components/metrics/src/lib.rs](../../components/metrics/src/lib.rs)):
**⚠️ The following `llm_kv_*` metrics are deprecated:**
- `llm_requests_active_slots`: Active request slots per worker
- `llm_requests_total_slots`: Total available request slots per worker
- `llm_kv_blocks_active`: Active KV blocks per worker
- `llm_kv_blocks_total`: Total KV blocks available per worker
- `llm_kv_hit_rate_percent`: KV Cache hit percent per worker
- `llm_load_avg`: Average load across workers
- `llm_load_std`: Load standard deviation across workers
## Troubleshooting ## Troubleshooting
...@@ -506,4 +489,4 @@ When you run the example [components/metrics](../../components/metrics/README.md ...@@ -506,4 +489,4 @@ When you run the example [components/metrics](../../components/metrics/README.md
docker compose logs grafana docker compose logs grafana
``` ```
3. For issues with the legacy metrics component (being phased out), see [components/metrics/README.md](../../components/metrics/README.md) for details on the exposed metrics and troubleshooting steps. 3. Check Prometheus targets at `http://localhost:9090/targets` to verify metric collection.
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"copyright": [
"SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.",
"SPDX-License-Identifier: Apache-2.0",
"Licensed under the Apache License, Version 2.0 (the \"License\");",
"you may not use this file except in compliance with the License.",
"You may obtain a copy of the License at",
"http://www.apache.org/licenses/LICENSE-2.0",
"Unless required by applicable law or agreed to in writing, software",
"distributed under the License is distributed on an \"AS IS\" BASIS,",
"WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.",
"See the License for the specific language governing permissions and",
"limitations under the License.",
"",
"DEPRECATION NOTICE:",
"This dashboard uses deprecated llm_kv_* metrics (llm_kv_blocks_active, llm_kv_blocks_total, llm_kv_hit_rate_percent)",
"that are part of the deprecated metrics aggregation service. These metrics will be removed in a future release.",
"Please migrate to the new MetricsRegistry system which provides dynamo_* metrics instead.",
"See docs/guides/metrics.md for migration guidance."
],
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 1,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"title": "KV Cache Utilization by Worker",
"type": "timeseries",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"editorMode": "code",
"expr": "100 * llm_kv_blocks_active{component=\"$component\", endpoint=\"$endpoint\"} / llm_kv_blocks_total{component=\"$component\", endpoint=\"$endpoint\"}",
"legendFormat": "Worker {{worker_id}}",
"range": true,
"refId": "A"
}
]
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"id": 2,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"title": "Request Slot Utilization by Worker",
"type": "timeseries",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"editorMode": "code",
"expr": "100 * llm_requests_active_slots{component=\"$component\", endpoint=\"$endpoint\"} / llm_requests_total_slots{component=\"$component\", endpoint=\"$endpoint\"}",
"legendFormat": "Worker {{worker_id}}",
"range": true,
"refId": "A"
}
]
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 4,
"x": 0,
"y": 8
},
"id": 3,
"options": {
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true
},
"pluginVersion": "10.0.0",
"title": "Average KV Cache Utilization",
"type": "gauge",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"editorMode": "code",
"expr": "100 * avg(llm_kv_blocks_active{component=\"$component\", endpoint=\"$endpoint\"}) / avg(llm_kv_blocks_total{component=\"$component\", endpoint=\"$endpoint\"})",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
]
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "yellow",
"value": 50
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 4,
"x": 4,
"y": 8
},
"id": 4,
"options": {
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true
},
"pluginVersion": "10.0.0",
"title": "Average Request Slot Utilization",
"type": "gauge",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"editorMode": "code",
"expr": "100 * avg(llm_requests_active_slots{component=\"$component\", endpoint=\"$endpoint\"}) / avg(llm_requests_total_slots{component=\"$component\", endpoint=\"$endpoint\"})",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
]
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 4,
"x": 8,
"y": 8
},
"id": 7,
"options": {
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"showThresholdLabels": false,
"showThresholdMarkers": true
},
"pluginVersion": "10.0.0",
"title": "Average KV Cache Hit Rate",
"type": "gauge",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"editorMode": "code",
"expr": "100 * avg(llm_kv_hit_rate_percent{component=\"$component\", endpoint=\"$endpoint\"})",
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
]
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 8
},
"id": 5,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"title": "Load Average & Standard Deviation",
"type": "timeseries",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"editorMode": "code",
"expr": "llm_load_avg{component=\"$component\", endpoint=\"$endpoint\"}",
"legendFormat": "Average",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"editorMode": "code",
"expr": "llm_load_std{component=\"$component\", endpoint=\"$endpoint\"}",
"hide": false,
"legendFormat": "StdDev",
"range": true,
"refId": "B"
}
]
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 16
},
"id": 8,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"title": "KV Cache Hit Rate by Worker",
"type": "timeseries",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"editorMode": "code",
"expr": "100 * llm_kv_hit_rate_percent{component=\"$component\", endpoint=\"$endpoint\"}",
"legendFormat": "Worker {{worker_id}}",
"range": true,
"refId": "A"
}
]
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "percent"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 16
},
"id": 9,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"title": "Average KV Cache Hit Rate",
"type": "timeseries",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"editorMode": "code",
"expr": "avg(100 * llm_kv_hit_rate_percent{component=\"$component\", endpoint=\"$endpoint\"})",
"legendFormat": "Average Hit Rate",
"range": true,
"refId": "A"
}
]
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 20,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 24
},
"id": 6,
"options": {
"legend": {
"calcs": [
"mean",
"max"
],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"title": "Available Resources",
"type": "timeseries",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"editorMode": "code",
"expr": "sum(llm_kv_blocks_total{component=\"$component\", endpoint=\"$endpoint\"} - llm_kv_blocks_active{component=\"$component\", endpoint=\"$endpoint\"})",
"legendFormat": "Available KV Blocks",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"editorMode": "code",
"expr": "sum(llm_requests_total_slots{component=\"$component\", endpoint=\"$endpoint\"} - llm_requests_active_slots{component=\"$component\", endpoint=\"$endpoint\"})",
"hide": false,
"legendFormat": "Available Request Slots",
"range": true,
"refId": "B"
}
]
}
],
"refresh": "2s",
"schemaVersion": 38,
"style": "dark",
"tags": [
"llm",
"metrics"
],
"templating": {
"list": [
{
"current": {
"selected": false,
"text": "component",
"value": "vllm"
},
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"definition": "label_values(llm_kv_blocks_active, component)",
"hide": 0,
"includeAll": false,
"label": "Component",
"multi": false,
"name": "component",
"options": [],
"query": {
"query": "label_values(llm_kv_blocks_active, component)",
"refId": "StandardVariableQuery"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
},
{
"current": {
"selected": false,
"text": "endpoint",
"value": "load_metrics"
},
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"definition": "label_values(llm_kv_blocks_active{component=\"$component\"}, endpoint)",
"hide": 0,
"includeAll": false,
"label": "Endpoint",
"multi": false,
"name": "endpoint",
"options": [],
"query": {
"query": "label_values(llm_kv_blocks_active{component=\"$component\"}, endpoint)",
"refId": "StandardVariableQuery"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
}
]
},
"time": {
"from": "now-5m",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "LLM Worker Metrics",
"uid": "llm-worker-metrics",
"version": 1,
"weekStart": ""
}
\ No newline at end of file
...@@ -33,13 +33,13 @@ scrape_configs: ...@@ -33,13 +33,13 @@ scrape_configs:
static_configs: static_configs:
- targets: ['dcgm-exporter:9401'] # on the "monitoring" network - targets: ['dcgm-exporter:9401'] # on the "monitoring" network
# This is a demo service that needs to be launched manually. See components/metrics/README.md # This is a demo service that needs to be launched manually
# Note that you may need to disable the firewall on your host. On Ubuntu: sudo ufw allow 8080/tcp # Note that you may need to disable the firewall on your host. On Ubuntu: sudo ufw allow 8000/tcp
# You can also force the port, if the default is different: python -m dynamo.frontend --http-port 8080 # You can also force the port, if the default is different: python -m dynamo.frontend --http-port 8000
- job_name: 'dynamo-frontend' - job_name: 'dynamo-frontend'
scrape_interval: 10s scrape_interval: 10s
static_configs: static_configs:
- targets: ['host.docker.internal:8080'] # on the "monitoring" network - targets: ['host.docker.internal:8000'] # on the "monitoring" network
# Launch via: DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 dynamo.<backend> ... # Launch via: DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 dynamo.<backend> ...
# If you want to update the scrape_interval, you may want to also update component.rs's MAX_DELAY # If you want to update the scrape_interval, you may want to also update component.rs's MAX_DELAY
...@@ -48,15 +48,6 @@ scrape_configs: ...@@ -48,15 +48,6 @@ scrape_configs:
static_configs: static_configs:
- targets: ['host.docker.internal:8081'] - targets: ['host.docker.internal:8081']
# DEPRECATED: This metrics aggregation service is being deprecated in favor of MetricsRegistry
# The new system uses the 'dynamo-backend' job above instead of this separate service
# This is another demo aggregator that needs to be launched manually. See components/metrics/README.md
# Note that you may need to disable the firewall on your host. On Ubuntu: sudo ufw allow 9091/tcp
- job_name: 'metrics-aggregation-service'
scrape_interval: 2s
static_configs:
# - targets: ['localhost:9091'] # metrics aggregation service on host
- targets: ['host.docker.internal:9091'] # metrics aggregation service on host
# KVBM leader related metrics # KVBM leader related metrics
- job_name: 'kvbm-metrics' - job_name: 'kvbm-metrics'
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment