Unverified Commit be001a58 authored by Keiven C's avatar Keiven C Committed by GitHub
Browse files

chore: Remove deprecated components/metrics and references (#3475)


Signed-off-by: default avatarKeiven Chang <keivenchang@users.noreply.github.com>
parent f712653e
...@@ -4551,25 +4551,6 @@ dependencies = [ ...@@ -4551,25 +4551,6 @@ dependencies = [
"paste", "paste",
] ]
[[package]]
name = "metrics"
version = "0.5.1"
dependencies = [
"axum 0.8.4",
"clap 4.5.48",
"dynamo-llm",
"dynamo-runtime",
"futures",
"prometheus",
"rand 0.9.2",
"reqwest 0.12.23",
"serde",
"serde_json",
"thiserror 2.0.16",
"tokio",
"tracing",
]
[[package]] [[package]]
name = "mime" name = "mime"
version = "0.3.17" version = "0.3.17"
......
...@@ -3,7 +3,6 @@ ...@@ -3,7 +3,6 @@
[workspace] [workspace]
members = [ members = [
"components/metrics",
"launch/dynamo-run", "launch/dynamo-run",
"lib/llm", "lib/llm",
"lib/runtime", "lib/runtime",
...@@ -18,7 +17,6 @@ members = [ ...@@ -18,7 +17,6 @@ members = [
# - launch/dynamo-run # - launch/dynamo-run
# - lib/engines/* # - lib/engines/*
default-members = [ default-members = [
"components/metrics",
"lib/llm", "lib/llm",
"lib/runtime", "lib/runtime",
"lib/tokens", "lib/tokens",
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[package]
name = "metrics"
version.workspace = true
edition.workspace = true
authors.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
[dependencies]
dynamo-llm = { workspace = true }
dynamo-runtime = { workspace = true }
futures = { workspace = true }
prometheus = { workspace = true }
rand = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true }
tracing = { workspace = true }
axum = { version = "0.8" }
clap = { version = "4.5", features = ["derive", "env"] }
reqwest = { version = "0.12.22", default-features = false, features = ["json", "rustls-tls"] }
# Metrics
⚠️ **DEPRECATION NOTICE** ⚠️
**This `metrics` component is unmaintained and being deprecated.**
The deprecated `metrics` component is being replaced by the **`MetricsRegistry`** built-in functionality that is now available directly in the `DistributedRuntime` framework. The `MetricsRegistry` provides:
**For new projects and existing deployments, please migrate to using `MetricsRegistry` instead of this component.**
This component may be migrated to the MetricsRegistry in the future.
**📖 See the [Dynamo MetricsRegistry Guide](../../docs/guides/metrics.md) for detailed information on using the new metrics system.**
---
The deprecated `metrics` component is a utility for collecting, aggregating, and publishing metrics from a Dynamo deployment, but it is unmaintained and being deprecated in favor of `MetricsRegistry`.
**Note**: This is a demo implementation. The deprecated `metrics` component is no longer under active development.
- In this demo the metrics names use the prefix "llm", but in production they will be prefixed with "dynamo" (e.g., the HTTP `/metrics` endpoint will serve metrics with "dynamo" prefixes)
<div align="center">
<img src="images/dynamo_metrics_grafana.png" alt="Dynamo Metrics Dashboard"/>
</div>
## Quickstart
To start the deprecated `metrics` component, simply point it at the `namespace/component/endpoint`
trio for the Dynamo workers that you're interested in monitoring metrics on.
This will:
1. Collect statistics from workers associated with that `namespace/component/endpoint`
2. Postprocess and aggregate those statistics across the workers
3. Publish them on a Prometheus-compatible metrics endpoint
For example:
```bash
# Default namespace is "dynamo", but can be configured with --namespace
# For more detailed output, try setting the env var: DYN_LOG=debug
metrics --component MyComponent --endpoint my_endpoint
# 2025-03-17T00:07:05.202558Z INFO metrics: Scraping endpoint dynamo/MyComponent/my_endpoint for stats
# 2025-03-17T00:07:05.202955Z INFO metrics: Prometheus metrics server started at 0.0.0.0:9091/metrics
# ...
```
With no matching endpoints running to collect stats from, you should see warnings in the logs:
```bash
2025-03-17T00:07:06.204756Z WARN metrics: No endpoints found matching dynamo/MyComponent/my_endpoint
```
After a worker with a matching endpoint gets started, the endpoint
will get automatically discovered and the warnings will stop.
## Workers
The deprecated `metrics` component needs running workers to gather metrics from,
so below are some examples of workers and how they can be monitored.
### Mock Worker
To try out how the deprecated `metrics` component works, there is a demo Rust-based
[mock worker](src/bin/mock_worker.rs) that provides sample data through two mechanisms:
1. Exposes a stats handler at `dynamo/MyComponent/my_endpoint` that responds to polling requests (from the deprecated `metrics` component) with randomly generated `ForwardPassMetrics` data
2. Publishes mock `KVHitRateEvent` data every second to demonstrate event-based metrics
Step 1: Launch a mock worker via the following command (if already built):
```bash
# or build/run from source: DYN_LOG=DEBUG cargo run --bin mock_worker
mock_worker
# 2025-03-16T23:49:28.101668Z INFO mock_worker: Starting Mock Worker on Endpoint: dynamo/MyComponent/my_endpoint
```
Step 2: Start the deprecated `metrics` component to monitor the mock worker. It serves its own
Prometheus endpoint at `/metrics` on port 9091 (the default when `--port` is not specified):
```bash
metrics --component MyComponent --endpoint my_endpoint
```
### Real Worker
To run a more realistic deployment to gather metrics:
```bash
python -m dynamo.frontend &
python -m dynamo.vllm --model-path <your-model-checkout>
```
Then, to monitor the metrics of these VllmWorkers, run:
```bash
metrics --component backend --endpoint load_metrics
```
**NOTE**: `load_metrics` is currently a
[hard-coded](https://github.com/ai-dynamo/dynamo/blob/d5220c7b1151372ba3d2a061c7d0a7ed72724789/lib/llm/src/kv_router/publisher.rs#L108)
endpoint name used for python-based workers that register a `WorkerMetricsPublisher`.
## Visualization
To visualize the metrics being exposed on the Prometheus endpoint,
see the Prometheus and Grafana configurations in
[deploy/metrics](../../deploy/metrics):
```bash
docker compose -f deploy/docker-compose.yml --profile metrics up -d
```
## Metrics Collection Modes
The deprecated `metrics` component supports two modes for exposing metrics in a Prometheus format:
### Pull Mode (Default)
When running in pull mode (the default), the deprecated `metrics` component will expose a
Prometheus metrics endpoint on the specified host and port that a
Prometheus server or curl client can pull from:
```bash
# Start metrics server on default host (0.0.0.0) and port (9091)
metrics --component MyComponent --endpoint my_endpoint
# Or specify a custom port
metrics --component MyComponent --endpoint my_endpoint --port 9092
```
In pull mode:
- The `--host` parameter must be a valid IPv4 or IPv6 address (e.g., "0.0.0.0", "127.0.0.1")
- The `--port` parameter specifies which port the HTTP server will listen on
You can then query the metrics using:
```bash
curl localhost:9091/metrics
# # HELP llm_kv_blocks_active Active KV cache blocks
# # TYPE llm_kv_blocks_active gauge
# llm_kv_blocks_active{component="MyComponent",endpoint="my_endpoint",worker_id="7587884888253033398"} 40
# llm_kv_blocks_active{component="MyComponent",endpoint="my_endpoint",worker_id="7587884888253033401"} 2
# # HELP llm_kv_blocks_total Total KV cache blocks
# # TYPE llm_kv_blocks_total gauge
# llm_kv_blocks_total{component="MyComponent",endpoint="my_endpoint",worker_id="7587884888253033398"} 100
# llm_kv_blocks_total{component="MyComponent",endpoint="my_endpoint",worker_id="7587884888253033401"} 100
```
### Push Mode
For ephemeral or batch jobs, or when metrics need to be pushed through a firewall,
you can use Push mode. In this mode, the deprecated `metrics` component will periodically push
metrics to an externally hosted
[Prometheus PushGateway](https://prometheus.io/docs/instrumenting/pushing/):
Start a prometheus push gateway service via docker:
```bash
docker run --rm -d -p 9091:9091 --name pushgateway prom/pushgateway
```
Start the deprecated `metrics` component in `--push` mode, specifying the host and port of your PushGateway:
```bash
# Push metrics to a Prometheus PushGateway every --push-interval seconds
metrics \
--component MyComponent \
--endpoint my_endpoint \
--host 127.0.0.1 \
--port 9091 \
--push
```
When using Push mode:
- The `--host` parameter must be a valid IPv4 or IPv6 address (e.g., "0.0.0.0", "127.0.0.1")
that the Prometheus PushGateway is running on
- The `--port` parameter specifies the port of the Prometheus PushGateway
- The push interval can be configured with `--push-interval` (default: 2 seconds)
- A default job name of "dynamo_push_metrics" is used for the Prometheus job label
- Metrics persist in the PushGateway until explicitly deleted
- Prometheus should be configured to scrape the PushGateway with `honor_labels: true`
To view the metrics hosted on the PushGateway:
```bash
# View all metrics
# curl http://<pushgateway_ip>:<pushgateway_port>/metrics
curl 127.0.0.1:9091/metrics
```
## Building/Running from Source
For easy iteration while making edits to the deprecated `metrics` component, you can use `cargo run`
to build and run with your local changes:
```bash
cargo run --bin metrics -- --component MyComponent --endpoint my_endpoint
```
File suppressed by a .gitattributes entry or the file's encoding is unsupported.
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use dynamo_llm::kv_router::{
KV_HIT_RATE_SUBJECT,
protocols::{ForwardPassMetrics, KvStats, WorkerStats},
scheduler::KVHitRateEvent,
};
use dynamo_runtime::{
DistributedRuntime, Result, Runtime, Worker,
component::{Namespace, service::EndpointStats},
logging,
pipeline::{
AsyncEngine, AsyncEngineContextProvider, Error, ManyOut, ResponseStream, SingleIn,
async_trait, network::Ingress,
},
protocols::annotated::Annotated,
stream,
traits::events::EventPublisher,
};
use rand::Rng;
use std::sync::Arc;
use tokio::time::{Duration, interval};
/// Entry point of the mock worker: sets up logging, then hands `app` to a
/// `Worker` built from settings.
fn main() -> Result<()> {
    logging::init();
    Worker::from_settings()?.execute(app)
}
/// Creates a `DistributedRuntime` from settings on top of `runtime` and runs
/// the mock backend on it.
async fn app(runtime: Runtime) -> Result<()> {
    let drt = DistributedRuntime::from_settings(runtime.clone()).await?;
    let outcome = backend(drt).await;
    outcome
}
/// Stateless request handler served by the mock worker.
struct MockRequestHandler {}

impl MockRequestHandler {
    /// Builds a handler and returns it wrapped in an `Arc`.
    fn new() -> Arc<Self> {
        let handler = Self {};
        Arc::new(handler)
    }
}
#[async_trait]
impl AsyncEngine<SingleIn<String>, ManyOut<Annotated<String>>, Error> for MockRequestHandler {
    /// Answers a request by streaming the input string back one character at a
    /// time, each character wrapped in an `Annotated` item.
    async fn generate(&self, input: SingleIn<String>) -> Result<ManyOut<Annotated<String>>> {
        let (text, ctx) = input.into_parts();

        // Materialise the per-character responses before turning them into a stream.
        let mut responses = Vec::new();
        for ch in text.chars() {
            responses.push(Annotated::from_data(ch.to_string()));
        }

        let response_stream = Box::pin(stream::iter(responses));
        Ok(ResponseStream::new(response_stream, ctx.context()))
    }
}
// FIXME: These events are just for testing and may not currently be used.
/// Publishes a mock `KVHitRateEvent` on `KV_HIT_RATE_SUBJECT` once per second.
///
/// This function never returns; callers are expected to run it on its own task
/// (see `backend`, which wraps it in `tokio::spawn`).
///
/// A publish failure is logged as a warning and the loop keeps going.
async fn mock_event_publisher(namespace: Namespace) {
    // NOTE: These events are just for testing, and shouldn't be interpreted
    // in correlation with the stats handler's data:
    // 1. The worker ID associated with the events here won't match the
    //    worker ID of the endpoint's service stats handler.
    // 2. These events aren't coming through the KV Router, so the metrics won't
    //    be reflective of the KV Router's performance.
    // 3. The data in these events aren't in sync with the stats handler's
    //    ForwardPassMetrics data, so they may not correlate well.
    let worker_id = rand::rng().random_range(1..=1000);
    let mut interval = interval(Duration::from_secs(1));

    loop {
        interval.tick().await;

        // Generate a random KV hit rate event; overlap can never exceed isl_blocks.
        let isl_blocks = rand::rng().random_range(0..=100);
        let overlap_blocks = rand::rng().random_range(0..=isl_blocks);

        let event = KVHitRateEvent {
            worker_id,
            isl_blocks,
            overlap_blocks: overlap_blocks as u32,
        };

        if let Err(e) = namespace.publish(KV_HIT_RATE_SUBJECT, &event).await {
            tracing::warn!("Failed to publish KV hit rate event: {e}");
        } else {
            // isl_blocks may legitimately be 0; report 0% instead of the NaN that
            // 0.0 / 0.0 would produce.
            let hit_rate = if isl_blocks == 0 {
                0.0
            } else {
                (overlap_blocks as f64 / isl_blocks as f64) * 100.0
            };
            tracing::debug!(
                "Published KV hit rate event: worker_id={worker_id}, isl_blocks={isl_blocks}, overlap_blocks={overlap_blocks}, hit_rate={:.2}%",
                hit_rate
            );
        }
    }
}
/// Stats handler for the mock endpoint: ignores the incoming `EndpointStats`
/// and returns a randomly populated `ForwardPassMetrics` serialized to JSON.
///
/// Panics if the metrics cannot be serialized.
fn mock_stats_handler(_stats: EndpointStats) -> serde_json::Value {
    // Fixed capacities; the "active" figures are drawn from 0..=capacity.
    let request_total_slots = 100;
    let kv_total_blocks = 100;

    let mut rng = rand::rng();
    let request_active_slots = rng.random_range(0..=request_total_slots);
    let kv_active_blocks = rng.random_range(0..=kv_total_blocks);
    let num_requests_waiting = rng.random_range(0..=100);
    let gpu_cache_usage_perc = rng.random_range(0.0..=1.0);
    let gpu_prefix_cache_hit_rate = rng.random_range(0.0..=1.0);

    let stats = ForwardPassMetrics {
        worker_stats: WorkerStats {
            // Default for backwards compatibility
            data_parallel_rank: None,
            request_active_slots,
            request_total_slots,
            num_requests_waiting,
        },
        kv_stats: KvStats {
            kv_active_blocks,
            kv_total_blocks,
            gpu_cache_usage_perc,
            gpu_prefix_cache_hit_rate,
        },
        spec_decode_stats: None,
    };

    tracing::info!("Stats: {stats:?}");
    serde_json::to_value(stats).unwrap()
}
/// Registers the mock worker at `dynamo/MyComponent/my_endpoint`, starts the
/// background KV hit rate publisher, and serves the endpoint until it stops.
async fn backend(runtime: DistributedRuntime) -> Result<()> {
    let namespace = runtime.namespace("dynamo")?;

    // A service has to exist before one or more endpoints can be attached to it.
    let service = namespace.component("MyComponent")?.service_builder();
    let component = service.create().await?;
    let endpoint = component.endpoint("my_endpoint");
    tracing::info!("Starting Mock Worker on Endpoint: {}", endpoint.path());

    // Background task that publishes mock KV hit rate events.
    tokio::spawn(mock_event_publisher(namespace.clone()));

    // Wrap the request handler in an ingress so the endpoint can serve it.
    let ingress = Ingress::for_engine(MockRequestHandler::new())?;

    // Make the ingress discoverable via the component service. The stats handler
    // is a dummy that demonstrates how to attach a custom one.
    let builder = endpoint
        .endpoint_builder()
        .stats_handler(mock_stats_handler)
        .handler(ingress);
    builder.start().await
}
This diff is collapsed.
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Metrics is a metrics aggregator designed to operate within a namespace and collect
//! metrics from all workers.
//!
//! Metrics will collect for now:
//!
//! - LLM Worker Load:Capacity
//! - These metrics will be scraped by the LLM NATS Service API's stats request
//! - Request Slots: [Active, Total]
//! - KV Cache Blocks: [Active, Total]
//! - KV Hit Rate:
//! - These metrics will be collected from KV hit rate events published by the KV router
//! - ISL Blocks: Cumulative count of total blocks in all KV hit rate events
//! - Overlap Blocks: Cumulative count of blocks that were already in the KV cache
use clap::Parser;
use dynamo_llm::kv_router::KV_HIT_RATE_SUBJECT;
use dynamo_llm::kv_router::scheduler::KVHitRateEvent;
use dynamo_runtime::{
DistributedRuntime, ErrorContext, Result, Runtime, Worker, error, logging,
traits::events::{EventPublisher, EventSubscriber},
utils::{Duration, Instant},
};
use futures::stream::StreamExt;
use std::sync::Arc;
// Import from our library
use metrics::{
LLMWorkerLoadCapacityConfig, MetricsMode, PrometheusMetricsCollector, collect_endpoints,
extract_metrics, postprocess_metrics,
};
// NOTE(review): with `#[derive(Parser)]` the `///` comments on the fields below
// are presumably rendered by clap as `--help` text, so treat their wording as
// user-facing output rather than as internal documentation.
/// CLI arguments for the metrics application
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
// Falls back to the DYN_NAMESPACE environment variable, then to "dynamo".
/// Namespace to operate in and subscribe to events on
#[arg(long, env = "DYN_NAMESPACE", default_value = "dynamo")]
namespace: String,
// Required; `get_config` additionally rejects an empty string.
/// Component to scrape metrics from
#[arg(long)]
component: String,
// Required; `get_config` additionally rejects an empty string.
/// Endpoint to scrape metrics from
#[arg(long)]
endpoint: String,
/// Model name for the target component (optional)
#[arg(long)]
model_name: Option<String>,
// The 1-second minimum is enforced in `get_config`, not by clap.
/// Polling interval in seconds for scraping dynamo endpoint stats (minimum 1 second)
#[arg(long, default_value = "1")]
poll_interval: u64,
// Passed to either `MetricsMode::Pull` or `MetricsMode::Push`, depending on `push`.
/// Host for serving or pushing prometheus metrics (default: 0.0.0.0)
#[arg(
long,
default_value = "0.0.0.0",
help_heading = "Prometheus Metrics Config"
)]
host: String,
// Passed to either `MetricsMode::Pull` or `MetricsMode::Push`, depending on `push`.
/// Port to run the Prometheus metrics server on (default: 9091)
#[arg(
long,
default_value = "9091",
help_heading = "Prometheus Metrics Config"
)]
port: u16,
// Selects `MetricsMode::Push` when set, `MetricsMode::Pull` otherwise.
/// Push metrics to an external Prometheus Pushgateway instead of hosting them in-process
#[arg(long, help_heading = "Prometheus Metrics Config")]
push: bool,
// Only validated (>= 1) by `get_config` when `push` is set.
/// Push interval in seconds, when using push mode (minimum 1 second, default: 2)
#[arg(long, default_value = "2", help_heading = "Prometheus Metrics Config")]
push_interval: u64,
}
fn get_config(args: &Args) -> Result<LLMWorkerLoadCapacityConfig> {
if args.component.is_empty() {
return Err(error!("Component name cannot be empty"));
}
if args.endpoint.is_empty() {
return Err(error!("Endpoint name cannot be empty"));
}
if args.poll_interval < 1 {
return Err(error!("Polling interval must be at least 1 second"));
}
if args.push && args.push_interval < 1 {
return Err(error!("Push interval must be at least 1 second"));
}
Ok(LLMWorkerLoadCapacityConfig {
component_name: args.component.clone(),
endpoint_name: args.endpoint.clone(),
model_name: args.model_name.clone(),
})
}
async fn app(runtime: Runtime) -> Result<()> {
let args = Args::parse();
let config = get_config(&args)?;
tracing::debug!("Config: {config:?}");
let drt = DistributedRuntime::from_settings(runtime.clone()).await?;
let namespace = drt.namespace(args.namespace)?;
let component = namespace.component("count")?;
// Create unique instance of Count
let key = format!("{}/instance", component.etcd_root());
tracing::debug!("Creating unique instance of Count at {key}");
drt.etcd_client()
.expect("Unreachable because of DistributedRuntime::from_settings above")
.kv_create(&key, serde_json::to_vec_pretty(&config)?, None)
.await
.context("Unable to create unique instance of Count; possibly one already exists")?;
let target_component = namespace.component(&config.component_name)?;
let target_endpoint = target_component.endpoint(&config.endpoint_name);
let service_path = target_endpoint.path();
let service_subject = target_endpoint.subject();
tracing::info!("Scraping endpoint {service_path} for stats");
// Safety: DistributedRuntime::from_settings ensures this is Some
let token = drt.primary_lease().unwrap().child_token();
let event_name = format!("l2c.{}.{}", config.component_name, config.endpoint_name);
// Initialize Prometheus metrics with the selected mode
let metrics_collector = PrometheusMetricsCollector::new()?;
let metrics_collector = Arc::new(tokio::sync::Mutex::new(metrics_collector));
// Start metrics collection in the selected mode
let metrics_mode = if args.push {
MetricsMode::Push {
host: args.host,
port: args.port,
job: "dynamo_push_metrics".to_string(),
interval: args.push_interval,
}
} else {
MetricsMode::Pull {
host: args.host,
port: args.port,
}
};
metrics_collector.lock().await.start(metrics_mode)?;
// TODO: Consider removing event subscription until metrics are more standardized
// Subscribe to KV hit rate events
let kv_hit_rate_subject = KV_HIT_RATE_SUBJECT;
tracing::debug!("Subscribing to KV hit rate events on subject: {kv_hit_rate_subject}");
// Clone fields for the event subscription task
let config_clone = config.clone();
let namespace_clone = namespace.clone();
let metrics_collector_clone = metrics_collector.clone();
// Note: Subscribing to KVHitRateEvent for illustration purposes. They're not used in production.
// Spawn a task to handle KV hit rate events
tokio::spawn(async move {
match namespace_clone.subscribe(kv_hit_rate_subject).await {
Ok(mut subscriber) => {
tracing::debug!("Successfully subscribed to KV hit rate events");
while let Some(msg) = subscriber.next().await {
match serde_json::from_slice::<KVHitRateEvent>(&msg.payload) {
Ok(event) => {
// TODO: Lower to debug
let cache_hit_pct =
(event.overlap_blocks as f64 / event.isl_blocks as f64) * 100.0;
tracing::debug!(
"Received KV hit rate event: worker_id={}, isl_blocks={}, overlap_blocks={}, cache_hit_pct={:.2}%",
event.worker_id,
event.isl_blocks,
event.overlap_blocks,
cache_hit_pct
);
// Update metrics with the event data
let mut metrics = metrics_collector_clone.lock().await;
metrics.update_kv_hit_rate(
&config_clone,
event.worker_id,
event.isl_blocks,
event.overlap_blocks as usize,
);
}
Err(e) => {
tracing::warn!("Failed to deserialize KV hit rate event: {e}");
}
}
}
tracing::warn!("KV hit rate event subscription stream ended");
}
Err(e) => {
tracing::error!("Failed to subscribe to KV hit rate events: {:?}", e);
}
}
});
loop {
let next = Instant::now() + Duration::from_secs(args.poll_interval);
// Collect and process metrics
let scrape_timeout = Duration::from_secs(1);
let endpoints =
collect_endpoints(&target_component, &service_subject, scrape_timeout).await?;
if endpoints.is_empty() {
tracing::warn!("No endpoints found matching {service_path}");
continue;
}
let metrics = extract_metrics(&endpoints);
let processed = postprocess_metrics(&metrics, &endpoints);
if processed.endpoints.is_empty() {
tracing::warn!("No metrics found matching {service_path}");
} else {
tracing::info!("Aggregated metrics: {processed:?}");
}
// Update Prometheus metrics
metrics_collector.lock().await.update(&config, &processed);
// TODO: Enable KV Routers to subscribe to metrics events published here
// for a single view of the aggregated metrics, as opposed to the current
// approach where each KV Router computes and published its own metrics.
// Publish metrics event
namespace.publish(&event_name, &processed).await?;
// Wait until cancelled or the next tick
match tokio::time::timeout_at(next, token.cancelled()).await {
Ok(_) => break,
Err(_) => continue,
}
}
Ok(())
}
/// Entry point of the metrics aggregator: sets up logging, then hands `app` to
/// a `Worker` built from settings.
fn main() -> Result<()> {
    logging::init();
    Worker::from_settings()?.execute(app)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::env;

    /// `--namespace` should pick up its value from `DYN_NAMESPACE` when the flag
    /// is not given on the command line.
    #[test]
    fn test_namespace_from_env() {
        unsafe { env::set_var("DYN_NAMESPACE", "test-namespace") };
        let args = Args::parse_from(["count", "--component", "comp", "--endpoint", "end"]);
        // Environment variables are process-wide: remove the override before
        // asserting so it cannot leak into other tests, even if the assertion fails.
        unsafe { env::remove_var("DYN_NAMESPACE") };
        assert_eq!(args.namespace, "test-namespace");
    }
}
...@@ -174,7 +174,7 @@ The following configuration files should be present in this directory: ...@@ -174,7 +174,7 @@ The following configuration files should be present in this directory:
- [grafana_dashboards/grafana-dashboard-providers.yml](./grafana_dashboards/grafana-dashboard-providers.yml): Contains Grafana dashboard provider configuration - [grafana_dashboards/grafana-dashboard-providers.yml](./grafana_dashboards/grafana-dashboard-providers.yml): Contains Grafana dashboard provider configuration
- [grafana_dashboards/grafana-dynamo-dashboard.json](./grafana_dashboards/grafana-dynamo-dashboard.json): A general Dynamo Dashboard for both SW and HW metrics. - [grafana_dashboards/grafana-dynamo-dashboard.json](./grafana_dashboards/grafana-dynamo-dashboard.json): A general Dynamo Dashboard for both SW and HW metrics.
- [grafana_dashboards/grafana-dcgm-metrics.json](./grafana_dashboards/grafana-dcgm-metrics.json): Contains Grafana dashboard configuration for DCGM GPU metrics - [grafana_dashboards/grafana-dcgm-metrics.json](./grafana_dashboards/grafana-dcgm-metrics.json): Contains Grafana dashboard configuration for DCGM GPU metrics
- [grafana_dashboards/grafana-llm-metrics.json](./grafana_dashboards/grafana-llm-metrics.json): This file, which is being phased out, contains the Grafana dashboard configuration for LLM-specific metrics. It requires an additional `metrics` component to operate concurrently. A new version is under development. - [grafana_dashboards/grafana-kvbm-dashboard.json](./grafana_dashboards/grafana-kvbm-dashboard.json): Contains Grafana dashboard configuration for KVBM metrics
### Metric Name Constants ### Metric Name Constants
...@@ -237,8 +237,6 @@ This centralized approach ensures all Dynamo components use consistent, valid Pr ...@@ -237,8 +237,6 @@ This centralized approach ensures all Dynamo components use consistent, valid Pr
- DCGM Exporter: `http://localhost:9401/metrics` - DCGM Exporter: `http://localhost:9401/metrics`
- Start the [components/metrics](../../components/metrics/README.md) application to begin monitoring for metric events from dynamo workers and aggregating them on a Prometheus metrics endpoint: `http://localhost:9091/metrics`.
- Uncomment the appropriate lines in prometheus.yml to poll port 9091.
- Start worker(s) that publishes KV Cache metrics: [lib/runtime/examples/service_metrics/README.md](../../lib/runtime/examples/service_metrics/README.md) can populate dummy KV Cache metrics. - Start worker(s) that publishes KV Cache metrics: [lib/runtime/examples/service_metrics/README.md](../../lib/runtime/examples/service_metrics/README.md) can populate dummy KV Cache metrics.
### Configuration ### Configuration
...@@ -275,7 +273,7 @@ Grafana is pre-configured with: ...@@ -275,7 +273,7 @@ Grafana is pre-configured with:
docker compose logs grafana docker compose logs grafana
``` ```
3. For issues with the legacy metrics component (being phased out), see [components/metrics/README.md](../../components/metrics/README.md) for details on the exposed metrics and troubleshooting steps. 3. Check Prometheus targets at `http://localhost:9090/targets` to verify metric collection.
## Developer Guide ## Developer Guide
...@@ -477,21 +475,6 @@ let requests_total = namespace.create_counter( ...@@ -477,21 +475,6 @@ let requests_total = namespace.create_counter(
)?; )?;
``` ```
## Running the deprecated `components/metrics` program
⚠️ **DEPRECATION NOTICE** ⚠️
When you run the example [components/metrics](../../components/metrics/README.md) program, it exposes a Prometheus /metrics endpoint with the following metrics (defined in [components/metrics/src/lib.rs](../../components/metrics/src/lib.rs)):
**⚠️ The following `llm_kv_*` metrics are deprecated:**
- `llm_requests_active_slots`: Active request slots per worker
- `llm_requests_total_slots`: Total available request slots per worker
- `llm_kv_blocks_active`: Active KV blocks per worker
- `llm_kv_blocks_total`: Total KV blocks available per worker
- `llm_kv_hit_rate_percent`: KV Cache hit percent per worker
- `llm_load_avg`: Average load across workers
- `llm_load_std`: Load standard deviation across workers
## Troubleshooting ## Troubleshooting
...@@ -506,4 +489,4 @@ When you run the example [components/metrics](../../components/metrics/README.md ...@@ -506,4 +489,4 @@ When you run the example [components/metrics](../../components/metrics/README.md
docker compose logs grafana docker compose logs grafana
``` ```
3. For issues with the legacy metrics component (being phased out), see [components/metrics/README.md](../../components/metrics/README.md) for details on the exposed metrics and troubleshooting steps. 3. Check Prometheus targets at `http://localhost:9090/targets` to verify metric collection.
...@@ -33,13 +33,13 @@ scrape_configs: ...@@ -33,13 +33,13 @@ scrape_configs:
static_configs: static_configs:
- targets: ['dcgm-exporter:9401'] # on the "monitoring" network - targets: ['dcgm-exporter:9401'] # on the "monitoring" network
# This is a demo service that needs to be launched manually. See components/metrics/README.md # This is a demo service that needs to be launched manually
# Note that you may need to disable the firewall on your host. On Ubuntu: sudo ufw allow 8080/tcp # Note that you may need to disable the firewall on your host. On Ubuntu: sudo ufw allow 8000/tcp
# You can also force the port, if the default is different: python -m dynamo.frontend --http-port 8080 # You can also force the port, if the default is different: python -m dynamo.frontend --http-port 8000
- job_name: 'dynamo-frontend' - job_name: 'dynamo-frontend'
scrape_interval: 10s scrape_interval: 10s
static_configs: static_configs:
- targets: ['host.docker.internal:8080'] # on the "monitoring" network - targets: ['host.docker.internal:8000'] # on the "monitoring" network
# Launch via: DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 dynamo.<backend> ... # Launch via: DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 dynamo.<backend> ...
# If you want to update the scrape_interval, you may want to also update component.rs's MAX_DELAY # If you want to update the scrape_interval, you may want to also update component.rs's MAX_DELAY
...@@ -48,15 +48,6 @@ scrape_configs: ...@@ -48,15 +48,6 @@ scrape_configs:
static_configs: static_configs:
- targets: ['host.docker.internal:8081'] - targets: ['host.docker.internal:8081']
# DEPRECATED: This metrics aggregation service is being deprecated in favor of MetricsRegistry
# The new system uses the 'dynamo-backend' job above instead of this separate service
# This is another demo aggregator that needs to be launched manually. See components/metrics/README.md
# Note that you may need to disable the firewall on your host. On Ubuntu: sudo ufw allow 9091/tcp
- job_name: 'metrics-aggregation-service'
scrape_interval: 2s
static_configs:
# - targets: ['localhost:9091'] # metrics aggregation service on host
- targets: ['host.docker.internal:9091'] # metrics aggregation service on host
# KVBM leader related metrics # KVBM leader related metrics
- job_name: 'kvbm-metrics' - job_name: 'kvbm-metrics'
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment